From 9ca8dbcc65cfc63d6f5ef3312a33184e1d726e00 Mon Sep 17 00:00:00 2001
From: Yunhong Jiang
Date: Tue, 4 Aug 2015 12:17:53 -0700
Subject: Add the rt linux 4.1.3-rt3 as base

Import the rt linux 4.1.3-rt3 kernel as the OPNFV kvm base. It is taken from
git://git.kernel.org/pub/scm/linux/kernel/git/rt/linux-rt-devel.git, branch
linux-4.1.y-rt, and the base commit is:

commit 0917f823c59692d751951bf5ea699a2d1e2f26a2
Author: Sebastian Andrzej Siewior
Date:   Sat Jul 25 12:13:34 2015 +0200

    Prepare v4.1.3-rt3

    Signed-off-by: Sebastian Andrzej Siewior

Importing the code this way loses the upstream git history, which is not
ideal. We should switch to a separate OPNFV project repository in the future.

Change-Id: I87543d81c9df70d99c5001fbdf646b202c19f423
Signed-off-by: Yunhong Jiang
---
 kernel/drivers/infiniband/hw/Makefile | 12 +
 kernel/drivers/infiniband/hw/amso1100/Kbuild | 6 +
 kernel/drivers/infiniband/hw/amso1100/Kconfig | 15 +
 kernel/drivers/infiniband/hw/amso1100/c2.c | 1241 +++
 kernel/drivers/infiniband/hw/amso1100/c2.h | 547 ++
 kernel/drivers/infiniband/hw/amso1100/c2_ae.c | 327 +
 kernel/drivers/infiniband/hw/amso1100/c2_ae.h | 108 +
 kernel/drivers/infiniband/hw/amso1100/c2_alloc.c | 142 +
 kernel/drivers/infiniband/hw/amso1100/c2_cm.c | 461 ++
 kernel/drivers/infiniband/hw/amso1100/c2_cq.c | 440 +
 kernel/drivers/infiniband/hw/amso1100/c2_intr.c | 219 +
 kernel/drivers/infiniband/hw/amso1100/c2_mm.c | 377 +
 kernel/drivers/infiniband/hw/amso1100/c2_mq.c | 174 +
 kernel/drivers/infiniband/hw/amso1100/c2_mq.h | 106 +
 kernel/drivers/infiniband/hw/amso1100/c2_pd.c | 90 +
 .../drivers/infiniband/hw/amso1100/c2_provider.c | 882 ++
 .../drivers/infiniband/hw/amso1100/c2_provider.h | 182 +
 kernel/drivers/infiniband/hw/amso1100/c2_qp.c | 1024 +++
 kernel/drivers/infiniband/hw/amso1100/c2_rnic.c | 655 ++
 kernel/drivers/infiniband/hw/amso1100/c2_status.h | 158 +
 kernel/drivers/infiniband/hw/amso1100/c2_user.h | 82 +
 kernel/drivers/infiniband/hw/amso1100/c2_vq.c | 260 +
 kernel/drivers/infiniband/hw/amso1100/c2_vq.h | 63 +
 kernel/drivers/infiniband/hw/amso1100/c2_wr.h | 1520 ++++
 kernel/drivers/infiniband/hw/cxgb3/Kconfig | 27 +
 kernel/drivers/infiniband/hw/cxgb3/Makefile | 8 +
 kernel/drivers/infiniband/hw/cxgb3/cxio_dbg.c | 207 +
 kernel/drivers/infiniband/hw/cxgb3/cxio_hal.c | 1343 +++
 kernel/drivers/infiniband/hw/cxgb3/cxio_hal.h | 211 +
 kernel/drivers/infiniband/hw/cxgb3/cxio_resource.c | 343 +
 kernel/drivers/infiniband/hw/cxgb3/cxio_resource.h | 69 +
 kernel/drivers/infiniband/hw/cxgb3/cxio_wr.h | 802 ++
 kernel/drivers/infiniband/hw/cxgb3/iwch.c | 292 +
 kernel/drivers/infiniband/hw/cxgb3/iwch.h | 180 +
 kernel/drivers/infiniband/hw/cxgb3/iwch_cm.c | 2272 ++++++
 kernel/drivers/infiniband/hw/cxgb3/iwch_cm.h | 233 +
 kernel/drivers/infiniband/hw/cxgb3/iwch_cq.c | 233 +
 kernel/drivers/infiniband/hw/cxgb3/iwch_ev.c | 232 +
 kernel/drivers/infiniband/hw/cxgb3/iwch_mem.c | 203 +
 kernel/drivers/infiniband/hw/cxgb3/iwch_provider.c | 1467 ++++
 kernel/drivers/infiniband/hw/cxgb3/iwch_provider.h | 360 +
 kernel/drivers/infiniband/hw/cxgb3/iwch_qp.c | 1163 +++
 kernel/drivers/infiniband/hw/cxgb3/iwch_user.h | 74 +
 kernel/drivers/infiniband/hw/cxgb3/tcb.h | 632 ++
 kernel/drivers/infiniband/hw/cxgb4/Kconfig | 18 +
 kernel/drivers/infiniband/hw/cxgb4/Makefile | 5 +
 kernel/drivers/infiniband/hw/cxgb4/cm.c | 4053 +++++++++
 kernel/drivers/infiniband/hw/cxgb4/cq.c | 1015 +++
 kernel/drivers/infiniband/hw/cxgb4/device.c | 1564 ++++
 kernel/drivers/infiniband/hw/cxgb4/ev.c | 244 +
 kernel/drivers/infiniband/hw/cxgb4/id_table.c | 112 +
 kernel/drivers/infiniband/hw/cxgb4/iw_cxgb4.h | 1042 +++
kernel/drivers/infiniband/hw/cxgb4/mem.c | 979 +++ kernel/drivers/infiniband/hw/cxgb4/provider.c | 588 ++ kernel/drivers/infiniband/hw/cxgb4/qp.c | 1886 +++++ kernel/drivers/infiniband/hw/cxgb4/resource.c | 453 ++ kernel/drivers/infiniband/hw/cxgb4/t4.h | 685 ++ kernel/drivers/infiniband/hw/cxgb4/t4fw_ri_api.h | 855 ++ kernel/drivers/infiniband/hw/cxgb4/user.h | 80 + kernel/drivers/infiniband/hw/ehca/Kconfig | 9 + kernel/drivers/infiniband/hw/ehca/Makefile | 16 + kernel/drivers/infiniband/hw/ehca/ehca_av.c | 277 + kernel/drivers/infiniband/hw/ehca/ehca_classes.h | 482 ++ .../infiniband/hw/ehca/ehca_classes_pSeries.h | 208 + kernel/drivers/infiniband/hw/ehca/ehca_cq.c | 392 + kernel/drivers/infiniband/hw/ehca/ehca_eq.c | 189 + kernel/drivers/infiniband/hw/ehca/ehca_hca.c | 410 + kernel/drivers/infiniband/hw/ehca/ehca_irq.c | 870 ++ kernel/drivers/infiniband/hw/ehca/ehca_irq.h | 77 + kernel/drivers/infiniband/hw/ehca/ehca_iverbs.h | 212 + kernel/drivers/infiniband/hw/ehca/ehca_main.c | 1100 +++ kernel/drivers/infiniband/hw/ehca/ehca_mcast.c | 131 + kernel/drivers/infiniband/hw/ehca/ehca_mrmw.c | 2593 ++++++ kernel/drivers/infiniband/hw/ehca/ehca_mrmw.h | 132 + kernel/drivers/infiniband/hw/ehca/ehca_pd.c | 124 + kernel/drivers/infiniband/hw/ehca/ehca_qes.h | 260 + kernel/drivers/infiniband/hw/ehca/ehca_qp.c | 2257 ++++++ kernel/drivers/infiniband/hw/ehca/ehca_reqs.c | 953 +++ kernel/drivers/infiniband/hw/ehca/ehca_sqp.c | 237 + kernel/drivers/infiniband/hw/ehca/ehca_tools.h | 155 + kernel/drivers/infiniband/hw/ehca/ehca_uverbs.c | 309 + kernel/drivers/infiniband/hw/ehca/hcp_if.c | 949 +++ kernel/drivers/infiniband/hw/ehca/hcp_if.h | 265 + kernel/drivers/infiniband/hw/ehca/hcp_phyp.c | 82 + kernel/drivers/infiniband/hw/ehca/hcp_phyp.h | 90 + kernel/drivers/infiniband/hw/ehca/hipz_fns.h | 68 + kernel/drivers/infiniband/hw/ehca/hipz_fns_core.h | 100 + kernel/drivers/infiniband/hw/ehca/hipz_hw.h | 414 + kernel/drivers/infiniband/hw/ehca/ipz_pt_fn.c | 295 + kernel/drivers/infiniband/hw/ehca/ipz_pt_fn.h | 289 + kernel/drivers/infiniband/hw/ipath/Kconfig | 11 + kernel/drivers/infiniband/hw/ipath/Makefile | 37 + kernel/drivers/infiniband/hw/ipath/ipath_common.h | 851 ++ kernel/drivers/infiniband/hw/ipath/ipath_cq.c | 478 ++ kernel/drivers/infiniband/hw/ipath/ipath_debug.h | 99 + kernel/drivers/infiniband/hw/ipath/ipath_diag.c | 551 ++ kernel/drivers/infiniband/hw/ipath/ipath_dma.c | 179 + kernel/drivers/infiniband/hw/ipath/ipath_driver.c | 2779 +++++++ kernel/drivers/infiniband/hw/ipath/ipath_eeprom.c | 1183 +++ .../drivers/infiniband/hw/ipath/ipath_file_ops.c | 2620 ++++++ kernel/drivers/infiniband/hw/ipath/ipath_fs.c | 422 + kernel/drivers/infiniband/hw/ipath/ipath_iba6110.c | 1940 +++++ .../drivers/infiniband/hw/ipath/ipath_init_chip.c | 1066 +++ kernel/drivers/infiniband/hw/ipath/ipath_intr.c | 1273 +++ kernel/drivers/infiniband/hw/ipath/ipath_kernel.h | 1375 ++++ kernel/drivers/infiniband/hw/ipath/ipath_keys.c | 270 + kernel/drivers/infiniband/hw/ipath/ipath_mad.c | 1513 ++++ kernel/drivers/infiniband/hw/ipath/ipath_mmap.c | 174 + kernel/drivers/infiniband/hw/ipath/ipath_mr.c | 425 + kernel/drivers/infiniband/hw/ipath/ipath_qp.c | 1080 +++ kernel/drivers/infiniband/hw/ipath/ipath_rc.c | 1969 +++++ .../drivers/infiniband/hw/ipath/ipath_registers.h | 512 ++ kernel/drivers/infiniband/hw/ipath/ipath_ruc.c | 734 ++ kernel/drivers/infiniband/hw/ipath/ipath_sdma.c | 818 ++ kernel/drivers/infiniband/hw/ipath/ipath_srq.c | 380 + kernel/drivers/infiniband/hw/ipath/ipath_stats.c | 347 + 
kernel/drivers/infiniband/hw/ipath/ipath_sysfs.c | 1238 +++ kernel/drivers/infiniband/hw/ipath/ipath_uc.c | 547 ++ kernel/drivers/infiniband/hw/ipath/ipath_ud.c | 580 ++ .../drivers/infiniband/hw/ipath/ipath_user_pages.c | 229 + .../drivers/infiniband/hw/ipath/ipath_user_sdma.c | 875 ++ .../drivers/infiniband/hw/ipath/ipath_user_sdma.h | 52 + kernel/drivers/infiniband/hw/ipath/ipath_verbs.c | 2342 ++++++ kernel/drivers/infiniband/hw/ipath/ipath_verbs.h | 936 +++ .../infiniband/hw/ipath/ipath_verbs_mcast.c | 364 + .../drivers/infiniband/hw/ipath/ipath_wc_ppc64.c | 49 + .../drivers/infiniband/hw/ipath/ipath_wc_x86_64.c | 169 + kernel/drivers/infiniband/hw/mlx4/Kconfig | 10 + kernel/drivers/infiniband/hw/mlx4/Makefile | 3 + kernel/drivers/infiniband/hw/mlx4/ah.c | 178 + kernel/drivers/infiniband/hw/mlx4/alias_GUID.c | 901 ++ kernel/drivers/infiniband/hw/mlx4/cm.c | 478 ++ kernel/drivers/infiniband/hw/mlx4/cq.c | 983 +++ kernel/drivers/infiniband/hw/mlx4/doorbell.c | 96 + kernel/drivers/infiniband/hw/mlx4/mad.c | 2185 +++++ kernel/drivers/infiniband/hw/mlx4/main.c | 2874 +++++++ kernel/drivers/infiniband/hw/mlx4/mcg.c | 1257 +++ kernel/drivers/infiniband/hw/mlx4/mlx4_ib.h | 819 ++ kernel/drivers/infiniband/hw/mlx4/mr.c | 525 ++ kernel/drivers/infiniband/hw/mlx4/qp.c | 3217 ++++++++ kernel/drivers/infiniband/hw/mlx4/srq.c | 377 + kernel/drivers/infiniband/hw/mlx4/sysfs.c | 886 ++ kernel/drivers/infiniband/hw/mlx4/user.h | 107 + kernel/drivers/infiniband/hw/mlx5/Kconfig | 10 + kernel/drivers/infiniband/hw/mlx5/Makefile | 4 + kernel/drivers/infiniband/hw/mlx5/ah.c | 92 + kernel/drivers/infiniband/hw/mlx5/cq.c | 1189 +++ kernel/drivers/infiniband/hw/mlx5/doorbell.c | 98 + kernel/drivers/infiniband/hw/mlx5/mad.c | 139 + kernel/drivers/infiniband/hw/mlx5/main.c | 1397 ++++ kernel/drivers/infiniband/hw/mlx5/mem.c | 225 + kernel/drivers/infiniband/hw/mlx5/mlx5_ib.h | 669 ++ kernel/drivers/infiniband/hw/mlx5/mr.c | 1479 ++++ kernel/drivers/infiniband/hw/mlx5/odp.c | 798 ++ kernel/drivers/infiniband/hw/mlx5/qp.c | 3174 ++++++++ kernel/drivers/infiniband/hw/mlx5/srq.c | 485 ++ kernel/drivers/infiniband/hw/mlx5/user.h | 133 + kernel/drivers/infiniband/hw/mthca/Kconfig | 17 + kernel/drivers/infiniband/hw/mthca/Makefile | 7 + .../drivers/infiniband/hw/mthca/mthca_allocator.c | 301 + kernel/drivers/infiniband/hw/mthca/mthca_av.c | 374 + kernel/drivers/infiniband/hw/mthca/mthca_catas.c | 200 + kernel/drivers/infiniband/hw/mthca/mthca_cmd.c | 1969 +++++ kernel/drivers/infiniband/hw/mthca/mthca_cmd.h | 325 + .../drivers/infiniband/hw/mthca/mthca_config_reg.h | 48 + kernel/drivers/infiniband/hw/mthca/mthca_cq.c | 984 +++ kernel/drivers/infiniband/hw/mthca/mthca_dev.h | 596 ++ .../drivers/infiniband/hw/mthca/mthca_doorbell.h | 109 + kernel/drivers/infiniband/hw/mthca/mthca_eq.c | 905 +++ kernel/drivers/infiniband/hw/mthca/mthca_mad.c | 341 + kernel/drivers/infiniband/hw/mthca/mthca_main.c | 1275 +++ kernel/drivers/infiniband/hw/mthca/mthca_mcg.c | 335 + kernel/drivers/infiniband/hw/mthca/mthca_memfree.c | 760 ++ kernel/drivers/infiniband/hw/mthca/mthca_memfree.h | 179 + kernel/drivers/infiniband/hw/mthca/mthca_mr.c | 965 +++ kernel/drivers/infiniband/hw/mthca/mthca_pd.c | 81 + kernel/drivers/infiniband/hw/mthca/mthca_profile.c | 285 + kernel/drivers/infiniband/hw/mthca/mthca_profile.h | 59 + .../drivers/infiniband/hw/mthca/mthca_provider.c | 1375 ++++ .../drivers/infiniband/hw/mthca/mthca_provider.h | 344 + kernel/drivers/infiniband/hw/mthca/mthca_qp.c | 2311 ++++++ 
kernel/drivers/infiniband/hw/mthca/mthca_reset.c | 288 + kernel/drivers/infiniband/hw/mthca/mthca_srq.c | 696 ++ kernel/drivers/infiniband/hw/mthca/mthca_uar.c | 78 + kernel/drivers/infiniband/hw/mthca/mthca_user.h | 112 + kernel/drivers/infiniband/hw/mthca/mthca_wqe.h | 131 + kernel/drivers/infiniband/hw/nes/Kconfig | 16 + kernel/drivers/infiniband/hw/nes/Makefile | 3 + kernel/drivers/infiniband/hw/nes/nes.c | 1270 +++ kernel/drivers/infiniband/hw/nes/nes.h | 582 ++ kernel/drivers/infiniband/hw/nes/nes_cm.c | 4184 ++++++++++ kernel/drivers/infiniband/hw/nes/nes_cm.h | 476 ++ kernel/drivers/infiniband/hw/nes/nes_context.h | 193 + kernel/drivers/infiniband/hw/nes/nes_hw.c | 3937 +++++++++ kernel/drivers/infiniband/hw/nes/nes_hw.h | 1392 ++++ kernel/drivers/infiniband/hw/nes/nes_mgt.c | 1160 +++ kernel/drivers/infiniband/hw/nes/nes_mgt.h | 97 + kernel/drivers/infiniband/hw/nes/nes_nic.c | 1883 +++++ kernel/drivers/infiniband/hw/nes/nes_user.h | 114 + kernel/drivers/infiniband/hw/nes/nes_utils.c | 972 +++ kernel/drivers/infiniband/hw/nes/nes_verbs.c | 4054 +++++++++ kernel/drivers/infiniband/hw/nes/nes_verbs.h | 189 + kernel/drivers/infiniband/hw/ocrdma/Kconfig | 8 + kernel/drivers/infiniband/hw/ocrdma/Makefile | 5 + kernel/drivers/infiniband/hw/ocrdma/ocrdma.h | 579 ++ kernel/drivers/infiniband/hw/ocrdma/ocrdma_abi.h | 134 + kernel/drivers/infiniband/hw/ocrdma/ocrdma_ah.c | 227 + kernel/drivers/infiniband/hw/ocrdma/ocrdma_ah.h | 48 + kernel/drivers/infiniband/hw/ocrdma/ocrdma_hw.c | 3172 ++++++++ kernel/drivers/infiniband/hw/ocrdma/ocrdma_hw.h | 142 + kernel/drivers/infiniband/hw/ocrdma/ocrdma_main.c | 682 ++ kernel/drivers/infiniband/hw/ocrdma/ocrdma_sli.h | 2173 +++++ kernel/drivers/infiniband/hw/ocrdma/ocrdma_stats.c | 857 ++ kernel/drivers/infiniband/hw/ocrdma/ocrdma_stats.h | 58 + kernel/drivers/infiniband/hw/ocrdma/ocrdma_verbs.c | 3195 ++++++++ kernel/drivers/infiniband/hw/ocrdma/ocrdma_verbs.h | 99 + kernel/drivers/infiniband/hw/qib/Kconfig | 15 + kernel/drivers/infiniband/hw/qib/Makefile | 16 + kernel/drivers/infiniband/hw/qib/qib.h | 1543 ++++ kernel/drivers/infiniband/hw/qib/qib_6120_regs.h | 977 +++ kernel/drivers/infiniband/hw/qib/qib_7220.h | 149 + kernel/drivers/infiniband/hw/qib/qib_7220_regs.h | 1496 ++++ kernel/drivers/infiniband/hw/qib/qib_7322_regs.h | 3163 ++++++++ kernel/drivers/infiniband/hw/qib/qib_common.h | 812 ++ kernel/drivers/infiniband/hw/qib/qib_cq.c | 540 ++ kernel/drivers/infiniband/hw/qib/qib_debugfs.c | 283 + kernel/drivers/infiniband/hw/qib/qib_debugfs.h | 45 + kernel/drivers/infiniband/hw/qib/qib_diag.c | 916 +++ kernel/drivers/infiniband/hw/qib/qib_dma.c | 169 + kernel/drivers/infiniband/hw/qib/qib_driver.c | 820 ++ kernel/drivers/infiniband/hw/qib/qib_eeprom.c | 276 + kernel/drivers/infiniband/hw/qib/qib_file_ops.c | 2418 ++++++ kernel/drivers/infiniband/hw/qib/qib_fs.c | 621 ++ kernel/drivers/infiniband/hw/qib/qib_iba6120.c | 3600 ++++++++ kernel/drivers/infiniband/hw/qib/qib_iba7220.c | 4657 +++++++++++ kernel/drivers/infiniband/hw/qib/qib_iba7322.c | 8573 ++++++++++++++++++++ kernel/drivers/infiniband/hw/qib/qib_init.c | 1847 +++++ kernel/drivers/infiniband/hw/qib/qib_intr.c | 240 + kernel/drivers/infiniband/hw/qib/qib_keys.c | 387 + kernel/drivers/infiniband/hw/qib/qib_mad.c | 2533 ++++++ kernel/drivers/infiniband/hw/qib/qib_mad.h | 431 + kernel/drivers/infiniband/hw/qib/qib_mmap.c | 174 + kernel/drivers/infiniband/hw/qib/qib_mr.c | 532 ++ kernel/drivers/infiniband/hw/qib/qib_pcie.c | 719 ++ kernel/drivers/infiniband/hw/qib/qib_pio_copy.c | 64 + 
kernel/drivers/infiniband/hw/qib/qib_qp.c | 1376 ++++ kernel/drivers/infiniband/hw/qib/qib_qsfp.c | 559 ++ kernel/drivers/infiniband/hw/qib/qib_qsfp.h | 189 + kernel/drivers/infiniband/hw/qib/qib_rc.c | 2290 ++++++ kernel/drivers/infiniband/hw/qib/qib_ruc.c | 819 ++ kernel/drivers/infiniband/hw/qib/qib_sd7220.c | 1454 ++++ kernel/drivers/infiniband/hw/qib/qib_sdma.c | 1039 +++ kernel/drivers/infiniband/hw/qib/qib_srq.c | 380 + kernel/drivers/infiniband/hw/qib/qib_sysfs.c | 818 ++ kernel/drivers/infiniband/hw/qib/qib_twsi.c | 501 ++ kernel/drivers/infiniband/hw/qib/qib_tx.c | 572 ++ kernel/drivers/infiniband/hw/qib/qib_uc.c | 536 ++ kernel/drivers/infiniband/hw/qib/qib_ud.c | 590 ++ kernel/drivers/infiniband/hw/qib/qib_user_pages.c | 157 + kernel/drivers/infiniband/hw/qib/qib_user_sdma.c | 1465 ++++ kernel/drivers/infiniband/hw/qib/qib_user_sdma.h | 52 + kernel/drivers/infiniband/hw/qib/qib_verbs.c | 2339 ++++++ kernel/drivers/infiniband/hw/qib/qib_verbs.h | 1173 +++ kernel/drivers/infiniband/hw/qib/qib_verbs_mcast.c | 368 + kernel/drivers/infiniband/hw/qib/qib_wc_ppc64.c | 62 + kernel/drivers/infiniband/hw/qib/qib_wc_x86_64.c | 150 + kernel/drivers/infiniband/hw/usnic/Kconfig | 10 + kernel/drivers/infiniband/hw/usnic/Makefile | 15 + kernel/drivers/infiniband/hw/usnic/usnic.h | 29 + kernel/drivers/infiniband/hw/usnic/usnic_abi.h | 73 + .../infiniband/hw/usnic/usnic_common_pkt_hdr.h | 27 + .../infiniband/hw/usnic/usnic_common_util.h | 68 + kernel/drivers/infiniband/hw/usnic/usnic_debugfs.c | 154 + kernel/drivers/infiniband/hw/usnic/usnic_debugfs.h | 29 + kernel/drivers/infiniband/hw/usnic/usnic_fwd.c | 350 + kernel/drivers/infiniband/hw/usnic/usnic_fwd.h | 113 + kernel/drivers/infiniband/hw/usnic/usnic_ib.h | 118 + kernel/drivers/infiniband/hw/usnic/usnic_ib_main.c | 682 ++ .../drivers/infiniband/hw/usnic/usnic_ib_qp_grp.c | 761 ++ .../drivers/infiniband/hw/usnic/usnic_ib_qp_grp.h | 117 + .../drivers/infiniband/hw/usnic/usnic_ib_sysfs.c | 341 + .../drivers/infiniband/hw/usnic/usnic_ib_sysfs.h | 29 + .../drivers/infiniband/hw/usnic/usnic_ib_verbs.c | 768 ++ .../drivers/infiniband/hw/usnic/usnic_ib_verbs.h | 72 + kernel/drivers/infiniband/hw/usnic/usnic_log.h | 58 + .../drivers/infiniband/hw/usnic/usnic_transport.c | 202 + .../drivers/infiniband/hw/usnic/usnic_transport.h | 51 + kernel/drivers/infiniband/hw/usnic/usnic_uiom.c | 604 ++ kernel/drivers/infiniband/hw/usnic/usnic_uiom.h | 80 + .../infiniband/hw/usnic/usnic_uiom_interval_tree.c | 254 + .../infiniband/hw/usnic/usnic_uiom_interval_tree.h | 73 + kernel/drivers/infiniband/hw/usnic/usnic_vnic.c | 467 ++ kernel/drivers/infiniband/hw/usnic/usnic_vnic.h | 103 + 293 files changed, 210125 insertions(+) create mode 100644 kernel/drivers/infiniband/hw/Makefile create mode 100644 kernel/drivers/infiniband/hw/amso1100/Kbuild create mode 100644 kernel/drivers/infiniband/hw/amso1100/Kconfig create mode 100644 kernel/drivers/infiniband/hw/amso1100/c2.c create mode 100644 kernel/drivers/infiniband/hw/amso1100/c2.h create mode 100644 kernel/drivers/infiniband/hw/amso1100/c2_ae.c create mode 100644 kernel/drivers/infiniband/hw/amso1100/c2_ae.h create mode 100644 kernel/drivers/infiniband/hw/amso1100/c2_alloc.c create mode 100644 kernel/drivers/infiniband/hw/amso1100/c2_cm.c create mode 100644 kernel/drivers/infiniband/hw/amso1100/c2_cq.c create mode 100644 kernel/drivers/infiniband/hw/amso1100/c2_intr.c create mode 100644 kernel/drivers/infiniband/hw/amso1100/c2_mm.c create mode 100644 kernel/drivers/infiniband/hw/amso1100/c2_mq.c create mode 100644 
kernel/drivers/infiniband/hw/amso1100/c2_mq.h create mode 100644 kernel/drivers/infiniband/hw/amso1100/c2_pd.c create mode 100644 kernel/drivers/infiniband/hw/amso1100/c2_provider.c create mode 100644 kernel/drivers/infiniband/hw/amso1100/c2_provider.h create mode 100644 kernel/drivers/infiniband/hw/amso1100/c2_qp.c create mode 100644 kernel/drivers/infiniband/hw/amso1100/c2_rnic.c create mode 100644 kernel/drivers/infiniband/hw/amso1100/c2_status.h create mode 100644 kernel/drivers/infiniband/hw/amso1100/c2_user.h create mode 100644 kernel/drivers/infiniband/hw/amso1100/c2_vq.c create mode 100644 kernel/drivers/infiniband/hw/amso1100/c2_vq.h create mode 100644 kernel/drivers/infiniband/hw/amso1100/c2_wr.h create mode 100644 kernel/drivers/infiniband/hw/cxgb3/Kconfig create mode 100644 kernel/drivers/infiniband/hw/cxgb3/Makefile create mode 100644 kernel/drivers/infiniband/hw/cxgb3/cxio_dbg.c create mode 100644 kernel/drivers/infiniband/hw/cxgb3/cxio_hal.c create mode 100644 kernel/drivers/infiniband/hw/cxgb3/cxio_hal.h create mode 100644 kernel/drivers/infiniband/hw/cxgb3/cxio_resource.c create mode 100644 kernel/drivers/infiniband/hw/cxgb3/cxio_resource.h create mode 100644 kernel/drivers/infiniband/hw/cxgb3/cxio_wr.h create mode 100644 kernel/drivers/infiniband/hw/cxgb3/iwch.c create mode 100644 kernel/drivers/infiniband/hw/cxgb3/iwch.h create mode 100644 kernel/drivers/infiniband/hw/cxgb3/iwch_cm.c create mode 100644 kernel/drivers/infiniband/hw/cxgb3/iwch_cm.h create mode 100644 kernel/drivers/infiniband/hw/cxgb3/iwch_cq.c create mode 100644 kernel/drivers/infiniband/hw/cxgb3/iwch_ev.c create mode 100644 kernel/drivers/infiniband/hw/cxgb3/iwch_mem.c create mode 100644 kernel/drivers/infiniband/hw/cxgb3/iwch_provider.c create mode 100644 kernel/drivers/infiniband/hw/cxgb3/iwch_provider.h create mode 100644 kernel/drivers/infiniband/hw/cxgb3/iwch_qp.c create mode 100644 kernel/drivers/infiniband/hw/cxgb3/iwch_user.h create mode 100644 kernel/drivers/infiniband/hw/cxgb3/tcb.h create mode 100644 kernel/drivers/infiniband/hw/cxgb4/Kconfig create mode 100644 kernel/drivers/infiniband/hw/cxgb4/Makefile create mode 100644 kernel/drivers/infiniband/hw/cxgb4/cm.c create mode 100644 kernel/drivers/infiniband/hw/cxgb4/cq.c create mode 100644 kernel/drivers/infiniband/hw/cxgb4/device.c create mode 100644 kernel/drivers/infiniband/hw/cxgb4/ev.c create mode 100644 kernel/drivers/infiniband/hw/cxgb4/id_table.c create mode 100644 kernel/drivers/infiniband/hw/cxgb4/iw_cxgb4.h create mode 100644 kernel/drivers/infiniband/hw/cxgb4/mem.c create mode 100644 kernel/drivers/infiniband/hw/cxgb4/provider.c create mode 100644 kernel/drivers/infiniband/hw/cxgb4/qp.c create mode 100644 kernel/drivers/infiniband/hw/cxgb4/resource.c create mode 100644 kernel/drivers/infiniband/hw/cxgb4/t4.h create mode 100644 kernel/drivers/infiniband/hw/cxgb4/t4fw_ri_api.h create mode 100644 kernel/drivers/infiniband/hw/cxgb4/user.h create mode 100644 kernel/drivers/infiniband/hw/ehca/Kconfig create mode 100644 kernel/drivers/infiniband/hw/ehca/Makefile create mode 100644 kernel/drivers/infiniband/hw/ehca/ehca_av.c create mode 100644 kernel/drivers/infiniband/hw/ehca/ehca_classes.h create mode 100644 kernel/drivers/infiniband/hw/ehca/ehca_classes_pSeries.h create mode 100644 kernel/drivers/infiniband/hw/ehca/ehca_cq.c create mode 100644 kernel/drivers/infiniband/hw/ehca/ehca_eq.c create mode 100644 kernel/drivers/infiniband/hw/ehca/ehca_hca.c create mode 100644 kernel/drivers/infiniband/hw/ehca/ehca_irq.c create mode 100644 
kernel/drivers/infiniband/hw/ehca/ehca_irq.h create mode 100644 kernel/drivers/infiniband/hw/ehca/ehca_iverbs.h create mode 100644 kernel/drivers/infiniband/hw/ehca/ehca_main.c create mode 100644 kernel/drivers/infiniband/hw/ehca/ehca_mcast.c create mode 100644 kernel/drivers/infiniband/hw/ehca/ehca_mrmw.c create mode 100644 kernel/drivers/infiniband/hw/ehca/ehca_mrmw.h create mode 100644 kernel/drivers/infiniband/hw/ehca/ehca_pd.c create mode 100644 kernel/drivers/infiniband/hw/ehca/ehca_qes.h create mode 100644 kernel/drivers/infiniband/hw/ehca/ehca_qp.c create mode 100644 kernel/drivers/infiniband/hw/ehca/ehca_reqs.c create mode 100644 kernel/drivers/infiniband/hw/ehca/ehca_sqp.c create mode 100644 kernel/drivers/infiniband/hw/ehca/ehca_tools.h create mode 100644 kernel/drivers/infiniband/hw/ehca/ehca_uverbs.c create mode 100644 kernel/drivers/infiniband/hw/ehca/hcp_if.c create mode 100644 kernel/drivers/infiniband/hw/ehca/hcp_if.h create mode 100644 kernel/drivers/infiniband/hw/ehca/hcp_phyp.c create mode 100644 kernel/drivers/infiniband/hw/ehca/hcp_phyp.h create mode 100644 kernel/drivers/infiniband/hw/ehca/hipz_fns.h create mode 100644 kernel/drivers/infiniband/hw/ehca/hipz_fns_core.h create mode 100644 kernel/drivers/infiniband/hw/ehca/hipz_hw.h create mode 100644 kernel/drivers/infiniband/hw/ehca/ipz_pt_fn.c create mode 100644 kernel/drivers/infiniband/hw/ehca/ipz_pt_fn.h create mode 100644 kernel/drivers/infiniband/hw/ipath/Kconfig create mode 100644 kernel/drivers/infiniband/hw/ipath/Makefile create mode 100644 kernel/drivers/infiniband/hw/ipath/ipath_common.h create mode 100644 kernel/drivers/infiniband/hw/ipath/ipath_cq.c create mode 100644 kernel/drivers/infiniband/hw/ipath/ipath_debug.h create mode 100644 kernel/drivers/infiniband/hw/ipath/ipath_diag.c create mode 100644 kernel/drivers/infiniband/hw/ipath/ipath_dma.c create mode 100644 kernel/drivers/infiniband/hw/ipath/ipath_driver.c create mode 100644 kernel/drivers/infiniband/hw/ipath/ipath_eeprom.c create mode 100644 kernel/drivers/infiniband/hw/ipath/ipath_file_ops.c create mode 100644 kernel/drivers/infiniband/hw/ipath/ipath_fs.c create mode 100644 kernel/drivers/infiniband/hw/ipath/ipath_iba6110.c create mode 100644 kernel/drivers/infiniband/hw/ipath/ipath_init_chip.c create mode 100644 kernel/drivers/infiniband/hw/ipath/ipath_intr.c create mode 100644 kernel/drivers/infiniband/hw/ipath/ipath_kernel.h create mode 100644 kernel/drivers/infiniband/hw/ipath/ipath_keys.c create mode 100644 kernel/drivers/infiniband/hw/ipath/ipath_mad.c create mode 100644 kernel/drivers/infiniband/hw/ipath/ipath_mmap.c create mode 100644 kernel/drivers/infiniband/hw/ipath/ipath_mr.c create mode 100644 kernel/drivers/infiniband/hw/ipath/ipath_qp.c create mode 100644 kernel/drivers/infiniband/hw/ipath/ipath_rc.c create mode 100644 kernel/drivers/infiniband/hw/ipath/ipath_registers.h create mode 100644 kernel/drivers/infiniband/hw/ipath/ipath_ruc.c create mode 100644 kernel/drivers/infiniband/hw/ipath/ipath_sdma.c create mode 100644 kernel/drivers/infiniband/hw/ipath/ipath_srq.c create mode 100644 kernel/drivers/infiniband/hw/ipath/ipath_stats.c create mode 100644 kernel/drivers/infiniband/hw/ipath/ipath_sysfs.c create mode 100644 kernel/drivers/infiniband/hw/ipath/ipath_uc.c create mode 100644 kernel/drivers/infiniband/hw/ipath/ipath_ud.c create mode 100644 kernel/drivers/infiniband/hw/ipath/ipath_user_pages.c create mode 100644 kernel/drivers/infiniband/hw/ipath/ipath_user_sdma.c create mode 100644 
kernel/drivers/infiniband/hw/ipath/ipath_user_sdma.h create mode 100644 kernel/drivers/infiniband/hw/ipath/ipath_verbs.c create mode 100644 kernel/drivers/infiniband/hw/ipath/ipath_verbs.h create mode 100644 kernel/drivers/infiniband/hw/ipath/ipath_verbs_mcast.c create mode 100644 kernel/drivers/infiniband/hw/ipath/ipath_wc_ppc64.c create mode 100644 kernel/drivers/infiniband/hw/ipath/ipath_wc_x86_64.c create mode 100644 kernel/drivers/infiniband/hw/mlx4/Kconfig create mode 100644 kernel/drivers/infiniband/hw/mlx4/Makefile create mode 100644 kernel/drivers/infiniband/hw/mlx4/ah.c create mode 100644 kernel/drivers/infiniband/hw/mlx4/alias_GUID.c create mode 100644 kernel/drivers/infiniband/hw/mlx4/cm.c create mode 100644 kernel/drivers/infiniband/hw/mlx4/cq.c create mode 100644 kernel/drivers/infiniband/hw/mlx4/doorbell.c create mode 100644 kernel/drivers/infiniband/hw/mlx4/mad.c create mode 100644 kernel/drivers/infiniband/hw/mlx4/main.c create mode 100644 kernel/drivers/infiniband/hw/mlx4/mcg.c create mode 100644 kernel/drivers/infiniband/hw/mlx4/mlx4_ib.h create mode 100644 kernel/drivers/infiniband/hw/mlx4/mr.c create mode 100644 kernel/drivers/infiniband/hw/mlx4/qp.c create mode 100644 kernel/drivers/infiniband/hw/mlx4/srq.c create mode 100644 kernel/drivers/infiniband/hw/mlx4/sysfs.c create mode 100644 kernel/drivers/infiniband/hw/mlx4/user.h create mode 100644 kernel/drivers/infiniband/hw/mlx5/Kconfig create mode 100644 kernel/drivers/infiniband/hw/mlx5/Makefile create mode 100644 kernel/drivers/infiniband/hw/mlx5/ah.c create mode 100644 kernel/drivers/infiniband/hw/mlx5/cq.c create mode 100644 kernel/drivers/infiniband/hw/mlx5/doorbell.c create mode 100644 kernel/drivers/infiniband/hw/mlx5/mad.c create mode 100644 kernel/drivers/infiniband/hw/mlx5/main.c create mode 100644 kernel/drivers/infiniband/hw/mlx5/mem.c create mode 100644 kernel/drivers/infiniband/hw/mlx5/mlx5_ib.h create mode 100644 kernel/drivers/infiniband/hw/mlx5/mr.c create mode 100644 kernel/drivers/infiniband/hw/mlx5/odp.c create mode 100644 kernel/drivers/infiniband/hw/mlx5/qp.c create mode 100644 kernel/drivers/infiniband/hw/mlx5/srq.c create mode 100644 kernel/drivers/infiniband/hw/mlx5/user.h create mode 100644 kernel/drivers/infiniband/hw/mthca/Kconfig create mode 100644 kernel/drivers/infiniband/hw/mthca/Makefile create mode 100644 kernel/drivers/infiniband/hw/mthca/mthca_allocator.c create mode 100644 kernel/drivers/infiniband/hw/mthca/mthca_av.c create mode 100644 kernel/drivers/infiniband/hw/mthca/mthca_catas.c create mode 100644 kernel/drivers/infiniband/hw/mthca/mthca_cmd.c create mode 100644 kernel/drivers/infiniband/hw/mthca/mthca_cmd.h create mode 100644 kernel/drivers/infiniband/hw/mthca/mthca_config_reg.h create mode 100644 kernel/drivers/infiniband/hw/mthca/mthca_cq.c create mode 100644 kernel/drivers/infiniband/hw/mthca/mthca_dev.h create mode 100644 kernel/drivers/infiniband/hw/mthca/mthca_doorbell.h create mode 100644 kernel/drivers/infiniband/hw/mthca/mthca_eq.c create mode 100644 kernel/drivers/infiniband/hw/mthca/mthca_mad.c create mode 100644 kernel/drivers/infiniband/hw/mthca/mthca_main.c create mode 100644 kernel/drivers/infiniband/hw/mthca/mthca_mcg.c create mode 100644 kernel/drivers/infiniband/hw/mthca/mthca_memfree.c create mode 100644 kernel/drivers/infiniband/hw/mthca/mthca_memfree.h create mode 100644 kernel/drivers/infiniband/hw/mthca/mthca_mr.c create mode 100644 kernel/drivers/infiniband/hw/mthca/mthca_pd.c create mode 100644 kernel/drivers/infiniband/hw/mthca/mthca_profile.c 
create mode 100644 kernel/drivers/infiniband/hw/mthca/mthca_profile.h create mode 100644 kernel/drivers/infiniband/hw/mthca/mthca_provider.c create mode 100644 kernel/drivers/infiniband/hw/mthca/mthca_provider.h create mode 100644 kernel/drivers/infiniband/hw/mthca/mthca_qp.c create mode 100644 kernel/drivers/infiniband/hw/mthca/mthca_reset.c create mode 100644 kernel/drivers/infiniband/hw/mthca/mthca_srq.c create mode 100644 kernel/drivers/infiniband/hw/mthca/mthca_uar.c create mode 100644 kernel/drivers/infiniband/hw/mthca/mthca_user.h create mode 100644 kernel/drivers/infiniband/hw/mthca/mthca_wqe.h create mode 100644 kernel/drivers/infiniband/hw/nes/Kconfig create mode 100644 kernel/drivers/infiniband/hw/nes/Makefile create mode 100644 kernel/drivers/infiniband/hw/nes/nes.c create mode 100644 kernel/drivers/infiniband/hw/nes/nes.h create mode 100644 kernel/drivers/infiniband/hw/nes/nes_cm.c create mode 100644 kernel/drivers/infiniband/hw/nes/nes_cm.h create mode 100644 kernel/drivers/infiniband/hw/nes/nes_context.h create mode 100644 kernel/drivers/infiniband/hw/nes/nes_hw.c create mode 100644 kernel/drivers/infiniband/hw/nes/nes_hw.h create mode 100644 kernel/drivers/infiniband/hw/nes/nes_mgt.c create mode 100644 kernel/drivers/infiniband/hw/nes/nes_mgt.h create mode 100644 kernel/drivers/infiniband/hw/nes/nes_nic.c create mode 100644 kernel/drivers/infiniband/hw/nes/nes_user.h create mode 100644 kernel/drivers/infiniband/hw/nes/nes_utils.c create mode 100644 kernel/drivers/infiniband/hw/nes/nes_verbs.c create mode 100644 kernel/drivers/infiniband/hw/nes/nes_verbs.h create mode 100644 kernel/drivers/infiniband/hw/ocrdma/Kconfig create mode 100644 kernel/drivers/infiniband/hw/ocrdma/Makefile create mode 100644 kernel/drivers/infiniband/hw/ocrdma/ocrdma.h create mode 100644 kernel/drivers/infiniband/hw/ocrdma/ocrdma_abi.h create mode 100644 kernel/drivers/infiniband/hw/ocrdma/ocrdma_ah.c create mode 100644 kernel/drivers/infiniband/hw/ocrdma/ocrdma_ah.h create mode 100644 kernel/drivers/infiniband/hw/ocrdma/ocrdma_hw.c create mode 100644 kernel/drivers/infiniband/hw/ocrdma/ocrdma_hw.h create mode 100644 kernel/drivers/infiniband/hw/ocrdma/ocrdma_main.c create mode 100644 kernel/drivers/infiniband/hw/ocrdma/ocrdma_sli.h create mode 100644 kernel/drivers/infiniband/hw/ocrdma/ocrdma_stats.c create mode 100644 kernel/drivers/infiniband/hw/ocrdma/ocrdma_stats.h create mode 100644 kernel/drivers/infiniband/hw/ocrdma/ocrdma_verbs.c create mode 100644 kernel/drivers/infiniband/hw/ocrdma/ocrdma_verbs.h create mode 100644 kernel/drivers/infiniband/hw/qib/Kconfig create mode 100644 kernel/drivers/infiniband/hw/qib/Makefile create mode 100644 kernel/drivers/infiniband/hw/qib/qib.h create mode 100644 kernel/drivers/infiniband/hw/qib/qib_6120_regs.h create mode 100644 kernel/drivers/infiniband/hw/qib/qib_7220.h create mode 100644 kernel/drivers/infiniband/hw/qib/qib_7220_regs.h create mode 100644 kernel/drivers/infiniband/hw/qib/qib_7322_regs.h create mode 100644 kernel/drivers/infiniband/hw/qib/qib_common.h create mode 100644 kernel/drivers/infiniband/hw/qib/qib_cq.c create mode 100644 kernel/drivers/infiniband/hw/qib/qib_debugfs.c create mode 100644 kernel/drivers/infiniband/hw/qib/qib_debugfs.h create mode 100644 kernel/drivers/infiniband/hw/qib/qib_diag.c create mode 100644 kernel/drivers/infiniband/hw/qib/qib_dma.c create mode 100644 kernel/drivers/infiniband/hw/qib/qib_driver.c create mode 100644 kernel/drivers/infiniband/hw/qib/qib_eeprom.c create mode 100644 
kernel/drivers/infiniband/hw/qib/qib_file_ops.c create mode 100644 kernel/drivers/infiniband/hw/qib/qib_fs.c create mode 100644 kernel/drivers/infiniband/hw/qib/qib_iba6120.c create mode 100644 kernel/drivers/infiniband/hw/qib/qib_iba7220.c create mode 100644 kernel/drivers/infiniband/hw/qib/qib_iba7322.c create mode 100644 kernel/drivers/infiniband/hw/qib/qib_init.c create mode 100644 kernel/drivers/infiniband/hw/qib/qib_intr.c create mode 100644 kernel/drivers/infiniband/hw/qib/qib_keys.c create mode 100644 kernel/drivers/infiniband/hw/qib/qib_mad.c create mode 100644 kernel/drivers/infiniband/hw/qib/qib_mad.h create mode 100644 kernel/drivers/infiniband/hw/qib/qib_mmap.c create mode 100644 kernel/drivers/infiniband/hw/qib/qib_mr.c create mode 100644 kernel/drivers/infiniband/hw/qib/qib_pcie.c create mode 100644 kernel/drivers/infiniband/hw/qib/qib_pio_copy.c create mode 100644 kernel/drivers/infiniband/hw/qib/qib_qp.c create mode 100644 kernel/drivers/infiniband/hw/qib/qib_qsfp.c create mode 100644 kernel/drivers/infiniband/hw/qib/qib_qsfp.h create mode 100644 kernel/drivers/infiniband/hw/qib/qib_rc.c create mode 100644 kernel/drivers/infiniband/hw/qib/qib_ruc.c create mode 100644 kernel/drivers/infiniband/hw/qib/qib_sd7220.c create mode 100644 kernel/drivers/infiniband/hw/qib/qib_sdma.c create mode 100644 kernel/drivers/infiniband/hw/qib/qib_srq.c create mode 100644 kernel/drivers/infiniband/hw/qib/qib_sysfs.c create mode 100644 kernel/drivers/infiniband/hw/qib/qib_twsi.c create mode 100644 kernel/drivers/infiniband/hw/qib/qib_tx.c create mode 100644 kernel/drivers/infiniband/hw/qib/qib_uc.c create mode 100644 kernel/drivers/infiniband/hw/qib/qib_ud.c create mode 100644 kernel/drivers/infiniband/hw/qib/qib_user_pages.c create mode 100644 kernel/drivers/infiniband/hw/qib/qib_user_sdma.c create mode 100644 kernel/drivers/infiniband/hw/qib/qib_user_sdma.h create mode 100644 kernel/drivers/infiniband/hw/qib/qib_verbs.c create mode 100644 kernel/drivers/infiniband/hw/qib/qib_verbs.h create mode 100644 kernel/drivers/infiniband/hw/qib/qib_verbs_mcast.c create mode 100644 kernel/drivers/infiniband/hw/qib/qib_wc_ppc64.c create mode 100644 kernel/drivers/infiniband/hw/qib/qib_wc_x86_64.c create mode 100644 kernel/drivers/infiniband/hw/usnic/Kconfig create mode 100644 kernel/drivers/infiniband/hw/usnic/Makefile create mode 100644 kernel/drivers/infiniband/hw/usnic/usnic.h create mode 100644 kernel/drivers/infiniband/hw/usnic/usnic_abi.h create mode 100644 kernel/drivers/infiniband/hw/usnic/usnic_common_pkt_hdr.h create mode 100644 kernel/drivers/infiniband/hw/usnic/usnic_common_util.h create mode 100644 kernel/drivers/infiniband/hw/usnic/usnic_debugfs.c create mode 100644 kernel/drivers/infiniband/hw/usnic/usnic_debugfs.h create mode 100644 kernel/drivers/infiniband/hw/usnic/usnic_fwd.c create mode 100644 kernel/drivers/infiniband/hw/usnic/usnic_fwd.h create mode 100644 kernel/drivers/infiniband/hw/usnic/usnic_ib.h create mode 100644 kernel/drivers/infiniband/hw/usnic/usnic_ib_main.c create mode 100644 kernel/drivers/infiniband/hw/usnic/usnic_ib_qp_grp.c create mode 100644 kernel/drivers/infiniband/hw/usnic/usnic_ib_qp_grp.h create mode 100644 kernel/drivers/infiniband/hw/usnic/usnic_ib_sysfs.c create mode 100644 kernel/drivers/infiniband/hw/usnic/usnic_ib_sysfs.h create mode 100644 kernel/drivers/infiniband/hw/usnic/usnic_ib_verbs.c create mode 100644 kernel/drivers/infiniband/hw/usnic/usnic_ib_verbs.h create mode 100644 kernel/drivers/infiniband/hw/usnic/usnic_log.h create mode 100644 
kernel/drivers/infiniband/hw/usnic/usnic_transport.c create mode 100644 kernel/drivers/infiniband/hw/usnic/usnic_transport.h create mode 100644 kernel/drivers/infiniband/hw/usnic/usnic_uiom.c create mode 100644 kernel/drivers/infiniband/hw/usnic/usnic_uiom.h create mode 100644 kernel/drivers/infiniband/hw/usnic/usnic_uiom_interval_tree.c create mode 100644 kernel/drivers/infiniband/hw/usnic/usnic_uiom_interval_tree.h create mode 100644 kernel/drivers/infiniband/hw/usnic/usnic_vnic.c create mode 100644 kernel/drivers/infiniband/hw/usnic/usnic_vnic.h (limited to 'kernel/drivers/infiniband/hw') diff --git a/kernel/drivers/infiniband/hw/Makefile b/kernel/drivers/infiniband/hw/Makefile new file mode 100644 index 000000000..e900b0353 --- /dev/null +++ b/kernel/drivers/infiniband/hw/Makefile @@ -0,0 +1,12 @@ +obj-$(CONFIG_INFINIBAND_MTHCA) += mthca/ +obj-$(CONFIG_INFINIBAND_IPATH) += ipath/ +obj-$(CONFIG_INFINIBAND_QIB) += qib/ +obj-$(CONFIG_INFINIBAND_EHCA) += ehca/ +obj-$(CONFIG_INFINIBAND_AMSO1100) += amso1100/ +obj-$(CONFIG_INFINIBAND_CXGB3) += cxgb3/ +obj-$(CONFIG_INFINIBAND_CXGB4) += cxgb4/ +obj-$(CONFIG_MLX4_INFINIBAND) += mlx4/ +obj-$(CONFIG_MLX5_INFINIBAND) += mlx5/ +obj-$(CONFIG_INFINIBAND_NES) += nes/ +obj-$(CONFIG_INFINIBAND_OCRDMA) += ocrdma/ +obj-$(CONFIG_INFINIBAND_USNIC) += usnic/ diff --git a/kernel/drivers/infiniband/hw/amso1100/Kbuild b/kernel/drivers/infiniband/hw/amso1100/Kbuild new file mode 100644 index 000000000..950dfabcd --- /dev/null +++ b/kernel/drivers/infiniband/hw/amso1100/Kbuild @@ -0,0 +1,6 @@ +ccflags-$(CONFIG_INFINIBAND_AMSO1100_DEBUG) := -DDEBUG + +obj-$(CONFIG_INFINIBAND_AMSO1100) += iw_c2.o + +iw_c2-y := c2.o c2_provider.o c2_rnic.o c2_alloc.o c2_mq.o c2_ae.o c2_vq.o \ + c2_intr.o c2_cq.o c2_qp.o c2_cm.o c2_mm.o c2_pd.o diff --git a/kernel/drivers/infiniband/hw/amso1100/Kconfig b/kernel/drivers/infiniband/hw/amso1100/Kconfig new file mode 100644 index 000000000..e6ce5f209 --- /dev/null +++ b/kernel/drivers/infiniband/hw/amso1100/Kconfig @@ -0,0 +1,15 @@ +config INFINIBAND_AMSO1100 + tristate "Ammasso 1100 HCA support" + depends on PCI && INET + ---help--- + This is a low-level driver for the Ammasso 1100 host + channel adapter (HCA). + +config INFINIBAND_AMSO1100_DEBUG + bool "Verbose debugging output" + depends on INFINIBAND_AMSO1100 + default n + ---help--- + This option causes the amso1100 driver to produce a bunch of + debug messages. Select this if you are developing the driver + or trying to diagnose a problem. diff --git a/kernel/drivers/infiniband/hw/amso1100/c2.c b/kernel/drivers/infiniband/hw/amso1100/c2.c new file mode 100644 index 000000000..766a71cce --- /dev/null +++ b/kernel/drivers/infiniband/hw/amso1100/c2.c @@ -0,0 +1,1241 @@ +/* + * Copyright (c) 2005 Ammasso, Inc. All rights reserved. + * Copyright (c) 2005 Open Grid Computing, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. 
+ * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#include +#include "c2.h" +#include "c2_provider.h" + +MODULE_AUTHOR("Tom Tucker "); +MODULE_DESCRIPTION("Ammasso AMSO1100 Low-level iWARP Driver"); +MODULE_LICENSE("Dual BSD/GPL"); +MODULE_VERSION(DRV_VERSION); + +static const u32 default_msg = NETIF_MSG_DRV | NETIF_MSG_PROBE | NETIF_MSG_LINK + | NETIF_MSG_IFUP | NETIF_MSG_IFDOWN; + +static int debug = -1; /* defaults above */ +module_param(debug, int, 0); +MODULE_PARM_DESC(debug, "Debug level (0=none,...,16=all)"); + +static int c2_up(struct net_device *netdev); +static int c2_down(struct net_device *netdev); +static int c2_xmit_frame(struct sk_buff *skb, struct net_device *netdev); +static void c2_tx_interrupt(struct net_device *netdev); +static void c2_rx_interrupt(struct net_device *netdev); +static irqreturn_t c2_interrupt(int irq, void *dev_id); +static void c2_tx_timeout(struct net_device *netdev); +static int c2_change_mtu(struct net_device *netdev, int new_mtu); +static void c2_reset(struct c2_port *c2_port); + +static struct pci_device_id c2_pci_table[] = { + { PCI_DEVICE(0x18b8, 0xb001) }, + { 0 } +}; + +MODULE_DEVICE_TABLE(pci, c2_pci_table); + +static void c2_print_macaddr(struct net_device *netdev) +{ + pr_debug("%s: MAC %pM, IRQ %u\n", netdev->name, netdev->dev_addr, netdev->irq); +} + +static void c2_set_rxbufsize(struct c2_port *c2_port) +{ + struct net_device *netdev = c2_port->netdev; + + if (netdev->mtu > RX_BUF_SIZE) + c2_port->rx_buf_size = + netdev->mtu + ETH_HLEN + sizeof(struct c2_rxp_hdr) + + NET_IP_ALIGN; + else + c2_port->rx_buf_size = sizeof(struct c2_rxp_hdr) + RX_BUF_SIZE; +} + +/* + * Allocate TX ring elements and chain them together. + * One-to-one association of adapter descriptors with ring elements. 
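+ * The elements form a circular list: the last element's next pointer
+ * wraps back to tx_ring->start, and each host descriptor's next_offset
+ * holds the DMA address of the descriptor that follows it, so the
+ * adapter can walk the ring in the same order as the host.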
+ */ +static int c2_tx_ring_alloc(struct c2_ring *tx_ring, void *vaddr, + dma_addr_t base, void __iomem * mmio_txp_ring) +{ + struct c2_tx_desc *tx_desc; + struct c2_txp_desc __iomem *txp_desc; + struct c2_element *elem; + int i; + + tx_ring->start = kmalloc(sizeof(*elem) * tx_ring->count, GFP_KERNEL); + if (!tx_ring->start) + return -ENOMEM; + + elem = tx_ring->start; + tx_desc = vaddr; + txp_desc = mmio_txp_ring; + for (i = 0; i < tx_ring->count; i++, elem++, tx_desc++, txp_desc++) { + tx_desc->len = 0; + tx_desc->status = 0; + + /* Set TXP_HTXD_UNINIT */ + __raw_writeq((__force u64) cpu_to_be64(0x1122334455667788ULL), + (void __iomem *) txp_desc + C2_TXP_ADDR); + __raw_writew(0, (void __iomem *) txp_desc + C2_TXP_LEN); + __raw_writew((__force u16) cpu_to_be16(TXP_HTXD_UNINIT), + (void __iomem *) txp_desc + C2_TXP_FLAGS); + + elem->skb = NULL; + elem->ht_desc = tx_desc; + elem->hw_desc = txp_desc; + + if (i == tx_ring->count - 1) { + elem->next = tx_ring->start; + tx_desc->next_offset = base; + } else { + elem->next = elem + 1; + tx_desc->next_offset = + base + (i + 1) * sizeof(*tx_desc); + } + } + + tx_ring->to_use = tx_ring->to_clean = tx_ring->start; + + return 0; +} + +/* + * Allocate RX ring elements and chain them together. + * One-to-one association of adapter descriptors with ring elements. + */ +static int c2_rx_ring_alloc(struct c2_ring *rx_ring, void *vaddr, + dma_addr_t base, void __iomem * mmio_rxp_ring) +{ + struct c2_rx_desc *rx_desc; + struct c2_rxp_desc __iomem *rxp_desc; + struct c2_element *elem; + int i; + + rx_ring->start = kmalloc(sizeof(*elem) * rx_ring->count, GFP_KERNEL); + if (!rx_ring->start) + return -ENOMEM; + + elem = rx_ring->start; + rx_desc = vaddr; + rxp_desc = mmio_rxp_ring; + for (i = 0; i < rx_ring->count; i++, elem++, rx_desc++, rxp_desc++) { + rx_desc->len = 0; + rx_desc->status = 0; + + /* Set RXP_HRXD_UNINIT */ + __raw_writew((__force u16) cpu_to_be16(RXP_HRXD_OK), + (void __iomem *) rxp_desc + C2_RXP_STATUS); + __raw_writew(0, (void __iomem *) rxp_desc + C2_RXP_COUNT); + __raw_writew(0, (void __iomem *) rxp_desc + C2_RXP_LEN); + __raw_writeq((__force u64) cpu_to_be64(0x99aabbccddeeffULL), + (void __iomem *) rxp_desc + C2_RXP_ADDR); + __raw_writew((__force u16) cpu_to_be16(RXP_HRXD_UNINIT), + (void __iomem *) rxp_desc + C2_RXP_FLAGS); + + elem->skb = NULL; + elem->ht_desc = rx_desc; + elem->hw_desc = rxp_desc; + + if (i == rx_ring->count - 1) { + elem->next = rx_ring->start; + rx_desc->next_offset = base; + } else { + elem->next = elem + 1; + rx_desc->next_offset = + base + (i + 1) * sizeof(*rx_desc); + } + } + + rx_ring->to_use = rx_ring->to_clean = rx_ring->start; + + return 0; +} + +/* Setup buffer for receiving */ +static inline int c2_rx_alloc(struct c2_port *c2_port, struct c2_element *elem) +{ + struct c2_dev *c2dev = c2_port->c2dev; + struct c2_rx_desc *rx_desc = elem->ht_desc; + struct sk_buff *skb; + dma_addr_t mapaddr; + u32 maplen; + struct c2_rxp_hdr *rxp_hdr; + + skb = dev_alloc_skb(c2_port->rx_buf_size); + if (unlikely(!skb)) { + pr_debug("%s: out of memory for receive\n", + c2_port->netdev->name); + return -ENOMEM; + } + + /* Zero out the rxp hdr in the sk_buff */ + memset(skb->data, 0, sizeof(*rxp_hdr)); + + skb->dev = c2_port->netdev; + + maplen = c2_port->rx_buf_size; + mapaddr = + pci_map_single(c2dev->pcidev, skb->data, maplen, + PCI_DMA_FROMDEVICE); + + /* Set the sk_buff RXP_header to RXP_HRXD_READY */ + rxp_hdr = (struct c2_rxp_hdr *) skb->data; + rxp_hdr->flags = RXP_HRXD_READY; + + __raw_writew(0, elem->hw_desc + 
C2_RXP_STATUS); + __raw_writew((__force u16) cpu_to_be16((u16) maplen - sizeof(*rxp_hdr)), + elem->hw_desc + C2_RXP_LEN); + __raw_writeq((__force u64) cpu_to_be64(mapaddr), elem->hw_desc + C2_RXP_ADDR); + __raw_writew((__force u16) cpu_to_be16(RXP_HRXD_READY), + elem->hw_desc + C2_RXP_FLAGS); + + elem->skb = skb; + elem->mapaddr = mapaddr; + elem->maplen = maplen; + rx_desc->len = maplen; + + return 0; +} + +/* + * Allocate buffers for the Rx ring + * For receive: rx_ring.to_clean is next received frame + */ +static int c2_rx_fill(struct c2_port *c2_port) +{ + struct c2_ring *rx_ring = &c2_port->rx_ring; + struct c2_element *elem; + int ret = 0; + + elem = rx_ring->start; + do { + if (c2_rx_alloc(c2_port, elem)) { + ret = 1; + break; + } + } while ((elem = elem->next) != rx_ring->start); + + rx_ring->to_clean = rx_ring->start; + return ret; +} + +/* Free all buffers in RX ring, assumes receiver stopped */ +static void c2_rx_clean(struct c2_port *c2_port) +{ + struct c2_dev *c2dev = c2_port->c2dev; + struct c2_ring *rx_ring = &c2_port->rx_ring; + struct c2_element *elem; + struct c2_rx_desc *rx_desc; + + elem = rx_ring->start; + do { + rx_desc = elem->ht_desc; + rx_desc->len = 0; + + __raw_writew(0, elem->hw_desc + C2_RXP_STATUS); + __raw_writew(0, elem->hw_desc + C2_RXP_COUNT); + __raw_writew(0, elem->hw_desc + C2_RXP_LEN); + __raw_writeq((__force u64) cpu_to_be64(0x99aabbccddeeffULL), + elem->hw_desc + C2_RXP_ADDR); + __raw_writew((__force u16) cpu_to_be16(RXP_HRXD_UNINIT), + elem->hw_desc + C2_RXP_FLAGS); + + if (elem->skb) { + pci_unmap_single(c2dev->pcidev, elem->mapaddr, + elem->maplen, PCI_DMA_FROMDEVICE); + dev_kfree_skb(elem->skb); + elem->skb = NULL; + } + } while ((elem = elem->next) != rx_ring->start); +} + +static inline int c2_tx_free(struct c2_dev *c2dev, struct c2_element *elem) +{ + struct c2_tx_desc *tx_desc = elem->ht_desc; + + tx_desc->len = 0; + + pci_unmap_single(c2dev->pcidev, elem->mapaddr, elem->maplen, + PCI_DMA_TODEVICE); + + if (elem->skb) { + dev_kfree_skb_any(elem->skb); + elem->skb = NULL; + } + + return 0; +} + +/* Free all buffers in TX ring, assumes transmitter stopped */ +static void c2_tx_clean(struct c2_port *c2_port) +{ + struct c2_ring *tx_ring = &c2_port->tx_ring; + struct c2_element *elem; + struct c2_txp_desc txp_htxd; + int retry; + unsigned long flags; + + spin_lock_irqsave(&c2_port->tx_lock, flags); + + elem = tx_ring->start; + + do { + retry = 0; + do { + txp_htxd.flags = + readw(elem->hw_desc + C2_TXP_FLAGS); + + if (txp_htxd.flags == TXP_HTXD_READY) { + retry = 1; + __raw_writew(0, + elem->hw_desc + C2_TXP_LEN); + __raw_writeq(0, + elem->hw_desc + C2_TXP_ADDR); + __raw_writew((__force u16) cpu_to_be16(TXP_HTXD_DONE), + elem->hw_desc + C2_TXP_FLAGS); + c2_port->netdev->stats.tx_dropped++; + break; + } else { + __raw_writew(0, + elem->hw_desc + C2_TXP_LEN); + __raw_writeq((__force u64) cpu_to_be64(0x1122334455667788ULL), + elem->hw_desc + C2_TXP_ADDR); + __raw_writew((__force u16) cpu_to_be16(TXP_HTXD_UNINIT), + elem->hw_desc + C2_TXP_FLAGS); + } + + c2_tx_free(c2_port->c2dev, elem); + + } while ((elem = elem->next) != tx_ring->start); + } while (retry); + + c2_port->tx_avail = c2_port->tx_ring.count - 1; + c2_port->c2dev->cur_tx = tx_ring->to_use - tx_ring->start; + + if (c2_port->tx_avail > MAX_SKB_FRAGS + 1) + netif_wake_queue(c2_port->netdev); + + spin_unlock_irqrestore(&c2_port->tx_lock, flags); +} + +/* + * Process transmit descriptors marked 'DONE' by the firmware, + * freeing up their unneeded sk_buffs. 
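+ * Walks the ring from to_clean towards to_use and stops at the first
+ * descriptor whose flags are not TXP_HTXD_DONE; the netdev queue is
+ * woken again once more than MAX_SKB_FRAGS + 1 descriptors are free.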
+ */ +static void c2_tx_interrupt(struct net_device *netdev) +{ + struct c2_port *c2_port = netdev_priv(netdev); + struct c2_dev *c2dev = c2_port->c2dev; + struct c2_ring *tx_ring = &c2_port->tx_ring; + struct c2_element *elem; + struct c2_txp_desc txp_htxd; + + spin_lock(&c2_port->tx_lock); + + for (elem = tx_ring->to_clean; elem != tx_ring->to_use; + elem = elem->next) { + txp_htxd.flags = + be16_to_cpu((__force __be16) readw(elem->hw_desc + C2_TXP_FLAGS)); + + if (txp_htxd.flags != TXP_HTXD_DONE) + break; + + if (netif_msg_tx_done(c2_port)) { + /* PCI reads are expensive in fast path */ + txp_htxd.len = + be16_to_cpu((__force __be16) readw(elem->hw_desc + C2_TXP_LEN)); + pr_debug("%s: tx done slot %3Zu status 0x%x len " + "%5u bytes\n", + netdev->name, elem - tx_ring->start, + txp_htxd.flags, txp_htxd.len); + } + + c2_tx_free(c2dev, elem); + ++(c2_port->tx_avail); + } + + tx_ring->to_clean = elem; + + if (netif_queue_stopped(netdev) + && c2_port->tx_avail > MAX_SKB_FRAGS + 1) + netif_wake_queue(netdev); + + spin_unlock(&c2_port->tx_lock); +} + +static void c2_rx_error(struct c2_port *c2_port, struct c2_element *elem) +{ + struct c2_rx_desc *rx_desc = elem->ht_desc; + struct c2_rxp_hdr *rxp_hdr = (struct c2_rxp_hdr *) elem->skb->data; + + if (rxp_hdr->status != RXP_HRXD_OK || + rxp_hdr->len > (rx_desc->len - sizeof(*rxp_hdr))) { + pr_debug("BAD RXP_HRXD\n"); + pr_debug(" rx_desc : %p\n", rx_desc); + pr_debug(" index : %Zu\n", + elem - c2_port->rx_ring.start); + pr_debug(" len : %u\n", rx_desc->len); + pr_debug(" rxp_hdr : %p [PA %p]\n", rxp_hdr, + (void *) __pa((unsigned long) rxp_hdr)); + pr_debug(" flags : 0x%x\n", rxp_hdr->flags); + pr_debug(" status: 0x%x\n", rxp_hdr->status); + pr_debug(" len : %u\n", rxp_hdr->len); + pr_debug(" rsvd : 0x%x\n", rxp_hdr->rsvd); + } + + /* Setup the skb for reuse since we're dropping this pkt */ + elem->skb->data = elem->skb->head; + skb_reset_tail_pointer(elem->skb); + + /* Zero out the rxp hdr in the sk_buff */ + memset(elem->skb->data, 0, sizeof(*rxp_hdr)); + + /* Write the descriptor to the adapter's rx ring */ + __raw_writew(0, elem->hw_desc + C2_RXP_STATUS); + __raw_writew(0, elem->hw_desc + C2_RXP_COUNT); + __raw_writew((__force u16) cpu_to_be16((u16) elem->maplen - sizeof(*rxp_hdr)), + elem->hw_desc + C2_RXP_LEN); + __raw_writeq((__force u64) cpu_to_be64(elem->mapaddr), + elem->hw_desc + C2_RXP_ADDR); + __raw_writew((__force u16) cpu_to_be16(RXP_HRXD_READY), + elem->hw_desc + C2_RXP_FLAGS); + + pr_debug("packet dropped\n"); + c2_port->netdev->stats.rx_dropped++; +} + +static void c2_rx_interrupt(struct net_device *netdev) +{ + struct c2_port *c2_port = netdev_priv(netdev); + struct c2_dev *c2dev = c2_port->c2dev; + struct c2_ring *rx_ring = &c2_port->rx_ring; + struct c2_element *elem; + struct c2_rx_desc *rx_desc; + struct c2_rxp_hdr *rxp_hdr; + struct sk_buff *skb; + dma_addr_t mapaddr; + u32 maplen, buflen; + unsigned long flags; + + spin_lock_irqsave(&c2dev->lock, flags); + + /* Begin where we left off */ + rx_ring->to_clean = rx_ring->start + c2dev->cur_rx; + + for (elem = rx_ring->to_clean; elem->next != rx_ring->to_clean; + elem = elem->next) { + rx_desc = elem->ht_desc; + mapaddr = elem->mapaddr; + maplen = elem->maplen; + skb = elem->skb; + rxp_hdr = (struct c2_rxp_hdr *) skb->data; + + if (rxp_hdr->flags != RXP_HRXD_DONE) + break; + buflen = rxp_hdr->len; + + /* Sanity check the RXP header */ + if (rxp_hdr->status != RXP_HRXD_OK || + buflen > (rx_desc->len - sizeof(*rxp_hdr))) { + c2_rx_error(c2_port, elem); + continue; + } + + /* + 
* Allocate and map a new skb for replenishing the host + * RX desc + */ + if (c2_rx_alloc(c2_port, elem)) { + c2_rx_error(c2_port, elem); + continue; + } + + /* Unmap the old skb */ + pci_unmap_single(c2dev->pcidev, mapaddr, maplen, + PCI_DMA_FROMDEVICE); + + prefetch(skb->data); + + /* + * Skip past the leading 8 bytes comprising of the + * "struct c2_rxp_hdr", prepended by the adapter + * to the usual Ethernet header ("struct ethhdr"), + * to the start of the raw Ethernet packet. + * + * Fix up the various fields in the sk_buff before + * passing it up to netif_rx(). The transfer size + * (in bytes) specified by the adapter len field of + * the "struct rxp_hdr_t" does NOT include the + * "sizeof(struct c2_rxp_hdr)". + */ + skb->data += sizeof(*rxp_hdr); + skb_set_tail_pointer(skb, buflen); + skb->len = buflen; + skb->protocol = eth_type_trans(skb, netdev); + + netif_rx(skb); + + netdev->stats.rx_packets++; + netdev->stats.rx_bytes += buflen; + } + + /* Save where we left off */ + rx_ring->to_clean = elem; + c2dev->cur_rx = elem - rx_ring->start; + C2_SET_CUR_RX(c2dev, c2dev->cur_rx); + + spin_unlock_irqrestore(&c2dev->lock, flags); +} + +/* + * Handle netisr0 TX & RX interrupts. + */ +static irqreturn_t c2_interrupt(int irq, void *dev_id) +{ + unsigned int netisr0, dmaisr; + int handled = 0; + struct c2_dev *c2dev = (struct c2_dev *) dev_id; + + /* Process CCILNET interrupts */ + netisr0 = readl(c2dev->regs + C2_NISR0); + if (netisr0) { + + /* + * There is an issue with the firmware that always + * provides the status of RX for both TX & RX + * interrupts. So process both queues here. + */ + c2_rx_interrupt(c2dev->netdev); + c2_tx_interrupt(c2dev->netdev); + + /* Clear the interrupt */ + writel(netisr0, c2dev->regs + C2_NISR0); + handled++; + } + + /* Process RNIC interrupts */ + dmaisr = readl(c2dev->regs + C2_DISR); + if (dmaisr) { + writel(dmaisr, c2dev->regs + C2_DISR); + c2_rnic_interrupt(c2dev); + handled++; + } + + if (handled) { + return IRQ_HANDLED; + } else { + return IRQ_NONE; + } +} + +static int c2_up(struct net_device *netdev) +{ + struct c2_port *c2_port = netdev_priv(netdev); + struct c2_dev *c2dev = c2_port->c2dev; + struct c2_element *elem; + struct c2_rxp_hdr *rxp_hdr; + struct in_device *in_dev; + size_t rx_size, tx_size; + int ret, i; + unsigned int netimr0; + + if (netif_msg_ifup(c2_port)) + pr_debug("%s: enabling interface\n", netdev->name); + + /* Set the Rx buffer size based on MTU */ + c2_set_rxbufsize(c2_port); + + /* Allocate DMA'able memory for Tx/Rx host descriptor rings */ + rx_size = c2_port->rx_ring.count * sizeof(struct c2_rx_desc); + tx_size = c2_port->tx_ring.count * sizeof(struct c2_tx_desc); + + c2_port->mem_size = tx_size + rx_size; + c2_port->mem = pci_zalloc_consistent(c2dev->pcidev, c2_port->mem_size, + &c2_port->dma); + if (c2_port->mem == NULL) { + pr_debug("Unable to allocate memory for " + "host descriptor rings\n"); + return -ENOMEM; + } + + /* Create the Rx host descriptor ring */ + if ((ret = + c2_rx_ring_alloc(&c2_port->rx_ring, c2_port->mem, c2_port->dma, + c2dev->mmio_rxp_ring))) { + pr_debug("Unable to create RX ring\n"); + goto bail0; + } + + /* Allocate Rx buffers for the host descriptor ring */ + if (c2_rx_fill(c2_port)) { + pr_debug("Unable to fill RX ring\n"); + goto bail1; + } + + /* Create the Tx host descriptor ring */ + if ((ret = c2_tx_ring_alloc(&c2_port->tx_ring, c2_port->mem + rx_size, + c2_port->dma + rx_size, + c2dev->mmio_txp_ring))) { + pr_debug("Unable to create TX ring\n"); + goto bail1; + } + + /* Set the TX pointer 
to where we left off */ + c2_port->tx_avail = c2_port->tx_ring.count - 1; + c2_port->tx_ring.to_use = c2_port->tx_ring.to_clean = + c2_port->tx_ring.start + c2dev->cur_tx; + + /* missing: Initialize MAC */ + + BUG_ON(c2_port->tx_ring.to_use != c2_port->tx_ring.to_clean); + + /* Reset the adapter, ensures the driver is in sync with the RXP */ + c2_reset(c2_port); + + /* Reset the READY bit in the sk_buff RXP headers & adapter HRXDQ */ + for (i = 0, elem = c2_port->rx_ring.start; i < c2_port->rx_ring.count; + i++, elem++) { + rxp_hdr = (struct c2_rxp_hdr *) elem->skb->data; + rxp_hdr->flags = 0; + __raw_writew((__force u16) cpu_to_be16(RXP_HRXD_READY), + elem->hw_desc + C2_RXP_FLAGS); + } + + /* Enable network packets */ + netif_start_queue(netdev); + + /* Enable IRQ */ + writel(0, c2dev->regs + C2_IDIS); + netimr0 = readl(c2dev->regs + C2_NIMR0); + netimr0 &= ~(C2_PCI_HTX_INT | C2_PCI_HRX_INT); + writel(netimr0, c2dev->regs + C2_NIMR0); + + /* Tell the stack to ignore arp requests for ipaddrs bound to + * other interfaces. This is needed to prevent the host stack + * from responding to arp requests to the ipaddr bound on the + * rdma interface. + */ + in_dev = in_dev_get(netdev); + IN_DEV_CONF_SET(in_dev, ARP_IGNORE, 1); + in_dev_put(in_dev); + + return 0; + + bail1: + c2_rx_clean(c2_port); + kfree(c2_port->rx_ring.start); + + bail0: + pci_free_consistent(c2dev->pcidev, c2_port->mem_size, c2_port->mem, + c2_port->dma); + + return ret; +} + +static int c2_down(struct net_device *netdev) +{ + struct c2_port *c2_port = netdev_priv(netdev); + struct c2_dev *c2dev = c2_port->c2dev; + + if (netif_msg_ifdown(c2_port)) + pr_debug("%s: disabling interface\n", + netdev->name); + + /* Wait for all the queued packets to get sent */ + c2_tx_interrupt(netdev); + + /* Disable network packets */ + netif_stop_queue(netdev); + + /* Disable IRQs by clearing the interrupt mask */ + writel(1, c2dev->regs + C2_IDIS); + writel(0, c2dev->regs + C2_NIMR0); + + /* missing: Stop transmitter */ + + /* missing: Stop receiver */ + + /* Reset the adapter, ensures the driver is in sync with the RXP */ + c2_reset(c2_port); + + /* missing: Turn off LEDs here */ + + /* Free all buffers in the host descriptor rings */ + c2_tx_clean(c2_port); + c2_rx_clean(c2_port); + + /* Free the host descriptor rings */ + kfree(c2_port->rx_ring.start); + kfree(c2_port->tx_ring.start); + pci_free_consistent(c2dev->pcidev, c2_port->mem_size, c2_port->mem, + c2_port->dma); + + return 0; +} + +static void c2_reset(struct c2_port *c2_port) +{ + struct c2_dev *c2dev = c2_port->c2dev; + unsigned int cur_rx = c2dev->cur_rx; + + /* Tell the hardware to quiesce */ + C2_SET_CUR_RX(c2dev, cur_rx | C2_PCI_HRX_QUI); + + /* + * The hardware will reset the C2_PCI_HRX_QUI bit once + * the RXP is quiesced. Wait 2 seconds for this. 
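+ * If the bit is still set after the wait, the failure is only reported
+ * via pr_debug(); the cached cur_rx value has the bit cleared and the
+ * driver carries on regardless.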
+ */ + ssleep(2); + + cur_rx = C2_GET_CUR_RX(c2dev); + + if (cur_rx & C2_PCI_HRX_QUI) + pr_debug("c2_reset: failed to quiesce the hardware!\n"); + + cur_rx &= ~C2_PCI_HRX_QUI; + + c2dev->cur_rx = cur_rx; + + pr_debug("Current RX: %u\n", c2dev->cur_rx); +} + +static int c2_xmit_frame(struct sk_buff *skb, struct net_device *netdev) +{ + struct c2_port *c2_port = netdev_priv(netdev); + struct c2_dev *c2dev = c2_port->c2dev; + struct c2_ring *tx_ring = &c2_port->tx_ring; + struct c2_element *elem; + dma_addr_t mapaddr; + u32 maplen; + unsigned long flags; + unsigned int i; + + spin_lock_irqsave(&c2_port->tx_lock, flags); + + if (unlikely(c2_port->tx_avail < (skb_shinfo(skb)->nr_frags + 1))) { + netif_stop_queue(netdev); + spin_unlock_irqrestore(&c2_port->tx_lock, flags); + + pr_debug("%s: Tx ring full when queue awake!\n", + netdev->name); + return NETDEV_TX_BUSY; + } + + maplen = skb_headlen(skb); + mapaddr = + pci_map_single(c2dev->pcidev, skb->data, maplen, PCI_DMA_TODEVICE); + + elem = tx_ring->to_use; + elem->skb = skb; + elem->mapaddr = mapaddr; + elem->maplen = maplen; + + /* Tell HW to xmit */ + __raw_writeq((__force u64) cpu_to_be64(mapaddr), + elem->hw_desc + C2_TXP_ADDR); + __raw_writew((__force u16) cpu_to_be16(maplen), + elem->hw_desc + C2_TXP_LEN); + __raw_writew((__force u16) cpu_to_be16(TXP_HTXD_READY), + elem->hw_desc + C2_TXP_FLAGS); + + netdev->stats.tx_packets++; + netdev->stats.tx_bytes += maplen; + + /* Loop thru additional data fragments and queue them */ + if (skb_shinfo(skb)->nr_frags) { + for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { + const skb_frag_t *frag = &skb_shinfo(skb)->frags[i]; + maplen = skb_frag_size(frag); + mapaddr = skb_frag_dma_map(&c2dev->pcidev->dev, frag, + 0, maplen, DMA_TO_DEVICE); + elem = elem->next; + elem->skb = NULL; + elem->mapaddr = mapaddr; + elem->maplen = maplen; + + /* Tell HW to xmit */ + __raw_writeq((__force u64) cpu_to_be64(mapaddr), + elem->hw_desc + C2_TXP_ADDR); + __raw_writew((__force u16) cpu_to_be16(maplen), + elem->hw_desc + C2_TXP_LEN); + __raw_writew((__force u16) cpu_to_be16(TXP_HTXD_READY), + elem->hw_desc + C2_TXP_FLAGS); + + netdev->stats.tx_packets++; + netdev->stats.tx_bytes += maplen; + } + } + + tx_ring->to_use = elem->next; + c2_port->tx_avail -= (skb_shinfo(skb)->nr_frags + 1); + + if (c2_port->tx_avail <= MAX_SKB_FRAGS + 1) { + netif_stop_queue(netdev); + if (netif_msg_tx_queued(c2_port)) + pr_debug("%s: transmit queue full\n", + netdev->name); + } + + spin_unlock_irqrestore(&c2_port->tx_lock, flags); + + netdev->trans_start = jiffies; + + return NETDEV_TX_OK; +} + +static void c2_tx_timeout(struct net_device *netdev) +{ + struct c2_port *c2_port = netdev_priv(netdev); + + if (netif_msg_timer(c2_port)) + pr_debug("%s: tx timeout\n", netdev->name); + + c2_tx_clean(c2_port); +} + +static int c2_change_mtu(struct net_device *netdev, int new_mtu) +{ + int ret = 0; + + if (new_mtu < ETH_ZLEN || new_mtu > ETH_JUMBO_MTU) + return -EINVAL; + + netdev->mtu = new_mtu; + + if (netif_running(netdev)) { + c2_down(netdev); + + c2_up(netdev); + } + + return ret; +} + +static const struct net_device_ops c2_netdev = { + .ndo_open = c2_up, + .ndo_stop = c2_down, + .ndo_start_xmit = c2_xmit_frame, + .ndo_tx_timeout = c2_tx_timeout, + .ndo_change_mtu = c2_change_mtu, + .ndo_set_mac_address = eth_mac_addr, + .ndo_validate_addr = eth_validate_addr, +}; + +/* Initialize network device */ +static struct net_device *c2_devinit(struct c2_dev *c2dev, + void __iomem * mmio_addr) +{ + struct c2_port *c2_port = NULL; + struct net_device 
*netdev = alloc_etherdev(sizeof(*c2_port)); + + if (!netdev) { + pr_debug("c2_port etherdev alloc failed"); + return NULL; + } + + SET_NETDEV_DEV(netdev, &c2dev->pcidev->dev); + + netdev->netdev_ops = &c2_netdev; + netdev->watchdog_timeo = C2_TX_TIMEOUT; + netdev->irq = c2dev->pcidev->irq; + + c2_port = netdev_priv(netdev); + c2_port->netdev = netdev; + c2_port->c2dev = c2dev; + c2_port->msg_enable = netif_msg_init(debug, default_msg); + c2_port->tx_ring.count = C2_NUM_TX_DESC; + c2_port->rx_ring.count = C2_NUM_RX_DESC; + + spin_lock_init(&c2_port->tx_lock); + + /* Copy our 48-bit ethernet hardware address */ + memcpy_fromio(netdev->dev_addr, mmio_addr + C2_REGS_ENADDR, 6); + + /* Validate the MAC address */ + if (!is_valid_ether_addr(netdev->dev_addr)) { + pr_debug("Invalid MAC Address\n"); + c2_print_macaddr(netdev); + free_netdev(netdev); + return NULL; + } + + c2dev->netdev = netdev; + + return netdev; +} + +static int c2_probe(struct pci_dev *pcidev, const struct pci_device_id *ent) +{ + int ret = 0, i; + unsigned long reg0_start, reg0_flags, reg0_len; + unsigned long reg2_start, reg2_flags, reg2_len; + unsigned long reg4_start, reg4_flags, reg4_len; + unsigned kva_map_size; + struct net_device *netdev = NULL; + struct c2_dev *c2dev = NULL; + void __iomem *mmio_regs = NULL; + + printk(KERN_INFO PFX "AMSO1100 Gigabit Ethernet driver v%s loaded\n", + DRV_VERSION); + + /* Enable PCI device */ + ret = pci_enable_device(pcidev); + if (ret) { + printk(KERN_ERR PFX "%s: Unable to enable PCI device\n", + pci_name(pcidev)); + goto bail0; + } + + reg0_start = pci_resource_start(pcidev, BAR_0); + reg0_len = pci_resource_len(pcidev, BAR_0); + reg0_flags = pci_resource_flags(pcidev, BAR_0); + + reg2_start = pci_resource_start(pcidev, BAR_2); + reg2_len = pci_resource_len(pcidev, BAR_2); + reg2_flags = pci_resource_flags(pcidev, BAR_2); + + reg4_start = pci_resource_start(pcidev, BAR_4); + reg4_len = pci_resource_len(pcidev, BAR_4); + reg4_flags = pci_resource_flags(pcidev, BAR_4); + + pr_debug("BAR0 size = 0x%lX bytes\n", reg0_len); + pr_debug("BAR2 size = 0x%lX bytes\n", reg2_len); + pr_debug("BAR4 size = 0x%lX bytes\n", reg4_len); + + /* Make sure PCI base addr are MMIO */ + if (!(reg0_flags & IORESOURCE_MEM) || + !(reg2_flags & IORESOURCE_MEM) || !(reg4_flags & IORESOURCE_MEM)) { + printk(KERN_ERR PFX "PCI regions not an MMIO resource\n"); + ret = -ENODEV; + goto bail1; + } + + /* Check for weird/broken PCI region reporting */ + if ((reg0_len < C2_REG0_SIZE) || + (reg2_len < C2_REG2_SIZE) || (reg4_len < C2_REG4_SIZE)) { + printk(KERN_ERR PFX "Invalid PCI region sizes\n"); + ret = -ENODEV; + goto bail1; + } + + /* Reserve PCI I/O and memory resources */ + ret = pci_request_regions(pcidev, DRV_NAME); + if (ret) { + printk(KERN_ERR PFX "%s: Unable to request regions\n", + pci_name(pcidev)); + goto bail1; + } + + if ((sizeof(dma_addr_t) > 4)) { + ret = pci_set_dma_mask(pcidev, DMA_BIT_MASK(64)); + if (ret < 0) { + printk(KERN_ERR PFX "64b DMA configuration failed\n"); + goto bail2; + } + } else { + ret = pci_set_dma_mask(pcidev, DMA_BIT_MASK(32)); + if (ret < 0) { + printk(KERN_ERR PFX "32b DMA configuration failed\n"); + goto bail2; + } + } + + /* Enables bus-mastering on the device */ + pci_set_master(pcidev); + + /* Remap the adapter PCI registers in BAR4 */ + mmio_regs = ioremap_nocache(reg4_start + C2_PCI_REGS_OFFSET, + sizeof(struct c2_adapter_pci_regs)); + if (!mmio_regs) { + printk(KERN_ERR PFX + "Unable to remap adapter PCI registers in BAR4\n"); + ret = -EIO; + goto bail2; + } + + /* 
Validate PCI regs magic */ + for (i = 0; i < sizeof(c2_magic); i++) { + if (c2_magic[i] != readb(mmio_regs + C2_REGS_MAGIC + i)) { + printk(KERN_ERR PFX "Downlevel Firmware boot loader " + "[%d/%Zd: got 0x%x, exp 0x%x]. Use the cc_flash " + "utility to update your boot loader\n", + i + 1, sizeof(c2_magic), + readb(mmio_regs + C2_REGS_MAGIC + i), + c2_magic[i]); + printk(KERN_ERR PFX "Adapter not claimed\n"); + iounmap(mmio_regs); + ret = -EIO; + goto bail2; + } + } + + /* Validate the adapter version */ + if (be32_to_cpu((__force __be32) readl(mmio_regs + C2_REGS_VERS)) != C2_VERSION) { + printk(KERN_ERR PFX "Version mismatch " + "[fw=%u, c2=%u], Adapter not claimed\n", + be32_to_cpu((__force __be32) readl(mmio_regs + C2_REGS_VERS)), + C2_VERSION); + ret = -EINVAL; + iounmap(mmio_regs); + goto bail2; + } + + /* Validate the adapter IVN */ + if (be32_to_cpu((__force __be32) readl(mmio_regs + C2_REGS_IVN)) != C2_IVN) { + printk(KERN_ERR PFX "Downlevel FIrmware level. You should be using " + "the OpenIB device support kit. " + "[fw=0x%x, c2=0x%x], Adapter not claimed\n", + be32_to_cpu((__force __be32) readl(mmio_regs + C2_REGS_IVN)), + C2_IVN); + ret = -EINVAL; + iounmap(mmio_regs); + goto bail2; + } + + /* Allocate hardware structure */ + c2dev = (struct c2_dev *) ib_alloc_device(sizeof(*c2dev)); + if (!c2dev) { + printk(KERN_ERR PFX "%s: Unable to alloc hardware struct\n", + pci_name(pcidev)); + ret = -ENOMEM; + iounmap(mmio_regs); + goto bail2; + } + + memset(c2dev, 0, sizeof(*c2dev)); + spin_lock_init(&c2dev->lock); + c2dev->pcidev = pcidev; + c2dev->cur_tx = 0; + + /* Get the last RX index */ + c2dev->cur_rx = + (be32_to_cpu((__force __be32) readl(mmio_regs + C2_REGS_HRX_CUR)) - + 0xffffc000) / sizeof(struct c2_rxp_desc); + + /* Request an interrupt line for the driver */ + ret = request_irq(pcidev->irq, c2_interrupt, IRQF_SHARED, DRV_NAME, c2dev); + if (ret) { + printk(KERN_ERR PFX "%s: requested IRQ %u is busy\n", + pci_name(pcidev), pcidev->irq); + iounmap(mmio_regs); + goto bail3; + } + + /* Set driver specific data */ + pci_set_drvdata(pcidev, c2dev); + + /* Initialize network device */ + if ((netdev = c2_devinit(c2dev, mmio_regs)) == NULL) { + ret = -ENOMEM; + iounmap(mmio_regs); + goto bail4; + } + + /* Save off the actual size prior to unmapping mmio_regs */ + kva_map_size = be32_to_cpu((__force __be32) readl(mmio_regs + C2_REGS_PCI_WINSIZE)); + + /* Unmap the adapter PCI registers in BAR4 */ + iounmap(mmio_regs); + + /* Register network device */ + ret = register_netdev(netdev); + if (ret) { + printk(KERN_ERR PFX "Unable to register netdev, ret = %d\n", + ret); + goto bail5; + } + + /* Disable network packets */ + netif_stop_queue(netdev); + + /* Remap the adapter HRXDQ PA space to kernel VA space */ + c2dev->mmio_rxp_ring = ioremap_nocache(reg4_start + C2_RXP_HRXDQ_OFFSET, + C2_RXP_HRXDQ_SIZE); + if (!c2dev->mmio_rxp_ring) { + printk(KERN_ERR PFX "Unable to remap MMIO HRXDQ region\n"); + ret = -EIO; + goto bail6; + } + + /* Remap the adapter HTXDQ PA space to kernel VA space */ + c2dev->mmio_txp_ring = ioremap_nocache(reg4_start + C2_TXP_HTXDQ_OFFSET, + C2_TXP_HTXDQ_SIZE); + if (!c2dev->mmio_txp_ring) { + printk(KERN_ERR PFX "Unable to remap MMIO HTXDQ region\n"); + ret = -EIO; + goto bail7; + } + + /* Save off the current RX index in the last 4 bytes of the TXP Ring */ + C2_SET_CUR_RX(c2dev, c2dev->cur_rx); + + /* Remap the PCI registers in adapter BAR0 to kernel VA space */ + c2dev->regs = ioremap_nocache(reg0_start, reg0_len); + if (!c2dev->regs) { + printk(KERN_ERR PFX 
"Unable to remap BAR0\n"); + ret = -EIO; + goto bail8; + } + + /* Remap the PCI registers in adapter BAR4 to kernel VA space */ + c2dev->pa = reg4_start + C2_PCI_REGS_OFFSET; + c2dev->kva = ioremap_nocache(reg4_start + C2_PCI_REGS_OFFSET, + kva_map_size); + if (!c2dev->kva) { + printk(KERN_ERR PFX "Unable to remap BAR4\n"); + ret = -EIO; + goto bail9; + } + + /* Print out the MAC address */ + c2_print_macaddr(netdev); + + ret = c2_rnic_init(c2dev); + if (ret) { + printk(KERN_ERR PFX "c2_rnic_init failed: %d\n", ret); + goto bail10; + } + + ret = c2_register_device(c2dev); + if (ret) + goto bail10; + + return 0; + + bail10: + iounmap(c2dev->kva); + + bail9: + iounmap(c2dev->regs); + + bail8: + iounmap(c2dev->mmio_txp_ring); + + bail7: + iounmap(c2dev->mmio_rxp_ring); + + bail6: + unregister_netdev(netdev); + + bail5: + free_netdev(netdev); + + bail4: + free_irq(pcidev->irq, c2dev); + + bail3: + ib_dealloc_device(&c2dev->ibdev); + + bail2: + pci_release_regions(pcidev); + + bail1: + pci_disable_device(pcidev); + + bail0: + return ret; +} + +static void c2_remove(struct pci_dev *pcidev) +{ + struct c2_dev *c2dev = pci_get_drvdata(pcidev); + struct net_device *netdev = c2dev->netdev; + + /* Unregister with OpenIB */ + c2_unregister_device(c2dev); + + /* Clean up the RNIC resources */ + c2_rnic_term(c2dev); + + /* Remove network device from the kernel */ + unregister_netdev(netdev); + + /* Free network device */ + free_netdev(netdev); + + /* Free the interrupt line */ + free_irq(pcidev->irq, c2dev); + + /* missing: Turn LEDs off here */ + + /* Unmap adapter PA space */ + iounmap(c2dev->kva); + iounmap(c2dev->regs); + iounmap(c2dev->mmio_txp_ring); + iounmap(c2dev->mmio_rxp_ring); + + /* Free the hardware structure */ + ib_dealloc_device(&c2dev->ibdev); + + /* Release reserved PCI I/O and memory resources */ + pci_release_regions(pcidev); + + /* Disable PCI device */ + pci_disable_device(pcidev); + + /* Clear driver specific data */ + pci_set_drvdata(pcidev, NULL); +} + +static struct pci_driver c2_pci_driver = { + .name = DRV_NAME, + .id_table = c2_pci_table, + .probe = c2_probe, + .remove = c2_remove, +}; + +module_pci_driver(c2_pci_driver); diff --git a/kernel/drivers/infiniband/hw/amso1100/c2.h b/kernel/drivers/infiniband/hw/amso1100/c2.h new file mode 100644 index 000000000..d619d7358 --- /dev/null +++ b/kernel/drivers/infiniband/hw/amso1100/c2.h @@ -0,0 +1,547 @@ +/* + * Copyright (c) 2005 Ammasso, Inc. All rights reserved. + * Copyright (c) 2005 Open Grid Computing, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef __C2_H +#define __C2_H + +#include <linux/netdevice.h> +#include <linux/spinlock.h> +#include <linux/kernel.h> +#include <linux/pci.h> +#include <linux/dma-mapping.h> +#include <linux/idr.h> + +#include "c2_provider.h" +#include "c2_mq.h" +#include "c2_status.h" + +#define DRV_NAME "c2" +#define DRV_VERSION "1.1" +#define PFX DRV_NAME ": " + +#define BAR_0 0 +#define BAR_2 2 +#define BAR_4 4 + +#define RX_BUF_SIZE (1536 + 8) +#define ETH_JUMBO_MTU 9000 +#define C2_MAGIC "CEPHEUS" +#define C2_VERSION 4 +#define C2_IVN (18 & 0x7fffffff) + +#define C2_REG0_SIZE (16 * 1024) +#define C2_REG2_SIZE (2 * 1024 * 1024) +#define C2_REG4_SIZE (256 * 1024 * 1024) +#define C2_NUM_TX_DESC 341 +#define C2_NUM_RX_DESC 256 +#define C2_PCI_REGS_OFFSET (0x10000) +#define C2_RXP_HRXDQ_OFFSET (((C2_REG4_SIZE)/2)) +#define C2_RXP_HRXDQ_SIZE (4096) +#define C2_TXP_HTXDQ_OFFSET (((C2_REG4_SIZE)/2) + C2_RXP_HRXDQ_SIZE) +#define C2_TXP_HTXDQ_SIZE (4096) +#define C2_TX_TIMEOUT (6*HZ) + +/* CEPHEUS */ +static const u8 c2_magic[] = { + 0x43, 0x45, 0x50, 0x48, 0x45, 0x55, 0x53 +}; + +enum adapter_pci_regs { + C2_REGS_MAGIC = 0x0000, + C2_REGS_VERS = 0x0008, + C2_REGS_IVN = 0x000C, + C2_REGS_PCI_WINSIZE = 0x0010, + C2_REGS_Q0_QSIZE = 0x0014, + C2_REGS_Q0_MSGSIZE = 0x0018, + C2_REGS_Q0_POOLSTART = 0x001C, + C2_REGS_Q0_SHARED = 0x0020, + C2_REGS_Q1_QSIZE = 0x0024, + C2_REGS_Q1_MSGSIZE = 0x0028, + C2_REGS_Q1_SHARED = 0x0030, + C2_REGS_Q2_QSIZE = 0x0034, + C2_REGS_Q2_MSGSIZE = 0x0038, + C2_REGS_Q2_SHARED = 0x0040, + C2_REGS_ENADDR = 0x004C, + C2_REGS_RDMA_ENADDR = 0x0054, + C2_REGS_HRX_CUR = 0x006C, +}; + +struct c2_adapter_pci_regs { + char reg_magic[8]; + u32 version; + u32 ivn; + u32 pci_window_size; + u32 q0_q_size; + u32 q0_msg_size; + u32 q0_pool_start; + u32 q0_shared; + u32 q1_q_size; + u32 q1_msg_size; + u32 q1_pool_start; + u32 q1_shared; + u32 q2_q_size; + u32 q2_msg_size; + u32 q2_pool_start; + u32 q2_shared; + u32 log_start; + u32 log_size; + u8 host_enaddr[8]; + u8 rdma_enaddr[8]; + u32 crash_entry; + u32 crash_ready[2]; + u32 fw_txd_cur; + u32 fw_hrxd_cur; + u32 fw_rxd_cur; +}; + +enum pci_regs { + C2_HISR = 0x0000, + C2_DISR = 0x0004, + C2_HIMR = 0x0008, + C2_DIMR = 0x000C, + C2_NISR0 = 0x0010, + C2_NISR1 = 0x0014, + C2_NIMR0 = 0x0018, + C2_NIMR1 = 0x001C, + C2_IDIS = 0x0020, +}; + +enum { + C2_PCI_HRX_INT = 1 << 8, + C2_PCI_HTX_INT = 1 << 17, + C2_PCI_HRX_QUI = 1 << 31, +}; + +/* + * Cepheus registers in BAR0. 
+ */ +struct c2_pci_regs { + u32 hostisr; + u32 dmaisr; + u32 hostimr; + u32 dmaimr; + u32 netisr0; + u32 netisr1; + u32 netimr0; + u32 netimr1; + u32 int_disable; +}; + +/* TXP flags */ +enum c2_txp_flags { + TXP_HTXD_DONE = 0, + TXP_HTXD_READY = 1 << 0, + TXP_HTXD_UNINIT = 1 << 1, +}; + +/* RXP flags */ +enum c2_rxp_flags { + RXP_HRXD_UNINIT = 0, + RXP_HRXD_READY = 1 << 0, + RXP_HRXD_DONE = 1 << 1, +}; + +/* RXP status */ +enum c2_rxp_status { + RXP_HRXD_ZERO = 0, + RXP_HRXD_OK = 1 << 0, + RXP_HRXD_BUF_OV = 1 << 1, +}; + +/* TXP descriptor fields */ +enum txp_desc { + C2_TXP_FLAGS = 0x0000, + C2_TXP_LEN = 0x0002, + C2_TXP_ADDR = 0x0004, +}; + +/* RXP descriptor fields */ +enum rxp_desc { + C2_RXP_FLAGS = 0x0000, + C2_RXP_STATUS = 0x0002, + C2_RXP_COUNT = 0x0004, + C2_RXP_LEN = 0x0006, + C2_RXP_ADDR = 0x0008, +}; + +struct c2_txp_desc { + u16 flags; + u16 len; + u64 addr; +} __attribute__ ((packed)); + +struct c2_rxp_desc { + u16 flags; + u16 status; + u16 count; + u16 len; + u64 addr; +} __attribute__ ((packed)); + +struct c2_rxp_hdr { + u16 flags; + u16 status; + u16 len; + u16 rsvd; +} __attribute__ ((packed)); + +struct c2_tx_desc { + u32 len; + u32 status; + dma_addr_t next_offset; +}; + +struct c2_rx_desc { + u32 len; + u32 status; + dma_addr_t next_offset; +}; + +struct c2_alloc { + u32 last; + u32 max; + spinlock_t lock; + unsigned long *table; +}; + +struct c2_array { + struct { + void **page; + int used; + } *page_list; +}; + +/* + * The MQ shared pointer pool is organized as a linked list of + * chunks. Each chunk contains a linked list of free shared pointers + * that can be allocated to a given user mode client. + * + */ +struct sp_chunk { + struct sp_chunk *next; + dma_addr_t dma_addr; + DEFINE_DMA_UNMAP_ADDR(mapping); + u16 head; + u16 shared_ptr[0]; +}; + +struct c2_pd_table { + u32 last; + u32 max; + spinlock_t lock; + unsigned long *table; +}; + +struct c2_qp_table { + struct idr idr; + spinlock_t lock; +}; + +struct c2_element { + struct c2_element *next; + void *ht_desc; /* host descriptor */ + void __iomem *hw_desc; /* hardware descriptor */ + struct sk_buff *skb; + dma_addr_t mapaddr; + u32 maplen; +}; + +struct c2_ring { + struct c2_element *to_clean; + struct c2_element *to_use; + struct c2_element *start; + unsigned long count; +}; + +struct c2_dev { + struct ib_device ibdev; + void __iomem *regs; + void __iomem *mmio_txp_ring; /* remapped adapter memory for hw rings */ + void __iomem *mmio_rxp_ring; + spinlock_t lock; + struct pci_dev *pcidev; + struct net_device *netdev; + struct net_device *pseudo_netdev; + unsigned int cur_tx; + unsigned int cur_rx; + u32 adapter_handle; + int device_cap_flags; + void __iomem *kva; /* KVA device memory */ + unsigned long pa; /* PA device memory */ + void **qptr_array; + + struct kmem_cache *host_msg_cache; + + struct list_head cca_link; /* adapter list */ + struct list_head eh_wakeup_list; /* event wakeup list */ + wait_queue_head_t req_vq_wo; + + /* Cached RNIC properties */ + struct ib_device_attr props; + + struct c2_pd_table pd_table; + struct c2_qp_table qp_table; + int ports; /* num of GigE ports */ + int devnum; + spinlock_t vqlock; /* sync vbs req MQ */ + + /* Verbs Queues */ + struct c2_mq req_vq; /* Verbs Request MQ */ + struct c2_mq rep_vq; /* Verbs Reply MQ */ + struct c2_mq aeq; /* Async Events MQ */ + + /* Kernel client MQs */ + struct sp_chunk *kern_mqsp_pool; + + /* Device updates these values when posting messages to a host + * target queue */ + u16 req_vq_shared; + u16 rep_vq_shared; + u16 aeq_shared; + u16 
irq_claimed; + + /* + * Shared host target pages for user-accessible MQs. + */ + int hthead; /* index of first free entry */ + void *htpages; /* kernel vaddr */ + int htlen; /* length of htpages memory */ + void *htuva; /* user mapped vaddr */ + spinlock_t htlock; /* serialize allocation */ + + u64 adapter_hint_uva; /* access to the activity FIFO */ + + // spinlock_t aeq_lock; + // spinlock_t rnic_lock; + + __be16 *hint_count; + dma_addr_t hint_count_dma; + u16 hints_read; + + int init; /* TRUE if it's ready */ + char ae_cache_name[16]; + char vq_cache_name[16]; +}; + +struct c2_port { + u32 msg_enable; + struct c2_dev *c2dev; + struct net_device *netdev; + + spinlock_t tx_lock; + u32 tx_avail; + struct c2_ring tx_ring; + struct c2_ring rx_ring; + + void *mem; /* PCI memory for host rings */ + dma_addr_t dma; + unsigned long mem_size; + + u32 rx_buf_size; +}; + +/* + * Activity FIFO registers in BAR0. + */ +#define PCI_BAR0_HOST_HINT 0x100 +#define PCI_BAR0_ADAPTER_HINT 0x2000 + +/* + * Ammasso PCI vendor id and Cepheus PCI device id. + */ +#define CQ_ARMED 0x01 +#define CQ_WAIT_FOR_DMA 0x80 + +/* + * The format of a hint is as follows: + * Lower 16 bits are the count of hints for the queue. + * Next 15 bits are the qp_index + * Upper most bit depends on who reads it: + * If read by producer, then it means Full (1) or Not-Full (0) + * If read by consumer, then it means Empty (1) or Not-Empty (0) + */ +#define C2_HINT_MAKE(q_index, hint_count) (((q_index) << 16) | hint_count) +#define C2_HINT_GET_INDEX(hint) (((hint) & 0x7FFF0000) >> 16) +#define C2_HINT_GET_COUNT(hint) ((hint) & 0x0000FFFF) + + +/* + * The following defines the offset in SDRAM for the c2_adapter_pci_regs_t + * struct. + */ +#define C2_ADAPTER_PCI_REGS_OFFSET 0x10000 + +#ifndef readq +static inline u64 readq(const void __iomem * addr) +{ + u64 ret = readl(addr + 4); + ret <<= 32; + ret |= readl(addr); + + return ret; +} +#endif + +#ifndef writeq +static inline void __raw_writeq(u64 val, void __iomem * addr) +{ + __raw_writel((u32) (val), addr); + __raw_writel((u32) (val >> 32), (addr + 4)); +} +#endif + +#define C2_SET_CUR_RX(c2dev, cur_rx) \ + __raw_writel((__force u32) cpu_to_be32(cur_rx), c2dev->mmio_txp_ring + 4092) + +#define C2_GET_CUR_RX(c2dev) \ + be32_to_cpu((__force __be32) readl(c2dev->mmio_txp_ring + 4092)) + +static inline struct c2_dev *to_c2dev(struct ib_device *ibdev) +{ + return container_of(ibdev, struct c2_dev, ibdev); +} + +static inline int c2_errno(void *reply) +{ + switch (c2_wr_get_result(reply)) { + case C2_OK: + return 0; + case CCERR_NO_BUFS: + case CCERR_INSUFFICIENT_RESOURCES: + case CCERR_ZERO_RDMA_READ_RESOURCES: + return -ENOMEM; + case CCERR_MR_IN_USE: + case CCERR_QP_IN_USE: + return -EBUSY; + case CCERR_ADDR_IN_USE: + return -EADDRINUSE; + case CCERR_ADDR_NOT_AVAIL: + return -EADDRNOTAVAIL; + case CCERR_CONN_RESET: + return -ECONNRESET; + case CCERR_NOT_IMPLEMENTED: + case CCERR_INVALID_WQE: + return -ENOSYS; + case CCERR_QP_NOT_PRIVILEGED: + return -EPERM; + case CCERR_STACK_ERROR: + return -EPROTO; + case CCERR_ACCESS_VIOLATION: + case CCERR_BASE_AND_BOUNDS_VIOLATION: + return -EFAULT; + case CCERR_STAG_STATE_NOT_INVALID: + case CCERR_INVALID_ADDRESS: + case CCERR_INVALID_CQ: + case CCERR_INVALID_EP: + case CCERR_INVALID_MODIFIER: + case CCERR_INVALID_MTU: + case CCERR_INVALID_PD_ID: + case CCERR_INVALID_QP: + case CCERR_INVALID_RNIC: + case CCERR_INVALID_STAG: + return -EINVAL; + default: + return -EAGAIN; + } +} + +/* Device */ +extern int c2_register_device(struct c2_dev *c2dev); 
+extern void c2_unregister_device(struct c2_dev *c2dev); +extern int c2_rnic_init(struct c2_dev *c2dev); +extern void c2_rnic_term(struct c2_dev *c2dev); +extern void c2_rnic_interrupt(struct c2_dev *c2dev); +extern int c2_del_addr(struct c2_dev *c2dev, __be32 inaddr, __be32 inmask); +extern int c2_add_addr(struct c2_dev *c2dev, __be32 inaddr, __be32 inmask); + +/* QPs */ +extern int c2_alloc_qp(struct c2_dev *c2dev, struct c2_pd *pd, + struct ib_qp_init_attr *qp_attrs, struct c2_qp *qp); +extern void c2_free_qp(struct c2_dev *c2dev, struct c2_qp *qp); +extern struct ib_qp *c2_get_qp(struct ib_device *device, int qpn); +extern int c2_qp_modify(struct c2_dev *c2dev, struct c2_qp *qp, + struct ib_qp_attr *attr, int attr_mask); +extern int c2_qp_set_read_limits(struct c2_dev *c2dev, struct c2_qp *qp, + int ord, int ird); +extern int c2_post_send(struct ib_qp *ibqp, struct ib_send_wr *ib_wr, + struct ib_send_wr **bad_wr); +extern int c2_post_receive(struct ib_qp *ibqp, struct ib_recv_wr *ib_wr, + struct ib_recv_wr **bad_wr); +extern void c2_init_qp_table(struct c2_dev *c2dev); +extern void c2_cleanup_qp_table(struct c2_dev *c2dev); +extern void c2_set_qp_state(struct c2_qp *, int); +extern struct c2_qp *c2_find_qpn(struct c2_dev *c2dev, int qpn); + +/* PDs */ +extern int c2_pd_alloc(struct c2_dev *c2dev, int privileged, struct c2_pd *pd); +extern void c2_pd_free(struct c2_dev *c2dev, struct c2_pd *pd); +extern int c2_init_pd_table(struct c2_dev *c2dev); +extern void c2_cleanup_pd_table(struct c2_dev *c2dev); + +/* CQs */ +extern int c2_init_cq(struct c2_dev *c2dev, int entries, + struct c2_ucontext *ctx, struct c2_cq *cq); +extern void c2_free_cq(struct c2_dev *c2dev, struct c2_cq *cq); +extern void c2_cq_event(struct c2_dev *c2dev, u32 mq_index); +extern void c2_cq_clean(struct c2_dev *c2dev, struct c2_qp *qp, u32 mq_index); +extern int c2_poll_cq(struct ib_cq *ibcq, int num_entries, struct ib_wc *entry); +extern int c2_arm_cq(struct ib_cq *ibcq, enum ib_cq_notify_flags flags); + +/* CM */ +extern int c2_llp_connect(struct iw_cm_id *cm_id, + struct iw_cm_conn_param *iw_param); +extern int c2_llp_accept(struct iw_cm_id *cm_id, + struct iw_cm_conn_param *iw_param); +extern int c2_llp_reject(struct iw_cm_id *cm_id, const void *pdata, + u8 pdata_len); +extern int c2_llp_service_create(struct iw_cm_id *cm_id, int backlog); +extern int c2_llp_service_destroy(struct iw_cm_id *cm_id); + +/* MM */ +extern int c2_nsmr_register_phys_kern(struct c2_dev *c2dev, u64 *addr_list, + int page_size, int pbl_depth, u32 length, + u32 off, u64 *va, enum c2_acf acf, + struct c2_mr *mr); +extern int c2_stag_dealloc(struct c2_dev *c2dev, u32 stag_index); + +/* AE */ +extern void c2_ae_event(struct c2_dev *c2dev, u32 mq_index); + +/* MQSP Allocator */ +extern int c2_init_mqsp_pool(struct c2_dev *c2dev, gfp_t gfp_mask, + struct sp_chunk **root); +extern void c2_free_mqsp_pool(struct c2_dev *c2dev, struct sp_chunk *root); +extern __be16 *c2_alloc_mqsp(struct c2_dev *c2dev, struct sp_chunk *head, + dma_addr_t *dma_addr, gfp_t gfp_mask); +extern void c2_free_mqsp(__be16* mqsp); +#endif diff --git a/kernel/drivers/infiniband/hw/amso1100/c2_ae.c b/kernel/drivers/infiniband/hw/amso1100/c2_ae.c new file mode 100644 index 000000000..cedda2523 --- /dev/null +++ b/kernel/drivers/infiniband/hw/amso1100/c2_ae.c @@ -0,0 +1,327 @@ +/* + * Copyright (c) 2005 Ammasso, Inc. All rights reserved. + * Copyright (c) 2005 Open Grid Computing, Inc. All rights reserved. 
+ * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "c2.h" +#include <rdma/iw_cm.h> +#include "c2_status.h" +#include "c2_ae.h" + +static int c2_convert_cm_status(u32 c2_status) +{ + switch (c2_status) { + case C2_CONN_STATUS_SUCCESS: + return 0; + case C2_CONN_STATUS_REJECTED: + return -ENETRESET; + case C2_CONN_STATUS_REFUSED: + return -ECONNREFUSED; + case C2_CONN_STATUS_TIMEDOUT: + return -ETIMEDOUT; + case C2_CONN_STATUS_NETUNREACH: + return -ENETUNREACH; + case C2_CONN_STATUS_HOSTUNREACH: + return -EHOSTUNREACH; + case C2_CONN_STATUS_INVALID_RNIC: + return -EINVAL; + case C2_CONN_STATUS_INVALID_QP: + return -EINVAL; + case C2_CONN_STATUS_INVALID_QP_STATE: + return -EINVAL; + case C2_CONN_STATUS_ADDR_NOT_AVAIL: + return -EADDRNOTAVAIL; + default: + printk(KERN_ERR PFX + "%s - Unable to convert CM status: %d\n", + __func__, c2_status); + return -EIO; + } +} + +static const char* to_event_str(int event) +{ + static const char* event_str[] = { + "CCAE_REMOTE_SHUTDOWN", + "CCAE_ACTIVE_CONNECT_RESULTS", + "CCAE_CONNECTION_REQUEST", + "CCAE_LLP_CLOSE_COMPLETE", + "CCAE_TERMINATE_MESSAGE_RECEIVED", + "CCAE_LLP_CONNECTION_RESET", + "CCAE_LLP_CONNECTION_LOST", + "CCAE_LLP_SEGMENT_SIZE_INVALID", + "CCAE_LLP_INVALID_CRC", + "CCAE_LLP_BAD_FPDU", + "CCAE_INVALID_DDP_VERSION", + "CCAE_INVALID_RDMA_VERSION", + "CCAE_UNEXPECTED_OPCODE", + "CCAE_INVALID_DDP_QUEUE_NUMBER", + "CCAE_RDMA_READ_NOT_ENABLED", + "CCAE_RDMA_WRITE_NOT_ENABLED", + "CCAE_RDMA_READ_TOO_SMALL", + "CCAE_NO_L_BIT", + "CCAE_TAGGED_INVALID_STAG", + "CCAE_TAGGED_BASE_BOUNDS_VIOLATION", + "CCAE_TAGGED_ACCESS_RIGHTS_VIOLATION", + "CCAE_TAGGED_INVALID_PD", + "CCAE_WRAP_ERROR", + "CCAE_BAD_CLOSE", + "CCAE_BAD_LLP_CLOSE", + "CCAE_INVALID_MSN_RANGE", + "CCAE_INVALID_MSN_GAP", + "CCAE_IRRQ_OVERFLOW", + "CCAE_IRRQ_MSN_GAP", + "CCAE_IRRQ_MSN_RANGE", + "CCAE_IRRQ_INVALID_STAG", + "CCAE_IRRQ_BASE_BOUNDS_VIOLATION", + "CCAE_IRRQ_ACCESS_RIGHTS_VIOLATION", + "CCAE_IRRQ_INVALID_PD", + "CCAE_IRRQ_WRAP_ERROR", + "CCAE_CQ_SQ_COMPLETION_OVERFLOW", + "CCAE_CQ_RQ_COMPLETION_ERROR", + "CCAE_QP_SRQ_WQE_ERROR", + "CCAE_QP_LOCAL_CATASTROPHIC_ERROR", + "CCAE_CQ_OVERFLOW", + "CCAE_CQ_OPERATION_ERROR", + "CCAE_SRQ_LIMIT_REACHED", + "CCAE_QP_RQ_LIMIT_REACHED", + "CCAE_SRQ_CATASTROPHIC_ERROR", + 
"CCAE_RNIC_CATASTROPHIC_ERROR" + }; + + if (event < CCAE_REMOTE_SHUTDOWN || + event > CCAE_RNIC_CATASTROPHIC_ERROR) + return ""; + + event -= CCAE_REMOTE_SHUTDOWN; + return event_str[event]; +} + +static const char *to_qp_state_str(int state) +{ + switch (state) { + case C2_QP_STATE_IDLE: + return "C2_QP_STATE_IDLE"; + case C2_QP_STATE_CONNECTING: + return "C2_QP_STATE_CONNECTING"; + case C2_QP_STATE_RTS: + return "C2_QP_STATE_RTS"; + case C2_QP_STATE_CLOSING: + return "C2_QP_STATE_CLOSING"; + case C2_QP_STATE_TERMINATE: + return "C2_QP_STATE_TERMINATE"; + case C2_QP_STATE_ERROR: + return "C2_QP_STATE_ERROR"; + default: + return ""; + } +} + +void c2_ae_event(struct c2_dev *c2dev, u32 mq_index) +{ + struct c2_mq *mq = c2dev->qptr_array[mq_index]; + union c2wr *wr; + void *resource_user_context; + struct iw_cm_event cm_event; + struct ib_event ib_event; + enum c2_resource_indicator resource_indicator; + enum c2_event_id event_id; + unsigned long flags; + int status; + struct sockaddr_in *laddr = (struct sockaddr_in *)&cm_event.local_addr; + struct sockaddr_in *raddr = (struct sockaddr_in *)&cm_event.remote_addr; + + /* + * retrieve the message + */ + wr = c2_mq_consume(mq); + if (!wr) + return; + + memset(&ib_event, 0, sizeof(ib_event)); + memset(&cm_event, 0, sizeof(cm_event)); + + event_id = c2_wr_get_id(wr); + resource_indicator = be32_to_cpu(wr->ae.ae_generic.resource_type); + resource_user_context = + (void *) (unsigned long) wr->ae.ae_generic.user_context; + + status = cm_event.status = c2_convert_cm_status(c2_wr_get_result(wr)); + + pr_debug("event received c2_dev=%p, event_id=%d, " + "resource_indicator=%d, user_context=%p, status = %d\n", + c2dev, event_id, resource_indicator, resource_user_context, + status); + + switch (resource_indicator) { + case C2_RES_IND_QP:{ + + struct c2_qp *qp = (struct c2_qp *)resource_user_context; + struct iw_cm_id *cm_id = qp->cm_id; + struct c2wr_ae_active_connect_results *res; + + if (!cm_id) { + pr_debug("event received, but cm_id is , qp=%p!\n", + qp); + goto ignore_it; + } + pr_debug("%s: event = %s, user_context=%llx, " + "resource_type=%x, " + "resource=%x, qp_state=%s\n", + __func__, + to_event_str(event_id), + (unsigned long long) wr->ae.ae_generic.user_context, + be32_to_cpu(wr->ae.ae_generic.resource_type), + be32_to_cpu(wr->ae.ae_generic.resource), + to_qp_state_str(be32_to_cpu(wr->ae.ae_generic.qp_state))); + + c2_set_qp_state(qp, be32_to_cpu(wr->ae.ae_generic.qp_state)); + + switch (event_id) { + case CCAE_ACTIVE_CONNECT_RESULTS: + res = &wr->ae.ae_active_connect_results; + cm_event.event = IW_CM_EVENT_CONNECT_REPLY; + laddr->sin_addr.s_addr = res->laddr; + raddr->sin_addr.s_addr = res->raddr; + laddr->sin_port = res->lport; + raddr->sin_port = res->rport; + if (status == 0) { + cm_event.private_data_len = + be32_to_cpu(res->private_data_length); + cm_event.private_data = res->private_data; + } else { + spin_lock_irqsave(&qp->lock, flags); + if (qp->cm_id) { + qp->cm_id->rem_ref(qp->cm_id); + qp->cm_id = NULL; + } + spin_unlock_irqrestore(&qp->lock, flags); + cm_event.private_data_len = 0; + cm_event.private_data = NULL; + } + if (cm_id->event_handler) + cm_id->event_handler(cm_id, &cm_event); + break; + case CCAE_TERMINATE_MESSAGE_RECEIVED: + case CCAE_CQ_SQ_COMPLETION_OVERFLOW: + ib_event.device = &c2dev->ibdev; + ib_event.element.qp = &qp->ibqp; + ib_event.event = IB_EVENT_QP_REQ_ERR; + + if (qp->ibqp.event_handler) + qp->ibqp.event_handler(&ib_event, + qp->ibqp. 
+ qp_context); + break; + case CCAE_BAD_CLOSE: + case CCAE_LLP_CLOSE_COMPLETE: + case CCAE_LLP_CONNECTION_RESET: + case CCAE_LLP_CONNECTION_LOST: + BUG_ON(cm_id->event_handler==(void*)0x6b6b6b6b); + + spin_lock_irqsave(&qp->lock, flags); + if (qp->cm_id) { + qp->cm_id->rem_ref(qp->cm_id); + qp->cm_id = NULL; + } + spin_unlock_irqrestore(&qp->lock, flags); + cm_event.event = IW_CM_EVENT_CLOSE; + cm_event.status = 0; + if (cm_id->event_handler) + cm_id->event_handler(cm_id, &cm_event); + break; + default: + BUG_ON(1); + pr_debug("%s:%d Unexpected event_id=%d on QP=%p, " + "CM_ID=%p\n", + __func__, __LINE__, + event_id, qp, cm_id); + break; + } + break; + } + + case C2_RES_IND_EP:{ + + struct c2wr_ae_connection_request *req = + &wr->ae.ae_connection_request; + struct iw_cm_id *cm_id = + (struct iw_cm_id *)resource_user_context; + + pr_debug("C2_RES_IND_EP event_id=%d\n", event_id); + if (event_id != CCAE_CONNECTION_REQUEST) { + pr_debug("%s: Invalid event_id: %d\n", + __func__, event_id); + break; + } + cm_event.event = IW_CM_EVENT_CONNECT_REQUEST; + cm_event.provider_data = (void*)(unsigned long)req->cr_handle; + laddr->sin_addr.s_addr = req->laddr; + raddr->sin_addr.s_addr = req->raddr; + laddr->sin_port = req->lport; + raddr->sin_port = req->rport; + cm_event.private_data_len = + be32_to_cpu(req->private_data_length); + cm_event.private_data = req->private_data; + /* + * Until ird/ord negotiation via MPAv2 support is added, send + * max supported values + */ + cm_event.ird = cm_event.ord = 128; + + if (cm_id->event_handler) + cm_id->event_handler(cm_id, &cm_event); + break; + } + + case C2_RES_IND_CQ:{ + struct c2_cq *cq = + (struct c2_cq *) resource_user_context; + + pr_debug("IB_EVENT_CQ_ERR\n"); + ib_event.device = &c2dev->ibdev; + ib_event.element.cq = &cq->ibcq; + ib_event.event = IB_EVENT_CQ_ERR; + + if (cq->ibcq.event_handler) + cq->ibcq.event_handler(&ib_event, + cq->ibcq.cq_context); + break; + } + + default: + printk("Bad resource indicator = %d\n", + resource_indicator); + break; + } + + ignore_it: + c2_mq_free(mq); +} diff --git a/kernel/drivers/infiniband/hw/amso1100/c2_ae.h b/kernel/drivers/infiniband/hw/amso1100/c2_ae.h new file mode 100644 index 000000000..3a065c33b --- /dev/null +++ b/kernel/drivers/infiniband/hw/amso1100/c2_ae.h @@ -0,0 +1,108 @@ +/* + * Copyright (c) 2005 Ammasso, Inc. All rights reserved. + * Copyright (c) 2005 Open Grid Computing, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef _C2_AE_H_ +#define _C2_AE_H_ + +/* + * WARNING: If you change this file, also bump C2_IVN_BASE + * in common/include/clustercore/c2_ivn.h. + */ + +/* + * Asynchronous Event Identifiers + * + * These start at 0x80 only so it's obvious from inspection that + * they are not work-request statuses. This isn't critical. + * + * NOTE: these event id's must fit in eight bits. + */ +enum c2_event_id { + CCAE_REMOTE_SHUTDOWN = 0x80, + CCAE_ACTIVE_CONNECT_RESULTS, + CCAE_CONNECTION_REQUEST, + CCAE_LLP_CLOSE_COMPLETE, + CCAE_TERMINATE_MESSAGE_RECEIVED, + CCAE_LLP_CONNECTION_RESET, + CCAE_LLP_CONNECTION_LOST, + CCAE_LLP_SEGMENT_SIZE_INVALID, + CCAE_LLP_INVALID_CRC, + CCAE_LLP_BAD_FPDU, + CCAE_INVALID_DDP_VERSION, + CCAE_INVALID_RDMA_VERSION, + CCAE_UNEXPECTED_OPCODE, + CCAE_INVALID_DDP_QUEUE_NUMBER, + CCAE_RDMA_READ_NOT_ENABLED, + CCAE_RDMA_WRITE_NOT_ENABLED, + CCAE_RDMA_READ_TOO_SMALL, + CCAE_NO_L_BIT, + CCAE_TAGGED_INVALID_STAG, + CCAE_TAGGED_BASE_BOUNDS_VIOLATION, + CCAE_TAGGED_ACCESS_RIGHTS_VIOLATION, + CCAE_TAGGED_INVALID_PD, + CCAE_WRAP_ERROR, + CCAE_BAD_CLOSE, + CCAE_BAD_LLP_CLOSE, + CCAE_INVALID_MSN_RANGE, + CCAE_INVALID_MSN_GAP, + CCAE_IRRQ_OVERFLOW, + CCAE_IRRQ_MSN_GAP, + CCAE_IRRQ_MSN_RANGE, + CCAE_IRRQ_INVALID_STAG, + CCAE_IRRQ_BASE_BOUNDS_VIOLATION, + CCAE_IRRQ_ACCESS_RIGHTS_VIOLATION, + CCAE_IRRQ_INVALID_PD, + CCAE_IRRQ_WRAP_ERROR, + CCAE_CQ_SQ_COMPLETION_OVERFLOW, + CCAE_CQ_RQ_COMPLETION_ERROR, + CCAE_QP_SRQ_WQE_ERROR, + CCAE_QP_LOCAL_CATASTROPHIC_ERROR, + CCAE_CQ_OVERFLOW, + CCAE_CQ_OPERATION_ERROR, + CCAE_SRQ_LIMIT_REACHED, + CCAE_QP_RQ_LIMIT_REACHED, + CCAE_SRQ_CATASTROPHIC_ERROR, + CCAE_RNIC_CATASTROPHIC_ERROR +/* WARNING If you add more id's, make sure their values fit in eight bits. */ +}; + +/* + * Resource Indicators and Identifiers + */ +enum c2_resource_indicator { + C2_RES_IND_QP = 1, + C2_RES_IND_EP, + C2_RES_IND_CQ, + C2_RES_IND_SRQ, +}; + +#endif /* _C2_AE_H_ */ diff --git a/kernel/drivers/infiniband/hw/amso1100/c2_alloc.c b/kernel/drivers/infiniband/hw/amso1100/c2_alloc.c new file mode 100644 index 000000000..78d247ec6 --- /dev/null +++ b/kernel/drivers/infiniband/hw/amso1100/c2_alloc.c @@ -0,0 +1,142 @@ +/* + * Copyright (c) 2004 Topspin Communications. All rights reserved. + * Copyright (c) 2005 Open Grid Computing, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include <linux/errno.h> +#include <linux/bitmap.h> + +#include "c2.h" + +static int c2_alloc_mqsp_chunk(struct c2_dev *c2dev, gfp_t gfp_mask, + struct sp_chunk **head) +{ + int i; + struct sp_chunk *new_head; + dma_addr_t dma_addr; + + new_head = dma_alloc_coherent(&c2dev->pcidev->dev, PAGE_SIZE, + &dma_addr, gfp_mask); + if (new_head == NULL) + return -ENOMEM; + + new_head->dma_addr = dma_addr; + dma_unmap_addr_set(new_head, mapping, new_head->dma_addr); + + new_head->next = NULL; + new_head->head = 0; + + /* build list where each index is the next free slot */ + for (i = 0; + i < (PAGE_SIZE - sizeof(struct sp_chunk) - + sizeof(u16)) / sizeof(u16) - 1; + i++) { + new_head->shared_ptr[i] = i + 1; + } + /* terminate list */ + new_head->shared_ptr[i] = 0xFFFF; + + *head = new_head; + return 0; +} + +int c2_init_mqsp_pool(struct c2_dev *c2dev, gfp_t gfp_mask, + struct sp_chunk **root) +{ + return c2_alloc_mqsp_chunk(c2dev, gfp_mask, root); +} + +void c2_free_mqsp_pool(struct c2_dev *c2dev, struct sp_chunk *root) +{ + struct sp_chunk *next; + + while (root) { + next = root->next; + dma_free_coherent(&c2dev->pcidev->dev, PAGE_SIZE, root, + dma_unmap_addr(root, mapping)); + root = next; + } +} + +__be16 *c2_alloc_mqsp(struct c2_dev *c2dev, struct sp_chunk *head, + dma_addr_t *dma_addr, gfp_t gfp_mask) +{ + u16 mqsp; + + while (head) { + mqsp = head->head; + if (mqsp != 0xFFFF) { + head->head = head->shared_ptr[mqsp]; + break; + } else if (head->next == NULL) { + if (c2_alloc_mqsp_chunk(c2dev, gfp_mask, &head->next) == + 0) { + head = head->next; + mqsp = head->head; + head->head = head->shared_ptr[mqsp]; + break; + } else + return NULL; + } else + head = head->next; + } + if (head) { + *dma_addr = head->dma_addr + + ((unsigned long) &(head->shared_ptr[mqsp]) - + (unsigned long) head); + pr_debug("%s addr %p dma_addr %llx\n", __func__, + &(head->shared_ptr[mqsp]), (unsigned long long) *dma_addr); + return (__force __be16 *) &(head->shared_ptr[mqsp]); + } + return NULL; +} + +void c2_free_mqsp(__be16 *mqsp) +{ + struct sp_chunk *head; + u16 idx; + + /* The chunk containing this ptr begins at the page boundary */ + head = (struct sp_chunk *) ((unsigned long) mqsp & PAGE_MASK); + + /* Link head to new mqsp */ + *mqsp = (__force __be16) head->head; + + /* Compute the shared_ptr index */ + idx = ((unsigned long) mqsp & ~PAGE_MASK) >> 1; + idx -= (unsigned long) &(((struct sp_chunk *) 0)->shared_ptr[0]) >> 1; + + /* Point this index at the head */ + head->shared_ptr[idx] = head->head; + + /* Point head at this index */ + head->head = idx; +} diff --git a/kernel/drivers/infiniband/hw/amso1100/c2_cm.c b/kernel/drivers/infiniband/hw/amso1100/c2_cm.c new file mode 100644 index 000000000..23bfa94fb --- /dev/null +++ b/kernel/drivers/infiniband/hw/amso1100/c2_cm.c @@ -0,0 +1,461 @@ +/* + * Copyright (c) 2005 Ammasso, Inc. All rights reserved. + * Copyright (c) 2005 Open Grid Computing, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. 
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ +#include <linux/slab.h> + +#include "c2.h" +#include "c2_wr.h" +#include "c2_vq.h" +#include <rdma/iw_cm.h> + +int c2_llp_connect(struct iw_cm_id *cm_id, struct iw_cm_conn_param *iw_param) +{ + struct c2_dev *c2dev = to_c2dev(cm_id->device); + struct ib_qp *ibqp; + struct c2_qp *qp; + struct c2wr_qp_connect_req *wr; /* variable size needs a malloc. */ + struct c2_vq_req *vq_req; + int err; + struct sockaddr_in *raddr = (struct sockaddr_in *)&cm_id->remote_addr; + + if (cm_id->remote_addr.ss_family != AF_INET) + return -ENOSYS; + + ibqp = c2_get_qp(cm_id->device, iw_param->qpn); + if (!ibqp) + return -EINVAL; + qp = to_c2qp(ibqp); + + /* Associate QP <--> CM_ID */ + cm_id->provider_data = qp; + cm_id->add_ref(cm_id); + qp->cm_id = cm_id; + + /* + * only support the max private_data length + */ + if (iw_param->private_data_len > C2_MAX_PRIVATE_DATA_SIZE) { + err = -EINVAL; + goto bail0; + } + /* + * Set the rdma read limits + */ + err = c2_qp_set_read_limits(c2dev, qp, iw_param->ord, iw_param->ird); + if (err) + goto bail0; + + /* + * Create and send a WR_QP_CONNECT... + */ + wr = kmalloc(c2dev->req_vq.msg_size, GFP_KERNEL); + if (!wr) { + err = -ENOMEM; + goto bail0; + } + + vq_req = vq_req_alloc(c2dev); + if (!vq_req) { + err = -ENOMEM; + goto bail1; + } + + c2_wr_set_id(wr, CCWR_QP_CONNECT); + wr->hdr.context = 0; + wr->rnic_handle = c2dev->adapter_handle; + wr->qp_handle = qp->adapter_handle; + + wr->remote_addr = raddr->sin_addr.s_addr; + wr->remote_port = raddr->sin_port; + + /* + * Move any private data from the callers's buf into + * the WR. + */ + if (iw_param->private_data) { + wr->private_data_length = + cpu_to_be32(iw_param->private_data_len); + memcpy(&wr->private_data[0], iw_param->private_data, + iw_param->private_data_len); + } else + wr->private_data_length = 0; + + /* + * Send WR to adapter. NOTE: There is no synch reply from + * the adapter. 
+ */ + err = vq_send_wr(c2dev, (union c2wr *) wr); + vq_req_free(c2dev, vq_req); + + bail1: + kfree(wr); + bail0: + if (err) { + /* + * If we fail, release reference on QP and + * disassociate QP from CM_ID + */ + cm_id->provider_data = NULL; + qp->cm_id = NULL; + cm_id->rem_ref(cm_id); + } + return err; +} + +int c2_llp_service_create(struct iw_cm_id *cm_id, int backlog) +{ + struct c2_dev *c2dev; + struct c2wr_ep_listen_create_req wr; + struct c2wr_ep_listen_create_rep *reply; + struct c2_vq_req *vq_req; + int err; + struct sockaddr_in *laddr = (struct sockaddr_in *)&cm_id->local_addr; + + if (cm_id->local_addr.ss_family != AF_INET) + return -ENOSYS; + + c2dev = to_c2dev(cm_id->device); + if (c2dev == NULL) + return -EINVAL; + + /* + * Allocate verbs request. + */ + vq_req = vq_req_alloc(c2dev); + if (!vq_req) + return -ENOMEM; + + /* + * Build the WR + */ + c2_wr_set_id(&wr, CCWR_EP_LISTEN_CREATE); + wr.hdr.context = (u64) (unsigned long) vq_req; + wr.rnic_handle = c2dev->adapter_handle; + wr.local_addr = laddr->sin_addr.s_addr; + wr.local_port = laddr->sin_port; + wr.backlog = cpu_to_be32(backlog); + wr.user_context = (u64) (unsigned long) cm_id; + + /* + * Reference the request struct. Dereferenced in the int handler. + */ + vq_req_get(c2dev, vq_req); + + /* + * Send WR to adapter + */ + err = vq_send_wr(c2dev, (union c2wr *) & wr); + if (err) { + vq_req_put(c2dev, vq_req); + goto bail0; + } + + /* + * Wait for reply from adapter + */ + err = vq_wait_for_reply(c2dev, vq_req); + if (err) + goto bail0; + + /* + * Process reply + */ + reply = + (struct c2wr_ep_listen_create_rep *) (unsigned long) vq_req->reply_msg; + if (!reply) { + err = -ENOMEM; + goto bail1; + } + + if ((err = c2_errno(reply)) != 0) + goto bail1; + + /* + * Keep the adapter handle. Used in subsequent destroy + */ + cm_id->provider_data = (void*)(unsigned long) reply->ep_handle; + + /* + * free vq stuff + */ + vq_repbuf_free(c2dev, reply); + vq_req_free(c2dev, vq_req); + + return 0; + + bail1: + vq_repbuf_free(c2dev, reply); + bail0: + vq_req_free(c2dev, vq_req); + return err; +} + + +int c2_llp_service_destroy(struct iw_cm_id *cm_id) +{ + + struct c2_dev *c2dev; + struct c2wr_ep_listen_destroy_req wr; + struct c2wr_ep_listen_destroy_rep *reply; + struct c2_vq_req *vq_req; + int err; + + c2dev = to_c2dev(cm_id->device); + if (c2dev == NULL) + return -EINVAL; + + /* + * Allocate verbs request. + */ + vq_req = vq_req_alloc(c2dev); + if (!vq_req) + return -ENOMEM; + + /* + * Build the WR + */ + c2_wr_set_id(&wr, CCWR_EP_LISTEN_DESTROY); + wr.hdr.context = (unsigned long) vq_req; + wr.rnic_handle = c2dev->adapter_handle; + wr.ep_handle = (u32)(unsigned long)cm_id->provider_data; + + /* + * reference the request struct. dereferenced in the int handler. 
+ */ + vq_req_get(c2dev, vq_req); + + /* + * Send WR to adapter + */ + err = vq_send_wr(c2dev, (union c2wr *) & wr); + if (err) { + vq_req_put(c2dev, vq_req); + goto bail0; + } + + /* + * Wait for reply from adapter + */ + err = vq_wait_for_reply(c2dev, vq_req); + if (err) + goto bail0; + + /* + * Process reply + */ + reply=(struct c2wr_ep_listen_destroy_rep *)(unsigned long)vq_req->reply_msg; + if (!reply) { + err = -ENOMEM; + goto bail0; + } + if ((err = c2_errno(reply)) != 0) + goto bail1; + + bail1: + vq_repbuf_free(c2dev, reply); + bail0: + vq_req_free(c2dev, vq_req); + return err; +} + +int c2_llp_accept(struct iw_cm_id *cm_id, struct iw_cm_conn_param *iw_param) +{ + struct c2_dev *c2dev = to_c2dev(cm_id->device); + struct c2_qp *qp; + struct ib_qp *ibqp; + struct c2wr_cr_accept_req *wr; /* variable length WR */ + struct c2_vq_req *vq_req; + struct c2wr_cr_accept_rep *reply; /* VQ Reply msg ptr. */ + int err; + + ibqp = c2_get_qp(cm_id->device, iw_param->qpn); + if (!ibqp) + return -EINVAL; + qp = to_c2qp(ibqp); + + /* Set the RDMA read limits */ + err = c2_qp_set_read_limits(c2dev, qp, iw_param->ord, iw_param->ird); + if (err) + goto bail0; + + /* Allocate verbs request. */ + vq_req = vq_req_alloc(c2dev); + if (!vq_req) { + err = -ENOMEM; + goto bail0; + } + vq_req->qp = qp; + vq_req->cm_id = cm_id; + vq_req->event = IW_CM_EVENT_ESTABLISHED; + + wr = kmalloc(c2dev->req_vq.msg_size, GFP_KERNEL); + if (!wr) { + err = -ENOMEM; + goto bail1; + } + + /* Build the WR */ + c2_wr_set_id(wr, CCWR_CR_ACCEPT); + wr->hdr.context = (unsigned long) vq_req; + wr->rnic_handle = c2dev->adapter_handle; + wr->ep_handle = (u32) (unsigned long) cm_id->provider_data; + wr->qp_handle = qp->adapter_handle; + + /* Replace the cr_handle with the QP after accept */ + cm_id->provider_data = qp; + cm_id->add_ref(cm_id); + qp->cm_id = cm_id; + + cm_id->provider_data = qp; + + /* Validate private_data length */ + if (iw_param->private_data_len > C2_MAX_PRIVATE_DATA_SIZE) { + err = -EINVAL; + goto bail1; + } + + if (iw_param->private_data) { + wr->private_data_length = cpu_to_be32(iw_param->private_data_len); + memcpy(&wr->private_data[0], + iw_param->private_data, iw_param->private_data_len); + } else + wr->private_data_length = 0; + + /* Reference the request struct. Dereferenced in the int handler. */ + vq_req_get(c2dev, vq_req); + + /* Send WR to adapter */ + err = vq_send_wr(c2dev, (union c2wr *) wr); + if (err) { + vq_req_put(c2dev, vq_req); + goto bail1; + } + + /* Wait for reply from adapter */ + err = vq_wait_for_reply(c2dev, vq_req); + if (err) + goto bail1; + + /* Check that reply is present */ + reply = (struct c2wr_cr_accept_rep *) (unsigned long) vq_req->reply_msg; + if (!reply) { + err = -ENOMEM; + goto bail1; + } + + err = c2_errno(reply); + vq_repbuf_free(c2dev, reply); + + if (!err) + c2_set_qp_state(qp, C2_QP_STATE_RTS); + bail1: + kfree(wr); + vq_req_free(c2dev, vq_req); + bail0: + if (err) { + /* + * If we fail, release reference on QP and + * disassociate QP from CM_ID + */ + cm_id->provider_data = NULL; + qp->cm_id = NULL; + cm_id->rem_ref(cm_id); + } + return err; +} + +int c2_llp_reject(struct iw_cm_id *cm_id, const void *pdata, u8 pdata_len) +{ + struct c2_dev *c2dev; + struct c2wr_cr_reject_req wr; + struct c2_vq_req *vq_req; + struct c2wr_cr_reject_rep *reply; + int err; + + c2dev = to_c2dev(cm_id->device); + + /* + * Allocate verbs request. 
+ */ + vq_req = vq_req_alloc(c2dev); + if (!vq_req) + return -ENOMEM; + + /* + * Build the WR + */ + c2_wr_set_id(&wr, CCWR_CR_REJECT); + wr.hdr.context = (unsigned long) vq_req; + wr.rnic_handle = c2dev->adapter_handle; + wr.ep_handle = (u32) (unsigned long) cm_id->provider_data; + + /* + * reference the request struct. dereferenced in the int handler. + */ + vq_req_get(c2dev, vq_req); + + /* + * Send WR to adapter + */ + err = vq_send_wr(c2dev, (union c2wr *) & wr); + if (err) { + vq_req_put(c2dev, vq_req); + goto bail0; + } + + /* + * Wait for reply from adapter + */ + err = vq_wait_for_reply(c2dev, vq_req); + if (err) + goto bail0; + + /* + * Process reply + */ + reply = (struct c2wr_cr_reject_rep *) (unsigned long) + vq_req->reply_msg; + if (!reply) { + err = -ENOMEM; + goto bail0; + } + err = c2_errno(reply); + /* + * free vq stuff + */ + vq_repbuf_free(c2dev, reply); + + bail0: + vq_req_free(c2dev, vq_req); + return err; +} diff --git a/kernel/drivers/infiniband/hw/amso1100/c2_cq.c b/kernel/drivers/infiniband/hw/amso1100/c2_cq.c new file mode 100644 index 000000000..1b63185b4 --- /dev/null +++ b/kernel/drivers/infiniband/hw/amso1100/c2_cq.c @@ -0,0 +1,440 @@ +/* + * Copyright (c) 2004, 2005 Topspin Communications. All rights reserved. + * Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved. + * Copyright (c) 2005 Cisco Systems, Inc. All rights reserved. + * Copyright (c) 2005 Mellanox Technologies. All rights reserved. + * Copyright (c) 2004 Voltaire, Inc. All rights reserved. + * Copyright (c) 2005 Open Grid Computing, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + */ +#include + +#include "c2.h" +#include "c2_vq.h" +#include "c2_status.h" + +#define C2_CQ_MSG_SIZE ((sizeof(struct c2wr_ce) + 32-1) & ~(32-1)) + +static struct c2_cq *c2_cq_get(struct c2_dev *c2dev, int cqn) +{ + struct c2_cq *cq; + unsigned long flags; + + spin_lock_irqsave(&c2dev->lock, flags); + cq = c2dev->qptr_array[cqn]; + if (!cq) { + spin_unlock_irqrestore(&c2dev->lock, flags); + return NULL; + } + atomic_inc(&cq->refcount); + spin_unlock_irqrestore(&c2dev->lock, flags); + return cq; +} + +static void c2_cq_put(struct c2_cq *cq) +{ + if (atomic_dec_and_test(&cq->refcount)) + wake_up(&cq->wait); +} + +void c2_cq_event(struct c2_dev *c2dev, u32 mq_index) +{ + struct c2_cq *cq; + + cq = c2_cq_get(c2dev, mq_index); + if (!cq) { + printk("discarding events on destroyed CQN=%d\n", mq_index); + return; + } + + (*cq->ibcq.comp_handler) (&cq->ibcq, cq->ibcq.cq_context); + c2_cq_put(cq); +} + +void c2_cq_clean(struct c2_dev *c2dev, struct c2_qp *qp, u32 mq_index) +{ + struct c2_cq *cq; + struct c2_mq *q; + + cq = c2_cq_get(c2dev, mq_index); + if (!cq) + return; + + spin_lock_irq(&cq->lock); + q = &cq->mq; + if (q && !c2_mq_empty(q)) { + u16 priv = q->priv; + struct c2wr_ce *msg; + + while (priv != be16_to_cpu(*q->shared)) { + msg = (struct c2wr_ce *) + (q->msg_pool.host + priv * q->msg_size); + if (msg->qp_user_context == (u64) (unsigned long) qp) { + msg->qp_user_context = (u64) 0; + } + priv = (priv + 1) % q->q_size; + } + } + spin_unlock_irq(&cq->lock); + c2_cq_put(cq); +} + +static inline enum ib_wc_status c2_cqe_status_to_openib(u8 status) +{ + switch (status) { + case C2_OK: + return IB_WC_SUCCESS; + case CCERR_FLUSHED: + return IB_WC_WR_FLUSH_ERR; + case CCERR_BASE_AND_BOUNDS_VIOLATION: + return IB_WC_LOC_PROT_ERR; + case CCERR_ACCESS_VIOLATION: + return IB_WC_LOC_ACCESS_ERR; + case CCERR_TOTAL_LENGTH_TOO_BIG: + return IB_WC_LOC_LEN_ERR; + case CCERR_INVALID_WINDOW: + return IB_WC_MW_BIND_ERR; + default: + return IB_WC_GENERAL_ERR; + } +} + + +static inline int c2_poll_one(struct c2_dev *c2dev, + struct c2_cq *cq, struct ib_wc *entry) +{ + struct c2wr_ce *ce; + struct c2_qp *qp; + int is_recv = 0; + + ce = c2_mq_consume(&cq->mq); + if (!ce) { + return -EAGAIN; + } + + /* + * if the qp returned is null then this qp has already + * been freed and we are unable process the completion. 
+ * try pulling the next message + */ + while ((qp = + (struct c2_qp *) (unsigned long) ce->qp_user_context) == NULL) { + c2_mq_free(&cq->mq); + ce = c2_mq_consume(&cq->mq); + if (!ce) + return -EAGAIN; + } + + entry->status = c2_cqe_status_to_openib(c2_wr_get_result(ce)); + entry->wr_id = ce->hdr.context; + entry->qp = &qp->ibqp; + entry->wc_flags = 0; + entry->slid = 0; + entry->sl = 0; + entry->src_qp = 0; + entry->dlid_path_bits = 0; + entry->pkey_index = 0; + + switch (c2_wr_get_id(ce)) { + case C2_WR_TYPE_SEND: + entry->opcode = IB_WC_SEND; + break; + case C2_WR_TYPE_RDMA_WRITE: + entry->opcode = IB_WC_RDMA_WRITE; + break; + case C2_WR_TYPE_RDMA_READ: + entry->opcode = IB_WC_RDMA_READ; + break; + case C2_WR_TYPE_BIND_MW: + entry->opcode = IB_WC_BIND_MW; + break; + case C2_WR_TYPE_RECV: + entry->byte_len = be32_to_cpu(ce->bytes_rcvd); + entry->opcode = IB_WC_RECV; + is_recv = 1; + break; + default: + break; + } + + /* consume the WQEs */ + if (is_recv) + c2_mq_lconsume(&qp->rq_mq, 1); + else + c2_mq_lconsume(&qp->sq_mq, + be32_to_cpu(c2_wr_get_wqe_count(ce)) + 1); + + /* free the message */ + c2_mq_free(&cq->mq); + + return 0; +} + +int c2_poll_cq(struct ib_cq *ibcq, int num_entries, struct ib_wc *entry) +{ + struct c2_dev *c2dev = to_c2dev(ibcq->device); + struct c2_cq *cq = to_c2cq(ibcq); + unsigned long flags; + int npolled, err; + + spin_lock_irqsave(&cq->lock, flags); + + for (npolled = 0; npolled < num_entries; ++npolled) { + + err = c2_poll_one(c2dev, cq, entry + npolled); + if (err) + break; + } + + spin_unlock_irqrestore(&cq->lock, flags); + + return npolled; +} + +int c2_arm_cq(struct ib_cq *ibcq, enum ib_cq_notify_flags notify_flags) +{ + struct c2_mq_shared __iomem *shared; + struct c2_cq *cq; + unsigned long flags; + int ret = 0; + + cq = to_c2cq(ibcq); + shared = cq->mq.peer; + + if ((notify_flags & IB_CQ_SOLICITED_MASK) == IB_CQ_NEXT_COMP) + writeb(C2_CQ_NOTIFICATION_TYPE_NEXT, &shared->notification_type); + else if ((notify_flags & IB_CQ_SOLICITED_MASK) == IB_CQ_SOLICITED) + writeb(C2_CQ_NOTIFICATION_TYPE_NEXT_SE, &shared->notification_type); + else + return -EINVAL; + + writeb(CQ_WAIT_FOR_DMA | CQ_ARMED, &shared->armed); + + /* + * Now read back shared->armed to make the PCI + * write synchronous. This is necessary for + * correct cq notification semantics. 
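[Illustrative aside, not part of the imported source] The comment above describes the standard way to flush a posted MMIO write: after writing the arm/notification bytes, the driver reads the same device register back so the write is guaranteed to have reached the adapter before the function returns. The sketch below shows that idiom in isolation; the register layout and the names demo_regs/demo_arm are hypothetical and only stand in for the shared MQ page used here.

/*
 * Kernel-style sketch of the "flush a posted write by reading it back"
 * idiom. Hypothetical register layout; not code from this driver.
 */
#include <linux/io.h>
#include <linux/types.h>

struct demo_regs {
	u8 armed;
	u8 notification_type;
};

static void demo_arm(struct demo_regs __iomem *regs, u8 mode)
{
	/* Posted writes: they may still sit in a PCI bridge buffer. */
	writeb(mode, &regs->notification_type);
	writeb(1, &regs->armed);

	/*
	 * A read from the same device forces every earlier posted write
	 * ahead of it to complete before we continue.
	 */
	readb(&regs->armed);
}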
+ */ + readb(&shared->armed); + + if (notify_flags & IB_CQ_REPORT_MISSED_EVENTS) { + spin_lock_irqsave(&cq->lock, flags); + ret = !c2_mq_empty(&cq->mq); + spin_unlock_irqrestore(&cq->lock, flags); + } + + return ret; +} + +static void c2_free_cq_buf(struct c2_dev *c2dev, struct c2_mq *mq) +{ + dma_free_coherent(&c2dev->pcidev->dev, mq->q_size * mq->msg_size, + mq->msg_pool.host, dma_unmap_addr(mq, mapping)); +} + +static int c2_alloc_cq_buf(struct c2_dev *c2dev, struct c2_mq *mq, + size_t q_size, size_t msg_size) +{ + u8 *pool_start; + + if (q_size > SIZE_MAX / msg_size) + return -EINVAL; + + pool_start = dma_alloc_coherent(&c2dev->pcidev->dev, q_size * msg_size, + &mq->host_dma, GFP_KERNEL); + if (!pool_start) + return -ENOMEM; + + c2_mq_rep_init(mq, + 0, /* index (currently unknown) */ + q_size, + msg_size, + pool_start, + NULL, /* peer (currently unknown) */ + C2_MQ_HOST_TARGET); + + dma_unmap_addr_set(mq, mapping, mq->host_dma); + + return 0; +} + +int c2_init_cq(struct c2_dev *c2dev, int entries, + struct c2_ucontext *ctx, struct c2_cq *cq) +{ + struct c2wr_cq_create_req wr; + struct c2wr_cq_create_rep *reply; + unsigned long peer_pa; + struct c2_vq_req *vq_req; + int err; + + might_sleep(); + + cq->ibcq.cqe = entries - 1; + cq->is_kernel = !ctx; + + /* Allocate a shared pointer */ + cq->mq.shared = c2_alloc_mqsp(c2dev, c2dev->kern_mqsp_pool, + &cq->mq.shared_dma, GFP_KERNEL); + if (!cq->mq.shared) + return -ENOMEM; + + /* Allocate pages for the message pool */ + err = c2_alloc_cq_buf(c2dev, &cq->mq, entries + 1, C2_CQ_MSG_SIZE); + if (err) + goto bail0; + + vq_req = vq_req_alloc(c2dev); + if (!vq_req) { + err = -ENOMEM; + goto bail1; + } + + memset(&wr, 0, sizeof(wr)); + c2_wr_set_id(&wr, CCWR_CQ_CREATE); + wr.hdr.context = (unsigned long) vq_req; + wr.rnic_handle = c2dev->adapter_handle; + wr.msg_size = cpu_to_be32(cq->mq.msg_size); + wr.depth = cpu_to_be32(cq->mq.q_size); + wr.shared_ht = cpu_to_be64(cq->mq.shared_dma); + wr.msg_pool = cpu_to_be64(cq->mq.host_dma); + wr.user_context = (u64) (unsigned long) (cq); + + vq_req_get(c2dev, vq_req); + + err = vq_send_wr(c2dev, (union c2wr *) & wr); + if (err) { + vq_req_put(c2dev, vq_req); + goto bail2; + } + + err = vq_wait_for_reply(c2dev, vq_req); + if (err) + goto bail2; + + reply = (struct c2wr_cq_create_rep *) (unsigned long) (vq_req->reply_msg); + if (!reply) { + err = -ENOMEM; + goto bail2; + } + + if ((err = c2_errno(reply)) != 0) + goto bail3; + + cq->adapter_handle = reply->cq_handle; + cq->mq.index = be32_to_cpu(reply->mq_index); + + peer_pa = c2dev->pa + be32_to_cpu(reply->adapter_shared); + cq->mq.peer = ioremap_nocache(peer_pa, PAGE_SIZE); + if (!cq->mq.peer) { + err = -ENOMEM; + goto bail3; + } + + vq_repbuf_free(c2dev, reply); + vq_req_free(c2dev, vq_req); + + spin_lock_init(&cq->lock); + atomic_set(&cq->refcount, 1); + init_waitqueue_head(&cq->wait); + + /* + * Use the MQ index allocated by the adapter to + * store the CQ in the qptr_array + */ + cq->cqn = cq->mq.index; + c2dev->qptr_array[cq->cqn] = cq; + + return 0; + + bail3: + vq_repbuf_free(c2dev, reply); + bail2: + vq_req_free(c2dev, vq_req); + bail1: + c2_free_cq_buf(c2dev, &cq->mq); + bail0: + c2_free_mqsp(cq->mq.shared); + + return err; +} + +void c2_free_cq(struct c2_dev *c2dev, struct c2_cq *cq) +{ + int err; + struct c2_vq_req *vq_req; + struct c2wr_cq_destroy_req wr; + struct c2wr_cq_destroy_rep *reply; + + might_sleep(); + + /* Clear CQ from the qptr array */ + spin_lock_irq(&c2dev->lock); + c2dev->qptr_array[cq->mq.index] = NULL; + 
atomic_dec(&cq->refcount); + spin_unlock_irq(&c2dev->lock); + + wait_event(cq->wait, !atomic_read(&cq->refcount)); + + vq_req = vq_req_alloc(c2dev); + if (!vq_req) { + goto bail0; + } + + memset(&wr, 0, sizeof(wr)); + c2_wr_set_id(&wr, CCWR_CQ_DESTROY); + wr.hdr.context = (unsigned long) vq_req; + wr.rnic_handle = c2dev->adapter_handle; + wr.cq_handle = cq->adapter_handle; + + vq_req_get(c2dev, vq_req); + + err = vq_send_wr(c2dev, (union c2wr *) & wr); + if (err) { + vq_req_put(c2dev, vq_req); + goto bail1; + } + + err = vq_wait_for_reply(c2dev, vq_req); + if (err) + goto bail1; + + reply = (struct c2wr_cq_destroy_rep *) (unsigned long) (vq_req->reply_msg); + if (reply) + vq_repbuf_free(c2dev, reply); + bail1: + vq_req_free(c2dev, vq_req); + bail0: + if (cq->is_kernel) { + c2_free_cq_buf(c2dev, &cq->mq); + } + + return; +} diff --git a/kernel/drivers/infiniband/hw/amso1100/c2_intr.c b/kernel/drivers/infiniband/hw/amso1100/c2_intr.c new file mode 100644 index 000000000..3a17d9b36 --- /dev/null +++ b/kernel/drivers/infiniband/hw/amso1100/c2_intr.c @@ -0,0 +1,219 @@ +/* + * Copyright (c) 2005 Ammasso, Inc. All rights reserved. + * Copyright (c) 2005 Open Grid Computing, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "c2.h" +#include +#include "c2_vq.h" + +static void handle_mq(struct c2_dev *c2dev, u32 index); +static void handle_vq(struct c2_dev *c2dev, u32 mq_index); + +/* + * Handle RNIC interrupts + */ +void c2_rnic_interrupt(struct c2_dev *c2dev) +{ + unsigned int mq_index; + + while (c2dev->hints_read != be16_to_cpu(*c2dev->hint_count)) { + mq_index = readl(c2dev->regs + PCI_BAR0_HOST_HINT); + if (mq_index & 0x80000000) { + break; + } + + c2dev->hints_read++; + handle_mq(c2dev, mq_index); + } + +} + +/* + * Top level MQ handler + */ +static void handle_mq(struct c2_dev *c2dev, u32 mq_index) +{ + if (c2dev->qptr_array[mq_index] == NULL) { + pr_debug("handle_mq: stray activity for mq_index=%d\n", + mq_index); + return; + } + + switch (mq_index) { + case (0): + /* + * An index of 0 in the activity queue + * indicates the req vq now has messages + * available... + * + * Wake up any waiters waiting on req VQ + * message availability. 
+ */ + wake_up(&c2dev->req_vq_wo); + break; + case (1): + handle_vq(c2dev, mq_index); + break; + case (2): + /* We have to purge the VQ in case there are pending + * accept reply requests that would result in the + * generation of an ESTABLISHED event. If we don't + * generate these first, a CLOSE event could end up + * being delivered before the ESTABLISHED event. + */ + handle_vq(c2dev, 1); + + c2_ae_event(c2dev, mq_index); + break; + default: + /* There is no event synchronization between CQ events + * and AE or CM events. In fact, CQE could be + * delivered for all of the I/O up to and including the + * FLUSH for a peer disconenct prior to the ESTABLISHED + * event being delivered to the app. The reason for this + * is that CM events are delivered on a thread, while AE + * and CM events are delivered on interrupt context. + */ + c2_cq_event(c2dev, mq_index); + break; + } + + return; +} + +/* + * Handles verbs WR replies. + */ +static void handle_vq(struct c2_dev *c2dev, u32 mq_index) +{ + void *adapter_msg, *reply_msg; + struct c2wr_hdr *host_msg; + struct c2wr_hdr tmp; + struct c2_mq *reply_vq; + struct c2_vq_req *req; + struct iw_cm_event cm_event; + int err; + + reply_vq = (struct c2_mq *) c2dev->qptr_array[mq_index]; + + /* + * get next msg from mq_index into adapter_msg. + * don't free it yet. + */ + adapter_msg = c2_mq_consume(reply_vq); + if (adapter_msg == NULL) { + return; + } + + host_msg = vq_repbuf_alloc(c2dev); + + /* + * If we can't get a host buffer, then we'll still + * wakeup the waiter, we just won't give him the msg. + * It is assumed the waiter will deal with this... + */ + if (!host_msg) { + pr_debug("handle_vq: no repbufs!\n"); + + /* + * just copy the WR header into a local variable. + * this allows us to still demux on the context + */ + host_msg = &tmp; + memcpy(host_msg, adapter_msg, sizeof(tmp)); + reply_msg = NULL; + } else { + memcpy(host_msg, adapter_msg, reply_vq->msg_size); + reply_msg = host_msg; + } + + /* + * consume the msg from the MQ + */ + c2_mq_free(reply_vq); + + /* + * wakeup the waiter. + */ + req = (struct c2_vq_req *) (unsigned long) host_msg->context; + if (req == NULL) { + /* + * We should never get here, as the adapter should + * never send us a reply that we're not expecting. + */ + if (reply_msg != NULL) + vq_repbuf_free(c2dev, host_msg); + pr_debug("handle_vq: UNEXPECTEDLY got NULL req\n"); + return; + } + + if (reply_msg) + err = c2_errno(reply_msg); + else + err = -ENOMEM; + + if (!err) switch (req->event) { + case IW_CM_EVENT_ESTABLISHED: + c2_set_qp_state(req->qp, + C2_QP_STATE_RTS); + /* + * Until ird/ord negotiation via MPAv2 support is added, send + * max supported values + */ + cm_event.ird = cm_event.ord = 128; + case IW_CM_EVENT_CLOSE: + + /* + * Move the QP to RTS if this is + * the established event + */ + cm_event.event = req->event; + cm_event.status = 0; + cm_event.local_addr = req->cm_id->local_addr; + cm_event.remote_addr = req->cm_id->remote_addr; + cm_event.private_data = NULL; + cm_event.private_data_len = 0; + req->cm_id->event_handler(req->cm_id, &cm_event); + break; + default: + break; + } + + req->reply_msg = (u64) (unsigned long) (reply_msg); + atomic_set(&req->reply_ready, 1); + wake_up(&req->wait_object); + + /* + * If the request was cancelled, then this put will + * free the vq_req memory...and reply_msg!!! 
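[Illustrative aside, not part of the imported source] The comment above points at the ownership rule behind vq_req_get()/vq_req_put(): the sender takes an extra reference before posting the work request, the reply path drops one, and whichever side drops the last reference frees both the request and its reply buffer. The standalone userspace model below shows only that get/put rule; all names are made up, and the real driver does this with atomics across interrupt context.

/* Standalone userspace model of the get/put ownership pattern. */
#include <stdio.h>
#include <stdlib.h>

struct demo_req {
	int refcnt;
	char *reply;			/* stands in for vq_req->reply_msg */
};

static struct demo_req *demo_req_alloc(void)
{
	struct demo_req *r = calloc(1, sizeof(*r));

	if (r)
		r->refcnt = 1;		/* caller's reference */
	return r;
}

static void demo_req_get(struct demo_req *r)
{
	r->refcnt++;
}

static void demo_req_put(struct demo_req *r)
{
	if (--r->refcnt == 0) {
		free(r->reply);		/* last put also owns the reply buffer */
		free(r);
	}
}

int main(void)
{
	struct demo_req *r = demo_req_alloc();

	if (!r)
		return 1;
	demo_req_get(r);	/* extra ref taken before "sending" the WR */
	demo_req_put(r);	/* dropped by the "reply handler" */
	demo_req_put(r);	/* caller's ref: this one frees r and r->reply */
	return 0;
}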
+ */ + vq_req_put(c2dev, req); +} diff --git a/kernel/drivers/infiniband/hw/amso1100/c2_mm.c b/kernel/drivers/infiniband/hw/amso1100/c2_mm.c new file mode 100644 index 000000000..119c4f3d9 --- /dev/null +++ b/kernel/drivers/infiniband/hw/amso1100/c2_mm.c @@ -0,0 +1,377 @@ +/* + * Copyright (c) 2005 Ammasso, Inc. All rights reserved. + * Copyright (c) 2005 Open Grid Computing, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include + +#include "c2.h" +#include "c2_vq.h" + +#define PBL_VIRT 1 +#define PBL_PHYS 2 + +/* + * Send all the PBL messages to convey the remainder of the PBL + * Wait for the adapter's reply on the last one. + * This is indicated by setting the MEM_PBL_COMPLETE in the flags. + * + * NOTE: vq_req is _not_ freed by this function. The VQ Host + * Reply buffer _is_ freed by this function. + */ +static int +send_pbl_messages(struct c2_dev *c2dev, __be32 stag_index, + unsigned long va, u32 pbl_depth, + struct c2_vq_req *vq_req, int pbl_type) +{ + u32 pbe_count; /* amt that fits in a PBL msg */ + u32 count; /* amt in this PBL MSG. */ + struct c2wr_nsmr_pbl_req *wr; /* PBL WR ptr */ + struct c2wr_nsmr_pbl_rep *reply; /* reply ptr */ + int err, pbl_virt, pbl_index, i; + + switch (pbl_type) { + case PBL_VIRT: + pbl_virt = 1; + break; + case PBL_PHYS: + pbl_virt = 0; + break; + default: + return -EINVAL; + break; + } + + pbe_count = (c2dev->req_vq.msg_size - + sizeof(struct c2wr_nsmr_pbl_req)) / sizeof(u64); + wr = kmalloc(c2dev->req_vq.msg_size, GFP_KERNEL); + if (!wr) { + return -ENOMEM; + } + c2_wr_set_id(wr, CCWR_NSMR_PBL); + + /* + * Only the last PBL message will generate a reply from the verbs, + * so we set the context to 0 indicating there is no kernel verbs + * handler blocked awaiting this reply. + */ + wr->hdr.context = 0; + wr->rnic_handle = c2dev->adapter_handle; + wr->stag_index = stag_index; /* already swapped */ + wr->flags = 0; + pbl_index = 0; + while (pbl_depth) { + count = min(pbe_count, pbl_depth); + wr->addrs_length = cpu_to_be32(count); + + /* + * If this is the last message, then reference the + * vq request struct cuz we're gonna wait for a reply. + * also make this PBL msg as the last one. + */ + if (count == pbl_depth) { + /* + * reference the request struct. 
dereferenced in the + * int handler. + */ + vq_req_get(c2dev, vq_req); + wr->flags = cpu_to_be32(MEM_PBL_COMPLETE); + + /* + * This is the last PBL message. + * Set the context to our VQ Request Object so we can + * wait for the reply. + */ + wr->hdr.context = (unsigned long) vq_req; + } + + /* + * If pbl_virt is set then va is a virtual address + * that describes a virtually contiguous memory + * allocation. The wr needs the start of each virtual page + * to be converted to the corresponding physical address + * of the page. If pbl_virt is not set then va is an array + * of physical addresses and there is no conversion to do. + * Just fill in the wr with what is in the array. + */ + for (i = 0; i < count; i++) { + if (pbl_virt) { + va += PAGE_SIZE; + } else { + wr->paddrs[i] = + cpu_to_be64(((u64 *)va)[pbl_index + i]); + } + } + + /* + * Send WR to adapter + */ + err = vq_send_wr(c2dev, (union c2wr *) wr); + if (err) { + if (count <= pbe_count) { + vq_req_put(c2dev, vq_req); + } + goto bail0; + } + pbl_depth -= count; + pbl_index += count; + } + + /* + * Now wait for the reply... + */ + err = vq_wait_for_reply(c2dev, vq_req); + if (err) { + goto bail0; + } + + /* + * Process reply + */ + reply = (struct c2wr_nsmr_pbl_rep *) (unsigned long) vq_req->reply_msg; + if (!reply) { + err = -ENOMEM; + goto bail0; + } + + err = c2_errno(reply); + + vq_repbuf_free(c2dev, reply); + bail0: + kfree(wr); + return err; +} + +#define C2_PBL_MAX_DEPTH 131072 +int +c2_nsmr_register_phys_kern(struct c2_dev *c2dev, u64 *addr_list, + int page_size, int pbl_depth, u32 length, + u32 offset, u64 *va, enum c2_acf acf, + struct c2_mr *mr) +{ + struct c2_vq_req *vq_req; + struct c2wr_nsmr_register_req *wr; + struct c2wr_nsmr_register_rep *reply; + u16 flags; + int i, pbe_count, count; + int err; + + if (!va || !length || !addr_list || !pbl_depth) + return -EINTR; + + /* + * Verify PBL depth is within rnic max + */ + if (pbl_depth > C2_PBL_MAX_DEPTH) { + return -EINTR; + } + + /* + * allocate verbs request object + */ + vq_req = vq_req_alloc(c2dev); + if (!vq_req) + return -ENOMEM; + + wr = kmalloc(c2dev->req_vq.msg_size, GFP_KERNEL); + if (!wr) { + err = -ENOMEM; + goto bail0; + } + + /* + * build the WR + */ + c2_wr_set_id(wr, CCWR_NSMR_REGISTER); + wr->hdr.context = (unsigned long) vq_req; + wr->rnic_handle = c2dev->adapter_handle; + + flags = (acf | MEM_VA_BASED | MEM_REMOTE); + + /* + * compute how many pbes can fit in the message + */ + pbe_count = (c2dev->req_vq.msg_size - + sizeof(struct c2wr_nsmr_register_req)) / sizeof(u64); + + if (pbl_depth <= pbe_count) { + flags |= MEM_PBL_COMPLETE; + } + wr->flags = cpu_to_be16(flags); + wr->stag_key = 0; //stag_key; + wr->va = cpu_to_be64(*va); + wr->pd_id = mr->pd->pd_id; + wr->pbe_size = cpu_to_be32(page_size); + wr->length = cpu_to_be32(length); + wr->pbl_depth = cpu_to_be32(pbl_depth); + wr->fbo = cpu_to_be32(offset); + count = min(pbl_depth, pbe_count); + wr->addrs_length = cpu_to_be32(count); + + /* + * fill out the PBL for this message + */ + for (i = 0; i < count; i++) { + wr->paddrs[i] = cpu_to_be64(addr_list[i]); + } + + /* + * regerence the request struct + */ + vq_req_get(c2dev, vq_req); + + /* + * send the WR to the adapter + */ + err = vq_send_wr(c2dev, (union c2wr *) wr); + if (err) { + vq_req_put(c2dev, vq_req); + goto bail1; + } + + /* + * wait for reply from adapter + */ + err = vq_wait_for_reply(c2dev, vq_req); + if (err) { + goto bail1; + } + + /* + * process reply + */ + reply = + (struct c2wr_nsmr_register_rep *) (unsigned long) 
(vq_req->reply_msg); + if (!reply) { + err = -ENOMEM; + goto bail1; + } + if ((err = c2_errno(reply))) { + goto bail2; + } + //*p_pb_entries = be32_to_cpu(reply->pbl_depth); + mr->ibmr.lkey = mr->ibmr.rkey = be32_to_cpu(reply->stag_index); + vq_repbuf_free(c2dev, reply); + + /* + * if there are still more PBEs we need to send them to + * the adapter and wait for a reply on the final one. + * reuse vq_req for this purpose. + */ + pbl_depth -= count; + if (pbl_depth) { + + vq_req->reply_msg = (unsigned long) NULL; + atomic_set(&vq_req->reply_ready, 0); + err = send_pbl_messages(c2dev, + cpu_to_be32(mr->ibmr.lkey), + (unsigned long) &addr_list[i], + pbl_depth, vq_req, PBL_PHYS); + if (err) { + goto bail1; + } + } + + vq_req_free(c2dev, vq_req); + kfree(wr); + + return err; + + bail2: + vq_repbuf_free(c2dev, reply); + bail1: + kfree(wr); + bail0: + vq_req_free(c2dev, vq_req); + return err; +} + +int c2_stag_dealloc(struct c2_dev *c2dev, u32 stag_index) +{ + struct c2_vq_req *vq_req; /* verbs request object */ + struct c2wr_stag_dealloc_req wr; /* work request */ + struct c2wr_stag_dealloc_rep *reply; /* WR reply */ + int err; + + + /* + * allocate verbs request object + */ + vq_req = vq_req_alloc(c2dev); + if (!vq_req) { + return -ENOMEM; + } + + /* + * Build the WR + */ + c2_wr_set_id(&wr, CCWR_STAG_DEALLOC); + wr.hdr.context = (u64) (unsigned long) vq_req; + wr.rnic_handle = c2dev->adapter_handle; + wr.stag_index = cpu_to_be32(stag_index); + + /* + * reference the request struct. dereferenced in the int handler. + */ + vq_req_get(c2dev, vq_req); + + /* + * Send WR to adapter + */ + err = vq_send_wr(c2dev, (union c2wr *) & wr); + if (err) { + vq_req_put(c2dev, vq_req); + goto bail0; + } + + /* + * Wait for reply from adapter + */ + err = vq_wait_for_reply(c2dev, vq_req); + if (err) { + goto bail0; + } + + /* + * Process reply + */ + reply = (struct c2wr_stag_dealloc_rep *) (unsigned long) vq_req->reply_msg; + if (!reply) { + err = -ENOMEM; + goto bail0; + } + + err = c2_errno(reply); + + vq_repbuf_free(c2dev, reply); + bail0: + vq_req_free(c2dev, vq_req); + return err; +} diff --git a/kernel/drivers/infiniband/hw/amso1100/c2_mq.c b/kernel/drivers/infiniband/hw/amso1100/c2_mq.c new file mode 100644 index 000000000..0cddc49be --- /dev/null +++ b/kernel/drivers/infiniband/hw/amso1100/c2_mq.c @@ -0,0 +1,174 @@ +/* + * Copyright (c) 2005 Ammasso, Inc. All rights reserved. + * Copyright (c) 2005 Open Grid Computing, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "c2.h" +#include "c2_mq.h" + +void *c2_mq_alloc(struct c2_mq *q) +{ + BUG_ON(q->magic != C2_MQ_MAGIC); + BUG_ON(q->type != C2_MQ_ADAPTER_TARGET); + + if (c2_mq_full(q)) { + return NULL; + } else { +#ifdef DEBUG + struct c2wr_hdr *m = + (struct c2wr_hdr *) (q->msg_pool.host + q->priv * q->msg_size); +#ifdef CCMSGMAGIC + BUG_ON(m->magic != be32_to_cpu(~CCWR_MAGIC)); + m->magic = cpu_to_be32(CCWR_MAGIC); +#endif + return m; +#else + return q->msg_pool.host + q->priv * q->msg_size; +#endif + } +} + +void c2_mq_produce(struct c2_mq *q) +{ + BUG_ON(q->magic != C2_MQ_MAGIC); + BUG_ON(q->type != C2_MQ_ADAPTER_TARGET); + + if (!c2_mq_full(q)) { + q->priv = (q->priv + 1) % q->q_size; + q->hint_count++; + /* Update peer's offset. */ + __raw_writew((__force u16) cpu_to_be16(q->priv), &q->peer->shared); + } +} + +void *c2_mq_consume(struct c2_mq *q) +{ + BUG_ON(q->magic != C2_MQ_MAGIC); + BUG_ON(q->type != C2_MQ_HOST_TARGET); + + if (c2_mq_empty(q)) { + return NULL; + } else { +#ifdef DEBUG + struct c2wr_hdr *m = (struct c2wr_hdr *) + (q->msg_pool.host + q->priv * q->msg_size); +#ifdef CCMSGMAGIC + BUG_ON(m->magic != be32_to_cpu(CCWR_MAGIC)); +#endif + return m; +#else + return q->msg_pool.host + q->priv * q->msg_size; +#endif + } +} + +void c2_mq_free(struct c2_mq *q) +{ + BUG_ON(q->magic != C2_MQ_MAGIC); + BUG_ON(q->type != C2_MQ_HOST_TARGET); + + if (!c2_mq_empty(q)) { + +#ifdef CCMSGMAGIC + { + struct c2wr_hdr __iomem *m = (struct c2wr_hdr __iomem *) + (q->msg_pool.adapter + q->priv * q->msg_size); + __raw_writel(cpu_to_be32(~CCWR_MAGIC), &m->magic); + } +#endif + q->priv = (q->priv + 1) % q->q_size; + /* Update peer's offset. */ + __raw_writew((__force u16) cpu_to_be16(q->priv), &q->peer->shared); + } +} + + +void c2_mq_lconsume(struct c2_mq *q, u32 wqe_count) +{ + BUG_ON(q->magic != C2_MQ_MAGIC); + BUG_ON(q->type != C2_MQ_ADAPTER_TARGET); + + while (wqe_count--) { + BUG_ON(c2_mq_empty(q)); + *q->shared = cpu_to_be16((be16_to_cpu(*q->shared)+1) % q->q_size); + } +} + +#if 0 +u32 c2_mq_count(struct c2_mq *q) +{ + s32 count; + + if (q->type == C2_MQ_HOST_TARGET) + count = be16_to_cpu(*q->shared) - q->priv; + else + count = q->priv - be16_to_cpu(*q->shared); + + if (count < 0) + count += q->q_size; + + return (u32) count; +} +#endif /* 0 */ + +void c2_mq_req_init(struct c2_mq *q, u32 index, u32 q_size, u32 msg_size, + u8 __iomem *pool_start, u16 __iomem *peer, u32 type) +{ + BUG_ON(!q->shared); + + /* This code assumes the byte swapping has already been done! */ + q->index = index; + q->q_size = q_size; + q->msg_size = msg_size; + q->msg_pool.adapter = pool_start; + q->peer = (struct c2_mq_shared __iomem *) peer; + q->magic = C2_MQ_MAGIC; + q->type = type; + q->priv = 0; + q->hint_count = 0; + return; +} +void c2_mq_rep_init(struct c2_mq *q, u32 index, u32 q_size, u32 msg_size, + u8 *pool_start, u16 __iomem *peer, u32 type) +{ + BUG_ON(!q->shared); + + /* This code assumes the byte swapping has already been done! 
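[Illustrative aside, not part of the imported source] The produce/consume/empty/full helpers above all reduce to one ring-buffer rule: each side advances a private index modulo the queue size, publishes it through a shared counter, and one slot is always left unused so "full" and "empty" stay distinguishable. The standalone userspace model below folds the host-target and adapter-target cases into a single classic ring and ignores the byte swapping and MMIO writes the driver has to do; every name is made up.

/* Standalone userspace model of the ring-index rules used by the c2 MQs. */
#include <stdio.h>

#define Q_SIZE 8

struct demo_mq {
	unsigned int priv;	/* producer's next slot */
	unsigned int shared;	/* consumer's published position */
};

static int demo_mq_empty(const struct demo_mq *q)
{
	return q->priv == q->shared;
}

static int demo_mq_full(const struct demo_mq *q)
{
	return q->priv == (q->shared + Q_SIZE - 1) % Q_SIZE;
}

static int demo_mq_produce(struct demo_mq *q)
{
	if (demo_mq_full(q))
		return -1;
	q->priv = (q->priv + 1) % Q_SIZE;	/* publish one more message */
	return 0;
}

static void demo_mq_consume(struct demo_mq *q)
{
	if (!demo_mq_empty(q))
		q->shared = (q->shared + 1) % Q_SIZE;
}

int main(void)
{
	struct demo_mq q = { 0, 0 };
	int produced = 0;

	while (demo_mq_produce(&q) == 0)
		produced++;
	printf("queue of %d slots holds %d messages before it is full\n",
	       Q_SIZE, produced);			/* prints 7 */
	demo_mq_consume(&q);
	printf("still full after one consume? %s\n",
	       demo_mq_full(&q) ? "yes" : "no");	/* prints no */
	return 0;
}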
*/ + q->index = index; + q->q_size = q_size; + q->msg_size = msg_size; + q->msg_pool.host = pool_start; + q->peer = (struct c2_mq_shared __iomem *) peer; + q->magic = C2_MQ_MAGIC; + q->type = type; + q->priv = 0; + q->hint_count = 0; + return; +} diff --git a/kernel/drivers/infiniband/hw/amso1100/c2_mq.h b/kernel/drivers/infiniband/hw/amso1100/c2_mq.h new file mode 100644 index 000000000..fc1b9a7ce --- /dev/null +++ b/kernel/drivers/infiniband/hw/amso1100/c2_mq.h @@ -0,0 +1,106 @@ +/* + * Copyright (c) 2005 Ammasso, Inc. All rights reserved. + * Copyright (c) 2005 Open Grid Computing, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef _C2_MQ_H_ +#define _C2_MQ_H_ +#include +#include +#include "c2_wr.h" + +enum c2_shared_regs { + + C2_SHARED_ARMED = 0x10, + C2_SHARED_NOTIFY = 0x18, + C2_SHARED_SHARED = 0x40, +}; + +struct c2_mq_shared { + u16 unused1; + u8 armed; + u8 notification_type; + u32 unused2; + u16 shared; + /* Pad to 64 bytes. */ + u8 pad[64 - sizeof(u16) - 2 * sizeof(u8) - sizeof(u32) - sizeof(u16)]; +}; + +enum c2_mq_type { + C2_MQ_HOST_TARGET = 1, + C2_MQ_ADAPTER_TARGET = 2, +}; + +/* + * c2_mq_t is for kernel-mode MQs like the VQs Cand the AEQ. + * c2_user_mq_t (which is the same format) is for user-mode MQs... 
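[Illustrative aside, not part of the imported source] The BUG_ON() sanity checks in c2_mq.c above compare against C2_MQ_MAGIC, which is nothing more than the ASCII bytes 'M', 'Q', ' ', ' ' packed big-endian into a 32-bit word. A tiny standalone check:

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	uint32_t magic = ((uint32_t)'M' << 24) | ((uint32_t)'Q' << 16) |
			 ((uint32_t)' ' << 8)  |  (uint32_t)' ';

	printf("0x%08x\n", magic);	/* prints 0x4d512020 */
	return 0;
}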
+ */ +#define C2_MQ_MAGIC 0x4d512020 /* 'MQ ' */ +struct c2_mq { + u32 magic; + union { + u8 *host; + u8 __iomem *adapter; + } msg_pool; + dma_addr_t host_dma; + DEFINE_DMA_UNMAP_ADDR(mapping); + u16 hint_count; + u16 priv; + struct c2_mq_shared __iomem *peer; + __be16 *shared; + dma_addr_t shared_dma; + u32 q_size; + u32 msg_size; + u32 index; + enum c2_mq_type type; +}; + +static __inline__ int c2_mq_empty(struct c2_mq *q) +{ + return q->priv == be16_to_cpu(*q->shared); +} + +static __inline__ int c2_mq_full(struct c2_mq *q) +{ + return q->priv == (be16_to_cpu(*q->shared) + q->q_size - 1) % q->q_size; +} + +extern void c2_mq_lconsume(struct c2_mq *q, u32 wqe_count); +extern void *c2_mq_alloc(struct c2_mq *q); +extern void c2_mq_produce(struct c2_mq *q); +extern void *c2_mq_consume(struct c2_mq *q); +extern void c2_mq_free(struct c2_mq *q); +extern void c2_mq_req_init(struct c2_mq *q, u32 index, u32 q_size, u32 msg_size, + u8 __iomem *pool_start, u16 __iomem *peer, u32 type); +extern void c2_mq_rep_init(struct c2_mq *q, u32 index, u32 q_size, u32 msg_size, + u8 *pool_start, u16 __iomem *peer, u32 type); + +#endif /* _C2_MQ_H_ */ diff --git a/kernel/drivers/infiniband/hw/amso1100/c2_pd.c b/kernel/drivers/infiniband/hw/amso1100/c2_pd.c new file mode 100644 index 000000000..f3e81dc35 --- /dev/null +++ b/kernel/drivers/infiniband/hw/amso1100/c2_pd.c @@ -0,0 +1,90 @@ +/* + * Copyright (c) 2004 Topspin Communications. All rights reserved. + * Copyright (c) 2005 Cisco Systems. All rights reserved. + * Copyright (c) 2005 Mellanox Technologies. All rights reserved. + * Copyright (c) 2005 Open Grid Computing, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include +#include +#include + +#include "c2.h" +#include "c2_provider.h" + +int c2_pd_alloc(struct c2_dev *c2dev, int privileged, struct c2_pd *pd) +{ + u32 obj; + int ret = 0; + + spin_lock(&c2dev->pd_table.lock); + obj = find_next_zero_bit(c2dev->pd_table.table, c2dev->pd_table.max, + c2dev->pd_table.last); + if (obj >= c2dev->pd_table.max) + obj = find_first_zero_bit(c2dev->pd_table.table, + c2dev->pd_table.max); + if (obj < c2dev->pd_table.max) { + pd->pd_id = obj; + __set_bit(obj, c2dev->pd_table.table); + c2dev->pd_table.last = obj+1; + if (c2dev->pd_table.last >= c2dev->pd_table.max) + c2dev->pd_table.last = 0; + } else + ret = -ENOMEM; + spin_unlock(&c2dev->pd_table.lock); + return ret; +} + +void c2_pd_free(struct c2_dev *c2dev, struct c2_pd *pd) +{ + spin_lock(&c2dev->pd_table.lock); + __clear_bit(pd->pd_id, c2dev->pd_table.table); + spin_unlock(&c2dev->pd_table.lock); +} + +int c2_init_pd_table(struct c2_dev *c2dev) +{ + + c2dev->pd_table.last = 0; + c2dev->pd_table.max = c2dev->props.max_pd; + spin_lock_init(&c2dev->pd_table.lock); + c2dev->pd_table.table = kmalloc(BITS_TO_LONGS(c2dev->props.max_pd) * + sizeof(long), GFP_KERNEL); + if (!c2dev->pd_table.table) + return -ENOMEM; + bitmap_zero(c2dev->pd_table.table, c2dev->props.max_pd); + return 0; +} + +void c2_cleanup_pd_table(struct c2_dev *c2dev) +{ + kfree(c2dev->pd_table.table); +} diff --git a/kernel/drivers/infiniband/hw/amso1100/c2_provider.c b/kernel/drivers/infiniband/hw/amso1100/c2_provider.c new file mode 100644 index 000000000..bdf350781 --- /dev/null +++ b/kernel/drivers/infiniband/hw/amso1100/c2_provider.c @@ -0,0 +1,882 @@ +/* + * Copyright (c) 2005 Ammasso, Inc. All rights reserved. + * Copyright (c) 2005 Open Grid Computing, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
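[Illustrative aside, not part of the imported source] c2_pd_alloc() above hands out protection-domain ids round-robin: it scans the bitmap forward from wherever the last allocation left off, wraps to the start once, marks the bit, and remembers the position after the winner. The standalone userspace model below shows only that policy; a plain char array replaces the kernel bitmap and there is no spinlock, and all names are made up.

/* Standalone userspace model of the round-robin id allocator. */
#include <stdio.h>

#define MAX_IDS 8

static char used[MAX_IDS];	/* 1 = id handed out */
static unsigned int last;	/* next position to try */

static int demo_id_alloc(void)
{
	unsigned int i, id;

	for (i = 0; i < MAX_IDS; i++) {
		id = (last + i) % MAX_IDS;
		if (!used[id]) {
			used[id] = 1;
			last = (id + 1) % MAX_IDS;
			return (int)id;
		}
	}
	return -1;	/* table exhausted, like -ENOMEM in the driver */
}

static void demo_id_free(int id)
{
	used[id] = 0;
}

int main(void)
{
	int a = demo_id_alloc();	/* 0 */
	int b = demo_id_alloc();	/* 1 */

	demo_id_free(a);
	/* Round-robin: the freed id 0 is not reused immediately. */
	printf("a=%d b=%d next=%d\n", a, b, demo_id_alloc());	/* next=2 */
	return 0;
}

The same scan-from-last policy spreads ids across the table instead of constantly recycling the lowest free one, which keeps recently freed handles "cool" for a while.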
+ * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#include +#include +#include +#include "c2.h" +#include "c2_provider.h" +#include "c2_user.h" + +static int c2_query_device(struct ib_device *ibdev, + struct ib_device_attr *props) +{ + struct c2_dev *c2dev = to_c2dev(ibdev); + + pr_debug("%s:%u\n", __func__, __LINE__); + + *props = c2dev->props; + return 0; +} + +static int c2_query_port(struct ib_device *ibdev, + u8 port, struct ib_port_attr *props) +{ + pr_debug("%s:%u\n", __func__, __LINE__); + + props->max_mtu = IB_MTU_4096; + props->lid = 0; + props->lmc = 0; + props->sm_lid = 0; + props->sm_sl = 0; + props->state = IB_PORT_ACTIVE; + props->phys_state = 0; + props->port_cap_flags = + IB_PORT_CM_SUP | + IB_PORT_REINIT_SUP | + IB_PORT_VENDOR_CLASS_SUP | IB_PORT_BOOT_MGMT_SUP; + props->gid_tbl_len = 1; + props->pkey_tbl_len = 1; + props->qkey_viol_cntr = 0; + props->active_width = 1; + props->active_speed = IB_SPEED_SDR; + + return 0; +} + +static int c2_query_pkey(struct ib_device *ibdev, + u8 port, u16 index, u16 * pkey) +{ + pr_debug("%s:%u\n", __func__, __LINE__); + *pkey = 0; + return 0; +} + +static int c2_query_gid(struct ib_device *ibdev, u8 port, + int index, union ib_gid *gid) +{ + struct c2_dev *c2dev = to_c2dev(ibdev); + + pr_debug("%s:%u\n", __func__, __LINE__); + memset(&(gid->raw[0]), 0, sizeof(gid->raw)); + memcpy(&(gid->raw[0]), c2dev->pseudo_netdev->dev_addr, 6); + + return 0; +} + +/* Allocate the user context data structure. This keeps track + * of all objects associated with a particular user-mode client. + */ +static struct ib_ucontext *c2_alloc_ucontext(struct ib_device *ibdev, + struct ib_udata *udata) +{ + struct c2_ucontext *context; + + pr_debug("%s:%u\n", __func__, __LINE__); + context = kmalloc(sizeof(*context), GFP_KERNEL); + if (!context) + return ERR_PTR(-ENOMEM); + + return &context->ibucontext; +} + +static int c2_dealloc_ucontext(struct ib_ucontext *context) +{ + pr_debug("%s:%u\n", __func__, __LINE__); + kfree(context); + return 0; +} + +static int c2_mmap_uar(struct ib_ucontext *context, struct vm_area_struct *vma) +{ + pr_debug("%s:%u\n", __func__, __LINE__); + return -ENOSYS; +} + +static struct ib_pd *c2_alloc_pd(struct ib_device *ibdev, + struct ib_ucontext *context, + struct ib_udata *udata) +{ + struct c2_pd *pd; + int err; + + pr_debug("%s:%u\n", __func__, __LINE__); + + pd = kmalloc(sizeof(*pd), GFP_KERNEL); + if (!pd) + return ERR_PTR(-ENOMEM); + + err = c2_pd_alloc(to_c2dev(ibdev), !context, pd); + if (err) { + kfree(pd); + return ERR_PTR(err); + } + + if (context) { + if (ib_copy_to_udata(udata, &pd->pd_id, sizeof(__u32))) { + c2_pd_free(to_c2dev(ibdev), pd); + kfree(pd); + return ERR_PTR(-EFAULT); + } + } + + return &pd->ibpd; +} + +static int c2_dealloc_pd(struct ib_pd *pd) +{ + pr_debug("%s:%u\n", __func__, __LINE__); + c2_pd_free(to_c2dev(pd->device), to_c2pd(pd)); + kfree(pd); + + return 0; +} + +static struct ib_ah *c2_ah_create(struct ib_pd *pd, struct ib_ah_attr *ah_attr) +{ + pr_debug("%s:%u\n", __func__, __LINE__); + return ERR_PTR(-ENOSYS); +} + +static int c2_ah_destroy(struct ib_ah *ah) +{ + pr_debug("%s:%u\n", __func__, __LINE__); + return -ENOSYS; +} + +static void c2_add_ref(struct ib_qp *ibqp) +{ + struct c2_qp *qp; + BUG_ON(!ibqp); + qp = to_c2qp(ibqp); + atomic_inc(&qp->refcount); +} + +static void c2_rem_ref(struct ib_qp *ibqp) +{ + 
struct c2_qp *qp; + BUG_ON(!ibqp); + qp = to_c2qp(ibqp); + if (atomic_dec_and_test(&qp->refcount)) + wake_up(&qp->wait); +} + +struct ib_qp *c2_get_qp(struct ib_device *device, int qpn) +{ + struct c2_dev* c2dev = to_c2dev(device); + struct c2_qp *qp; + + qp = c2_find_qpn(c2dev, qpn); + pr_debug("%s Returning QP=%p for QPN=%d, device=%p, refcount=%d\n", + __func__, qp, qpn, device, + (qp?atomic_read(&qp->refcount):0)); + + return (qp?&qp->ibqp:NULL); +} + +static struct ib_qp *c2_create_qp(struct ib_pd *pd, + struct ib_qp_init_attr *init_attr, + struct ib_udata *udata) +{ + struct c2_qp *qp; + int err; + + pr_debug("%s:%u\n", __func__, __LINE__); + + if (init_attr->create_flags) + return ERR_PTR(-EINVAL); + + switch (init_attr->qp_type) { + case IB_QPT_RC: + qp = kzalloc(sizeof(*qp), GFP_KERNEL); + if (!qp) { + pr_debug("%s: Unable to allocate QP\n", __func__); + return ERR_PTR(-ENOMEM); + } + spin_lock_init(&qp->lock); + if (pd->uobject) { + /* userspace specific */ + } + + err = c2_alloc_qp(to_c2dev(pd->device), + to_c2pd(pd), init_attr, qp); + + if (err && pd->uobject) { + /* userspace specific */ + } + + break; + default: + pr_debug("%s: Invalid QP type: %d\n", __func__, + init_attr->qp_type); + return ERR_PTR(-EINVAL); + } + + if (err) { + kfree(qp); + return ERR_PTR(err); + } + + return &qp->ibqp; +} + +static int c2_destroy_qp(struct ib_qp *ib_qp) +{ + struct c2_qp *qp = to_c2qp(ib_qp); + + pr_debug("%s:%u qp=%p,qp->state=%d\n", + __func__, __LINE__, ib_qp, qp->state); + c2_free_qp(to_c2dev(ib_qp->device), qp); + kfree(qp); + return 0; +} + +static struct ib_cq *c2_create_cq(struct ib_device *ibdev, int entries, int vector, + struct ib_ucontext *context, + struct ib_udata *udata) +{ + struct c2_cq *cq; + int err; + + cq = kmalloc(sizeof(*cq), GFP_KERNEL); + if (!cq) { + pr_debug("%s: Unable to allocate CQ\n", __func__); + return ERR_PTR(-ENOMEM); + } + + err = c2_init_cq(to_c2dev(ibdev), entries, NULL, cq); + if (err) { + pr_debug("%s: error initializing CQ\n", __func__); + kfree(cq); + return ERR_PTR(err); + } + + return &cq->ibcq; +} + +static int c2_destroy_cq(struct ib_cq *ib_cq) +{ + struct c2_cq *cq = to_c2cq(ib_cq); + + pr_debug("%s:%u\n", __func__, __LINE__); + + c2_free_cq(to_c2dev(ib_cq->device), cq); + kfree(cq); + + return 0; +} + +static inline u32 c2_convert_access(int acc) +{ + return (acc & IB_ACCESS_REMOTE_WRITE ? C2_ACF_REMOTE_WRITE : 0) | + (acc & IB_ACCESS_REMOTE_READ ? C2_ACF_REMOTE_READ : 0) | + (acc & IB_ACCESS_LOCAL_WRITE ? C2_ACF_LOCAL_WRITE : 0) | + C2_ACF_LOCAL_READ | C2_ACF_WINDOW_BIND; +} + +static struct ib_mr *c2_reg_phys_mr(struct ib_pd *ib_pd, + struct ib_phys_buf *buffer_list, + int num_phys_buf, int acc, u64 * iova_start) +{ + struct c2_mr *mr; + u64 *page_list; + u32 total_len; + int err, i, j, k, page_shift, pbl_depth; + + pbl_depth = 0; + total_len = 0; + + page_shift = PAGE_SHIFT; + /* + * If there is only 1 buffer we assume this could + * be a map of all phy mem...use a 32k page_shift. 
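[Illustrative aside, not part of the imported source] c2_reg_phys_mr() below this comment turns a set of physical buffers into a flat page list: each buffer's length is rounded up to the page size, the per-buffer page counts are summed into pbl_depth, and one page-aligned address is emitted per page. The standalone userspace sketch below reproduces only that arithmetic with a fixed 4 KiB page and made-up buffer values.

/* Standalone userspace sketch of the page-list arithmetic. */
#include <stdio.h>
#include <stdint.h>

#define DEMO_ALIGN(x, a)	(((x) + (a) - 1) & ~((uint64_t)(a) - 1))

int main(void)
{
	const unsigned int page_shift = 12;		/* 4 KiB pages */
	const uint64_t page = 1ULL << page_shift;
	struct { uint64_t addr, size; } bufs[] = {
		{ 0x100000, 6000 },			/* 2 pages */
		{ 0x200000, 4096 },			/* 1 page  */
	};
	uint64_t pbl_depth = 0, total_len = 0;
	unsigned int i, k;

	for (i = 0; i < 2; i++) {
		total_len += bufs[i].size;
		pbl_depth += DEMO_ALIGN(bufs[i].size, page) >> page_shift;
	}
	printf("total_len=%llu pbl_depth=%llu\n",
	       (unsigned long long)total_len, (unsigned long long)pbl_depth);

	for (i = 0; i < 2; i++) {
		uint64_t pages = DEMO_ALIGN(bufs[i].size, page) >> page_shift;

		for (k = 0; k < pages; k++)
			printf("page list entry: 0x%llx\n",
			       (unsigned long long)(bufs[i].addr +
						    (k << page_shift)));
	}
	return 0;
}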
+ */ + if (num_phys_buf == 1) + page_shift += 3; + + for (i = 0; i < num_phys_buf; i++) { + + if (buffer_list[i].addr & ~PAGE_MASK) { + pr_debug("Unaligned Memory Buffer: 0x%x\n", + (unsigned int) buffer_list[i].addr); + return ERR_PTR(-EINVAL); + } + + if (!buffer_list[i].size) { + pr_debug("Invalid Buffer Size\n"); + return ERR_PTR(-EINVAL); + } + + total_len += buffer_list[i].size; + pbl_depth += ALIGN(buffer_list[i].size, + (1 << page_shift)) >> page_shift; + } + + page_list = vmalloc(sizeof(u64) * pbl_depth); + if (!page_list) { + pr_debug("couldn't vmalloc page_list of size %zd\n", + (sizeof(u64) * pbl_depth)); + return ERR_PTR(-ENOMEM); + } + + for (i = 0, j = 0; i < num_phys_buf; i++) { + + int naddrs; + + naddrs = ALIGN(buffer_list[i].size, + (1 << page_shift)) >> page_shift; + for (k = 0; k < naddrs; k++) + page_list[j++] = (buffer_list[i].addr + + (k << page_shift)); + } + + mr = kmalloc(sizeof(*mr), GFP_KERNEL); + if (!mr) { + vfree(page_list); + return ERR_PTR(-ENOMEM); + } + + mr->pd = to_c2pd(ib_pd); + mr->umem = NULL; + pr_debug("%s - page shift %d, pbl_depth %d, total_len %u, " + "*iova_start %llx, first pa %llx, last pa %llx\n", + __func__, page_shift, pbl_depth, total_len, + (unsigned long long) *iova_start, + (unsigned long long) page_list[0], + (unsigned long long) page_list[pbl_depth-1]); + err = c2_nsmr_register_phys_kern(to_c2dev(ib_pd->device), page_list, + (1 << page_shift), pbl_depth, + total_len, 0, iova_start, + c2_convert_access(acc), mr); + vfree(page_list); + if (err) { + kfree(mr); + return ERR_PTR(err); + } + + return &mr->ibmr; +} + +static struct ib_mr *c2_get_dma_mr(struct ib_pd *pd, int acc) +{ + struct ib_phys_buf bl; + u64 kva = 0; + + pr_debug("%s:%u\n", __func__, __LINE__); + + /* AMSO1100 limit */ + bl.size = 0xffffffff; + bl.addr = 0; + return c2_reg_phys_mr(pd, &bl, 1, acc, &kva); +} + +static struct ib_mr *c2_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, + u64 virt, int acc, struct ib_udata *udata) +{ + u64 *pages; + u64 kva = 0; + int shift, n, len; + int i, k, entry; + int err = 0; + struct scatterlist *sg; + struct c2_pd *c2pd = to_c2pd(pd); + struct c2_mr *c2mr; + + pr_debug("%s:%u\n", __func__, __LINE__); + + c2mr = kmalloc(sizeof(*c2mr), GFP_KERNEL); + if (!c2mr) + return ERR_PTR(-ENOMEM); + c2mr->pd = c2pd; + + c2mr->umem = ib_umem_get(pd->uobject->context, start, length, acc, 0); + if (IS_ERR(c2mr->umem)) { + err = PTR_ERR(c2mr->umem); + kfree(c2mr); + return ERR_PTR(err); + } + + shift = ffs(c2mr->umem->page_size) - 1; + n = c2mr->umem->nmap; + + pages = kmalloc(n * sizeof(u64), GFP_KERNEL); + if (!pages) { + err = -ENOMEM; + goto err; + } + + i = 0; + for_each_sg(c2mr->umem->sg_head.sgl, sg, c2mr->umem->nmap, entry) { + len = sg_dma_len(sg) >> shift; + for (k = 0; k < len; ++k) { + pages[i++] = + sg_dma_address(sg) + + (c2mr->umem->page_size * k); + } + } + + kva = virt; + err = c2_nsmr_register_phys_kern(to_c2dev(pd->device), + pages, + c2mr->umem->page_size, + i, + length, + ib_umem_offset(c2mr->umem), + &kva, + c2_convert_access(acc), + c2mr); + kfree(pages); + if (err) + goto err; + return &c2mr->ibmr; + +err: + ib_umem_release(c2mr->umem); + kfree(c2mr); + return ERR_PTR(err); +} + +static int c2_dereg_mr(struct ib_mr *ib_mr) +{ + struct c2_mr *mr = to_c2mr(ib_mr); + int err; + + pr_debug("%s:%u\n", __func__, __LINE__); + + err = c2_stag_dealloc(to_c2dev(ib_mr->device), ib_mr->lkey); + if (err) + pr_debug("c2_stag_dealloc failed: %d\n", err); + else { + if (mr->umem) + ib_umem_release(mr->umem); + kfree(mr); + } + + return 
err; +} + +static ssize_t show_rev(struct device *dev, struct device_attribute *attr, + char *buf) +{ + struct c2_dev *c2dev = container_of(dev, struct c2_dev, ibdev.dev); + pr_debug("%s:%u\n", __func__, __LINE__); + return sprintf(buf, "%x\n", c2dev->props.hw_ver); +} + +static ssize_t show_fw_ver(struct device *dev, struct device_attribute *attr, + char *buf) +{ + struct c2_dev *c2dev = container_of(dev, struct c2_dev, ibdev.dev); + pr_debug("%s:%u\n", __func__, __LINE__); + return sprintf(buf, "%x.%x.%x\n", + (int) (c2dev->props.fw_ver >> 32), + (int) (c2dev->props.fw_ver >> 16) & 0xffff, + (int) (c2dev->props.fw_ver & 0xffff)); +} + +static ssize_t show_hca(struct device *dev, struct device_attribute *attr, + char *buf) +{ + pr_debug("%s:%u\n", __func__, __LINE__); + return sprintf(buf, "AMSO1100\n"); +} + +static ssize_t show_board(struct device *dev, struct device_attribute *attr, + char *buf) +{ + pr_debug("%s:%u\n", __func__, __LINE__); + return sprintf(buf, "%.*s\n", 32, "AMSO1100 Board ID"); +} + +static DEVICE_ATTR(hw_rev, S_IRUGO, show_rev, NULL); +static DEVICE_ATTR(fw_ver, S_IRUGO, show_fw_ver, NULL); +static DEVICE_ATTR(hca_type, S_IRUGO, show_hca, NULL); +static DEVICE_ATTR(board_id, S_IRUGO, show_board, NULL); + +static struct device_attribute *c2_dev_attributes[] = { + &dev_attr_hw_rev, + &dev_attr_fw_ver, + &dev_attr_hca_type, + &dev_attr_board_id +}; + +static int c2_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, + int attr_mask, struct ib_udata *udata) +{ + int err; + + err = + c2_qp_modify(to_c2dev(ibqp->device), to_c2qp(ibqp), attr, + attr_mask); + + return err; +} + +static int c2_multicast_attach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid) +{ + pr_debug("%s:%u\n", __func__, __LINE__); + return -ENOSYS; +} + +static int c2_multicast_detach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid) +{ + pr_debug("%s:%u\n", __func__, __LINE__); + return -ENOSYS; +} + +static int c2_process_mad(struct ib_device *ibdev, + int mad_flags, + u8 port_num, + struct ib_wc *in_wc, + struct ib_grh *in_grh, + struct ib_mad *in_mad, struct ib_mad *out_mad) +{ + pr_debug("%s:%u\n", __func__, __LINE__); + return -ENOSYS; +} + +static int c2_connect(struct iw_cm_id *cm_id, struct iw_cm_conn_param *iw_param) +{ + pr_debug("%s:%u\n", __func__, __LINE__); + + /* Request a connection */ + return c2_llp_connect(cm_id, iw_param); +} + +static int c2_accept(struct iw_cm_id *cm_id, struct iw_cm_conn_param *iw_param) +{ + pr_debug("%s:%u\n", __func__, __LINE__); + + /* Accept the new connection */ + return c2_llp_accept(cm_id, iw_param); +} + +static int c2_reject(struct iw_cm_id *cm_id, const void *pdata, u8 pdata_len) +{ + int err; + + pr_debug("%s:%u\n", __func__, __LINE__); + + err = c2_llp_reject(cm_id, pdata, pdata_len); + return err; +} + +static int c2_service_create(struct iw_cm_id *cm_id, int backlog) +{ + int err; + + pr_debug("%s:%u\n", __func__, __LINE__); + err = c2_llp_service_create(cm_id, backlog); + pr_debug("%s:%u err=%d\n", + __func__, __LINE__, + err); + return err; +} + +static int c2_service_destroy(struct iw_cm_id *cm_id) +{ + int err; + pr_debug("%s:%u\n", __func__, __LINE__); + + err = c2_llp_service_destroy(cm_id); + + return err; +} + +static int c2_pseudo_up(struct net_device *netdev) +{ + struct in_device *ind; + struct c2_dev *c2dev = netdev->ml_priv; + + ind = in_dev_get(netdev); + if (!ind) + return 0; + + pr_debug("adding...\n"); + for_ifa(ind) { +#ifdef DEBUG + u8 *ip = (u8 *) & ifa->ifa_address; + + pr_debug("%s: %d.%d.%d.%d\n", + ifa->ifa_label, ip[0], 
ip[1], ip[2], ip[3]); +#endif + c2_add_addr(c2dev, ifa->ifa_address, ifa->ifa_mask); + } + endfor_ifa(ind); + in_dev_put(ind); + + return 0; +} + +static int c2_pseudo_down(struct net_device *netdev) +{ + struct in_device *ind; + struct c2_dev *c2dev = netdev->ml_priv; + + ind = in_dev_get(netdev); + if (!ind) + return 0; + + pr_debug("deleting...\n"); + for_ifa(ind) { +#ifdef DEBUG + u8 *ip = (u8 *) & ifa->ifa_address; + + pr_debug("%s: %d.%d.%d.%d\n", + ifa->ifa_label, ip[0], ip[1], ip[2], ip[3]); +#endif + c2_del_addr(c2dev, ifa->ifa_address, ifa->ifa_mask); + } + endfor_ifa(ind); + in_dev_put(ind); + + return 0; +} + +static int c2_pseudo_xmit_frame(struct sk_buff *skb, struct net_device *netdev) +{ + kfree_skb(skb); + return NETDEV_TX_OK; +} + +static int c2_pseudo_change_mtu(struct net_device *netdev, int new_mtu) +{ + if (new_mtu < ETH_ZLEN || new_mtu > ETH_JUMBO_MTU) + return -EINVAL; + + netdev->mtu = new_mtu; + + /* TODO: Tell rnic about new rmda interface mtu */ + return 0; +} + +static const struct net_device_ops c2_pseudo_netdev_ops = { + .ndo_open = c2_pseudo_up, + .ndo_stop = c2_pseudo_down, + .ndo_start_xmit = c2_pseudo_xmit_frame, + .ndo_change_mtu = c2_pseudo_change_mtu, + .ndo_validate_addr = eth_validate_addr, +}; + +static void setup(struct net_device *netdev) +{ + netdev->netdev_ops = &c2_pseudo_netdev_ops; + + netdev->watchdog_timeo = 0; + netdev->type = ARPHRD_ETHER; + netdev->mtu = 1500; + netdev->hard_header_len = ETH_HLEN; + netdev->addr_len = ETH_ALEN; + netdev->tx_queue_len = 0; + netdev->flags |= IFF_NOARP; +} + +static struct net_device *c2_pseudo_netdev_init(struct c2_dev *c2dev) +{ + char name[IFNAMSIZ]; + struct net_device *netdev; + + /* change ethxxx to iwxxx */ + strcpy(name, "iw"); + strcat(name, &c2dev->netdev->name[3]); + netdev = alloc_netdev(0, name, NET_NAME_UNKNOWN, setup); + if (!netdev) { + printk(KERN_ERR PFX "%s - etherdev alloc failed", + __func__); + return NULL; + } + + netdev->ml_priv = c2dev; + + SET_NETDEV_DEV(netdev, &c2dev->pcidev->dev); + + memcpy_fromio(netdev->dev_addr, c2dev->kva + C2_REGS_RDMA_ENADDR, 6); + + /* Print out the MAC address */ + pr_debug("%s: MAC %pM\n", netdev->name, netdev->dev_addr); + +#if 0 + /* Disable network packets */ + netif_stop_queue(netdev); +#endif + return netdev; +} + +int c2_register_device(struct c2_dev *dev) +{ + int ret = -ENOMEM; + int i; + + /* Register pseudo network device */ + dev->pseudo_netdev = c2_pseudo_netdev_init(dev); + if (!dev->pseudo_netdev) + goto out; + + ret = register_netdev(dev->pseudo_netdev); + if (ret) + goto out_free_netdev; + + pr_debug("%s:%u\n", __func__, __LINE__); + strlcpy(dev->ibdev.name, "amso%d", IB_DEVICE_NAME_MAX); + dev->ibdev.owner = THIS_MODULE; + dev->ibdev.uverbs_cmd_mask = + (1ull << IB_USER_VERBS_CMD_GET_CONTEXT) | + (1ull << IB_USER_VERBS_CMD_QUERY_DEVICE) | + (1ull << IB_USER_VERBS_CMD_QUERY_PORT) | + (1ull << IB_USER_VERBS_CMD_ALLOC_PD) | + (1ull << IB_USER_VERBS_CMD_DEALLOC_PD) | + (1ull << IB_USER_VERBS_CMD_REG_MR) | + (1ull << IB_USER_VERBS_CMD_DEREG_MR) | + (1ull << IB_USER_VERBS_CMD_CREATE_COMP_CHANNEL) | + (1ull << IB_USER_VERBS_CMD_CREATE_CQ) | + (1ull << IB_USER_VERBS_CMD_DESTROY_CQ) | + (1ull << IB_USER_VERBS_CMD_REQ_NOTIFY_CQ) | + (1ull << IB_USER_VERBS_CMD_CREATE_QP) | + (1ull << IB_USER_VERBS_CMD_MODIFY_QP) | + (1ull << IB_USER_VERBS_CMD_POLL_CQ) | + (1ull << IB_USER_VERBS_CMD_DESTROY_QP) | + (1ull << IB_USER_VERBS_CMD_POST_SEND) | + (1ull << IB_USER_VERBS_CMD_POST_RECV); + + dev->ibdev.node_type = RDMA_NODE_RNIC; + 
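[Illustrative aside, not part of the imported source] The uverbs_cmd_mask just built above is a 64-bit set with one bit per user-verbs command index; the core layer later tests individual bits to decide which commands the device advertises. The tiny standalone sketch below shows the build-and-test pattern with made-up enum values.

#include <stdio.h>
#include <stdint.h>

enum demo_cmd { DEMO_CMD_CREATE_QP = 3, DEMO_CMD_CREATE_SRQ = 7 };

int main(void)
{
	uint64_t mask = 0;

	mask |= 1ULL << DEMO_CMD_CREATE_QP;	/* advertise CREATE_QP */

	printf("CREATE_QP supported:  %d\n",
	       (int)((mask >> DEMO_CMD_CREATE_QP) & 1));	/* 1 */
	printf("CREATE_SRQ supported: %d\n",
	       (int)((mask >> DEMO_CMD_CREATE_SRQ) & 1));	/* 0 */
	return 0;
}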
memset(&dev->ibdev.node_guid, 0, sizeof(dev->ibdev.node_guid)); + memcpy(&dev->ibdev.node_guid, dev->pseudo_netdev->dev_addr, 6); + dev->ibdev.phys_port_cnt = 1; + dev->ibdev.num_comp_vectors = 1; + dev->ibdev.dma_device = &dev->pcidev->dev; + dev->ibdev.query_device = c2_query_device; + dev->ibdev.query_port = c2_query_port; + dev->ibdev.query_pkey = c2_query_pkey; + dev->ibdev.query_gid = c2_query_gid; + dev->ibdev.alloc_ucontext = c2_alloc_ucontext; + dev->ibdev.dealloc_ucontext = c2_dealloc_ucontext; + dev->ibdev.mmap = c2_mmap_uar; + dev->ibdev.alloc_pd = c2_alloc_pd; + dev->ibdev.dealloc_pd = c2_dealloc_pd; + dev->ibdev.create_ah = c2_ah_create; + dev->ibdev.destroy_ah = c2_ah_destroy; + dev->ibdev.create_qp = c2_create_qp; + dev->ibdev.modify_qp = c2_modify_qp; + dev->ibdev.destroy_qp = c2_destroy_qp; + dev->ibdev.create_cq = c2_create_cq; + dev->ibdev.destroy_cq = c2_destroy_cq; + dev->ibdev.poll_cq = c2_poll_cq; + dev->ibdev.get_dma_mr = c2_get_dma_mr; + dev->ibdev.reg_phys_mr = c2_reg_phys_mr; + dev->ibdev.reg_user_mr = c2_reg_user_mr; + dev->ibdev.dereg_mr = c2_dereg_mr; + + dev->ibdev.alloc_fmr = NULL; + dev->ibdev.unmap_fmr = NULL; + dev->ibdev.dealloc_fmr = NULL; + dev->ibdev.map_phys_fmr = NULL; + + dev->ibdev.attach_mcast = c2_multicast_attach; + dev->ibdev.detach_mcast = c2_multicast_detach; + dev->ibdev.process_mad = c2_process_mad; + + dev->ibdev.req_notify_cq = c2_arm_cq; + dev->ibdev.post_send = c2_post_send; + dev->ibdev.post_recv = c2_post_receive; + + dev->ibdev.iwcm = kmalloc(sizeof(*dev->ibdev.iwcm), GFP_KERNEL); + if (dev->ibdev.iwcm == NULL) { + ret = -ENOMEM; + goto out_unregister_netdev; + } + dev->ibdev.iwcm->add_ref = c2_add_ref; + dev->ibdev.iwcm->rem_ref = c2_rem_ref; + dev->ibdev.iwcm->get_qp = c2_get_qp; + dev->ibdev.iwcm->connect = c2_connect; + dev->ibdev.iwcm->accept = c2_accept; + dev->ibdev.iwcm->reject = c2_reject; + dev->ibdev.iwcm->create_listen = c2_service_create; + dev->ibdev.iwcm->destroy_listen = c2_service_destroy; + + ret = ib_register_device(&dev->ibdev, NULL); + if (ret) + goto out_free_iwcm; + + for (i = 0; i < ARRAY_SIZE(c2_dev_attributes); ++i) { + ret = device_create_file(&dev->ibdev.dev, + c2_dev_attributes[i]); + if (ret) + goto out_unregister_ibdev; + } + goto out; + +out_unregister_ibdev: + ib_unregister_device(&dev->ibdev); +out_free_iwcm: + kfree(dev->ibdev.iwcm); +out_unregister_netdev: + unregister_netdev(dev->pseudo_netdev); +out_free_netdev: + free_netdev(dev->pseudo_netdev); +out: + pr_debug("%s:%u ret=%d\n", __func__, __LINE__, ret); + return ret; +} + +void c2_unregister_device(struct c2_dev *dev) +{ + pr_debug("%s:%u\n", __func__, __LINE__); + unregister_netdev(dev->pseudo_netdev); + free_netdev(dev->pseudo_netdev); + ib_unregister_device(&dev->ibdev); +} diff --git a/kernel/drivers/infiniband/hw/amso1100/c2_provider.h b/kernel/drivers/infiniband/hw/amso1100/c2_provider.h new file mode 100644 index 000000000..bf1899877 --- /dev/null +++ b/kernel/drivers/infiniband/hw/amso1100/c2_provider.h @@ -0,0 +1,182 @@ +/* + * Copyright (c) 2005 Ammasso, Inc. All rights reserved. + * Copyright (c) 2005 Open Grid Computing, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. 
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ + +#ifndef C2_PROVIDER_H +#define C2_PROVIDER_H +#include + +#include +#include + +#include "c2_mq.h" +#include + +#define C2_MPT_FLAG_ATOMIC (1 << 14) +#define C2_MPT_FLAG_REMOTE_WRITE (1 << 13) +#define C2_MPT_FLAG_REMOTE_READ (1 << 12) +#define C2_MPT_FLAG_LOCAL_WRITE (1 << 11) +#define C2_MPT_FLAG_LOCAL_READ (1 << 10) + +struct c2_buf_list { + void *buf; + DEFINE_DMA_UNMAP_ADDR(mapping); +}; + + +/* The user context keeps track of objects allocated for a + * particular user-mode client. */ +struct c2_ucontext { + struct ib_ucontext ibucontext; +}; + +struct c2_mtt; + +/* All objects associated with a PD are kept in the + * associated user context if present. 
+ */ +struct c2_pd { + struct ib_pd ibpd; + u32 pd_id; +}; + +struct c2_mr { + struct ib_mr ibmr; + struct c2_pd *pd; + struct ib_umem *umem; +}; + +struct c2_av; + +enum c2_ah_type { + C2_AH_ON_HCA, + C2_AH_PCI_POOL, + C2_AH_KMALLOC +}; + +struct c2_ah { + struct ib_ah ibah; +}; + +struct c2_cq { + struct ib_cq ibcq; + spinlock_t lock; + atomic_t refcount; + int cqn; + int is_kernel; + wait_queue_head_t wait; + + u32 adapter_handle; + struct c2_mq mq; +}; + +struct c2_wq { + spinlock_t lock; +}; +struct iw_cm_id; +struct c2_qp { + struct ib_qp ibqp; + struct iw_cm_id *cm_id; + spinlock_t lock; + atomic_t refcount; + wait_queue_head_t wait; + int qpn; + + u32 adapter_handle; + u32 send_sgl_depth; + u32 recv_sgl_depth; + u32 rdma_write_sgl_depth; + u8 state; + + struct c2_mq sq_mq; + struct c2_mq rq_mq; +}; + +struct c2_cr_query_attrs { + u32 local_addr; + u32 remote_addr; + u16 local_port; + u16 remote_port; +}; + +static inline struct c2_pd *to_c2pd(struct ib_pd *ibpd) +{ + return container_of(ibpd, struct c2_pd, ibpd); +} + +static inline struct c2_ucontext *to_c2ucontext(struct ib_ucontext *ibucontext) +{ + return container_of(ibucontext, struct c2_ucontext, ibucontext); +} + +static inline struct c2_mr *to_c2mr(struct ib_mr *ibmr) +{ + return container_of(ibmr, struct c2_mr, ibmr); +} + + +static inline struct c2_ah *to_c2ah(struct ib_ah *ibah) +{ + return container_of(ibah, struct c2_ah, ibah); +} + +static inline struct c2_cq *to_c2cq(struct ib_cq *ibcq) +{ + return container_of(ibcq, struct c2_cq, ibcq); +} + +static inline struct c2_qp *to_c2qp(struct ib_qp *ibqp) +{ + return container_of(ibqp, struct c2_qp, ibqp); +} + +static inline int is_rnic_addr(struct net_device *netdev, u32 addr) +{ + struct in_device *ind; + int ret = 0; + + ind = in_dev_get(netdev); + if (!ind) + return 0; + + for_ifa(ind) { + if (ifa->ifa_address == addr) { + ret = 1; + break; + } + } + endfor_ifa(ind); + in_dev_put(ind); + return ret; +} +#endif /* C2_PROVIDER_H */ diff --git a/kernel/drivers/infiniband/hw/amso1100/c2_qp.c b/kernel/drivers/infiniband/hw/amso1100/c2_qp.c new file mode 100644 index 000000000..86708dee5 --- /dev/null +++ b/kernel/drivers/infiniband/hw/amso1100/c2_qp.c @@ -0,0 +1,1024 @@ +/* + * Copyright (c) 2004 Topspin Communications. All rights reserved. + * Copyright (c) 2005 Cisco Systems. All rights reserved. + * Copyright (c) 2005 Mellanox Technologies. All rights reserved. + * Copyright (c) 2004 Voltaire, Inc. All rights reserved. + * Copyright (c) 2005 Open Grid Computing, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ + +#include +#include + +#include "c2.h" +#include "c2_vq.h" +#include "c2_status.h" + +#define C2_MAX_ORD_PER_QP 128 +#define C2_MAX_IRD_PER_QP 128 + +#define C2_HINT_MAKE(q_index, hint_count) (((q_index) << 16) | hint_count) +#define C2_HINT_GET_INDEX(hint) (((hint) & 0x7FFF0000) >> 16) +#define C2_HINT_GET_COUNT(hint) ((hint) & 0x0000FFFF) + +#define NO_SUPPORT -1 +static const u8 c2_opcode[] = { + [IB_WR_SEND] = C2_WR_TYPE_SEND, + [IB_WR_SEND_WITH_IMM] = NO_SUPPORT, + [IB_WR_RDMA_WRITE] = C2_WR_TYPE_RDMA_WRITE, + [IB_WR_RDMA_WRITE_WITH_IMM] = NO_SUPPORT, + [IB_WR_RDMA_READ] = C2_WR_TYPE_RDMA_READ, + [IB_WR_ATOMIC_CMP_AND_SWP] = NO_SUPPORT, + [IB_WR_ATOMIC_FETCH_AND_ADD] = NO_SUPPORT, +}; + +static int to_c2_state(enum ib_qp_state ib_state) +{ + switch (ib_state) { + case IB_QPS_RESET: + return C2_QP_STATE_IDLE; + case IB_QPS_RTS: + return C2_QP_STATE_RTS; + case IB_QPS_SQD: + return C2_QP_STATE_CLOSING; + case IB_QPS_SQE: + return C2_QP_STATE_CLOSING; + case IB_QPS_ERR: + return C2_QP_STATE_ERROR; + default: + return -1; + } +} + +static int to_ib_state(enum c2_qp_state c2_state) +{ + switch (c2_state) { + case C2_QP_STATE_IDLE: + return IB_QPS_RESET; + case C2_QP_STATE_CONNECTING: + return IB_QPS_RTR; + case C2_QP_STATE_RTS: + return IB_QPS_RTS; + case C2_QP_STATE_CLOSING: + return IB_QPS_SQD; + case C2_QP_STATE_ERROR: + return IB_QPS_ERR; + case C2_QP_STATE_TERMINATE: + return IB_QPS_SQE; + default: + return -1; + } +} + +static const char *to_ib_state_str(int ib_state) +{ + static const char *state_str[] = { + "IB_QPS_RESET", + "IB_QPS_INIT", + "IB_QPS_RTR", + "IB_QPS_RTS", + "IB_QPS_SQD", + "IB_QPS_SQE", + "IB_QPS_ERR" + }; + if (ib_state < IB_QPS_RESET || + ib_state > IB_QPS_ERR) + return ""; + + ib_state -= IB_QPS_RESET; + return state_str[ib_state]; +} + +void c2_set_qp_state(struct c2_qp *qp, int c2_state) +{ + int new_state = to_ib_state(c2_state); + + pr_debug("%s: qp[%p] state modify %s --> %s\n", + __func__, + qp, + to_ib_state_str(qp->state), + to_ib_state_str(new_state)); + qp->state = new_state; +} + +#define C2_QP_NO_ATTR_CHANGE 0xFFFFFFFF + +int c2_qp_modify(struct c2_dev *c2dev, struct c2_qp *qp, + struct ib_qp_attr *attr, int attr_mask) +{ + struct c2wr_qp_modify_req wr; + struct c2wr_qp_modify_rep *reply; + struct c2_vq_req *vq_req; + unsigned long flags; + u8 next_state; + int err; + + pr_debug("%s:%d qp=%p, %s --> %s\n", + __func__, __LINE__, + qp, + to_ib_state_str(qp->state), + to_ib_state_str(attr->qp_state)); + + vq_req = vq_req_alloc(c2dev); + if (!vq_req) + return -ENOMEM; + + c2_wr_set_id(&wr, CCWR_QP_MODIFY); + wr.hdr.context = (unsigned long) vq_req; + wr.rnic_handle = c2dev->adapter_handle; + wr.qp_handle = qp->adapter_handle; + wr.ord = cpu_to_be32(C2_QP_NO_ATTR_CHANGE); + wr.ird = cpu_to_be32(C2_QP_NO_ATTR_CHANGE); + wr.sq_depth = cpu_to_be32(C2_QP_NO_ATTR_CHANGE); + wr.rq_depth = cpu_to_be32(C2_QP_NO_ATTR_CHANGE); + + if (attr_mask & IB_QP_STATE) { + /* Ensure the state is valid */ + if (attr->qp_state < 0 || attr->qp_state > IB_QPS_ERR) { + err = 
-EINVAL; + goto bail0; + } + + wr.next_qp_state = cpu_to_be32(to_c2_state(attr->qp_state)); + + if (attr->qp_state == IB_QPS_ERR) { + spin_lock_irqsave(&qp->lock, flags); + if (qp->cm_id && qp->state == IB_QPS_RTS) { + pr_debug("Generating CLOSE event for QP-->ERR, " + "qp=%p, cm_id=%p\n",qp,qp->cm_id); + /* Generate an CLOSE event */ + vq_req->cm_id = qp->cm_id; + vq_req->event = IW_CM_EVENT_CLOSE; + } + spin_unlock_irqrestore(&qp->lock, flags); + } + next_state = attr->qp_state; + + } else if (attr_mask & IB_QP_CUR_STATE) { + + if (attr->cur_qp_state != IB_QPS_RTR && + attr->cur_qp_state != IB_QPS_RTS && + attr->cur_qp_state != IB_QPS_SQD && + attr->cur_qp_state != IB_QPS_SQE) { + err = -EINVAL; + goto bail0; + } else + wr.next_qp_state = + cpu_to_be32(to_c2_state(attr->cur_qp_state)); + + next_state = attr->cur_qp_state; + + } else { + err = 0; + goto bail0; + } + + /* reference the request struct */ + vq_req_get(c2dev, vq_req); + + err = vq_send_wr(c2dev, (union c2wr *) & wr); + if (err) { + vq_req_put(c2dev, vq_req); + goto bail0; + } + + err = vq_wait_for_reply(c2dev, vq_req); + if (err) + goto bail0; + + reply = (struct c2wr_qp_modify_rep *) (unsigned long) vq_req->reply_msg; + if (!reply) { + err = -ENOMEM; + goto bail0; + } + + err = c2_errno(reply); + if (!err) + qp->state = next_state; +#ifdef DEBUG + else + pr_debug("%s: c2_errno=%d\n", __func__, err); +#endif + /* + * If we're going to error and generating the event here, then + * we need to remove the reference because there will be no + * close event generated by the adapter + */ + spin_lock_irqsave(&qp->lock, flags); + if (vq_req->event==IW_CM_EVENT_CLOSE && qp->cm_id) { + qp->cm_id->rem_ref(qp->cm_id); + qp->cm_id = NULL; + } + spin_unlock_irqrestore(&qp->lock, flags); + + vq_repbuf_free(c2dev, reply); + bail0: + vq_req_free(c2dev, vq_req); + + pr_debug("%s:%d qp=%p, cur_state=%s\n", + __func__, __LINE__, + qp, + to_ib_state_str(qp->state)); + return err; +} + +int c2_qp_set_read_limits(struct c2_dev *c2dev, struct c2_qp *qp, + int ord, int ird) +{ + struct c2wr_qp_modify_req wr; + struct c2wr_qp_modify_rep *reply; + struct c2_vq_req *vq_req; + int err; + + vq_req = vq_req_alloc(c2dev); + if (!vq_req) + return -ENOMEM; + + c2_wr_set_id(&wr, CCWR_QP_MODIFY); + wr.hdr.context = (unsigned long) vq_req; + wr.rnic_handle = c2dev->adapter_handle; + wr.qp_handle = qp->adapter_handle; + wr.ord = cpu_to_be32(ord); + wr.ird = cpu_to_be32(ird); + wr.sq_depth = cpu_to_be32(C2_QP_NO_ATTR_CHANGE); + wr.rq_depth = cpu_to_be32(C2_QP_NO_ATTR_CHANGE); + wr.next_qp_state = cpu_to_be32(C2_QP_NO_ATTR_CHANGE); + + /* reference the request struct */ + vq_req_get(c2dev, vq_req); + + err = vq_send_wr(c2dev, (union c2wr *) & wr); + if (err) { + vq_req_put(c2dev, vq_req); + goto bail0; + } + + err = vq_wait_for_reply(c2dev, vq_req); + if (err) + goto bail0; + + reply = (struct c2wr_qp_modify_rep *) (unsigned long) + vq_req->reply_msg; + if (!reply) { + err = -ENOMEM; + goto bail0; + } + + err = c2_errno(reply); + vq_repbuf_free(c2dev, reply); + bail0: + vq_req_free(c2dev, vq_req); + return err; +} + +static int destroy_qp(struct c2_dev *c2dev, struct c2_qp *qp) +{ + struct c2_vq_req *vq_req; + struct c2wr_qp_destroy_req wr; + struct c2wr_qp_destroy_rep *reply; + unsigned long flags; + int err; + + /* + * Allocate a verb request message + */ + vq_req = vq_req_alloc(c2dev); + if (!vq_req) { + return -ENOMEM; + } + + /* + * Initialize the WR + */ + c2_wr_set_id(&wr, CCWR_QP_DESTROY); + wr.hdr.context = (unsigned long) vq_req; + wr.rnic_handle = 
c2dev->adapter_handle; + wr.qp_handle = qp->adapter_handle; + + /* + * reference the request struct. dereferenced in the int handler. + */ + vq_req_get(c2dev, vq_req); + + spin_lock_irqsave(&qp->lock, flags); + if (qp->cm_id && qp->state == IB_QPS_RTS) { + pr_debug("destroy_qp: generating CLOSE event for QP-->ERR, " + "qp=%p, cm_id=%p\n",qp,qp->cm_id); + /* Generate an CLOSE event */ + vq_req->qp = qp; + vq_req->cm_id = qp->cm_id; + vq_req->event = IW_CM_EVENT_CLOSE; + } + spin_unlock_irqrestore(&qp->lock, flags); + + /* + * Send WR to adapter + */ + err = vq_send_wr(c2dev, (union c2wr *) & wr); + if (err) { + vq_req_put(c2dev, vq_req); + goto bail0; + } + + /* + * Wait for reply from adapter + */ + err = vq_wait_for_reply(c2dev, vq_req); + if (err) { + goto bail0; + } + + /* + * Process reply + */ + reply = (struct c2wr_qp_destroy_rep *) (unsigned long) (vq_req->reply_msg); + if (!reply) { + err = -ENOMEM; + goto bail0; + } + + spin_lock_irqsave(&qp->lock, flags); + if (qp->cm_id) { + qp->cm_id->rem_ref(qp->cm_id); + qp->cm_id = NULL; + } + spin_unlock_irqrestore(&qp->lock, flags); + + vq_repbuf_free(c2dev, reply); + bail0: + vq_req_free(c2dev, vq_req); + return err; +} + +static int c2_alloc_qpn(struct c2_dev *c2dev, struct c2_qp *qp) +{ + int ret; + + idr_preload(GFP_KERNEL); + spin_lock_irq(&c2dev->qp_table.lock); + + ret = idr_alloc_cyclic(&c2dev->qp_table.idr, qp, 0, 0, GFP_NOWAIT); + if (ret >= 0) + qp->qpn = ret; + + spin_unlock_irq(&c2dev->qp_table.lock); + idr_preload_end(); + return ret < 0 ? ret : 0; +} + +static void c2_free_qpn(struct c2_dev *c2dev, int qpn) +{ + spin_lock_irq(&c2dev->qp_table.lock); + idr_remove(&c2dev->qp_table.idr, qpn); + spin_unlock_irq(&c2dev->qp_table.lock); +} + +struct c2_qp *c2_find_qpn(struct c2_dev *c2dev, int qpn) +{ + unsigned long flags; + struct c2_qp *qp; + + spin_lock_irqsave(&c2dev->qp_table.lock, flags); + qp = idr_find(&c2dev->qp_table.idr, qpn); + spin_unlock_irqrestore(&c2dev->qp_table.lock, flags); + return qp; +} + +int c2_alloc_qp(struct c2_dev *c2dev, + struct c2_pd *pd, + struct ib_qp_init_attr *qp_attrs, struct c2_qp *qp) +{ + struct c2wr_qp_create_req wr; + struct c2wr_qp_create_rep *reply; + struct c2_vq_req *vq_req; + struct c2_cq *send_cq = to_c2cq(qp_attrs->send_cq); + struct c2_cq *recv_cq = to_c2cq(qp_attrs->recv_cq); + unsigned long peer_pa; + u32 q_size, msg_size, mmap_size; + void __iomem *mmap; + int err; + + err = c2_alloc_qpn(c2dev, qp); + if (err) + return err; + qp->ibqp.qp_num = qp->qpn; + qp->ibqp.qp_type = IB_QPT_RC; + + /* Allocate the SQ and RQ shared pointers */ + qp->sq_mq.shared = c2_alloc_mqsp(c2dev, c2dev->kern_mqsp_pool, + &qp->sq_mq.shared_dma, GFP_KERNEL); + if (!qp->sq_mq.shared) { + err = -ENOMEM; + goto bail0; + } + + qp->rq_mq.shared = c2_alloc_mqsp(c2dev, c2dev->kern_mqsp_pool, + &qp->rq_mq.shared_dma, GFP_KERNEL); + if (!qp->rq_mq.shared) { + err = -ENOMEM; + goto bail1; + } + + /* Allocate the verbs request */ + vq_req = vq_req_alloc(c2dev); + if (vq_req == NULL) { + err = -ENOMEM; + goto bail2; + } + + /* Initialize the work request */ + memset(&wr, 0, sizeof(wr)); + c2_wr_set_id(&wr, CCWR_QP_CREATE); + wr.hdr.context = (unsigned long) vq_req; + wr.rnic_handle = c2dev->adapter_handle; + wr.sq_cq_handle = send_cq->adapter_handle; + wr.rq_cq_handle = recv_cq->adapter_handle; + wr.sq_depth = cpu_to_be32(qp_attrs->cap.max_send_wr + 1); + wr.rq_depth = cpu_to_be32(qp_attrs->cap.max_recv_wr + 1); + wr.srq_handle = 0; + wr.flags = cpu_to_be32(QP_RDMA_READ | QP_RDMA_WRITE | QP_MW_BIND | + QP_ZERO_STAG | 
QP_RDMA_READ_RESPONSE); + wr.send_sgl_depth = cpu_to_be32(qp_attrs->cap.max_send_sge); + wr.recv_sgl_depth = cpu_to_be32(qp_attrs->cap.max_recv_sge); + wr.rdma_write_sgl_depth = cpu_to_be32(qp_attrs->cap.max_send_sge); + wr.shared_sq_ht = cpu_to_be64(qp->sq_mq.shared_dma); + wr.shared_rq_ht = cpu_to_be64(qp->rq_mq.shared_dma); + wr.ord = cpu_to_be32(C2_MAX_ORD_PER_QP); + wr.ird = cpu_to_be32(C2_MAX_IRD_PER_QP); + wr.pd_id = pd->pd_id; + wr.user_context = (unsigned long) qp; + + vq_req_get(c2dev, vq_req); + + /* Send the WR to the adapter */ + err = vq_send_wr(c2dev, (union c2wr *) & wr); + if (err) { + vq_req_put(c2dev, vq_req); + goto bail3; + } + + /* Wait for the verb reply */ + err = vq_wait_for_reply(c2dev, vq_req); + if (err) { + goto bail3; + } + + /* Process the reply */ + reply = (struct c2wr_qp_create_rep *) (unsigned long) (vq_req->reply_msg); + if (!reply) { + err = -ENOMEM; + goto bail3; + } + + if ((err = c2_wr_get_result(reply)) != 0) { + goto bail4; + } + + /* Fill in the kernel QP struct */ + atomic_set(&qp->refcount, 1); + qp->adapter_handle = reply->qp_handle; + qp->state = IB_QPS_RESET; + qp->send_sgl_depth = qp_attrs->cap.max_send_sge; + qp->rdma_write_sgl_depth = qp_attrs->cap.max_send_sge; + qp->recv_sgl_depth = qp_attrs->cap.max_recv_sge; + init_waitqueue_head(&qp->wait); + + /* Initialize the SQ MQ */ + q_size = be32_to_cpu(reply->sq_depth); + msg_size = be32_to_cpu(reply->sq_msg_size); + peer_pa = c2dev->pa + be32_to_cpu(reply->sq_mq_start); + mmap_size = PAGE_ALIGN(sizeof(struct c2_mq_shared) + msg_size * q_size); + mmap = ioremap_nocache(peer_pa, mmap_size); + if (!mmap) { + err = -ENOMEM; + goto bail5; + } + + c2_mq_req_init(&qp->sq_mq, + be32_to_cpu(reply->sq_mq_index), + q_size, + msg_size, + mmap + sizeof(struct c2_mq_shared), /* pool start */ + mmap, /* peer */ + C2_MQ_ADAPTER_TARGET); + + /* Initialize the RQ mq */ + q_size = be32_to_cpu(reply->rq_depth); + msg_size = be32_to_cpu(reply->rq_msg_size); + peer_pa = c2dev->pa + be32_to_cpu(reply->rq_mq_start); + mmap_size = PAGE_ALIGN(sizeof(struct c2_mq_shared) + msg_size * q_size); + mmap = ioremap_nocache(peer_pa, mmap_size); + if (!mmap) { + err = -ENOMEM; + goto bail6; + } + + c2_mq_req_init(&qp->rq_mq, + be32_to_cpu(reply->rq_mq_index), + q_size, + msg_size, + mmap + sizeof(struct c2_mq_shared), /* pool start */ + mmap, /* peer */ + C2_MQ_ADAPTER_TARGET); + + vq_repbuf_free(c2dev, reply); + vq_req_free(c2dev, vq_req); + + return 0; + + bail6: + iounmap(qp->sq_mq.peer); + bail5: + destroy_qp(c2dev, qp); + bail4: + vq_repbuf_free(c2dev, reply); + bail3: + vq_req_free(c2dev, vq_req); + bail2: + c2_free_mqsp(qp->rq_mq.shared); + bail1: + c2_free_mqsp(qp->sq_mq.shared); + bail0: + c2_free_qpn(c2dev, qp->qpn); + return err; +} + +static inline void c2_lock_cqs(struct c2_cq *send_cq, struct c2_cq *recv_cq) +{ + if (send_cq == recv_cq) + spin_lock_irq(&send_cq->lock); + else if (send_cq > recv_cq) { + spin_lock_irq(&send_cq->lock); + spin_lock_nested(&recv_cq->lock, SINGLE_DEPTH_NESTING); + } else { + spin_lock_irq(&recv_cq->lock); + spin_lock_nested(&send_cq->lock, SINGLE_DEPTH_NESTING); + } +} + +static inline void c2_unlock_cqs(struct c2_cq *send_cq, struct c2_cq *recv_cq) +{ + if (send_cq == recv_cq) + spin_unlock_irq(&send_cq->lock); + else if (send_cq > recv_cq) { + spin_unlock(&recv_cq->lock); + spin_unlock_irq(&send_cq->lock); + } else { + spin_unlock(&send_cq->lock); + spin_unlock_irq(&recv_cq->lock); + } +} + +void c2_free_qp(struct c2_dev *c2dev, struct c2_qp *qp) +{ + struct c2_cq *send_cq; + struct 
c2_cq *recv_cq; + + send_cq = to_c2cq(qp->ibqp.send_cq); + recv_cq = to_c2cq(qp->ibqp.recv_cq); + + /* + * Lock CQs here, so that CQ polling code can do QP lookup + * without taking a lock. + */ + c2_lock_cqs(send_cq, recv_cq); + c2_free_qpn(c2dev, qp->qpn); + c2_unlock_cqs(send_cq, recv_cq); + + /* + * Destroy qp in the rnic... + */ + destroy_qp(c2dev, qp); + + /* + * Mark any unreaped CQEs as null and void. + */ + c2_cq_clean(c2dev, qp, send_cq->cqn); + if (send_cq != recv_cq) + c2_cq_clean(c2dev, qp, recv_cq->cqn); + /* + * Unmap the MQs and return the shared pointers + * to the message pool. + */ + iounmap(qp->sq_mq.peer); + iounmap(qp->rq_mq.peer); + c2_free_mqsp(qp->sq_mq.shared); + c2_free_mqsp(qp->rq_mq.shared); + + atomic_dec(&qp->refcount); + wait_event(qp->wait, !atomic_read(&qp->refcount)); +} + +/* + * Function: move_sgl + * + * Description: + * Move an SGL from the user's work request struct into a CCIL Work Request + * message, swapping to WR byte order and ensure the total length doesn't + * overflow. + * + * IN: + * dst - ptr to CCIL Work Request message SGL memory. + * src - ptr to the consumers SGL memory. + * + * OUT: none + * + * Return: + * CCIL status codes. + */ +static int +move_sgl(struct c2_data_addr * dst, struct ib_sge *src, int count, u32 * p_len, + u8 * actual_count) +{ + u32 tot = 0; /* running total */ + u8 acount = 0; /* running total non-0 len sge's */ + + while (count > 0) { + /* + * If the addition of this SGE causes the + * total SGL length to exceed 2^32-1, then + * fail-n-bail. + * + * If the current total plus the next element length + * wraps, then it will go negative and be less than the + * current total... + */ + if ((tot + src->length) < tot) { + return -EINVAL; + } + /* + * Bug: 1456 (as well as 1498 & 1643) + * Skip over any sge's supplied with len=0 + */ + if (src->length) { + tot += src->length; + dst->stag = cpu_to_be32(src->lkey); + dst->to = cpu_to_be64(src->addr); + dst->length = cpu_to_be32(src->length); + dst++; + acount++; + } + src++; + count--; + } + + if (acount == 0) { + /* + * Bug: 1476 (as well as 1498, 1456 and 1643) + * Setup the SGL in the WR to make it easier for the RNIC. + * This way, the FW doesn't have to deal with special cases. + * Setting length=0 should be sufficient. + */ + dst->stag = 0; + dst->to = 0; + dst->length = 0; + } + + *p_len = tot; + *actual_count = acount; + return 0; +} + +/* + * Function: c2_activity (private function) + * + * Description: + * Post an mq index to the host->adapter activity fifo. + * + * IN: + * c2dev - ptr to c2dev structure + * mq_index - mq index to post + * shared - value most recently written to shared + * + * OUT: + * + * Return: + * none + */ +static inline void c2_activity(struct c2_dev *c2dev, u32 mq_index, u16 shared) +{ + /* + * First read the register to see if the FIFO is full, and if so, + * spin until it's not. This isn't perfect -- there is no + * synchronization among the clients of the register, but in + * practice it prevents multiple CPU from hammering the bus + * with PCI RETRY. Note that when this does happen, the card + * cannot get on the bus and the card and system hang in a + * deadlock -- thus the need for this code. [TOT] + */ + while (readl(c2dev->regs + PCI_BAR0_ADAPTER_HINT) & 0x80000000) + udelay(10); + + __raw_writel(C2_HINT_MAKE(mq_index, shared), + c2dev->regs + PCI_BAR0_ADAPTER_HINT); +} + +/* + * Function: qp_wr_post + * + * Description: + * This in-line function allocates a MQ msg, then moves the host-copy of + * the completed WR into msg. 
Then it posts the message. + * + * IN: + * q - ptr to user MQ. + * wr - ptr to host-copy of the WR. + * qp - ptr to user qp + * size - Number of bytes to post. Assumed to be divisible by 4. + * + * OUT: none + * + * Return: + * CCIL status codes. + */ +static int qp_wr_post(struct c2_mq *q, union c2wr * wr, struct c2_qp *qp, u32 size) +{ + union c2wr *msg; + + msg = c2_mq_alloc(q); + if (msg == NULL) { + return -EINVAL; + } +#ifdef CCMSGMAGIC + ((c2wr_hdr_t *) wr)->magic = cpu_to_be32(CCWR_MAGIC); +#endif + + /* + * Since all header fields in the WR are the same as the + * CQE, set the following so the adapter need not. + */ + c2_wr_set_result(wr, CCERR_PENDING); + + /* + * Copy the wr down to the adapter + */ + memcpy((void *) msg, (void *) wr, size); + + c2_mq_produce(q); + return 0; +} + + +int c2_post_send(struct ib_qp *ibqp, struct ib_send_wr *ib_wr, + struct ib_send_wr **bad_wr) +{ + struct c2_dev *c2dev = to_c2dev(ibqp->device); + struct c2_qp *qp = to_c2qp(ibqp); + union c2wr wr; + unsigned long lock_flags; + int err = 0; + + u32 flags; + u32 tot_len; + u8 actual_sge_count; + u32 msg_size; + + if (qp->state > IB_QPS_RTS) { + err = -EINVAL; + goto out; + } + + while (ib_wr) { + + flags = 0; + wr.sqwr.sq_hdr.user_hdr.hdr.context = ib_wr->wr_id; + if (ib_wr->send_flags & IB_SEND_SIGNALED) { + flags |= SQ_SIGNALED; + } + + switch (ib_wr->opcode) { + case IB_WR_SEND: + case IB_WR_SEND_WITH_INV: + if (ib_wr->opcode == IB_WR_SEND) { + if (ib_wr->send_flags & IB_SEND_SOLICITED) + c2_wr_set_id(&wr, C2_WR_TYPE_SEND_SE); + else + c2_wr_set_id(&wr, C2_WR_TYPE_SEND); + wr.sqwr.send.remote_stag = 0; + } else { + if (ib_wr->send_flags & IB_SEND_SOLICITED) + c2_wr_set_id(&wr, C2_WR_TYPE_SEND_SE_INV); + else + c2_wr_set_id(&wr, C2_WR_TYPE_SEND_INV); + wr.sqwr.send.remote_stag = + cpu_to_be32(ib_wr->ex.invalidate_rkey); + } + + msg_size = sizeof(struct c2wr_send_req) + + sizeof(struct c2_data_addr) * ib_wr->num_sge; + if (ib_wr->num_sge > qp->send_sgl_depth) { + err = -EINVAL; + break; + } + if (ib_wr->send_flags & IB_SEND_FENCE) { + flags |= SQ_READ_FENCE; + } + err = move_sgl((struct c2_data_addr *) & (wr.sqwr.send.data), + ib_wr->sg_list, + ib_wr->num_sge, + &tot_len, &actual_sge_count); + wr.sqwr.send.sge_len = cpu_to_be32(tot_len); + c2_wr_set_sge_count(&wr, actual_sge_count); + break; + case IB_WR_RDMA_WRITE: + c2_wr_set_id(&wr, C2_WR_TYPE_RDMA_WRITE); + msg_size = sizeof(struct c2wr_rdma_write_req) + + (sizeof(struct c2_data_addr) * ib_wr->num_sge); + if (ib_wr->num_sge > qp->rdma_write_sgl_depth) { + err = -EINVAL; + break; + } + if (ib_wr->send_flags & IB_SEND_FENCE) { + flags |= SQ_READ_FENCE; + } + wr.sqwr.rdma_write.remote_stag = + cpu_to_be32(ib_wr->wr.rdma.rkey); + wr.sqwr.rdma_write.remote_to = + cpu_to_be64(ib_wr->wr.rdma.remote_addr); + err = move_sgl((struct c2_data_addr *) + & (wr.sqwr.rdma_write.data), + ib_wr->sg_list, + ib_wr->num_sge, + &tot_len, &actual_sge_count); + wr.sqwr.rdma_write.sge_len = cpu_to_be32(tot_len); + c2_wr_set_sge_count(&wr, actual_sge_count); + break; + case IB_WR_RDMA_READ: + c2_wr_set_id(&wr, C2_WR_TYPE_RDMA_READ); + msg_size = sizeof(struct c2wr_rdma_read_req); + + /* IWarp only suppots 1 sge for RDMA reads */ + if (ib_wr->num_sge > 1) { + err = -EINVAL; + break; + } + + /* + * Move the local and remote stag/to/len into the WR. 
+ */ + wr.sqwr.rdma_read.local_stag = + cpu_to_be32(ib_wr->sg_list->lkey); + wr.sqwr.rdma_read.local_to = + cpu_to_be64(ib_wr->sg_list->addr); + wr.sqwr.rdma_read.remote_stag = + cpu_to_be32(ib_wr->wr.rdma.rkey); + wr.sqwr.rdma_read.remote_to = + cpu_to_be64(ib_wr->wr.rdma.remote_addr); + wr.sqwr.rdma_read.length = + cpu_to_be32(ib_wr->sg_list->length); + break; + default: + /* error */ + msg_size = 0; + err = -EINVAL; + break; + } + + /* + * If we had an error on the last wr build, then + * break out. Possible errors include bogus WR + * type, and a bogus SGL length... + */ + if (err) { + break; + } + + /* + * Store flags + */ + c2_wr_set_flags(&wr, flags); + + /* + * Post the puppy! + */ + spin_lock_irqsave(&qp->lock, lock_flags); + err = qp_wr_post(&qp->sq_mq, &wr, qp, msg_size); + if (err) { + spin_unlock_irqrestore(&qp->lock, lock_flags); + break; + } + + /* + * Enqueue mq index to activity FIFO. + */ + c2_activity(c2dev, qp->sq_mq.index, qp->sq_mq.hint_count); + spin_unlock_irqrestore(&qp->lock, lock_flags); + + ib_wr = ib_wr->next; + } + +out: + if (err) + *bad_wr = ib_wr; + return err; +} + +int c2_post_receive(struct ib_qp *ibqp, struct ib_recv_wr *ib_wr, + struct ib_recv_wr **bad_wr) +{ + struct c2_dev *c2dev = to_c2dev(ibqp->device); + struct c2_qp *qp = to_c2qp(ibqp); + union c2wr wr; + unsigned long lock_flags; + int err = 0; + + if (qp->state > IB_QPS_RTS) { + err = -EINVAL; + goto out; + } + + /* + * Try and post each work request + */ + while (ib_wr) { + u32 tot_len; + u8 actual_sge_count; + + if (ib_wr->num_sge > qp->recv_sgl_depth) { + err = -EINVAL; + break; + } + + /* + * Create local host-copy of the WR + */ + wr.rqwr.rq_hdr.user_hdr.hdr.context = ib_wr->wr_id; + c2_wr_set_id(&wr, CCWR_RECV); + c2_wr_set_flags(&wr, 0); + + /* sge_count is limited to eight bits. */ + BUG_ON(ib_wr->num_sge >= 256); + err = move_sgl((struct c2_data_addr *) & (wr.rqwr.data), + ib_wr->sg_list, + ib_wr->num_sge, &tot_len, &actual_sge_count); + c2_wr_set_sge_count(&wr, actual_sge_count); + + /* + * If we had an error on the last wr build, then + * break out. Possible errors include bogus WR + * type, and a bogus SGL length... + */ + if (err) { + break; + } + + spin_lock_irqsave(&qp->lock, lock_flags); + err = qp_wr_post(&qp->rq_mq, &wr, qp, qp->rq_mq.msg_size); + if (err) { + spin_unlock_irqrestore(&qp->lock, lock_flags); + break; + } + + /* + * Enqueue mq index to activity FIFO + */ + c2_activity(c2dev, qp->rq_mq.index, qp->rq_mq.hint_count); + spin_unlock_irqrestore(&qp->lock, lock_flags); + + ib_wr = ib_wr->next; + } + +out: + if (err) + *bad_wr = ib_wr; + return err; +} + +void c2_init_qp_table(struct c2_dev *c2dev) +{ + spin_lock_init(&c2dev->qp_table.lock); + idr_init(&c2dev->qp_table.idr); +} + +void c2_cleanup_qp_table(struct c2_dev *c2dev) +{ + idr_destroy(&c2dev->qp_table.idr); +} diff --git a/kernel/drivers/infiniband/hw/amso1100/c2_rnic.c b/kernel/drivers/infiniband/hw/amso1100/c2_rnic.c new file mode 100644 index 000000000..d2a6d9613 --- /dev/null +++ b/kernel/drivers/infiniband/hw/amso1100/c2_rnic.c @@ -0,0 +1,655 @@ +/* + * Copyright (c) 2005 Ammasso, Inc. All rights reserved. + * Copyright (c) 2005 Open Grid Computing, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. 
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ + + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include +#include +#include +#include +#include "c2.h" +#include "c2_vq.h" + +/* Device capabilities */ +#define C2_MIN_PAGESIZE 1024 + +#define C2_MAX_MRS 32768 +#define C2_MAX_QPS 16000 +#define C2_MAX_WQE_SZ 256 +#define C2_MAX_QP_WR ((128*1024)/C2_MAX_WQE_SZ) +#define C2_MAX_SGES 4 +#define C2_MAX_SGE_RD 1 +#define C2_MAX_CQS 32768 +#define C2_MAX_CQES 4096 +#define C2_MAX_PDS 16384 + +/* + * Send the adapter INIT message to the amso1100 + */ +static int c2_adapter_init(struct c2_dev *c2dev) +{ + struct c2wr_init_req wr; + int err; + + memset(&wr, 0, sizeof(wr)); + c2_wr_set_id(&wr, CCWR_INIT); + wr.hdr.context = 0; + wr.hint_count = cpu_to_be64(c2dev->hint_count_dma); + wr.q0_host_shared = cpu_to_be64(c2dev->req_vq.shared_dma); + wr.q1_host_shared = cpu_to_be64(c2dev->rep_vq.shared_dma); + wr.q1_host_msg_pool = cpu_to_be64(c2dev->rep_vq.host_dma); + wr.q2_host_shared = cpu_to_be64(c2dev->aeq.shared_dma); + wr.q2_host_msg_pool = cpu_to_be64(c2dev->aeq.host_dma); + + /* Post the init message */ + err = vq_send_wr(c2dev, (union c2wr *) & wr); + + return err; +} + +/* + * Send the adapter TERM message to the amso1100 + */ +static void c2_adapter_term(struct c2_dev *c2dev) +{ + struct c2wr_init_req wr; + + memset(&wr, 0, sizeof(wr)); + c2_wr_set_id(&wr, CCWR_TERM); + wr.hdr.context = 0; + + /* Post the init message */ + vq_send_wr(c2dev, (union c2wr *) & wr); + c2dev->init = 0; + + return; +} + +/* + * Query the adapter + */ +static int c2_rnic_query(struct c2_dev *c2dev, struct ib_device_attr *props) +{ + struct c2_vq_req *vq_req; + struct c2wr_rnic_query_req wr; + struct c2wr_rnic_query_rep *reply; + int err; + + vq_req = vq_req_alloc(c2dev); + if (!vq_req) + return -ENOMEM; + + c2_wr_set_id(&wr, CCWR_RNIC_QUERY); + wr.hdr.context = (unsigned long) vq_req; + wr.rnic_handle = c2dev->adapter_handle; + + vq_req_get(c2dev, vq_req); + + err = vq_send_wr(c2dev, (union c2wr *) &wr); + if (err) { + vq_req_put(c2dev, vq_req); + goto bail1; + } + + err = vq_wait_for_reply(c2dev, vq_req); + if (err) + goto bail1; + + reply = + (struct c2wr_rnic_query_rep *) 
(unsigned long) (vq_req->reply_msg); + if (!reply) + err = -ENOMEM; + else + err = c2_errno(reply); + if (err) + goto bail2; + + props->fw_ver = + ((u64)be32_to_cpu(reply->fw_ver_major) << 32) | + ((be32_to_cpu(reply->fw_ver_minor) & 0xFFFF) << 16) | + (be32_to_cpu(reply->fw_ver_patch) & 0xFFFF); + memcpy(&props->sys_image_guid, c2dev->netdev->dev_addr, 6); + props->max_mr_size = 0xFFFFFFFF; + props->page_size_cap = ~(C2_MIN_PAGESIZE-1); + props->vendor_id = be32_to_cpu(reply->vendor_id); + props->vendor_part_id = be32_to_cpu(reply->part_number); + props->hw_ver = be32_to_cpu(reply->hw_version); + props->max_qp = be32_to_cpu(reply->max_qps); + props->max_qp_wr = be32_to_cpu(reply->max_qp_depth); + props->device_cap_flags = c2dev->device_cap_flags; + props->max_sge = C2_MAX_SGES; + props->max_sge_rd = C2_MAX_SGE_RD; + props->max_cq = be32_to_cpu(reply->max_cqs); + props->max_cqe = be32_to_cpu(reply->max_cq_depth); + props->max_mr = be32_to_cpu(reply->max_mrs); + props->max_pd = be32_to_cpu(reply->max_pds); + props->max_qp_rd_atom = be32_to_cpu(reply->max_qp_ird); + props->max_ee_rd_atom = 0; + props->max_res_rd_atom = be32_to_cpu(reply->max_global_ird); + props->max_qp_init_rd_atom = be32_to_cpu(reply->max_qp_ord); + props->max_ee_init_rd_atom = 0; + props->atomic_cap = IB_ATOMIC_NONE; + props->max_ee = 0; + props->max_rdd = 0; + props->max_mw = be32_to_cpu(reply->max_mws); + props->max_raw_ipv6_qp = 0; + props->max_raw_ethy_qp = 0; + props->max_mcast_grp = 0; + props->max_mcast_qp_attach = 0; + props->max_total_mcast_qp_attach = 0; + props->max_ah = 0; + props->max_fmr = 0; + props->max_map_per_fmr = 0; + props->max_srq = 0; + props->max_srq_wr = 0; + props->max_srq_sge = 0; + props->max_pkeys = 0; + props->local_ca_ack_delay = 0; + + bail2: + vq_repbuf_free(c2dev, reply); + + bail1: + vq_req_free(c2dev, vq_req); + return err; +} + +/* + * Add an IP address to the RNIC interface + */ +int c2_add_addr(struct c2_dev *c2dev, __be32 inaddr, __be32 inmask) +{ + struct c2_vq_req *vq_req; + struct c2wr_rnic_setconfig_req *wr; + struct c2wr_rnic_setconfig_rep *reply; + struct c2_netaddr netaddr; + int err, len; + + vq_req = vq_req_alloc(c2dev); + if (!vq_req) + return -ENOMEM; + + len = sizeof(struct c2_netaddr); + wr = kmalloc(c2dev->req_vq.msg_size, GFP_KERNEL); + if (!wr) { + err = -ENOMEM; + goto bail0; + } + + c2_wr_set_id(wr, CCWR_RNIC_SETCONFIG); + wr->hdr.context = (unsigned long) vq_req; + wr->rnic_handle = c2dev->adapter_handle; + wr->option = cpu_to_be32(C2_CFG_ADD_ADDR); + + netaddr.ip_addr = inaddr; + netaddr.netmask = inmask; + netaddr.mtu = 0; + + memcpy(wr->data, &netaddr, len); + + vq_req_get(c2dev, vq_req); + + err = vq_send_wr(c2dev, (union c2wr *) wr); + if (err) { + vq_req_put(c2dev, vq_req); + goto bail1; + } + + err = vq_wait_for_reply(c2dev, vq_req); + if (err) + goto bail1; + + reply = + (struct c2wr_rnic_setconfig_rep *) (unsigned long) (vq_req->reply_msg); + if (!reply) { + err = -ENOMEM; + goto bail1; + } + + err = c2_errno(reply); + vq_repbuf_free(c2dev, reply); + + bail1: + kfree(wr); + bail0: + vq_req_free(c2dev, vq_req); + return err; +} + +/* + * Delete an IP address from the RNIC interface + */ +int c2_del_addr(struct c2_dev *c2dev, __be32 inaddr, __be32 inmask) +{ + struct c2_vq_req *vq_req; + struct c2wr_rnic_setconfig_req *wr; + struct c2wr_rnic_setconfig_rep *reply; + struct c2_netaddr netaddr; + int err, len; + + vq_req = vq_req_alloc(c2dev); + if (!vq_req) + return -ENOMEM; + + len = sizeof(struct c2_netaddr); + wr = kmalloc(c2dev->req_vq.msg_size, 
GFP_KERNEL); + if (!wr) { + err = -ENOMEM; + goto bail0; + } + + c2_wr_set_id(wr, CCWR_RNIC_SETCONFIG); + wr->hdr.context = (unsigned long) vq_req; + wr->rnic_handle = c2dev->adapter_handle; + wr->option = cpu_to_be32(C2_CFG_DEL_ADDR); + + netaddr.ip_addr = inaddr; + netaddr.netmask = inmask; + netaddr.mtu = 0; + + memcpy(wr->data, &netaddr, len); + + vq_req_get(c2dev, vq_req); + + err = vq_send_wr(c2dev, (union c2wr *) wr); + if (err) { + vq_req_put(c2dev, vq_req); + goto bail1; + } + + err = vq_wait_for_reply(c2dev, vq_req); + if (err) + goto bail1; + + reply = + (struct c2wr_rnic_setconfig_rep *) (unsigned long) (vq_req->reply_msg); + if (!reply) { + err = -ENOMEM; + goto bail1; + } + + err = c2_errno(reply); + vq_repbuf_free(c2dev, reply); + + bail1: + kfree(wr); + bail0: + vq_req_free(c2dev, vq_req); + return err; +} + +/* + * Open a single RNIC instance to use with all + * low level openib calls + */ +static int c2_rnic_open(struct c2_dev *c2dev) +{ + struct c2_vq_req *vq_req; + union c2wr wr; + struct c2wr_rnic_open_rep *reply; + int err; + + vq_req = vq_req_alloc(c2dev); + if (vq_req == NULL) { + return -ENOMEM; + } + + memset(&wr, 0, sizeof(wr)); + c2_wr_set_id(&wr, CCWR_RNIC_OPEN); + wr.rnic_open.req.hdr.context = (unsigned long) (vq_req); + wr.rnic_open.req.flags = cpu_to_be16(RNIC_PRIV_MODE); + wr.rnic_open.req.port_num = cpu_to_be16(0); + wr.rnic_open.req.user_context = (unsigned long) c2dev; + + vq_req_get(c2dev, vq_req); + + err = vq_send_wr(c2dev, &wr); + if (err) { + vq_req_put(c2dev, vq_req); + goto bail0; + } + + err = vq_wait_for_reply(c2dev, vq_req); + if (err) { + goto bail0; + } + + reply = (struct c2wr_rnic_open_rep *) (unsigned long) (vq_req->reply_msg); + if (!reply) { + err = -ENOMEM; + goto bail0; + } + + if ((err = c2_errno(reply)) != 0) { + goto bail1; + } + + c2dev->adapter_handle = reply->rnic_handle; + + bail1: + vq_repbuf_free(c2dev, reply); + bail0: + vq_req_free(c2dev, vq_req); + return err; +} + +/* + * Close the RNIC instance + */ +static int c2_rnic_close(struct c2_dev *c2dev) +{ + struct c2_vq_req *vq_req; + union c2wr wr; + struct c2wr_rnic_close_rep *reply; + int err; + + vq_req = vq_req_alloc(c2dev); + if (vq_req == NULL) { + return -ENOMEM; + } + + memset(&wr, 0, sizeof(wr)); + c2_wr_set_id(&wr, CCWR_RNIC_CLOSE); + wr.rnic_close.req.hdr.context = (unsigned long) vq_req; + wr.rnic_close.req.rnic_handle = c2dev->adapter_handle; + + vq_req_get(c2dev, vq_req); + + err = vq_send_wr(c2dev, &wr); + if (err) { + vq_req_put(c2dev, vq_req); + goto bail0; + } + + err = vq_wait_for_reply(c2dev, vq_req); + if (err) { + goto bail0; + } + + reply = (struct c2wr_rnic_close_rep *) (unsigned long) (vq_req->reply_msg); + if (!reply) { + err = -ENOMEM; + goto bail0; + } + + if ((err = c2_errno(reply)) != 0) { + goto bail1; + } + + c2dev->adapter_handle = 0; + + bail1: + vq_repbuf_free(c2dev, reply); + bail0: + vq_req_free(c2dev, vq_req); + return err; +} + +/* + * Called by c2_probe to initialize the RNIC. This principally + * involves initializing the various limits and resource pools that + * comprise the RNIC instance. 
+ */ +int c2_rnic_init(struct c2_dev *c2dev) +{ + int err; + u32 qsize, msgsize; + void *q1_pages; + void *q2_pages; + void __iomem *mmio_regs; + + /* Device capabilities */ + c2dev->device_cap_flags = + (IB_DEVICE_RESIZE_MAX_WR | + IB_DEVICE_CURR_QP_STATE_MOD | + IB_DEVICE_SYS_IMAGE_GUID | + IB_DEVICE_LOCAL_DMA_LKEY | + IB_DEVICE_MEM_WINDOW); + + /* Allocate the qptr_array */ + c2dev->qptr_array = vzalloc(C2_MAX_CQS * sizeof(void *)); + if (!c2dev->qptr_array) { + return -ENOMEM; + } + + /* Initialize the qptr_array */ + c2dev->qptr_array[0] = (void *) &c2dev->req_vq; + c2dev->qptr_array[1] = (void *) &c2dev->rep_vq; + c2dev->qptr_array[2] = (void *) &c2dev->aeq; + + /* Initialize data structures */ + init_waitqueue_head(&c2dev->req_vq_wo); + spin_lock_init(&c2dev->vqlock); + spin_lock_init(&c2dev->lock); + + /* Allocate MQ shared pointer pool for kernel clients. User + * mode client pools are hung off the user context + */ + err = c2_init_mqsp_pool(c2dev, GFP_KERNEL, &c2dev->kern_mqsp_pool); + if (err) { + goto bail0; + } + + /* Allocate shared pointers for Q0, Q1, and Q2 from + * the shared pointer pool. + */ + + c2dev->hint_count = c2_alloc_mqsp(c2dev, c2dev->kern_mqsp_pool, + &c2dev->hint_count_dma, + GFP_KERNEL); + c2dev->req_vq.shared = c2_alloc_mqsp(c2dev, c2dev->kern_mqsp_pool, + &c2dev->req_vq.shared_dma, + GFP_KERNEL); + c2dev->rep_vq.shared = c2_alloc_mqsp(c2dev, c2dev->kern_mqsp_pool, + &c2dev->rep_vq.shared_dma, + GFP_KERNEL); + c2dev->aeq.shared = c2_alloc_mqsp(c2dev, c2dev->kern_mqsp_pool, + &c2dev->aeq.shared_dma, GFP_KERNEL); + if (!c2dev->hint_count || !c2dev->req_vq.shared || + !c2dev->rep_vq.shared || !c2dev->aeq.shared) { + err = -ENOMEM; + goto bail1; + } + + mmio_regs = c2dev->kva; + /* Initialize the Verbs Request Queue */ + c2_mq_req_init(&c2dev->req_vq, 0, + be32_to_cpu((__force __be32) readl(mmio_regs + C2_REGS_Q0_QSIZE)), + be32_to_cpu((__force __be32) readl(mmio_regs + C2_REGS_Q0_MSGSIZE)), + mmio_regs + + be32_to_cpu((__force __be32) readl(mmio_regs + C2_REGS_Q0_POOLSTART)), + mmio_regs + + be32_to_cpu((__force __be32) readl(mmio_regs + C2_REGS_Q0_SHARED)), + C2_MQ_ADAPTER_TARGET); + + /* Initialize the Verbs Reply Queue */ + qsize = be32_to_cpu((__force __be32) readl(mmio_regs + C2_REGS_Q1_QSIZE)); + msgsize = be32_to_cpu((__force __be32) readl(mmio_regs + C2_REGS_Q1_MSGSIZE)); + q1_pages = dma_alloc_coherent(&c2dev->pcidev->dev, qsize * msgsize, + &c2dev->rep_vq.host_dma, GFP_KERNEL); + if (!q1_pages) { + err = -ENOMEM; + goto bail1; + } + dma_unmap_addr_set(&c2dev->rep_vq, mapping, c2dev->rep_vq.host_dma); + pr_debug("%s rep_vq va %p dma %llx\n", __func__, q1_pages, + (unsigned long long) c2dev->rep_vq.host_dma); + c2_mq_rep_init(&c2dev->rep_vq, + 1, + qsize, + msgsize, + q1_pages, + mmio_regs + + be32_to_cpu((__force __be32) readl(mmio_regs + C2_REGS_Q1_SHARED)), + C2_MQ_HOST_TARGET); + + /* Initialize the Asynchronus Event Queue */ + qsize = be32_to_cpu((__force __be32) readl(mmio_regs + C2_REGS_Q2_QSIZE)); + msgsize = be32_to_cpu((__force __be32) readl(mmio_regs + C2_REGS_Q2_MSGSIZE)); + q2_pages = dma_alloc_coherent(&c2dev->pcidev->dev, qsize * msgsize, + &c2dev->aeq.host_dma, GFP_KERNEL); + if (!q2_pages) { + err = -ENOMEM; + goto bail2; + } + dma_unmap_addr_set(&c2dev->aeq, mapping, c2dev->aeq.host_dma); + pr_debug("%s aeq va %p dma %llx\n", __func__, q2_pages, + (unsigned long long) c2dev->aeq.host_dma); + c2_mq_rep_init(&c2dev->aeq, + 2, + qsize, + msgsize, + q2_pages, + mmio_regs + + be32_to_cpu((__force __be32) readl(mmio_regs + 
C2_REGS_Q2_SHARED)), + C2_MQ_HOST_TARGET); + + /* Initialize the verbs request allocator */ + err = vq_init(c2dev); + if (err) + goto bail3; + + /* Enable interrupts on the adapter */ + writel(0, c2dev->regs + C2_IDIS); + + /* create the WR init message */ + err = c2_adapter_init(c2dev); + if (err) + goto bail4; + c2dev->init++; + + /* open an adapter instance */ + err = c2_rnic_open(c2dev); + if (err) + goto bail4; + + /* Initialize cached the adapter limits */ + err = c2_rnic_query(c2dev, &c2dev->props); + if (err) + goto bail5; + + /* Initialize the PD pool */ + err = c2_init_pd_table(c2dev); + if (err) + goto bail5; + + /* Initialize the QP pool */ + c2_init_qp_table(c2dev); + return 0; + + bail5: + c2_rnic_close(c2dev); + bail4: + vq_term(c2dev); + bail3: + dma_free_coherent(&c2dev->pcidev->dev, + c2dev->aeq.q_size * c2dev->aeq.msg_size, + q2_pages, dma_unmap_addr(&c2dev->aeq, mapping)); + bail2: + dma_free_coherent(&c2dev->pcidev->dev, + c2dev->rep_vq.q_size * c2dev->rep_vq.msg_size, + q1_pages, dma_unmap_addr(&c2dev->rep_vq, mapping)); + bail1: + c2_free_mqsp_pool(c2dev, c2dev->kern_mqsp_pool); + bail0: + vfree(c2dev->qptr_array); + + return err; +} + +/* + * Called by c2_remove to cleanup the RNIC resources. + */ +void c2_rnic_term(struct c2_dev *c2dev) +{ + + /* Close the open adapter instance */ + c2_rnic_close(c2dev); + + /* Send the TERM message to the adapter */ + c2_adapter_term(c2dev); + + /* Disable interrupts on the adapter */ + writel(1, c2dev->regs + C2_IDIS); + + /* Free the QP pool */ + c2_cleanup_qp_table(c2dev); + + /* Free the PD pool */ + c2_cleanup_pd_table(c2dev); + + /* Free the verbs request allocator */ + vq_term(c2dev); + + /* Free the asynchronus event queue */ + dma_free_coherent(&c2dev->pcidev->dev, + c2dev->aeq.q_size * c2dev->aeq.msg_size, + c2dev->aeq.msg_pool.host, + dma_unmap_addr(&c2dev->aeq, mapping)); + + /* Free the verbs reply queue */ + dma_free_coherent(&c2dev->pcidev->dev, + c2dev->rep_vq.q_size * c2dev->rep_vq.msg_size, + c2dev->rep_vq.msg_pool.host, + dma_unmap_addr(&c2dev->rep_vq, mapping)); + + /* Free the MQ shared pointer pool */ + c2_free_mqsp_pool(c2dev, c2dev->kern_mqsp_pool); + + /* Free the qptr_array */ + vfree(c2dev->qptr_array); + + return; +} diff --git a/kernel/drivers/infiniband/hw/amso1100/c2_status.h b/kernel/drivers/infiniband/hw/amso1100/c2_status.h new file mode 100644 index 000000000..6ee4aa92d --- /dev/null +++ b/kernel/drivers/infiniband/hw/amso1100/c2_status.h @@ -0,0 +1,158 @@ +/* + * Copyright (c) 2005 Ammasso, Inc. All rights reserved. + * Copyright (c) 2005 Open Grid Computing, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef _C2_STATUS_H_ +#define _C2_STATUS_H_ + +/* + * Verbs Status Codes + */ +enum c2_status { + C2_OK = 0, /* This must be zero */ + CCERR_INSUFFICIENT_RESOURCES = 1, + CCERR_INVALID_MODIFIER = 2, + CCERR_INVALID_MODE = 3, + CCERR_IN_USE = 4, + CCERR_INVALID_RNIC = 5, + CCERR_INTERRUPTED_OPERATION = 6, + CCERR_INVALID_EH = 7, + CCERR_INVALID_CQ = 8, + CCERR_CQ_EMPTY = 9, + CCERR_NOT_IMPLEMENTED = 10, + CCERR_CQ_DEPTH_TOO_SMALL = 11, + CCERR_PD_IN_USE = 12, + CCERR_INVALID_PD = 13, + CCERR_INVALID_SRQ = 14, + CCERR_INVALID_ADDRESS = 15, + CCERR_INVALID_NETMASK = 16, + CCERR_INVALID_QP = 17, + CCERR_INVALID_QP_STATE = 18, + CCERR_TOO_MANY_WRS_POSTED = 19, + CCERR_INVALID_WR_TYPE = 20, + CCERR_INVALID_SGL_LENGTH = 21, + CCERR_INVALID_SQ_DEPTH = 22, + CCERR_INVALID_RQ_DEPTH = 23, + CCERR_INVALID_ORD = 24, + CCERR_INVALID_IRD = 25, + CCERR_QP_ATTR_CANNOT_CHANGE = 26, + CCERR_INVALID_STAG = 27, + CCERR_QP_IN_USE = 28, + CCERR_OUTSTANDING_WRS = 29, + CCERR_STAG_IN_USE = 30, + CCERR_INVALID_STAG_INDEX = 31, + CCERR_INVALID_SGL_FORMAT = 32, + CCERR_ADAPTER_TIMEOUT = 33, + CCERR_INVALID_CQ_DEPTH = 34, + CCERR_INVALID_PRIVATE_DATA_LENGTH = 35, + CCERR_INVALID_EP = 36, + CCERR_MR_IN_USE = CCERR_STAG_IN_USE, + CCERR_FLUSHED = 38, + CCERR_INVALID_WQE = 39, + CCERR_LOCAL_QP_CATASTROPHIC_ERROR = 40, + CCERR_REMOTE_TERMINATION_ERROR = 41, + CCERR_BASE_AND_BOUNDS_VIOLATION = 42, + CCERR_ACCESS_VIOLATION = 43, + CCERR_INVALID_PD_ID = 44, + CCERR_WRAP_ERROR = 45, + CCERR_INV_STAG_ACCESS_ERROR = 46, + CCERR_ZERO_RDMA_READ_RESOURCES = 47, + CCERR_QP_NOT_PRIVILEGED = 48, + CCERR_STAG_STATE_NOT_INVALID = 49, + CCERR_INVALID_PAGE_SIZE = 50, + CCERR_INVALID_BUFFER_SIZE = 51, + CCERR_INVALID_PBE = 52, + CCERR_INVALID_FBO = 53, + CCERR_INVALID_LENGTH = 54, + CCERR_INVALID_ACCESS_RIGHTS = 55, + CCERR_PBL_TOO_BIG = 56, + CCERR_INVALID_VA = 57, + CCERR_INVALID_REGION = 58, + CCERR_INVALID_WINDOW = 59, + CCERR_TOTAL_LENGTH_TOO_BIG = 60, + CCERR_INVALID_QP_ID = 61, + CCERR_ADDR_IN_USE = 62, + CCERR_ADDR_NOT_AVAIL = 63, + CCERR_NET_DOWN = 64, + CCERR_NET_UNREACHABLE = 65, + CCERR_CONN_ABORTED = 66, + CCERR_CONN_RESET = 67, + CCERR_NO_BUFS = 68, + CCERR_CONN_TIMEDOUT = 69, + CCERR_CONN_REFUSED = 70, + CCERR_HOST_UNREACHABLE = 71, + CCERR_INVALID_SEND_SGL_DEPTH = 72, + CCERR_INVALID_RECV_SGL_DEPTH = 73, + CCERR_INVALID_RDMA_WRITE_SGL_DEPTH = 74, + CCERR_INSUFFICIENT_PRIVILEGES = 75, + CCERR_STACK_ERROR = 76, + CCERR_INVALID_VERSION = 77, + CCERR_INVALID_MTU = 78, + CCERR_INVALID_IMAGE = 79, + CCERR_PENDING = 98, /* not an error; user internally by adapter */ + CCERR_DEFER = 99, /* not an error; used internally by adapter */ + CCERR_FAILED_WRITE = 100, + CCERR_FAILED_ERASE = 101, + CCERR_FAILED_VERIFICATION = 102, + CCERR_NOT_FOUND = 103, + +}; + +/* + * CCAE_ACTIVE_CONNECT_RESULTS status result codes. 
+ */ +enum c2_connect_status { + C2_CONN_STATUS_SUCCESS = C2_OK, + C2_CONN_STATUS_NO_MEM = CCERR_INSUFFICIENT_RESOURCES, + C2_CONN_STATUS_TIMEDOUT = CCERR_CONN_TIMEDOUT, + C2_CONN_STATUS_REFUSED = CCERR_CONN_REFUSED, + C2_CONN_STATUS_NETUNREACH = CCERR_NET_UNREACHABLE, + C2_CONN_STATUS_HOSTUNREACH = CCERR_HOST_UNREACHABLE, + C2_CONN_STATUS_INVALID_RNIC = CCERR_INVALID_RNIC, + C2_CONN_STATUS_INVALID_QP = CCERR_INVALID_QP, + C2_CONN_STATUS_INVALID_QP_STATE = CCERR_INVALID_QP_STATE, + C2_CONN_STATUS_REJECTED = CCERR_CONN_RESET, + C2_CONN_STATUS_ADDR_NOT_AVAIL = CCERR_ADDR_NOT_AVAIL, +}; + +/* + * Flash programming status codes. + */ +enum c2_flash_status { + C2_FLASH_STATUS_SUCCESS = 0x0000, + C2_FLASH_STATUS_VERIFY_ERR = 0x0002, + C2_FLASH_STATUS_IMAGE_ERR = 0x0004, + C2_FLASH_STATUS_ECLBS = 0x0400, + C2_FLASH_STATUS_PSLBS = 0x0800, + C2_FLASH_STATUS_VPENS = 0x1000, +}; + +#endif /* _C2_STATUS_H_ */ diff --git a/kernel/drivers/infiniband/hw/amso1100/c2_user.h b/kernel/drivers/infiniband/hw/amso1100/c2_user.h new file mode 100644 index 000000000..7e9e7ad65 --- /dev/null +++ b/kernel/drivers/infiniband/hw/amso1100/c2_user.h @@ -0,0 +1,82 @@ +/* + * Copyright (c) 2005 Topspin Communications. All rights reserved. + * Copyright (c) 2005 Cisco Systems. All rights reserved. + * Copyright (c) 2005 Open Grid Computing, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ + +#ifndef C2_USER_H +#define C2_USER_H + +#include + +/* + * Make sure that all structs defined in this file remain laid out so + * that they pack the same way on 32-bit and 64-bit architectures (to + * avoid incompatibility between 32-bit userspace and 64-bit kernels). + * In particular do not use pointer types -- pass pointers in __u64 + * instead. 
+ */ + +struct c2_alloc_ucontext_resp { + __u32 qp_tab_size; + __u32 uarc_size; +}; + +struct c2_alloc_pd_resp { + __u32 pdn; + __u32 reserved; +}; + +struct c2_create_cq { + __u32 lkey; + __u32 pdn; + __u64 arm_db_page; + __u64 set_db_page; + __u32 arm_db_index; + __u32 set_db_index; +}; + +struct c2_create_cq_resp { + __u32 cqn; + __u32 reserved; +}; + +struct c2_create_qp { + __u32 lkey; + __u32 reserved; + __u64 sq_db_page; + __u64 rq_db_page; + __u32 sq_db_index; + __u32 rq_db_index; +}; + +#endif /* C2_USER_H */ diff --git a/kernel/drivers/infiniband/hw/amso1100/c2_vq.c b/kernel/drivers/infiniband/hw/amso1100/c2_vq.c new file mode 100644 index 000000000..2ec716fb2 --- /dev/null +++ b/kernel/drivers/infiniband/hw/amso1100/c2_vq.c @@ -0,0 +1,260 @@ +/* + * Copyright (c) 2005 Ammasso, Inc. All rights reserved. + * Copyright (c) 2005 Open Grid Computing, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include +#include + +#include "c2_vq.h" +#include "c2_provider.h" + +/* + * Verbs Request Objects: + * + * VQ Request Objects are allocated by the kernel verbs handlers. + * They contain a wait object, a refcnt, an atomic bool indicating that the + * adapter has replied, and a copy of the verb reply work request. + * A pointer to the VQ Request Object is passed down in the context + * field of the work request message, and reflected back by the adapter + * in the verbs reply message. The function handle_vq() in the interrupt + * path will use this pointer to: + * 1) append a copy of the verbs reply message + * 2) mark that the reply is ready + * 3) wake up the kernel verbs handler blocked awaiting the reply. + * + * + * The kernel verbs handlers do a "get" to put a 2nd reference on the + * VQ Request object. If the kernel verbs handler exits before the adapter + * can respond, this extra reference will keep the VQ Request object around + * until the adapter's reply can be processed. The reason we need this is + * because a pointer to this object is stuffed into the context field of + * the verbs work request message, and reflected back in the reply message. + * It is used in the interrupt handler (handle_vq()) to wake up the appropriate + * kernel verb handler that is blocked awaiting the verb reply. 
+ * So handle_vq() will do a "put" on the object when it's done accessing it. + * NOTE: If we guarantee that the kernel verb handler will never bail before + * getting the reply, then we don't need these refcnts. + * + * + * VQ Request objects are freed by the kernel verbs handlers only + * after the verb has been processed, or when the adapter fails and + * does not reply. + * + * + * Verbs Reply Buffers: + * + * VQ Reply bufs are local host memory copies of a + * outstanding Verb Request reply + * message. The are always allocated by the kernel verbs handlers, and _may_ be + * freed by either the kernel verbs handler -or- the interrupt handler. The + * kernel verbs handler _must_ free the repbuf, then free the vq request object + * in that order. + */ + +int vq_init(struct c2_dev *c2dev) +{ + sprintf(c2dev->vq_cache_name, "c2-vq:dev%c", + (char) ('0' + c2dev->devnum)); + c2dev->host_msg_cache = + kmem_cache_create(c2dev->vq_cache_name, c2dev->rep_vq.msg_size, 0, + SLAB_HWCACHE_ALIGN, NULL); + if (c2dev->host_msg_cache == NULL) { + return -ENOMEM; + } + return 0; +} + +void vq_term(struct c2_dev *c2dev) +{ + kmem_cache_destroy(c2dev->host_msg_cache); +} + +/* vq_req_alloc - allocate a VQ Request Object and initialize it. + * The refcnt is set to 1. + */ +struct c2_vq_req *vq_req_alloc(struct c2_dev *c2dev) +{ + struct c2_vq_req *r; + + r = kmalloc(sizeof(struct c2_vq_req), GFP_KERNEL); + if (r) { + init_waitqueue_head(&r->wait_object); + r->reply_msg = 0; + r->event = 0; + r->cm_id = NULL; + r->qp = NULL; + atomic_set(&r->refcnt, 1); + atomic_set(&r->reply_ready, 0); + } + return r; +} + + +/* vq_req_free - free the VQ Request Object. It is assumed the verbs handler + * has already free the VQ Reply Buffer if it existed. + */ +void vq_req_free(struct c2_dev *c2dev, struct c2_vq_req *r) +{ + r->reply_msg = 0; + if (atomic_dec_and_test(&r->refcnt)) { + kfree(r); + } +} + +/* vq_req_get - reference a VQ Request Object. Done + * only in the kernel verbs handlers. + */ +void vq_req_get(struct c2_dev *c2dev, struct c2_vq_req *r) +{ + atomic_inc(&r->refcnt); +} + + +/* vq_req_put - dereference and potentially free a VQ Request Object. + * + * This is only called by handle_vq() on the + * interrupt when it is done processing + * a verb reply message. If the associated + * kernel verbs handler has already bailed, + * then this put will actually free the VQ + * Request object _and_ the VQ Reply Buffer + * if it exists. + */ +void vq_req_put(struct c2_dev *c2dev, struct c2_vq_req *r) +{ + if (atomic_dec_and_test(&r->refcnt)) { + if (r->reply_msg != 0) + vq_repbuf_free(c2dev, + (void *) (unsigned long) r->reply_msg); + kfree(r); + } +} + + +/* + * vq_repbuf_alloc - allocate a VQ Reply Buffer. + */ +void *vq_repbuf_alloc(struct c2_dev *c2dev) +{ + return kmem_cache_alloc(c2dev->host_msg_cache, GFP_ATOMIC); +} + +/* + * vq_send_wr - post a verbs request message to the Verbs Request Queue. + * If a message is not available in the MQ, then block until one is available. + * NOTE: handle_mq() on the interrupt context will wake up threads blocked here. + * When the adapter drains the Verbs Request Queue, + * it inserts MQ index 0 in to the + * adapter->host activity fifo and interrupts the host. 
+ */ +int vq_send_wr(struct c2_dev *c2dev, union c2wr *wr) +{ + void *msg; + wait_queue_t __wait; + + /* + * grab adapter vq lock + */ + spin_lock(&c2dev->vqlock); + + /* + * allocate msg + */ + msg = c2_mq_alloc(&c2dev->req_vq); + + /* + * If we cannot get a msg, then we'll wait + * When a messages are available, the int handler will wake_up() + * any waiters. + */ + while (msg == NULL) { + pr_debug("%s:%d no available msg in VQ, waiting...\n", + __func__, __LINE__); + init_waitqueue_entry(&__wait, current); + add_wait_queue(&c2dev->req_vq_wo, &__wait); + spin_unlock(&c2dev->vqlock); + for (;;) { + set_current_state(TASK_INTERRUPTIBLE); + if (!c2_mq_full(&c2dev->req_vq)) { + break; + } + if (!signal_pending(current)) { + schedule_timeout(1 * HZ); /* 1 second... */ + continue; + } + set_current_state(TASK_RUNNING); + remove_wait_queue(&c2dev->req_vq_wo, &__wait); + return -EINTR; + } + set_current_state(TASK_RUNNING); + remove_wait_queue(&c2dev->req_vq_wo, &__wait); + spin_lock(&c2dev->vqlock); + msg = c2_mq_alloc(&c2dev->req_vq); + } + + /* + * copy wr into adapter msg + */ + memcpy(msg, wr, c2dev->req_vq.msg_size); + + /* + * post msg + */ + c2_mq_produce(&c2dev->req_vq); + + /* + * release adapter vq lock + */ + spin_unlock(&c2dev->vqlock); + return 0; +} + + +/* + * vq_wait_for_reply - block until the adapter posts a Verb Reply Message. + */ +int vq_wait_for_reply(struct c2_dev *c2dev, struct c2_vq_req *req) +{ + if (!wait_event_timeout(req->wait_object, + atomic_read(&req->reply_ready), + 60*HZ)) + return -ETIMEDOUT; + + return 0; +} + +/* + * vq_repbuf_free - Free a Verbs Reply Buffer. + */ +void vq_repbuf_free(struct c2_dev *c2dev, void *reply) +{ + kmem_cache_free(c2dev->host_msg_cache, reply); +} diff --git a/kernel/drivers/infiniband/hw/amso1100/c2_vq.h b/kernel/drivers/infiniband/hw/amso1100/c2_vq.h new file mode 100644 index 000000000..33805627a --- /dev/null +++ b/kernel/drivers/infiniband/hw/amso1100/c2_vq.h @@ -0,0 +1,63 @@ +/* + * Copyright (c) 2005 Ammasso, Inc. All rights reserved. + * Copyright (c) 2005 Open Grid Computing, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#ifndef _C2_VQ_H_ +#define _C2_VQ_H_ +#include +#include "c2.h" +#include "c2_wr.h" +#include "c2_provider.h" + +struct c2_vq_req { + u64 reply_msg; /* ptr to reply msg */ + wait_queue_head_t wait_object; /* wait object for vq reqs */ + atomic_t reply_ready; /* set when reply is ready */ + atomic_t refcnt; /* used to cancel WRs... */ + int event; + struct iw_cm_id *cm_id; + struct c2_qp *qp; +}; + +extern int vq_init(struct c2_dev *c2dev); +extern void vq_term(struct c2_dev *c2dev); + +extern struct c2_vq_req *vq_req_alloc(struct c2_dev *c2dev); +extern void vq_req_free(struct c2_dev *c2dev, struct c2_vq_req *req); +extern void vq_req_get(struct c2_dev *c2dev, struct c2_vq_req *req); +extern void vq_req_put(struct c2_dev *c2dev, struct c2_vq_req *req); +extern int vq_send_wr(struct c2_dev *c2dev, union c2wr * wr); + +extern void *vq_repbuf_alloc(struct c2_dev *c2dev); +extern void vq_repbuf_free(struct c2_dev *c2dev, void *reply); + +extern int vq_wait_for_reply(struct c2_dev *c2dev, struct c2_vq_req *req); +#endif /* _C2_VQ_H_ */ diff --git a/kernel/drivers/infiniband/hw/amso1100/c2_wr.h b/kernel/drivers/infiniband/hw/amso1100/c2_wr.h new file mode 100644 index 000000000..8d4b4ca46 --- /dev/null +++ b/kernel/drivers/infiniband/hw/amso1100/c2_wr.h @@ -0,0 +1,1520 @@ +/* + * Copyright (c) 2005 Ammasso, Inc. All rights reserved. + * Copyright (c) 2005 Open Grid Computing, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef _C2_WR_H_ +#define _C2_WR_H_ + +#ifdef CCDEBUG +#define CCWR_MAGIC 0xb07700b0 +#endif + +#define C2_QP_NO_ATTR_CHANGE 0xFFFFFFFF + +/* Maximum allowed size in bytes of private_data exchange + * on connect. + */ +#define C2_MAX_PRIVATE_DATA_SIZE 200 + +/* + * These types are shared among the adapter, host, and CCIL consumer. 
+ */ +enum c2_cq_notification_type { + C2_CQ_NOTIFICATION_TYPE_NONE = 1, + C2_CQ_NOTIFICATION_TYPE_NEXT, + C2_CQ_NOTIFICATION_TYPE_NEXT_SE +}; + +enum c2_setconfig_cmd { + C2_CFG_ADD_ADDR = 1, + C2_CFG_DEL_ADDR = 2, + C2_CFG_ADD_ROUTE = 3, + C2_CFG_DEL_ROUTE = 4 +}; + +enum c2_getconfig_cmd { + C2_GETCONFIG_ROUTES = 1, + C2_GETCONFIG_ADDRS +}; + +/* + * CCIL Work Request Identifiers + */ +enum c2wr_ids { + CCWR_RNIC_OPEN = 1, + CCWR_RNIC_QUERY, + CCWR_RNIC_SETCONFIG, + CCWR_RNIC_GETCONFIG, + CCWR_RNIC_CLOSE, + CCWR_CQ_CREATE, + CCWR_CQ_QUERY, + CCWR_CQ_MODIFY, + CCWR_CQ_DESTROY, + CCWR_QP_CONNECT, + CCWR_PD_ALLOC, + CCWR_PD_DEALLOC, + CCWR_SRQ_CREATE, + CCWR_SRQ_QUERY, + CCWR_SRQ_MODIFY, + CCWR_SRQ_DESTROY, + CCWR_QP_CREATE, + CCWR_QP_QUERY, + CCWR_QP_MODIFY, + CCWR_QP_DESTROY, + CCWR_NSMR_STAG_ALLOC, + CCWR_NSMR_REGISTER, + CCWR_NSMR_PBL, + CCWR_STAG_DEALLOC, + CCWR_NSMR_REREGISTER, + CCWR_SMR_REGISTER, + CCWR_MR_QUERY, + CCWR_MW_ALLOC, + CCWR_MW_QUERY, + CCWR_EP_CREATE, + CCWR_EP_GETOPT, + CCWR_EP_SETOPT, + CCWR_EP_DESTROY, + CCWR_EP_BIND, + CCWR_EP_CONNECT, + CCWR_EP_LISTEN, + CCWR_EP_SHUTDOWN, + CCWR_EP_LISTEN_CREATE, + CCWR_EP_LISTEN_DESTROY, + CCWR_EP_QUERY, + CCWR_CR_ACCEPT, + CCWR_CR_REJECT, + CCWR_CONSOLE, + CCWR_TERM, + CCWR_FLASH_INIT, + CCWR_FLASH, + CCWR_BUF_ALLOC, + CCWR_BUF_FREE, + CCWR_FLASH_WRITE, + CCWR_INIT, /* WARNING: Don't move this ever again! */ + + + + /* Add new IDs here */ + + + + /* + * WARNING: CCWR_LAST must always be the last verbs id defined! + * All the preceding IDs are fixed, and must not change. + * You can add new IDs, but must not remove or reorder + * any IDs. If you do, YOU will ruin any hope of + * compatibility between versions. + */ + CCWR_LAST, + + /* + * Start over at 1 so that arrays indexed by user wr id's + * begin at 1. This is OK since the verbs and user wr id's + * are always used on disjoint sets of queues. + */ + /* + * The order of the CCWR_SEND_XX verbs must + * match the order of the RDMA_OPs + */ + CCWR_SEND = 1, + CCWR_SEND_INV, + CCWR_SEND_SE, + CCWR_SEND_SE_INV, + CCWR_RDMA_WRITE, + CCWR_RDMA_READ, + CCWR_RDMA_READ_INV, + CCWR_MW_BIND, + CCWR_NSMR_FASTREG, + CCWR_STAG_INVALIDATE, + CCWR_RECV, + CCWR_NOP, + CCWR_UNIMPL, +/* WARNING: This must always be the last user wr id defined! */ +}; +#define RDMA_SEND_OPCODE_FROM_WR_ID(x) (x+2) + +/* + * SQ/RQ Work Request Types + */ +enum c2_wr_type { + C2_WR_TYPE_SEND = CCWR_SEND, + C2_WR_TYPE_SEND_SE = CCWR_SEND_SE, + C2_WR_TYPE_SEND_INV = CCWR_SEND_INV, + C2_WR_TYPE_SEND_SE_INV = CCWR_SEND_SE_INV, + C2_WR_TYPE_RDMA_WRITE = CCWR_RDMA_WRITE, + C2_WR_TYPE_RDMA_READ = CCWR_RDMA_READ, + C2_WR_TYPE_RDMA_READ_INV_STAG = CCWR_RDMA_READ_INV, + C2_WR_TYPE_BIND_MW = CCWR_MW_BIND, + C2_WR_TYPE_FASTREG_NSMR = CCWR_NSMR_FASTREG, + C2_WR_TYPE_INV_STAG = CCWR_STAG_INVALIDATE, + C2_WR_TYPE_RECV = CCWR_RECV, + C2_WR_TYPE_NOP = CCWR_NOP, +}; + +struct c2_netaddr { + __be32 ip_addr; + __be32 netmask; + u32 mtu; +}; + +struct c2_route { + u32 ip_addr; /* 0 indicates the default route */ + u32 netmask; /* netmask associated with dst */ + u32 flags; + union { + u32 ipaddr; /* address of the nexthop interface */ + u8 enaddr[6]; + } nexthop; +}; + +/* + * A Scatter Gather Entry. + */ +struct c2_data_addr { + __be32 stag; + __be32 length; + __be64 to; +}; + +/* + * MR and MW flags used by the consumer, RI, and RNIC. + */ +enum c2_mm_flags { + MEM_REMOTE = 0x0001, /* allow mw binds with remote access. 
*/ + MEM_VA_BASED = 0x0002, /* Not Zero-based */ + MEM_PBL_COMPLETE = 0x0004, /* PBL array is complete in this msg */ + MEM_LOCAL_READ = 0x0008, /* allow local reads */ + MEM_LOCAL_WRITE = 0x0010, /* allow local writes */ + MEM_REMOTE_READ = 0x0020, /* allow remote reads */ + MEM_REMOTE_WRITE = 0x0040, /* allow remote writes */ + MEM_WINDOW_BIND = 0x0080, /* binds allowed */ + MEM_SHARED = 0x0100, /* set if MR is shared */ + MEM_STAG_VALID = 0x0200 /* set if STAG is in valid state */ +}; + +/* + * CCIL API ACF flags defined in terms of the low level mem flags. + * This minimizes translation needed in the user API + */ +enum c2_acf { + C2_ACF_LOCAL_READ = MEM_LOCAL_READ, + C2_ACF_LOCAL_WRITE = MEM_LOCAL_WRITE, + C2_ACF_REMOTE_READ = MEM_REMOTE_READ, + C2_ACF_REMOTE_WRITE = MEM_REMOTE_WRITE, + C2_ACF_WINDOW_BIND = MEM_WINDOW_BIND +}; + +/* + * Image types of objects written to flash + */ +#define C2_FLASH_IMG_BITFILE 1 +#define C2_FLASH_IMG_OPTION_ROM 2 +#define C2_FLASH_IMG_VPD 3 + +/* + * to fix bug 1815 we define the max size allowable of the + * terminate message (per the IETF spec).Refer to the IETF + * protocol specification, section 12.1.6, page 64) + * The message is prefixed by 20 types of DDP info. + * + * Then the message has 6 bytes for the terminate control + * and DDP segment length info plus a DDP header (either + * 14 or 18 byts) plus 28 bytes for the RDMA header. + * Thus the max size in: + * 20 + (6 + 18 + 28) = 72 + */ +#define C2_MAX_TERMINATE_MESSAGE_SIZE (72) + +/* + * Build String Length. It must be the same as C2_BUILD_STR_LEN in ccil_api.h + */ +#define WR_BUILD_STR_LEN 64 + +/* + * WARNING: All of these structs need to align any 64bit types on + * 64 bit boundaries! 64bit types include u64 and u64. + */ + +/* + * Clustercore Work Request Header. Be sensitive to field layout + * and alignment. + */ +struct c2wr_hdr { + /* wqe_count is part of the cqe. It is put here so the + * adapter can write to it while the wr is pending without + * clobbering part of the wr. This word need not be dma'd + * from the host to adapter by libccil, but we copy it anyway + * to make the memcpy to the adapter better aligned. + */ + __be32 wqe_count; + + /* Put these fields next so that later 32- and 64-bit + * quantities are naturally aligned. 
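+ *
+ * With CCMSGMAGIC disabled this yields a 16-byte header: wqe_count at
+ * offset 0, the four u8 fields at offsets 4-7, and the 64-bit context
+ * naturally aligned at offset 8 (the packed attribute keeps that layout
+ * fixed in any case).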
+ */ + u8 id; + u8 result; /* adapter -> host */ + u8 sge_count; /* host -> adapter */ + u8 flags; /* host -> adapter */ + + u64 context; +#ifdef CCMSGMAGIC + u32 magic; + u32 pad; +#endif +} __attribute__((packed)); + +/* + *------------------------ RNIC ------------------------ + */ + +/* + * WR_RNIC_OPEN + */ + +/* + * Flags for the RNIC WRs + */ +enum c2_rnic_flags { + RNIC_IRD_STATIC = 0x0001, + RNIC_ORD_STATIC = 0x0002, + RNIC_QP_STATIC = 0x0004, + RNIC_SRQ_SUPPORTED = 0x0008, + RNIC_PBL_BLOCK_MODE = 0x0010, + RNIC_SRQ_MODEL_ARRIVAL = 0x0020, + RNIC_CQ_OVF_DETECTED = 0x0040, + RNIC_PRIV_MODE = 0x0080 +}; + +struct c2wr_rnic_open_req { + struct c2wr_hdr hdr; + u64 user_context; + __be16 flags; /* See enum c2_rnic_flags */ + __be16 port_num; +} __attribute__((packed)); + +struct c2wr_rnic_open_rep { + struct c2wr_hdr hdr; + u32 rnic_handle; +} __attribute__((packed)); + +union c2wr_rnic_open { + struct c2wr_rnic_open_req req; + struct c2wr_rnic_open_rep rep; +} __attribute__((packed)); + +struct c2wr_rnic_query_req { + struct c2wr_hdr hdr; + u32 rnic_handle; +} __attribute__((packed)); + +/* + * WR_RNIC_QUERY + */ +struct c2wr_rnic_query_rep { + struct c2wr_hdr hdr; + u64 user_context; + __be32 vendor_id; + __be32 part_number; + __be32 hw_version; + __be32 fw_ver_major; + __be32 fw_ver_minor; + __be32 fw_ver_patch; + char fw_ver_build_str[WR_BUILD_STR_LEN]; + __be32 max_qps; + __be32 max_qp_depth; + u32 max_srq_depth; + u32 max_send_sgl_depth; + u32 max_rdma_sgl_depth; + __be32 max_cqs; + __be32 max_cq_depth; + u32 max_cq_event_handlers; + __be32 max_mrs; + u32 max_pbl_depth; + __be32 max_pds; + __be32 max_global_ird; + u32 max_global_ord; + __be32 max_qp_ird; + __be32 max_qp_ord; + u32 flags; + __be32 max_mws; + u32 pbe_range_low; + u32 pbe_range_high; + u32 max_srqs; + u32 page_size; +} __attribute__((packed)); + +union c2wr_rnic_query { + struct c2wr_rnic_query_req req; + struct c2wr_rnic_query_rep rep; +} __attribute__((packed)); + +/* + * WR_RNIC_GETCONFIG + */ + +struct c2wr_rnic_getconfig_req { + struct c2wr_hdr hdr; + u32 rnic_handle; + u32 option; /* see c2_getconfig_cmd_t */ + u64 reply_buf; + u32 reply_buf_len; +} __attribute__((packed)) ; + +struct c2wr_rnic_getconfig_rep { + struct c2wr_hdr hdr; + u32 option; /* see c2_getconfig_cmd_t */ + u32 count_len; /* length of the number of addresses configured */ +} __attribute__((packed)) ; + +union c2wr_rnic_getconfig { + struct c2wr_rnic_getconfig_req req; + struct c2wr_rnic_getconfig_rep rep; +} __attribute__((packed)) ; + +/* + * WR_RNIC_SETCONFIG + */ +struct c2wr_rnic_setconfig_req { + struct c2wr_hdr hdr; + u32 rnic_handle; + __be32 option; /* See c2_setconfig_cmd_t */ + /* variable data and pad. 
See c2_netaddr and c2_route */ + u8 data[0]; +} __attribute__((packed)) ; + +struct c2wr_rnic_setconfig_rep { + struct c2wr_hdr hdr; +} __attribute__((packed)) ; + +union c2wr_rnic_setconfig { + struct c2wr_rnic_setconfig_req req; + struct c2wr_rnic_setconfig_rep rep; +} __attribute__((packed)) ; + +/* + * WR_RNIC_CLOSE + */ +struct c2wr_rnic_close_req { + struct c2wr_hdr hdr; + u32 rnic_handle; +} __attribute__((packed)) ; + +struct c2wr_rnic_close_rep { + struct c2wr_hdr hdr; +} __attribute__((packed)) ; + +union c2wr_rnic_close { + struct c2wr_rnic_close_req req; + struct c2wr_rnic_close_rep rep; +} __attribute__((packed)) ; + +/* + *------------------------ CQ ------------------------ + */ +struct c2wr_cq_create_req { + struct c2wr_hdr hdr; + __be64 shared_ht; + u64 user_context; + __be64 msg_pool; + u32 rnic_handle; + __be32 msg_size; + __be32 depth; +} __attribute__((packed)) ; + +struct c2wr_cq_create_rep { + struct c2wr_hdr hdr; + __be32 mq_index; + __be32 adapter_shared; + u32 cq_handle; +} __attribute__((packed)) ; + +union c2wr_cq_create { + struct c2wr_cq_create_req req; + struct c2wr_cq_create_rep rep; +} __attribute__((packed)) ; + +struct c2wr_cq_modify_req { + struct c2wr_hdr hdr; + u32 rnic_handle; + u32 cq_handle; + u32 new_depth; + u64 new_msg_pool; +} __attribute__((packed)) ; + +struct c2wr_cq_modify_rep { + struct c2wr_hdr hdr; +} __attribute__((packed)) ; + +union c2wr_cq_modify { + struct c2wr_cq_modify_req req; + struct c2wr_cq_modify_rep rep; +} __attribute__((packed)) ; + +struct c2wr_cq_destroy_req { + struct c2wr_hdr hdr; + u32 rnic_handle; + u32 cq_handle; +} __attribute__((packed)) ; + +struct c2wr_cq_destroy_rep { + struct c2wr_hdr hdr; +} __attribute__((packed)) ; + +union c2wr_cq_destroy { + struct c2wr_cq_destroy_req req; + struct c2wr_cq_destroy_rep rep; +} __attribute__((packed)) ; + +/* + *------------------------ PD ------------------------ + */ +struct c2wr_pd_alloc_req { + struct c2wr_hdr hdr; + u32 rnic_handle; + u32 pd_id; +} __attribute__((packed)) ; + +struct c2wr_pd_alloc_rep { + struct c2wr_hdr hdr; +} __attribute__((packed)) ; + +union c2wr_pd_alloc { + struct c2wr_pd_alloc_req req; + struct c2wr_pd_alloc_rep rep; +} __attribute__((packed)) ; + +struct c2wr_pd_dealloc_req { + struct c2wr_hdr hdr; + u32 rnic_handle; + u32 pd_id; +} __attribute__((packed)) ; + +struct c2wr_pd_dealloc_rep { + struct c2wr_hdr hdr; +} __attribute__((packed)) ; + +union c2wr_pd_dealloc { + struct c2wr_pd_dealloc_req req; + struct c2wr_pd_dealloc_rep rep; +} __attribute__((packed)) ; + +/* + *------------------------ SRQ ------------------------ + */ +struct c2wr_srq_create_req { + struct c2wr_hdr hdr; + u64 shared_ht; + u64 user_context; + u32 rnic_handle; + u32 srq_depth; + u32 srq_limit; + u32 sgl_depth; + u32 pd_id; +} __attribute__((packed)) ; + +struct c2wr_srq_create_rep { + struct c2wr_hdr hdr; + u32 srq_depth; + u32 sgl_depth; + u32 msg_size; + u32 mq_index; + u32 mq_start; + u32 srq_handle; +} __attribute__((packed)) ; + +union c2wr_srq_create { + struct c2wr_srq_create_req req; + struct c2wr_srq_create_rep rep; +} __attribute__((packed)) ; + +struct c2wr_srq_destroy_req { + struct c2wr_hdr hdr; + u32 rnic_handle; + u32 srq_handle; +} __attribute__((packed)) ; + +struct c2wr_srq_destroy_rep { + struct c2wr_hdr hdr; +} __attribute__((packed)) ; + +union c2wr_srq_destroy { + struct c2wr_srq_destroy_req req; + struct c2wr_srq_destroy_rep rep; +} __attribute__((packed)) ; + +/* + *------------------------ QP ------------------------ + */ +enum c2wr_qp_flags { 
+ QP_RDMA_READ = 0x00000001, /* RDMA read enabled? */ + QP_RDMA_WRITE = 0x00000002, /* RDMA write enabled? */ + QP_MW_BIND = 0x00000004, /* MWs enabled */ + QP_ZERO_STAG = 0x00000008, /* enabled? */ + QP_REMOTE_TERMINATION = 0x00000010, /* remote end terminated */ + QP_RDMA_READ_RESPONSE = 0x00000020 /* Remote RDMA read */ + /* enabled? */ +}; + +struct c2wr_qp_create_req { + struct c2wr_hdr hdr; + __be64 shared_sq_ht; + __be64 shared_rq_ht; + u64 user_context; + u32 rnic_handle; + u32 sq_cq_handle; + u32 rq_cq_handle; + __be32 sq_depth; + __be32 rq_depth; + u32 srq_handle; + u32 srq_limit; + __be32 flags; /* see enum c2wr_qp_flags */ + __be32 send_sgl_depth; + __be32 recv_sgl_depth; + __be32 rdma_write_sgl_depth; + __be32 ord; + __be32 ird; + u32 pd_id; +} __attribute__((packed)) ; + +struct c2wr_qp_create_rep { + struct c2wr_hdr hdr; + __be32 sq_depth; + __be32 rq_depth; + u32 send_sgl_depth; + u32 recv_sgl_depth; + u32 rdma_write_sgl_depth; + u32 ord; + u32 ird; + __be32 sq_msg_size; + __be32 sq_mq_index; + __be32 sq_mq_start; + __be32 rq_msg_size; + __be32 rq_mq_index; + __be32 rq_mq_start; + u32 qp_handle; +} __attribute__((packed)) ; + +union c2wr_qp_create { + struct c2wr_qp_create_req req; + struct c2wr_qp_create_rep rep; +} __attribute__((packed)) ; + +struct c2wr_qp_query_req { + struct c2wr_hdr hdr; + u32 rnic_handle; + u32 qp_handle; +} __attribute__((packed)) ; + +struct c2wr_qp_query_rep { + struct c2wr_hdr hdr; + u64 user_context; + u32 rnic_handle; + u32 sq_depth; + u32 rq_depth; + u32 send_sgl_depth; + u32 rdma_write_sgl_depth; + u32 recv_sgl_depth; + u32 ord; + u32 ird; + u16 qp_state; + u16 flags; /* see c2wr_qp_flags_t */ + u32 qp_id; + u32 local_addr; + u32 remote_addr; + u16 local_port; + u16 remote_port; + u32 terminate_msg_length; /* 0 if not present */ + u8 data[0]; + /* Terminate Message in-line here. */ +} __attribute__((packed)) ; + +union c2wr_qp_query { + struct c2wr_qp_query_req req; + struct c2wr_qp_query_rep rep; +} __attribute__((packed)) ; + +struct c2wr_qp_modify_req { + struct c2wr_hdr hdr; + u64 stream_msg; + u32 stream_msg_length; + u32 rnic_handle; + u32 qp_handle; + __be32 next_qp_state; + __be32 ord; + __be32 ird; + __be32 sq_depth; + __be32 rq_depth; + u32 llp_ep_handle; +} __attribute__((packed)) ; + +struct c2wr_qp_modify_rep { + struct c2wr_hdr hdr; + u32 ord; + u32 ird; + u32 sq_depth; + u32 rq_depth; + u32 sq_msg_size; + u32 sq_mq_index; + u32 sq_mq_start; + u32 rq_msg_size; + u32 rq_mq_index; + u32 rq_mq_start; +} __attribute__((packed)) ; + +union c2wr_qp_modify { + struct c2wr_qp_modify_req req; + struct c2wr_qp_modify_rep rep; +} __attribute__((packed)) ; + +struct c2wr_qp_destroy_req { + struct c2wr_hdr hdr; + u32 rnic_handle; + u32 qp_handle; +} __attribute__((packed)) ; + +struct c2wr_qp_destroy_rep { + struct c2wr_hdr hdr; +} __attribute__((packed)) ; + +union c2wr_qp_destroy { + struct c2wr_qp_destroy_req req; + struct c2wr_qp_destroy_rep rep; +} __attribute__((packed)) ; + +/* + * The CCWR_QP_CONNECT msg is posted on the verbs request queue. It can + * only be posted when a QP is in IDLE state. After the connect request is + * submitted to the LLP, the adapter moves the QP to CONNECT_PENDING state. + * No synchronous reply from adapter to this WR. 
The results of + * connection are passed back in an async event CCAE_ACTIVE_CONNECT_RESULTS + * See c2wr_ae_active_connect_results_t + */ +struct c2wr_qp_connect_req { + struct c2wr_hdr hdr; + u32 rnic_handle; + u32 qp_handle; + __be32 remote_addr; + __be16 remote_port; + u16 pad; + __be32 private_data_length; + u8 private_data[0]; /* Private data in-line. */ +} __attribute__((packed)) ; + +struct c2wr_qp_connect { + struct c2wr_qp_connect_req req; + /* no synchronous reply. */ +} __attribute__((packed)) ; + + +/* + *------------------------ MM ------------------------ + */ + +struct c2wr_nsmr_stag_alloc_req { + struct c2wr_hdr hdr; + u32 rnic_handle; + u32 pbl_depth; + u32 pd_id; + u32 flags; +} __attribute__((packed)) ; + +struct c2wr_nsmr_stag_alloc_rep { + struct c2wr_hdr hdr; + u32 pbl_depth; + u32 stag_index; +} __attribute__((packed)) ; + +union c2wr_nsmr_stag_alloc { + struct c2wr_nsmr_stag_alloc_req req; + struct c2wr_nsmr_stag_alloc_rep rep; +} __attribute__((packed)) ; + +struct c2wr_nsmr_register_req { + struct c2wr_hdr hdr; + __be64 va; + u32 rnic_handle; + __be16 flags; + u8 stag_key; + u8 pad; + u32 pd_id; + __be32 pbl_depth; + __be32 pbe_size; + __be32 fbo; + __be32 length; + __be32 addrs_length; + /* array of paddrs (must be aligned on a 64bit boundary) */ + __be64 paddrs[0]; +} __attribute__((packed)) ; + +struct c2wr_nsmr_register_rep { + struct c2wr_hdr hdr; + u32 pbl_depth; + __be32 stag_index; +} __attribute__((packed)) ; + +union c2wr_nsmr_register { + struct c2wr_nsmr_register_req req; + struct c2wr_nsmr_register_rep rep; +} __attribute__((packed)) ; + +struct c2wr_nsmr_pbl_req { + struct c2wr_hdr hdr; + u32 rnic_handle; + __be32 flags; + __be32 stag_index; + __be32 addrs_length; + /* array of paddrs (must be aligned on a 64bit boundary) */ + __be64 paddrs[0]; +} __attribute__((packed)) ; + +struct c2wr_nsmr_pbl_rep { + struct c2wr_hdr hdr; +} __attribute__((packed)) ; + +union c2wr_nsmr_pbl { + struct c2wr_nsmr_pbl_req req; + struct c2wr_nsmr_pbl_rep rep; +} __attribute__((packed)) ; + +struct c2wr_mr_query_req { + struct c2wr_hdr hdr; + u32 rnic_handle; + u32 stag_index; +} __attribute__((packed)) ; + +struct c2wr_mr_query_rep { + struct c2wr_hdr hdr; + u8 stag_key; + u8 pad[3]; + u32 pd_id; + u32 flags; + u32 pbl_depth; +} __attribute__((packed)) ; + +union c2wr_mr_query { + struct c2wr_mr_query_req req; + struct c2wr_mr_query_rep rep; +} __attribute__((packed)) ; + +struct c2wr_mw_query_req { + struct c2wr_hdr hdr; + u32 rnic_handle; + u32 stag_index; +} __attribute__((packed)) ; + +struct c2wr_mw_query_rep { + struct c2wr_hdr hdr; + u8 stag_key; + u8 pad[3]; + u32 pd_id; + u32 flags; +} __attribute__((packed)) ; + +union c2wr_mw_query { + struct c2wr_mw_query_req req; + struct c2wr_mw_query_rep rep; +} __attribute__((packed)) ; + + +struct c2wr_stag_dealloc_req { + struct c2wr_hdr hdr; + u32 rnic_handle; + __be32 stag_index; +} __attribute__((packed)) ; + +struct c2wr_stag_dealloc_rep { + struct c2wr_hdr hdr; +} __attribute__((packed)) ; + +union c2wr_stag_dealloc { + struct c2wr_stag_dealloc_req req; + struct c2wr_stag_dealloc_rep rep; +} __attribute__((packed)) ; + +struct c2wr_nsmr_reregister_req { + struct c2wr_hdr hdr; + u64 va; + u32 rnic_handle; + u16 flags; + u8 stag_key; + u8 pad; + u32 stag_index; + u32 pd_id; + u32 pbl_depth; + u32 pbe_size; + u32 fbo; + u32 length; + u32 addrs_length; + u32 pad1; + /* array of paddrs (must be aligned on a 64bit boundary) */ + u64 paddrs[0]; +} __attribute__((packed)) ; + +struct c2wr_nsmr_reregister_rep { + struct 
c2wr_hdr hdr; + u32 pbl_depth; + u32 stag_index; +} __attribute__((packed)) ; + +union c2wr_nsmr_reregister { + struct c2wr_nsmr_reregister_req req; + struct c2wr_nsmr_reregister_rep rep; +} __attribute__((packed)) ; + +struct c2wr_smr_register_req { + struct c2wr_hdr hdr; + u64 va; + u32 rnic_handle; + u16 flags; + u8 stag_key; + u8 pad; + u32 stag_index; + u32 pd_id; +} __attribute__((packed)) ; + +struct c2wr_smr_register_rep { + struct c2wr_hdr hdr; + u32 stag_index; +} __attribute__((packed)) ; + +union c2wr_smr_register { + struct c2wr_smr_register_req req; + struct c2wr_smr_register_rep rep; +} __attribute__((packed)) ; + +struct c2wr_mw_alloc_req { + struct c2wr_hdr hdr; + u32 rnic_handle; + u32 pd_id; +} __attribute__((packed)) ; + +struct c2wr_mw_alloc_rep { + struct c2wr_hdr hdr; + u32 stag_index; +} __attribute__((packed)) ; + +union c2wr_mw_alloc { + struct c2wr_mw_alloc_req req; + struct c2wr_mw_alloc_rep rep; +} __attribute__((packed)) ; + +/* + *------------------------ WRs ----------------------- + */ + +struct c2wr_user_hdr { + struct c2wr_hdr hdr; /* Has status and WR Type */ +} __attribute__((packed)) ; + +enum c2_qp_state { + C2_QP_STATE_IDLE = 0x01, + C2_QP_STATE_CONNECTING = 0x02, + C2_QP_STATE_RTS = 0x04, + C2_QP_STATE_CLOSING = 0x08, + C2_QP_STATE_TERMINATE = 0x10, + C2_QP_STATE_ERROR = 0x20, +}; + +/* Completion queue entry. */ +struct c2wr_ce { + struct c2wr_hdr hdr; /* Has status and WR Type */ + u64 qp_user_context; /* c2_user_qp_t * */ + u32 qp_state; /* Current QP State */ + u32 handle; /* QPID or EP Handle */ + __be32 bytes_rcvd; /* valid for RECV WCs */ + u32 stag; +} __attribute__((packed)) ; + + +/* + * Flags used for all post-sq WRs. These must fit in the flags + * field of the struct c2wr_hdr (eight bits). + */ +enum { + SQ_SIGNALED = 0x01, + SQ_READ_FENCE = 0x02, + SQ_FENCE = 0x04, +}; + +/* + * Common fields for all post-sq WRs. Namely the standard header and a + * secondary header with fields common to all post-sq WRs. + */ +struct c2_sq_hdr { + struct c2wr_user_hdr user_hdr; +} __attribute__((packed)); + +/* + * Same as above but for post-rq WRs. + */ +struct c2_rq_hdr { + struct c2wr_user_hdr user_hdr; +} __attribute__((packed)); + +/* + * use the same struct for all sends. 
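+ * The CCWR_SEND, CCWR_SEND_SE, CCWR_SEND_INV and CCWR_SEND_SE_INV
+ * variants share this layout and are told apart by hdr.id (set via
+ * c2_wr_set_id()); the c2wr_sqwr union below maps all four onto
+ * struct c2wr_send_req accordingly.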
+ */ +struct c2wr_send_req { + struct c2_sq_hdr sq_hdr; + __be32 sge_len; + __be32 remote_stag; + u8 data[0]; /* SGE array */ +} __attribute__((packed)); + +union c2wr_send { + struct c2wr_send_req req; + struct c2wr_ce rep; +} __attribute__((packed)); + +struct c2wr_rdma_write_req { + struct c2_sq_hdr sq_hdr; + __be64 remote_to; + __be32 remote_stag; + __be32 sge_len; + u8 data[0]; /* SGE array */ +} __attribute__((packed)); + +union c2wr_rdma_write { + struct c2wr_rdma_write_req req; + struct c2wr_ce rep; +} __attribute__((packed)); + +struct c2wr_rdma_read_req { + struct c2_sq_hdr sq_hdr; + __be64 local_to; + __be64 remote_to; + __be32 local_stag; + __be32 remote_stag; + __be32 length; +} __attribute__((packed)); + +union c2wr_rdma_read { + struct c2wr_rdma_read_req req; + struct c2wr_ce rep; +} __attribute__((packed)); + +struct c2wr_mw_bind_req { + struct c2_sq_hdr sq_hdr; + u64 va; + u8 stag_key; + u8 pad[3]; + u32 mw_stag_index; + u32 mr_stag_index; + u32 length; + u32 flags; +} __attribute__((packed)); + +union c2wr_mw_bind { + struct c2wr_mw_bind_req req; + struct c2wr_ce rep; +} __attribute__((packed)); + +struct c2wr_nsmr_fastreg_req { + struct c2_sq_hdr sq_hdr; + u64 va; + u8 stag_key; + u8 pad[3]; + u32 stag_index; + u32 pbe_size; + u32 fbo; + u32 length; + u32 addrs_length; + /* array of paddrs (must be aligned on a 64bit boundary) */ + u64 paddrs[0]; +} __attribute__((packed)); + +union c2wr_nsmr_fastreg { + struct c2wr_nsmr_fastreg_req req; + struct c2wr_ce rep; +} __attribute__((packed)); + +struct c2wr_stag_invalidate_req { + struct c2_sq_hdr sq_hdr; + u8 stag_key; + u8 pad[3]; + u32 stag_index; +} __attribute__((packed)); + +union c2wr_stag_invalidate { + struct c2wr_stag_invalidate_req req; + struct c2wr_ce rep; +} __attribute__((packed)); + +union c2wr_sqwr { + struct c2_sq_hdr sq_hdr; + struct c2wr_send_req send; + struct c2wr_send_req send_se; + struct c2wr_send_req send_inv; + struct c2wr_send_req send_se_inv; + struct c2wr_rdma_write_req rdma_write; + struct c2wr_rdma_read_req rdma_read; + struct c2wr_mw_bind_req mw_bind; + struct c2wr_nsmr_fastreg_req nsmr_fastreg; + struct c2wr_stag_invalidate_req stag_inv; +} __attribute__((packed)); + + +/* + * RQ WRs + */ +struct c2wr_rqwr { + struct c2_rq_hdr rq_hdr; + u8 data[0]; /* array of SGEs */ +} __attribute__((packed)); + +union c2wr_recv { + struct c2wr_rqwr req; + struct c2wr_ce rep; +} __attribute__((packed)); + +/* + * All AEs start with this header. Most AEs only need to convey the + * information in the header. Some, like LLP connection events, need + * more info. The union typdef c2wr_ae_t has all the possible AEs. + * + * hdr.context is the user_context from the rnic_open WR. NULL If this + * is not affiliated with an rnic + * + * hdr.id is the AE identifier (eg; CCAE_REMOTE_SHUTDOWN, + * CCAE_LLP_CLOSE_COMPLETE) + * + * resource_type is one of: C2_RES_IND_QP, C2_RES_IND_CQ, C2_RES_IND_SRQ + * + * user_context is the context passed down when the host created the resource. + */ +struct c2wr_ae_hdr { + struct c2wr_hdr hdr; + u64 user_context; /* user context for this res. 
*/ + __be32 resource_type; /* see enum c2_resource_indicator */ + __be32 resource; /* handle for resource */ + __be32 qp_state; /* current QP State */ +} __attribute__((packed)); + +/* + * After submitting the CCAE_ACTIVE_CONNECT_RESULTS message on the AEQ, + * the adapter moves the QP into RTS state + */ +struct c2wr_ae_active_connect_results { + struct c2wr_ae_hdr ae_hdr; + __be32 laddr; + __be32 raddr; + __be16 lport; + __be16 rport; + __be32 private_data_length; + u8 private_data[0]; /* data is in-line in the msg. */ +} __attribute__((packed)); + +/* + * When connections are established by the stack (and the private data + * MPA frame is received), the adapter will generate an event to the host. + * The details of the connection, any private data, and the new connection + * request handle is passed up via the CCAE_CONNECTION_REQUEST msg on the + * AE queue: + */ +struct c2wr_ae_connection_request { + struct c2wr_ae_hdr ae_hdr; + u32 cr_handle; /* connreq handle (sock ptr) */ + __be32 laddr; + __be32 raddr; + __be16 lport; + __be16 rport; + __be32 private_data_length; + u8 private_data[0]; /* data is in-line in the msg. */ +} __attribute__((packed)); + +union c2wr_ae { + struct c2wr_ae_hdr ae_generic; + struct c2wr_ae_active_connect_results ae_active_connect_results; + struct c2wr_ae_connection_request ae_connection_request; +} __attribute__((packed)); + +struct c2wr_init_req { + struct c2wr_hdr hdr; + __be64 hint_count; + __be64 q0_host_shared; + __be64 q1_host_shared; + __be64 q1_host_msg_pool; + __be64 q2_host_shared; + __be64 q2_host_msg_pool; +} __attribute__((packed)); + +struct c2wr_init_rep { + struct c2wr_hdr hdr; +} __attribute__((packed)); + +union c2wr_init { + struct c2wr_init_req req; + struct c2wr_init_rep rep; +} __attribute__((packed)); + +/* + * For upgrading flash. 
+ */ + +struct c2wr_flash_init_req { + struct c2wr_hdr hdr; + u32 rnic_handle; +} __attribute__((packed)); + +struct c2wr_flash_init_rep { + struct c2wr_hdr hdr; + u32 adapter_flash_buf_offset; + u32 adapter_flash_len; +} __attribute__((packed)); + +union c2wr_flash_init { + struct c2wr_flash_init_req req; + struct c2wr_flash_init_rep rep; +} __attribute__((packed)); + +struct c2wr_flash_req { + struct c2wr_hdr hdr; + u32 rnic_handle; + u32 len; +} __attribute__((packed)); + +struct c2wr_flash_rep { + struct c2wr_hdr hdr; + u32 status; +} __attribute__((packed)); + +union c2wr_flash { + struct c2wr_flash_req req; + struct c2wr_flash_rep rep; +} __attribute__((packed)); + +struct c2wr_buf_alloc_req { + struct c2wr_hdr hdr; + u32 rnic_handle; + u32 size; +} __attribute__((packed)); + +struct c2wr_buf_alloc_rep { + struct c2wr_hdr hdr; + u32 offset; /* 0 if mem not available */ + u32 size; /* 0 if mem not available */ +} __attribute__((packed)); + +union c2wr_buf_alloc { + struct c2wr_buf_alloc_req req; + struct c2wr_buf_alloc_rep rep; +} __attribute__((packed)); + +struct c2wr_buf_free_req { + struct c2wr_hdr hdr; + u32 rnic_handle; + u32 offset; /* Must match value from alloc */ + u32 size; /* Must match value from alloc */ +} __attribute__((packed)); + +struct c2wr_buf_free_rep { + struct c2wr_hdr hdr; +} __attribute__((packed)); + +union c2wr_buf_free { + struct c2wr_buf_free_req req; + struct c2wr_ce rep; +} __attribute__((packed)); + +struct c2wr_flash_write_req { + struct c2wr_hdr hdr; + u32 rnic_handle; + u32 offset; + u32 size; + u32 type; + u32 flags; +} __attribute__((packed)); + +struct c2wr_flash_write_rep { + struct c2wr_hdr hdr; + u32 status; +} __attribute__((packed)); + +union c2wr_flash_write { + struct c2wr_flash_write_req req; + struct c2wr_flash_write_rep rep; +} __attribute__((packed)); + +/* + * Messages for LLP connection setup. + */ + +/* + * Listen Request. This allocates a listening endpoint to allow passive + * connection setup. Newly established LLP connections are passed up + * via an AE. See c2wr_ae_connection_request_t + */ +struct c2wr_ep_listen_create_req { + struct c2wr_hdr hdr; + u64 user_context; /* returned in AEs. */ + u32 rnic_handle; + __be32 local_addr; /* local addr, or 0 */ + __be16 local_port; /* 0 means "pick one" */ + u16 pad; + __be32 backlog; /* tradional tcp listen bl */ +} __attribute__((packed)); + +struct c2wr_ep_listen_create_rep { + struct c2wr_hdr hdr; + u32 ep_handle; /* handle to new listening ep */ + u16 local_port; /* resulting port... 
*/ + u16 pad; +} __attribute__((packed)); + +union c2wr_ep_listen_create { + struct c2wr_ep_listen_create_req req; + struct c2wr_ep_listen_create_rep rep; +} __attribute__((packed)); + +struct c2wr_ep_listen_destroy_req { + struct c2wr_hdr hdr; + u32 rnic_handle; + u32 ep_handle; +} __attribute__((packed)); + +struct c2wr_ep_listen_destroy_rep { + struct c2wr_hdr hdr; +} __attribute__((packed)); + +union c2wr_ep_listen_destroy { + struct c2wr_ep_listen_destroy_req req; + struct c2wr_ep_listen_destroy_rep rep; +} __attribute__((packed)); + +struct c2wr_ep_query_req { + struct c2wr_hdr hdr; + u32 rnic_handle; + u32 ep_handle; +} __attribute__((packed)); + +struct c2wr_ep_query_rep { + struct c2wr_hdr hdr; + u32 rnic_handle; + u32 local_addr; + u32 remote_addr; + u16 local_port; + u16 remote_port; +} __attribute__((packed)); + +union c2wr_ep_query { + struct c2wr_ep_query_req req; + struct c2wr_ep_query_rep rep; +} __attribute__((packed)); + + +/* + * The host passes this down to indicate acceptance of a pending iWARP + * connection. The cr_handle was obtained from the CONNECTION_REQUEST + * AE passed up by the adapter. See c2wr_ae_connection_request_t. + */ +struct c2wr_cr_accept_req { + struct c2wr_hdr hdr; + u32 rnic_handle; + u32 qp_handle; /* QP to bind to this LLP conn */ + u32 ep_handle; /* LLP handle to accept */ + __be32 private_data_length; + u8 private_data[0]; /* data in-line in msg. */ +} __attribute__((packed)); + +/* + * adapter sends reply when private data is successfully submitted to + * the LLP. + */ +struct c2wr_cr_accept_rep { + struct c2wr_hdr hdr; +} __attribute__((packed)); + +union c2wr_cr_accept { + struct c2wr_cr_accept_req req; + struct c2wr_cr_accept_rep rep; +} __attribute__((packed)); + +/* + * The host sends this down if a given iWARP connection request was + * rejected by the consumer. The cr_handle was obtained from a + * previous c2wr_ae_connection_request_t AE sent by the adapter. + */ +struct c2wr_cr_reject_req { + struct c2wr_hdr hdr; + u32 rnic_handle; + u32 ep_handle; /* LLP handle to reject */ +} __attribute__((packed)); + +/* + * Dunno if this is needed, but we'll add it for now. The adapter will + * send the reject_reply after the LLP endpoint has been destroyed. + */ +struct c2wr_cr_reject_rep { + struct c2wr_hdr hdr; +} __attribute__((packed)); + +union c2wr_cr_reject { + struct c2wr_cr_reject_req req; + struct c2wr_cr_reject_rep rep; +} __attribute__((packed)); + +/* + * console command. Used to implement a debug console over the verbs + * request and reply queues. + */ + +/* + * Console request message. It contains: + * - message hdr with id = CCWR_CONSOLE + * - the physaddr/len of host memory to be used for the reply. + * - the command string. eg: "netstat -s" or "zoneinfo" + */ +struct c2wr_console_req { + struct c2wr_hdr hdr; /* id = CCWR_CONSOLE */ + u64 reply_buf; /* pinned host buf for reply */ + u32 reply_buf_len; /* length of reply buffer */ + u8 command[0]; /* NUL terminated ascii string */ + /* containing the command req */ +} __attribute__((packed)); + +/* + * flags used in the console reply. + */ +enum c2_console_flags { + CONS_REPLY_TRUNCATED = 0x00000001 /* reply was truncated */ +} __attribute__((packed)); + +/* + * Console reply message. + * hdr.result contains the c2_status_t error if the reply was _not_ generated, + * or C2_OK if the reply was generated. 
+ */ +struct c2wr_console_rep { + struct c2wr_hdr hdr; /* id = CCWR_CONSOLE */ + u32 flags; +} __attribute__((packed)); + +union c2wr_console { + struct c2wr_console_req req; + struct c2wr_console_rep rep; +} __attribute__((packed)); + + +/* + * Giant union with all WRs. Makes life easier... + */ +union c2wr { + struct c2wr_hdr hdr; + struct c2wr_user_hdr user_hdr; + union c2wr_rnic_open rnic_open; + union c2wr_rnic_query rnic_query; + union c2wr_rnic_getconfig rnic_getconfig; + union c2wr_rnic_setconfig rnic_setconfig; + union c2wr_rnic_close rnic_close; + union c2wr_cq_create cq_create; + union c2wr_cq_modify cq_modify; + union c2wr_cq_destroy cq_destroy; + union c2wr_pd_alloc pd_alloc; + union c2wr_pd_dealloc pd_dealloc; + union c2wr_srq_create srq_create; + union c2wr_srq_destroy srq_destroy; + union c2wr_qp_create qp_create; + union c2wr_qp_query qp_query; + union c2wr_qp_modify qp_modify; + union c2wr_qp_destroy qp_destroy; + struct c2wr_qp_connect qp_connect; + union c2wr_nsmr_stag_alloc nsmr_stag_alloc; + union c2wr_nsmr_register nsmr_register; + union c2wr_nsmr_pbl nsmr_pbl; + union c2wr_mr_query mr_query; + union c2wr_mw_query mw_query; + union c2wr_stag_dealloc stag_dealloc; + union c2wr_sqwr sqwr; + struct c2wr_rqwr rqwr; + struct c2wr_ce ce; + union c2wr_ae ae; + union c2wr_init init; + union c2wr_ep_listen_create ep_listen_create; + union c2wr_ep_listen_destroy ep_listen_destroy; + union c2wr_cr_accept cr_accept; + union c2wr_cr_reject cr_reject; + union c2wr_console console; + union c2wr_flash_init flash_init; + union c2wr_flash flash; + union c2wr_buf_alloc buf_alloc; + union c2wr_buf_free buf_free; + union c2wr_flash_write flash_write; +} __attribute__((packed)); + + +/* + * Accessors for the wr fields that are packed together tightly to + * reduce the wr message size. The wr arguments are void* so that + * either a struct c2wr*, a struct c2wr_hdr*, or a pointer to any of the types + * in the struct c2wr union can be passed in. + */ +static __inline__ u8 c2_wr_get_id(void *wr) +{ + return ((struct c2wr_hdr *) wr)->id; +} +static __inline__ void c2_wr_set_id(void *wr, u8 id) +{ + ((struct c2wr_hdr *) wr)->id = id; +} +static __inline__ u8 c2_wr_get_result(void *wr) +{ + return ((struct c2wr_hdr *) wr)->result; +} +static __inline__ void c2_wr_set_result(void *wr, u8 result) +{ + ((struct c2wr_hdr *) wr)->result = result; +} +static __inline__ u8 c2_wr_get_flags(void *wr) +{ + return ((struct c2wr_hdr *) wr)->flags; +} +static __inline__ void c2_wr_set_flags(void *wr, u8 flags) +{ + ((struct c2wr_hdr *) wr)->flags = flags; +} +static __inline__ u8 c2_wr_get_sge_count(void *wr) +{ + return ((struct c2wr_hdr *) wr)->sge_count; +} +static __inline__ void c2_wr_set_sge_count(void *wr, u8 sge_count) +{ + ((struct c2wr_hdr *) wr)->sge_count = sge_count; +} +static __inline__ __be32 c2_wr_get_wqe_count(void *wr) +{ + return ((struct c2wr_hdr *) wr)->wqe_count; +} +static __inline__ void c2_wr_set_wqe_count(void *wr, u32 wqe_count) +{ + ((struct c2wr_hdr *) wr)->wqe_count = wqe_count; +} + +#endif /* _C2_WR_H_ */ diff --git a/kernel/drivers/infiniband/hw/cxgb3/Kconfig b/kernel/drivers/infiniband/hw/cxgb3/Kconfig new file mode 100644 index 000000000..2b6352b85 --- /dev/null +++ b/kernel/drivers/infiniband/hw/cxgb3/Kconfig @@ -0,0 +1,27 @@ +config INFINIBAND_CXGB3 + tristate "Chelsio RDMA Driver" + depends on CHELSIO_T3 && INET + select GENERIC_ALLOCATOR + ---help--- + This is an iWARP/RDMA driver for the Chelsio T3 1GbE and + 10GbE adapters. 
+ + For general information about Chelsio and our products, visit + our website at . + + For customer support, please visit our customer support page at + . + + Please send feedback to . + + To compile this driver as a module, choose M here: the module + will be called iw_cxgb3. + +config INFINIBAND_CXGB3_DEBUG + bool "Verbose debugging output" + depends on INFINIBAND_CXGB3 + default n + ---help--- + This option causes the Chelsio RDMA driver to produce copious + amounts of debug messages. Select this if you are developing + the driver or trying to diagnose a problem. diff --git a/kernel/drivers/infiniband/hw/cxgb3/Makefile b/kernel/drivers/infiniband/hw/cxgb3/Makefile new file mode 100644 index 000000000..276136418 --- /dev/null +++ b/kernel/drivers/infiniband/hw/cxgb3/Makefile @@ -0,0 +1,8 @@ +ccflags-y := -Idrivers/net/ethernet/chelsio/cxgb3 + +obj-$(CONFIG_INFINIBAND_CXGB3) += iw_cxgb3.o + +iw_cxgb3-y := iwch_cm.o iwch_ev.o iwch_cq.o iwch_qp.o iwch_mem.o \ + iwch_provider.o iwch.o cxio_hal.o cxio_resource.o + +ccflags-$(CONFIG_INFINIBAND_CXGB3_DEBUG) += -DDEBUG diff --git a/kernel/drivers/infiniband/hw/cxgb3/cxio_dbg.c b/kernel/drivers/infiniband/hw/cxgb3/cxio_dbg.c new file mode 100644 index 000000000..8bca6b4ec --- /dev/null +++ b/kernel/drivers/infiniband/hw/cxgb3/cxio_dbg.c @@ -0,0 +1,207 @@ +/* + * Copyright (c) 2006 Chelsio, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#ifdef DEBUG +#include +#include +#include "common.h" +#include "cxgb3_ioctl.h" +#include "cxio_hal.h" +#include "cxio_wr.h" + +void cxio_dump_tpt(struct cxio_rdev *rdev, u32 stag) +{ + struct ch_mem_range *m; + u64 *data; + int rc; + int size = 32; + + m = kmalloc(sizeof(*m) + size, GFP_ATOMIC); + if (!m) { + PDBG("%s couldn't allocate memory.\n", __func__); + return; + } + m->mem_id = MEM_PMRX; + m->addr = (stag>>8) * 32 + rdev->rnic_info.tpt_base; + m->len = size; + PDBG("%s TPT addr 0x%x len %d\n", __func__, m->addr, m->len); + rc = rdev->t3cdev_p->ctl(rdev->t3cdev_p, RDMA_GET_MEM, m); + if (rc) { + PDBG("%s toectl returned error %d\n", __func__, rc); + kfree(m); + return; + } + + data = (u64 *)m->buf; + while (size > 0) { + PDBG("TPT %08x: %016llx\n", m->addr, (unsigned long long) *data); + size -= 8; + data++; + m->addr += 8; + } + kfree(m); +} + +void cxio_dump_pbl(struct cxio_rdev *rdev, u32 pbl_addr, uint len, u8 shift) +{ + struct ch_mem_range *m; + u64 *data; + int rc; + int size, npages; + + shift += 12; + npages = (len + (1ULL << shift) - 1) >> shift; + size = npages * sizeof(u64); + + m = kmalloc(sizeof(*m) + size, GFP_ATOMIC); + if (!m) { + PDBG("%s couldn't allocate memory.\n", __func__); + return; + } + m->mem_id = MEM_PMRX; + m->addr = pbl_addr; + m->len = size; + PDBG("%s PBL addr 0x%x len %d depth %d\n", + __func__, m->addr, m->len, npages); + rc = rdev->t3cdev_p->ctl(rdev->t3cdev_p, RDMA_GET_MEM, m); + if (rc) { + PDBG("%s toectl returned error %d\n", __func__, rc); + kfree(m); + return; + } + + data = (u64 *)m->buf; + while (size > 0) { + PDBG("PBL %08x: %016llx\n", m->addr, (unsigned long long) *data); + size -= 8; + data++; + m->addr += 8; + } + kfree(m); +} + +void cxio_dump_wqe(union t3_wr *wqe) +{ + __be64 *data = (__be64 *)wqe; + uint size = (uint)(be64_to_cpu(*data) & 0xff); + + if (size == 0) + size = 8; + while (size > 0) { + PDBG("WQE %p: %016llx\n", data, + (unsigned long long) be64_to_cpu(*data)); + size--; + data++; + } +} + +void cxio_dump_wce(struct t3_cqe *wce) +{ + __be64 *data = (__be64 *)wce; + int size = sizeof(*wce); + + while (size > 0) { + PDBG("WCE %p: %016llx\n", data, + (unsigned long long) be64_to_cpu(*data)); + size -= 8; + data++; + } +} + +void cxio_dump_rqt(struct cxio_rdev *rdev, u32 hwtid, int nents) +{ + struct ch_mem_range *m; + int size = nents * 64; + u64 *data; + int rc; + + m = kmalloc(sizeof(*m) + size, GFP_ATOMIC); + if (!m) { + PDBG("%s couldn't allocate memory.\n", __func__); + return; + } + m->mem_id = MEM_PMRX; + m->addr = ((hwtid)<<10) + rdev->rnic_info.rqt_base; + m->len = size; + PDBG("%s RQT addr 0x%x len %d\n", __func__, m->addr, m->len); + rc = rdev->t3cdev_p->ctl(rdev->t3cdev_p, RDMA_GET_MEM, m); + if (rc) { + PDBG("%s toectl returned error %d\n", __func__, rc); + kfree(m); + return; + } + + data = (u64 *)m->buf; + while (size > 0) { + PDBG("RQT %08x: %016llx\n", m->addr, (unsigned long long) *data); + size -= 8; + data++; + m->addr += 8; + } + kfree(m); +} + +void cxio_dump_tcb(struct cxio_rdev *rdev, u32 hwtid) +{ + struct ch_mem_range *m; + int size = TCB_SIZE; + u32 *data; + int rc; + + m = kmalloc(sizeof(*m) + size, GFP_ATOMIC); + if (!m) { + PDBG("%s couldn't allocate memory.\n", __func__); + return; + } + m->mem_id = MEM_CM; + m->addr = hwtid * size; + m->len = size; + PDBG("%s TCB %d len %d\n", __func__, m->addr, m->len); + rc = rdev->t3cdev_p->ctl(rdev->t3cdev_p, RDMA_GET_MEM, m); + if (rc) { + PDBG("%s toectl returned error %d\n", __func__, rc); + kfree(m); + return; + } + + data = (u32 *)m->buf; + 
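+	/* Dump the TCB 32 bytes (eight 32-bit words) per line; each pair
+	 * of words is printed swapped, i.e. in 2,3,0,1,6,7,4,5 order.
+	 */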
while (size > 0) { + printk("%2u: %08x %08x %08x %08x %08x %08x %08x %08x\n", + m->addr, + *(data+2), *(data+3), *(data),*(data+1), + *(data+6), *(data+7), *(data+4), *(data+5)); + size -= 32; + data += 8; + m->addr += 32; + } + kfree(m); +} +#endif diff --git a/kernel/drivers/infiniband/hw/cxgb3/cxio_hal.c b/kernel/drivers/infiniband/hw/cxgb3/cxio_hal.c new file mode 100644 index 000000000..de1c61b41 --- /dev/null +++ b/kernel/drivers/infiniband/hw/cxgb3/cxio_hal.c @@ -0,0 +1,1343 @@ +/* + * Copyright (c) 2006 Chelsio, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "cxio_resource.h" +#include "cxio_hal.h" +#include "cxgb3_offload.h" +#include "sge_defs.h" + +static LIST_HEAD(rdev_list); +static cxio_hal_ev_callback_func_t cxio_ev_cb = NULL; + +static struct cxio_rdev *cxio_hal_find_rdev_by_name(char *dev_name) +{ + struct cxio_rdev *rdev; + + list_for_each_entry(rdev, &rdev_list, entry) + if (!strcmp(rdev->dev_name, dev_name)) + return rdev; + return NULL; +} + +static struct cxio_rdev *cxio_hal_find_rdev_by_t3cdev(struct t3cdev *tdev) +{ + struct cxio_rdev *rdev; + + list_for_each_entry(rdev, &rdev_list, entry) + if (rdev->t3cdev_p == tdev) + return rdev; + return NULL; +} + +int cxio_hal_cq_op(struct cxio_rdev *rdev_p, struct t3_cq *cq, + enum t3_cq_opcode op, u32 credit) +{ + int ret; + struct t3_cqe *cqe; + u32 rptr; + + struct rdma_cq_op setup; + setup.id = cq->cqid; + setup.credits = (op == CQ_CREDIT_UPDATE) ? credit : 0; + setup.op = op; + ret = rdev_p->t3cdev_p->ctl(rdev_p->t3cdev_p, RDMA_CQ_OP, &setup); + + if ((ret < 0) || (op == CQ_CREDIT_UPDATE)) + return ret; + + /* + * If the rearm returned an index other than our current index, + * then there might be CQE's in flight (being DMA'd). We must wait + * here for them to complete or the consumer can miss a notification. + */ + if (Q_PTR2IDX((cq->rptr), cq->size_log2) != ret) { + int i=0; + + rptr = cq->rptr; + + /* + * Keep the generation correct by bumping rptr until it + * matches the index returned by the rearm - 1. 
+ */ + while (Q_PTR2IDX((rptr+1), cq->size_log2) != ret) + rptr++; + + /* + * Now rptr is the index for the (last) cqe that was + * in-flight at the time the HW rearmed the CQ. We + * spin until that CQE is valid. + */ + cqe = cq->queue + Q_PTR2IDX(rptr, cq->size_log2); + while (!CQ_VLD_ENTRY(rptr, cq->size_log2, cqe)) { + udelay(1); + if (i++ > 1000000) { + printk(KERN_ERR "%s: stalled rnic\n", + rdev_p->dev_name); + return -EIO; + } + } + + return 1; + } + + return 0; +} + +static int cxio_hal_clear_cq_ctx(struct cxio_rdev *rdev_p, u32 cqid) +{ + struct rdma_cq_setup setup; + setup.id = cqid; + setup.base_addr = 0; /* NULL address */ + setup.size = 0; /* disaable the CQ */ + setup.credits = 0; + setup.credit_thres = 0; + setup.ovfl_mode = 0; + return (rdev_p->t3cdev_p->ctl(rdev_p->t3cdev_p, RDMA_CQ_SETUP, &setup)); +} + +static int cxio_hal_clear_qp_ctx(struct cxio_rdev *rdev_p, u32 qpid) +{ + u64 sge_cmd; + struct t3_modify_qp_wr *wqe; + struct sk_buff *skb = alloc_skb(sizeof(*wqe), GFP_KERNEL); + if (!skb) { + PDBG("%s alloc_skb failed\n", __func__); + return -ENOMEM; + } + wqe = (struct t3_modify_qp_wr *) skb_put(skb, sizeof(*wqe)); + memset(wqe, 0, sizeof(*wqe)); + build_fw_riwrh((struct fw_riwrh *) wqe, T3_WR_QP_MOD, + T3_COMPLETION_FLAG | T3_NOTIFY_FLAG, 0, qpid, 7, + T3_SOPEOP); + wqe->flags = cpu_to_be32(MODQP_WRITE_EC); + sge_cmd = qpid << 8 | 3; + wqe->sge_cmd = cpu_to_be64(sge_cmd); + skb->priority = CPL_PRIORITY_CONTROL; + return iwch_cxgb3_ofld_send(rdev_p->t3cdev_p, skb); +} + +int cxio_create_cq(struct cxio_rdev *rdev_p, struct t3_cq *cq, int kernel) +{ + struct rdma_cq_setup setup; + int size = (1UL << (cq->size_log2)) * sizeof(struct t3_cqe); + + size += 1; /* one extra page for storing cq-in-err state */ + cq->cqid = cxio_hal_get_cqid(rdev_p->rscp); + if (!cq->cqid) + return -ENOMEM; + if (kernel) { + cq->sw_queue = kzalloc(size, GFP_KERNEL); + if (!cq->sw_queue) + return -ENOMEM; + } + cq->queue = dma_alloc_coherent(&(rdev_p->rnic_info.pdev->dev), size, + &(cq->dma_addr), GFP_KERNEL); + if (!cq->queue) { + kfree(cq->sw_queue); + return -ENOMEM; + } + dma_unmap_addr_set(cq, mapping, cq->dma_addr); + memset(cq->queue, 0, size); + setup.id = cq->cqid; + setup.base_addr = (u64) (cq->dma_addr); + setup.size = 1UL << cq->size_log2; + setup.credits = 65535; + setup.credit_thres = 1; + if (rdev_p->t3cdev_p->type != T3A) + setup.ovfl_mode = 0; + else + setup.ovfl_mode = 1; + return (rdev_p->t3cdev_p->ctl(rdev_p->t3cdev_p, RDMA_CQ_SETUP, &setup)); +} + +#ifdef notyet +int cxio_resize_cq(struct cxio_rdev *rdev_p, struct t3_cq *cq) +{ + struct rdma_cq_setup setup; + setup.id = cq->cqid; + setup.base_addr = (u64) (cq->dma_addr); + setup.size = 1UL << cq->size_log2; + setup.credits = setup.size; + setup.credit_thres = setup.size; /* TBD: overflow recovery */ + setup.ovfl_mode = 1; + return (rdev_p->t3cdev_p->ctl(rdev_p->t3cdev_p, RDMA_CQ_SETUP, &setup)); +} +#endif + +static u32 get_qpid(struct cxio_rdev *rdev_p, struct cxio_ucontext *uctx) +{ + struct cxio_qpid_list *entry; + u32 qpid; + int i; + + mutex_lock(&uctx->lock); + if (!list_empty(&uctx->qpids)) { + entry = list_entry(uctx->qpids.next, struct cxio_qpid_list, + entry); + list_del(&entry->entry); + qpid = entry->qpid; + kfree(entry); + } else { + qpid = cxio_hal_get_qpid(rdev_p->rscp); + if (!qpid) + goto out; + for (i = qpid+1; i & rdev_p->qpmask; i++) { + entry = kmalloc(sizeof *entry, GFP_KERNEL); + if (!entry) + break; + entry->qpid = i; + list_add_tail(&entry->entry, &uctx->qpids); + } + } +out: + 
mutex_unlock(&uctx->lock); + PDBG("%s qpid 0x%x\n", __func__, qpid); + return qpid; +} + +static void put_qpid(struct cxio_rdev *rdev_p, u32 qpid, + struct cxio_ucontext *uctx) +{ + struct cxio_qpid_list *entry; + + entry = kmalloc(sizeof *entry, GFP_KERNEL); + if (!entry) + return; + PDBG("%s qpid 0x%x\n", __func__, qpid); + entry->qpid = qpid; + mutex_lock(&uctx->lock); + list_add_tail(&entry->entry, &uctx->qpids); + mutex_unlock(&uctx->lock); +} + +void cxio_release_ucontext(struct cxio_rdev *rdev_p, struct cxio_ucontext *uctx) +{ + struct list_head *pos, *nxt; + struct cxio_qpid_list *entry; + + mutex_lock(&uctx->lock); + list_for_each_safe(pos, nxt, &uctx->qpids) { + entry = list_entry(pos, struct cxio_qpid_list, entry); + list_del_init(&entry->entry); + if (!(entry->qpid & rdev_p->qpmask)) + cxio_hal_put_qpid(rdev_p->rscp, entry->qpid); + kfree(entry); + } + mutex_unlock(&uctx->lock); +} + +void cxio_init_ucontext(struct cxio_rdev *rdev_p, struct cxio_ucontext *uctx) +{ + INIT_LIST_HEAD(&uctx->qpids); + mutex_init(&uctx->lock); +} + +int cxio_create_qp(struct cxio_rdev *rdev_p, u32 kernel_domain, + struct t3_wq *wq, struct cxio_ucontext *uctx) +{ + int depth = 1UL << wq->size_log2; + int rqsize = 1UL << wq->rq_size_log2; + + wq->qpid = get_qpid(rdev_p, uctx); + if (!wq->qpid) + return -ENOMEM; + + wq->rq = kzalloc(depth * sizeof(struct t3_swrq), GFP_KERNEL); + if (!wq->rq) + goto err1; + + wq->rq_addr = cxio_hal_rqtpool_alloc(rdev_p, rqsize); + if (!wq->rq_addr) + goto err2; + + wq->sq = kzalloc(depth * sizeof(struct t3_swsq), GFP_KERNEL); + if (!wq->sq) + goto err3; + + wq->queue = dma_alloc_coherent(&(rdev_p->rnic_info.pdev->dev), + depth * sizeof(union t3_wr), + &(wq->dma_addr), GFP_KERNEL); + if (!wq->queue) + goto err4; + + memset(wq->queue, 0, depth * sizeof(union t3_wr)); + dma_unmap_addr_set(wq, mapping, wq->dma_addr); + wq->doorbell = (void __iomem *)rdev_p->rnic_info.kdb_addr; + if (!kernel_domain) + wq->udb = (u64)rdev_p->rnic_info.udbell_physbase + + (wq->qpid << rdev_p->qpshift); + wq->rdev = rdev_p; + PDBG("%s qpid 0x%x doorbell 0x%p udb 0x%llx\n", __func__, + wq->qpid, wq->doorbell, (unsigned long long) wq->udb); + return 0; +err4: + kfree(wq->sq); +err3: + cxio_hal_rqtpool_free(rdev_p, wq->rq_addr, rqsize); +err2: + kfree(wq->rq); +err1: + put_qpid(rdev_p, wq->qpid, uctx); + return -ENOMEM; +} + +int cxio_destroy_cq(struct cxio_rdev *rdev_p, struct t3_cq *cq) +{ + int err; + err = cxio_hal_clear_cq_ctx(rdev_p, cq->cqid); + kfree(cq->sw_queue); + dma_free_coherent(&(rdev_p->rnic_info.pdev->dev), + (1UL << (cq->size_log2)) + * sizeof(struct t3_cqe), cq->queue, + dma_unmap_addr(cq, mapping)); + cxio_hal_put_cqid(rdev_p->rscp, cq->cqid); + return err; +} + +int cxio_destroy_qp(struct cxio_rdev *rdev_p, struct t3_wq *wq, + struct cxio_ucontext *uctx) +{ + dma_free_coherent(&(rdev_p->rnic_info.pdev->dev), + (1UL << (wq->size_log2)) + * sizeof(union t3_wr), wq->queue, + dma_unmap_addr(wq, mapping)); + kfree(wq->sq); + cxio_hal_rqtpool_free(rdev_p, wq->rq_addr, (1UL << wq->rq_size_log2)); + kfree(wq->rq); + put_qpid(rdev_p, wq->qpid, uctx); + return 0; +} + +static void insert_recv_cqe(struct t3_wq *wq, struct t3_cq *cq) +{ + struct t3_cqe cqe; + + PDBG("%s wq %p cq %p sw_rptr 0x%x sw_wptr 0x%x\n", __func__, + wq, cq, cq->sw_rptr, cq->sw_wptr); + memset(&cqe, 0, sizeof(cqe)); + cqe.header = cpu_to_be32(V_CQE_STATUS(TPT_ERR_SWFLUSH) | + V_CQE_OPCODE(T3_SEND) | + V_CQE_TYPE(0) | + V_CQE_SWCQE(1) | + V_CQE_QPID(wq->qpid) | + V_CQE_GENBIT(Q_GENBIT(cq->sw_wptr, + cq->size_log2))); 
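+	/* Stash the software-generated flush CQE in the SW CQ and bump the
+	 * SW write pointer so the poll path will deliver it to the consumer.
+	 */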
+ *(cq->sw_queue + Q_PTR2IDX(cq->sw_wptr, cq->size_log2)) = cqe; + cq->sw_wptr++; +} + +int cxio_flush_rq(struct t3_wq *wq, struct t3_cq *cq, int count) +{ + u32 ptr; + int flushed = 0; + + PDBG("%s wq %p cq %p\n", __func__, wq, cq); + + /* flush RQ */ + PDBG("%s rq_rptr %u rq_wptr %u skip count %u\n", __func__, + wq->rq_rptr, wq->rq_wptr, count); + ptr = wq->rq_rptr + count; + while (ptr++ != wq->rq_wptr) { + insert_recv_cqe(wq, cq); + flushed++; + } + return flushed; +} + +static void insert_sq_cqe(struct t3_wq *wq, struct t3_cq *cq, + struct t3_swsq *sqp) +{ + struct t3_cqe cqe; + + PDBG("%s wq %p cq %p sw_rptr 0x%x sw_wptr 0x%x\n", __func__, + wq, cq, cq->sw_rptr, cq->sw_wptr); + memset(&cqe, 0, sizeof(cqe)); + cqe.header = cpu_to_be32(V_CQE_STATUS(TPT_ERR_SWFLUSH) | + V_CQE_OPCODE(sqp->opcode) | + V_CQE_TYPE(1) | + V_CQE_SWCQE(1) | + V_CQE_QPID(wq->qpid) | + V_CQE_GENBIT(Q_GENBIT(cq->sw_wptr, + cq->size_log2))); + cqe.u.scqe.wrid_hi = sqp->sq_wptr; + + *(cq->sw_queue + Q_PTR2IDX(cq->sw_wptr, cq->size_log2)) = cqe; + cq->sw_wptr++; +} + +int cxio_flush_sq(struct t3_wq *wq, struct t3_cq *cq, int count) +{ + __u32 ptr; + int flushed = 0; + struct t3_swsq *sqp = wq->sq + Q_PTR2IDX(wq->sq_rptr, wq->sq_size_log2); + + ptr = wq->sq_rptr + count; + sqp = wq->sq + Q_PTR2IDX(ptr, wq->sq_size_log2); + while (ptr != wq->sq_wptr) { + sqp->signaled = 0; + insert_sq_cqe(wq, cq, sqp); + ptr++; + sqp = wq->sq + Q_PTR2IDX(ptr, wq->sq_size_log2); + flushed++; + } + return flushed; +} + +/* + * Move all CQEs from the HWCQ into the SWCQ. + */ +void cxio_flush_hw_cq(struct t3_cq *cq) +{ + struct t3_cqe *cqe, *swcqe; + + PDBG("%s cq %p cqid 0x%x\n", __func__, cq, cq->cqid); + cqe = cxio_next_hw_cqe(cq); + while (cqe) { + PDBG("%s flushing hwcq rptr 0x%x to swcq wptr 0x%x\n", + __func__, cq->rptr, cq->sw_wptr); + swcqe = cq->sw_queue + Q_PTR2IDX(cq->sw_wptr, cq->size_log2); + *swcqe = *cqe; + swcqe->header |= cpu_to_be32(V_CQE_SWCQE(1)); + cq->sw_wptr++; + cq->rptr++; + cqe = cxio_next_hw_cqe(cq); + } +} + +static int cqe_completes_wr(struct t3_cqe *cqe, struct t3_wq *wq) +{ + if (CQE_OPCODE(*cqe) == T3_TERMINATE) + return 0; + + if ((CQE_OPCODE(*cqe) == T3_RDMA_WRITE) && RQ_TYPE(*cqe)) + return 0; + + if ((CQE_OPCODE(*cqe) == T3_READ_RESP) && SQ_TYPE(*cqe)) + return 0; + + if (CQE_SEND_OPCODE(*cqe) && RQ_TYPE(*cqe) && + Q_EMPTY(wq->rq_rptr, wq->rq_wptr)) + return 0; + + return 1; +} + +void cxio_count_scqes(struct t3_cq *cq, struct t3_wq *wq, int *count) +{ + struct t3_cqe *cqe; + u32 ptr; + + *count = 0; + ptr = cq->sw_rptr; + while (!Q_EMPTY(ptr, cq->sw_wptr)) { + cqe = cq->sw_queue + (Q_PTR2IDX(ptr, cq->size_log2)); + if ((SQ_TYPE(*cqe) || + ((CQE_OPCODE(*cqe) == T3_READ_RESP) && wq->oldest_read)) && + (CQE_QPID(*cqe) == wq->qpid)) + (*count)++; + ptr++; + } + PDBG("%s cq %p count %d\n", __func__, cq, *count); +} + +void cxio_count_rcqes(struct t3_cq *cq, struct t3_wq *wq, int *count) +{ + struct t3_cqe *cqe; + u32 ptr; + + *count = 0; + PDBG("%s count zero %d\n", __func__, *count); + ptr = cq->sw_rptr; + while (!Q_EMPTY(ptr, cq->sw_wptr)) { + cqe = cq->sw_queue + (Q_PTR2IDX(ptr, cq->size_log2)); + if (RQ_TYPE(*cqe) && (CQE_OPCODE(*cqe) != T3_READ_RESP) && + (CQE_QPID(*cqe) == wq->qpid) && cqe_completes_wr(cqe, wq)) + (*count)++; + ptr++; + } + PDBG("%s cq %p count %d\n", __func__, cq, *count); +} + +static int cxio_hal_init_ctrl_cq(struct cxio_rdev *rdev_p) +{ + struct rdma_cq_setup setup; + setup.id = 0; + setup.base_addr = 0; /* NULL address */ + setup.size = 1; /* enable the CQ */ + setup.credits = 
0; + + /* force SGE to redirect to RspQ and interrupt */ + setup.credit_thres = 0; + setup.ovfl_mode = 1; + return (rdev_p->t3cdev_p->ctl(rdev_p->t3cdev_p, RDMA_CQ_SETUP, &setup)); +} + +static int cxio_hal_init_ctrl_qp(struct cxio_rdev *rdev_p) +{ + int err; + u64 sge_cmd, ctx0, ctx1; + u64 base_addr; + struct t3_modify_qp_wr *wqe; + struct sk_buff *skb; + + skb = alloc_skb(sizeof(*wqe), GFP_KERNEL); + if (!skb) { + PDBG("%s alloc_skb failed\n", __func__); + return -ENOMEM; + } + err = cxio_hal_init_ctrl_cq(rdev_p); + if (err) { + PDBG("%s err %d initializing ctrl_cq\n", __func__, err); + goto err; + } + rdev_p->ctrl_qp.workq = dma_alloc_coherent( + &(rdev_p->rnic_info.pdev->dev), + (1 << T3_CTRL_QP_SIZE_LOG2) * + sizeof(union t3_wr), + &(rdev_p->ctrl_qp.dma_addr), + GFP_KERNEL); + if (!rdev_p->ctrl_qp.workq) { + PDBG("%s dma_alloc_coherent failed\n", __func__); + err = -ENOMEM; + goto err; + } + dma_unmap_addr_set(&rdev_p->ctrl_qp, mapping, + rdev_p->ctrl_qp.dma_addr); + rdev_p->ctrl_qp.doorbell = (void __iomem *)rdev_p->rnic_info.kdb_addr; + memset(rdev_p->ctrl_qp.workq, 0, + (1 << T3_CTRL_QP_SIZE_LOG2) * sizeof(union t3_wr)); + + mutex_init(&rdev_p->ctrl_qp.lock); + init_waitqueue_head(&rdev_p->ctrl_qp.waitq); + + /* update HW Ctrl QP context */ + base_addr = rdev_p->ctrl_qp.dma_addr; + base_addr >>= 12; + ctx0 = (V_EC_SIZE((1 << T3_CTRL_QP_SIZE_LOG2)) | + V_EC_BASE_LO((u32) base_addr & 0xffff)); + ctx0 <<= 32; + ctx0 |= V_EC_CREDITS(FW_WR_NUM); + base_addr >>= 16; + ctx1 = (u32) base_addr; + base_addr >>= 32; + ctx1 |= ((u64) (V_EC_BASE_HI((u32) base_addr & 0xf) | V_EC_RESPQ(0) | + V_EC_TYPE(0) | V_EC_GEN(1) | + V_EC_UP_TOKEN(T3_CTL_QP_TID) | F_EC_VALID)) << 32; + wqe = (struct t3_modify_qp_wr *) skb_put(skb, sizeof(*wqe)); + memset(wqe, 0, sizeof(*wqe)); + build_fw_riwrh((struct fw_riwrh *) wqe, T3_WR_QP_MOD, 0, 0, + T3_CTL_QP_TID, 7, T3_SOPEOP); + wqe->flags = cpu_to_be32(MODQP_WRITE_EC); + sge_cmd = (3ULL << 56) | FW_RI_SGEEC_START << 8 | 3; + wqe->sge_cmd = cpu_to_be64(sge_cmd); + wqe->ctx1 = cpu_to_be64(ctx1); + wqe->ctx0 = cpu_to_be64(ctx0); + PDBG("CtrlQP dma_addr 0x%llx workq %p size %d\n", + (unsigned long long) rdev_p->ctrl_qp.dma_addr, + rdev_p->ctrl_qp.workq, 1 << T3_CTRL_QP_SIZE_LOG2); + skb->priority = CPL_PRIORITY_CONTROL; + return iwch_cxgb3_ofld_send(rdev_p->t3cdev_p, skb); +err: + kfree_skb(skb); + return err; +} + +static int cxio_hal_destroy_ctrl_qp(struct cxio_rdev *rdev_p) +{ + dma_free_coherent(&(rdev_p->rnic_info.pdev->dev), + (1UL << T3_CTRL_QP_SIZE_LOG2) + * sizeof(union t3_wr), rdev_p->ctrl_qp.workq, + dma_unmap_addr(&rdev_p->ctrl_qp, mapping)); + return cxio_hal_clear_qp_ctx(rdev_p, T3_CTRL_QP_ID); +} + +/* write len bytes of data into addr (32B aligned address) + * If data is NULL, clear len byte of memory to zero. + * caller acquires the ctrl_qp lock before the call + */ +static int cxio_hal_ctrl_qp_write_mem(struct cxio_rdev *rdev_p, u32 addr, + u32 len, void *data) +{ + u32 i, nr_wqe, copy_len; + u8 *copy_data; + u8 wr_len, utx_len; /* length in 8 byte flit */ + enum t3_wr_flags flag; + __be64 *wqe; + u64 utx_cmd; + addr &= 0x7FFFFFF; + nr_wqe = len % 96 ? 
len / 96 + 1 : len / 96; /* 96B max per WQE */ + PDBG("%s wptr 0x%x rptr 0x%x len %d, nr_wqe %d data %p addr 0x%0x\n", + __func__, rdev_p->ctrl_qp.wptr, rdev_p->ctrl_qp.rptr, len, + nr_wqe, data, addr); + utx_len = 3; /* in 32B unit */ + for (i = 0; i < nr_wqe; i++) { + if (Q_FULL(rdev_p->ctrl_qp.rptr, rdev_p->ctrl_qp.wptr, + T3_CTRL_QP_SIZE_LOG2)) { + PDBG("%s ctrl_qp full wtpr 0x%0x rptr 0x%0x, " + "wait for more space i %d\n", __func__, + rdev_p->ctrl_qp.wptr, rdev_p->ctrl_qp.rptr, i); + if (wait_event_interruptible(rdev_p->ctrl_qp.waitq, + !Q_FULL(rdev_p->ctrl_qp.rptr, + rdev_p->ctrl_qp.wptr, + T3_CTRL_QP_SIZE_LOG2))) { + PDBG("%s ctrl_qp workq interrupted\n", + __func__); + return -ERESTARTSYS; + } + PDBG("%s ctrl_qp wakeup, continue posting work request " + "i %d\n", __func__, i); + } + wqe = (__be64 *)(rdev_p->ctrl_qp.workq + (rdev_p->ctrl_qp.wptr % + (1 << T3_CTRL_QP_SIZE_LOG2))); + flag = 0; + if (i == (nr_wqe - 1)) { + /* last WQE */ + flag = T3_COMPLETION_FLAG; + if (len % 32) + utx_len = len / 32 + 1; + else + utx_len = len / 32; + } + + /* + * Force a CQE to return the credit to the workq in case + * we posted more than half the max QP size of WRs + */ + if ((i != 0) && + (i % (((1 << T3_CTRL_QP_SIZE_LOG2)) >> 1) == 0)) { + flag = T3_COMPLETION_FLAG; + PDBG("%s force completion at i %d\n", __func__, i); + } + + /* build the utx mem command */ + wqe += (sizeof(struct t3_bypass_wr) >> 3); + utx_cmd = (T3_UTX_MEM_WRITE << 28) | (addr + i * 3); + utx_cmd <<= 32; + utx_cmd |= (utx_len << 28) | ((utx_len << 2) + 1); + *wqe = cpu_to_be64(utx_cmd); + wqe++; + copy_data = (u8 *) data + i * 96; + copy_len = len > 96 ? 96 : len; + + /* clear memory content if data is NULL */ + if (data) + memcpy(wqe, copy_data, copy_len); + else + memset(wqe, 0, copy_len); + if (copy_len % 32) + memset(((u8 *) wqe) + copy_len, 0, + 32 - (copy_len % 32)); + wr_len = ((sizeof(struct t3_bypass_wr)) >> 3) + 1 + + (utx_len << 2); + wqe = (__be64 *)(rdev_p->ctrl_qp.workq + (rdev_p->ctrl_qp.wptr % + (1 << T3_CTRL_QP_SIZE_LOG2))); + + /* wptr in the WRID[31:0] */ + ((union t3_wrid *)(wqe+1))->id0.low = rdev_p->ctrl_qp.wptr; + + /* + * This must be the last write with a memory barrier + * for the genbit + */ + build_fw_riwrh((struct fw_riwrh *) wqe, T3_WR_BP, flag, + Q_GENBIT(rdev_p->ctrl_qp.wptr, + T3_CTRL_QP_SIZE_LOG2), T3_CTRL_QP_ID, + wr_len, T3_SOPEOP); + if (flag == T3_COMPLETION_FLAG) + ring_doorbell(rdev_p->ctrl_qp.doorbell, T3_CTRL_QP_ID); + len -= 96; + rdev_p->ctrl_qp.wptr++; + } + return 0; +} + +/* IN: stag key, pdid, perm, zbva, to, len, page_size, pbl_size and pbl_addr + * OUT: stag index + * TBD: shared memory region support + */ +static int __cxio_tpt_op(struct cxio_rdev *rdev_p, u32 reset_tpt_entry, + u32 *stag, u8 stag_state, u32 pdid, + enum tpt_mem_type type, enum tpt_mem_perm perm, + u32 zbva, u64 to, u32 len, u8 page_size, + u32 pbl_size, u32 pbl_addr) +{ + int err; + struct tpt_entry tpt; + u32 stag_idx; + u32 wptr; + + if (cxio_fatal_error(rdev_p)) + return -EIO; + + stag_state = stag_state > 0; + stag_idx = (*stag) >> 8; + + if ((!reset_tpt_entry) && !(*stag != T3_STAG_UNSET)) { + stag_idx = cxio_hal_get_stag(rdev_p->rscp); + if (!stag_idx) + return -ENOMEM; + *stag = (stag_idx << 8) | ((*stag) & 0xFF); + } + PDBG("%s stag_state 0x%0x type 0x%0x pdid 0x%0x, stag_idx 0x%x\n", + __func__, stag_state, type, pdid, stag_idx); + + mutex_lock(&rdev_p->ctrl_qp.lock); + + /* write TPT entry */ + if (reset_tpt_entry) + memset(&tpt, 0, sizeof(tpt)); + else { + tpt.valid_stag_pdid = 
cpu_to_be32(F_TPT_VALID | + V_TPT_STAG_KEY((*stag) & M_TPT_STAG_KEY) | + V_TPT_STAG_STATE(stag_state) | + V_TPT_STAG_TYPE(type) | V_TPT_PDID(pdid)); + BUG_ON(page_size >= 28); + tpt.flags_pagesize_qpid = cpu_to_be32(V_TPT_PERM(perm) | + ((perm & TPT_MW_BIND) ? F_TPT_MW_BIND_ENABLE : 0) | + V_TPT_ADDR_TYPE((zbva ? TPT_ZBTO : TPT_VATO)) | + V_TPT_PAGE_SIZE(page_size)); + tpt.rsvd_pbl_addr = cpu_to_be32(V_TPT_PBL_ADDR(PBL_OFF(rdev_p, pbl_addr)>>3)); + tpt.len = cpu_to_be32(len); + tpt.va_hi = cpu_to_be32((u32) (to >> 32)); + tpt.va_low_or_fbo = cpu_to_be32((u32) (to & 0xFFFFFFFFULL)); + tpt.rsvd_bind_cnt_or_pstag = 0; + tpt.rsvd_pbl_size = cpu_to_be32(V_TPT_PBL_SIZE(pbl_size >> 2)); + } + err = cxio_hal_ctrl_qp_write_mem(rdev_p, + stag_idx + + (rdev_p->rnic_info.tpt_base >> 5), + sizeof(tpt), &tpt); + + /* release the stag index to free pool */ + if (reset_tpt_entry) + cxio_hal_put_stag(rdev_p->rscp, stag_idx); + + wptr = rdev_p->ctrl_qp.wptr; + mutex_unlock(&rdev_p->ctrl_qp.lock); + if (!err) + if (wait_event_interruptible(rdev_p->ctrl_qp.waitq, + SEQ32_GE(rdev_p->ctrl_qp.rptr, + wptr))) + return -ERESTARTSYS; + return err; +} + +int cxio_write_pbl(struct cxio_rdev *rdev_p, __be64 *pbl, + u32 pbl_addr, u32 pbl_size) +{ + u32 wptr; + int err; + + PDBG("%s *pdb_addr 0x%x, pbl_base 0x%x, pbl_size %d\n", + __func__, pbl_addr, rdev_p->rnic_info.pbl_base, + pbl_size); + + mutex_lock(&rdev_p->ctrl_qp.lock); + err = cxio_hal_ctrl_qp_write_mem(rdev_p, pbl_addr >> 5, pbl_size << 3, + pbl); + wptr = rdev_p->ctrl_qp.wptr; + mutex_unlock(&rdev_p->ctrl_qp.lock); + if (err) + return err; + + if (wait_event_interruptible(rdev_p->ctrl_qp.waitq, + SEQ32_GE(rdev_p->ctrl_qp.rptr, + wptr))) + return -ERESTARTSYS; + + return 0; +} + +int cxio_register_phys_mem(struct cxio_rdev *rdev_p, u32 *stag, u32 pdid, + enum tpt_mem_perm perm, u32 zbva, u64 to, u32 len, + u8 page_size, u32 pbl_size, u32 pbl_addr) +{ + *stag = T3_STAG_UNSET; + return __cxio_tpt_op(rdev_p, 0, stag, 1, pdid, TPT_NON_SHARED_MR, perm, + zbva, to, len, page_size, pbl_size, pbl_addr); +} + +int cxio_reregister_phys_mem(struct cxio_rdev *rdev_p, u32 *stag, u32 pdid, + enum tpt_mem_perm perm, u32 zbva, u64 to, u32 len, + u8 page_size, u32 pbl_size, u32 pbl_addr) +{ + return __cxio_tpt_op(rdev_p, 0, stag, 1, pdid, TPT_NON_SHARED_MR, perm, + zbva, to, len, page_size, pbl_size, pbl_addr); +} + +int cxio_dereg_mem(struct cxio_rdev *rdev_p, u32 stag, u32 pbl_size, + u32 pbl_addr) +{ + return __cxio_tpt_op(rdev_p, 1, &stag, 0, 0, 0, 0, 0, 0ULL, 0, 0, + pbl_size, pbl_addr); +} + +int cxio_allocate_window(struct cxio_rdev *rdev_p, u32 * stag, u32 pdid) +{ + *stag = T3_STAG_UNSET; + return __cxio_tpt_op(rdev_p, 0, stag, 0, pdid, TPT_MW, 0, 0, 0ULL, 0, 0, + 0, 0); +} + +int cxio_deallocate_window(struct cxio_rdev *rdev_p, u32 stag) +{ + return __cxio_tpt_op(rdev_p, 1, &stag, 0, 0, 0, 0, 0, 0ULL, 0, 0, + 0, 0); +} + +int cxio_allocate_stag(struct cxio_rdev *rdev_p, u32 *stag, u32 pdid, u32 pbl_size, u32 pbl_addr) +{ + *stag = T3_STAG_UNSET; + return __cxio_tpt_op(rdev_p, 0, stag, 0, pdid, TPT_NON_SHARED_MR, + 0, 0, 0ULL, 0, 0, pbl_size, pbl_addr); +} + +int cxio_rdma_init(struct cxio_rdev *rdev_p, struct t3_rdma_init_attr *attr) +{ + struct t3_rdma_init_wr *wqe; + struct sk_buff *skb = alloc_skb(sizeof(*wqe), GFP_ATOMIC); + if (!skb) + return -ENOMEM; + PDBG("%s rdev_p %p\n", __func__, rdev_p); + wqe = (struct t3_rdma_init_wr *) __skb_put(skb, sizeof(*wqe)); + wqe->wrh.op_seop_flags = cpu_to_be32(V_FW_RIWR_OP(T3_WR_INIT)); + wqe->wrh.gen_tid_len = 
cpu_to_be32(V_FW_RIWR_TID(attr->tid) | + V_FW_RIWR_LEN(sizeof(*wqe) >> 3)); + wqe->wrid.id1 = 0; + wqe->qpid = cpu_to_be32(attr->qpid); + wqe->pdid = cpu_to_be32(attr->pdid); + wqe->scqid = cpu_to_be32(attr->scqid); + wqe->rcqid = cpu_to_be32(attr->rcqid); + wqe->rq_addr = cpu_to_be32(attr->rq_addr - rdev_p->rnic_info.rqt_base); + wqe->rq_size = cpu_to_be32(attr->rq_size); + wqe->mpaattrs = attr->mpaattrs; + wqe->qpcaps = attr->qpcaps; + wqe->ulpdu_size = cpu_to_be16(attr->tcp_emss); + wqe->rqe_count = cpu_to_be16(attr->rqe_count); + wqe->flags_rtr_type = cpu_to_be16(attr->flags | + V_RTR_TYPE(attr->rtr_type) | + V_CHAN(attr->chan)); + wqe->ord = cpu_to_be32(attr->ord); + wqe->ird = cpu_to_be32(attr->ird); + wqe->qp_dma_addr = cpu_to_be64(attr->qp_dma_addr); + wqe->qp_dma_size = cpu_to_be32(attr->qp_dma_size); + wqe->irs = cpu_to_be32(attr->irs); + skb->priority = 0; /* 0=>ToeQ; 1=>CtrlQ */ + return iwch_cxgb3_ofld_send(rdev_p->t3cdev_p, skb); +} + +void cxio_register_ev_cb(cxio_hal_ev_callback_func_t ev_cb) +{ + cxio_ev_cb = ev_cb; +} + +void cxio_unregister_ev_cb(cxio_hal_ev_callback_func_t ev_cb) +{ + cxio_ev_cb = NULL; +} + +static int cxio_hal_ev_handler(struct t3cdev *t3cdev_p, struct sk_buff *skb) +{ + static int cnt; + struct cxio_rdev *rdev_p = NULL; + struct respQ_msg_t *rsp_msg = (struct respQ_msg_t *) skb->data; + PDBG("%d: %s cq_id 0x%x cq_ptr 0x%x genbit %0x overflow %0x an %0x" + " se %0x notify %0x cqbranch %0x creditth %0x\n", + cnt, __func__, RSPQ_CQID(rsp_msg), RSPQ_CQPTR(rsp_msg), + RSPQ_GENBIT(rsp_msg), RSPQ_OVERFLOW(rsp_msg), RSPQ_AN(rsp_msg), + RSPQ_SE(rsp_msg), RSPQ_NOTIFY(rsp_msg), RSPQ_CQBRANCH(rsp_msg), + RSPQ_CREDIT_THRESH(rsp_msg)); + PDBG("CQE: QPID 0x%0x genbit %0x type 0x%0x status 0x%0x opcode %d " + "len 0x%0x wrid_hi_stag 0x%x wrid_low_msn 0x%x\n", + CQE_QPID(rsp_msg->cqe), CQE_GENBIT(rsp_msg->cqe), + CQE_TYPE(rsp_msg->cqe), CQE_STATUS(rsp_msg->cqe), + CQE_OPCODE(rsp_msg->cqe), CQE_LEN(rsp_msg->cqe), + CQE_WRID_HI(rsp_msg->cqe), CQE_WRID_LOW(rsp_msg->cqe)); + rdev_p = (struct cxio_rdev *)t3cdev_p->ulp; + if (!rdev_p) { + PDBG("%s called by t3cdev %p with null ulp\n", __func__, + t3cdev_p); + return 0; + } + if (CQE_QPID(rsp_msg->cqe) == T3_CTRL_QP_ID) { + rdev_p->ctrl_qp.rptr = CQE_WRID_LOW(rsp_msg->cqe) + 1; + wake_up_interruptible(&rdev_p->ctrl_qp.waitq); + dev_kfree_skb_irq(skb); + } else if (CQE_QPID(rsp_msg->cqe) == 0xfff8) + dev_kfree_skb_irq(skb); + else if (cxio_ev_cb) + (*cxio_ev_cb) (rdev_p, skb); + else + dev_kfree_skb_irq(skb); + cnt++; + return 0; +} + +/* Caller takes care of locking if needed */ +int cxio_rdev_open(struct cxio_rdev *rdev_p) +{ + struct net_device *netdev_p = NULL; + int err = 0; + if (strlen(rdev_p->dev_name)) { + if (cxio_hal_find_rdev_by_name(rdev_p->dev_name)) { + return -EBUSY; + } + netdev_p = dev_get_by_name(&init_net, rdev_p->dev_name); + if (!netdev_p) { + return -EINVAL; + } + dev_put(netdev_p); + } else if (rdev_p->t3cdev_p) { + if (cxio_hal_find_rdev_by_t3cdev(rdev_p->t3cdev_p)) { + return -EBUSY; + } + netdev_p = rdev_p->t3cdev_p->lldev; + strncpy(rdev_p->dev_name, rdev_p->t3cdev_p->name, + T3_MAX_DEV_NAME_LEN); + } else { + PDBG("%s t3cdev_p or dev_name must be set\n", __func__); + return -EINVAL; + } + + list_add_tail(&rdev_p->entry, &rdev_list); + + PDBG("%s opening rnic dev %s\n", __func__, rdev_p->dev_name); + memset(&rdev_p->ctrl_qp, 0, sizeof(rdev_p->ctrl_qp)); + if (!rdev_p->t3cdev_p) + rdev_p->t3cdev_p = dev2t3cdev(netdev_p); + rdev_p->t3cdev_p->ulp = (void *) rdev_p; + + err = 
rdev_p->t3cdev_p->ctl(rdev_p->t3cdev_p, GET_EMBEDDED_INFO, + &(rdev_p->fw_info)); + if (err) { + printk(KERN_ERR "%s t3cdev_p(%p)->ctl returned error %d.\n", + __func__, rdev_p->t3cdev_p, err); + goto err1; + } + if (G_FW_VERSION_MAJOR(rdev_p->fw_info.fw_vers) != CXIO_FW_MAJ) { + printk(KERN_ERR MOD "fatal firmware version mismatch: " + "need version %u but adapter has version %u\n", + CXIO_FW_MAJ, + G_FW_VERSION_MAJOR(rdev_p->fw_info.fw_vers)); + err = -EINVAL; + goto err1; + } + + err = rdev_p->t3cdev_p->ctl(rdev_p->t3cdev_p, RDMA_GET_PARAMS, + &(rdev_p->rnic_info)); + if (err) { + printk(KERN_ERR "%s t3cdev_p(%p)->ctl returned error %d.\n", + __func__, rdev_p->t3cdev_p, err); + goto err1; + } + err = rdev_p->t3cdev_p->ctl(rdev_p->t3cdev_p, GET_PORTS, + &(rdev_p->port_info)); + if (err) { + printk(KERN_ERR "%s t3cdev_p(%p)->ctl returned error %d.\n", + __func__, rdev_p->t3cdev_p, err); + goto err1; + } + + /* + * qpshift is the number of bits to shift the qpid left in order + * to get the correct address of the doorbell for that qp. + */ + cxio_init_ucontext(rdev_p, &rdev_p->uctx); + rdev_p->qpshift = PAGE_SHIFT - + ilog2(65536 >> + ilog2(rdev_p->rnic_info.udbell_len >> + PAGE_SHIFT)); + rdev_p->qpnr = rdev_p->rnic_info.udbell_len >> PAGE_SHIFT; + rdev_p->qpmask = (65536 >> ilog2(rdev_p->qpnr)) - 1; + PDBG("%s rnic %s info: tpt_base 0x%0x tpt_top 0x%0x num stags %d " + "pbl_base 0x%0x pbl_top 0x%0x rqt_base 0x%0x, rqt_top 0x%0x\n", + __func__, rdev_p->dev_name, rdev_p->rnic_info.tpt_base, + rdev_p->rnic_info.tpt_top, cxio_num_stags(rdev_p), + rdev_p->rnic_info.pbl_base, + rdev_p->rnic_info.pbl_top, rdev_p->rnic_info.rqt_base, + rdev_p->rnic_info.rqt_top); + PDBG("udbell_len 0x%0x udbell_physbase 0x%lx kdb_addr %p qpshift %lu " + "qpnr %d qpmask 0x%x\n", + rdev_p->rnic_info.udbell_len, + rdev_p->rnic_info.udbell_physbase, rdev_p->rnic_info.kdb_addr, + rdev_p->qpshift, rdev_p->qpnr, rdev_p->qpmask); + + err = cxio_hal_init_ctrl_qp(rdev_p); + if (err) { + printk(KERN_ERR "%s error %d initializing ctrl_qp.\n", + __func__, err); + goto err1; + } + err = cxio_hal_init_resource(rdev_p, cxio_num_stags(rdev_p), 0, + 0, T3_MAX_NUM_QP, T3_MAX_NUM_CQ, + T3_MAX_NUM_PD); + if (err) { + printk(KERN_ERR "%s error %d initializing hal resources.\n", + __func__, err); + goto err2; + } + err = cxio_hal_pblpool_create(rdev_p); + if (err) { + printk(KERN_ERR "%s error %d initializing pbl mem pool.\n", + __func__, err); + goto err3; + } + err = cxio_hal_rqtpool_create(rdev_p); + if (err) { + printk(KERN_ERR "%s error %d initializing rqt mem pool.\n", + __func__, err); + goto err4; + } + return 0; +err4: + cxio_hal_pblpool_destroy(rdev_p); +err3: + cxio_hal_destroy_resource(rdev_p->rscp); +err2: + cxio_hal_destroy_ctrl_qp(rdev_p); +err1: + rdev_p->t3cdev_p->ulp = NULL; + list_del(&rdev_p->entry); + return err; +} + +void cxio_rdev_close(struct cxio_rdev *rdev_p) +{ + if (rdev_p) { + cxio_hal_pblpool_destroy(rdev_p); + cxio_hal_rqtpool_destroy(rdev_p); + list_del(&rdev_p->entry); + cxio_hal_destroy_ctrl_qp(rdev_p); + cxio_hal_destroy_resource(rdev_p->rscp); + rdev_p->t3cdev_p->ulp = NULL; + } +} + +int __init cxio_hal_init(void) +{ + if (cxio_hal_init_rhdl_resource(T3_MAX_NUM_RI)) + return -ENOMEM; + t3_register_cpl_handler(CPL_ASYNC_NOTIF, cxio_hal_ev_handler); + return 0; +} + +void __exit cxio_hal_exit(void) +{ + struct cxio_rdev *rdev, *tmp; + + t3_register_cpl_handler(CPL_ASYNC_NOTIF, NULL); + list_for_each_entry_safe(rdev, tmp, &rdev_list, entry) + cxio_rdev_close(rdev); + cxio_hal_destroy_rhdl_resource(); 
+} + +static void flush_completed_wrs(struct t3_wq *wq, struct t3_cq *cq) +{ + struct t3_swsq *sqp; + __u32 ptr = wq->sq_rptr; + int count = Q_COUNT(wq->sq_rptr, wq->sq_wptr); + + sqp = wq->sq + Q_PTR2IDX(ptr, wq->sq_size_log2); + while (count--) + if (!sqp->signaled) { + ptr++; + sqp = wq->sq + Q_PTR2IDX(ptr, wq->sq_size_log2); + } else if (sqp->complete) { + + /* + * Insert this completed cqe into the swcq. + */ + PDBG("%s moving cqe into swcq sq idx %ld cq idx %ld\n", + __func__, Q_PTR2IDX(ptr, wq->sq_size_log2), + Q_PTR2IDX(cq->sw_wptr, cq->size_log2)); + sqp->cqe.header |= htonl(V_CQE_SWCQE(1)); + *(cq->sw_queue + Q_PTR2IDX(cq->sw_wptr, cq->size_log2)) + = sqp->cqe; + cq->sw_wptr++; + sqp->signaled = 0; + break; + } else + break; +} + +static void create_read_req_cqe(struct t3_wq *wq, struct t3_cqe *hw_cqe, + struct t3_cqe *read_cqe) +{ + read_cqe->u.scqe.wrid_hi = wq->oldest_read->sq_wptr; + read_cqe->len = wq->oldest_read->read_len; + read_cqe->header = htonl(V_CQE_QPID(CQE_QPID(*hw_cqe)) | + V_CQE_SWCQE(SW_CQE(*hw_cqe)) | + V_CQE_OPCODE(T3_READ_REQ) | + V_CQE_TYPE(1)); +} + +/* + * Return a ptr to the next read wr in the SWSQ or NULL. + */ +static void advance_oldest_read(struct t3_wq *wq) +{ + + u32 rptr = wq->oldest_read - wq->sq + 1; + u32 wptr = Q_PTR2IDX(wq->sq_wptr, wq->sq_size_log2); + + while (Q_PTR2IDX(rptr, wq->sq_size_log2) != wptr) { + wq->oldest_read = wq->sq + Q_PTR2IDX(rptr, wq->sq_size_log2); + + if (wq->oldest_read->opcode == T3_READ_REQ) + return; + rptr++; + } + wq->oldest_read = NULL; +} + +/* + * cxio_poll_cq + * + * Caller must: + * check the validity of the first CQE, + * supply the wq assicated with the qpid. + * + * credit: cq credit to return to sge. + * cqe_flushed: 1 iff the CQE is flushed. + * cqe: copy of the polled CQE. + * + * return value: + * 0 CQE returned, + * -1 CQE skipped, try again. + */ +int cxio_poll_cq(struct t3_wq *wq, struct t3_cq *cq, struct t3_cqe *cqe, + u8 *cqe_flushed, u64 *cookie, u32 *credit) +{ + int ret = 0; + struct t3_cqe *hw_cqe, read_cqe; + + *cqe_flushed = 0; + *credit = 0; + hw_cqe = cxio_next_cqe(cq); + + PDBG("%s CQE OOO %d qpid 0x%0x genbit %d type %d status 0x%0x" + " opcode 0x%0x len 0x%0x wrid_hi_stag 0x%x wrid_low_msn 0x%x\n", + __func__, CQE_OOO(*hw_cqe), CQE_QPID(*hw_cqe), + CQE_GENBIT(*hw_cqe), CQE_TYPE(*hw_cqe), CQE_STATUS(*hw_cqe), + CQE_OPCODE(*hw_cqe), CQE_LEN(*hw_cqe), CQE_WRID_HI(*hw_cqe), + CQE_WRID_LOW(*hw_cqe)); + + /* + * skip cqe's not affiliated with a QP. + */ + if (wq == NULL) { + ret = -1; + goto skip_cqe; + } + + /* + * Gotta tweak READ completions: + * 1) the cqe doesn't contain the sq_wptr from the wr. + * 2) opcode not reflected from the wr. + * 3) read_len not reflected from the wr. + * 4) cq_type is RQ_TYPE not SQ_TYPE. + */ + if (RQ_TYPE(*hw_cqe) && (CQE_OPCODE(*hw_cqe) == T3_READ_RESP)) { + + /* + * If this is an unsolicited read response, then the read + * was generated by the kernel driver as part of peer-2-peer + * connection setup. So ignore the completion. + */ + if (!wq->oldest_read) { + if (CQE_STATUS(*hw_cqe)) + wq->error = 1; + ret = -1; + goto skip_cqe; + } + + /* + * Don't write to the HWCQ, so create a new read req CQE + * in local memory. + */ + create_read_req_cqe(wq, hw_cqe, &read_cqe); + hw_cqe = &read_cqe; + advance_oldest_read(wq); + } + + /* + * T3A: Discard TERMINATE CQEs. 
+ */ + if (CQE_OPCODE(*hw_cqe) == T3_TERMINATE) { + ret = -1; + wq->error = 1; + goto skip_cqe; + } + + if (CQE_STATUS(*hw_cqe) || wq->error) { + *cqe_flushed = wq->error; + wq->error = 1; + + /* + * T3A inserts errors into the CQE. We cannot return + * these as work completions. + */ + /* incoming write failures */ + if ((CQE_OPCODE(*hw_cqe) == T3_RDMA_WRITE) + && RQ_TYPE(*hw_cqe)) { + ret = -1; + goto skip_cqe; + } + /* incoming read request failures */ + if ((CQE_OPCODE(*hw_cqe) == T3_READ_RESP) && SQ_TYPE(*hw_cqe)) { + ret = -1; + goto skip_cqe; + } + + /* incoming SEND with no receive posted failures */ + if (CQE_SEND_OPCODE(*hw_cqe) && RQ_TYPE(*hw_cqe) && + Q_EMPTY(wq->rq_rptr, wq->rq_wptr)) { + ret = -1; + goto skip_cqe; + } + BUG_ON((*cqe_flushed == 0) && !SW_CQE(*hw_cqe)); + goto proc_cqe; + } + + /* + * RECV completion. + */ + if (RQ_TYPE(*hw_cqe)) { + + /* + * HW only validates 4 bits of MSN. So we must validate that + * the MSN in the SEND is the next expected MSN. If its not, + * then we complete this with TPT_ERR_MSN and mark the wq in + * error. + */ + + if (Q_EMPTY(wq->rq_rptr, wq->rq_wptr)) { + wq->error = 1; + ret = -1; + goto skip_cqe; + } + + if (unlikely((CQE_WRID_MSN(*hw_cqe) != (wq->rq_rptr + 1)))) { + wq->error = 1; + hw_cqe->header |= htonl(V_CQE_STATUS(TPT_ERR_MSN)); + goto proc_cqe; + } + goto proc_cqe; + } + + /* + * If we get here its a send completion. + * + * Handle out of order completion. These get stuffed + * in the SW SQ. Then the SW SQ is walked to move any + * now in-order completions into the SW CQ. This handles + * 2 cases: + * 1) reaping unsignaled WRs when the first subsequent + * signaled WR is completed. + * 2) out of order read completions. + */ + if (!SW_CQE(*hw_cqe) && (CQE_WRID_SQ_WPTR(*hw_cqe) != wq->sq_rptr)) { + struct t3_swsq *sqp; + + PDBG("%s out of order completion going in swsq at idx %ld\n", + __func__, + Q_PTR2IDX(CQE_WRID_SQ_WPTR(*hw_cqe), wq->sq_size_log2)); + sqp = wq->sq + + Q_PTR2IDX(CQE_WRID_SQ_WPTR(*hw_cqe), wq->sq_size_log2); + sqp->cqe = *hw_cqe; + sqp->complete = 1; + ret = -1; + goto flush_wq; + } + +proc_cqe: + *cqe = *hw_cqe; + + /* + * Reap the associated WR(s) that are freed up with this + * completion. + */ + if (SQ_TYPE(*hw_cqe)) { + wq->sq_rptr = CQE_WRID_SQ_WPTR(*hw_cqe); + PDBG("%s completing sq idx %ld\n", __func__, + Q_PTR2IDX(wq->sq_rptr, wq->sq_size_log2)); + *cookie = wq->sq[Q_PTR2IDX(wq->sq_rptr, wq->sq_size_log2)].wr_id; + wq->sq_rptr++; + } else { + PDBG("%s completing rq idx %ld\n", __func__, + Q_PTR2IDX(wq->rq_rptr, wq->rq_size_log2)); + *cookie = wq->rq[Q_PTR2IDX(wq->rq_rptr, wq->rq_size_log2)].wr_id; + if (wq->rq[Q_PTR2IDX(wq->rq_rptr, wq->rq_size_log2)].pbl_addr) + cxio_hal_pblpool_free(wq->rdev, + wq->rq[Q_PTR2IDX(wq->rq_rptr, + wq->rq_size_log2)].pbl_addr, T3_STAG0_PBL_SIZE); + BUG_ON(Q_EMPTY(wq->rq_rptr, wq->rq_wptr)); + wq->rq_rptr++; + } + +flush_wq: + /* + * Flush any completed cqes that are now in-order. + */ + flush_completed_wrs(wq, cq); + +skip_cqe: + if (SW_CQE(*hw_cqe)) { + PDBG("%s cq %p cqid 0x%x skip sw cqe sw_rptr 0x%x\n", + __func__, cq, cq->cqid, cq->sw_rptr); + ++cq->sw_rptr; + } else { + PDBG("%s cq %p cqid 0x%x skip hw cqe rptr 0x%x\n", + __func__, cq, cq->cqid, cq->rptr); + ++cq->rptr; + + /* + * T3A: compute credits. 
+ */ + if (((cq->rptr - cq->wptr) > (1 << (cq->size_log2 - 1))) + || ((cq->rptr - cq->wptr) >= 128)) { + *credit = cq->rptr - cq->wptr; + cq->wptr = cq->rptr; + } + } + return ret; +} diff --git a/kernel/drivers/infiniband/hw/cxgb3/cxio_hal.h b/kernel/drivers/infiniband/hw/cxgb3/cxio_hal.h new file mode 100644 index 000000000..78fbe9ffe --- /dev/null +++ b/kernel/drivers/infiniband/hw/cxgb3/cxio_hal.h @@ -0,0 +1,211 @@ +/* + * Copyright (c) 2006 Chelsio, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#ifndef __CXIO_HAL_H__ +#define __CXIO_HAL_H__ + +#include +#include +#include + +#include "t3_cpl.h" +#include "t3cdev.h" +#include "cxgb3_ctl_defs.h" +#include "cxio_wr.h" + +#define T3_CTRL_QP_ID FW_RI_SGEEC_START +#define T3_CTL_QP_TID FW_RI_TID_START +#define T3_CTRL_QP_SIZE_LOG2 8 +#define T3_CTRL_CQ_ID 0 + +#define T3_MAX_NUM_RI (1<<15) +#define T3_MAX_NUM_QP (1<<15) +#define T3_MAX_NUM_CQ (1<<15) +#define T3_MAX_NUM_PD (1<<15) +#define T3_MAX_PBL_SIZE 256 +#define T3_MAX_RQ_SIZE 1024 +#define T3_MAX_QP_DEPTH (T3_MAX_RQ_SIZE-1) +#define T3_MAX_CQ_DEPTH 65536 +#define T3_MAX_NUM_STAG (1<<15) +#define T3_MAX_MR_SIZE 0x100000000ULL +#define T3_PAGESIZE_MASK 0xffff000 /* 4KB-128MB */ + +#define T3_STAG_UNSET 0xffffffff + +#define T3_MAX_DEV_NAME_LEN 32 + +#define CXIO_FW_MAJ 7 + +struct cxio_hal_ctrl_qp { + u32 wptr; + u32 rptr; + struct mutex lock; /* for the wtpr, can sleep */ + wait_queue_head_t waitq;/* wait for RspQ/CQE msg */ + union t3_wr *workq; /* the work request queue */ + dma_addr_t dma_addr; /* pci bus address of the workq */ + DEFINE_DMA_UNMAP_ADDR(mapping); + void __iomem *doorbell; +}; + +struct cxio_hal_resource { + struct kfifo tpt_fifo; + spinlock_t tpt_fifo_lock; + struct kfifo qpid_fifo; + spinlock_t qpid_fifo_lock; + struct kfifo cqid_fifo; + spinlock_t cqid_fifo_lock; + struct kfifo pdid_fifo; + spinlock_t pdid_fifo_lock; +}; + +struct cxio_qpid_list { + struct list_head entry; + u32 qpid; +}; + +struct cxio_ucontext { + struct list_head qpids; + struct mutex lock; +}; + +struct cxio_rdev { + char dev_name[T3_MAX_DEV_NAME_LEN]; + struct t3cdev *t3cdev_p; + struct rdma_info rnic_info; + struct adap_ports port_info; + struct cxio_hal_resource *rscp; + struct cxio_hal_ctrl_qp ctrl_qp; + void *ulp; + unsigned long qpshift; + u32 qpnr; + u32 qpmask; + struct cxio_ucontext uctx; + struct gen_pool *pbl_pool; + struct gen_pool *rqt_pool; + struct list_head entry; + struct ch_embedded_info fw_info; + u32 flags; +#define CXIO_ERROR_FATAL 1 +}; + +static inline int cxio_fatal_error(struct cxio_rdev *rdev_p) +{ + return rdev_p->flags & CXIO_ERROR_FATAL; +} + +static inline int cxio_num_stags(struct cxio_rdev *rdev_p) +{ + return min((int)T3_MAX_NUM_STAG, (int)((rdev_p->rnic_info.tpt_top - rdev_p->rnic_info.tpt_base) >> 5)); +} + +typedef void (*cxio_hal_ev_callback_func_t) (struct cxio_rdev * rdev_p, + struct sk_buff * skb); + +#define RSPQ_CQID(rsp) (be32_to_cpu(rsp->cq_ptrid) & 0xffff) +#define RSPQ_CQPTR(rsp) ((be32_to_cpu(rsp->cq_ptrid) >> 16) & 0xffff) +#define RSPQ_GENBIT(rsp) ((be32_to_cpu(rsp->flags) >> 16) & 1) +#define RSPQ_OVERFLOW(rsp) ((be32_to_cpu(rsp->flags) >> 17) & 1) +#define RSPQ_AN(rsp) ((be32_to_cpu(rsp->flags) >> 18) & 1) +#define RSPQ_SE(rsp) ((be32_to_cpu(rsp->flags) >> 19) & 1) +#define RSPQ_NOTIFY(rsp) ((be32_to_cpu(rsp->flags) >> 20) & 1) +#define RSPQ_CQBRANCH(rsp) ((be32_to_cpu(rsp->flags) >> 21) & 1) +#define RSPQ_CREDIT_THRESH(rsp) ((be32_to_cpu(rsp->flags) >> 22) & 1) + +struct respQ_msg_t { + __be32 flags; /* flit 0 */ + __be32 cq_ptrid; + __be64 rsvd; /* flit 1 */ + struct t3_cqe cqe; /* flits 2-3 */ +}; + +enum t3_cq_opcode { + CQ_ARM_AN = 0x2, + CQ_ARM_SE = 0x6, + CQ_FORCE_AN = 0x3, + CQ_CREDIT_UPDATE = 0x7 +}; + +int cxio_rdev_open(struct cxio_rdev *rdev); +void cxio_rdev_close(struct cxio_rdev *rdev); +int cxio_hal_cq_op(struct cxio_rdev *rdev, struct t3_cq *cq, + enum t3_cq_opcode op, u32 credit); +int cxio_create_cq(struct cxio_rdev *rdev, struct t3_cq *cq, int kernel); +int cxio_destroy_cq(struct cxio_rdev *rdev, struct t3_cq 
*cq); +int cxio_resize_cq(struct cxio_rdev *rdev, struct t3_cq *cq); +void cxio_release_ucontext(struct cxio_rdev *rdev, struct cxio_ucontext *uctx); +void cxio_init_ucontext(struct cxio_rdev *rdev, struct cxio_ucontext *uctx); +int cxio_create_qp(struct cxio_rdev *rdev, u32 kernel_domain, struct t3_wq *wq, + struct cxio_ucontext *uctx); +int cxio_destroy_qp(struct cxio_rdev *rdev, struct t3_wq *wq, + struct cxio_ucontext *uctx); +int cxio_peek_cq(struct t3_wq *wr, struct t3_cq *cq, int opcode); +int cxio_write_pbl(struct cxio_rdev *rdev_p, __be64 *pbl, + u32 pbl_addr, u32 pbl_size); +int cxio_register_phys_mem(struct cxio_rdev *rdev, u32 * stag, u32 pdid, + enum tpt_mem_perm perm, u32 zbva, u64 to, u32 len, + u8 page_size, u32 pbl_size, u32 pbl_addr); +int cxio_reregister_phys_mem(struct cxio_rdev *rdev, u32 * stag, u32 pdid, + enum tpt_mem_perm perm, u32 zbva, u64 to, u32 len, + u8 page_size, u32 pbl_size, u32 pbl_addr); +int cxio_dereg_mem(struct cxio_rdev *rdev, u32 stag, u32 pbl_size, + u32 pbl_addr); +int cxio_allocate_window(struct cxio_rdev *rdev, u32 * stag, u32 pdid); +int cxio_allocate_stag(struct cxio_rdev *rdev, u32 *stag, u32 pdid, u32 pbl_size, u32 pbl_addr); +int cxio_deallocate_window(struct cxio_rdev *rdev, u32 stag); +int cxio_rdma_init(struct cxio_rdev *rdev, struct t3_rdma_init_attr *attr); +void cxio_register_ev_cb(cxio_hal_ev_callback_func_t ev_cb); +void cxio_unregister_ev_cb(cxio_hal_ev_callback_func_t ev_cb); +u32 cxio_hal_get_pdid(struct cxio_hal_resource *rscp); +void cxio_hal_put_pdid(struct cxio_hal_resource *rscp, u32 pdid); +int __init cxio_hal_init(void); +void __exit cxio_hal_exit(void); +int cxio_flush_rq(struct t3_wq *wq, struct t3_cq *cq, int count); +int cxio_flush_sq(struct t3_wq *wq, struct t3_cq *cq, int count); +void cxio_count_rcqes(struct t3_cq *cq, struct t3_wq *wq, int *count); +void cxio_count_scqes(struct t3_cq *cq, struct t3_wq *wq, int *count); +void cxio_flush_hw_cq(struct t3_cq *cq); +int cxio_poll_cq(struct t3_wq *wq, struct t3_cq *cq, struct t3_cqe *cqe, + u8 *cqe_flushed, u64 *cookie, u32 *credit); +int iwch_cxgb3_ofld_send(struct t3cdev *tdev, struct sk_buff *skb); + +#define MOD "iw_cxgb3: " +#define PDBG(fmt, args...) pr_debug(MOD fmt, ## args) + +#ifdef DEBUG +void cxio_dump_tpt(struct cxio_rdev *rev, u32 stag); +void cxio_dump_pbl(struct cxio_rdev *rev, u32 pbl_addr, uint len, u8 shift); +void cxio_dump_wqe(union t3_wr *wqe); +void cxio_dump_wce(struct t3_cqe *wce); +void cxio_dump_rqt(struct cxio_rdev *rdev, u32 hwtid, int nents); +void cxio_dump_tcb(struct cxio_rdev *rdev, u32 hwtid); +#endif + +#endif diff --git a/kernel/drivers/infiniband/hw/cxgb3/cxio_resource.c b/kernel/drivers/infiniband/hw/cxgb3/cxio_resource.c new file mode 100644 index 000000000..c40088ecf --- /dev/null +++ b/kernel/drivers/infiniband/hw/cxgb3/cxio_resource.c @@ -0,0 +1,343 @@ +/* + * Copyright (c) 2006 Chelsio, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. 
+ * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +/* Crude resource management */ +#include +#include +#include +#include +#include +#include +#include "cxio_resource.h" +#include "cxio_hal.h" + +static struct kfifo rhdl_fifo; +static spinlock_t rhdl_fifo_lock; + +#define RANDOM_SIZE 16 + +static int __cxio_init_resource_fifo(struct kfifo *fifo, + spinlock_t *fifo_lock, + u32 nr, u32 skip_low, + u32 skip_high, + int random) +{ + u32 i, j, entry = 0, idx; + u32 random_bytes; + u32 rarray[16]; + spin_lock_init(fifo_lock); + + if (kfifo_alloc(fifo, nr * sizeof(u32), GFP_KERNEL)) + return -ENOMEM; + + for (i = 0; i < skip_low + skip_high; i++) + kfifo_in(fifo, (unsigned char *) &entry, sizeof(u32)); + if (random) { + j = 0; + random_bytes = prandom_u32(); + for (i = 0; i < RANDOM_SIZE; i++) + rarray[i] = i + skip_low; + for (i = skip_low + RANDOM_SIZE; i < nr - skip_high; i++) { + if (j >= RANDOM_SIZE) { + j = 0; + random_bytes = prandom_u32(); + } + idx = (random_bytes >> (j * 2)) & 0xF; + kfifo_in(fifo, + (unsigned char *) &rarray[idx], + sizeof(u32)); + rarray[idx] = i; + j++; + } + for (i = 0; i < RANDOM_SIZE; i++) + kfifo_in(fifo, + (unsigned char *) &rarray[i], + sizeof(u32)); + } else + for (i = skip_low; i < nr - skip_high; i++) + kfifo_in(fifo, (unsigned char *) &i, sizeof(u32)); + + for (i = 0; i < skip_low + skip_high; i++) + if (kfifo_out_locked(fifo, (unsigned char *) &entry, + sizeof(u32), fifo_lock) != sizeof(u32)) + break; + return 0; +} + +static int cxio_init_resource_fifo(struct kfifo *fifo, spinlock_t * fifo_lock, + u32 nr, u32 skip_low, u32 skip_high) +{ + return (__cxio_init_resource_fifo(fifo, fifo_lock, nr, skip_low, + skip_high, 0)); +} + +static int cxio_init_resource_fifo_random(struct kfifo *fifo, + spinlock_t * fifo_lock, + u32 nr, u32 skip_low, u32 skip_high) +{ + + return (__cxio_init_resource_fifo(fifo, fifo_lock, nr, skip_low, + skip_high, 1)); +} + +static int cxio_init_qpid_fifo(struct cxio_rdev *rdev_p) +{ + u32 i; + + spin_lock_init(&rdev_p->rscp->qpid_fifo_lock); + + if (kfifo_alloc(&rdev_p->rscp->qpid_fifo, T3_MAX_NUM_QP * sizeof(u32), + GFP_KERNEL)) + return -ENOMEM; + + for (i = 16; i < T3_MAX_NUM_QP; i++) + if (!(i & rdev_p->qpmask)) + kfifo_in(&rdev_p->rscp->qpid_fifo, + (unsigned char *) &i, sizeof(u32)); + return 0; +} + +int cxio_hal_init_rhdl_resource(u32 nr_rhdl) +{ + return cxio_init_resource_fifo(&rhdl_fifo, &rhdl_fifo_lock, nr_rhdl, 1, + 0); +} + +void cxio_hal_destroy_rhdl_resource(void) +{ + kfifo_free(&rhdl_fifo); +} + +/* nr_* must be power of 2 */ +int cxio_hal_init_resource(struct cxio_rdev *rdev_p, + u32 nr_tpt, u32 nr_pbl, + u32 nr_rqt, u32 nr_qpid, u32 nr_cqid, u32 nr_pdid) +{ + int err = 0; + struct cxio_hal_resource *rscp; + + rscp = kmalloc(sizeof(*rscp), GFP_KERNEL); + if (!rscp) + return -ENOMEM; + rdev_p->rscp = rscp; + err = 
cxio_init_resource_fifo_random(&rscp->tpt_fifo, + &rscp->tpt_fifo_lock, + nr_tpt, 1, 0); + if (err) + goto tpt_err; + err = cxio_init_qpid_fifo(rdev_p); + if (err) + goto qpid_err; + err = cxio_init_resource_fifo(&rscp->cqid_fifo, &rscp->cqid_fifo_lock, + nr_cqid, 1, 0); + if (err) + goto cqid_err; + err = cxio_init_resource_fifo(&rscp->pdid_fifo, &rscp->pdid_fifo_lock, + nr_pdid, 1, 0); + if (err) + goto pdid_err; + return 0; +pdid_err: + kfifo_free(&rscp->cqid_fifo); +cqid_err: + kfifo_free(&rscp->qpid_fifo); +qpid_err: + kfifo_free(&rscp->tpt_fifo); +tpt_err: + return -ENOMEM; +} + +/* + * returns 0 if no resource available + */ +static u32 cxio_hal_get_resource(struct kfifo *fifo, spinlock_t * lock) +{ + u32 entry; + if (kfifo_out_locked(fifo, (unsigned char *) &entry, sizeof(u32), lock)) + return entry; + else + return 0; /* fifo emptry */ +} + +static void cxio_hal_put_resource(struct kfifo *fifo, spinlock_t * lock, + u32 entry) +{ + BUG_ON( + kfifo_in_locked(fifo, (unsigned char *) &entry, sizeof(u32), lock) + == 0); +} + +u32 cxio_hal_get_stag(struct cxio_hal_resource *rscp) +{ + return cxio_hal_get_resource(&rscp->tpt_fifo, &rscp->tpt_fifo_lock); +} + +void cxio_hal_put_stag(struct cxio_hal_resource *rscp, u32 stag) +{ + cxio_hal_put_resource(&rscp->tpt_fifo, &rscp->tpt_fifo_lock, stag); +} + +u32 cxio_hal_get_qpid(struct cxio_hal_resource *rscp) +{ + u32 qpid = cxio_hal_get_resource(&rscp->qpid_fifo, + &rscp->qpid_fifo_lock); + PDBG("%s qpid 0x%x\n", __func__, qpid); + return qpid; +} + +void cxio_hal_put_qpid(struct cxio_hal_resource *rscp, u32 qpid) +{ + PDBG("%s qpid 0x%x\n", __func__, qpid); + cxio_hal_put_resource(&rscp->qpid_fifo, &rscp->qpid_fifo_lock, qpid); +} + +u32 cxio_hal_get_cqid(struct cxio_hal_resource *rscp) +{ + return cxio_hal_get_resource(&rscp->cqid_fifo, &rscp->cqid_fifo_lock); +} + +void cxio_hal_put_cqid(struct cxio_hal_resource *rscp, u32 cqid) +{ + cxio_hal_put_resource(&rscp->cqid_fifo, &rscp->cqid_fifo_lock, cqid); +} + +u32 cxio_hal_get_pdid(struct cxio_hal_resource *rscp) +{ + return cxio_hal_get_resource(&rscp->pdid_fifo, &rscp->pdid_fifo_lock); +} + +void cxio_hal_put_pdid(struct cxio_hal_resource *rscp, u32 pdid) +{ + cxio_hal_put_resource(&rscp->pdid_fifo, &rscp->pdid_fifo_lock, pdid); +} + +void cxio_hal_destroy_resource(struct cxio_hal_resource *rscp) +{ + kfifo_free(&rscp->tpt_fifo); + kfifo_free(&rscp->cqid_fifo); + kfifo_free(&rscp->qpid_fifo); + kfifo_free(&rscp->pdid_fifo); + kfree(rscp); +} + +/* + * PBL Memory Manager. Uses Linux generic allocator. 
+ */ + +#define MIN_PBL_SHIFT 8 /* 256B == min PBL size (32 entries) */ + +u32 cxio_hal_pblpool_alloc(struct cxio_rdev *rdev_p, int size) +{ + unsigned long addr = gen_pool_alloc(rdev_p->pbl_pool, size); + PDBG("%s addr 0x%x size %d\n", __func__, (u32)addr, size); + return (u32)addr; +} + +void cxio_hal_pblpool_free(struct cxio_rdev *rdev_p, u32 addr, int size) +{ + PDBG("%s addr 0x%x size %d\n", __func__, addr, size); + gen_pool_free(rdev_p->pbl_pool, (unsigned long)addr, size); +} + +int cxio_hal_pblpool_create(struct cxio_rdev *rdev_p) +{ + unsigned pbl_start, pbl_chunk; + + rdev_p->pbl_pool = gen_pool_create(MIN_PBL_SHIFT, -1); + if (!rdev_p->pbl_pool) + return -ENOMEM; + + pbl_start = rdev_p->rnic_info.pbl_base; + pbl_chunk = rdev_p->rnic_info.pbl_top - pbl_start + 1; + + while (pbl_start < rdev_p->rnic_info.pbl_top) { + pbl_chunk = min(rdev_p->rnic_info.pbl_top - pbl_start + 1, + pbl_chunk); + if (gen_pool_add(rdev_p->pbl_pool, pbl_start, pbl_chunk, -1)) { + PDBG("%s failed to add PBL chunk (%x/%x)\n", + __func__, pbl_start, pbl_chunk); + if (pbl_chunk <= 1024 << MIN_PBL_SHIFT) { + printk(KERN_WARNING MOD "%s: Failed to add all PBL chunks (%x/%x)\n", + __func__, pbl_start, rdev_p->rnic_info.pbl_top - pbl_start); + return 0; + } + pbl_chunk >>= 1; + } else { + PDBG("%s added PBL chunk (%x/%x)\n", + __func__, pbl_start, pbl_chunk); + pbl_start += pbl_chunk; + } + } + + return 0; +} + +void cxio_hal_pblpool_destroy(struct cxio_rdev *rdev_p) +{ + gen_pool_destroy(rdev_p->pbl_pool); +} + +/* + * RQT Memory Manager. Uses Linux generic allocator. + */ + +#define MIN_RQT_SHIFT 10 /* 1KB == mini RQT size (16 entries) */ +#define RQT_CHUNK 2*1024*1024 + +u32 cxio_hal_rqtpool_alloc(struct cxio_rdev *rdev_p, int size) +{ + unsigned long addr = gen_pool_alloc(rdev_p->rqt_pool, size << 6); + PDBG("%s addr 0x%x size %d\n", __func__, (u32)addr, size << 6); + return (u32)addr; +} + +void cxio_hal_rqtpool_free(struct cxio_rdev *rdev_p, u32 addr, int size) +{ + PDBG("%s addr 0x%x size %d\n", __func__, addr, size << 6); + gen_pool_free(rdev_p->rqt_pool, (unsigned long)addr, size << 6); +} + +int cxio_hal_rqtpool_create(struct cxio_rdev *rdev_p) +{ + unsigned long i; + rdev_p->rqt_pool = gen_pool_create(MIN_RQT_SHIFT, -1); + if (rdev_p->rqt_pool) + for (i = rdev_p->rnic_info.rqt_base; + i <= rdev_p->rnic_info.rqt_top - RQT_CHUNK + 1; + i += RQT_CHUNK) + gen_pool_add(rdev_p->rqt_pool, i, RQT_CHUNK, -1); + return rdev_p->rqt_pool ? 0 : -ENOMEM; +} + +void cxio_hal_rqtpool_destroy(struct cxio_rdev *rdev_p) +{ + gen_pool_destroy(rdev_p->rqt_pool); +} diff --git a/kernel/drivers/infiniband/hw/cxgb3/cxio_resource.h b/kernel/drivers/infiniband/hw/cxgb3/cxio_resource.h new file mode 100644 index 000000000..a2703a3d8 --- /dev/null +++ b/kernel/drivers/infiniband/hw/cxgb3/cxio_resource.h @@ -0,0 +1,69 @@ +/* + * Copyright (c) 2006 Chelsio, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. 
+ * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef __CXIO_RESOURCE_H__ +#define __CXIO_RESOURCE_H__ + +#include +#include +#include +#include +#include +#include +#include +#include "cxio_hal.h" + +extern int cxio_hal_init_rhdl_resource(u32 nr_rhdl); +extern void cxio_hal_destroy_rhdl_resource(void); +extern int cxio_hal_init_resource(struct cxio_rdev *rdev_p, + u32 nr_tpt, u32 nr_pbl, + u32 nr_rqt, u32 nr_qpid, u32 nr_cqid, + u32 nr_pdid); +extern u32 cxio_hal_get_stag(struct cxio_hal_resource *rscp); +extern void cxio_hal_put_stag(struct cxio_hal_resource *rscp, u32 stag); +extern u32 cxio_hal_get_qpid(struct cxio_hal_resource *rscp); +extern void cxio_hal_put_qpid(struct cxio_hal_resource *rscp, u32 qpid); +extern u32 cxio_hal_get_cqid(struct cxio_hal_resource *rscp); +extern void cxio_hal_put_cqid(struct cxio_hal_resource *rscp, u32 cqid); +extern void cxio_hal_destroy_resource(struct cxio_hal_resource *rscp); + +#define PBL_OFF(rdev_p, a) ( (a) - (rdev_p)->rnic_info.pbl_base ) +extern int cxio_hal_pblpool_create(struct cxio_rdev *rdev_p); +extern void cxio_hal_pblpool_destroy(struct cxio_rdev *rdev_p); +extern u32 cxio_hal_pblpool_alloc(struct cxio_rdev *rdev_p, int size); +extern void cxio_hal_pblpool_free(struct cxio_rdev *rdev_p, u32 addr, int size); + +#define RQT_OFF(rdev_p, a) ( (a) - (rdev_p)->rnic_info.rqt_base ) +extern int cxio_hal_rqtpool_create(struct cxio_rdev *rdev_p); +extern void cxio_hal_rqtpool_destroy(struct cxio_rdev *rdev_p); +extern u32 cxio_hal_rqtpool_alloc(struct cxio_rdev *rdev_p, int size); +extern void cxio_hal_rqtpool_free(struct cxio_rdev *rdev_p, u32 addr, int size); +#endif diff --git a/kernel/drivers/infiniband/hw/cxgb3/cxio_wr.h b/kernel/drivers/infiniband/hw/cxgb3/cxio_wr.h new file mode 100644 index 000000000..83d2e19d3 --- /dev/null +++ b/kernel/drivers/infiniband/hw/cxgb3/cxio_wr.h @@ -0,0 +1,802 @@ +/* + * Copyright (c) 2006 Chelsio, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef __CXIO_WR_H__ +#define __CXIO_WR_H__ + +#include +#include +#include +#include "firmware_exports.h" + +#define T3_MAX_SGE 4 +#define T3_MAX_INLINE 64 +#define T3_STAG0_PBL_SIZE (2 * T3_MAX_SGE << 3) +#define T3_STAG0_MAX_PBE_LEN (128 * 1024 * 1024) +#define T3_STAG0_PAGE_SHIFT 15 + +#define Q_EMPTY(rptr,wptr) ((rptr)==(wptr)) +#define Q_FULL(rptr,wptr,size_log2) ( (((wptr)-(rptr))>>(size_log2)) && \ + ((rptr)!=(wptr)) ) +#define Q_GENBIT(ptr,size_log2) (!(((ptr)>>size_log2)&0x1)) +#define Q_FREECNT(rptr,wptr,size_log2) ((1UL<> S_FW_RIWR_OP)) & M_FW_RIWR_OP) + +#define S_FW_RIWR_SOPEOP 22 +#define M_FW_RIWR_SOPEOP 0x3 +#define V_FW_RIWR_SOPEOP(x) ((x) << S_FW_RIWR_SOPEOP) + +#define S_FW_RIWR_FLAGS 8 +#define M_FW_RIWR_FLAGS 0x3fffff +#define V_FW_RIWR_FLAGS(x) ((x) << S_FW_RIWR_FLAGS) +#define G_FW_RIWR_FLAGS(x) ((((x) >> S_FW_RIWR_FLAGS)) & M_FW_RIWR_FLAGS) + +#define S_FW_RIWR_TID 8 +#define V_FW_RIWR_TID(x) ((x) << S_FW_RIWR_TID) + +#define S_FW_RIWR_LEN 0 +#define V_FW_RIWR_LEN(x) ((x) << S_FW_RIWR_LEN) + +#define S_FW_RIWR_GEN 31 +#define V_FW_RIWR_GEN(x) ((x) << S_FW_RIWR_GEN) + +struct t3_sge { + __be32 stag; + __be32 len; + __be64 to; +}; + +/* If num_sgle is zero, flit 5+ contains immediate data.*/ +struct t3_send_wr { + struct fw_riwrh wrh; /* 0 */ + union t3_wrid wrid; /* 1 */ + + u8 rdmaop; /* 2 */ + u8 reserved[3]; + __be32 rem_stag; + __be32 plen; /* 3 */ + __be32 num_sgle; + struct t3_sge sgl[T3_MAX_SGE]; /* 4+ */ +}; + +#define T3_MAX_FASTREG_DEPTH 10 +#define T3_MAX_FASTREG_FRAG 10 + +struct t3_fastreg_wr { + struct fw_riwrh wrh; /* 0 */ + union t3_wrid wrid; /* 1 */ + __be32 stag; /* 2 */ + __be32 len; + __be32 va_base_hi; /* 3 */ + __be32 va_base_lo_fbo; + __be32 page_type_perms; /* 4 */ + __be32 reserved1; + __be64 pbl_addrs[0]; /* 5+ */ +}; + +/* + * If a fastreg wr spans multiple wqes, then the 2nd fragment look like this. 
+ */ +struct t3_pbl_frag { + struct fw_riwrh wrh; /* 0 */ + __be64 pbl_addrs[14]; /* 1..14 */ +}; + +#define S_FR_PAGE_COUNT 24 +#define M_FR_PAGE_COUNT 0xff +#define V_FR_PAGE_COUNT(x) ((x) << S_FR_PAGE_COUNT) +#define G_FR_PAGE_COUNT(x) ((((x) >> S_FR_PAGE_COUNT)) & M_FR_PAGE_COUNT) + +#define S_FR_PAGE_SIZE 16 +#define M_FR_PAGE_SIZE 0x1f +#define V_FR_PAGE_SIZE(x) ((x) << S_FR_PAGE_SIZE) +#define G_FR_PAGE_SIZE(x) ((((x) >> S_FR_PAGE_SIZE)) & M_FR_PAGE_SIZE) + +#define S_FR_TYPE 8 +#define M_FR_TYPE 0x1 +#define V_FR_TYPE(x) ((x) << S_FR_TYPE) +#define G_FR_TYPE(x) ((((x) >> S_FR_TYPE)) & M_FR_TYPE) + +#define S_FR_PERMS 0 +#define M_FR_PERMS 0xff +#define V_FR_PERMS(x) ((x) << S_FR_PERMS) +#define G_FR_PERMS(x) ((((x) >> S_FR_PERMS)) & M_FR_PERMS) + +struct t3_local_inv_wr { + struct fw_riwrh wrh; /* 0 */ + union t3_wrid wrid; /* 1 */ + __be32 stag; /* 2 */ + __be32 reserved; +}; + +struct t3_rdma_write_wr { + struct fw_riwrh wrh; /* 0 */ + union t3_wrid wrid; /* 1 */ + u8 rdmaop; /* 2 */ + u8 reserved[3]; + __be32 stag_sink; + __be64 to_sink; /* 3 */ + __be32 plen; /* 4 */ + __be32 num_sgle; + struct t3_sge sgl[T3_MAX_SGE]; /* 5+ */ +}; + +struct t3_rdma_read_wr { + struct fw_riwrh wrh; /* 0 */ + union t3_wrid wrid; /* 1 */ + u8 rdmaop; /* 2 */ + u8 local_inv; + u8 reserved[2]; + __be32 rem_stag; + __be64 rem_to; /* 3 */ + __be32 local_stag; /* 4 */ + __be32 local_len; + __be64 local_to; /* 5 */ +}; + +struct t3_bind_mw_wr { + struct fw_riwrh wrh; /* 0 */ + union t3_wrid wrid; /* 1 */ + u16 reserved; /* 2 */ + u8 type; + u8 perms; + __be32 mr_stag; + __be32 mw_stag; /* 3 */ + __be32 mw_len; + __be64 mw_va; /* 4 */ + __be32 mr_pbl_addr; /* 5 */ + u8 reserved2[3]; + u8 mr_pagesz; +}; + +struct t3_receive_wr { + struct fw_riwrh wrh; /* 0 */ + union t3_wrid wrid; /* 1 */ + u8 pagesz[T3_MAX_SGE]; + __be32 num_sgle; /* 2 */ + struct t3_sge sgl[T3_MAX_SGE]; /* 3+ */ + __be32 pbl_addr[T3_MAX_SGE]; +}; + +struct t3_bypass_wr { + struct fw_riwrh wrh; + union t3_wrid wrid; /* 1 */ +}; + +struct t3_modify_qp_wr { + struct fw_riwrh wrh; /* 0 */ + union t3_wrid wrid; /* 1 */ + __be32 flags; /* 2 */ + __be32 quiesce; /* 2 */ + __be32 max_ird; /* 3 */ + __be32 max_ord; /* 3 */ + __be64 sge_cmd; /* 4 */ + __be64 ctx1; /* 5 */ + __be64 ctx0; /* 6 */ +}; + +enum t3_modify_qp_flags { + MODQP_QUIESCE = 0x01, + MODQP_MAX_IRD = 0x02, + MODQP_MAX_ORD = 0x04, + MODQP_WRITE_EC = 0x08, + MODQP_READ_EC = 0x10, +}; + + +enum t3_mpa_attrs { + uP_RI_MPA_RX_MARKER_ENABLE = 0x1, + uP_RI_MPA_TX_MARKER_ENABLE = 0x2, + uP_RI_MPA_CRC_ENABLE = 0x4, + uP_RI_MPA_IETF_ENABLE = 0x8 +} __attribute__ ((packed)); + +enum t3_qp_caps { + uP_RI_QP_RDMA_READ_ENABLE = 0x01, + uP_RI_QP_RDMA_WRITE_ENABLE = 0x02, + uP_RI_QP_BIND_ENABLE = 0x04, + uP_RI_QP_FAST_REGISTER_ENABLE = 0x08, + uP_RI_QP_STAG0_ENABLE = 0x10 +} __attribute__ ((packed)); + +enum rdma_init_rtr_types { + RTR_READ = 1, + RTR_WRITE = 2, + RTR_SEND = 3, +}; + +#define S_RTR_TYPE 2 +#define M_RTR_TYPE 0x3 +#define V_RTR_TYPE(x) ((x) << S_RTR_TYPE) +#define G_RTR_TYPE(x) ((((x) >> S_RTR_TYPE)) & M_RTR_TYPE) + +#define S_CHAN 4 +#define M_CHAN 0x3 +#define V_CHAN(x) ((x) << S_CHAN) +#define G_CHAN(x) ((((x) >> S_CHAN)) & M_CHAN) + +struct t3_rdma_init_attr { + u32 tid; + u32 qpid; + u32 pdid; + u32 scqid; + u32 rcqid; + u32 rq_addr; + u32 rq_size; + enum t3_mpa_attrs mpaattrs; + enum t3_qp_caps qpcaps; + u16 tcp_emss; + u32 ord; + u32 ird; + u64 qp_dma_addr; + u32 qp_dma_size; + enum rdma_init_rtr_types rtr_type; + u16 flags; + u16 rqe_count; + u32 irs; + u32 chan; +}; + 
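The Q_EMPTY/Q_FULL/Q_GENBIT helpers defined earlier in this header operate on free-running 32-bit read/write pointers: the low size_log2 bits select a slot in a power-of-two ring, and Q_GENBIT inverts the next bit so the expected generation flips each time the pointer wraps, which is how WQE/CQE consumers recognize freshly written entries. Below is a minimal, self-contained user-space sketch of that scheme, not the driver's code: the SKETCH_ names are invented for illustration, and SKETCH_Q_PTR2IDX is an assumption that the index is simply the pointer masked with (1 << size_log2) - 1, consistent with how the driver uses Q_PTR2IDX throughout.

/*
 * Standalone sketch of the free-running-pointer ring pattern used by the
 * Q_* helpers above.  Pointers increment without wrapping; the low
 * size_log2 bits index the array and the next bit drives the generation
 * bit.  The PTR2IDX mask is an assumption, not copied from the driver.
 */
#include <stdio.h>
#include <stdint.h>

#define SKETCH_Q_EMPTY(rptr, wptr)           ((rptr) == (wptr))
#define SKETCH_Q_FULL(rptr, wptr, size_log2) ((((wptr) - (rptr)) >> (size_log2)) && \
                                              ((rptr) != (wptr)))
#define SKETCH_Q_GENBIT(ptr, size_log2)      (!(((ptr) >> (size_log2)) & 0x1))
#define SKETCH_Q_PTR2IDX(ptr, size_log2)     ((ptr) & ((1UL << (size_log2)) - 1))

int main(void)
{
	const unsigned size_log2 = 3;        /* 8-entry ring */
	uint32_t entries[1 << 3] = {0};
	uint32_t rptr = 0, wptr = 0;

	/* Fill the ring: producer advances wptr until Q_FULL reports no space. */
	while (!SKETCH_Q_FULL(rptr, wptr, size_log2))
		entries[SKETCH_Q_PTR2IDX(wptr, size_log2)] = wptr++;
	printf("full: rptr=%u wptr=%u\n", rptr, wptr);

	/* Consumer reaps two entries, then the producer refills; note how the
	 * slot index wraps to 0 while the generation bit has flipped. */
	rptr += 2;
	while (!SKETCH_Q_FULL(rptr, wptr, size_log2)) {
		printf("slot %lu genbit %d\n",
		       SKETCH_Q_PTR2IDX(wptr, size_log2),
		       SKETCH_Q_GENBIT(wptr, size_log2));
		entries[SKETCH_Q_PTR2IDX(wptr, size_log2)] = wptr++;
	}
	return SKETCH_Q_EMPTY(rptr, wptr);
}
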
+struct t3_rdma_init_wr { + struct fw_riwrh wrh; /* 0 */ + union t3_wrid wrid; /* 1 */ + __be32 qpid; /* 2 */ + __be32 pdid; + __be32 scqid; /* 3 */ + __be32 rcqid; + __be32 rq_addr; /* 4 */ + __be32 rq_size; + u8 mpaattrs; /* 5 */ + u8 qpcaps; + __be16 ulpdu_size; + __be16 flags_rtr_type; + __be16 rqe_count; + __be32 ord; /* 6 */ + __be32 ird; + __be64 qp_dma_addr; /* 7 */ + __be32 qp_dma_size; /* 8 */ + __be32 irs; +}; + +struct t3_genbit { + u64 flit[15]; + __be64 genbit; +}; + +struct t3_wq_in_err { + u64 flit[13]; + u64 err; +}; + +enum rdma_init_wr_flags { + MPA_INITIATOR = (1<<0), + PRIV_QP = (1<<1), +}; + +union t3_wr { + struct t3_send_wr send; + struct t3_rdma_write_wr write; + struct t3_rdma_read_wr read; + struct t3_receive_wr recv; + struct t3_fastreg_wr fastreg; + struct t3_pbl_frag pbl_frag; + struct t3_local_inv_wr local_inv; + struct t3_bind_mw_wr bind; + struct t3_bypass_wr bypass; + struct t3_rdma_init_wr init; + struct t3_modify_qp_wr qp_mod; + struct t3_genbit genbit; + struct t3_wq_in_err wq_in_err; + __be64 flit[16]; +}; + +#define T3_SQ_CQE_FLIT 13 +#define T3_SQ_COOKIE_FLIT 14 + +#define T3_RQ_COOKIE_FLIT 13 +#define T3_RQ_CQE_FLIT 14 + +static inline enum t3_wr_opcode fw_riwrh_opcode(struct fw_riwrh *wqe) +{ + return G_FW_RIWR_OP(be32_to_cpu(wqe->op_seop_flags)); +} + +enum t3_wr_hdr_bits { + T3_EOP = 1, + T3_SOP = 2, + T3_SOPEOP = T3_EOP|T3_SOP, +}; + +static inline void build_fw_riwrh(struct fw_riwrh *wqe, enum t3_wr_opcode op, + enum t3_wr_flags flags, u8 genbit, u32 tid, + u8 len, u8 sopeop) +{ + wqe->op_seop_flags = cpu_to_be32(V_FW_RIWR_OP(op) | + V_FW_RIWR_SOPEOP(sopeop) | + V_FW_RIWR_FLAGS(flags)); + wmb(); + wqe->gen_tid_len = cpu_to_be32(V_FW_RIWR_GEN(genbit) | + V_FW_RIWR_TID(tid) | + V_FW_RIWR_LEN(len)); + /* 2nd gen bit... 
*/ + ((union t3_wr *)wqe)->genbit.genbit = cpu_to_be64(genbit); +} + +/* + * T3 ULP2_TX commands + */ +enum t3_utx_mem_op { + T3_UTX_MEM_READ = 2, + T3_UTX_MEM_WRITE = 3 +}; + +/* T3 MC7 RDMA TPT entry format */ + +enum tpt_mem_type { + TPT_NON_SHARED_MR = 0x0, + TPT_SHARED_MR = 0x1, + TPT_MW = 0x2, + TPT_MW_RELAXED_PROTECTION = 0x3 +}; + +enum tpt_addr_type { + TPT_ZBTO = 0, + TPT_VATO = 1 +}; + +enum tpt_mem_perm { + TPT_MW_BIND = 0x10, + TPT_LOCAL_READ = 0x8, + TPT_LOCAL_WRITE = 0x4, + TPT_REMOTE_READ = 0x2, + TPT_REMOTE_WRITE = 0x1 +}; + +struct tpt_entry { + __be32 valid_stag_pdid; + __be32 flags_pagesize_qpid; + + __be32 rsvd_pbl_addr; + __be32 len; + __be32 va_hi; + __be32 va_low_or_fbo; + + __be32 rsvd_bind_cnt_or_pstag; + __be32 rsvd_pbl_size; +}; + +#define S_TPT_VALID 31 +#define V_TPT_VALID(x) ((x) << S_TPT_VALID) +#define F_TPT_VALID V_TPT_VALID(1U) + +#define S_TPT_STAG_KEY 23 +#define M_TPT_STAG_KEY 0xFF +#define V_TPT_STAG_KEY(x) ((x) << S_TPT_STAG_KEY) +#define G_TPT_STAG_KEY(x) (((x) >> S_TPT_STAG_KEY) & M_TPT_STAG_KEY) + +#define S_TPT_STAG_STATE 22 +#define V_TPT_STAG_STATE(x) ((x) << S_TPT_STAG_STATE) +#define F_TPT_STAG_STATE V_TPT_STAG_STATE(1U) + +#define S_TPT_STAG_TYPE 20 +#define M_TPT_STAG_TYPE 0x3 +#define V_TPT_STAG_TYPE(x) ((x) << S_TPT_STAG_TYPE) +#define G_TPT_STAG_TYPE(x) (((x) >> S_TPT_STAG_TYPE) & M_TPT_STAG_TYPE) + +#define S_TPT_PDID 0 +#define M_TPT_PDID 0xFFFFF +#define V_TPT_PDID(x) ((x) << S_TPT_PDID) +#define G_TPT_PDID(x) (((x) >> S_TPT_PDID) & M_TPT_PDID) + +#define S_TPT_PERM 28 +#define M_TPT_PERM 0xF +#define V_TPT_PERM(x) ((x) << S_TPT_PERM) +#define G_TPT_PERM(x) (((x) >> S_TPT_PERM) & M_TPT_PERM) + +#define S_TPT_REM_INV_DIS 27 +#define V_TPT_REM_INV_DIS(x) ((x) << S_TPT_REM_INV_DIS) +#define F_TPT_REM_INV_DIS V_TPT_REM_INV_DIS(1U) + +#define S_TPT_ADDR_TYPE 26 +#define V_TPT_ADDR_TYPE(x) ((x) << S_TPT_ADDR_TYPE) +#define F_TPT_ADDR_TYPE V_TPT_ADDR_TYPE(1U) + +#define S_TPT_MW_BIND_ENABLE 25 +#define V_TPT_MW_BIND_ENABLE(x) ((x) << S_TPT_MW_BIND_ENABLE) +#define F_TPT_MW_BIND_ENABLE V_TPT_MW_BIND_ENABLE(1U) + +#define S_TPT_PAGE_SIZE 20 +#define M_TPT_PAGE_SIZE 0x1F +#define V_TPT_PAGE_SIZE(x) ((x) << S_TPT_PAGE_SIZE) +#define G_TPT_PAGE_SIZE(x) (((x) >> S_TPT_PAGE_SIZE) & M_TPT_PAGE_SIZE) + +#define S_TPT_PBL_ADDR 0 +#define M_TPT_PBL_ADDR 0x1FFFFFFF +#define V_TPT_PBL_ADDR(x) ((x) << S_TPT_PBL_ADDR) +#define G_TPT_PBL_ADDR(x) (((x) >> S_TPT_PBL_ADDR) & M_TPT_PBL_ADDR) + +#define S_TPT_QPID 0 +#define M_TPT_QPID 0xFFFFF +#define V_TPT_QPID(x) ((x) << S_TPT_QPID) +#define G_TPT_QPID(x) (((x) >> S_TPT_QPID) & M_TPT_QPID) + +#define S_TPT_PSTAG 0 +#define M_TPT_PSTAG 0xFFFFFF +#define V_TPT_PSTAG(x) ((x) << S_TPT_PSTAG) +#define G_TPT_PSTAG(x) (((x) >> S_TPT_PSTAG) & M_TPT_PSTAG) + +#define S_TPT_PBL_SIZE 0 +#define M_TPT_PBL_SIZE 0xFFFFF +#define V_TPT_PBL_SIZE(x) ((x) << S_TPT_PBL_SIZE) +#define G_TPT_PBL_SIZE(x) (((x) >> S_TPT_PBL_SIZE) & M_TPT_PBL_SIZE) + +/* + * CQE defs + */ +struct t3_cqe { + __be32 header; + __be32 len; + union { + struct { + __be32 stag; + __be32 msn; + } rcqe; + struct { + u32 wrid_hi; + u32 wrid_low; + } scqe; + } u; +}; + +#define S_CQE_OOO 31 +#define M_CQE_OOO 0x1 +#define G_CQE_OOO(x) ((((x) >> S_CQE_OOO)) & M_CQE_OOO) +#define V_CEQ_OOO(x) ((x)<> S_CQE_QPID)) & M_CQE_QPID) +#define V_CQE_QPID(x) ((x)<> S_CQE_SWCQE)) & M_CQE_SWCQE) +#define V_CQE_SWCQE(x) ((x)<> S_CQE_GENBIT) & M_CQE_GENBIT) +#define V_CQE_GENBIT(x) ((x)<> S_CQE_STATUS)) & M_CQE_STATUS) +#define V_CQE_STATUS(x) ((x)<> S_CQE_TYPE)) & 
M_CQE_TYPE) +#define V_CQE_TYPE(x) ((x)<> S_CQE_OPCODE)) & M_CQE_OPCODE) +#define V_CQE_OPCODE(x) ((x)<queue[1 << cq->size_log2])->cq_err; +} + +static inline void cxio_set_cq_in_error(struct t3_cq *cq) +{ + ((struct t3_cq_status_page *) + &cq->queue[1 << cq->size_log2])->cq_err = 1; +} + +static inline void cxio_set_wq_in_error(struct t3_wq *wq) +{ + wq->queue->wq_in_err.err |= 1; +} + +static inline void cxio_disable_wq_db(struct t3_wq *wq) +{ + wq->queue->wq_in_err.err |= 2; +} + +static inline void cxio_enable_wq_db(struct t3_wq *wq) +{ + wq->queue->wq_in_err.err &= ~2; +} + +static inline int cxio_wq_db_enabled(struct t3_wq *wq) +{ + return !(wq->queue->wq_in_err.err & 2); +} + +static inline struct t3_cqe *cxio_next_hw_cqe(struct t3_cq *cq) +{ + struct t3_cqe *cqe; + + cqe = cq->queue + (Q_PTR2IDX(cq->rptr, cq->size_log2)); + if (CQ_VLD_ENTRY(cq->rptr, cq->size_log2, cqe)) + return cqe; + return NULL; +} + +static inline struct t3_cqe *cxio_next_sw_cqe(struct t3_cq *cq) +{ + struct t3_cqe *cqe; + + if (!Q_EMPTY(cq->sw_rptr, cq->sw_wptr)) { + cqe = cq->sw_queue + (Q_PTR2IDX(cq->sw_rptr, cq->size_log2)); + return cqe; + } + return NULL; +} + +static inline struct t3_cqe *cxio_next_cqe(struct t3_cq *cq) +{ + struct t3_cqe *cqe; + + if (!Q_EMPTY(cq->sw_rptr, cq->sw_wptr)) { + cqe = cq->sw_queue + (Q_PTR2IDX(cq->sw_rptr, cq->size_log2)); + return cqe; + } + cqe = cq->queue + (Q_PTR2IDX(cq->rptr, cq->size_log2)); + if (CQ_VLD_ENTRY(cq->rptr, cq->size_log2, cqe)) + return cqe; + return NULL; +} + +#endif diff --git a/kernel/drivers/infiniband/hw/cxgb3/iwch.c b/kernel/drivers/infiniband/hw/cxgb3/iwch.c new file mode 100644 index 000000000..8e77dc543 --- /dev/null +++ b/kernel/drivers/infiniband/hw/cxgb3/iwch.c @@ -0,0 +1,292 @@ +/* + * Copyright (c) 2006 Chelsio, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include <linux/module.h> +#include <linux/moduleparam.h> + +#include <rdma/ib_verbs.h> + +#include "cxgb3_offload.h" +#include "iwch_provider.h" +#include "iwch_user.h" +#include "iwch.h" +#include "iwch_cm.h" + +#define DRV_VERSION "1.1" + +MODULE_AUTHOR("Boyd Faulkner, Steve Wise"); +MODULE_DESCRIPTION("Chelsio T3 RDMA Driver"); +MODULE_LICENSE("Dual BSD/GPL"); +MODULE_VERSION(DRV_VERSION); + +static void open_rnic_dev(struct t3cdev *); +static void close_rnic_dev(struct t3cdev *); +static void iwch_event_handler(struct t3cdev *, u32, u32); + +struct cxgb3_client t3c_client = { + .name = "iw_cxgb3", + .add = open_rnic_dev, + .remove = close_rnic_dev, + .handlers = t3c_handlers, + .redirect = iwch_ep_redirect, + .event_handler = iwch_event_handler +}; + +static LIST_HEAD(dev_list); +static DEFINE_MUTEX(dev_mutex); + +static int disable_qp_db(int id, void *p, void *data) +{ + struct iwch_qp *qhp = p; + + cxio_disable_wq_db(&qhp->wq); + return 0; +} + +static int enable_qp_db(int id, void *p, void *data) +{ + struct iwch_qp *qhp = p; + + if (data) + ring_doorbell(qhp->rhp->rdev.ctrl_qp.doorbell, qhp->wq.qpid); + cxio_enable_wq_db(&qhp->wq); + return 0; +} + +static void disable_dbs(struct iwch_dev *rnicp) +{ + spin_lock_irq(&rnicp->lock); + idr_for_each(&rnicp->qpidr, disable_qp_db, NULL); + spin_unlock_irq(&rnicp->lock); +} + +static void enable_dbs(struct iwch_dev *rnicp, int ring_db) +{ + spin_lock_irq(&rnicp->lock); + idr_for_each(&rnicp->qpidr, enable_qp_db, + (void *)(unsigned long)ring_db); + spin_unlock_irq(&rnicp->lock); +} + +static void iwch_db_drop_task(struct work_struct *work) +{ + struct iwch_dev *rnicp = container_of(work, struct iwch_dev, + db_drop_task.work); + enable_dbs(rnicp, 1); +} + +static void rnic_init(struct iwch_dev *rnicp) +{ + PDBG("%s iwch_dev %p\n", __func__, rnicp); + idr_init(&rnicp->cqidr); + idr_init(&rnicp->qpidr); + idr_init(&rnicp->mmidr); + spin_lock_init(&rnicp->lock); + INIT_DELAYED_WORK(&rnicp->db_drop_task, iwch_db_drop_task); + + rnicp->attr.max_qps = T3_MAX_NUM_QP - 32; + rnicp->attr.max_wrs = T3_MAX_QP_DEPTH; + rnicp->attr.max_sge_per_wr = T3_MAX_SGE; + rnicp->attr.max_sge_per_rdma_write_wr = T3_MAX_SGE; + rnicp->attr.max_cqs = T3_MAX_NUM_CQ - 1; + rnicp->attr.max_cqes_per_cq = T3_MAX_CQ_DEPTH; + rnicp->attr.max_mem_regs = cxio_num_stags(&rnicp->rdev); + rnicp->attr.max_phys_buf_entries = T3_MAX_PBL_SIZE; + rnicp->attr.max_pds = T3_MAX_NUM_PD - 1; + rnicp->attr.mem_pgsizes_bitmask = T3_PAGESIZE_MASK; + rnicp->attr.max_mr_size = T3_MAX_MR_SIZE; + rnicp->attr.can_resize_wq = 0; + rnicp->attr.max_rdma_reads_per_qp = 8; + rnicp->attr.max_rdma_read_resources = + rnicp->attr.max_rdma_reads_per_qp * rnicp->attr.max_qps; + rnicp->attr.max_rdma_read_qp_depth = 8; /* IRD */ + rnicp->attr.max_rdma_read_depth = + rnicp->attr.max_rdma_read_qp_depth * rnicp->attr.max_qps; + rnicp->attr.rq_overflow_handled = 0; + rnicp->attr.can_modify_ird = 0; + rnicp->attr.can_modify_ord = 0; + rnicp->attr.max_mem_windows = rnicp->attr.max_mem_regs - 1; + rnicp->attr.stag0_value = 1; + rnicp->attr.zbva_support = 1; + rnicp->attr.local_invalidate_fence = 1; + rnicp->attr.cq_overflow_detection = 1; + return; +} + +static void open_rnic_dev(struct t3cdev *tdev) +{ + struct iwch_dev *rnicp; + + PDBG("%s t3cdev %p\n", __func__, tdev); + printk_once(KERN_INFO MOD "Chelsio T3 RDMA Driver - version %s\n", + DRV_VERSION); + rnicp = (struct iwch_dev *)ib_alloc_device(sizeof(*rnicp)); + if (!rnicp) { + printk(KERN_ERR MOD "Cannot allocate ib device\n"); + return; + } + rnicp->rdev.ulp = rnicp; + rnicp->rdev.t3cdev_p = tdev; + + 
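/* dev_mutex guards dev_list; it is held across cxio_rdev_open() and the list_add_tail() below, matching the locking in close_rnic_dev(). */ +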
mutex_lock(&dev_mutex); + + if (cxio_rdev_open(&rnicp->rdev)) { + mutex_unlock(&dev_mutex); + printk(KERN_ERR MOD "Unable to open CXIO rdev\n"); + ib_dealloc_device(&rnicp->ibdev); + return; + } + + rnic_init(rnicp); + + list_add_tail(&rnicp->entry, &dev_list); + mutex_unlock(&dev_mutex); + + if (iwch_register_device(rnicp)) { + printk(KERN_ERR MOD "Unable to register device\n"); + close_rnic_dev(tdev); + } + printk(KERN_INFO MOD "Initialized device %s\n", + pci_name(rnicp->rdev.rnic_info.pdev)); + return; +} + +static void close_rnic_dev(struct t3cdev *tdev) +{ + struct iwch_dev *dev, *tmp; + PDBG("%s t3cdev %p\n", __func__, tdev); + mutex_lock(&dev_mutex); + list_for_each_entry_safe(dev, tmp, &dev_list, entry) { + if (dev->rdev.t3cdev_p == tdev) { + dev->rdev.flags = CXIO_ERROR_FATAL; + synchronize_net(); + cancel_delayed_work_sync(&dev->db_drop_task); + list_del(&dev->entry); + iwch_unregister_device(dev); + cxio_rdev_close(&dev->rdev); + idr_destroy(&dev->cqidr); + idr_destroy(&dev->qpidr); + idr_destroy(&dev->mmidr); + ib_dealloc_device(&dev->ibdev); + break; + } + } + mutex_unlock(&dev_mutex); +} + +static void iwch_event_handler(struct t3cdev *tdev, u32 evt, u32 port_id) +{ + struct cxio_rdev *rdev = tdev->ulp; + struct iwch_dev *rnicp; + struct ib_event event; + u32 portnum = port_id + 1; + int dispatch = 0; + + if (!rdev) + return; + rnicp = rdev_to_iwch_dev(rdev); + switch (evt) { + case OFFLOAD_STATUS_DOWN: { + rdev->flags = CXIO_ERROR_FATAL; + synchronize_net(); + event.event = IB_EVENT_DEVICE_FATAL; + dispatch = 1; + break; + } + case OFFLOAD_PORT_DOWN: { + event.event = IB_EVENT_PORT_ERR; + dispatch = 1; + break; + } + case OFFLOAD_PORT_UP: { + event.event = IB_EVENT_PORT_ACTIVE; + dispatch = 1; + break; + } + case OFFLOAD_DB_FULL: { + disable_dbs(rnicp); + break; + } + case OFFLOAD_DB_EMPTY: { + enable_dbs(rnicp, 1); + break; + } + case OFFLOAD_DB_DROP: { + unsigned long delay = 1000; + unsigned short r; + + disable_dbs(rnicp); + get_random_bytes(&r, 2); + delay += r & 1023; + + /* + * delay is between 1000-2023 usecs. + */ + schedule_delayed_work(&rnicp->db_drop_task, + usecs_to_jiffies(delay)); + break; + } + } + + if (dispatch) { + event.device = &rnicp->ibdev; + event.element.port_num = portnum; + ib_dispatch_event(&event); + } + + return; +} + +static int __init iwch_init_module(void) +{ + int err; + + err = cxio_hal_init(); + if (err) + return err; + err = iwch_cm_init(); + if (err) + return err; + cxio_register_ev_cb(iwch_ev_dispatch); + cxgb3_register_client(&t3c_client); + return 0; +} + +static void __exit iwch_exit_module(void) +{ + cxgb3_unregister_client(&t3c_client); + cxio_unregister_ev_cb(iwch_ev_dispatch); + iwch_cm_term(); + cxio_hal_exit(); +} + +module_init(iwch_init_module); +module_exit(iwch_exit_module); diff --git a/kernel/drivers/infiniband/hw/cxgb3/iwch.h b/kernel/drivers/infiniband/hw/cxgb3/iwch.h new file mode 100644 index 000000000..837862287 --- /dev/null +++ b/kernel/drivers/infiniband/hw/cxgb3/iwch.h @@ -0,0 +1,180 @@ +/* + * Copyright (c) 2006 Chelsio, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. 
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef __IWCH_H__ +#define __IWCH_H__ + +#include <linux/mutex.h> +#include <linux/list.h> +#include <linux/spinlock.h> +#include <linux/idr.h> +#include <linux/workqueue.h> + +#include <rdma/ib_verbs.h> + +#include "cxio_hal.h" +#include "cxgb3_offload.h" + +struct iwch_pd; +struct iwch_cq; +struct iwch_qp; +struct iwch_mr; + +struct iwch_rnic_attributes { + u32 max_qps; + u32 max_wrs; /* Max for any SQ/RQ */ + u32 max_sge_per_wr; + u32 max_sge_per_rdma_write_wr; /* for RDMA Write WR */ + u32 max_cqs; + u32 max_cqes_per_cq; + u32 max_mem_regs; + u32 max_phys_buf_entries; /* for phys buf list */ + u32 max_pds; + + /* + * The memory page sizes supported by this RNIC. + * Bit position i in bitmap indicates page of + * size (4k)^i. Phys block list mode unsupported. + */ + u32 mem_pgsizes_bitmask; + u64 max_mr_size; + u8 can_resize_wq; + + /* + * The maximum number of RDMA Reads that can be outstanding + * per QP with this RNIC as the target. + */ + u32 max_rdma_reads_per_qp; + + /* + * The maximum number of resources used for RDMA Reads + * by this RNIC with this RNIC as the target. + */ + u32 max_rdma_read_resources; + + /* + * The max depth per QP for initiation of RDMA Read + * by this RNIC. 
+ */ + u32 max_rdma_read_qp_depth; + + /* + * The maximum depth for initiation of RDMA Read + * operations by this RNIC on all QPs + */ + u32 max_rdma_read_depth; + u8 rq_overflow_handled; + u32 can_modify_ird; + u32 can_modify_ord; + u32 max_mem_windows; + u32 stag0_value; + u8 zbva_support; + u8 local_invalidate_fence; + u32 cq_overflow_detection; +}; + +struct iwch_dev { + struct ib_device ibdev; + struct cxio_rdev rdev; + u32 device_cap_flags; + struct iwch_rnic_attributes attr; + struct idr cqidr; + struct idr qpidr; + struct idr mmidr; + spinlock_t lock; + struct list_head entry; + struct delayed_work db_drop_task; +}; + +static inline struct iwch_dev *to_iwch_dev(struct ib_device *ibdev) +{ + return container_of(ibdev, struct iwch_dev, ibdev); +} + +static inline struct iwch_dev *rdev_to_iwch_dev(struct cxio_rdev *rdev) +{ + return container_of(rdev, struct iwch_dev, rdev); +} + +static inline int t3b_device(const struct iwch_dev *rhp) +{ + return rhp->rdev.t3cdev_p->type == T3B; +} + +static inline int t3a_device(const struct iwch_dev *rhp) +{ + return rhp->rdev.t3cdev_p->type == T3A; +} + +static inline struct iwch_cq *get_chp(struct iwch_dev *rhp, u32 cqid) +{ + return idr_find(&rhp->cqidr, cqid); +} + +static inline struct iwch_qp *get_qhp(struct iwch_dev *rhp, u32 qpid) +{ + return idr_find(&rhp->qpidr, qpid); +} + +static inline struct iwch_mr *get_mhp(struct iwch_dev *rhp, u32 mmid) +{ + return idr_find(&rhp->mmidr, mmid); +} + +static inline int insert_handle(struct iwch_dev *rhp, struct idr *idr, + void *handle, u32 id) +{ + int ret; + + idr_preload(GFP_KERNEL); + spin_lock_irq(&rhp->lock); + + ret = idr_alloc(idr, handle, id, id + 1, GFP_NOWAIT); + + spin_unlock_irq(&rhp->lock); + idr_preload_end(); + + BUG_ON(ret == -ENOSPC); + return ret < 0 ? ret : 0; +} + +static inline void remove_handle(struct iwch_dev *rhp, struct idr *idr, u32 id) +{ + spin_lock_irq(&rhp->lock); + idr_remove(idr, id); + spin_unlock_irq(&rhp->lock); +} + +extern struct cxgb3_client t3c_client; +extern cxgb3_cpl_handler_func t3c_handlers[NUM_CPL_CMDS]; +extern void iwch_ev_dispatch(struct cxio_rdev *rdev_p, struct sk_buff *skb); + +#endif diff --git a/kernel/drivers/infiniband/hw/cxgb3/iwch_cm.c b/kernel/drivers/infiniband/hw/cxgb3/iwch_cm.c new file mode 100644 index 000000000..cb78b1e9b --- /dev/null +++ b/kernel/drivers/infiniband/hw/cxgb3/iwch_cm.c @@ -0,0 +1,2272 @@ +/* + * Copyright (c) 2006 Chelsio, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include <linux/module.h> +#include <linux/list.h> +#include <linux/slab.h> +#include <linux/workqueue.h> +#include <linux/skbuff.h> +#include <linux/timer.h> +#include <linux/notifier.h> +#include <linux/inetdevice.h> + +#include <net/neighbour.h> +#include <net/netevent.h> +#include <net/route.h> + +#include "tcb.h" +#include "cxgb3_offload.h" +#include "iwch.h" +#include "iwch_provider.h" +#include "iwch_cm.h" + +static char *states[] = { + "idle", + "listen", + "connecting", + "mpa_wait_req", + "mpa_req_sent", + "mpa_req_rcvd", + "mpa_rep_sent", + "fpdu_mode", + "aborting", + "closing", + "moribund", + "dead", + NULL, +}; + +int peer2peer = 0; +module_param(peer2peer, int, 0644); +MODULE_PARM_DESC(peer2peer, "Support peer2peer ULPs (default=0)"); + +static int ep_timeout_secs = 60; +module_param(ep_timeout_secs, int, 0644); +MODULE_PARM_DESC(ep_timeout_secs, "CM Endpoint operation timeout " + "in seconds (default=60)"); + +static int mpa_rev = 1; +module_param(mpa_rev, int, 0644); +MODULE_PARM_DESC(mpa_rev, "MPA Revision, 0 supports amso1100, " + "1 is spec compliant. (default=1)"); + +static int markers_enabled = 0; +module_param(markers_enabled, int, 0644); +MODULE_PARM_DESC(markers_enabled, "Enable MPA MARKERS (default(0)=disabled)"); + +static int crc_enabled = 1; +module_param(crc_enabled, int, 0644); +MODULE_PARM_DESC(crc_enabled, "Enable MPA CRC (default(1)=enabled)"); + +static int rcv_win = 256 * 1024; +module_param(rcv_win, int, 0644); +MODULE_PARM_DESC(rcv_win, "TCP receive window in bytes (default=256)"); + +static int snd_win = 32 * 1024; +module_param(snd_win, int, 0644); +MODULE_PARM_DESC(snd_win, "TCP send window in bytes (default=32KB)"); + +static unsigned int nocong = 0; +module_param(nocong, uint, 0644); +MODULE_PARM_DESC(nocong, "Turn off congestion control (default=0)"); + +static unsigned int cong_flavor = 1; +module_param(cong_flavor, uint, 0644); +MODULE_PARM_DESC(cong_flavor, "TCP Congestion control flavor (default=1)"); + +static struct workqueue_struct *workq; + +static struct sk_buff_head rxq; + +static struct sk_buff *get_skb(struct sk_buff *skb, int len, gfp_t gfp); +static void ep_timeout(unsigned long arg); +static void connect_reply_upcall(struct iwch_ep *ep, int status); + +static void start_ep_timer(struct iwch_ep *ep) +{ + PDBG("%s ep %p\n", __func__, ep); + if (timer_pending(&ep->timer)) { + PDBG("%s stopped / restarted timer ep %p\n", __func__, ep); + del_timer_sync(&ep->timer); + } else + get_ep(&ep->com); + ep->timer.expires = jiffies + ep_timeout_secs * HZ; + ep->timer.data = (unsigned long)ep; + ep->timer.function = ep_timeout; + add_timer(&ep->timer); +} + +static void stop_ep_timer(struct iwch_ep *ep) +{ + PDBG("%s ep %p\n", __func__, ep); + if (!timer_pending(&ep->timer)) { + WARN(1, "%s timer stopped when its not running! 
ep %p state %u\n", + __func__, ep, ep->com.state); + return; + } + del_timer_sync(&ep->timer); + put_ep(&ep->com); +} + +static int iwch_l2t_send(struct t3cdev *tdev, struct sk_buff *skb, struct l2t_entry *l2e) +{ + int error = 0; + struct cxio_rdev *rdev; + + rdev = (struct cxio_rdev *)tdev->ulp; + if (cxio_fatal_error(rdev)) { + kfree_skb(skb); + return -EIO; + } + error = l2t_send(tdev, skb, l2e); + if (error < 0) + kfree_skb(skb); + return error; +} + +int iwch_cxgb3_ofld_send(struct t3cdev *tdev, struct sk_buff *skb) +{ + int error = 0; + struct cxio_rdev *rdev; + + rdev = (struct cxio_rdev *)tdev->ulp; + if (cxio_fatal_error(rdev)) { + kfree_skb(skb); + return -EIO; + } + error = cxgb3_ofld_send(tdev, skb); + if (error < 0) + kfree_skb(skb); + return error; +} + +static void release_tid(struct t3cdev *tdev, u32 hwtid, struct sk_buff *skb) +{ + struct cpl_tid_release *req; + + skb = get_skb(skb, sizeof *req, GFP_KERNEL); + if (!skb) + return; + req = (struct cpl_tid_release *) skb_put(skb, sizeof(*req)); + req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD)); + OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_TID_RELEASE, hwtid)); + skb->priority = CPL_PRIORITY_SETUP; + iwch_cxgb3_ofld_send(tdev, skb); + return; +} + +int iwch_quiesce_tid(struct iwch_ep *ep) +{ + struct cpl_set_tcb_field *req; + struct sk_buff *skb = get_skb(NULL, sizeof(*req), GFP_KERNEL); + + if (!skb) + return -ENOMEM; + req = (struct cpl_set_tcb_field *) skb_put(skb, sizeof(*req)); + req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD)); + req->wr.wr_lo = htonl(V_WR_TID(ep->hwtid)); + OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_SET_TCB_FIELD, ep->hwtid)); + req->reply = 0; + req->cpu_idx = 0; + req->word = htons(W_TCB_RX_QUIESCE); + req->mask = cpu_to_be64(1ULL << S_TCB_RX_QUIESCE); + req->val = cpu_to_be64(1 << S_TCB_RX_QUIESCE); + + skb->priority = CPL_PRIORITY_DATA; + return iwch_cxgb3_ofld_send(ep->com.tdev, skb); +} + +int iwch_resume_tid(struct iwch_ep *ep) +{ + struct cpl_set_tcb_field *req; + struct sk_buff *skb = get_skb(NULL, sizeof(*req), GFP_KERNEL); + + if (!skb) + return -ENOMEM; + req = (struct cpl_set_tcb_field *) skb_put(skb, sizeof(*req)); + req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD)); + req->wr.wr_lo = htonl(V_WR_TID(ep->hwtid)); + OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_SET_TCB_FIELD, ep->hwtid)); + req->reply = 0; + req->cpu_idx = 0; + req->word = htons(W_TCB_RX_QUIESCE); + req->mask = cpu_to_be64(1ULL << S_TCB_RX_QUIESCE); + req->val = 0; + + skb->priority = CPL_PRIORITY_DATA; + return iwch_cxgb3_ofld_send(ep->com.tdev, skb); +} + +static void set_emss(struct iwch_ep *ep, u16 opt) +{ + PDBG("%s ep %p opt %u\n", __func__, ep, opt); + ep->emss = T3C_DATA(ep->com.tdev)->mtus[G_TCPOPT_MSS(opt)] - 40; + if (G_TCPOPT_TSTAMP(opt)) + ep->emss -= 12; + if (ep->emss < 128) + ep->emss = 128; + PDBG("emss=%d\n", ep->emss); +} + +static enum iwch_ep_state state_read(struct iwch_ep_common *epc) +{ + unsigned long flags; + enum iwch_ep_state state; + + spin_lock_irqsave(&epc->lock, flags); + state = epc->state; + spin_unlock_irqrestore(&epc->lock, flags); + return state; +} + +static void __state_set(struct iwch_ep_common *epc, enum iwch_ep_state new) +{ + epc->state = new; +} + +static void state_set(struct iwch_ep_common *epc, enum iwch_ep_state new) +{ + unsigned long flags; + + spin_lock_irqsave(&epc->lock, flags); + PDBG("%s - %s -> %s\n", __func__, states[epc->state], states[new]); + __state_set(epc, new); + spin_unlock_irqrestore(&epc->lock, flags); + return; +} + +static void *alloc_ep(int size, gfp_t 
gfp) +{ + struct iwch_ep_common *epc; + + epc = kzalloc(size, gfp); + if (epc) { + kref_init(&epc->kref); + spin_lock_init(&epc->lock); + init_waitqueue_head(&epc->waitq); + } + PDBG("%s alloc ep %p\n", __func__, epc); + return epc; +} + +void __free_ep(struct kref *kref) +{ + struct iwch_ep *ep; + ep = container_of(container_of(kref, struct iwch_ep_common, kref), + struct iwch_ep, com); + PDBG("%s ep %p state %s\n", __func__, ep, states[state_read(&ep->com)]); + if (test_bit(RELEASE_RESOURCES, &ep->com.flags)) { + cxgb3_remove_tid(ep->com.tdev, (void *)ep, ep->hwtid); + dst_release(ep->dst); + l2t_release(ep->com.tdev, ep->l2t); + } + kfree(ep); +} + +static void release_ep_resources(struct iwch_ep *ep) +{ + PDBG("%s ep %p tid %d\n", __func__, ep, ep->hwtid); + set_bit(RELEASE_RESOURCES, &ep->com.flags); + put_ep(&ep->com); +} + +static int status2errno(int status) +{ + switch (status) { + case CPL_ERR_NONE: + return 0; + case CPL_ERR_CONN_RESET: + return -ECONNRESET; + case CPL_ERR_ARP_MISS: + return -EHOSTUNREACH; + case CPL_ERR_CONN_TIMEDOUT: + return -ETIMEDOUT; + case CPL_ERR_TCAM_FULL: + return -ENOMEM; + case CPL_ERR_CONN_EXIST: + return -EADDRINUSE; + default: + return -EIO; + } +} + +/* + * Try and reuse skbs already allocated... + */ +static struct sk_buff *get_skb(struct sk_buff *skb, int len, gfp_t gfp) +{ + if (skb && !skb_is_nonlinear(skb) && !skb_cloned(skb)) { + skb_trim(skb, 0); + skb_get(skb); + } else { + skb = alloc_skb(len, gfp); + } + return skb; +} + +static struct rtable *find_route(struct t3cdev *dev, __be32 local_ip, + __be32 peer_ip, __be16 local_port, + __be16 peer_port, u8 tos) +{ + struct rtable *rt; + struct flowi4 fl4; + + rt = ip_route_output_ports(&init_net, &fl4, NULL, peer_ip, local_ip, + peer_port, local_port, IPPROTO_TCP, + tos, 0); + if (IS_ERR(rt)) + return NULL; + return rt; +} + +static unsigned int find_best_mtu(const struct t3c_data *d, unsigned short mtu) +{ + int i = 0; + + while (i < d->nmtus - 1 && d->mtus[i + 1] <= mtu) + ++i; + return i; +} + +static void arp_failure_discard(struct t3cdev *dev, struct sk_buff *skb) +{ + PDBG("%s t3cdev %p\n", __func__, dev); + kfree_skb(skb); +} + +/* + * Handle an ARP failure for an active open. + */ +static void act_open_req_arp_failure(struct t3cdev *dev, struct sk_buff *skb) +{ + printk(KERN_ERR MOD "ARP failure duing connect\n"); + kfree_skb(skb); +} + +/* + * Handle an ARP failure for a CPL_ABORT_REQ. Change it into a no RST variant + * and send it along. 
+ */ +static void abort_arp_failure(struct t3cdev *dev, struct sk_buff *skb) +{ + struct cpl_abort_req *req = cplhdr(skb); + + PDBG("%s t3cdev %p\n", __func__, dev); + req->cmd = CPL_ABORT_NO_RST; + iwch_cxgb3_ofld_send(dev, skb); +} + +static int send_halfclose(struct iwch_ep *ep, gfp_t gfp) +{ + struct cpl_close_con_req *req; + struct sk_buff *skb; + + PDBG("%s ep %p\n", __func__, ep); + skb = get_skb(NULL, sizeof(*req), gfp); + if (!skb) { + printk(KERN_ERR MOD "%s - failed to alloc skb\n", __func__); + return -ENOMEM; + } + skb->priority = CPL_PRIORITY_DATA; + set_arp_failure_handler(skb, arp_failure_discard); + req = (struct cpl_close_con_req *) skb_put(skb, sizeof(*req)); + req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_OFLD_CLOSE_CON)); + req->wr.wr_lo = htonl(V_WR_TID(ep->hwtid)); + OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_CLOSE_CON_REQ, ep->hwtid)); + return iwch_l2t_send(ep->com.tdev, skb, ep->l2t); +} + +static int send_abort(struct iwch_ep *ep, struct sk_buff *skb, gfp_t gfp) +{ + struct cpl_abort_req *req; + + PDBG("%s ep %p\n", __func__, ep); + skb = get_skb(skb, sizeof(*req), gfp); + if (!skb) { + printk(KERN_ERR MOD "%s - failed to alloc skb.\n", + __func__); + return -ENOMEM; + } + skb->priority = CPL_PRIORITY_DATA; + set_arp_failure_handler(skb, abort_arp_failure); + req = (struct cpl_abort_req *) skb_put(skb, sizeof(*req)); + memset(req, 0, sizeof(*req)); + req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_OFLD_HOST_ABORT_CON_REQ)); + req->wr.wr_lo = htonl(V_WR_TID(ep->hwtid)); + OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_ABORT_REQ, ep->hwtid)); + req->cmd = CPL_ABORT_SEND_RST; + return iwch_l2t_send(ep->com.tdev, skb, ep->l2t); +} + +static int send_connect(struct iwch_ep *ep) +{ + struct cpl_act_open_req *req; + struct sk_buff *skb; + u32 opt0h, opt0l, opt2; + unsigned int mtu_idx; + int wscale; + + PDBG("%s ep %p\n", __func__, ep); + + skb = get_skb(NULL, sizeof(*req), GFP_KERNEL); + if (!skb) { + printk(KERN_ERR MOD "%s - failed to alloc skb.\n", + __func__); + return -ENOMEM; + } + mtu_idx = find_best_mtu(T3C_DATA(ep->com.tdev), dst_mtu(ep->dst)); + wscale = compute_wscale(rcv_win); + opt0h = V_NAGLE(0) | + V_NO_CONG(nocong) | + V_KEEP_ALIVE(1) | + F_TCAM_BYPASS | + V_WND_SCALE(wscale) | + V_MSS_IDX(mtu_idx) | + V_L2T_IDX(ep->l2t->idx) | V_TX_CHANNEL(ep->l2t->smt_idx); + opt0l = V_TOS((ep->tos >> 2) & M_TOS) | V_RCV_BUFSIZ(rcv_win>>10); + opt2 = F_RX_COALESCE_VALID | V_RX_COALESCE(0) | V_FLAVORS_VALID(1) | + V_CONG_CONTROL_FLAVOR(cong_flavor); + skb->priority = CPL_PRIORITY_SETUP; + set_arp_failure_handler(skb, act_open_req_arp_failure); + + req = (struct cpl_act_open_req *) skb_put(skb, sizeof(*req)); + req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD)); + OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_ACT_OPEN_REQ, ep->atid)); + req->local_port = ep->com.local_addr.sin_port; + req->peer_port = ep->com.remote_addr.sin_port; + req->local_ip = ep->com.local_addr.sin_addr.s_addr; + req->peer_ip = ep->com.remote_addr.sin_addr.s_addr; + req->opt0h = htonl(opt0h); + req->opt0l = htonl(opt0l); + req->params = 0; + req->opt2 = htonl(opt2); + return iwch_l2t_send(ep->com.tdev, skb, ep->l2t); +} + +static void send_mpa_req(struct iwch_ep *ep, struct sk_buff *skb) +{ + int mpalen; + struct tx_data_wr *req; + struct mpa_message *mpa; + int len; + + PDBG("%s ep %p pd_len %d\n", __func__, ep, ep->plen); + + BUG_ON(skb_cloned(skb)); + + mpalen = sizeof(*mpa) + ep->plen; + if (skb->data + mpalen + sizeof(*req) > skb_end_pointer(skb)) { + kfree_skb(skb); + skb=alloc_skb(mpalen + sizeof(*req), 
GFP_KERNEL); + if (!skb) { + connect_reply_upcall(ep, -ENOMEM); + return; + } + } + skb_trim(skb, 0); + skb_reserve(skb, sizeof(*req)); + skb_put(skb, mpalen); + skb->priority = CPL_PRIORITY_DATA; + mpa = (struct mpa_message *) skb->data; + memset(mpa, 0, sizeof(*mpa)); + memcpy(mpa->key, MPA_KEY_REQ, sizeof(mpa->key)); + mpa->flags = (crc_enabled ? MPA_CRC : 0) | + (markers_enabled ? MPA_MARKERS : 0); + mpa->private_data_size = htons(ep->plen); + mpa->revision = mpa_rev; + + if (ep->plen) + memcpy(mpa->private_data, ep->mpa_pkt + sizeof(*mpa), ep->plen); + + /* + * Reference the mpa skb. This ensures the data area + * will remain in memory until the hw acks the tx. + * Function tx_ack() will deref it. + */ + skb_get(skb); + set_arp_failure_handler(skb, arp_failure_discard); + skb_reset_transport_header(skb); + len = skb->len; + req = (struct tx_data_wr *) skb_push(skb, sizeof(*req)); + req->wr_hi = htonl(V_WR_OP(FW_WROPCODE_OFLD_TX_DATA)|F_WR_COMPL); + req->wr_lo = htonl(V_WR_TID(ep->hwtid)); + req->len = htonl(len); + req->param = htonl(V_TX_PORT(ep->l2t->smt_idx) | + V_TX_SNDBUF(snd_win>>15)); + req->flags = htonl(F_TX_INIT); + req->sndseq = htonl(ep->snd_seq); + BUG_ON(ep->mpa_skb); + ep->mpa_skb = skb; + iwch_l2t_send(ep->com.tdev, skb, ep->l2t); + start_ep_timer(ep); + state_set(&ep->com, MPA_REQ_SENT); + return; +} + +static int send_mpa_reject(struct iwch_ep *ep, const void *pdata, u8 plen) +{ + int mpalen; + struct tx_data_wr *req; + struct mpa_message *mpa; + struct sk_buff *skb; + + PDBG("%s ep %p plen %d\n", __func__, ep, plen); + + mpalen = sizeof(*mpa) + plen; + + skb = get_skb(NULL, mpalen + sizeof(*req), GFP_KERNEL); + if (!skb) { + printk(KERN_ERR MOD "%s - cannot alloc skb!\n", __func__); + return -ENOMEM; + } + skb_reserve(skb, sizeof(*req)); + mpa = (struct mpa_message *) skb_put(skb, mpalen); + memset(mpa, 0, sizeof(*mpa)); + memcpy(mpa->key, MPA_KEY_REP, sizeof(mpa->key)); + mpa->flags = MPA_REJECT; + mpa->revision = mpa_rev; + mpa->private_data_size = htons(plen); + if (plen) + memcpy(mpa->private_data, pdata, plen); + + /* + * Reference the mpa skb again. This ensures the data area + * will remain in memory until the hw acks the tx. + * Function tx_ack() will deref it. + */ + skb_get(skb); + skb->priority = CPL_PRIORITY_DATA; + set_arp_failure_handler(skb, arp_failure_discard); + skb_reset_transport_header(skb); + req = (struct tx_data_wr *) skb_push(skb, sizeof(*req)); + req->wr_hi = htonl(V_WR_OP(FW_WROPCODE_OFLD_TX_DATA)|F_WR_COMPL); + req->wr_lo = htonl(V_WR_TID(ep->hwtid)); + req->len = htonl(mpalen); + req->param = htonl(V_TX_PORT(ep->l2t->smt_idx) | + V_TX_SNDBUF(snd_win>>15)); + req->flags = htonl(F_TX_INIT); + req->sndseq = htonl(ep->snd_seq); + BUG_ON(ep->mpa_skb); + ep->mpa_skb = skb; + return iwch_l2t_send(ep->com.tdev, skb, ep->l2t); +} + +static int send_mpa_reply(struct iwch_ep *ep, const void *pdata, u8 plen) +{ + int mpalen; + struct tx_data_wr *req; + struct mpa_message *mpa; + int len; + struct sk_buff *skb; + + PDBG("%s ep %p plen %d\n", __func__, ep, plen); + + mpalen = sizeof(*mpa) + plen; + + skb = get_skb(NULL, mpalen + sizeof(*req), GFP_KERNEL); + if (!skb) { + printk(KERN_ERR MOD "%s - cannot alloc skb!\n", __func__); + return -ENOMEM; + } + skb->priority = CPL_PRIORITY_DATA; + skb_reserve(skb, sizeof(*req)); + mpa = (struct mpa_message *) skb_put(skb, mpalen); + memset(mpa, 0, sizeof(*mpa)); + memcpy(mpa->key, MPA_KEY_REP, sizeof(mpa->key)); + mpa->flags = (ep->mpa_attr.crc_enabled ? MPA_CRC : 0) | + (markers_enabled ? 
MPA_MARKERS : 0); + mpa->revision = mpa_rev; + mpa->private_data_size = htons(plen); + if (plen) + memcpy(mpa->private_data, pdata, plen); + + /* + * Reference the mpa skb. This ensures the data area + * will remain in memory until the hw acks the tx. + * Function tx_ack() will deref it. + */ + skb_get(skb); + set_arp_failure_handler(skb, arp_failure_discard); + skb_reset_transport_header(skb); + len = skb->len; + req = (struct tx_data_wr *) skb_push(skb, sizeof(*req)); + req->wr_hi = htonl(V_WR_OP(FW_WROPCODE_OFLD_TX_DATA)|F_WR_COMPL); + req->wr_lo = htonl(V_WR_TID(ep->hwtid)); + req->len = htonl(len); + req->param = htonl(V_TX_PORT(ep->l2t->smt_idx) | + V_TX_SNDBUF(snd_win>>15)); + req->flags = htonl(F_TX_INIT); + req->sndseq = htonl(ep->snd_seq); + ep->mpa_skb = skb; + state_set(&ep->com, MPA_REP_SENT); + return iwch_l2t_send(ep->com.tdev, skb, ep->l2t); +} + +static int act_establish(struct t3cdev *tdev, struct sk_buff *skb, void *ctx) +{ + struct iwch_ep *ep = ctx; + struct cpl_act_establish *req = cplhdr(skb); + unsigned int tid = GET_TID(req); + + PDBG("%s ep %p tid %d\n", __func__, ep, tid); + + dst_confirm(ep->dst); + + /* setup the hwtid for this connection */ + ep->hwtid = tid; + cxgb3_insert_tid(ep->com.tdev, &t3c_client, ep, tid); + + ep->snd_seq = ntohl(req->snd_isn); + ep->rcv_seq = ntohl(req->rcv_isn); + + set_emss(ep, ntohs(req->tcp_opt)); + + /* dealloc the atid */ + cxgb3_free_atid(ep->com.tdev, ep->atid); + + /* start MPA negotiation */ + send_mpa_req(ep, skb); + + return 0; +} + +static void abort_connection(struct iwch_ep *ep, struct sk_buff *skb, gfp_t gfp) +{ + PDBG("%s ep %p\n", __FILE__, ep); + state_set(&ep->com, ABORTING); + send_abort(ep, skb, gfp); +} + +static void close_complete_upcall(struct iwch_ep *ep) +{ + struct iw_cm_event event; + + PDBG("%s ep %p\n", __func__, ep); + memset(&event, 0, sizeof(event)); + event.event = IW_CM_EVENT_CLOSE; + if (ep->com.cm_id) { + PDBG("close complete delivered ep %p cm_id %p tid %d\n", + ep, ep->com.cm_id, ep->hwtid); + ep->com.cm_id->event_handler(ep->com.cm_id, &event); + ep->com.cm_id->rem_ref(ep->com.cm_id); + ep->com.cm_id = NULL; + ep->com.qp = NULL; + } +} + +static void peer_close_upcall(struct iwch_ep *ep) +{ + struct iw_cm_event event; + + PDBG("%s ep %p\n", __func__, ep); + memset(&event, 0, sizeof(event)); + event.event = IW_CM_EVENT_DISCONNECT; + if (ep->com.cm_id) { + PDBG("peer close delivered ep %p cm_id %p tid %d\n", + ep, ep->com.cm_id, ep->hwtid); + ep->com.cm_id->event_handler(ep->com.cm_id, &event); + } +} + +static void peer_abort_upcall(struct iwch_ep *ep) +{ + struct iw_cm_event event; + + PDBG("%s ep %p\n", __func__, ep); + memset(&event, 0, sizeof(event)); + event.event = IW_CM_EVENT_CLOSE; + event.status = -ECONNRESET; + if (ep->com.cm_id) { + PDBG("abort delivered ep %p cm_id %p tid %d\n", ep, + ep->com.cm_id, ep->hwtid); + ep->com.cm_id->event_handler(ep->com.cm_id, &event); + ep->com.cm_id->rem_ref(ep->com.cm_id); + ep->com.cm_id = NULL; + ep->com.qp = NULL; + } +} + +static void connect_reply_upcall(struct iwch_ep *ep, int status) +{ + struct iw_cm_event event; + + PDBG("%s ep %p status %d\n", __func__, ep, status); + memset(&event, 0, sizeof(event)); + event.event = IW_CM_EVENT_CONNECT_REPLY; + event.status = status; + memcpy(&event.local_addr, &ep->com.local_addr, + sizeof(ep->com.local_addr)); + memcpy(&event.remote_addr, &ep->com.remote_addr, + sizeof(ep->com.remote_addr)); + + if ((status == 0) || (status == -ECONNREFUSED)) { + event.private_data_len = ep->plen; + event.private_data = 
ep->mpa_pkt + sizeof(struct mpa_message); + } + if (ep->com.cm_id) { + PDBG("%s ep %p tid %d status %d\n", __func__, ep, + ep->hwtid, status); + ep->com.cm_id->event_handler(ep->com.cm_id, &event); + } + if (status < 0) { + ep->com.cm_id->rem_ref(ep->com.cm_id); + ep->com.cm_id = NULL; + ep->com.qp = NULL; + } +} + +static void connect_request_upcall(struct iwch_ep *ep) +{ + struct iw_cm_event event; + + PDBG("%s ep %p tid %d\n", __func__, ep, ep->hwtid); + memset(&event, 0, sizeof(event)); + event.event = IW_CM_EVENT_CONNECT_REQUEST; + memcpy(&event.local_addr, &ep->com.local_addr, + sizeof(ep->com.local_addr)); + memcpy(&event.remote_addr, &ep->com.remote_addr, + sizeof(ep->com.local_addr)); + event.private_data_len = ep->plen; + event.private_data = ep->mpa_pkt + sizeof(struct mpa_message); + event.provider_data = ep; + /* + * Until ird/ord negotiation via MPAv2 support is added, send max + * supported values + */ + event.ird = event.ord = 8; + if (state_read(&ep->parent_ep->com) != DEAD) { + get_ep(&ep->com); + ep->parent_ep->com.cm_id->event_handler( + ep->parent_ep->com.cm_id, + &event); + } + put_ep(&ep->parent_ep->com); + ep->parent_ep = NULL; +} + +static void established_upcall(struct iwch_ep *ep) +{ + struct iw_cm_event event; + + PDBG("%s ep %p\n", __func__, ep); + memset(&event, 0, sizeof(event)); + event.event = IW_CM_EVENT_ESTABLISHED; + /* + * Until ird/ord negotiation via MPAv2 support is added, send max + * supported values + */ + event.ird = event.ord = 8; + if (ep->com.cm_id) { + PDBG("%s ep %p tid %d\n", __func__, ep, ep->hwtid); + ep->com.cm_id->event_handler(ep->com.cm_id, &event); + } +} + +static int update_rx_credits(struct iwch_ep *ep, u32 credits) +{ + struct cpl_rx_data_ack *req; + struct sk_buff *skb; + + PDBG("%s ep %p credits %u\n", __func__, ep, credits); + skb = get_skb(NULL, sizeof(*req), GFP_KERNEL); + if (!skb) { + printk(KERN_ERR MOD "update_rx_credits - cannot alloc skb!\n"); + return 0; + } + + req = (struct cpl_rx_data_ack *) skb_put(skb, sizeof(*req)); + req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD)); + OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_RX_DATA_ACK, ep->hwtid)); + req->credit_dack = htonl(V_RX_CREDITS(credits) | V_RX_FORCE_ACK(1)); + skb->priority = CPL_PRIORITY_ACK; + iwch_cxgb3_ofld_send(ep->com.tdev, skb); + return credits; +} + +static void process_mpa_reply(struct iwch_ep *ep, struct sk_buff *skb) +{ + struct mpa_message *mpa; + u16 plen; + struct iwch_qp_attributes attrs; + enum iwch_qp_attr_mask mask; + int err; + + PDBG("%s ep %p\n", __func__, ep); + + /* + * Stop mpa timer. If it expired, then the state has + * changed and we bail since ep_timeout already aborted + * the connection. + */ + stop_ep_timer(ep); + if (state_read(&ep->com) != MPA_REQ_SENT) + return; + + /* + * If we get more than the supported amount of private data + * then we must fail this connection. + */ + if (ep->mpa_pkt_len + skb->len > sizeof(ep->mpa_pkt)) { + err = -EINVAL; + goto err; + } + + /* + * copy the new data into our accumulation buffer. + */ + skb_copy_from_linear_data(skb, &(ep->mpa_pkt[ep->mpa_pkt_len]), + skb->len); + ep->mpa_pkt_len += skb->len; + + /* + * if we don't even have the mpa message, then bail. + */ + if (ep->mpa_pkt_len < sizeof(*mpa)) + return; + mpa = (struct mpa_message *) ep->mpa_pkt; + + /* Validate MPA header. 
*/ + if (mpa->revision != mpa_rev) { + err = -EPROTO; + goto err; + } + if (memcmp(mpa->key, MPA_KEY_REP, sizeof(mpa->key))) { + err = -EPROTO; + goto err; + } + + plen = ntohs(mpa->private_data_size); + + /* + * Fail if there's too much private data. + */ + if (plen > MPA_MAX_PRIVATE_DATA) { + err = -EPROTO; + goto err; + } + + /* + * If plen does not account for pkt size + */ + if (ep->mpa_pkt_len > (sizeof(*mpa) + plen)) { + err = -EPROTO; + goto err; + } + + ep->plen = (u8) plen; + + /* + * If we don't have all the pdata yet, then bail. + * We'll continue process when more data arrives. + */ + if (ep->mpa_pkt_len < (sizeof(*mpa) + plen)) + return; + + if (mpa->flags & MPA_REJECT) { + err = -ECONNREFUSED; + goto err; + } + + /* + * If we get here we have accumulated the entire mpa + * start reply message including private data. And + * the MPA header is valid. + */ + state_set(&ep->com, FPDU_MODE); + ep->mpa_attr.initiator = 1; + ep->mpa_attr.crc_enabled = (mpa->flags & MPA_CRC) | crc_enabled ? 1 : 0; + ep->mpa_attr.recv_marker_enabled = markers_enabled; + ep->mpa_attr.xmit_marker_enabled = mpa->flags & MPA_MARKERS ? 1 : 0; + ep->mpa_attr.version = mpa_rev; + PDBG("%s - crc_enabled=%d, recv_marker_enabled=%d, " + "xmit_marker_enabled=%d, version=%d\n", __func__, + ep->mpa_attr.crc_enabled, ep->mpa_attr.recv_marker_enabled, + ep->mpa_attr.xmit_marker_enabled, ep->mpa_attr.version); + + attrs.mpa_attr = ep->mpa_attr; + attrs.max_ird = ep->ird; + attrs.max_ord = ep->ord; + attrs.llp_stream_handle = ep; + attrs.next_state = IWCH_QP_STATE_RTS; + + mask = IWCH_QP_ATTR_NEXT_STATE | + IWCH_QP_ATTR_LLP_STREAM_HANDLE | IWCH_QP_ATTR_MPA_ATTR | + IWCH_QP_ATTR_MAX_IRD | IWCH_QP_ATTR_MAX_ORD; + + /* bind QP and TID with INIT_WR */ + err = iwch_modify_qp(ep->com.qp->rhp, + ep->com.qp, mask, &attrs, 1); + if (err) + goto err; + + if (peer2peer && iwch_rqes_posted(ep->com.qp) == 0) { + iwch_post_zb_read(ep); + } + + goto out; +err: + abort_connection(ep, skb, GFP_KERNEL); +out: + connect_reply_upcall(ep, err); + return; +} + +static void process_mpa_request(struct iwch_ep *ep, struct sk_buff *skb) +{ + struct mpa_message *mpa; + u16 plen; + + PDBG("%s ep %p\n", __func__, ep); + + /* + * Stop mpa timer. If it expired, then the state has + * changed and we bail since ep_timeout already aborted + * the connection. + */ + stop_ep_timer(ep); + if (state_read(&ep->com) != MPA_REQ_WAIT) + return; + + /* + * If we get more than the supported amount of private data + * then we must fail this connection. + */ + if (ep->mpa_pkt_len + skb->len > sizeof(ep->mpa_pkt)) { + abort_connection(ep, skb, GFP_KERNEL); + return; + } + + PDBG("%s enter (%s line %u)\n", __func__, __FILE__, __LINE__); + + /* + * Copy the new data into our accumulation buffer. + */ + skb_copy_from_linear_data(skb, &(ep->mpa_pkt[ep->mpa_pkt_len]), + skb->len); + ep->mpa_pkt_len += skb->len; + + /* + * If we don't even have the mpa message, then bail. + * We'll continue process when more data arrives. + */ + if (ep->mpa_pkt_len < sizeof(*mpa)) + return; + PDBG("%s enter (%s line %u)\n", __func__, __FILE__, __LINE__); + mpa = (struct mpa_message *) ep->mpa_pkt; + + /* + * Validate MPA Header. + */ + if (mpa->revision != mpa_rev) { + abort_connection(ep, skb, GFP_KERNEL); + return; + } + + if (memcmp(mpa->key, MPA_KEY_REQ, sizeof(mpa->key))) { + abort_connection(ep, skb, GFP_KERNEL); + return; + } + + plen = ntohs(mpa->private_data_size); + + /* + * Fail if there's too much private data. 
+ */ + if (plen > MPA_MAX_PRIVATE_DATA) { + abort_connection(ep, skb, GFP_KERNEL); + return; + } + + /* + * If plen does not account for pkt size + */ + if (ep->mpa_pkt_len > (sizeof(*mpa) + plen)) { + abort_connection(ep, skb, GFP_KERNEL); + return; + } + ep->plen = (u8) plen; + + /* + * If we don't have all the pdata yet, then bail. + */ + if (ep->mpa_pkt_len < (sizeof(*mpa) + plen)) + return; + + /* + * If we get here we have accumulated the entire mpa + * start reply message including private data. + */ + ep->mpa_attr.initiator = 0; + ep->mpa_attr.crc_enabled = (mpa->flags & MPA_CRC) | crc_enabled ? 1 : 0; + ep->mpa_attr.recv_marker_enabled = markers_enabled; + ep->mpa_attr.xmit_marker_enabled = mpa->flags & MPA_MARKERS ? 1 : 0; + ep->mpa_attr.version = mpa_rev; + PDBG("%s - crc_enabled=%d, recv_marker_enabled=%d, " + "xmit_marker_enabled=%d, version=%d\n", __func__, + ep->mpa_attr.crc_enabled, ep->mpa_attr.recv_marker_enabled, + ep->mpa_attr.xmit_marker_enabled, ep->mpa_attr.version); + + state_set(&ep->com, MPA_REQ_RCVD); + + /* drive upcall */ + connect_request_upcall(ep); + return; +} + +static int rx_data(struct t3cdev *tdev, struct sk_buff *skb, void *ctx) +{ + struct iwch_ep *ep = ctx; + struct cpl_rx_data *hdr = cplhdr(skb); + unsigned int dlen = ntohs(hdr->len); + + PDBG("%s ep %p dlen %u\n", __func__, ep, dlen); + + skb_pull(skb, sizeof(*hdr)); + skb_trim(skb, dlen); + + ep->rcv_seq += dlen; + BUG_ON(ep->rcv_seq != (ntohl(hdr->seq) + dlen)); + + switch (state_read(&ep->com)) { + case MPA_REQ_SENT: + process_mpa_reply(ep, skb); + break; + case MPA_REQ_WAIT: + process_mpa_request(ep, skb); + break; + case MPA_REP_SENT: + break; + default: + printk(KERN_ERR MOD "%s Unexpected streaming data." + " ep %p state %d tid %d\n", + __func__, ep, state_read(&ep->com), ep->hwtid); + + /* + * The ep will timeout and inform the ULP of the failure. + * See ep_timeout(). + */ + break; + } + + /* update RX credits */ + update_rx_credits(ep, dlen); + + return CPL_RET_BUF_DONE; +} + +/* + * Upcall from the adapter indicating data has been transmitted. + * For us its just the single MPA request or reply. We can now free + * the skb holding the mpa message. 
+ */ +static int tx_ack(struct t3cdev *tdev, struct sk_buff *skb, void *ctx) +{ + struct iwch_ep *ep = ctx; + struct cpl_wr_ack *hdr = cplhdr(skb); + unsigned int credits = ntohs(hdr->credits); + unsigned long flags; + int post_zb = 0; + + PDBG("%s ep %p credits %u\n", __func__, ep, credits); + + if (credits == 0) { + PDBG("%s 0 credit ack ep %p state %u\n", + __func__, ep, state_read(&ep->com)); + return CPL_RET_BUF_DONE; + } + + spin_lock_irqsave(&ep->com.lock, flags); + BUG_ON(credits != 1); + dst_confirm(ep->dst); + if (!ep->mpa_skb) { + PDBG("%s rdma_init wr_ack ep %p state %u\n", + __func__, ep, ep->com.state); + if (ep->mpa_attr.initiator) { + PDBG("%s initiator ep %p state %u\n", + __func__, ep, ep->com.state); + if (peer2peer && ep->com.state == FPDU_MODE) + post_zb = 1; + } else { + PDBG("%s responder ep %p state %u\n", + __func__, ep, ep->com.state); + if (ep->com.state == MPA_REQ_RCVD) { + ep->com.rpl_done = 1; + wake_up(&ep->com.waitq); + } + } + } else { + PDBG("%s lsm ack ep %p state %u freeing skb\n", + __func__, ep, ep->com.state); + kfree_skb(ep->mpa_skb); + ep->mpa_skb = NULL; + } + spin_unlock_irqrestore(&ep->com.lock, flags); + if (post_zb) + iwch_post_zb_read(ep); + return CPL_RET_BUF_DONE; +} + +static int abort_rpl(struct t3cdev *tdev, struct sk_buff *skb, void *ctx) +{ + struct iwch_ep *ep = ctx; + unsigned long flags; + int release = 0; + + PDBG("%s ep %p\n", __func__, ep); + BUG_ON(!ep); + + /* + * We get 2 abort replies from the HW. The first one must + * be ignored except for scribbling that we need one more. + */ + if (!test_and_set_bit(ABORT_REQ_IN_PROGRESS, &ep->com.flags)) { + return CPL_RET_BUF_DONE; + } + + spin_lock_irqsave(&ep->com.lock, flags); + switch (ep->com.state) { + case ABORTING: + close_complete_upcall(ep); + __state_set(&ep->com, DEAD); + release = 1; + break; + default: + printk(KERN_ERR "%s ep %p state %d\n", + __func__, ep, ep->com.state); + break; + } + spin_unlock_irqrestore(&ep->com.lock, flags); + + if (release) + release_ep_resources(ep); + return CPL_RET_BUF_DONE; +} + +/* + * Return whether a failed active open has allocated a TID + */ +static inline int act_open_has_tid(int status) +{ + return status != CPL_ERR_TCAM_FULL && status != CPL_ERR_CONN_EXIST && + status != CPL_ERR_ARP_MISS; +} + +static int act_open_rpl(struct t3cdev *tdev, struct sk_buff *skb, void *ctx) +{ + struct iwch_ep *ep = ctx; + struct cpl_act_open_rpl *rpl = cplhdr(skb); + + PDBG("%s ep %p status %u errno %d\n", __func__, ep, rpl->status, + status2errno(rpl->status)); + connect_reply_upcall(ep, status2errno(rpl->status)); + state_set(&ep->com, DEAD); + if (ep->com.tdev->type != T3A && act_open_has_tid(rpl->status)) + release_tid(ep->com.tdev, GET_TID(rpl), NULL); + cxgb3_free_atid(ep->com.tdev, ep->atid); + dst_release(ep->dst); + l2t_release(ep->com.tdev, ep->l2t); + put_ep(&ep->com); + return CPL_RET_BUF_DONE; +} + +static int listen_start(struct iwch_listen_ep *ep) +{ + struct sk_buff *skb; + struct cpl_pass_open_req *req; + + PDBG("%s ep %p\n", __func__, ep); + skb = get_skb(NULL, sizeof(*req), GFP_KERNEL); + if (!skb) { + printk(KERN_ERR MOD "t3c_listen_start failed to alloc skb!\n"); + return -ENOMEM; + } + + req = (struct cpl_pass_open_req *) skb_put(skb, sizeof(*req)); + req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD)); + OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_PASS_OPEN_REQ, ep->stid)); + req->local_port = ep->com.local_addr.sin_port; + req->local_ip = ep->com.local_addr.sin_addr.s_addr; + req->peer_port = 0; + req->peer_ip = 0; + 
req->peer_netmask = 0; + req->opt0h = htonl(F_DELACK | F_TCAM_BYPASS); + req->opt0l = htonl(V_RCV_BUFSIZ(rcv_win>>10)); + req->opt1 = htonl(V_CONN_POLICY(CPL_CONN_POLICY_ASK)); + + skb->priority = 1; + return iwch_cxgb3_ofld_send(ep->com.tdev, skb); +} + +static int pass_open_rpl(struct t3cdev *tdev, struct sk_buff *skb, void *ctx) +{ + struct iwch_listen_ep *ep = ctx; + struct cpl_pass_open_rpl *rpl = cplhdr(skb); + + PDBG("%s ep %p status %d error %d\n", __func__, ep, + rpl->status, status2errno(rpl->status)); + ep->com.rpl_err = status2errno(rpl->status); + ep->com.rpl_done = 1; + wake_up(&ep->com.waitq); + + return CPL_RET_BUF_DONE; +} + +static int listen_stop(struct iwch_listen_ep *ep) +{ + struct sk_buff *skb; + struct cpl_close_listserv_req *req; + + PDBG("%s ep %p\n", __func__, ep); + skb = get_skb(NULL, sizeof(*req), GFP_KERNEL); + if (!skb) { + printk(KERN_ERR MOD "%s - failed to alloc skb\n", __func__); + return -ENOMEM; + } + req = (struct cpl_close_listserv_req *) skb_put(skb, sizeof(*req)); + req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD)); + req->cpu_idx = 0; + OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_CLOSE_LISTSRV_REQ, ep->stid)); + skb->priority = 1; + return iwch_cxgb3_ofld_send(ep->com.tdev, skb); +} + +static int close_listsrv_rpl(struct t3cdev *tdev, struct sk_buff *skb, + void *ctx) +{ + struct iwch_listen_ep *ep = ctx; + struct cpl_close_listserv_rpl *rpl = cplhdr(skb); + + PDBG("%s ep %p\n", __func__, ep); + ep->com.rpl_err = status2errno(rpl->status); + ep->com.rpl_done = 1; + wake_up(&ep->com.waitq); + return CPL_RET_BUF_DONE; +} + +static void accept_cr(struct iwch_ep *ep, __be32 peer_ip, struct sk_buff *skb) +{ + struct cpl_pass_accept_rpl *rpl; + unsigned int mtu_idx; + u32 opt0h, opt0l, opt2; + int wscale; + + PDBG("%s ep %p\n", __func__, ep); + BUG_ON(skb_cloned(skb)); + skb_trim(skb, sizeof(*rpl)); + skb_get(skb); + mtu_idx = find_best_mtu(T3C_DATA(ep->com.tdev), dst_mtu(ep->dst)); + wscale = compute_wscale(rcv_win); + opt0h = V_NAGLE(0) | + V_NO_CONG(nocong) | + V_KEEP_ALIVE(1) | + F_TCAM_BYPASS | + V_WND_SCALE(wscale) | + V_MSS_IDX(mtu_idx) | + V_L2T_IDX(ep->l2t->idx) | V_TX_CHANNEL(ep->l2t->smt_idx); + opt0l = V_TOS((ep->tos >> 2) & M_TOS) | V_RCV_BUFSIZ(rcv_win>>10); + opt2 = F_RX_COALESCE_VALID | V_RX_COALESCE(0) | V_FLAVORS_VALID(1) | + V_CONG_CONTROL_FLAVOR(cong_flavor); + + rpl = cplhdr(skb); + rpl->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD)); + OPCODE_TID(rpl) = htonl(MK_OPCODE_TID(CPL_PASS_ACCEPT_RPL, ep->hwtid)); + rpl->peer_ip = peer_ip; + rpl->opt0h = htonl(opt0h); + rpl->opt0l_status = htonl(opt0l | CPL_PASS_OPEN_ACCEPT); + rpl->opt2 = htonl(opt2); + rpl->rsvd = rpl->opt2; /* workaround for HW bug */ + skb->priority = CPL_PRIORITY_SETUP; + iwch_l2t_send(ep->com.tdev, skb, ep->l2t); + + return; +} + +static void reject_cr(struct t3cdev *tdev, u32 hwtid, __be32 peer_ip, + struct sk_buff *skb) +{ + PDBG("%s t3cdev %p tid %u peer_ip %x\n", __func__, tdev, hwtid, + peer_ip); + BUG_ON(skb_cloned(skb)); + skb_trim(skb, sizeof(struct cpl_tid_release)); + skb_get(skb); + + if (tdev->type != T3A) + release_tid(tdev, hwtid, skb); + else { + struct cpl_pass_accept_rpl *rpl; + + rpl = cplhdr(skb); + skb->priority = CPL_PRIORITY_SETUP; + rpl->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD)); + OPCODE_TID(rpl) = htonl(MK_OPCODE_TID(CPL_PASS_ACCEPT_RPL, + hwtid)); + rpl->peer_ip = peer_ip; + rpl->opt0h = htonl(F_TCAM_BYPASS); + rpl->opt0l_status = htonl(CPL_PASS_OPEN_REJECT); + rpl->opt2 = 0; + rpl->rsvd = rpl->opt2; + iwch_cxgb3_ofld_send(tdev, skb); + 
} +} + +static int pass_accept_req(struct t3cdev *tdev, struct sk_buff *skb, void *ctx) +{ + struct iwch_ep *child_ep, *parent_ep = ctx; + struct cpl_pass_accept_req *req = cplhdr(skb); + unsigned int hwtid = GET_TID(req); + struct dst_entry *dst; + struct l2t_entry *l2t; + struct rtable *rt; + struct iff_mac tim; + + PDBG("%s parent ep %p tid %u\n", __func__, parent_ep, hwtid); + + if (state_read(&parent_ep->com) != LISTEN) { + printk(KERN_ERR "%s - listening ep not in LISTEN\n", + __func__); + goto reject; + } + + /* + * Find the netdev for this connection request. + */ + tim.mac_addr = req->dst_mac; + tim.vlan_tag = ntohs(req->vlan_tag); + if (tdev->ctl(tdev, GET_IFF_FROM_MAC, &tim) < 0 || !tim.dev) { + printk(KERN_ERR "%s bad dst mac %pM\n", + __func__, req->dst_mac); + goto reject; + } + + /* Find output route */ + rt = find_route(tdev, + req->local_ip, + req->peer_ip, + req->local_port, + req->peer_port, G_PASS_OPEN_TOS(ntohl(req->tos_tid))); + if (!rt) { + printk(KERN_ERR MOD "%s - failed to find dst entry!\n", + __func__); + goto reject; + } + dst = &rt->dst; + l2t = t3_l2t_get(tdev, dst, NULL, &req->peer_ip); + if (!l2t) { + printk(KERN_ERR MOD "%s - failed to allocate l2t entry!\n", + __func__); + dst_release(dst); + goto reject; + } + child_ep = alloc_ep(sizeof(*child_ep), GFP_KERNEL); + if (!child_ep) { + printk(KERN_ERR MOD "%s - failed to allocate ep entry!\n", + __func__); + l2t_release(tdev, l2t); + dst_release(dst); + goto reject; + } + state_set(&child_ep->com, CONNECTING); + child_ep->com.tdev = tdev; + child_ep->com.cm_id = NULL; + child_ep->com.local_addr.sin_family = PF_INET; + child_ep->com.local_addr.sin_port = req->local_port; + child_ep->com.local_addr.sin_addr.s_addr = req->local_ip; + child_ep->com.remote_addr.sin_family = PF_INET; + child_ep->com.remote_addr.sin_port = req->peer_port; + child_ep->com.remote_addr.sin_addr.s_addr = req->peer_ip; + get_ep(&parent_ep->com); + child_ep->parent_ep = parent_ep; + child_ep->tos = G_PASS_OPEN_TOS(ntohl(req->tos_tid)); + child_ep->l2t = l2t; + child_ep->dst = dst; + child_ep->hwtid = hwtid; + init_timer(&child_ep->timer); + cxgb3_insert_tid(tdev, &t3c_client, child_ep, hwtid); + accept_cr(child_ep, req->peer_ip, skb); + goto out; +reject: + reject_cr(tdev, hwtid, req->peer_ip, skb); +out: + return CPL_RET_BUF_DONE; +} + +static int pass_establish(struct t3cdev *tdev, struct sk_buff *skb, void *ctx) +{ + struct iwch_ep *ep = ctx; + struct cpl_pass_establish *req = cplhdr(skb); + + PDBG("%s ep %p\n", __func__, ep); + ep->snd_seq = ntohl(req->snd_isn); + ep->rcv_seq = ntohl(req->rcv_isn); + + set_emss(ep, ntohs(req->tcp_opt)); + + dst_confirm(ep->dst); + state_set(&ep->com, MPA_REQ_WAIT); + start_ep_timer(ep); + + return CPL_RET_BUF_DONE; +} + +static int peer_close(struct t3cdev *tdev, struct sk_buff *skb, void *ctx) +{ + struct iwch_ep *ep = ctx; + struct iwch_qp_attributes attrs; + unsigned long flags; + int disconnect = 1; + int release = 0; + + PDBG("%s ep %p\n", __func__, ep); + dst_confirm(ep->dst); + + spin_lock_irqsave(&ep->com.lock, flags); + switch (ep->com.state) { + case MPA_REQ_WAIT: + __state_set(&ep->com, CLOSING); + break; + case MPA_REQ_SENT: + __state_set(&ep->com, CLOSING); + connect_reply_upcall(ep, -ECONNRESET); + break; + case MPA_REQ_RCVD: + + /* + * We're gonna mark this puppy DEAD, but keep + * the reference on it until the ULP accepts or + * rejects the CR. Also wake up anyone waiting + * in rdma connection migration (see iwch_accept_cr()). 
+ */ + __state_set(&ep->com, CLOSING); + ep->com.rpl_done = 1; + ep->com.rpl_err = -ECONNRESET; + PDBG("waking up ep %p\n", ep); + wake_up(&ep->com.waitq); + break; + case MPA_REP_SENT: + __state_set(&ep->com, CLOSING); + ep->com.rpl_done = 1; + ep->com.rpl_err = -ECONNRESET; + PDBG("waking up ep %p\n", ep); + wake_up(&ep->com.waitq); + break; + case FPDU_MODE: + start_ep_timer(ep); + __state_set(&ep->com, CLOSING); + attrs.next_state = IWCH_QP_STATE_CLOSING; + iwch_modify_qp(ep->com.qp->rhp, ep->com.qp, + IWCH_QP_ATTR_NEXT_STATE, &attrs, 1); + peer_close_upcall(ep); + break; + case ABORTING: + disconnect = 0; + break; + case CLOSING: + __state_set(&ep->com, MORIBUND); + disconnect = 0; + break; + case MORIBUND: + stop_ep_timer(ep); + if (ep->com.cm_id && ep->com.qp) { + attrs.next_state = IWCH_QP_STATE_IDLE; + iwch_modify_qp(ep->com.qp->rhp, ep->com.qp, + IWCH_QP_ATTR_NEXT_STATE, &attrs, 1); + } + close_complete_upcall(ep); + __state_set(&ep->com, DEAD); + release = 1; + disconnect = 0; + break; + case DEAD: + disconnect = 0; + break; + default: + BUG_ON(1); + } + spin_unlock_irqrestore(&ep->com.lock, flags); + if (disconnect) + iwch_ep_disconnect(ep, 0, GFP_KERNEL); + if (release) + release_ep_resources(ep); + return CPL_RET_BUF_DONE; +} + +/* + * Returns whether an ABORT_REQ_RSS message is a negative advice. + */ +static int is_neg_adv_abort(unsigned int status) +{ + return status == CPL_ERR_RTX_NEG_ADVICE || + status == CPL_ERR_PERSIST_NEG_ADVICE; +} + +static int peer_abort(struct t3cdev *tdev, struct sk_buff *skb, void *ctx) +{ + struct cpl_abort_req_rss *req = cplhdr(skb); + struct iwch_ep *ep = ctx; + struct cpl_abort_rpl *rpl; + struct sk_buff *rpl_skb; + struct iwch_qp_attributes attrs; + int ret; + int release = 0; + unsigned long flags; + + if (is_neg_adv_abort(req->status)) { + PDBG("%s neg_adv_abort ep %p tid %d\n", __func__, ep, + ep->hwtid); + t3_l2t_send_event(ep->com.tdev, ep->l2t); + return CPL_RET_BUF_DONE; + } + + /* + * We get 2 peer aborts from the HW. The first one must + * be ignored except for scribbling that we need one more. + */ + if (!test_and_set_bit(PEER_ABORT_IN_PROGRESS, &ep->com.flags)) { + return CPL_RET_BUF_DONE; + } + + spin_lock_irqsave(&ep->com.lock, flags); + PDBG("%s ep %p state %u\n", __func__, ep, ep->com.state); + switch (ep->com.state) { + case CONNECTING: + break; + case MPA_REQ_WAIT: + stop_ep_timer(ep); + break; + case MPA_REQ_SENT: + stop_ep_timer(ep); + connect_reply_upcall(ep, -ECONNRESET); + break; + case MPA_REP_SENT: + ep->com.rpl_done = 1; + ep->com.rpl_err = -ECONNRESET; + PDBG("waking up ep %p\n", ep); + wake_up(&ep->com.waitq); + break; + case MPA_REQ_RCVD: + + /* + * We're gonna mark this puppy DEAD, but keep + * the reference on it until the ULP accepts or + * rejects the CR. Also wake up anyone waiting + * in rdma connection migration (see iwch_accept_cr()). 
+ */ + ep->com.rpl_done = 1; + ep->com.rpl_err = -ECONNRESET; + PDBG("waking up ep %p\n", ep); + wake_up(&ep->com.waitq); + break; + case MORIBUND: + case CLOSING: + stop_ep_timer(ep); + /*FALLTHROUGH*/ + case FPDU_MODE: + if (ep->com.cm_id && ep->com.qp) { + attrs.next_state = IWCH_QP_STATE_ERROR; + ret = iwch_modify_qp(ep->com.qp->rhp, + ep->com.qp, IWCH_QP_ATTR_NEXT_STATE, + &attrs, 1); + if (ret) + printk(KERN_ERR MOD + "%s - qp <- error failed!\n", + __func__); + } + peer_abort_upcall(ep); + break; + case ABORTING: + break; + case DEAD: + PDBG("%s PEER_ABORT IN DEAD STATE!!!!\n", __func__); + spin_unlock_irqrestore(&ep->com.lock, flags); + return CPL_RET_BUF_DONE; + default: + BUG_ON(1); + break; + } + dst_confirm(ep->dst); + if (ep->com.state != ABORTING) { + __state_set(&ep->com, DEAD); + release = 1; + } + spin_unlock_irqrestore(&ep->com.lock, flags); + + rpl_skb = get_skb(skb, sizeof(*rpl), GFP_KERNEL); + if (!rpl_skb) { + printk(KERN_ERR MOD "%s - cannot allocate skb!\n", + __func__); + release = 1; + goto out; + } + rpl_skb->priority = CPL_PRIORITY_DATA; + rpl = (struct cpl_abort_rpl *) skb_put(rpl_skb, sizeof(*rpl)); + rpl->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_OFLD_HOST_ABORT_CON_RPL)); + rpl->wr.wr_lo = htonl(V_WR_TID(ep->hwtid)); + OPCODE_TID(rpl) = htonl(MK_OPCODE_TID(CPL_ABORT_RPL, ep->hwtid)); + rpl->cmd = CPL_ABORT_NO_RST; + iwch_cxgb3_ofld_send(ep->com.tdev, rpl_skb); +out: + if (release) + release_ep_resources(ep); + return CPL_RET_BUF_DONE; +} + +static int close_con_rpl(struct t3cdev *tdev, struct sk_buff *skb, void *ctx) +{ + struct iwch_ep *ep = ctx; + struct iwch_qp_attributes attrs; + unsigned long flags; + int release = 0; + + PDBG("%s ep %p\n", __func__, ep); + BUG_ON(!ep); + + /* The cm_id may be null if we failed to connect */ + spin_lock_irqsave(&ep->com.lock, flags); + switch (ep->com.state) { + case CLOSING: + __state_set(&ep->com, MORIBUND); + break; + case MORIBUND: + stop_ep_timer(ep); + if ((ep->com.cm_id) && (ep->com.qp)) { + attrs.next_state = IWCH_QP_STATE_IDLE; + iwch_modify_qp(ep->com.qp->rhp, + ep->com.qp, + IWCH_QP_ATTR_NEXT_STATE, + &attrs, 1); + } + close_complete_upcall(ep); + __state_set(&ep->com, DEAD); + release = 1; + break; + case ABORTING: + case DEAD: + break; + default: + BUG_ON(1); + break; + } + spin_unlock_irqrestore(&ep->com.lock, flags); + if (release) + release_ep_resources(ep); + return CPL_RET_BUF_DONE; +} + +/* + * T3A does 3 things when a TERM is received: + * 1) send up a CPL_RDMA_TERMINATE message with the TERM packet + * 2) generate an async event on the QP with the TERMINATE opcode + * 3) post a TERMINATE opcode cqe into the associated CQ. + * + * For (1), we save the message in the qp for later consumer consumption. + * For (2), we move the QP into TERMINATE, post a QP event and disconnect. + * For (3), we toss the CQE in cxio_poll_cq(). + * + * terminate() handles case (1)... 
+ */ +static int terminate(struct t3cdev *tdev, struct sk_buff *skb, void *ctx) +{ + struct iwch_ep *ep = ctx; + + if (state_read(&ep->com) != FPDU_MODE) + return CPL_RET_BUF_DONE; + + PDBG("%s ep %p\n", __func__, ep); + skb_pull(skb, sizeof(struct cpl_rdma_terminate)); + PDBG("%s saving %d bytes of term msg\n", __func__, skb->len); + skb_copy_from_linear_data(skb, ep->com.qp->attr.terminate_buffer, + skb->len); + ep->com.qp->attr.terminate_msg_len = skb->len; + ep->com.qp->attr.is_terminate_local = 0; + return CPL_RET_BUF_DONE; +} + +static int ec_status(struct t3cdev *tdev, struct sk_buff *skb, void *ctx) +{ + struct cpl_rdma_ec_status *rep = cplhdr(skb); + struct iwch_ep *ep = ctx; + + PDBG("%s ep %p tid %u status %d\n", __func__, ep, ep->hwtid, + rep->status); + if (rep->status) { + struct iwch_qp_attributes attrs; + + printk(KERN_ERR MOD "%s BAD CLOSE - Aborting tid %u\n", + __func__, ep->hwtid); + stop_ep_timer(ep); + attrs.next_state = IWCH_QP_STATE_ERROR; + iwch_modify_qp(ep->com.qp->rhp, + ep->com.qp, IWCH_QP_ATTR_NEXT_STATE, + &attrs, 1); + abort_connection(ep, NULL, GFP_KERNEL); + } + return CPL_RET_BUF_DONE; +} + +static void ep_timeout(unsigned long arg) +{ + struct iwch_ep *ep = (struct iwch_ep *)arg; + struct iwch_qp_attributes attrs; + unsigned long flags; + int abort = 1; + + spin_lock_irqsave(&ep->com.lock, flags); + PDBG("%s ep %p tid %u state %d\n", __func__, ep, ep->hwtid, + ep->com.state); + switch (ep->com.state) { + case MPA_REQ_SENT: + __state_set(&ep->com, ABORTING); + connect_reply_upcall(ep, -ETIMEDOUT); + break; + case MPA_REQ_WAIT: + __state_set(&ep->com, ABORTING); + break; + case CLOSING: + case MORIBUND: + if (ep->com.cm_id && ep->com.qp) { + attrs.next_state = IWCH_QP_STATE_ERROR; + iwch_modify_qp(ep->com.qp->rhp, + ep->com.qp, IWCH_QP_ATTR_NEXT_STATE, + &attrs, 1); + } + __state_set(&ep->com, ABORTING); + break; + default: + WARN(1, "%s unexpected state ep %p state %u\n", + __func__, ep, ep->com.state); + abort = 0; + } + spin_unlock_irqrestore(&ep->com.lock, flags); + if (abort) + abort_connection(ep, NULL, GFP_ATOMIC); + put_ep(&ep->com); +} + +int iwch_reject_cr(struct iw_cm_id *cm_id, const void *pdata, u8 pdata_len) +{ + int err; + struct iwch_ep *ep = to_ep(cm_id); + PDBG("%s ep %p tid %u\n", __func__, ep, ep->hwtid); + + if (state_read(&ep->com) == DEAD) { + put_ep(&ep->com); + return -ECONNRESET; + } + BUG_ON(state_read(&ep->com) != MPA_REQ_RCVD); + if (mpa_rev == 0) + abort_connection(ep, NULL, GFP_KERNEL); + else { + err = send_mpa_reject(ep, pdata, pdata_len); + err = iwch_ep_disconnect(ep, 0, GFP_KERNEL); + } + put_ep(&ep->com); + return 0; +} + +int iwch_accept_cr(struct iw_cm_id *cm_id, struct iw_cm_conn_param *conn_param) +{ + int err; + struct iwch_qp_attributes attrs; + enum iwch_qp_attr_mask mask; + struct iwch_ep *ep = to_ep(cm_id); + struct iwch_dev *h = to_iwch_dev(cm_id->device); + struct iwch_qp *qp = get_qhp(h, conn_param->qpn); + + PDBG("%s ep %p tid %u\n", __func__, ep, ep->hwtid); + if (state_read(&ep->com) == DEAD) { + err = -ECONNRESET; + goto err; + } + + BUG_ON(state_read(&ep->com) != MPA_REQ_RCVD); + BUG_ON(!qp); + + if ((conn_param->ord > qp->rhp->attr.max_rdma_read_qp_depth) || + (conn_param->ird > qp->rhp->attr.max_rdma_reads_per_qp)) { + abort_connection(ep, NULL, GFP_KERNEL); + err = -EINVAL; + goto err; + } + + cm_id->add_ref(cm_id); + ep->com.cm_id = cm_id; + ep->com.qp = qp; + + ep->ird = conn_param->ird; + ep->ord = conn_param->ord; + + if (peer2peer && ep->ird == 0) + ep->ird = 1; + + PDBG("%s %d ird %d ord %d\n", 
__func__, __LINE__, ep->ird, ep->ord); + + /* bind QP to EP and move to RTS */ + attrs.mpa_attr = ep->mpa_attr; + attrs.max_ird = ep->ird; + attrs.max_ord = ep->ord; + attrs.llp_stream_handle = ep; + attrs.next_state = IWCH_QP_STATE_RTS; + + /* bind QP and TID with INIT_WR */ + mask = IWCH_QP_ATTR_NEXT_STATE | + IWCH_QP_ATTR_LLP_STREAM_HANDLE | + IWCH_QP_ATTR_MPA_ATTR | + IWCH_QP_ATTR_MAX_IRD | + IWCH_QP_ATTR_MAX_ORD; + + err = iwch_modify_qp(ep->com.qp->rhp, + ep->com.qp, mask, &attrs, 1); + if (err) + goto err1; + + /* if needed, wait for wr_ack */ + if (iwch_rqes_posted(qp)) { + wait_event(ep->com.waitq, ep->com.rpl_done); + err = ep->com.rpl_err; + if (err) + goto err1; + } + + err = send_mpa_reply(ep, conn_param->private_data, + conn_param->private_data_len); + if (err) + goto err1; + + + state_set(&ep->com, FPDU_MODE); + established_upcall(ep); + put_ep(&ep->com); + return 0; +err1: + ep->com.cm_id = NULL; + ep->com.qp = NULL; + cm_id->rem_ref(cm_id); +err: + put_ep(&ep->com); + return err; +} + +static int is_loopback_dst(struct iw_cm_id *cm_id) +{ + struct net_device *dev; + struct sockaddr_in *raddr = (struct sockaddr_in *)&cm_id->remote_addr; + + dev = ip_dev_find(&init_net, raddr->sin_addr.s_addr); + if (!dev) + return 0; + dev_put(dev); + return 1; +} + +int iwch_connect(struct iw_cm_id *cm_id, struct iw_cm_conn_param *conn_param) +{ + struct iwch_dev *h = to_iwch_dev(cm_id->device); + struct iwch_ep *ep; + struct rtable *rt; + int err = 0; + struct sockaddr_in *laddr = (struct sockaddr_in *)&cm_id->local_addr; + struct sockaddr_in *raddr = (struct sockaddr_in *)&cm_id->remote_addr; + + if (cm_id->remote_addr.ss_family != PF_INET) { + err = -ENOSYS; + goto out; + } + + if (is_loopback_dst(cm_id)) { + err = -ENOSYS; + goto out; + } + + ep = alloc_ep(sizeof(*ep), GFP_KERNEL); + if (!ep) { + printk(KERN_ERR MOD "%s - cannot alloc ep.\n", __func__); + err = -ENOMEM; + goto out; + } + init_timer(&ep->timer); + ep->plen = conn_param->private_data_len; + if (ep->plen) + memcpy(ep->mpa_pkt + sizeof(struct mpa_message), + conn_param->private_data, ep->plen); + ep->ird = conn_param->ird; + ep->ord = conn_param->ord; + + if (peer2peer && ep->ord == 0) + ep->ord = 1; + + ep->com.tdev = h->rdev.t3cdev_p; + + cm_id->add_ref(cm_id); + ep->com.cm_id = cm_id; + ep->com.qp = get_qhp(h, conn_param->qpn); + BUG_ON(!ep->com.qp); + PDBG("%s qpn 0x%x qp %p cm_id %p\n", __func__, conn_param->qpn, + ep->com.qp, cm_id); + + /* + * Allocate an active TID to initiate a TCP connection. 
+ */ + ep->atid = cxgb3_alloc_atid(h->rdev.t3cdev_p, &t3c_client, ep); + if (ep->atid == -1) { + printk(KERN_ERR MOD "%s - cannot alloc atid.\n", __func__); + err = -ENOMEM; + goto fail2; + } + + /* find a route */ + rt = find_route(h->rdev.t3cdev_p, laddr->sin_addr.s_addr, + raddr->sin_addr.s_addr, laddr->sin_port, + raddr->sin_port, IPTOS_LOWDELAY); + if (!rt) { + printk(KERN_ERR MOD "%s - cannot find route.\n", __func__); + err = -EHOSTUNREACH; + goto fail3; + } + ep->dst = &rt->dst; + ep->l2t = t3_l2t_get(ep->com.tdev, ep->dst, NULL, + &raddr->sin_addr.s_addr); + if (!ep->l2t) { + printk(KERN_ERR MOD "%s - cannot alloc l2e.\n", __func__); + err = -ENOMEM; + goto fail4; + } + + state_set(&ep->com, CONNECTING); + ep->tos = IPTOS_LOWDELAY; + memcpy(&ep->com.local_addr, &cm_id->local_addr, + sizeof(ep->com.local_addr)); + memcpy(&ep->com.remote_addr, &cm_id->remote_addr, + sizeof(ep->com.remote_addr)); + + /* send connect request to rnic */ + err = send_connect(ep); + if (!err) + goto out; + + l2t_release(h->rdev.t3cdev_p, ep->l2t); +fail4: + dst_release(ep->dst); +fail3: + cxgb3_free_atid(ep->com.tdev, ep->atid); +fail2: + cm_id->rem_ref(cm_id); + put_ep(&ep->com); +out: + return err; +} + +int iwch_create_listen(struct iw_cm_id *cm_id, int backlog) +{ + int err = 0; + struct iwch_dev *h = to_iwch_dev(cm_id->device); + struct iwch_listen_ep *ep; + + + might_sleep(); + + if (cm_id->local_addr.ss_family != PF_INET) { + err = -ENOSYS; + goto fail1; + } + + ep = alloc_ep(sizeof(*ep), GFP_KERNEL); + if (!ep) { + printk(KERN_ERR MOD "%s - cannot alloc ep.\n", __func__); + err = -ENOMEM; + goto fail1; + } + PDBG("%s ep %p\n", __func__, ep); + ep->com.tdev = h->rdev.t3cdev_p; + cm_id->add_ref(cm_id); + ep->com.cm_id = cm_id; + ep->backlog = backlog; + memcpy(&ep->com.local_addr, &cm_id->local_addr, + sizeof(ep->com.local_addr)); + + /* + * Allocate a server TID. 
+ */ + ep->stid = cxgb3_alloc_stid(h->rdev.t3cdev_p, &t3c_client, ep); + if (ep->stid == -1) { + printk(KERN_ERR MOD "%s - cannot alloc atid.\n", __func__); + err = -ENOMEM; + goto fail2; + } + + state_set(&ep->com, LISTEN); + err = listen_start(ep); + if (err) + goto fail3; + + /* wait for pass_open_rpl */ + wait_event(ep->com.waitq, ep->com.rpl_done); + err = ep->com.rpl_err; + if (!err) { + cm_id->provider_data = ep; + goto out; + } +fail3: + cxgb3_free_stid(ep->com.tdev, ep->stid); +fail2: + cm_id->rem_ref(cm_id); + put_ep(&ep->com); +fail1: +out: + return err; +} + +int iwch_destroy_listen(struct iw_cm_id *cm_id) +{ + int err; + struct iwch_listen_ep *ep = to_listen_ep(cm_id); + + PDBG("%s ep %p\n", __func__, ep); + + might_sleep(); + state_set(&ep->com, DEAD); + ep->com.rpl_done = 0; + ep->com.rpl_err = 0; + err = listen_stop(ep); + if (err) + goto done; + wait_event(ep->com.waitq, ep->com.rpl_done); + cxgb3_free_stid(ep->com.tdev, ep->stid); +done: + err = ep->com.rpl_err; + cm_id->rem_ref(cm_id); + put_ep(&ep->com); + return err; +} + +int iwch_ep_disconnect(struct iwch_ep *ep, int abrupt, gfp_t gfp) +{ + int ret=0; + unsigned long flags; + int close = 0; + int fatal = 0; + struct t3cdev *tdev; + struct cxio_rdev *rdev; + + spin_lock_irqsave(&ep->com.lock, flags); + + PDBG("%s ep %p state %s, abrupt %d\n", __func__, ep, + states[ep->com.state], abrupt); + + tdev = (struct t3cdev *)ep->com.tdev; + rdev = (struct cxio_rdev *)tdev->ulp; + if (cxio_fatal_error(rdev)) { + fatal = 1; + close_complete_upcall(ep); + ep->com.state = DEAD; + } + switch (ep->com.state) { + case MPA_REQ_WAIT: + case MPA_REQ_SENT: + case MPA_REQ_RCVD: + case MPA_REP_SENT: + case FPDU_MODE: + close = 1; + if (abrupt) + ep->com.state = ABORTING; + else { + ep->com.state = CLOSING; + start_ep_timer(ep); + } + set_bit(CLOSE_SENT, &ep->com.flags); + break; + case CLOSING: + if (!test_and_set_bit(CLOSE_SENT, &ep->com.flags)) { + close = 1; + if (abrupt) { + stop_ep_timer(ep); + ep->com.state = ABORTING; + } else + ep->com.state = MORIBUND; + } + break; + case MORIBUND: + case ABORTING: + case DEAD: + PDBG("%s ignoring disconnect ep %p state %u\n", + __func__, ep, ep->com.state); + break; + default: + BUG(); + break; + } + + spin_unlock_irqrestore(&ep->com.lock, flags); + if (close) { + if (abrupt) + ret = send_abort(ep, NULL, gfp); + else + ret = send_halfclose(ep, gfp); + if (ret) + fatal = 1; + } + if (fatal) + release_ep_resources(ep); + return ret; +} + +int iwch_ep_redirect(void *ctx, struct dst_entry *old, struct dst_entry *new, + struct l2t_entry *l2t) +{ + struct iwch_ep *ep = ctx; + + if (ep->dst != old) + return 0; + + PDBG("%s ep %p redirect to dst %p l2t %p\n", __func__, ep, new, + l2t); + dst_hold(new); + l2t_release(ep->com.tdev, ep->l2t); + ep->l2t = l2t; + dst_release(old); + ep->dst = new; + return 1; +} + +/* + * All the CM events are handled on a work queue to have a safe context. + * These are the real handlers that are called from the work queue. 
+ */ +static const cxgb3_cpl_handler_func work_handlers[NUM_CPL_CMDS] = { + [CPL_ACT_ESTABLISH] = act_establish, + [CPL_ACT_OPEN_RPL] = act_open_rpl, + [CPL_RX_DATA] = rx_data, + [CPL_TX_DMA_ACK] = tx_ack, + [CPL_ABORT_RPL_RSS] = abort_rpl, + [CPL_ABORT_RPL] = abort_rpl, + [CPL_PASS_OPEN_RPL] = pass_open_rpl, + [CPL_CLOSE_LISTSRV_RPL] = close_listsrv_rpl, + [CPL_PASS_ACCEPT_REQ] = pass_accept_req, + [CPL_PASS_ESTABLISH] = pass_establish, + [CPL_PEER_CLOSE] = peer_close, + [CPL_ABORT_REQ_RSS] = peer_abort, + [CPL_CLOSE_CON_RPL] = close_con_rpl, + [CPL_RDMA_TERMINATE] = terminate, + [CPL_RDMA_EC_STATUS] = ec_status, +}; + +static void process_work(struct work_struct *work) +{ + struct sk_buff *skb = NULL; + void *ep; + struct t3cdev *tdev; + int ret; + + while ((skb = skb_dequeue(&rxq))) { + ep = *((void **) (skb->cb)); + tdev = *((struct t3cdev **) (skb->cb + sizeof(void *))); + ret = work_handlers[G_OPCODE(ntohl((__force __be32)skb->csum))](tdev, skb, ep); + if (ret & CPL_RET_BUF_DONE) + kfree_skb(skb); + + /* + * ep was referenced in sched(), and is freed here. + */ + put_ep((struct iwch_ep_common *)ep); + } +} + +static DECLARE_WORK(skb_work, process_work); + +static int sched(struct t3cdev *tdev, struct sk_buff *skb, void *ctx) +{ + struct iwch_ep_common *epc = ctx; + + get_ep(epc); + + /* + * Save ctx and tdev in the skb->cb area. + */ + *((void **) skb->cb) = ctx; + *((struct t3cdev **) (skb->cb + sizeof(void *))) = tdev; + + /* + * Queue the skb and schedule the worker thread. + */ + skb_queue_tail(&rxq, skb); + queue_work(workq, &skb_work); + return 0; +} + +static int set_tcb_rpl(struct t3cdev *tdev, struct sk_buff *skb, void *ctx) +{ + struct cpl_set_tcb_rpl *rpl = cplhdr(skb); + + if (rpl->status != CPL_ERR_NONE) { + printk(KERN_ERR MOD "Unexpected SET_TCB_RPL status %u " + "for tid %u\n", rpl->status, GET_TID(rpl)); + } + return CPL_RET_BUF_DONE; +} + +/* + * All upcalls from the T3 Core go to sched() to schedule the + * processing on a work queue. + */ +cxgb3_cpl_handler_func t3c_handlers[NUM_CPL_CMDS] = { + [CPL_ACT_ESTABLISH] = sched, + [CPL_ACT_OPEN_RPL] = sched, + [CPL_RX_DATA] = sched, + [CPL_TX_DMA_ACK] = sched, + [CPL_ABORT_RPL_RSS] = sched, + [CPL_ABORT_RPL] = sched, + [CPL_PASS_OPEN_RPL] = sched, + [CPL_CLOSE_LISTSRV_RPL] = sched, + [CPL_PASS_ACCEPT_REQ] = sched, + [CPL_PASS_ESTABLISH] = sched, + [CPL_PEER_CLOSE] = sched, + [CPL_CLOSE_CON_RPL] = sched, + [CPL_ABORT_REQ_RSS] = sched, + [CPL_RDMA_TERMINATE] = sched, + [CPL_RDMA_EC_STATUS] = sched, + [CPL_SET_TCB_RPL] = set_tcb_rpl, +}; + +int __init iwch_cm_init(void) +{ + skb_queue_head_init(&rxq); + + workq = create_singlethread_workqueue("iw_cxgb3"); + if (!workq) + return -ENOMEM; + + return 0; +} + +void __exit iwch_cm_term(void) +{ + flush_workqueue(workq); + destroy_workqueue(workq); +} diff --git a/kernel/drivers/infiniband/hw/cxgb3/iwch_cm.h b/kernel/drivers/infiniband/hw/cxgb3/iwch_cm.h new file mode 100644 index 000000000..b9efadfff --- /dev/null +++ b/kernel/drivers/infiniband/hw/cxgb3/iwch_cm.h @@ -0,0 +1,233 @@ +/* + * Copyright (c) 2006 Chelsio, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. 
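None of the CM events above are handled directly in the T3 interrupt path: sched() stores the endpoint context and the t3cdev pointer in skb->cb, queues the skb on rxq, and kicks the single-threaded iw_cxgb3 workqueue, whose process_work() then dispatches each message through the opcode-indexed work_handlers[] table. A minimal user-space sketch of that queue-then-dispatch-by-opcode pattern follows; struct msg, drain(), the opcode values and the handler names are illustrative assumptions, not the driver's types.

#include <stdio.h>

#define NUM_OPCODES 4

struct msg {
	unsigned int opcode;	/* selects the handler, like the CPL opcode */
	void *ctx;		/* per-connection context, like the ep pointer */
};

typedef int (*handler_fn)(struct msg *m);

static int handle_establish(struct msg *m)
{
	printf("ESTABLISH for ctx %p\n", m->ctx);
	return 0;
}

static int handle_close(struct msg *m)
{
	printf("CLOSE for ctx %p\n", m->ctx);
	return 0;
}

/* Opcode-indexed handler table, in the spirit of work_handlers[]. */
static const handler_fn handlers[NUM_OPCODES] = {
	[1] = handle_establish,
	[2] = handle_close,
};

/* Stand-in for process_work(): drain the queue and dispatch by opcode. */
static void drain(struct msg *queue, int n)
{
	for (int i = 0; i < n; i++)
		if (queue[i].opcode < NUM_OPCODES && handlers[queue[i].opcode])
			handlers[queue[i].opcode](&queue[i]);
}

int main(void)
{
	int dummy_ctx = 42;
	struct msg queue[] = {
		{ .opcode = 1, .ctx = &dummy_ctx },
		{ .opcode = 2, .ctx = &dummy_ctx },
	};

	drain(queue, 2);
	return 0;
}

Deferring the real handlers to a workqueue gives them a sleepable context, which is why GFP_KERNEL allocations and wait_event() calls are safe inside them.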
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef _IWCH_CM_H_ +#define _IWCH_CM_H_ + +#include +#include +#include +#include + +#include +#include + +#include "cxgb3_offload.h" +#include "iwch_provider.h" + +#define MPA_KEY_REQ "MPA ID Req Frame" +#define MPA_KEY_REP "MPA ID Rep Frame" + +#define MPA_MAX_PRIVATE_DATA 256 +#define MPA_REV 0 /* XXX - amso1100 uses rev 0 ! */ +#define MPA_REJECT 0x20 +#define MPA_CRC 0x40 +#define MPA_MARKERS 0x80 +#define MPA_FLAGS_MASK 0xE0 + +#define put_ep(ep) { \ + PDBG("put_ep (via %s:%u) ep %p refcnt %d\n", __func__, __LINE__, \ + ep, atomic_read(&((ep)->kref.refcount))); \ + WARN_ON(atomic_read(&((ep)->kref.refcount)) < 1); \ + kref_put(&((ep)->kref), __free_ep); \ +} + +#define get_ep(ep) { \ + PDBG("get_ep (via %s:%u) ep %p, refcnt %d\n", __func__, __LINE__, \ + ep, atomic_read(&((ep)->kref.refcount))); \ + kref_get(&((ep)->kref)); \ +} + +struct mpa_message { + u8 key[16]; + u8 flags; + u8 revision; + __be16 private_data_size; + u8 private_data[0]; +}; + +struct terminate_message { + u8 layer_etype; + u8 ecode; + __be16 hdrct_rsvd; + u8 len_hdrs[0]; +}; + +#define TERM_MAX_LENGTH (sizeof(struct terminate_message) + 2 + 18 + 28) + +enum iwch_layers_types { + LAYER_RDMAP = 0x00, + LAYER_DDP = 0x10, + LAYER_MPA = 0x20, + RDMAP_LOCAL_CATA = 0x00, + RDMAP_REMOTE_PROT = 0x01, + RDMAP_REMOTE_OP = 0x02, + DDP_LOCAL_CATA = 0x00, + DDP_TAGGED_ERR = 0x01, + DDP_UNTAGGED_ERR = 0x02, + DDP_LLP = 0x03 +}; + +enum iwch_rdma_ecodes { + RDMAP_INV_STAG = 0x00, + RDMAP_BASE_BOUNDS = 0x01, + RDMAP_ACC_VIOL = 0x02, + RDMAP_STAG_NOT_ASSOC = 0x03, + RDMAP_TO_WRAP = 0x04, + RDMAP_INV_VERS = 0x05, + RDMAP_INV_OPCODE = 0x06, + RDMAP_STREAM_CATA = 0x07, + RDMAP_GLOBAL_CATA = 0x08, + RDMAP_CANT_INV_STAG = 0x09, + RDMAP_UNSPECIFIED = 0xff +}; + +enum iwch_ddp_ecodes { + DDPT_INV_STAG = 0x00, + DDPT_BASE_BOUNDS = 0x01, + DDPT_STAG_NOT_ASSOC = 0x02, + DDPT_TO_WRAP = 0x03, + DDPT_INV_VERS = 0x04, + DDPU_INV_QN = 0x01, + DDPU_INV_MSN_NOBUF = 0x02, + DDPU_INV_MSN_RANGE = 0x03, + DDPU_INV_MO = 0x04, + DDPU_MSG_TOOBIG = 0x05, + DDPU_INV_VERS = 0x06 +}; + +enum iwch_mpa_ecodes { + MPA_CRC_ERR = 0x02, + MPA_MARKER_ERR = 0x03 +}; + +enum iwch_ep_state { + IDLE = 0, + LISTEN, + CONNECTING, + MPA_REQ_WAIT, + MPA_REQ_SENT, + MPA_REQ_RCVD, + MPA_REP_SENT, + FPDU_MODE, + ABORTING, + CLOSING, + MORIBUND, + 
DEAD, +}; + +enum iwch_ep_flags { + PEER_ABORT_IN_PROGRESS = 0, + ABORT_REQ_IN_PROGRESS = 1, + RELEASE_RESOURCES = 2, + CLOSE_SENT = 3, +}; + +struct iwch_ep_common { + struct iw_cm_id *cm_id; + struct iwch_qp *qp; + struct t3cdev *tdev; + enum iwch_ep_state state; + struct kref kref; + spinlock_t lock; + struct sockaddr_in local_addr; + struct sockaddr_in remote_addr; + wait_queue_head_t waitq; + int rpl_done; + int rpl_err; + unsigned long flags; +}; + +struct iwch_listen_ep { + struct iwch_ep_common com; + unsigned int stid; + int backlog; +}; + +struct iwch_ep { + struct iwch_ep_common com; + struct iwch_ep *parent_ep; + struct timer_list timer; + unsigned int atid; + u32 hwtid; + u32 snd_seq; + u32 rcv_seq; + struct l2t_entry *l2t; + struct dst_entry *dst; + struct sk_buff *mpa_skb; + struct iwch_mpa_attributes mpa_attr; + unsigned int mpa_pkt_len; + u8 mpa_pkt[sizeof(struct mpa_message) + MPA_MAX_PRIVATE_DATA]; + u8 tos; + u16 emss; + u16 plen; + u32 ird; + u32 ord; +}; + +static inline struct iwch_ep *to_ep(struct iw_cm_id *cm_id) +{ + return cm_id->provider_data; +} + +static inline struct iwch_listen_ep *to_listen_ep(struct iw_cm_id *cm_id) +{ + return cm_id->provider_data; +} + +static inline int compute_wscale(int win) +{ + int wscale = 0; + + while (wscale < 14 && (65535<cq); + + if (!rd_cqe) + return 0; + + qhp = get_qhp(rhp, CQE_QPID(*rd_cqe)); + if (!qhp) + wq = NULL; + else { + spin_lock(&qhp->lock); + wq = &(qhp->wq); + } + ret = cxio_poll_cq(wq, &(chp->cq), &cqe, &cqe_flushed, &cookie, + &credit); + if (t3a_device(chp->rhp) && credit) { + PDBG("%s updating %d cq credits on id %d\n", __func__, + credit, chp->cq.cqid); + cxio_hal_cq_op(&rhp->rdev, &chp->cq, CQ_CREDIT_UPDATE, credit); + } + + if (ret) { + ret = -EAGAIN; + goto out; + } + ret = 1; + + wc->wr_id = cookie; + wc->qp = &qhp->ibqp; + wc->vendor_err = CQE_STATUS(cqe); + wc->wc_flags = 0; + + PDBG("%s qpid 0x%x type %d opcode %d status 0x%x wrid hi 0x%x " + "lo 0x%x cookie 0x%llx\n", __func__, + CQE_QPID(cqe), CQE_TYPE(cqe), + CQE_OPCODE(cqe), CQE_STATUS(cqe), CQE_WRID_HI(cqe), + CQE_WRID_LOW(cqe), (unsigned long long) cookie); + + if (CQE_TYPE(cqe) == 0) { + if (!CQE_STATUS(cqe)) + wc->byte_len = CQE_LEN(cqe); + else + wc->byte_len = 0; + wc->opcode = IB_WC_RECV; + if (CQE_OPCODE(cqe) == T3_SEND_WITH_INV || + CQE_OPCODE(cqe) == T3_SEND_WITH_SE_INV) { + wc->ex.invalidate_rkey = CQE_WRID_STAG(cqe); + wc->wc_flags |= IB_WC_WITH_INVALIDATE; + } + } else { + switch (CQE_OPCODE(cqe)) { + case T3_RDMA_WRITE: + wc->opcode = IB_WC_RDMA_WRITE; + break; + case T3_READ_REQ: + wc->opcode = IB_WC_RDMA_READ; + wc->byte_len = CQE_LEN(cqe); + break; + case T3_SEND: + case T3_SEND_WITH_SE: + case T3_SEND_WITH_INV: + case T3_SEND_WITH_SE_INV: + wc->opcode = IB_WC_SEND; + break; + case T3_BIND_MW: + wc->opcode = IB_WC_BIND_MW; + break; + + case T3_LOCAL_INV: + wc->opcode = IB_WC_LOCAL_INV; + break; + case T3_FAST_REGISTER: + wc->opcode = IB_WC_FAST_REG_MR; + break; + default: + printk(KERN_ERR MOD "Unexpected opcode %d " + "in the CQE received for QPID=0x%0x\n", + CQE_OPCODE(cqe), CQE_QPID(cqe)); + ret = -EINVAL; + goto out; + } + } + + if (cqe_flushed) + wc->status = IB_WC_WR_FLUSH_ERR; + else { + + switch (CQE_STATUS(cqe)) { + case TPT_ERR_SUCCESS: + wc->status = IB_WC_SUCCESS; + break; + case TPT_ERR_STAG: + wc->status = IB_WC_LOC_ACCESS_ERR; + break; + case TPT_ERR_PDID: + wc->status = IB_WC_LOC_PROT_ERR; + break; + case TPT_ERR_QPID: + case TPT_ERR_ACCESS: + wc->status = IB_WC_LOC_ACCESS_ERR; + break; + case TPT_ERR_WRAP: + 
wc->status = IB_WC_GENERAL_ERR; + break; + case TPT_ERR_BOUND: + wc->status = IB_WC_LOC_LEN_ERR; + break; + case TPT_ERR_INVALIDATE_SHARED_MR: + case TPT_ERR_INVALIDATE_MR_WITH_MW_BOUND: + wc->status = IB_WC_MW_BIND_ERR; + break; + case TPT_ERR_CRC: + case TPT_ERR_MARKER: + case TPT_ERR_PDU_LEN_ERR: + case TPT_ERR_OUT_OF_RQE: + case TPT_ERR_DDP_VERSION: + case TPT_ERR_RDMA_VERSION: + case TPT_ERR_DDP_QUEUE_NUM: + case TPT_ERR_MSN: + case TPT_ERR_TBIT: + case TPT_ERR_MO: + case TPT_ERR_MSN_RANGE: + case TPT_ERR_IRD_OVERFLOW: + case TPT_ERR_OPCODE: + wc->status = IB_WC_FATAL_ERR; + break; + case TPT_ERR_SWFLUSH: + wc->status = IB_WC_WR_FLUSH_ERR; + break; + default: + printk(KERN_ERR MOD "Unexpected cqe_status 0x%x for " + "QPID=0x%0x\n", CQE_STATUS(cqe), CQE_QPID(cqe)); + ret = -EINVAL; + } + } +out: + if (wq) + spin_unlock(&qhp->lock); + return ret; +} + +int iwch_poll_cq(struct ib_cq *ibcq, int num_entries, struct ib_wc *wc) +{ + struct iwch_dev *rhp; + struct iwch_cq *chp; + unsigned long flags; + int npolled; + int err = 0; + + chp = to_iwch_cq(ibcq); + rhp = chp->rhp; + + spin_lock_irqsave(&chp->lock, flags); + for (npolled = 0; npolled < num_entries; ++npolled) { +#ifdef DEBUG + int i=0; +#endif + + /* + * Because T3 can post CQEs that are _not_ associated + * with a WR, we might have to poll again after removing + * one of these. + */ + do { + err = iwch_poll_cq_one(rhp, chp, wc + npolled); +#ifdef DEBUG + BUG_ON(++i > 1000); +#endif + } while (err == -EAGAIN); + if (err <= 0) + break; + } + spin_unlock_irqrestore(&chp->lock, flags); + + if (err < 0) + return err; + else { + return npolled; + } +} diff --git a/kernel/drivers/infiniband/hw/cxgb3/iwch_ev.c b/kernel/drivers/infiniband/hw/cxgb3/iwch_ev.c new file mode 100644 index 000000000..abcc9e769 --- /dev/null +++ b/kernel/drivers/infiniband/hw/cxgb3/iwch_ev.c @@ -0,0 +1,232 @@ +/* + * Copyright (c) 2006 Chelsio, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include +#include +#include +#include "iwch_provider.h" +#include "iwch.h" +#include "iwch_cm.h" +#include "cxio_hal.h" +#include "cxio_wr.h" + +static void post_qp_event(struct iwch_dev *rnicp, struct iwch_cq *chp, + struct respQ_msg_t *rsp_msg, + enum ib_event_type ib_event, + int send_term) +{ + struct ib_event event; + struct iwch_qp_attributes attrs; + struct iwch_qp *qhp; + unsigned long flag; + + spin_lock(&rnicp->lock); + qhp = get_qhp(rnicp, CQE_QPID(rsp_msg->cqe)); + + if (!qhp) { + printk(KERN_ERR "%s unaffiliated error 0x%x qpid 0x%x\n", + __func__, CQE_STATUS(rsp_msg->cqe), + CQE_QPID(rsp_msg->cqe)); + spin_unlock(&rnicp->lock); + return; + } + + if ((qhp->attr.state == IWCH_QP_STATE_ERROR) || + (qhp->attr.state == IWCH_QP_STATE_TERMINATE)) { + PDBG("%s AE received after RTS - " + "qp state %d qpid 0x%x status 0x%x\n", __func__, + qhp->attr.state, qhp->wq.qpid, CQE_STATUS(rsp_msg->cqe)); + spin_unlock(&rnicp->lock); + return; + } + + printk(KERN_ERR "%s - AE qpid 0x%x opcode %d status 0x%x " + "type %d wrid.hi 0x%x wrid.lo 0x%x \n", __func__, + CQE_QPID(rsp_msg->cqe), CQE_OPCODE(rsp_msg->cqe), + CQE_STATUS(rsp_msg->cqe), CQE_TYPE(rsp_msg->cqe), + CQE_WRID_HI(rsp_msg->cqe), CQE_WRID_LOW(rsp_msg->cqe)); + + atomic_inc(&qhp->refcnt); + spin_unlock(&rnicp->lock); + + if (qhp->attr.state == IWCH_QP_STATE_RTS) { + attrs.next_state = IWCH_QP_STATE_TERMINATE; + iwch_modify_qp(qhp->rhp, qhp, IWCH_QP_ATTR_NEXT_STATE, + &attrs, 1); + if (send_term) + iwch_post_terminate(qhp, rsp_msg); + } + + event.event = ib_event; + event.device = chp->ibcq.device; + if (ib_event == IB_EVENT_CQ_ERR) + event.element.cq = &chp->ibcq; + else + event.element.qp = &qhp->ibqp; + + if (qhp->ibqp.event_handler) + (*qhp->ibqp.event_handler)(&event, qhp->ibqp.qp_context); + + spin_lock_irqsave(&chp->comp_handler_lock, flag); + (*chp->ibcq.comp_handler)(&chp->ibcq, chp->ibcq.cq_context); + spin_unlock_irqrestore(&chp->comp_handler_lock, flag); + + if (atomic_dec_and_test(&qhp->refcnt)) + wake_up(&qhp->wait); +} + +void iwch_ev_dispatch(struct cxio_rdev *rdev_p, struct sk_buff *skb) +{ + struct iwch_dev *rnicp; + struct respQ_msg_t *rsp_msg = (struct respQ_msg_t *) skb->data; + struct iwch_cq *chp; + struct iwch_qp *qhp; + u32 cqid = RSPQ_CQID(rsp_msg); + unsigned long flag; + + rnicp = (struct iwch_dev *) rdev_p->ulp; + spin_lock(&rnicp->lock); + chp = get_chp(rnicp, cqid); + qhp = get_qhp(rnicp, CQE_QPID(rsp_msg->cqe)); + if (!chp || !qhp) { + printk(KERN_ERR MOD "BAD AE cqid 0x%x qpid 0x%x opcode %d " + "status 0x%x type %d wrid.hi 0x%x wrid.lo 0x%x \n", + cqid, CQE_QPID(rsp_msg->cqe), + CQE_OPCODE(rsp_msg->cqe), CQE_STATUS(rsp_msg->cqe), + CQE_TYPE(rsp_msg->cqe), CQE_WRID_HI(rsp_msg->cqe), + CQE_WRID_LOW(rsp_msg->cqe)); + spin_unlock(&rnicp->lock); + goto out; + } + iwch_qp_add_ref(&qhp->ibqp); + atomic_inc(&chp->refcnt); + spin_unlock(&rnicp->lock); + + /* + * 1) completion of our sending a TERMINATE. + * 2) incoming TERMINATE message. 
+ */ + if ((CQE_OPCODE(rsp_msg->cqe) == T3_TERMINATE) && + (CQE_STATUS(rsp_msg->cqe) == 0)) { + if (SQ_TYPE(rsp_msg->cqe)) { + PDBG("%s QPID 0x%x ep %p disconnecting\n", + __func__, qhp->wq.qpid, qhp->ep); + iwch_ep_disconnect(qhp->ep, 0, GFP_ATOMIC); + } else { + PDBG("%s post REQ_ERR AE QPID 0x%x\n", __func__, + qhp->wq.qpid); + post_qp_event(rnicp, chp, rsp_msg, + IB_EVENT_QP_REQ_ERR, 0); + iwch_ep_disconnect(qhp->ep, 0, GFP_ATOMIC); + } + goto done; + } + + /* Bad incoming Read request */ + if (SQ_TYPE(rsp_msg->cqe) && + (CQE_OPCODE(rsp_msg->cqe) == T3_READ_RESP)) { + post_qp_event(rnicp, chp, rsp_msg, IB_EVENT_QP_REQ_ERR, 1); + goto done; + } + + /* Bad incoming write */ + if (RQ_TYPE(rsp_msg->cqe) && + (CQE_OPCODE(rsp_msg->cqe) == T3_RDMA_WRITE)) { + post_qp_event(rnicp, chp, rsp_msg, IB_EVENT_QP_REQ_ERR, 1); + goto done; + } + + switch (CQE_STATUS(rsp_msg->cqe)) { + + /* Completion Events */ + case TPT_ERR_SUCCESS: + + /* + * Confirm the destination entry if this is a RECV completion. + */ + if (qhp->ep && SQ_TYPE(rsp_msg->cqe)) + dst_confirm(qhp->ep->dst); + spin_lock_irqsave(&chp->comp_handler_lock, flag); + (*chp->ibcq.comp_handler)(&chp->ibcq, chp->ibcq.cq_context); + spin_unlock_irqrestore(&chp->comp_handler_lock, flag); + break; + + case TPT_ERR_STAG: + case TPT_ERR_PDID: + case TPT_ERR_QPID: + case TPT_ERR_ACCESS: + case TPT_ERR_WRAP: + case TPT_ERR_BOUND: + case TPT_ERR_INVALIDATE_SHARED_MR: + case TPT_ERR_INVALIDATE_MR_WITH_MW_BOUND: + post_qp_event(rnicp, chp, rsp_msg, IB_EVENT_QP_ACCESS_ERR, 1); + break; + + /* Device Fatal Errors */ + case TPT_ERR_ECC: + case TPT_ERR_ECC_PSTAG: + case TPT_ERR_INTERNAL_ERR: + post_qp_event(rnicp, chp, rsp_msg, IB_EVENT_DEVICE_FATAL, 1); + break; + + /* QP Fatal Errors */ + case TPT_ERR_OUT_OF_RQE: + case TPT_ERR_PBL_ADDR_BOUND: + case TPT_ERR_CRC: + case TPT_ERR_MARKER: + case TPT_ERR_PDU_LEN_ERR: + case TPT_ERR_DDP_VERSION: + case TPT_ERR_RDMA_VERSION: + case TPT_ERR_OPCODE: + case TPT_ERR_DDP_QUEUE_NUM: + case TPT_ERR_MSN: + case TPT_ERR_TBIT: + case TPT_ERR_MO: + case TPT_ERR_MSN_GAP: + case TPT_ERR_MSN_RANGE: + case TPT_ERR_RQE_ADDR_BOUND: + case TPT_ERR_IRD_OVERFLOW: + post_qp_event(rnicp, chp, rsp_msg, IB_EVENT_QP_FATAL, 1); + break; + + default: + printk(KERN_ERR MOD "Unknown T3 status 0x%x QPID 0x%x\n", + CQE_STATUS(rsp_msg->cqe), qhp->wq.qpid); + post_qp_event(rnicp, chp, rsp_msg, IB_EVENT_QP_FATAL, 1); + break; + } +done: + if (atomic_dec_and_test(&chp->refcnt)) + wake_up(&chp->wait); + iwch_qp_rem_ref(&qhp->ibqp); +out: + dev_kfree_skb_irq(skb); +} diff --git a/kernel/drivers/infiniband/hw/cxgb3/iwch_mem.c b/kernel/drivers/infiniband/hw/cxgb3/iwch_mem.c new file mode 100644 index 000000000..5c36ee280 --- /dev/null +++ b/kernel/drivers/infiniband/hw/cxgb3/iwch_mem.c @@ -0,0 +1,203 @@ +/* + * Copyright (c) 2006 Chelsio, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. 
+ * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include +#include + +#include +#include + +#include "cxio_hal.h" +#include "cxio_resource.h" +#include "iwch.h" +#include "iwch_provider.h" + +static int iwch_finish_mem_reg(struct iwch_mr *mhp, u32 stag) +{ + u32 mmid; + + mhp->attr.state = 1; + mhp->attr.stag = stag; + mmid = stag >> 8; + mhp->ibmr.rkey = mhp->ibmr.lkey = stag; + PDBG("%s mmid 0x%x mhp %p\n", __func__, mmid, mhp); + return insert_handle(mhp->rhp, &mhp->rhp->mmidr, mhp, mmid); +} + +int iwch_register_mem(struct iwch_dev *rhp, struct iwch_pd *php, + struct iwch_mr *mhp, int shift) +{ + u32 stag; + int ret; + + if (cxio_register_phys_mem(&rhp->rdev, + &stag, mhp->attr.pdid, + mhp->attr.perms, + mhp->attr.zbva, + mhp->attr.va_fbo, + mhp->attr.len, + shift - 12, + mhp->attr.pbl_size, mhp->attr.pbl_addr)) + return -ENOMEM; + + ret = iwch_finish_mem_reg(mhp, stag); + if (ret) + cxio_dereg_mem(&rhp->rdev, mhp->attr.stag, mhp->attr.pbl_size, + mhp->attr.pbl_addr); + return ret; +} + +int iwch_reregister_mem(struct iwch_dev *rhp, struct iwch_pd *php, + struct iwch_mr *mhp, + int shift, + int npages) +{ + u32 stag; + int ret; + + /* We could support this... 
*/ + if (npages > mhp->attr.pbl_size) + return -ENOMEM; + + stag = mhp->attr.stag; + if (cxio_reregister_phys_mem(&rhp->rdev, + &stag, mhp->attr.pdid, + mhp->attr.perms, + mhp->attr.zbva, + mhp->attr.va_fbo, + mhp->attr.len, + shift - 12, + mhp->attr.pbl_size, mhp->attr.pbl_addr)) + return -ENOMEM; + + ret = iwch_finish_mem_reg(mhp, stag); + if (ret) + cxio_dereg_mem(&rhp->rdev, mhp->attr.stag, mhp->attr.pbl_size, + mhp->attr.pbl_addr); + + return ret; +} + +int iwch_alloc_pbl(struct iwch_mr *mhp, int npages) +{ + mhp->attr.pbl_addr = cxio_hal_pblpool_alloc(&mhp->rhp->rdev, + npages << 3); + + if (!mhp->attr.pbl_addr) + return -ENOMEM; + + mhp->attr.pbl_size = npages; + + return 0; +} + +void iwch_free_pbl(struct iwch_mr *mhp) +{ + cxio_hal_pblpool_free(&mhp->rhp->rdev, mhp->attr.pbl_addr, + mhp->attr.pbl_size << 3); +} + +int iwch_write_pbl(struct iwch_mr *mhp, __be64 *pages, int npages, int offset) +{ + return cxio_write_pbl(&mhp->rhp->rdev, pages, + mhp->attr.pbl_addr + (offset << 3), npages); +} + +int build_phys_page_list(struct ib_phys_buf *buffer_list, + int num_phys_buf, + u64 *iova_start, + u64 *total_size, + int *npages, + int *shift, + __be64 **page_list) +{ + u64 mask; + int i, j, n; + + mask = 0; + *total_size = 0; + for (i = 0; i < num_phys_buf; ++i) { + if (i != 0 && buffer_list[i].addr & ~PAGE_MASK) + return -EINVAL; + if (i != 0 && i != num_phys_buf - 1 && + (buffer_list[i].size & ~PAGE_MASK)) + return -EINVAL; + *total_size += buffer_list[i].size; + if (i > 0) + mask |= buffer_list[i].addr; + else + mask |= buffer_list[i].addr & PAGE_MASK; + if (i != num_phys_buf - 1) + mask |= buffer_list[i].addr + buffer_list[i].size; + else + mask |= (buffer_list[i].addr + buffer_list[i].size + + PAGE_SIZE - 1) & PAGE_MASK; + } + + if (*total_size > 0xFFFFFFFFULL) + return -ENOMEM; + + /* Find largest page shift we can use to cover buffers */ + for (*shift = PAGE_SHIFT; *shift < 27; ++(*shift)) + if ((1ULL << *shift) & mask) + break; + + buffer_list[0].size += buffer_list[0].addr & ((1ULL << *shift) - 1); + buffer_list[0].addr &= ~0ull << *shift; + + *npages = 0; + for (i = 0; i < num_phys_buf; ++i) + *npages += (buffer_list[i].size + + (1ULL << *shift) - 1) >> *shift; + + if (!*npages) + return -EINVAL; + + *page_list = kmalloc(sizeof(u64) * *npages, GFP_KERNEL); + if (!*page_list) + return -ENOMEM; + + n = 0; + for (i = 0; i < num_phys_buf; ++i) + for (j = 0; + j < (buffer_list[i].size + (1ULL << *shift) - 1) >> *shift; + ++j) + (*page_list)[n++] = cpu_to_be64(buffer_list[i].addr + + ((u64) j << *shift)); + + PDBG("%s va 0x%llx mask 0x%llx shift %d len %lld pbl_size %d\n", + __func__, (unsigned long long) *iova_start, + (unsigned long long) mask, *shift, (unsigned long long) *total_size, + *npages); + + return 0; + +} diff --git a/kernel/drivers/infiniband/hw/cxgb3/iwch_provider.c b/kernel/drivers/infiniband/hw/cxgb3/iwch_provider.c new file mode 100644 index 000000000..811b24a53 --- /dev/null +++ b/kernel/drivers/infiniband/hw/cxgb3/iwch_provider.c @@ -0,0 +1,1467 @@ +/* + * Copyright (c) 2006 Chelsio, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. 
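build_phys_page_list() above picks the largest page size that can describe a set of physical buffers: every buffer boundary is OR-ed into a mask, and the routine then scans upward from PAGE_SHIFT (to a cap of 27) for the first set bit, i.e. the coarsest alignment shared by all of the boundaries. The fragment below is a simplified stand-alone rendering of that shift selection with two made-up contiguous buffers; the addresses, sizes and the hard-coded 4 KB constants are assumptions for illustration, and the per-buffer validity and total-size checks of the kernel routine are omitted.

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	/* two made-up physical buffers (address/size pairs) */
	uint64_t addr[2] = { 0x10000000ULL, 0x10040000ULL };
	uint64_t size[2] = { 0x40000ULL, 0x20000ULL };
	uint64_t mask = 0;
	int shift;

	/* OR in the buffer boundaries: the aligned start, each interior
	 * boundary, and the end rounded up to a 4 KB page */
	mask |= addr[0] & ~4095ULL;
	mask |= addr[0] + size[0];
	mask |= addr[1];
	mask |= (addr[1] + size[1] + 4095) & ~4095ULL;

	/* the first set bit at or above 12 is the largest usable shift */
	for (shift = 12; shift < 27; ++shift)
		if ((1ULL << shift) & mask)
			break;

	printf("largest usable page shift: %d (%llu KB pages)\n",
	       shift, (unsigned long long)((1ULL << shift) / 1024));
	return 0;
}

For these two buffers the boundaries are all 128 KB aligned, so the sketch reports a shift of 17.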
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#include +#include +#include +#include +#include + +#include "cxio_hal.h" +#include "iwch.h" +#include "iwch_provider.h" +#include "iwch_cm.h" +#include "iwch_user.h" +#include "common.h" + +static struct ib_ah *iwch_ah_create(struct ib_pd *pd, + struct ib_ah_attr *ah_attr) +{ + return ERR_PTR(-ENOSYS); +} + +static int iwch_ah_destroy(struct ib_ah *ah) +{ + return -ENOSYS; +} + +static int iwch_multicast_attach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid) +{ + return -ENOSYS; +} + +static int iwch_multicast_detach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid) +{ + return -ENOSYS; +} + +static int iwch_process_mad(struct ib_device *ibdev, + int mad_flags, + u8 port_num, + struct ib_wc *in_wc, + struct ib_grh *in_grh, + struct ib_mad *in_mad, struct ib_mad *out_mad) +{ + return -ENOSYS; +} + +static int iwch_dealloc_ucontext(struct ib_ucontext *context) +{ + struct iwch_dev *rhp = to_iwch_dev(context->device); + struct iwch_ucontext *ucontext = to_iwch_ucontext(context); + struct iwch_mm_entry *mm, *tmp; + + PDBG("%s context %p\n", __func__, context); + list_for_each_entry_safe(mm, tmp, &ucontext->mmaps, entry) + kfree(mm); + cxio_release_ucontext(&rhp->rdev, &ucontext->uctx); + kfree(ucontext); + return 0; +} + +static struct ib_ucontext *iwch_alloc_ucontext(struct ib_device *ibdev, + struct ib_udata *udata) +{ + struct iwch_ucontext *context; + struct iwch_dev *rhp = to_iwch_dev(ibdev); + + PDBG("%s ibdev %p\n", __func__, ibdev); + context = kzalloc(sizeof(*context), GFP_KERNEL); + if (!context) + return ERR_PTR(-ENOMEM); + cxio_init_ucontext(&rhp->rdev, &context->uctx); + INIT_LIST_HEAD(&context->mmaps); + spin_lock_init(&context->mmap_lock); + return &context->ibucontext; +} + +static int iwch_destroy_cq(struct ib_cq *ib_cq) +{ + struct iwch_cq *chp; + + PDBG("%s ib_cq %p\n", __func__, ib_cq); + chp = to_iwch_cq(ib_cq); + + remove_handle(chp->rhp, &chp->rhp->cqidr, chp->cq.cqid); + atomic_dec(&chp->refcnt); + wait_event(chp->wait, !atomic_read(&chp->refcnt)); + + cxio_destroy_cq(&chp->rhp->rdev, &chp->cq); + kfree(chp); + return 0; +} + +static struct ib_cq *iwch_create_cq(struct 
ib_device *ibdev, int entries, int vector, + struct ib_ucontext *ib_context, + struct ib_udata *udata) +{ + struct iwch_dev *rhp; + struct iwch_cq *chp; + struct iwch_create_cq_resp uresp; + struct iwch_create_cq_req ureq; + struct iwch_ucontext *ucontext = NULL; + static int warned; + size_t resplen; + + PDBG("%s ib_dev %p entries %d\n", __func__, ibdev, entries); + rhp = to_iwch_dev(ibdev); + chp = kzalloc(sizeof(*chp), GFP_KERNEL); + if (!chp) + return ERR_PTR(-ENOMEM); + + if (ib_context) { + ucontext = to_iwch_ucontext(ib_context); + if (!t3a_device(rhp)) { + if (ib_copy_from_udata(&ureq, udata, sizeof (ureq))) { + kfree(chp); + return ERR_PTR(-EFAULT); + } + chp->user_rptr_addr = (u32 __user *)(unsigned long)ureq.user_rptr_addr; + } + } + + if (t3a_device(rhp)) { + + /* + * T3A: Add some fluff to handle extra CQEs inserted + * for various errors. + * Additional CQE possibilities: + * TERMINATE, + * incoming RDMA WRITE Failures + * incoming RDMA READ REQUEST FAILUREs + * NOTE: We cannot ensure the CQ won't overflow. + */ + entries += 16; + } + entries = roundup_pow_of_two(entries); + chp->cq.size_log2 = ilog2(entries); + + if (cxio_create_cq(&rhp->rdev, &chp->cq, !ucontext)) { + kfree(chp); + return ERR_PTR(-ENOMEM); + } + chp->rhp = rhp; + chp->ibcq.cqe = 1 << chp->cq.size_log2; + spin_lock_init(&chp->lock); + spin_lock_init(&chp->comp_handler_lock); + atomic_set(&chp->refcnt, 1); + init_waitqueue_head(&chp->wait); + if (insert_handle(rhp, &rhp->cqidr, chp, chp->cq.cqid)) { + cxio_destroy_cq(&chp->rhp->rdev, &chp->cq); + kfree(chp); + return ERR_PTR(-ENOMEM); + } + + if (ucontext) { + struct iwch_mm_entry *mm; + + mm = kmalloc(sizeof *mm, GFP_KERNEL); + if (!mm) { + iwch_destroy_cq(&chp->ibcq); + return ERR_PTR(-ENOMEM); + } + uresp.cqid = chp->cq.cqid; + uresp.size_log2 = chp->cq.size_log2; + spin_lock(&ucontext->mmap_lock); + uresp.key = ucontext->key; + ucontext->key += PAGE_SIZE; + spin_unlock(&ucontext->mmap_lock); + mm->key = uresp.key; + mm->addr = virt_to_phys(chp->cq.queue); + if (udata->outlen < sizeof uresp) { + if (!warned++) + printk(KERN_WARNING MOD "Warning - " + "downlevel libcxgb3 (non-fatal).\n"); + mm->len = PAGE_ALIGN((1UL << uresp.size_log2) * + sizeof(struct t3_cqe)); + resplen = sizeof(struct iwch_create_cq_resp_v0); + } else { + mm->len = PAGE_ALIGN(((1UL << uresp.size_log2) + 1) * + sizeof(struct t3_cqe)); + uresp.memsize = mm->len; + uresp.reserved = 0; + resplen = sizeof uresp; + } + if (ib_copy_to_udata(udata, &uresp, resplen)) { + kfree(mm); + iwch_destroy_cq(&chp->ibcq); + return ERR_PTR(-EFAULT); + } + insert_mmap(ucontext, mm); + } + PDBG("created cqid 0x%0x chp %p size 0x%0x, dma_addr 0x%0llx\n", + chp->cq.cqid, chp, (1 << chp->cq.size_log2), + (unsigned long long) chp->cq.dma_addr); + return &chp->ibcq; +} + +static int iwch_resize_cq(struct ib_cq *cq, int cqe, struct ib_udata *udata) +{ +#ifdef notyet + struct iwch_cq *chp = to_iwch_cq(cq); + struct t3_cq oldcq, newcq; + int ret; + + PDBG("%s ib_cq %p cqe %d\n", __func__, cq, cqe); + + /* We don't downsize... 
*/ + if (cqe <= cq->cqe) + return 0; + + /* create new t3_cq with new size */ + cqe = roundup_pow_of_two(cqe+1); + newcq.size_log2 = ilog2(cqe); + + /* Dont allow resize to less than the current wce count */ + if (cqe < Q_COUNT(chp->cq.rptr, chp->cq.wptr)) { + return -ENOMEM; + } + + /* Quiesce all QPs using this CQ */ + ret = iwch_quiesce_qps(chp); + if (ret) { + return ret; + } + + ret = cxio_create_cq(&chp->rhp->rdev, &newcq); + if (ret) { + return ret; + } + + /* copy CQEs */ + memcpy(newcq.queue, chp->cq.queue, (1 << chp->cq.size_log2) * + sizeof(struct t3_cqe)); + + /* old iwch_qp gets new t3_cq but keeps old cqid */ + oldcq = chp->cq; + chp->cq = newcq; + chp->cq.cqid = oldcq.cqid; + + /* resize new t3_cq to update the HW context */ + ret = cxio_resize_cq(&chp->rhp->rdev, &chp->cq); + if (ret) { + chp->cq = oldcq; + return ret; + } + chp->ibcq.cqe = (1<cq.size_log2) - 1; + + /* destroy old t3_cq */ + oldcq.cqid = newcq.cqid; + ret = cxio_destroy_cq(&chp->rhp->rdev, &oldcq); + if (ret) { + printk(KERN_ERR MOD "%s - cxio_destroy_cq failed %d\n", + __func__, ret); + } + + /* add user hooks here */ + + /* resume qps */ + ret = iwch_resume_qps(chp); + return ret; +#else + return -ENOSYS; +#endif +} + +static int iwch_arm_cq(struct ib_cq *ibcq, enum ib_cq_notify_flags flags) +{ + struct iwch_dev *rhp; + struct iwch_cq *chp; + enum t3_cq_opcode cq_op; + int err; + unsigned long flag; + u32 rptr; + + chp = to_iwch_cq(ibcq); + rhp = chp->rhp; + if ((flags & IB_CQ_SOLICITED_MASK) == IB_CQ_SOLICITED) + cq_op = CQ_ARM_SE; + else + cq_op = CQ_ARM_AN; + if (chp->user_rptr_addr) { + if (get_user(rptr, chp->user_rptr_addr)) + return -EFAULT; + spin_lock_irqsave(&chp->lock, flag); + chp->cq.rptr = rptr; + } else + spin_lock_irqsave(&chp->lock, flag); + PDBG("%s rptr 0x%x\n", __func__, chp->cq.rptr); + err = cxio_hal_cq_op(&rhp->rdev, &chp->cq, cq_op, 0); + spin_unlock_irqrestore(&chp->lock, flag); + if (err < 0) + printk(KERN_ERR MOD "Error %d rearming CQID 0x%x\n", err, + chp->cq.cqid); + if (err > 0 && !(flags & IB_CQ_REPORT_MISSED_EVENTS)) + err = 0; + return err; +} + +static int iwch_mmap(struct ib_ucontext *context, struct vm_area_struct *vma) +{ + int len = vma->vm_end - vma->vm_start; + u32 key = vma->vm_pgoff << PAGE_SHIFT; + struct cxio_rdev *rdev_p; + int ret = 0; + struct iwch_mm_entry *mm; + struct iwch_ucontext *ucontext; + u64 addr; + + PDBG("%s pgoff 0x%lx key 0x%x len %d\n", __func__, vma->vm_pgoff, + key, len); + + if (vma->vm_start & (PAGE_SIZE-1)) { + return -EINVAL; + } + + rdev_p = &(to_iwch_dev(context->device)->rdev); + ucontext = to_iwch_ucontext(context); + + mm = remove_mmap(ucontext, key, len); + if (!mm) + return -EINVAL; + addr = mm->addr; + kfree(mm); + + if ((addr >= rdev_p->rnic_info.udbell_physbase) && + (addr < (rdev_p->rnic_info.udbell_physbase + + rdev_p->rnic_info.udbell_len))) { + + /* + * Map T3 DB register. + */ + if (vma->vm_flags & VM_READ) { + return -EPERM; + } + + vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot); + vma->vm_flags |= VM_DONTCOPY | VM_DONTEXPAND; + vma->vm_flags &= ~VM_MAYREAD; + ret = io_remap_pfn_range(vma, vma->vm_start, + addr >> PAGE_SHIFT, + len, vma->vm_page_prot); + } else { + + /* + * Map WQ or CQ contig dma memory... 
+ */ + ret = remap_pfn_range(vma, vma->vm_start, + addr >> PAGE_SHIFT, + len, vma->vm_page_prot); + } + + return ret; +} + +static int iwch_deallocate_pd(struct ib_pd *pd) +{ + struct iwch_dev *rhp; + struct iwch_pd *php; + + php = to_iwch_pd(pd); + rhp = php->rhp; + PDBG("%s ibpd %p pdid 0x%x\n", __func__, pd, php->pdid); + cxio_hal_put_pdid(rhp->rdev.rscp, php->pdid); + kfree(php); + return 0; +} + +static struct ib_pd *iwch_allocate_pd(struct ib_device *ibdev, + struct ib_ucontext *context, + struct ib_udata *udata) +{ + struct iwch_pd *php; + u32 pdid; + struct iwch_dev *rhp; + + PDBG("%s ibdev %p\n", __func__, ibdev); + rhp = (struct iwch_dev *) ibdev; + pdid = cxio_hal_get_pdid(rhp->rdev.rscp); + if (!pdid) + return ERR_PTR(-EINVAL); + php = kzalloc(sizeof(*php), GFP_KERNEL); + if (!php) { + cxio_hal_put_pdid(rhp->rdev.rscp, pdid); + return ERR_PTR(-ENOMEM); + } + php->pdid = pdid; + php->rhp = rhp; + if (context) { + if (ib_copy_to_udata(udata, &php->pdid, sizeof (__u32))) { + iwch_deallocate_pd(&php->ibpd); + return ERR_PTR(-EFAULT); + } + } + PDBG("%s pdid 0x%0x ptr 0x%p\n", __func__, pdid, php); + return &php->ibpd; +} + +static int iwch_dereg_mr(struct ib_mr *ib_mr) +{ + struct iwch_dev *rhp; + struct iwch_mr *mhp; + u32 mmid; + + PDBG("%s ib_mr %p\n", __func__, ib_mr); + /* There can be no memory windows */ + if (atomic_read(&ib_mr->usecnt)) + return -EINVAL; + + mhp = to_iwch_mr(ib_mr); + rhp = mhp->rhp; + mmid = mhp->attr.stag >> 8; + cxio_dereg_mem(&rhp->rdev, mhp->attr.stag, mhp->attr.pbl_size, + mhp->attr.pbl_addr); + iwch_free_pbl(mhp); + remove_handle(rhp, &rhp->mmidr, mmid); + if (mhp->kva) + kfree((void *) (unsigned long) mhp->kva); + if (mhp->umem) + ib_umem_release(mhp->umem); + PDBG("%s mmid 0x%x ptr %p\n", __func__, mmid, mhp); + kfree(mhp); + return 0; +} + +static struct ib_mr *iwch_register_phys_mem(struct ib_pd *pd, + struct ib_phys_buf *buffer_list, + int num_phys_buf, + int acc, + u64 *iova_start) +{ + __be64 *page_list; + int shift; + u64 total_size; + int npages; + struct iwch_dev *rhp; + struct iwch_pd *php; + struct iwch_mr *mhp; + int ret; + + PDBG("%s ib_pd %p\n", __func__, pd); + php = to_iwch_pd(pd); + rhp = php->rhp; + + mhp = kzalloc(sizeof(*mhp), GFP_KERNEL); + if (!mhp) + return ERR_PTR(-ENOMEM); + + mhp->rhp = rhp; + + /* First check that we have enough alignment */ + if ((*iova_start & ~PAGE_MASK) != (buffer_list[0].addr & ~PAGE_MASK)) { + ret = -EINVAL; + goto err; + } + + if (num_phys_buf > 1 && + ((buffer_list[0].addr + buffer_list[0].size) & ~PAGE_MASK)) { + ret = -EINVAL; + goto err; + } + + ret = build_phys_page_list(buffer_list, num_phys_buf, iova_start, + &total_size, &npages, &shift, &page_list); + if (ret) + goto err; + + ret = iwch_alloc_pbl(mhp, npages); + if (ret) { + kfree(page_list); + goto err_pbl; + } + + ret = iwch_write_pbl(mhp, page_list, npages, 0); + kfree(page_list); + if (ret) + goto err_pbl; + + mhp->attr.pdid = php->pdid; + mhp->attr.zbva = 0; + + mhp->attr.perms = iwch_ib_to_tpt_access(acc); + mhp->attr.va_fbo = *iova_start; + mhp->attr.page_size = shift - 12; + + mhp->attr.len = (u32) total_size; + mhp->attr.pbl_size = npages; + ret = iwch_register_mem(rhp, php, mhp, shift); + if (ret) + goto err_pbl; + + return &mhp->ibmr; + +err_pbl: + iwch_free_pbl(mhp); + +err: + kfree(mhp); + return ERR_PTR(ret); + +} + +static int iwch_reregister_phys_mem(struct ib_mr *mr, + int mr_rereg_mask, + struct ib_pd *pd, + struct ib_phys_buf *buffer_list, + int num_phys_buf, + int acc, u64 * iova_start) +{ + + struct iwch_mr mh, *mhp; + 
struct iwch_pd *php; + struct iwch_dev *rhp; + __be64 *page_list = NULL; + int shift = 0; + u64 total_size; + int npages = 0; + int ret; + + PDBG("%s ib_mr %p ib_pd %p\n", __func__, mr, pd); + + /* There can be no memory windows */ + if (atomic_read(&mr->usecnt)) + return -EINVAL; + + mhp = to_iwch_mr(mr); + rhp = mhp->rhp; + php = to_iwch_pd(mr->pd); + + /* make sure we are on the same adapter */ + if (rhp != php->rhp) + return -EINVAL; + + memcpy(&mh, mhp, sizeof *mhp); + + if (mr_rereg_mask & IB_MR_REREG_PD) + php = to_iwch_pd(pd); + if (mr_rereg_mask & IB_MR_REREG_ACCESS) + mh.attr.perms = iwch_ib_to_tpt_access(acc); + if (mr_rereg_mask & IB_MR_REREG_TRANS) { + ret = build_phys_page_list(buffer_list, num_phys_buf, + iova_start, + &total_size, &npages, + &shift, &page_list); + if (ret) + return ret; + } + + ret = iwch_reregister_mem(rhp, php, &mh, shift, npages); + kfree(page_list); + if (ret) { + return ret; + } + if (mr_rereg_mask & IB_MR_REREG_PD) + mhp->attr.pdid = php->pdid; + if (mr_rereg_mask & IB_MR_REREG_ACCESS) + mhp->attr.perms = iwch_ib_to_tpt_access(acc); + if (mr_rereg_mask & IB_MR_REREG_TRANS) { + mhp->attr.zbva = 0; + mhp->attr.va_fbo = *iova_start; + mhp->attr.page_size = shift - 12; + mhp->attr.len = (u32) total_size; + mhp->attr.pbl_size = npages; + } + + return 0; +} + + +static struct ib_mr *iwch_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, + u64 virt, int acc, struct ib_udata *udata) +{ + __be64 *pages; + int shift, n, len; + int i, k, entry; + int err = 0; + struct iwch_dev *rhp; + struct iwch_pd *php; + struct iwch_mr *mhp; + struct iwch_reg_user_mr_resp uresp; + struct scatterlist *sg; + PDBG("%s ib_pd %p\n", __func__, pd); + + php = to_iwch_pd(pd); + rhp = php->rhp; + mhp = kzalloc(sizeof(*mhp), GFP_KERNEL); + if (!mhp) + return ERR_PTR(-ENOMEM); + + mhp->rhp = rhp; + + mhp->umem = ib_umem_get(pd->uobject->context, start, length, acc, 0); + if (IS_ERR(mhp->umem)) { + err = PTR_ERR(mhp->umem); + kfree(mhp); + return ERR_PTR(err); + } + + shift = ffs(mhp->umem->page_size) - 1; + + n = mhp->umem->nmap; + + err = iwch_alloc_pbl(mhp, n); + if (err) + goto err; + + pages = (__be64 *) __get_free_page(GFP_KERNEL); + if (!pages) { + err = -ENOMEM; + goto err_pbl; + } + + i = n = 0; + + for_each_sg(mhp->umem->sg_head.sgl, sg, mhp->umem->nmap, entry) { + len = sg_dma_len(sg) >> shift; + for (k = 0; k < len; ++k) { + pages[i++] = cpu_to_be64(sg_dma_address(sg) + + mhp->umem->page_size * k); + if (i == PAGE_SIZE / sizeof *pages) { + err = iwch_write_pbl(mhp, pages, i, n); + if (err) + goto pbl_done; + n += i; + i = 0; + } + } + } + + if (i) + err = iwch_write_pbl(mhp, pages, i, n); + +pbl_done: + free_page((unsigned long) pages); + if (err) + goto err_pbl; + + mhp->attr.pdid = php->pdid; + mhp->attr.zbva = 0; + mhp->attr.perms = iwch_ib_to_tpt_access(acc); + mhp->attr.va_fbo = virt; + mhp->attr.page_size = shift - 12; + mhp->attr.len = (u32) length; + + err = iwch_register_mem(rhp, php, mhp, shift); + if (err) + goto err_pbl; + + if (udata && !t3a_device(rhp)) { + uresp.pbl_addr = (mhp->attr.pbl_addr - + rhp->rdev.rnic_info.pbl_base) >> 3; + PDBG("%s user resp pbl_addr 0x%x\n", __func__, + uresp.pbl_addr); + + if (ib_copy_to_udata(udata, &uresp, sizeof (uresp))) { + iwch_dereg_mr(&mhp->ibmr); + err = -EFAULT; + goto err; + } + } + + return &mhp->ibmr; + +err_pbl: + iwch_free_pbl(mhp); + +err: + ib_umem_release(mhp->umem); + kfree(mhp); + return ERR_PTR(err); +} + +static struct ib_mr *iwch_get_dma_mr(struct ib_pd *pd, int acc) +{ + struct ib_phys_buf bl; + u64 kva; 
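The user-memory registration path in iwch_reg_user_mr() above writes the adapter's physical buffer list in chunks: DMA addresses taken from the umem scatterlist are packed into one page-sized staging buffer and pushed out with iwch_write_pbl() each time the buffer fills, with a final flush for the remainder. Below is a self-contained sketch of that chunked-flush pattern; write_one_chunk(), STAGE_ENTRIES and the synthetic addresses are illustrative assumptions, not the driver's API.

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

#define STAGE_ENTRIES 512	/* one 4 KB page of 8-byte PBL entries */

/* Stand-in for iwch_write_pbl(): pretend to push n entries to the
 * adapter's PBL starting at the given entry offset. */
static int write_one_chunk(const uint64_t *chunk, int n, int offset)
{
	printf("flush %d entries at pbl offset %d\n", n, offset);
	return 0;
}

int main(void)
{
	uint64_t stage[STAGE_ENTRIES];
	int i = 0, written = 0;

	/* pack synthetic 4 KB page addresses, flushing whenever the
	 * staging page fills, much as the registration loop above does */
	for (int page = 0; page < 1200; page++) {
		stage[i++] = 0x100000ULL + ((uint64_t)page << 12);
		if (i == STAGE_ENTRIES) {
			if (write_one_chunk(stage, i, written))
				return EXIT_FAILURE;
			written += i;
			i = 0;
		}
	}
	if (i && write_one_chunk(stage, i, written))
		return EXIT_FAILURE;
	return 0;
}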
+ struct ib_mr *ibmr; + + PDBG("%s ib_pd %p\n", __func__, pd); + + /* + * T3 only supports 32 bits of size. + */ + bl.size = 0xffffffff; + bl.addr = 0; + kva = 0; + ibmr = iwch_register_phys_mem(pd, &bl, 1, acc, &kva); + return ibmr; +} + +static struct ib_mw *iwch_alloc_mw(struct ib_pd *pd, enum ib_mw_type type) +{ + struct iwch_dev *rhp; + struct iwch_pd *php; + struct iwch_mw *mhp; + u32 mmid; + u32 stag = 0; + int ret; + + if (type != IB_MW_TYPE_1) + return ERR_PTR(-EINVAL); + + php = to_iwch_pd(pd); + rhp = php->rhp; + mhp = kzalloc(sizeof(*mhp), GFP_KERNEL); + if (!mhp) + return ERR_PTR(-ENOMEM); + ret = cxio_allocate_window(&rhp->rdev, &stag, php->pdid); + if (ret) { + kfree(mhp); + return ERR_PTR(ret); + } + mhp->rhp = rhp; + mhp->attr.pdid = php->pdid; + mhp->attr.type = TPT_MW; + mhp->attr.stag = stag; + mmid = (stag) >> 8; + mhp->ibmw.rkey = stag; + if (insert_handle(rhp, &rhp->mmidr, mhp, mmid)) { + cxio_deallocate_window(&rhp->rdev, mhp->attr.stag); + kfree(mhp); + return ERR_PTR(-ENOMEM); + } + PDBG("%s mmid 0x%x mhp %p stag 0x%x\n", __func__, mmid, mhp, stag); + return &(mhp->ibmw); +} + +static int iwch_dealloc_mw(struct ib_mw *mw) +{ + struct iwch_dev *rhp; + struct iwch_mw *mhp; + u32 mmid; + + mhp = to_iwch_mw(mw); + rhp = mhp->rhp; + mmid = (mw->rkey) >> 8; + cxio_deallocate_window(&rhp->rdev, mhp->attr.stag); + remove_handle(rhp, &rhp->mmidr, mmid); + PDBG("%s ib_mw %p mmid 0x%x ptr %p\n", __func__, mw, mmid, mhp); + kfree(mhp); + return 0; +} + +static struct ib_mr *iwch_alloc_fast_reg_mr(struct ib_pd *pd, int pbl_depth) +{ + struct iwch_dev *rhp; + struct iwch_pd *php; + struct iwch_mr *mhp; + u32 mmid; + u32 stag = 0; + int ret = 0; + + php = to_iwch_pd(pd); + rhp = php->rhp; + mhp = kzalloc(sizeof(*mhp), GFP_KERNEL); + if (!mhp) + goto err; + + mhp->rhp = rhp; + ret = iwch_alloc_pbl(mhp, pbl_depth); + if (ret) + goto err1; + mhp->attr.pbl_size = pbl_depth; + ret = cxio_allocate_stag(&rhp->rdev, &stag, php->pdid, + mhp->attr.pbl_size, mhp->attr.pbl_addr); + if (ret) + goto err2; + mhp->attr.pdid = php->pdid; + mhp->attr.type = TPT_NON_SHARED_MR; + mhp->attr.stag = stag; + mhp->attr.state = 1; + mmid = (stag) >> 8; + mhp->ibmr.rkey = mhp->ibmr.lkey = stag; + if (insert_handle(rhp, &rhp->mmidr, mhp, mmid)) + goto err3; + + PDBG("%s mmid 0x%x mhp %p stag 0x%x\n", __func__, mmid, mhp, stag); + return &(mhp->ibmr); +err3: + cxio_dereg_mem(&rhp->rdev, stag, mhp->attr.pbl_size, + mhp->attr.pbl_addr); +err2: + iwch_free_pbl(mhp); +err1: + kfree(mhp); +err: + return ERR_PTR(ret); +} + +static struct ib_fast_reg_page_list *iwch_alloc_fastreg_pbl( + struct ib_device *device, + int page_list_len) +{ + struct ib_fast_reg_page_list *page_list; + + page_list = kmalloc(sizeof *page_list + page_list_len * sizeof(u64), + GFP_KERNEL); + if (!page_list) + return ERR_PTR(-ENOMEM); + + page_list->page_list = (u64 *)(page_list + 1); + page_list->max_page_list_len = page_list_len; + + return page_list; +} + +static void iwch_free_fastreg_pbl(struct ib_fast_reg_page_list *page_list) +{ + kfree(page_list); +} + +static int iwch_destroy_qp(struct ib_qp *ib_qp) +{ + struct iwch_dev *rhp; + struct iwch_qp *qhp; + struct iwch_qp_attributes attrs; + struct iwch_ucontext *ucontext; + + qhp = to_iwch_qp(ib_qp); + rhp = qhp->rhp; + + attrs.next_state = IWCH_QP_STATE_ERROR; + iwch_modify_qp(rhp, qhp, IWCH_QP_ATTR_NEXT_STATE, &attrs, 0); + wait_event(qhp->wait, !qhp->ep); + + remove_handle(rhp, &rhp->qpidr, qhp->wq.qpid); + + atomic_dec(&qhp->refcnt); + wait_event(qhp->wait, 
!atomic_read(&qhp->refcnt)); + + ucontext = ib_qp->uobject ? to_iwch_ucontext(ib_qp->uobject->context) + : NULL; + cxio_destroy_qp(&rhp->rdev, &qhp->wq, + ucontext ? &ucontext->uctx : &rhp->rdev.uctx); + + PDBG("%s ib_qp %p qpid 0x%0x qhp %p\n", __func__, + ib_qp, qhp->wq.qpid, qhp); + kfree(qhp); + return 0; +} + +static struct ib_qp *iwch_create_qp(struct ib_pd *pd, + struct ib_qp_init_attr *attrs, + struct ib_udata *udata) +{ + struct iwch_dev *rhp; + struct iwch_qp *qhp; + struct iwch_pd *php; + struct iwch_cq *schp; + struct iwch_cq *rchp; + struct iwch_create_qp_resp uresp; + int wqsize, sqsize, rqsize; + struct iwch_ucontext *ucontext; + + PDBG("%s ib_pd %p\n", __func__, pd); + if (attrs->qp_type != IB_QPT_RC) + return ERR_PTR(-EINVAL); + php = to_iwch_pd(pd); + rhp = php->rhp; + schp = get_chp(rhp, ((struct iwch_cq *) attrs->send_cq)->cq.cqid); + rchp = get_chp(rhp, ((struct iwch_cq *) attrs->recv_cq)->cq.cqid); + if (!schp || !rchp) + return ERR_PTR(-EINVAL); + + /* The RQT size must be # of entries + 1 rounded up to a power of two */ + rqsize = roundup_pow_of_two(attrs->cap.max_recv_wr); + if (rqsize == attrs->cap.max_recv_wr) + rqsize = roundup_pow_of_two(attrs->cap.max_recv_wr+1); + + /* T3 doesn't support RQT depth < 16 */ + if (rqsize < 16) + rqsize = 16; + + if (rqsize > T3_MAX_RQ_SIZE) + return ERR_PTR(-EINVAL); + + if (attrs->cap.max_inline_data > T3_MAX_INLINE) + return ERR_PTR(-EINVAL); + + /* + * NOTE: The SQ and total WQ sizes don't need to be + * a power of two. However, all the code assumes + * they are. EG: Q_FREECNT() and friends. + */ + sqsize = roundup_pow_of_two(attrs->cap.max_send_wr); + wqsize = roundup_pow_of_two(rqsize + sqsize); + + /* + * Kernel users need more wq space for fastreg WRs which can take + * 2 WR fragments. + */ + ucontext = pd->uobject ? to_iwch_ucontext(pd->uobject->context) : NULL; + if (!ucontext && wqsize < (rqsize + (2 * sqsize))) + wqsize = roundup_pow_of_two(rqsize + + roundup_pow_of_two(attrs->cap.max_send_wr * 2)); + PDBG("%s wqsize %d sqsize %d rqsize %d\n", __func__, + wqsize, sqsize, rqsize); + qhp = kzalloc(sizeof(*qhp), GFP_KERNEL); + if (!qhp) + return ERR_PTR(-ENOMEM); + qhp->wq.size_log2 = ilog2(wqsize); + qhp->wq.rq_size_log2 = ilog2(rqsize); + qhp->wq.sq_size_log2 = ilog2(sqsize); + if (cxio_create_qp(&rhp->rdev, !udata, &qhp->wq, + ucontext ? &ucontext->uctx : &rhp->rdev.uctx)) { + kfree(qhp); + return ERR_PTR(-ENOMEM); + } + + attrs->cap.max_recv_wr = rqsize - 1; + attrs->cap.max_send_wr = sqsize; + attrs->cap.max_inline_data = T3_MAX_INLINE; + + qhp->rhp = rhp; + qhp->attr.pd = php->pdid; + qhp->attr.scq = ((struct iwch_cq *) attrs->send_cq)->cq.cqid; + qhp->attr.rcq = ((struct iwch_cq *) attrs->recv_cq)->cq.cqid; + qhp->attr.sq_num_entries = attrs->cap.max_send_wr; + qhp->attr.rq_num_entries = attrs->cap.max_recv_wr; + qhp->attr.sq_max_sges = attrs->cap.max_send_sge; + qhp->attr.sq_max_sges_rdma_write = attrs->cap.max_send_sge; + qhp->attr.rq_max_sges = attrs->cap.max_recv_sge; + qhp->attr.state = IWCH_QP_STATE_IDLE; + qhp->attr.next_state = IWCH_QP_STATE_IDLE; + + /* + * XXX - These don't get passed in from the openib user + * at create time. The CM sets them via a QP modify. + * Need to fix... 
I think the CM should + */ + qhp->attr.enable_rdma_read = 1; + qhp->attr.enable_rdma_write = 1; + qhp->attr.enable_bind = 1; + qhp->attr.max_ord = 1; + qhp->attr.max_ird = 1; + + spin_lock_init(&qhp->lock); + init_waitqueue_head(&qhp->wait); + atomic_set(&qhp->refcnt, 1); + + if (insert_handle(rhp, &rhp->qpidr, qhp, qhp->wq.qpid)) { + cxio_destroy_qp(&rhp->rdev, &qhp->wq, + ucontext ? &ucontext->uctx : &rhp->rdev.uctx); + kfree(qhp); + return ERR_PTR(-ENOMEM); + } + + if (udata) { + + struct iwch_mm_entry *mm1, *mm2; + + mm1 = kmalloc(sizeof *mm1, GFP_KERNEL); + if (!mm1) { + iwch_destroy_qp(&qhp->ibqp); + return ERR_PTR(-ENOMEM); + } + + mm2 = kmalloc(sizeof *mm2, GFP_KERNEL); + if (!mm2) { + kfree(mm1); + iwch_destroy_qp(&qhp->ibqp); + return ERR_PTR(-ENOMEM); + } + + uresp.qpid = qhp->wq.qpid; + uresp.size_log2 = qhp->wq.size_log2; + uresp.sq_size_log2 = qhp->wq.sq_size_log2; + uresp.rq_size_log2 = qhp->wq.rq_size_log2; + spin_lock(&ucontext->mmap_lock); + uresp.key = ucontext->key; + ucontext->key += PAGE_SIZE; + uresp.db_key = ucontext->key; + ucontext->key += PAGE_SIZE; + spin_unlock(&ucontext->mmap_lock); + if (ib_copy_to_udata(udata, &uresp, sizeof (uresp))) { + kfree(mm1); + kfree(mm2); + iwch_destroy_qp(&qhp->ibqp); + return ERR_PTR(-EFAULT); + } + mm1->key = uresp.key; + mm1->addr = virt_to_phys(qhp->wq.queue); + mm1->len = PAGE_ALIGN(wqsize * sizeof (union t3_wr)); + insert_mmap(ucontext, mm1); + mm2->key = uresp.db_key; + mm2->addr = qhp->wq.udb & PAGE_MASK; + mm2->len = PAGE_SIZE; + insert_mmap(ucontext, mm2); + } + qhp->ibqp.qp_num = qhp->wq.qpid; + init_timer(&(qhp->timer)); + PDBG("%s sq_num_entries %d, rq_num_entries %d " + "qpid 0x%0x qhp %p dma_addr 0x%llx size %d rq_addr 0x%x\n", + __func__, qhp->attr.sq_num_entries, qhp->attr.rq_num_entries, + qhp->wq.qpid, qhp, (unsigned long long) qhp->wq.dma_addr, + 1 << qhp->wq.size_log2, qhp->wq.rq_addr); + return &qhp->ibqp; +} + +static int iwch_ib_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, + int attr_mask, struct ib_udata *udata) +{ + struct iwch_dev *rhp; + struct iwch_qp *qhp; + enum iwch_qp_attr_mask mask = 0; + struct iwch_qp_attributes attrs; + + PDBG("%s ib_qp %p\n", __func__, ibqp); + + /* iwarp does not support the RTR state */ + if ((attr_mask & IB_QP_STATE) && (attr->qp_state == IB_QPS_RTR)) + attr_mask &= ~IB_QP_STATE; + + /* Make sure we still have something left to do */ + if (!attr_mask) + return 0; + + memset(&attrs, 0, sizeof attrs); + qhp = to_iwch_qp(ibqp); + rhp = qhp->rhp; + + attrs.next_state = iwch_convert_state(attr->qp_state); + attrs.enable_rdma_read = (attr->qp_access_flags & + IB_ACCESS_REMOTE_READ) ? 1 : 0; + attrs.enable_rdma_write = (attr->qp_access_flags & + IB_ACCESS_REMOTE_WRITE) ? 1 : 0; + attrs.enable_bind = (attr->qp_access_flags & IB_ACCESS_MW_BIND) ? 1 : 0; + + + mask |= (attr_mask & IB_QP_STATE) ? IWCH_QP_ATTR_NEXT_STATE : 0; + mask |= (attr_mask & IB_QP_ACCESS_FLAGS) ? 
+ (IWCH_QP_ATTR_ENABLE_RDMA_READ | + IWCH_QP_ATTR_ENABLE_RDMA_WRITE | + IWCH_QP_ATTR_ENABLE_RDMA_BIND) : 0; + + return iwch_modify_qp(rhp, qhp, mask, &attrs, 0); +} + +void iwch_qp_add_ref(struct ib_qp *qp) +{ + PDBG("%s ib_qp %p\n", __func__, qp); + atomic_inc(&(to_iwch_qp(qp)->refcnt)); +} + +void iwch_qp_rem_ref(struct ib_qp *qp) +{ + PDBG("%s ib_qp %p\n", __func__, qp); + if (atomic_dec_and_test(&(to_iwch_qp(qp)->refcnt))) + wake_up(&(to_iwch_qp(qp)->wait)); +} + +static struct ib_qp *iwch_get_qp(struct ib_device *dev, int qpn) +{ + PDBG("%s ib_dev %p qpn 0x%x\n", __func__, dev, qpn); + return (struct ib_qp *)get_qhp(to_iwch_dev(dev), qpn); +} + + +static int iwch_query_pkey(struct ib_device *ibdev, + u8 port, u16 index, u16 * pkey) +{ + PDBG("%s ibdev %p\n", __func__, ibdev); + *pkey = 0; + return 0; +} + +static int iwch_query_gid(struct ib_device *ibdev, u8 port, + int index, union ib_gid *gid) +{ + struct iwch_dev *dev; + + PDBG("%s ibdev %p, port %d, index %d, gid %p\n", + __func__, ibdev, port, index, gid); + dev = to_iwch_dev(ibdev); + BUG_ON(port == 0 || port > 2); + memset(&(gid->raw[0]), 0, sizeof(gid->raw)); + memcpy(&(gid->raw[0]), dev->rdev.port_info.lldevs[port-1]->dev_addr, 6); + return 0; +} + +static u64 fw_vers_string_to_u64(struct iwch_dev *iwch_dev) +{ + struct ethtool_drvinfo info; + struct net_device *lldev = iwch_dev->rdev.t3cdev_p->lldev; + char *cp, *next; + unsigned fw_maj, fw_min, fw_mic; + + lldev->ethtool_ops->get_drvinfo(lldev, &info); + + next = info.fw_version + 1; + cp = strsep(&next, "."); + sscanf(cp, "%i", &fw_maj); + cp = strsep(&next, "."); + sscanf(cp, "%i", &fw_min); + cp = strsep(&next, "."); + sscanf(cp, "%i", &fw_mic); + + return (((u64)fw_maj & 0xffff) << 32) | ((fw_min & 0xffff) << 16) | + (fw_mic & 0xffff); +} + +static int iwch_query_device(struct ib_device *ibdev, + struct ib_device_attr *props) +{ + + struct iwch_dev *dev; + PDBG("%s ibdev %p\n", __func__, ibdev); + + dev = to_iwch_dev(ibdev); + memset(props, 0, sizeof *props); + memcpy(&props->sys_image_guid, dev->rdev.t3cdev_p->lldev->dev_addr, 6); + props->hw_ver = dev->rdev.t3cdev_p->type; + props->fw_ver = fw_vers_string_to_u64(dev); + props->device_cap_flags = dev->device_cap_flags; + props->page_size_cap = dev->attr.mem_pgsizes_bitmask; + props->vendor_id = (u32)dev->rdev.rnic_info.pdev->vendor; + props->vendor_part_id = (u32)dev->rdev.rnic_info.pdev->device; + props->max_mr_size = dev->attr.max_mr_size; + props->max_qp = dev->attr.max_qps; + props->max_qp_wr = dev->attr.max_wrs; + props->max_sge = dev->attr.max_sge_per_wr; + props->max_sge_rd = 1; + props->max_qp_rd_atom = dev->attr.max_rdma_reads_per_qp; + props->max_qp_init_rd_atom = dev->attr.max_rdma_reads_per_qp; + props->max_cq = dev->attr.max_cqs; + props->max_cqe = dev->attr.max_cqes_per_cq; + props->max_mr = dev->attr.max_mem_regs; + props->max_pd = dev->attr.max_pds; + props->local_ca_ack_delay = 0; + props->max_fast_reg_page_list_len = T3_MAX_FASTREG_DEPTH; + + return 0; +} + +static int iwch_query_port(struct ib_device *ibdev, + u8 port, struct ib_port_attr *props) +{ + struct iwch_dev *dev; + struct net_device *netdev; + struct in_device *inetdev; + + PDBG("%s ibdev %p\n", __func__, ibdev); + + dev = to_iwch_dev(ibdev); + netdev = dev->rdev.port_info.lldevs[port-1]; + + memset(props, 0, sizeof(struct ib_port_attr)); + props->max_mtu = IB_MTU_4096; + if (netdev->mtu >= 4096) + props->active_mtu = IB_MTU_4096; + else if (netdev->mtu >= 2048) + props->active_mtu = IB_MTU_2048; + else if (netdev->mtu >= 1024) + 
props->active_mtu = IB_MTU_1024; + else if (netdev->mtu >= 512) + props->active_mtu = IB_MTU_512; + else + props->active_mtu = IB_MTU_256; + + if (!netif_carrier_ok(netdev)) + props->state = IB_PORT_DOWN; + else { + inetdev = in_dev_get(netdev); + if (inetdev) { + if (inetdev->ifa_list) + props->state = IB_PORT_ACTIVE; + else + props->state = IB_PORT_INIT; + in_dev_put(inetdev); + } else + props->state = IB_PORT_INIT; + } + + props->port_cap_flags = + IB_PORT_CM_SUP | + IB_PORT_SNMP_TUNNEL_SUP | + IB_PORT_REINIT_SUP | + IB_PORT_DEVICE_MGMT_SUP | + IB_PORT_VENDOR_CLASS_SUP | IB_PORT_BOOT_MGMT_SUP; + props->gid_tbl_len = 1; + props->pkey_tbl_len = 1; + props->active_width = 2; + props->active_speed = IB_SPEED_DDR; + props->max_msg_sz = -1; + + return 0; +} + +static ssize_t show_rev(struct device *dev, struct device_attribute *attr, + char *buf) +{ + struct iwch_dev *iwch_dev = container_of(dev, struct iwch_dev, + ibdev.dev); + PDBG("%s dev 0x%p\n", __func__, dev); + return sprintf(buf, "%d\n", iwch_dev->rdev.t3cdev_p->type); +} + +static ssize_t show_fw_ver(struct device *dev, struct device_attribute *attr, char *buf) +{ + struct iwch_dev *iwch_dev = container_of(dev, struct iwch_dev, + ibdev.dev); + struct ethtool_drvinfo info; + struct net_device *lldev = iwch_dev->rdev.t3cdev_p->lldev; + + PDBG("%s dev 0x%p\n", __func__, dev); + lldev->ethtool_ops->get_drvinfo(lldev, &info); + return sprintf(buf, "%s\n", info.fw_version); +} + +static ssize_t show_hca(struct device *dev, struct device_attribute *attr, + char *buf) +{ + struct iwch_dev *iwch_dev = container_of(dev, struct iwch_dev, + ibdev.dev); + struct ethtool_drvinfo info; + struct net_device *lldev = iwch_dev->rdev.t3cdev_p->lldev; + + PDBG("%s dev 0x%p\n", __func__, dev); + lldev->ethtool_ops->get_drvinfo(lldev, &info); + return sprintf(buf, "%s\n", info.driver); +} + +static ssize_t show_board(struct device *dev, struct device_attribute *attr, + char *buf) +{ + struct iwch_dev *iwch_dev = container_of(dev, struct iwch_dev, + ibdev.dev); + PDBG("%s dev 0x%p\n", __func__, dev); + return sprintf(buf, "%x.%x\n", iwch_dev->rdev.rnic_info.pdev->vendor, + iwch_dev->rdev.rnic_info.pdev->device); +} + +static int iwch_get_mib(struct ib_device *ibdev, + union rdma_protocol_stats *stats) +{ + struct iwch_dev *dev; + struct tp_mib_stats m; + int ret; + + PDBG("%s ibdev %p\n", __func__, ibdev); + dev = to_iwch_dev(ibdev); + ret = dev->rdev.t3cdev_p->ctl(dev->rdev.t3cdev_p, RDMA_GET_MIB, &m); + if (ret) + return -ENOSYS; + + memset(stats, 0, sizeof *stats); + stats->iw.ipInReceives = ((u64) m.ipInReceive_hi << 32) + + m.ipInReceive_lo; + stats->iw.ipInHdrErrors = ((u64) m.ipInHdrErrors_hi << 32) + + m.ipInHdrErrors_lo; + stats->iw.ipInAddrErrors = ((u64) m.ipInAddrErrors_hi << 32) + + m.ipInAddrErrors_lo; + stats->iw.ipInUnknownProtos = ((u64) m.ipInUnknownProtos_hi << 32) + + m.ipInUnknownProtos_lo; + stats->iw.ipInDiscards = ((u64) m.ipInDiscards_hi << 32) + + m.ipInDiscards_lo; + stats->iw.ipInDelivers = ((u64) m.ipInDelivers_hi << 32) + + m.ipInDelivers_lo; + stats->iw.ipOutRequests = ((u64) m.ipOutRequests_hi << 32) + + m.ipOutRequests_lo; + stats->iw.ipOutDiscards = ((u64) m.ipOutDiscards_hi << 32) + + m.ipOutDiscards_lo; + stats->iw.ipOutNoRoutes = ((u64) m.ipOutNoRoutes_hi << 32) + + m.ipOutNoRoutes_lo; + stats->iw.ipReasmTimeout = (u64) m.ipReasmTimeout; + stats->iw.ipReasmReqds = (u64) m.ipReasmReqds; + stats->iw.ipReasmOKs = (u64) m.ipReasmOKs; + stats->iw.ipReasmFails = (u64) m.ipReasmFails; + stats->iw.tcpActiveOpens = (u64) 
m.tcpActiveOpens; + stats->iw.tcpPassiveOpens = (u64) m.tcpPassiveOpens; + stats->iw.tcpAttemptFails = (u64) m.tcpAttemptFails; + stats->iw.tcpEstabResets = (u64) m.tcpEstabResets; + stats->iw.tcpOutRsts = (u64) m.tcpOutRsts; + stats->iw.tcpCurrEstab = (u64) m.tcpCurrEstab; + stats->iw.tcpInSegs = ((u64) m.tcpInSegs_hi << 32) + + m.tcpInSegs_lo; + stats->iw.tcpOutSegs = ((u64) m.tcpOutSegs_hi << 32) + + m.tcpOutSegs_lo; + stats->iw.tcpRetransSegs = ((u64) m.tcpRetransSeg_hi << 32) + + m.tcpRetransSeg_lo; + stats->iw.tcpInErrs = ((u64) m.tcpInErrs_hi << 32) + + m.tcpInErrs_lo; + stats->iw.tcpRtoMin = (u64) m.tcpRtoMin; + stats->iw.tcpRtoMax = (u64) m.tcpRtoMax; + return 0; +} + +static DEVICE_ATTR(hw_rev, S_IRUGO, show_rev, NULL); +static DEVICE_ATTR(fw_ver, S_IRUGO, show_fw_ver, NULL); +static DEVICE_ATTR(hca_type, S_IRUGO, show_hca, NULL); +static DEVICE_ATTR(board_id, S_IRUGO, show_board, NULL); + +static struct device_attribute *iwch_class_attributes[] = { + &dev_attr_hw_rev, + &dev_attr_fw_ver, + &dev_attr_hca_type, + &dev_attr_board_id, +}; + +int iwch_register_device(struct iwch_dev *dev) +{ + int ret; + int i; + + PDBG("%s iwch_dev %p\n", __func__, dev); + strlcpy(dev->ibdev.name, "cxgb3_%d", IB_DEVICE_NAME_MAX); + memset(&dev->ibdev.node_guid, 0, sizeof(dev->ibdev.node_guid)); + memcpy(&dev->ibdev.node_guid, dev->rdev.t3cdev_p->lldev->dev_addr, 6); + dev->ibdev.owner = THIS_MODULE; + dev->device_cap_flags = IB_DEVICE_LOCAL_DMA_LKEY | + IB_DEVICE_MEM_WINDOW | + IB_DEVICE_MEM_MGT_EXTENSIONS; + + /* cxgb3 supports STag 0. */ + dev->ibdev.local_dma_lkey = 0; + + dev->ibdev.uverbs_cmd_mask = + (1ull << IB_USER_VERBS_CMD_GET_CONTEXT) | + (1ull << IB_USER_VERBS_CMD_QUERY_DEVICE) | + (1ull << IB_USER_VERBS_CMD_QUERY_PORT) | + (1ull << IB_USER_VERBS_CMD_ALLOC_PD) | + (1ull << IB_USER_VERBS_CMD_DEALLOC_PD) | + (1ull << IB_USER_VERBS_CMD_REG_MR) | + (1ull << IB_USER_VERBS_CMD_DEREG_MR) | + (1ull << IB_USER_VERBS_CMD_CREATE_COMP_CHANNEL) | + (1ull << IB_USER_VERBS_CMD_CREATE_CQ) | + (1ull << IB_USER_VERBS_CMD_DESTROY_CQ) | + (1ull << IB_USER_VERBS_CMD_REQ_NOTIFY_CQ) | + (1ull << IB_USER_VERBS_CMD_CREATE_QP) | + (1ull << IB_USER_VERBS_CMD_MODIFY_QP) | + (1ull << IB_USER_VERBS_CMD_POLL_CQ) | + (1ull << IB_USER_VERBS_CMD_DESTROY_QP) | + (1ull << IB_USER_VERBS_CMD_POST_SEND) | + (1ull << IB_USER_VERBS_CMD_POST_RECV); + dev->ibdev.node_type = RDMA_NODE_RNIC; + memcpy(dev->ibdev.node_desc, IWCH_NODE_DESC, sizeof(IWCH_NODE_DESC)); + dev->ibdev.phys_port_cnt = dev->rdev.port_info.nports; + dev->ibdev.num_comp_vectors = 1; + dev->ibdev.dma_device = &(dev->rdev.rnic_info.pdev->dev); + dev->ibdev.query_device = iwch_query_device; + dev->ibdev.query_port = iwch_query_port; + dev->ibdev.query_pkey = iwch_query_pkey; + dev->ibdev.query_gid = iwch_query_gid; + dev->ibdev.alloc_ucontext = iwch_alloc_ucontext; + dev->ibdev.dealloc_ucontext = iwch_dealloc_ucontext; + dev->ibdev.mmap = iwch_mmap; + dev->ibdev.alloc_pd = iwch_allocate_pd; + dev->ibdev.dealloc_pd = iwch_deallocate_pd; + dev->ibdev.create_ah = iwch_ah_create; + dev->ibdev.destroy_ah = iwch_ah_destroy; + dev->ibdev.create_qp = iwch_create_qp; + dev->ibdev.modify_qp = iwch_ib_modify_qp; + dev->ibdev.destroy_qp = iwch_destroy_qp; + dev->ibdev.create_cq = iwch_create_cq; + dev->ibdev.destroy_cq = iwch_destroy_cq; + dev->ibdev.resize_cq = iwch_resize_cq; + dev->ibdev.poll_cq = iwch_poll_cq; + dev->ibdev.get_dma_mr = iwch_get_dma_mr; + dev->ibdev.reg_phys_mr = iwch_register_phys_mem; + dev->ibdev.rereg_phys_mr = iwch_reregister_phys_mem; + 
dev->ibdev.reg_user_mr = iwch_reg_user_mr; + dev->ibdev.dereg_mr = iwch_dereg_mr; + dev->ibdev.alloc_mw = iwch_alloc_mw; + dev->ibdev.bind_mw = iwch_bind_mw; + dev->ibdev.dealloc_mw = iwch_dealloc_mw; + dev->ibdev.alloc_fast_reg_mr = iwch_alloc_fast_reg_mr; + dev->ibdev.alloc_fast_reg_page_list = iwch_alloc_fastreg_pbl; + dev->ibdev.free_fast_reg_page_list = iwch_free_fastreg_pbl; + dev->ibdev.attach_mcast = iwch_multicast_attach; + dev->ibdev.detach_mcast = iwch_multicast_detach; + dev->ibdev.process_mad = iwch_process_mad; + dev->ibdev.req_notify_cq = iwch_arm_cq; + dev->ibdev.post_send = iwch_post_send; + dev->ibdev.post_recv = iwch_post_receive; + dev->ibdev.get_protocol_stats = iwch_get_mib; + dev->ibdev.uverbs_abi_ver = IWCH_UVERBS_ABI_VERSION; + + dev->ibdev.iwcm = kmalloc(sizeof(struct iw_cm_verbs), GFP_KERNEL); + if (!dev->ibdev.iwcm) + return -ENOMEM; + + dev->ibdev.iwcm->connect = iwch_connect; + dev->ibdev.iwcm->accept = iwch_accept_cr; + dev->ibdev.iwcm->reject = iwch_reject_cr; + dev->ibdev.iwcm->create_listen = iwch_create_listen; + dev->ibdev.iwcm->destroy_listen = iwch_destroy_listen; + dev->ibdev.iwcm->add_ref = iwch_qp_add_ref; + dev->ibdev.iwcm->rem_ref = iwch_qp_rem_ref; + dev->ibdev.iwcm->get_qp = iwch_get_qp; + + ret = ib_register_device(&dev->ibdev, NULL); + if (ret) + goto bail1; + + for (i = 0; i < ARRAY_SIZE(iwch_class_attributes); ++i) { + ret = device_create_file(&dev->ibdev.dev, + iwch_class_attributes[i]); + if (ret) { + goto bail2; + } + } + return 0; +bail2: + ib_unregister_device(&dev->ibdev); +bail1: + kfree(dev->ibdev.iwcm); + return ret; +} + +void iwch_unregister_device(struct iwch_dev *dev) +{ + int i; + + PDBG("%s iwch_dev %p\n", __func__, dev); + for (i = 0; i < ARRAY_SIZE(iwch_class_attributes); ++i) + device_remove_file(&dev->ibdev.dev, + iwch_class_attributes[i]); + ib_unregister_device(&dev->ibdev); + kfree(dev->ibdev.iwcm); + return; +} diff --git a/kernel/drivers/infiniband/hw/cxgb3/iwch_provider.h b/kernel/drivers/infiniband/hw/cxgb3/iwch_provider.h new file mode 100644 index 000000000..87c14b0c5 --- /dev/null +++ b/kernel/drivers/infiniband/hw/cxgb3/iwch_provider.h @@ -0,0 +1,360 @@ +/* + * Copyright (c) 2006 Chelsio, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef __IWCH_PROVIDER_H__ +#define __IWCH_PROVIDER_H__ + +#include <linux/list.h> +#include <linux/spinlock.h> +#include <rdma/ib_verbs.h> +#include <asm/types.h> +#include "t3cdev.h" +#include "iwch.h" +#include "cxio_wr.h" +#include "cxio_hal.h" + +struct iwch_pd { + struct ib_pd ibpd; + u32 pdid; + struct iwch_dev *rhp; +}; + +static inline struct iwch_pd *to_iwch_pd(struct ib_pd *ibpd) +{ + return container_of(ibpd, struct iwch_pd, ibpd); +} + +struct tpt_attributes { + u32 stag; + u32 state:1; + u32 type:2; + u32 rsvd:1; + enum tpt_mem_perm perms; + u32 remote_invaliate_disable:1; + u32 zbva:1; + u32 mw_bind_enable:1; + u32 page_size:5; + + u32 pdid; + u32 qpid; + u32 pbl_addr; + u32 len; + u64 va_fbo; + u32 pbl_size; +}; + +struct iwch_mr { + struct ib_mr ibmr; + struct ib_umem *umem; + struct iwch_dev *rhp; + u64 kva; + struct tpt_attributes attr; +}; + +typedef struct iwch_mw iwch_mw_handle; + +static inline struct iwch_mr *to_iwch_mr(struct ib_mr *ibmr) +{ + return container_of(ibmr, struct iwch_mr, ibmr); +} + +struct iwch_mw { + struct ib_mw ibmw; + struct iwch_dev *rhp; + u64 kva; + struct tpt_attributes attr; +}; + +static inline struct iwch_mw *to_iwch_mw(struct ib_mw *ibmw) +{ + return container_of(ibmw, struct iwch_mw, ibmw); +} + +struct iwch_cq { + struct ib_cq ibcq; + struct iwch_dev *rhp; + struct t3_cq cq; + spinlock_t lock; + spinlock_t comp_handler_lock; + atomic_t refcnt; + wait_queue_head_t wait; + u32 __user *user_rptr_addr; +}; + +static inline struct iwch_cq *to_iwch_cq(struct ib_cq *ibcq) +{ + return container_of(ibcq, struct iwch_cq, ibcq); +} + +enum IWCH_QP_FLAGS { + QP_QUIESCED = 0x01 +}; + +struct iwch_mpa_attributes { + u8 initiator; + u8 recv_marker_enabled; + u8 xmit_marker_enabled; /* iWARP: enable inbound Read Resp. */ + u8 crc_enabled; + u8 version; /* 0 or 1 */ +}; + +struct iwch_qp_attributes { + u32 scq; + u32 rcq; + u32 sq_num_entries; + u32 rq_num_entries; + u32 sq_max_sges; + u32 sq_max_sges_rdma_write; + u32 rq_max_sges; + u32 state; + u8 enable_rdma_read; + u8 enable_rdma_write; /* enable inbound Read Resp. */ + u8 enable_bind; + u8 enable_mmid0_fastreg; /* Enable STAG0 + Fast-register */ + /* + * Next QP state. If specify the current state, only the + * QP attributes will be modified. + */ + u32 max_ord; + u32 max_ird; + u32 pd; /* IN */ + u32 next_state; + char terminate_buffer[52]; + u32 terminate_msg_len; + u8 is_terminate_local; + struct iwch_mpa_attributes mpa_attr; /* IN-OUT */ + struct iwch_ep *llp_stream_handle; + char *stream_msg_buf; /* Last stream msg.
before Idle -> RTS */ + u32 stream_msg_buf_len; /* Only on Idle -> RTS */ +}; + +struct iwch_qp { + struct ib_qp ibqp; + struct iwch_dev *rhp; + struct iwch_ep *ep; + struct iwch_qp_attributes attr; + struct t3_wq wq; + spinlock_t lock; + atomic_t refcnt; + wait_queue_head_t wait; + enum IWCH_QP_FLAGS flags; + struct timer_list timer; +}; + +static inline int qp_quiesced(struct iwch_qp *qhp) +{ + return qhp->flags & QP_QUIESCED; +} + +static inline struct iwch_qp *to_iwch_qp(struct ib_qp *ibqp) +{ + return container_of(ibqp, struct iwch_qp, ibqp); +} + +void iwch_qp_add_ref(struct ib_qp *qp); +void iwch_qp_rem_ref(struct ib_qp *qp); + +struct iwch_ucontext { + struct ib_ucontext ibucontext; + struct cxio_ucontext uctx; + u32 key; + spinlock_t mmap_lock; + struct list_head mmaps; +}; + +static inline struct iwch_ucontext *to_iwch_ucontext(struct ib_ucontext *c) +{ + return container_of(c, struct iwch_ucontext, ibucontext); +} + +struct iwch_mm_entry { + struct list_head entry; + u64 addr; + u32 key; + unsigned len; +}; + +static inline struct iwch_mm_entry *remove_mmap(struct iwch_ucontext *ucontext, + u32 key, unsigned len) +{ + struct list_head *pos, *nxt; + struct iwch_mm_entry *mm; + + spin_lock(&ucontext->mmap_lock); + list_for_each_safe(pos, nxt, &ucontext->mmaps) { + + mm = list_entry(pos, struct iwch_mm_entry, entry); + if (mm->key == key && mm->len == len) { + list_del_init(&mm->entry); + spin_unlock(&ucontext->mmap_lock); + PDBG("%s key 0x%x addr 0x%llx len %d\n", __func__, + key, (unsigned long long) mm->addr, mm->len); + return mm; + } + } + spin_unlock(&ucontext->mmap_lock); + return NULL; +} + +static inline void insert_mmap(struct iwch_ucontext *ucontext, + struct iwch_mm_entry *mm) +{ + spin_lock(&ucontext->mmap_lock); + PDBG("%s key 0x%x addr 0x%llx len %d\n", __func__, + mm->key, (unsigned long long) mm->addr, mm->len); + list_add_tail(&mm->entry, &ucontext->mmaps); + spin_unlock(&ucontext->mmap_lock); +} + +enum iwch_qp_attr_mask { + IWCH_QP_ATTR_NEXT_STATE = 1 << 0, + IWCH_QP_ATTR_ENABLE_RDMA_READ = 1 << 7, + IWCH_QP_ATTR_ENABLE_RDMA_WRITE = 1 << 8, + IWCH_QP_ATTR_ENABLE_RDMA_BIND = 1 << 9, + IWCH_QP_ATTR_MAX_ORD = 1 << 11, + IWCH_QP_ATTR_MAX_IRD = 1 << 12, + IWCH_QP_ATTR_LLP_STREAM_HANDLE = 1 << 22, + IWCH_QP_ATTR_STREAM_MSG_BUFFER = 1 << 23, + IWCH_QP_ATTR_MPA_ATTR = 1 << 24, + IWCH_QP_ATTR_QP_CONTEXT_ACTIVATE = 1 << 25, + IWCH_QP_ATTR_VALID_MODIFY = (IWCH_QP_ATTR_ENABLE_RDMA_READ | + IWCH_QP_ATTR_ENABLE_RDMA_WRITE | + IWCH_QP_ATTR_MAX_ORD | + IWCH_QP_ATTR_MAX_IRD | + IWCH_QP_ATTR_LLP_STREAM_HANDLE | + IWCH_QP_ATTR_STREAM_MSG_BUFFER | + IWCH_QP_ATTR_MPA_ATTR | + IWCH_QP_ATTR_QP_CONTEXT_ACTIVATE) +}; + +int iwch_modify_qp(struct iwch_dev *rhp, + struct iwch_qp *qhp, + enum iwch_qp_attr_mask mask, + struct iwch_qp_attributes *attrs, + int internal); + +enum iwch_qp_state { + IWCH_QP_STATE_IDLE, + IWCH_QP_STATE_RTS, + IWCH_QP_STATE_ERROR, + IWCH_QP_STATE_TERMINATE, + IWCH_QP_STATE_CLOSING, + IWCH_QP_STATE_TOT +}; + +static inline int iwch_convert_state(enum ib_qp_state ib_state) +{ + switch (ib_state) { + case IB_QPS_RESET: + case IB_QPS_INIT: + return IWCH_QP_STATE_IDLE; + case IB_QPS_RTS: + return IWCH_QP_STATE_RTS; + case IB_QPS_SQD: + return IWCH_QP_STATE_CLOSING; + case IB_QPS_SQE: + return IWCH_QP_STATE_TERMINATE; + case IB_QPS_ERR: + return IWCH_QP_STATE_ERROR; + default: + return -1; + } +} + +static inline u32 iwch_ib_to_tpt_access(int acc) +{ + return (acc & IB_ACCESS_REMOTE_WRITE ? TPT_REMOTE_WRITE : 0) | + (acc & IB_ACCESS_REMOTE_READ ? 
TPT_REMOTE_READ : 0) | + (acc & IB_ACCESS_LOCAL_WRITE ? TPT_LOCAL_WRITE : 0) | + (acc & IB_ACCESS_MW_BIND ? TPT_MW_BIND : 0) | + TPT_LOCAL_READ; +} + +static inline u32 iwch_ib_to_tpt_bind_access(int acc) +{ + return (acc & IB_ACCESS_REMOTE_WRITE ? TPT_REMOTE_WRITE : 0) | + (acc & IB_ACCESS_REMOTE_READ ? TPT_REMOTE_READ : 0); +} + +enum iwch_mmid_state { + IWCH_STAG_STATE_VALID, + IWCH_STAG_STATE_INVALID +}; + +enum iwch_qp_query_flags { + IWCH_QP_QUERY_CONTEXT_NONE = 0x0, /* No ctx; Only attrs */ + IWCH_QP_QUERY_CONTEXT_GET = 0x1, /* Get ctx + attrs */ + IWCH_QP_QUERY_CONTEXT_SUSPEND = 0x2, /* Not Supported */ + + /* + * Quiesce QP context; Consumer + * will NOT replay outstanding WR + */ + IWCH_QP_QUERY_CONTEXT_QUIESCE = 0x4, + IWCH_QP_QUERY_CONTEXT_REMOVE = 0x8, + IWCH_QP_QUERY_TEST_USERWRITE = 0x32 /* Test special */ +}; + +u16 iwch_rqes_posted(struct iwch_qp *qhp); +int iwch_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr, + struct ib_send_wr **bad_wr); +int iwch_post_receive(struct ib_qp *ibqp, struct ib_recv_wr *wr, + struct ib_recv_wr **bad_wr); +int iwch_bind_mw(struct ib_qp *qp, + struct ib_mw *mw, + struct ib_mw_bind *mw_bind); +int iwch_poll_cq(struct ib_cq *ibcq, int num_entries, struct ib_wc *wc); +int iwch_post_terminate(struct iwch_qp *qhp, struct respQ_msg_t *rsp_msg); +int iwch_post_zb_read(struct iwch_ep *ep); +int iwch_register_device(struct iwch_dev *dev); +void iwch_unregister_device(struct iwch_dev *dev); +void stop_read_rep_timer(struct iwch_qp *qhp); +int iwch_register_mem(struct iwch_dev *rhp, struct iwch_pd *php, + struct iwch_mr *mhp, int shift); +int iwch_reregister_mem(struct iwch_dev *rhp, struct iwch_pd *php, + struct iwch_mr *mhp, + int shift, + int npages); +int iwch_alloc_pbl(struct iwch_mr *mhp, int npages); +void iwch_free_pbl(struct iwch_mr *mhp); +int iwch_write_pbl(struct iwch_mr *mhp, __be64 *pages, int npages, int offset); +int build_phys_page_list(struct ib_phys_buf *buffer_list, + int num_phys_buf, + u64 *iova_start, + u64 *total_size, + int *npages, + int *shift, + __be64 **page_list); + + +#define IWCH_NODE_DESC "cxgb3 Chelsio Communications" + +#endif diff --git a/kernel/drivers/infiniband/hw/cxgb3/iwch_qp.c b/kernel/drivers/infiniband/hw/cxgb3/iwch_qp.c new file mode 100644 index 000000000..b57c0befd --- /dev/null +++ b/kernel/drivers/infiniband/hw/cxgb3/iwch_qp.c @@ -0,0 +1,1163 @@ +/* + * Copyright (c) 2006 Chelsio, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include <linux/sched.h> +#include <linux/gfp.h> +#include "iwch_provider.h" +#include "iwch.h" +#include "iwch_cm.h" +#include "cxio_hal.h" +#include "cxio_resource.h" + +#define NO_SUPPORT -1 + +static int build_rdma_send(union t3_wr *wqe, struct ib_send_wr *wr, + u8 * flit_cnt) +{ + int i; + u32 plen; + + switch (wr->opcode) { + case IB_WR_SEND: + if (wr->send_flags & IB_SEND_SOLICITED) + wqe->send.rdmaop = T3_SEND_WITH_SE; + else + wqe->send.rdmaop = T3_SEND; + wqe->send.rem_stag = 0; + break; + case IB_WR_SEND_WITH_INV: + if (wr->send_flags & IB_SEND_SOLICITED) + wqe->send.rdmaop = T3_SEND_WITH_SE_INV; + else + wqe->send.rdmaop = T3_SEND_WITH_INV; + wqe->send.rem_stag = cpu_to_be32(wr->ex.invalidate_rkey); + break; + default: + return -EINVAL; + } + if (wr->num_sge > T3_MAX_SGE) + return -EINVAL; + wqe->send.reserved[0] = 0; + wqe->send.reserved[1] = 0; + wqe->send.reserved[2] = 0; + plen = 0; + for (i = 0; i < wr->num_sge; i++) { + if ((plen + wr->sg_list[i].length) < plen) + return -EMSGSIZE; + + plen += wr->sg_list[i].length; + wqe->send.sgl[i].stag = cpu_to_be32(wr->sg_list[i].lkey); + wqe->send.sgl[i].len = cpu_to_be32(wr->sg_list[i].length); + wqe->send.sgl[i].to = cpu_to_be64(wr->sg_list[i].addr); + } + wqe->send.num_sgle = cpu_to_be32(wr->num_sge); + *flit_cnt = 4 + ((wr->num_sge) << 1); + wqe->send.plen = cpu_to_be32(plen); + return 0; +} + +static int build_rdma_write(union t3_wr *wqe, struct ib_send_wr *wr, + u8 *flit_cnt) +{ + int i; + u32 plen; + if (wr->num_sge > T3_MAX_SGE) + return -EINVAL; + wqe->write.rdmaop = T3_RDMA_WRITE; + wqe->write.reserved[0] = 0; + wqe->write.reserved[1] = 0; + wqe->write.reserved[2] = 0; + wqe->write.stag_sink = cpu_to_be32(wr->wr.rdma.rkey); + wqe->write.to_sink = cpu_to_be64(wr->wr.rdma.remote_addr); + + if (wr->opcode == IB_WR_RDMA_WRITE_WITH_IMM) { + plen = 4; + wqe->write.sgl[0].stag = wr->ex.imm_data; + wqe->write.sgl[0].len = cpu_to_be32(0); + wqe->write.num_sgle = cpu_to_be32(0); + *flit_cnt = 6; + } else { + plen = 0; + for (i = 0; i < wr->num_sge; i++) { + if ((plen + wr->sg_list[i].length) < plen) { + return -EMSGSIZE; + } + plen += wr->sg_list[i].length; + wqe->write.sgl[i].stag = + cpu_to_be32(wr->sg_list[i].lkey); + wqe->write.sgl[i].len = + cpu_to_be32(wr->sg_list[i].length); + wqe->write.sgl[i].to = + cpu_to_be64(wr->sg_list[i].addr); + } + wqe->write.num_sgle = cpu_to_be32(wr->num_sge); + *flit_cnt = 5 + ((wr->num_sge) << 1); + } + wqe->write.plen = cpu_to_be32(plen); + return 0; +} + +static int build_rdma_read(union t3_wr *wqe, struct ib_send_wr *wr, + u8 *flit_cnt) +{ + if (wr->num_sge > 1) + return -EINVAL; + wqe->read.rdmaop = T3_READ_REQ; + if (wr->opcode == IB_WR_RDMA_READ_WITH_INV) + wqe->read.local_inv = 1; + else + wqe->read.local_inv = 0; + wqe->read.reserved[0] = 0; + wqe->read.reserved[1] = 0; + wqe->read.rem_stag = cpu_to_be32(wr->wr.rdma.rkey); + wqe->read.rem_to = cpu_to_be64(wr->wr.rdma.remote_addr); + wqe->read.local_stag = cpu_to_be32(wr->sg_list[0].lkey); + wqe->read.local_len = cpu_to_be32(wr->sg_list[0].length); + wqe->read.local_to = cpu_to_be64(wr->sg_list[0].addr); + *flit_cnt = sizeof(struct t3_rdma_read_wr) >> 3; + return 0; +} + +static int build_fastreg(union t3_wr *wqe, struct ib_send_wr *wr, + u8 *flit_cnt, int *wr_cnt, struct t3_wq *wq) +{ + int
i; + __be64 *p; + + if (wr->wr.fast_reg.page_list_len > T3_MAX_FASTREG_DEPTH) + return -EINVAL; + *wr_cnt = 1; + wqe->fastreg.stag = cpu_to_be32(wr->wr.fast_reg.rkey); + wqe->fastreg.len = cpu_to_be32(wr->wr.fast_reg.length); + wqe->fastreg.va_base_hi = cpu_to_be32(wr->wr.fast_reg.iova_start >> 32); + wqe->fastreg.va_base_lo_fbo = + cpu_to_be32(wr->wr.fast_reg.iova_start & 0xffffffff); + wqe->fastreg.page_type_perms = cpu_to_be32( + V_FR_PAGE_COUNT(wr->wr.fast_reg.page_list_len) | + V_FR_PAGE_SIZE(wr->wr.fast_reg.page_shift-12) | + V_FR_TYPE(TPT_VATO) | + V_FR_PERMS(iwch_ib_to_tpt_access(wr->wr.fast_reg.access_flags))); + p = &wqe->fastreg.pbl_addrs[0]; + for (i = 0; i < wr->wr.fast_reg.page_list_len; i++, p++) { + + /* If we need a 2nd WR, then set it up */ + if (i == T3_MAX_FASTREG_FRAG) { + *wr_cnt = 2; + wqe = (union t3_wr *)(wq->queue + + Q_PTR2IDX((wq->wptr+1), wq->size_log2)); + build_fw_riwrh((void *)wqe, T3_WR_FASTREG, 0, + Q_GENBIT(wq->wptr + 1, wq->size_log2), + 0, 1 + wr->wr.fast_reg.page_list_len - T3_MAX_FASTREG_FRAG, + T3_EOP); + + p = &wqe->pbl_frag.pbl_addrs[0]; + } + *p = cpu_to_be64((u64)wr->wr.fast_reg.page_list->page_list[i]); + } + *flit_cnt = 5 + wr->wr.fast_reg.page_list_len; + if (*flit_cnt > 15) + *flit_cnt = 15; + return 0; +} + +static int build_inv_stag(union t3_wr *wqe, struct ib_send_wr *wr, + u8 *flit_cnt) +{ + wqe->local_inv.stag = cpu_to_be32(wr->ex.invalidate_rkey); + wqe->local_inv.reserved = 0; + *flit_cnt = sizeof(struct t3_local_inv_wr) >> 3; + return 0; +} + +static int iwch_sgl2pbl_map(struct iwch_dev *rhp, struct ib_sge *sg_list, + u32 num_sgle, u32 * pbl_addr, u8 * page_size) +{ + int i; + struct iwch_mr *mhp; + u64 offset; + for (i = 0; i < num_sgle; i++) { + + mhp = get_mhp(rhp, (sg_list[i].lkey) >> 8); + if (!mhp) { + PDBG("%s %d\n", __func__, __LINE__); + return -EIO; + } + if (!mhp->attr.state) { + PDBG("%s %d\n", __func__, __LINE__); + return -EIO; + } + if (mhp->attr.zbva) { + PDBG("%s %d\n", __func__, __LINE__); + return -EIO; + } + + if (sg_list[i].addr < mhp->attr.va_fbo) { + PDBG("%s %d\n", __func__, __LINE__); + return -EINVAL; + } + if (sg_list[i].addr + ((u64) sg_list[i].length) < + sg_list[i].addr) { + PDBG("%s %d\n", __func__, __LINE__); + return -EINVAL; + } + if (sg_list[i].addr + ((u64) sg_list[i].length) > + mhp->attr.va_fbo + ((u64) mhp->attr.len)) { + PDBG("%s %d\n", __func__, __LINE__); + return -EINVAL; + } + offset = sg_list[i].addr - mhp->attr.va_fbo; + offset += mhp->attr.va_fbo & + ((1UL << (12 + mhp->attr.page_size)) - 1); + pbl_addr[i] = ((mhp->attr.pbl_addr - + rhp->rdev.rnic_info.pbl_base) >> 3) + + (offset >> (12 + mhp->attr.page_size)); + page_size[i] = mhp->attr.page_size; + } + return 0; +} + +static int build_rdma_recv(struct iwch_qp *qhp, union t3_wr *wqe, + struct ib_recv_wr *wr) +{ + int i, err = 0; + u32 pbl_addr[T3_MAX_SGE]; + u8 page_size[T3_MAX_SGE]; + + err = iwch_sgl2pbl_map(qhp->rhp, wr->sg_list, wr->num_sge, pbl_addr, + page_size); + if (err) + return err; + wqe->recv.pagesz[0] = page_size[0]; + wqe->recv.pagesz[1] = page_size[1]; + wqe->recv.pagesz[2] = page_size[2]; + wqe->recv.pagesz[3] = page_size[3]; + wqe->recv.num_sgle = cpu_to_be32(wr->num_sge); + for (i = 0; i < wr->num_sge; i++) { + wqe->recv.sgl[i].stag = cpu_to_be32(wr->sg_list[i].lkey); + wqe->recv.sgl[i].len = cpu_to_be32(wr->sg_list[i].length); + + /* to in the WQE == the offset into the page */ + wqe->recv.sgl[i].to = cpu_to_be64(((u32)wr->sg_list[i].addr) & + ((1UL << (12 + page_size[i])) - 1)); + + /* pbl_addr is the adapters 
address in the PBL */ + wqe->recv.pbl_addr[i] = cpu_to_be32(pbl_addr[i]); + } + for (; i < T3_MAX_SGE; i++) { + wqe->recv.sgl[i].stag = 0; + wqe->recv.sgl[i].len = 0; + wqe->recv.sgl[i].to = 0; + wqe->recv.pbl_addr[i] = 0; + } + qhp->wq.rq[Q_PTR2IDX(qhp->wq.rq_wptr, + qhp->wq.rq_size_log2)].wr_id = wr->wr_id; + qhp->wq.rq[Q_PTR2IDX(qhp->wq.rq_wptr, + qhp->wq.rq_size_log2)].pbl_addr = 0; + return 0; +} + +static int build_zero_stag_recv(struct iwch_qp *qhp, union t3_wr *wqe, + struct ib_recv_wr *wr) +{ + int i; + u32 pbl_addr; + u32 pbl_offset; + + + /* + * The T3 HW requires the PBL in the HW recv descriptor to reference + * a PBL entry. So we allocate the max needed PBL memory here and pass + * it to the uP in the recv WR. The uP will build the PBL and setup + * the HW recv descriptor. + */ + pbl_addr = cxio_hal_pblpool_alloc(&qhp->rhp->rdev, T3_STAG0_PBL_SIZE); + if (!pbl_addr) + return -ENOMEM; + + /* + * Compute the 8B aligned offset. + */ + pbl_offset = (pbl_addr - qhp->rhp->rdev.rnic_info.pbl_base) >> 3; + + wqe->recv.num_sgle = cpu_to_be32(wr->num_sge); + + for (i = 0; i < wr->num_sge; i++) { + + /* + * Use a 128MB page size. This and an imposed 128MB + * sge length limit allows us to require only a 2-entry HW + * PBL for each SGE. This restriction is acceptable since + * since it is not possible to allocate 128MB of contiguous + * DMA coherent memory! + */ + if (wr->sg_list[i].length > T3_STAG0_MAX_PBE_LEN) + return -EINVAL; + wqe->recv.pagesz[i] = T3_STAG0_PAGE_SHIFT; + + /* + * T3 restricts a recv to all zero-stag or all non-zero-stag. + */ + if (wr->sg_list[i].lkey != 0) + return -EINVAL; + wqe->recv.sgl[i].stag = 0; + wqe->recv.sgl[i].len = cpu_to_be32(wr->sg_list[i].length); + wqe->recv.sgl[i].to = cpu_to_be64(wr->sg_list[i].addr); + wqe->recv.pbl_addr[i] = cpu_to_be32(pbl_offset); + pbl_offset += 2; + } + for (; i < T3_MAX_SGE; i++) { + wqe->recv.pagesz[i] = 0; + wqe->recv.sgl[i].stag = 0; + wqe->recv.sgl[i].len = 0; + wqe->recv.sgl[i].to = 0; + wqe->recv.pbl_addr[i] = 0; + } + qhp->wq.rq[Q_PTR2IDX(qhp->wq.rq_wptr, + qhp->wq.rq_size_log2)].wr_id = wr->wr_id; + qhp->wq.rq[Q_PTR2IDX(qhp->wq.rq_wptr, + qhp->wq.rq_size_log2)].pbl_addr = pbl_addr; + return 0; +} + +int iwch_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr, + struct ib_send_wr **bad_wr) +{ + int err = 0; + u8 uninitialized_var(t3_wr_flit_cnt); + enum t3_wr_opcode t3_wr_opcode = 0; + enum t3_wr_flags t3_wr_flags; + struct iwch_qp *qhp; + u32 idx; + union t3_wr *wqe; + u32 num_wrs; + unsigned long flag; + struct t3_swsq *sqp; + int wr_cnt = 1; + + qhp = to_iwch_qp(ibqp); + spin_lock_irqsave(&qhp->lock, flag); + if (qhp->attr.state > IWCH_QP_STATE_RTS) { + spin_unlock_irqrestore(&qhp->lock, flag); + err = -EINVAL; + goto out; + } + num_wrs = Q_FREECNT(qhp->wq.sq_rptr, qhp->wq.sq_wptr, + qhp->wq.sq_size_log2); + if (num_wrs == 0) { + spin_unlock_irqrestore(&qhp->lock, flag); + err = -ENOMEM; + goto out; + } + while (wr) { + if (num_wrs == 0) { + err = -ENOMEM; + break; + } + idx = Q_PTR2IDX(qhp->wq.wptr, qhp->wq.size_log2); + wqe = (union t3_wr *) (qhp->wq.queue + idx); + t3_wr_flags = 0; + if (wr->send_flags & IB_SEND_SOLICITED) + t3_wr_flags |= T3_SOLICITED_EVENT_FLAG; + if (wr->send_flags & IB_SEND_SIGNALED) + t3_wr_flags |= T3_COMPLETION_FLAG; + sqp = qhp->wq.sq + + Q_PTR2IDX(qhp->wq.sq_wptr, qhp->wq.sq_size_log2); + switch (wr->opcode) { + case IB_WR_SEND: + case IB_WR_SEND_WITH_INV: + if (wr->send_flags & IB_SEND_FENCE) + t3_wr_flags |= T3_READ_FENCE_FLAG; + t3_wr_opcode = T3_WR_SEND; + err = 
build_rdma_send(wqe, wr, &t3_wr_flit_cnt); + break; + case IB_WR_RDMA_WRITE: + case IB_WR_RDMA_WRITE_WITH_IMM: + t3_wr_opcode = T3_WR_WRITE; + err = build_rdma_write(wqe, wr, &t3_wr_flit_cnt); + break; + case IB_WR_RDMA_READ: + case IB_WR_RDMA_READ_WITH_INV: + t3_wr_opcode = T3_WR_READ; + t3_wr_flags = 0; /* T3 reads are always signaled */ + err = build_rdma_read(wqe, wr, &t3_wr_flit_cnt); + if (err) + break; + sqp->read_len = wqe->read.local_len; + if (!qhp->wq.oldest_read) + qhp->wq.oldest_read = sqp; + break; + case IB_WR_FAST_REG_MR: + t3_wr_opcode = T3_WR_FASTREG; + err = build_fastreg(wqe, wr, &t3_wr_flit_cnt, + &wr_cnt, &qhp->wq); + break; + case IB_WR_LOCAL_INV: + if (wr->send_flags & IB_SEND_FENCE) + t3_wr_flags |= T3_LOCAL_FENCE_FLAG; + t3_wr_opcode = T3_WR_INV_STAG; + err = build_inv_stag(wqe, wr, &t3_wr_flit_cnt); + break; + default: + PDBG("%s post of type=%d TBD!\n", __func__, + wr->opcode); + err = -EINVAL; + } + if (err) + break; + wqe->send.wrid.id0.hi = qhp->wq.sq_wptr; + sqp->wr_id = wr->wr_id; + sqp->opcode = wr2opcode(t3_wr_opcode); + sqp->sq_wptr = qhp->wq.sq_wptr; + sqp->complete = 0; + sqp->signaled = (wr->send_flags & IB_SEND_SIGNALED); + + build_fw_riwrh((void *) wqe, t3_wr_opcode, t3_wr_flags, + Q_GENBIT(qhp->wq.wptr, qhp->wq.size_log2), + 0, t3_wr_flit_cnt, + (wr_cnt == 1) ? T3_SOPEOP : T3_SOP); + PDBG("%s cookie 0x%llx wq idx 0x%x swsq idx %ld opcode %d\n", + __func__, (unsigned long long) wr->wr_id, idx, + Q_PTR2IDX(qhp->wq.sq_wptr, qhp->wq.sq_size_log2), + sqp->opcode); + wr = wr->next; + num_wrs--; + qhp->wq.wptr += wr_cnt; + ++(qhp->wq.sq_wptr); + } + spin_unlock_irqrestore(&qhp->lock, flag); + if (cxio_wq_db_enabled(&qhp->wq)) + ring_doorbell(qhp->wq.doorbell, qhp->wq.qpid); + +out: + if (err) + *bad_wr = wr; + return err; +} + +int iwch_post_receive(struct ib_qp *ibqp, struct ib_recv_wr *wr, + struct ib_recv_wr **bad_wr) +{ + int err = 0; + struct iwch_qp *qhp; + u32 idx; + union t3_wr *wqe; + u32 num_wrs; + unsigned long flag; + + qhp = to_iwch_qp(ibqp); + spin_lock_irqsave(&qhp->lock, flag); + if (qhp->attr.state > IWCH_QP_STATE_RTS) { + spin_unlock_irqrestore(&qhp->lock, flag); + err = -EINVAL; + goto out; + } + num_wrs = Q_FREECNT(qhp->wq.rq_rptr, qhp->wq.rq_wptr, + qhp->wq.rq_size_log2) - 1; + if (!wr) { + spin_unlock_irqrestore(&qhp->lock, flag); + err = -ENOMEM; + goto out; + } + while (wr) { + if (wr->num_sge > T3_MAX_SGE) { + err = -EINVAL; + break; + } + idx = Q_PTR2IDX(qhp->wq.wptr, qhp->wq.size_log2); + wqe = (union t3_wr *) (qhp->wq.queue + idx); + if (num_wrs) + if (wr->sg_list[0].lkey) + err = build_rdma_recv(qhp, wqe, wr); + else + err = build_zero_stag_recv(qhp, wqe, wr); + else + err = -ENOMEM; + + if (err) + break; + + build_fw_riwrh((void *) wqe, T3_WR_RCV, T3_COMPLETION_FLAG, + Q_GENBIT(qhp->wq.wptr, qhp->wq.size_log2), + 0, sizeof(struct t3_receive_wr) >> 3, T3_SOPEOP); + PDBG("%s cookie 0x%llx idx 0x%x rq_wptr 0x%x rw_rptr 0x%x " + "wqe %p \n", __func__, (unsigned long long) wr->wr_id, + idx, qhp->wq.rq_wptr, qhp->wq.rq_rptr, wqe); + ++(qhp->wq.rq_wptr); + ++(qhp->wq.wptr); + wr = wr->next; + num_wrs--; + } + spin_unlock_irqrestore(&qhp->lock, flag); + if (cxio_wq_db_enabled(&qhp->wq)) + ring_doorbell(qhp->wq.doorbell, qhp->wq.qpid); + +out: + if (err) + *bad_wr = wr; + return err; +} + +int iwch_bind_mw(struct ib_qp *qp, + struct ib_mw *mw, + struct ib_mw_bind *mw_bind) +{ + struct iwch_dev *rhp; + struct iwch_mw *mhp; + struct iwch_qp *qhp; + union t3_wr *wqe; + u32 pbl_addr; + u8 page_size; + u32 num_wrs; + unsigned long flag; + 
struct ib_sge sgl; + int err=0; + enum t3_wr_flags t3_wr_flags; + u32 idx; + struct t3_swsq *sqp; + + qhp = to_iwch_qp(qp); + mhp = to_iwch_mw(mw); + rhp = qhp->rhp; + + spin_lock_irqsave(&qhp->lock, flag); + if (qhp->attr.state > IWCH_QP_STATE_RTS) { + spin_unlock_irqrestore(&qhp->lock, flag); + return -EINVAL; + } + num_wrs = Q_FREECNT(qhp->wq.sq_rptr, qhp->wq.sq_wptr, + qhp->wq.sq_size_log2); + if (num_wrs == 0) { + spin_unlock_irqrestore(&qhp->lock, flag); + return -ENOMEM; + } + idx = Q_PTR2IDX(qhp->wq.wptr, qhp->wq.size_log2); + PDBG("%s: idx 0x%0x, mw 0x%p, mw_bind 0x%p\n", __func__, idx, + mw, mw_bind); + wqe = (union t3_wr *) (qhp->wq.queue + idx); + + t3_wr_flags = 0; + if (mw_bind->send_flags & IB_SEND_SIGNALED) + t3_wr_flags = T3_COMPLETION_FLAG; + + sgl.addr = mw_bind->bind_info.addr; + sgl.lkey = mw_bind->bind_info.mr->lkey; + sgl.length = mw_bind->bind_info.length; + wqe->bind.reserved = 0; + wqe->bind.type = TPT_VATO; + + /* TBD: check perms */ + wqe->bind.perms = iwch_ib_to_tpt_bind_access( + mw_bind->bind_info.mw_access_flags); + wqe->bind.mr_stag = cpu_to_be32(mw_bind->bind_info.mr->lkey); + wqe->bind.mw_stag = cpu_to_be32(mw->rkey); + wqe->bind.mw_len = cpu_to_be32(mw_bind->bind_info.length); + wqe->bind.mw_va = cpu_to_be64(mw_bind->bind_info.addr); + err = iwch_sgl2pbl_map(rhp, &sgl, 1, &pbl_addr, &page_size); + if (err) { + spin_unlock_irqrestore(&qhp->lock, flag); + return err; + } + wqe->send.wrid.id0.hi = qhp->wq.sq_wptr; + sqp = qhp->wq.sq + Q_PTR2IDX(qhp->wq.sq_wptr, qhp->wq.sq_size_log2); + sqp->wr_id = mw_bind->wr_id; + sqp->opcode = T3_BIND_MW; + sqp->sq_wptr = qhp->wq.sq_wptr; + sqp->complete = 0; + sqp->signaled = (mw_bind->send_flags & IB_SEND_SIGNALED); + wqe->bind.mr_pbl_addr = cpu_to_be32(pbl_addr); + wqe->bind.mr_pagesz = page_size; + build_fw_riwrh((void *)wqe, T3_WR_BIND, t3_wr_flags, + Q_GENBIT(qhp->wq.wptr, qhp->wq.size_log2), 0, + sizeof(struct t3_bind_mw_wr) >> 3, T3_SOPEOP); + ++(qhp->wq.wptr); + ++(qhp->wq.sq_wptr); + spin_unlock_irqrestore(&qhp->lock, flag); + + if (cxio_wq_db_enabled(&qhp->wq)) + ring_doorbell(qhp->wq.doorbell, qhp->wq.qpid); + + return err; +} + +static inline void build_term_codes(struct respQ_msg_t *rsp_msg, + u8 *layer_type, u8 *ecode) +{ + int status = TPT_ERR_INTERNAL_ERR; + int tagged = 0; + int opcode = -1; + int rqtype = 0; + int send_inv = 0; + + if (rsp_msg) { + status = CQE_STATUS(rsp_msg->cqe); + opcode = CQE_OPCODE(rsp_msg->cqe); + rqtype = RQ_TYPE(rsp_msg->cqe); + send_inv = (opcode == T3_SEND_WITH_INV) || + (opcode == T3_SEND_WITH_SE_INV); + tagged = (opcode == T3_RDMA_WRITE) || + (rqtype && (opcode == T3_READ_RESP)); + } + + switch (status) { + case TPT_ERR_STAG: + if (send_inv) { + *layer_type = LAYER_RDMAP|RDMAP_REMOTE_OP; + *ecode = RDMAP_CANT_INV_STAG; + } else { + *layer_type = LAYER_RDMAP|RDMAP_REMOTE_PROT; + *ecode = RDMAP_INV_STAG; + } + break; + case TPT_ERR_PDID: + *layer_type = LAYER_RDMAP|RDMAP_REMOTE_PROT; + if ((opcode == T3_SEND_WITH_INV) || + (opcode == T3_SEND_WITH_SE_INV)) + *ecode = RDMAP_CANT_INV_STAG; + else + *ecode = RDMAP_STAG_NOT_ASSOC; + break; + case TPT_ERR_QPID: + *layer_type = LAYER_RDMAP|RDMAP_REMOTE_PROT; + *ecode = RDMAP_STAG_NOT_ASSOC; + break; + case TPT_ERR_ACCESS: + *layer_type = LAYER_RDMAP|RDMAP_REMOTE_PROT; + *ecode = RDMAP_ACC_VIOL; + break; + case TPT_ERR_WRAP: + *layer_type = LAYER_RDMAP|RDMAP_REMOTE_PROT; + *ecode = RDMAP_TO_WRAP; + break; + case TPT_ERR_BOUND: + if (tagged) { + *layer_type = LAYER_DDP|DDP_TAGGED_ERR; + *ecode = DDPT_BASE_BOUNDS; + } else { + 
*layer_type = LAYER_RDMAP|RDMAP_REMOTE_PROT; + *ecode = RDMAP_BASE_BOUNDS; + } + break; + case TPT_ERR_INVALIDATE_SHARED_MR: + case TPT_ERR_INVALIDATE_MR_WITH_MW_BOUND: + *layer_type = LAYER_RDMAP|RDMAP_REMOTE_OP; + *ecode = RDMAP_CANT_INV_STAG; + break; + case TPT_ERR_ECC: + case TPT_ERR_ECC_PSTAG: + case TPT_ERR_INTERNAL_ERR: + *layer_type = LAYER_RDMAP|RDMAP_LOCAL_CATA; + *ecode = 0; + break; + case TPT_ERR_OUT_OF_RQE: + *layer_type = LAYER_DDP|DDP_UNTAGGED_ERR; + *ecode = DDPU_INV_MSN_NOBUF; + break; + case TPT_ERR_PBL_ADDR_BOUND: + *layer_type = LAYER_DDP|DDP_TAGGED_ERR; + *ecode = DDPT_BASE_BOUNDS; + break; + case TPT_ERR_CRC: + *layer_type = LAYER_MPA|DDP_LLP; + *ecode = MPA_CRC_ERR; + break; + case TPT_ERR_MARKER: + *layer_type = LAYER_MPA|DDP_LLP; + *ecode = MPA_MARKER_ERR; + break; + case TPT_ERR_PDU_LEN_ERR: + *layer_type = LAYER_DDP|DDP_UNTAGGED_ERR; + *ecode = DDPU_MSG_TOOBIG; + break; + case TPT_ERR_DDP_VERSION: + if (tagged) { + *layer_type = LAYER_DDP|DDP_TAGGED_ERR; + *ecode = DDPT_INV_VERS; + } else { + *layer_type = LAYER_DDP|DDP_UNTAGGED_ERR; + *ecode = DDPU_INV_VERS; + } + break; + case TPT_ERR_RDMA_VERSION: + *layer_type = LAYER_RDMAP|RDMAP_REMOTE_OP; + *ecode = RDMAP_INV_VERS; + break; + case TPT_ERR_OPCODE: + *layer_type = LAYER_RDMAP|RDMAP_REMOTE_OP; + *ecode = RDMAP_INV_OPCODE; + break; + case TPT_ERR_DDP_QUEUE_NUM: + *layer_type = LAYER_DDP|DDP_UNTAGGED_ERR; + *ecode = DDPU_INV_QN; + break; + case TPT_ERR_MSN: + case TPT_ERR_MSN_GAP: + case TPT_ERR_MSN_RANGE: + case TPT_ERR_IRD_OVERFLOW: + *layer_type = LAYER_DDP|DDP_UNTAGGED_ERR; + *ecode = DDPU_INV_MSN_RANGE; + break; + case TPT_ERR_TBIT: + *layer_type = LAYER_DDP|DDP_LOCAL_CATA; + *ecode = 0; + break; + case TPT_ERR_MO: + *layer_type = LAYER_DDP|DDP_UNTAGGED_ERR; + *ecode = DDPU_INV_MO; + break; + default: + *layer_type = LAYER_RDMAP|DDP_LOCAL_CATA; + *ecode = 0; + break; + } +} + +int iwch_post_zb_read(struct iwch_ep *ep) +{ + union t3_wr *wqe; + struct sk_buff *skb; + u8 flit_cnt = sizeof(struct t3_rdma_read_wr) >> 3; + + PDBG("%s enter\n", __func__); + skb = alloc_skb(40, GFP_KERNEL); + if (!skb) { + printk(KERN_ERR "%s cannot send zb_read!!\n", __func__); + return -ENOMEM; + } + wqe = (union t3_wr *)skb_put(skb, sizeof(struct t3_rdma_read_wr)); + memset(wqe, 0, sizeof(struct t3_rdma_read_wr)); + wqe->read.rdmaop = T3_READ_REQ; + wqe->read.reserved[0] = 0; + wqe->read.reserved[1] = 0; + wqe->read.rem_stag = cpu_to_be32(1); + wqe->read.rem_to = cpu_to_be64(1); + wqe->read.local_stag = cpu_to_be32(1); + wqe->read.local_len = cpu_to_be32(0); + wqe->read.local_to = cpu_to_be64(1); + wqe->send.wrh.op_seop_flags = cpu_to_be32(V_FW_RIWR_OP(T3_WR_READ)); + wqe->send.wrh.gen_tid_len = cpu_to_be32(V_FW_RIWR_TID(ep->hwtid)| + V_FW_RIWR_LEN(flit_cnt)); + skb->priority = CPL_PRIORITY_DATA; + return iwch_cxgb3_ofld_send(ep->com.qp->rhp->rdev.t3cdev_p, skb); +} + +/* + * This posts a TERMINATE with layer=RDMA, type=catastrophic. + */ +int iwch_post_terminate(struct iwch_qp *qhp, struct respQ_msg_t *rsp_msg) +{ + union t3_wr *wqe; + struct terminate_message *term; + struct sk_buff *skb; + + PDBG("%s %d\n", __func__, __LINE__); + skb = alloc_skb(40, GFP_ATOMIC); + if (!skb) { + printk(KERN_ERR "%s cannot send TERMINATE!\n", __func__); + return -ENOMEM; + } + wqe = (union t3_wr *)skb_put(skb, 40); + memset(wqe, 0, 40); + wqe->send.rdmaop = T3_TERMINATE; + + /* immediate data length */ + wqe->send.plen = htonl(4); + + /* immediate data starts here. 
*/ + term = (struct terminate_message *)wqe->send.sgl; + build_term_codes(rsp_msg, &term->layer_etype, &term->ecode); + wqe->send.wrh.op_seop_flags = cpu_to_be32(V_FW_RIWR_OP(T3_WR_SEND) | + V_FW_RIWR_FLAGS(T3_COMPLETION_FLAG | T3_NOTIFY_FLAG)); + wqe->send.wrh.gen_tid_len = cpu_to_be32(V_FW_RIWR_TID(qhp->ep->hwtid)); + skb->priority = CPL_PRIORITY_DATA; + return iwch_cxgb3_ofld_send(qhp->rhp->rdev.t3cdev_p, skb); +} + +/* + * Assumes qhp lock is held. + */ +static void __flush_qp(struct iwch_qp *qhp, struct iwch_cq *rchp, + struct iwch_cq *schp) +{ + int count; + int flushed; + + + PDBG("%s qhp %p rchp %p schp %p\n", __func__, qhp, rchp, schp); + /* take a ref on the qhp since we must release the lock */ + atomic_inc(&qhp->refcnt); + spin_unlock(&qhp->lock); + + /* locking hierarchy: cq lock first, then qp lock. */ + spin_lock(&rchp->lock); + spin_lock(&qhp->lock); + cxio_flush_hw_cq(&rchp->cq); + cxio_count_rcqes(&rchp->cq, &qhp->wq, &count); + flushed = cxio_flush_rq(&qhp->wq, &rchp->cq, count); + spin_unlock(&qhp->lock); + spin_unlock(&rchp->lock); + if (flushed) { + spin_lock(&rchp->comp_handler_lock); + (*rchp->ibcq.comp_handler)(&rchp->ibcq, rchp->ibcq.cq_context); + spin_unlock(&rchp->comp_handler_lock); + } + + /* locking hierarchy: cq lock first, then qp lock. */ + spin_lock(&schp->lock); + spin_lock(&qhp->lock); + cxio_flush_hw_cq(&schp->cq); + cxio_count_scqes(&schp->cq, &qhp->wq, &count); + flushed = cxio_flush_sq(&qhp->wq, &schp->cq, count); + spin_unlock(&qhp->lock); + spin_unlock(&schp->lock); + if (flushed) { + spin_lock(&schp->comp_handler_lock); + (*schp->ibcq.comp_handler)(&schp->ibcq, schp->ibcq.cq_context); + spin_unlock(&schp->comp_handler_lock); + } + + /* deref */ + if (atomic_dec_and_test(&qhp->refcnt)) + wake_up(&qhp->wait); + + spin_lock(&qhp->lock); +} + +static void flush_qp(struct iwch_qp *qhp) +{ + struct iwch_cq *rchp, *schp; + + rchp = get_chp(qhp->rhp, qhp->attr.rcq); + schp = get_chp(qhp->rhp, qhp->attr.scq); + + if (qhp->ibqp.uobject) { + cxio_set_wq_in_error(&qhp->wq); + cxio_set_cq_in_error(&rchp->cq); + spin_lock(&rchp->comp_handler_lock); + (*rchp->ibcq.comp_handler)(&rchp->ibcq, rchp->ibcq.cq_context); + spin_unlock(&rchp->comp_handler_lock); + if (schp != rchp) { + cxio_set_cq_in_error(&schp->cq); + spin_lock(&schp->comp_handler_lock); + (*schp->ibcq.comp_handler)(&schp->ibcq, + schp->ibcq.cq_context); + spin_unlock(&schp->comp_handler_lock); + } + return; + } + __flush_qp(qhp, rchp, schp); +} + + +/* + * Return count of RECV WRs posted + */ +u16 iwch_rqes_posted(struct iwch_qp *qhp) +{ + union t3_wr *wqe = qhp->wq.queue; + u16 count = 0; + + while (count < USHRT_MAX && fw_riwrh_opcode((struct fw_riwrh *)wqe) == T3_WR_RCV) { + count++; + wqe++; + } + PDBG("%s qhp %p count %u\n", __func__, qhp, count); + return count; +} + +static int rdma_init(struct iwch_dev *rhp, struct iwch_qp *qhp, + enum iwch_qp_attr_mask mask, + struct iwch_qp_attributes *attrs) +{ + struct t3_rdma_init_attr init_attr; + int ret; + + init_attr.tid = qhp->ep->hwtid; + init_attr.qpid = qhp->wq.qpid; + init_attr.pdid = qhp->attr.pd; + init_attr.scqid = qhp->attr.scq; + init_attr.rcqid = qhp->attr.rcq; + init_attr.rq_addr = qhp->wq.rq_addr; + init_attr.rq_size = 1 << qhp->wq.rq_size_log2; + init_attr.mpaattrs = uP_RI_MPA_IETF_ENABLE | + qhp->attr.mpa_attr.recv_marker_enabled | + (qhp->attr.mpa_attr.xmit_marker_enabled << 1) | + (qhp->attr.mpa_attr.crc_enabled << 2); + + init_attr.qpcaps = uP_RI_QP_RDMA_READ_ENABLE | + uP_RI_QP_RDMA_WRITE_ENABLE | + uP_RI_QP_BIND_ENABLE; + if 
(!qhp->ibqp.uobject) + init_attr.qpcaps |= uP_RI_QP_STAG0_ENABLE | + uP_RI_QP_FAST_REGISTER_ENABLE; + + init_attr.tcp_emss = qhp->ep->emss; + init_attr.ord = qhp->attr.max_ord; + init_attr.ird = qhp->attr.max_ird; + init_attr.qp_dma_addr = qhp->wq.dma_addr; + init_attr.qp_dma_size = (1UL << qhp->wq.size_log2); + init_attr.rqe_count = iwch_rqes_posted(qhp); + init_attr.flags = qhp->attr.mpa_attr.initiator ? MPA_INITIATOR : 0; + init_attr.chan = qhp->ep->l2t->smt_idx; + if (peer2peer) { + init_attr.rtr_type = RTR_READ; + if (init_attr.ord == 0 && qhp->attr.mpa_attr.initiator) + init_attr.ord = 1; + if (init_attr.ird == 0 && !qhp->attr.mpa_attr.initiator) + init_attr.ird = 1; + } else + init_attr.rtr_type = 0; + init_attr.irs = qhp->ep->rcv_seq; + PDBG("%s init_attr.rq_addr 0x%x init_attr.rq_size = %d " + "flags 0x%x qpcaps 0x%x\n", __func__, + init_attr.rq_addr, init_attr.rq_size, + init_attr.flags, init_attr.qpcaps); + ret = cxio_rdma_init(&rhp->rdev, &init_attr); + PDBG("%s ret %d\n", __func__, ret); + return ret; +} + +int iwch_modify_qp(struct iwch_dev *rhp, struct iwch_qp *qhp, + enum iwch_qp_attr_mask mask, + struct iwch_qp_attributes *attrs, + int internal) +{ + int ret = 0; + struct iwch_qp_attributes newattr = qhp->attr; + unsigned long flag; + int disconnect = 0; + int terminate = 0; + int abort = 0; + int free = 0; + struct iwch_ep *ep = NULL; + + PDBG("%s qhp %p qpid 0x%x ep %p state %d -> %d\n", __func__, + qhp, qhp->wq.qpid, qhp->ep, qhp->attr.state, + (mask & IWCH_QP_ATTR_NEXT_STATE) ? attrs->next_state : -1); + + spin_lock_irqsave(&qhp->lock, flag); + + /* Process attr changes if in IDLE */ + if (mask & IWCH_QP_ATTR_VALID_MODIFY) { + if (qhp->attr.state != IWCH_QP_STATE_IDLE) { + ret = -EIO; + goto out; + } + if (mask & IWCH_QP_ATTR_ENABLE_RDMA_READ) + newattr.enable_rdma_read = attrs->enable_rdma_read; + if (mask & IWCH_QP_ATTR_ENABLE_RDMA_WRITE) + newattr.enable_rdma_write = attrs->enable_rdma_write; + if (mask & IWCH_QP_ATTR_ENABLE_RDMA_BIND) + newattr.enable_bind = attrs->enable_bind; + if (mask & IWCH_QP_ATTR_MAX_ORD) { + if (attrs->max_ord > + rhp->attr.max_rdma_read_qp_depth) { + ret = -EINVAL; + goto out; + } + newattr.max_ord = attrs->max_ord; + } + if (mask & IWCH_QP_ATTR_MAX_IRD) { + if (attrs->max_ird > + rhp->attr.max_rdma_reads_per_qp) { + ret = -EINVAL; + goto out; + } + newattr.max_ird = attrs->max_ird; + } + qhp->attr = newattr; + } + + if (!(mask & IWCH_QP_ATTR_NEXT_STATE)) + goto out; + if (qhp->attr.state == attrs->next_state) + goto out; + + switch (qhp->attr.state) { + case IWCH_QP_STATE_IDLE: + switch (attrs->next_state) { + case IWCH_QP_STATE_RTS: + if (!(mask & IWCH_QP_ATTR_LLP_STREAM_HANDLE)) { + ret = -EINVAL; + goto out; + } + if (!(mask & IWCH_QP_ATTR_MPA_ATTR)) { + ret = -EINVAL; + goto out; + } + qhp->attr.mpa_attr = attrs->mpa_attr; + qhp->attr.llp_stream_handle = attrs->llp_stream_handle; + qhp->ep = qhp->attr.llp_stream_handle; + qhp->attr.state = IWCH_QP_STATE_RTS; + + /* + * Ref the endpoint here and deref when we + * disassociate the endpoint from the QP. This + * happens in CLOSING->IDLE transition or *->ERROR + * transition. 
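+	 * The matching put_ep() happens either in the CLOSING->IDLE
+	 * case below or on the err/free path once the QP has moved
+	 * to ERROR.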
+ */ + get_ep(&qhp->ep->com); + spin_unlock_irqrestore(&qhp->lock, flag); + ret = rdma_init(rhp, qhp, mask, attrs); + spin_lock_irqsave(&qhp->lock, flag); + if (ret) + goto err; + break; + case IWCH_QP_STATE_ERROR: + qhp->attr.state = IWCH_QP_STATE_ERROR; + flush_qp(qhp); + break; + default: + ret = -EINVAL; + goto out; + } + break; + case IWCH_QP_STATE_RTS: + switch (attrs->next_state) { + case IWCH_QP_STATE_CLOSING: + BUG_ON(atomic_read(&qhp->ep->com.kref.refcount) < 2); + qhp->attr.state = IWCH_QP_STATE_CLOSING; + if (!internal) { + abort=0; + disconnect = 1; + ep = qhp->ep; + get_ep(&ep->com); + } + break; + case IWCH_QP_STATE_TERMINATE: + qhp->attr.state = IWCH_QP_STATE_TERMINATE; + if (qhp->ibqp.uobject) + cxio_set_wq_in_error(&qhp->wq); + if (!internal) + terminate = 1; + break; + case IWCH_QP_STATE_ERROR: + qhp->attr.state = IWCH_QP_STATE_ERROR; + if (!internal) { + abort=1; + disconnect = 1; + ep = qhp->ep; + get_ep(&ep->com); + } + goto err; + break; + default: + ret = -EINVAL; + goto out; + } + break; + case IWCH_QP_STATE_CLOSING: + if (!internal) { + ret = -EINVAL; + goto out; + } + switch (attrs->next_state) { + case IWCH_QP_STATE_IDLE: + flush_qp(qhp); + qhp->attr.state = IWCH_QP_STATE_IDLE; + qhp->attr.llp_stream_handle = NULL; + put_ep(&qhp->ep->com); + qhp->ep = NULL; + wake_up(&qhp->wait); + break; + case IWCH_QP_STATE_ERROR: + goto err; + default: + ret = -EINVAL; + goto err; + } + break; + case IWCH_QP_STATE_ERROR: + if (attrs->next_state != IWCH_QP_STATE_IDLE) { + ret = -EINVAL; + goto out; + } + + if (!Q_EMPTY(qhp->wq.sq_rptr, qhp->wq.sq_wptr) || + !Q_EMPTY(qhp->wq.rq_rptr, qhp->wq.rq_wptr)) { + ret = -EINVAL; + goto out; + } + qhp->attr.state = IWCH_QP_STATE_IDLE; + break; + case IWCH_QP_STATE_TERMINATE: + if (!internal) { + ret = -EINVAL; + goto out; + } + goto err; + break; + default: + printk(KERN_ERR "%s in a bad state %d\n", + __func__, qhp->attr.state); + ret = -EINVAL; + goto err; + break; + } + goto out; +err: + PDBG("%s disassociating ep %p qpid 0x%x\n", __func__, qhp->ep, + qhp->wq.qpid); + + /* disassociate the LLP connection */ + qhp->attr.llp_stream_handle = NULL; + ep = qhp->ep; + qhp->ep = NULL; + qhp->attr.state = IWCH_QP_STATE_ERROR; + free=1; + wake_up(&qhp->wait); + BUG_ON(!ep); + flush_qp(qhp); +out: + spin_unlock_irqrestore(&qhp->lock, flag); + + if (terminate) + iwch_post_terminate(qhp, NULL); + + /* + * If disconnect is 1, then we need to initiate a disconnect + * on the EP. This can be a normal close (RTS->CLOSING) or + * an abnormal close (RTS/CLOSING->ERROR). + */ + if (disconnect) { + iwch_ep_disconnect(ep, abort, GFP_KERNEL); + put_ep(&ep->com); + } + + /* + * If free is 1, then we've disassociated the EP from the QP + * and we need to dereference the EP. + */ + if (free) + put_ep(&ep->com); + + PDBG("%s exit state %d\n", __func__, qhp->attr.state); + return ret; +} diff --git a/kernel/drivers/infiniband/hw/cxgb3/iwch_user.h b/kernel/drivers/infiniband/hw/cxgb3/iwch_user.h new file mode 100644 index 000000000..a277c31fc --- /dev/null +++ b/kernel/drivers/infiniband/hw/cxgb3/iwch_user.h @@ -0,0 +1,74 @@ +/* + * Copyright (c) 2006 Chelsio, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. 
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef __IWCH_USER_H__ +#define __IWCH_USER_H__ + +#define IWCH_UVERBS_ABI_VERSION 1 + +/* + * Make sure that all structs defined in this file remain laid out so + * that they pack the same way on 32-bit and 64-bit architectures (to + * avoid incompatibility between 32-bit userspace and 64-bit kernels). + * In particular do not use pointer types -- pass pointers in __u64 + * instead. + */ +struct iwch_create_cq_req { + __u64 user_rptr_addr; +}; + +struct iwch_create_cq_resp_v0 { + __u64 key; + __u32 cqid; + __u32 size_log2; +}; + +struct iwch_create_cq_resp { + __u64 key; + __u32 cqid; + __u32 size_log2; + __u32 memsize; + __u32 reserved; +}; + +struct iwch_create_qp_resp { + __u64 key; + __u64 db_key; + __u32 qpid; + __u32 size_log2; + __u32 sq_size_log2; + __u32 rq_size_log2; +}; + +struct iwch_reg_user_mr_resp { + __u32 pbl_addr; +}; +#endif diff --git a/kernel/drivers/infiniband/hw/cxgb3/tcb.h b/kernel/drivers/infiniband/hw/cxgb3/tcb.h new file mode 100644 index 000000000..c702dc199 --- /dev/null +++ b/kernel/drivers/infiniband/hw/cxgb3/tcb.h @@ -0,0 +1,632 @@ +/* + * Copyright (c) 2007 Chelsio, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef _TCB_DEFS_H +#define _TCB_DEFS_H + +#define W_TCB_T_STATE 0 +#define S_TCB_T_STATE 0 +#define M_TCB_T_STATE 0xfULL +#define V_TCB_T_STATE(x) ((x) << S_TCB_T_STATE) + +#define W_TCB_TIMER 0 +#define S_TCB_TIMER 4 +#define M_TCB_TIMER 0x1ULL +#define V_TCB_TIMER(x) ((x) << S_TCB_TIMER) + +#define W_TCB_DACK_TIMER 0 +#define S_TCB_DACK_TIMER 5 +#define M_TCB_DACK_TIMER 0x1ULL +#define V_TCB_DACK_TIMER(x) ((x) << S_TCB_DACK_TIMER) + +#define W_TCB_DEL_FLAG 0 +#define S_TCB_DEL_FLAG 6 +#define M_TCB_DEL_FLAG 0x1ULL +#define V_TCB_DEL_FLAG(x) ((x) << S_TCB_DEL_FLAG) + +#define W_TCB_L2T_IX 0 +#define S_TCB_L2T_IX 7 +#define M_TCB_L2T_IX 0x7ffULL +#define V_TCB_L2T_IX(x) ((x) << S_TCB_L2T_IX) + +#define W_TCB_SMAC_SEL 0 +#define S_TCB_SMAC_SEL 18 +#define M_TCB_SMAC_SEL 0x3ULL +#define V_TCB_SMAC_SEL(x) ((x) << S_TCB_SMAC_SEL) + +#define W_TCB_TOS 0 +#define S_TCB_TOS 20 +#define M_TCB_TOS 0x3fULL +#define V_TCB_TOS(x) ((x) << S_TCB_TOS) + +#define W_TCB_MAX_RT 0 +#define S_TCB_MAX_RT 26 +#define M_TCB_MAX_RT 0xfULL +#define V_TCB_MAX_RT(x) ((x) << S_TCB_MAX_RT) + +#define W_TCB_T_RXTSHIFT 0 +#define S_TCB_T_RXTSHIFT 30 +#define M_TCB_T_RXTSHIFT 0xfULL +#define V_TCB_T_RXTSHIFT(x) ((x) << S_TCB_T_RXTSHIFT) + +#define W_TCB_T_DUPACKS 1 +#define S_TCB_T_DUPACKS 2 +#define M_TCB_T_DUPACKS 0xfULL +#define V_TCB_T_DUPACKS(x) ((x) << S_TCB_T_DUPACKS) + +#define W_TCB_T_MAXSEG 1 +#define S_TCB_T_MAXSEG 6 +#define M_TCB_T_MAXSEG 0xfULL +#define V_TCB_T_MAXSEG(x) ((x) << S_TCB_T_MAXSEG) + +#define W_TCB_T_FLAGS1 1 +#define S_TCB_T_FLAGS1 10 +#define M_TCB_T_FLAGS1 0xffffffffULL +#define V_TCB_T_FLAGS1(x) ((x) << S_TCB_T_FLAGS1) + +#define W_TCB_T_MIGRATION 1 +#define S_TCB_T_MIGRATION 20 +#define M_TCB_T_MIGRATION 0x1ULL +#define V_TCB_T_MIGRATION(x) ((x) << S_TCB_T_MIGRATION) + +#define W_TCB_T_FLAGS2 2 +#define S_TCB_T_FLAGS2 10 +#define M_TCB_T_FLAGS2 0x7fULL +#define V_TCB_T_FLAGS2(x) ((x) << S_TCB_T_FLAGS2) + +#define W_TCB_SND_SCALE 2 +#define S_TCB_SND_SCALE 17 +#define M_TCB_SND_SCALE 0xfULL +#define V_TCB_SND_SCALE(x) ((x) << S_TCB_SND_SCALE) + +#define W_TCB_RCV_SCALE 2 +#define S_TCB_RCV_SCALE 21 +#define M_TCB_RCV_SCALE 0xfULL +#define V_TCB_RCV_SCALE(x) ((x) << S_TCB_RCV_SCALE) + +#define W_TCB_SND_UNA_RAW 2 +#define S_TCB_SND_UNA_RAW 25 +#define M_TCB_SND_UNA_RAW 0x7ffffffULL +#define V_TCB_SND_UNA_RAW(x) ((x) << S_TCB_SND_UNA_RAW) + +#define W_TCB_SND_NXT_RAW 3 +#define S_TCB_SND_NXT_RAW 20 +#define M_TCB_SND_NXT_RAW 0x7ffffffULL +#define V_TCB_SND_NXT_RAW(x) ((x) << S_TCB_SND_NXT_RAW) + +#define W_TCB_RCV_NXT 4 +#define S_TCB_RCV_NXT 15 +#define M_TCB_RCV_NXT 0xffffffffULL +#define V_TCB_RCV_NXT(x) ((x) << S_TCB_RCV_NXT) + +#define W_TCB_RCV_ADV 5 +#define S_TCB_RCV_ADV 15 +#define M_TCB_RCV_ADV 0xffffULL +#define V_TCB_RCV_ADV(x) ((x) << S_TCB_RCV_ADV) + +#define W_TCB_SND_MAX_RAW 5 +#define S_TCB_SND_MAX_RAW 31 +#define M_TCB_SND_MAX_RAW 0x7ffffffULL +#define V_TCB_SND_MAX_RAW(x) ((x) << S_TCB_SND_MAX_RAW) + +#define W_TCB_SND_CWND 6 +#define S_TCB_SND_CWND 26 +#define M_TCB_SND_CWND 0x7ffffffULL +#define V_TCB_SND_CWND(x) ((x) << S_TCB_SND_CWND) + +#define W_TCB_SND_SSTHRESH 7 +#define S_TCB_SND_SSTHRESH 21 +#define M_TCB_SND_SSTHRESH 0x7ffffffULL +#define V_TCB_SND_SSTHRESH(x) ((x) << 
S_TCB_SND_SSTHRESH) + +#define W_TCB_T_RTT_TS_RECENT_AGE 8 +#define S_TCB_T_RTT_TS_RECENT_AGE 16 +#define M_TCB_T_RTT_TS_RECENT_AGE 0xffffffffULL +#define V_TCB_T_RTT_TS_RECENT_AGE(x) ((x) << S_TCB_T_RTT_TS_RECENT_AGE) + +#define W_TCB_T_RTSEQ_RECENT 9 +#define S_TCB_T_RTSEQ_RECENT 16 +#define M_TCB_T_RTSEQ_RECENT 0xffffffffULL +#define V_TCB_T_RTSEQ_RECENT(x) ((x) << S_TCB_T_RTSEQ_RECENT) + +#define W_TCB_T_SRTT 10 +#define S_TCB_T_SRTT 16 +#define M_TCB_T_SRTT 0xffffULL +#define V_TCB_T_SRTT(x) ((x) << S_TCB_T_SRTT) + +#define W_TCB_T_RTTVAR 11 +#define S_TCB_T_RTTVAR 0 +#define M_TCB_T_RTTVAR 0xffffULL +#define V_TCB_T_RTTVAR(x) ((x) << S_TCB_T_RTTVAR) + +#define W_TCB_TS_LAST_ACK_SENT_RAW 11 +#define S_TCB_TS_LAST_ACK_SENT_RAW 16 +#define M_TCB_TS_LAST_ACK_SENT_RAW 0x7ffffffULL +#define V_TCB_TS_LAST_ACK_SENT_RAW(x) ((x) << S_TCB_TS_LAST_ACK_SENT_RAW) + +#define W_TCB_DIP 12 +#define S_TCB_DIP 11 +#define M_TCB_DIP 0xffffffffULL +#define V_TCB_DIP(x) ((x) << S_TCB_DIP) + +#define W_TCB_SIP 13 +#define S_TCB_SIP 11 +#define M_TCB_SIP 0xffffffffULL +#define V_TCB_SIP(x) ((x) << S_TCB_SIP) + +#define W_TCB_DP 14 +#define S_TCB_DP 11 +#define M_TCB_DP 0xffffULL +#define V_TCB_DP(x) ((x) << S_TCB_DP) + +#define W_TCB_SP 14 +#define S_TCB_SP 27 +#define M_TCB_SP 0xffffULL +#define V_TCB_SP(x) ((x) << S_TCB_SP) + +#define W_TCB_TIMESTAMP 15 +#define S_TCB_TIMESTAMP 11 +#define M_TCB_TIMESTAMP 0xffffffffULL +#define V_TCB_TIMESTAMP(x) ((x) << S_TCB_TIMESTAMP) + +#define W_TCB_TIMESTAMP_OFFSET 16 +#define S_TCB_TIMESTAMP_OFFSET 11 +#define M_TCB_TIMESTAMP_OFFSET 0xfULL +#define V_TCB_TIMESTAMP_OFFSET(x) ((x) << S_TCB_TIMESTAMP_OFFSET) + +#define W_TCB_TX_MAX 16 +#define S_TCB_TX_MAX 15 +#define M_TCB_TX_MAX 0xffffffffULL +#define V_TCB_TX_MAX(x) ((x) << S_TCB_TX_MAX) + +#define W_TCB_TX_HDR_PTR_RAW 17 +#define S_TCB_TX_HDR_PTR_RAW 15 +#define M_TCB_TX_HDR_PTR_RAW 0x1ffffULL +#define V_TCB_TX_HDR_PTR_RAW(x) ((x) << S_TCB_TX_HDR_PTR_RAW) + +#define W_TCB_TX_LAST_PTR_RAW 18 +#define S_TCB_TX_LAST_PTR_RAW 0 +#define M_TCB_TX_LAST_PTR_RAW 0x1ffffULL +#define V_TCB_TX_LAST_PTR_RAW(x) ((x) << S_TCB_TX_LAST_PTR_RAW) + +#define W_TCB_TX_COMPACT 18 +#define S_TCB_TX_COMPACT 17 +#define M_TCB_TX_COMPACT 0x1ULL +#define V_TCB_TX_COMPACT(x) ((x) << S_TCB_TX_COMPACT) + +#define W_TCB_RX_COMPACT 18 +#define S_TCB_RX_COMPACT 18 +#define M_TCB_RX_COMPACT 0x1ULL +#define V_TCB_RX_COMPACT(x) ((x) << S_TCB_RX_COMPACT) + +#define W_TCB_RCV_WND 18 +#define S_TCB_RCV_WND 19 +#define M_TCB_RCV_WND 0x7ffffffULL +#define V_TCB_RCV_WND(x) ((x) << S_TCB_RCV_WND) + +#define W_TCB_RX_HDR_OFFSET 19 +#define S_TCB_RX_HDR_OFFSET 14 +#define M_TCB_RX_HDR_OFFSET 0x7ffffffULL +#define V_TCB_RX_HDR_OFFSET(x) ((x) << S_TCB_RX_HDR_OFFSET) + +#define W_TCB_RX_FRAG0_START_IDX_RAW 20 +#define S_TCB_RX_FRAG0_START_IDX_RAW 9 +#define M_TCB_RX_FRAG0_START_IDX_RAW 0x7ffffffULL +#define V_TCB_RX_FRAG0_START_IDX_RAW(x) ((x) << S_TCB_RX_FRAG0_START_IDX_RAW) + +#define W_TCB_RX_FRAG1_START_IDX_OFFSET 21 +#define S_TCB_RX_FRAG1_START_IDX_OFFSET 4 +#define M_TCB_RX_FRAG1_START_IDX_OFFSET 0x7ffffffULL +#define V_TCB_RX_FRAG1_START_IDX_OFFSET(x) ((x) << S_TCB_RX_FRAG1_START_IDX_OFFSET) + +#define W_TCB_RX_FRAG0_LEN 21 +#define S_TCB_RX_FRAG0_LEN 31 +#define M_TCB_RX_FRAG0_LEN 0x7ffffffULL +#define V_TCB_RX_FRAG0_LEN(x) ((x) << S_TCB_RX_FRAG0_LEN) + +#define W_TCB_RX_FRAG1_LEN 22 +#define S_TCB_RX_FRAG1_LEN 26 +#define M_TCB_RX_FRAG1_LEN 0x7ffffffULL +#define V_TCB_RX_FRAG1_LEN(x) ((x) << S_TCB_RX_FRAG1_LEN) + +#define W_TCB_NEWRENO_RECOVER 23 
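/*
 * Editorial sketch, not part of the original Chelsio header: the
 * W_/S_/M_/V_ quadruples in this file each describe one TCB field.
 * W_* is the 32-bit word index within the connection's TCB, S_* the
 * bit offset inside that word, M_* the unshifted field mask, and
 * V_*(x) the value shifted into position.  The hypothetical helper
 * below shows how a mask/value pair for a SET_TCB_FIELD-style update
 * of the RCV_SCALE field (TCB word W_TCB_RCV_SCALE) could be built
 * from these macros.
 */
static inline void example_tcb_rcv_scale(unsigned long long *mask,
					 unsigned long long *val,
					 unsigned long long scale)
{
	/* select every bit of the RCV_SCALE field ... */
	*mask = V_TCB_RCV_SCALE(M_TCB_RCV_SCALE);
	/* ... and place the new window-scale value at the same position */
	*val = V_TCB_RCV_SCALE(scale & M_TCB_RCV_SCALE);
}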
+#define S_TCB_NEWRENO_RECOVER 21 +#define M_TCB_NEWRENO_RECOVER 0x7ffffffULL +#define V_TCB_NEWRENO_RECOVER(x) ((x) << S_TCB_NEWRENO_RECOVER) + +#define W_TCB_PDU_HAVE_LEN 24 +#define S_TCB_PDU_HAVE_LEN 16 +#define M_TCB_PDU_HAVE_LEN 0x1ULL +#define V_TCB_PDU_HAVE_LEN(x) ((x) << S_TCB_PDU_HAVE_LEN) + +#define W_TCB_PDU_LEN 24 +#define S_TCB_PDU_LEN 17 +#define M_TCB_PDU_LEN 0xffffULL +#define V_TCB_PDU_LEN(x) ((x) << S_TCB_PDU_LEN) + +#define W_TCB_RX_QUIESCE 25 +#define S_TCB_RX_QUIESCE 1 +#define M_TCB_RX_QUIESCE 0x1ULL +#define V_TCB_RX_QUIESCE(x) ((x) << S_TCB_RX_QUIESCE) + +#define W_TCB_RX_PTR_RAW 25 +#define S_TCB_RX_PTR_RAW 2 +#define M_TCB_RX_PTR_RAW 0x1ffffULL +#define V_TCB_RX_PTR_RAW(x) ((x) << S_TCB_RX_PTR_RAW) + +#define W_TCB_CPU_NO 25 +#define S_TCB_CPU_NO 19 +#define M_TCB_CPU_NO 0x7fULL +#define V_TCB_CPU_NO(x) ((x) << S_TCB_CPU_NO) + +#define W_TCB_ULP_TYPE 25 +#define S_TCB_ULP_TYPE 26 +#define M_TCB_ULP_TYPE 0xfULL +#define V_TCB_ULP_TYPE(x) ((x) << S_TCB_ULP_TYPE) + +#define W_TCB_RX_FRAG1_PTR_RAW 25 +#define S_TCB_RX_FRAG1_PTR_RAW 30 +#define M_TCB_RX_FRAG1_PTR_RAW 0x1ffffULL +#define V_TCB_RX_FRAG1_PTR_RAW(x) ((x) << S_TCB_RX_FRAG1_PTR_RAW) + +#define W_TCB_RX_FRAG2_START_IDX_OFFSET_RAW 26 +#define S_TCB_RX_FRAG2_START_IDX_OFFSET_RAW 15 +#define M_TCB_RX_FRAG2_START_IDX_OFFSET_RAW 0x7ffffffULL +#define V_TCB_RX_FRAG2_START_IDX_OFFSET_RAW(x) ((x) << S_TCB_RX_FRAG2_START_IDX_OFFSET_RAW) + +#define W_TCB_RX_FRAG2_PTR_RAW 27 +#define S_TCB_RX_FRAG2_PTR_RAW 10 +#define M_TCB_RX_FRAG2_PTR_RAW 0x1ffffULL +#define V_TCB_RX_FRAG2_PTR_RAW(x) ((x) << S_TCB_RX_FRAG2_PTR_RAW) + +#define W_TCB_RX_FRAG2_LEN_RAW 27 +#define S_TCB_RX_FRAG2_LEN_RAW 27 +#define M_TCB_RX_FRAG2_LEN_RAW 0x7ffffffULL +#define V_TCB_RX_FRAG2_LEN_RAW(x) ((x) << S_TCB_RX_FRAG2_LEN_RAW) + +#define W_TCB_RX_FRAG3_PTR_RAW 28 +#define S_TCB_RX_FRAG3_PTR_RAW 22 +#define M_TCB_RX_FRAG3_PTR_RAW 0x1ffffULL +#define V_TCB_RX_FRAG3_PTR_RAW(x) ((x) << S_TCB_RX_FRAG3_PTR_RAW) + +#define W_TCB_RX_FRAG3_LEN_RAW 29 +#define S_TCB_RX_FRAG3_LEN_RAW 7 +#define M_TCB_RX_FRAG3_LEN_RAW 0x7ffffffULL +#define V_TCB_RX_FRAG3_LEN_RAW(x) ((x) << S_TCB_RX_FRAG3_LEN_RAW) + +#define W_TCB_RX_FRAG3_START_IDX_OFFSET_RAW 30 +#define S_TCB_RX_FRAG3_START_IDX_OFFSET_RAW 2 +#define M_TCB_RX_FRAG3_START_IDX_OFFSET_RAW 0x7ffffffULL +#define V_TCB_RX_FRAG3_START_IDX_OFFSET_RAW(x) ((x) << S_TCB_RX_FRAG3_START_IDX_OFFSET_RAW) + +#define W_TCB_PDU_HDR_LEN 30 +#define S_TCB_PDU_HDR_LEN 29 +#define M_TCB_PDU_HDR_LEN 0xffULL +#define V_TCB_PDU_HDR_LEN(x) ((x) << S_TCB_PDU_HDR_LEN) + +#define W_TCB_SLUSH1 31 +#define S_TCB_SLUSH1 5 +#define M_TCB_SLUSH1 0x7ffffULL +#define V_TCB_SLUSH1(x) ((x) << S_TCB_SLUSH1) + +#define W_TCB_ULP_RAW 31 +#define S_TCB_ULP_RAW 24 +#define M_TCB_ULP_RAW 0xffULL +#define V_TCB_ULP_RAW(x) ((x) << S_TCB_ULP_RAW) + +#define W_TCB_DDP_RDMAP_VERSION 25 +#define S_TCB_DDP_RDMAP_VERSION 30 +#define M_TCB_DDP_RDMAP_VERSION 0x1ULL +#define V_TCB_DDP_RDMAP_VERSION(x) ((x) << S_TCB_DDP_RDMAP_VERSION) + +#define W_TCB_MARKER_ENABLE_RX 25 +#define S_TCB_MARKER_ENABLE_RX 31 +#define M_TCB_MARKER_ENABLE_RX 0x1ULL +#define V_TCB_MARKER_ENABLE_RX(x) ((x) << S_TCB_MARKER_ENABLE_RX) + +#define W_TCB_MARKER_ENABLE_TX 26 +#define S_TCB_MARKER_ENABLE_TX 0 +#define M_TCB_MARKER_ENABLE_TX 0x1ULL +#define V_TCB_MARKER_ENABLE_TX(x) ((x) << S_TCB_MARKER_ENABLE_TX) + +#define W_TCB_CRC_ENABLE 26 +#define S_TCB_CRC_ENABLE 1 +#define M_TCB_CRC_ENABLE 0x1ULL +#define V_TCB_CRC_ENABLE(x) ((x) << S_TCB_CRC_ENABLE) + +#define W_TCB_IRS_ULP 26 +#define 
S_TCB_IRS_ULP 2 +#define M_TCB_IRS_ULP 0x1ffULL +#define V_TCB_IRS_ULP(x) ((x) << S_TCB_IRS_ULP) + +#define W_TCB_ISS_ULP 26 +#define S_TCB_ISS_ULP 11 +#define M_TCB_ISS_ULP 0x1ffULL +#define V_TCB_ISS_ULP(x) ((x) << S_TCB_ISS_ULP) + +#define W_TCB_TX_PDU_LEN 26 +#define S_TCB_TX_PDU_LEN 20 +#define M_TCB_TX_PDU_LEN 0x3fffULL +#define V_TCB_TX_PDU_LEN(x) ((x) << S_TCB_TX_PDU_LEN) + +#define W_TCB_TX_PDU_OUT 27 +#define S_TCB_TX_PDU_OUT 2 +#define M_TCB_TX_PDU_OUT 0x1ULL +#define V_TCB_TX_PDU_OUT(x) ((x) << S_TCB_TX_PDU_OUT) + +#define W_TCB_CQ_IDX_SQ 27 +#define S_TCB_CQ_IDX_SQ 3 +#define M_TCB_CQ_IDX_SQ 0xffffULL +#define V_TCB_CQ_IDX_SQ(x) ((x) << S_TCB_CQ_IDX_SQ) + +#define W_TCB_CQ_IDX_RQ 27 +#define S_TCB_CQ_IDX_RQ 19 +#define M_TCB_CQ_IDX_RQ 0xffffULL +#define V_TCB_CQ_IDX_RQ(x) ((x) << S_TCB_CQ_IDX_RQ) + +#define W_TCB_QP_ID 28 +#define S_TCB_QP_ID 3 +#define M_TCB_QP_ID 0xffffULL +#define V_TCB_QP_ID(x) ((x) << S_TCB_QP_ID) + +#define W_TCB_PD_ID 28 +#define S_TCB_PD_ID 19 +#define M_TCB_PD_ID 0xffffULL +#define V_TCB_PD_ID(x) ((x) << S_TCB_PD_ID) + +#define W_TCB_STAG 29 +#define S_TCB_STAG 3 +#define M_TCB_STAG 0xffffffffULL +#define V_TCB_STAG(x) ((x) << S_TCB_STAG) + +#define W_TCB_RQ_START 30 +#define S_TCB_RQ_START 3 +#define M_TCB_RQ_START 0x3ffffffULL +#define V_TCB_RQ_START(x) ((x) << S_TCB_RQ_START) + +#define W_TCB_RQ_MSN 30 +#define S_TCB_RQ_MSN 29 +#define M_TCB_RQ_MSN 0x3ffULL +#define V_TCB_RQ_MSN(x) ((x) << S_TCB_RQ_MSN) + +#define W_TCB_RQ_MAX_OFFSET 31 +#define S_TCB_RQ_MAX_OFFSET 7 +#define M_TCB_RQ_MAX_OFFSET 0xfULL +#define V_TCB_RQ_MAX_OFFSET(x) ((x) << S_TCB_RQ_MAX_OFFSET) + +#define W_TCB_RQ_WRITE_PTR 31 +#define S_TCB_RQ_WRITE_PTR 11 +#define M_TCB_RQ_WRITE_PTR 0x3ffULL +#define V_TCB_RQ_WRITE_PTR(x) ((x) << S_TCB_RQ_WRITE_PTR) + +#define W_TCB_INB_WRITE_PERM 31 +#define S_TCB_INB_WRITE_PERM 21 +#define M_TCB_INB_WRITE_PERM 0x1ULL +#define V_TCB_INB_WRITE_PERM(x) ((x) << S_TCB_INB_WRITE_PERM) + +#define W_TCB_INB_READ_PERM 31 +#define S_TCB_INB_READ_PERM 22 +#define M_TCB_INB_READ_PERM 0x1ULL +#define V_TCB_INB_READ_PERM(x) ((x) << S_TCB_INB_READ_PERM) + +#define W_TCB_ORD_L_BIT_VLD 31 +#define S_TCB_ORD_L_BIT_VLD 23 +#define M_TCB_ORD_L_BIT_VLD 0x1ULL +#define V_TCB_ORD_L_BIT_VLD(x) ((x) << S_TCB_ORD_L_BIT_VLD) + +#define W_TCB_RDMAP_OPCODE 31 +#define S_TCB_RDMAP_OPCODE 24 +#define M_TCB_RDMAP_OPCODE 0xfULL +#define V_TCB_RDMAP_OPCODE(x) ((x) << S_TCB_RDMAP_OPCODE) + +#define W_TCB_TX_FLUSH 31 +#define S_TCB_TX_FLUSH 28 +#define M_TCB_TX_FLUSH 0x1ULL +#define V_TCB_TX_FLUSH(x) ((x) << S_TCB_TX_FLUSH) + +#define W_TCB_TX_OOS_RXMT 31 +#define S_TCB_TX_OOS_RXMT 29 +#define M_TCB_TX_OOS_RXMT 0x1ULL +#define V_TCB_TX_OOS_RXMT(x) ((x) << S_TCB_TX_OOS_RXMT) + +#define W_TCB_TX_OOS_TXMT 31 +#define S_TCB_TX_OOS_TXMT 30 +#define M_TCB_TX_OOS_TXMT 0x1ULL +#define V_TCB_TX_OOS_TXMT(x) ((x) << S_TCB_TX_OOS_TXMT) + +#define W_TCB_SLUSH_AUX2 31 +#define S_TCB_SLUSH_AUX2 31 +#define M_TCB_SLUSH_AUX2 0x1ULL +#define V_TCB_SLUSH_AUX2(x) ((x) << S_TCB_SLUSH_AUX2) + +#define W_TCB_RX_FRAG1_PTR_RAW2 25 +#define S_TCB_RX_FRAG1_PTR_RAW2 30 +#define M_TCB_RX_FRAG1_PTR_RAW2 0x1ffffULL +#define V_TCB_RX_FRAG1_PTR_RAW2(x) ((x) << S_TCB_RX_FRAG1_PTR_RAW2) + +#define W_TCB_RX_DDP_FLAGS 26 +#define S_TCB_RX_DDP_FLAGS 15 +#define M_TCB_RX_DDP_FLAGS 0x3ffULL +#define V_TCB_RX_DDP_FLAGS(x) ((x) << S_TCB_RX_DDP_FLAGS) + +#define W_TCB_SLUSH_AUX3 26 +#define S_TCB_SLUSH_AUX3 31 +#define M_TCB_SLUSH_AUX3 0x1ffULL +#define V_TCB_SLUSH_AUX3(x) ((x) << S_TCB_SLUSH_AUX3) + +#define 
W_TCB_RX_DDP_BUF0_OFFSET 27 +#define S_TCB_RX_DDP_BUF0_OFFSET 8 +#define M_TCB_RX_DDP_BUF0_OFFSET 0x3fffffULL +#define V_TCB_RX_DDP_BUF0_OFFSET(x) ((x) << S_TCB_RX_DDP_BUF0_OFFSET) + +#define W_TCB_RX_DDP_BUF0_LEN 27 +#define S_TCB_RX_DDP_BUF0_LEN 30 +#define M_TCB_RX_DDP_BUF0_LEN 0x3fffffULL +#define V_TCB_RX_DDP_BUF0_LEN(x) ((x) << S_TCB_RX_DDP_BUF0_LEN) + +#define W_TCB_RX_DDP_BUF1_OFFSET 28 +#define S_TCB_RX_DDP_BUF1_OFFSET 20 +#define M_TCB_RX_DDP_BUF1_OFFSET 0x3fffffULL +#define V_TCB_RX_DDP_BUF1_OFFSET(x) ((x) << S_TCB_RX_DDP_BUF1_OFFSET) + +#define W_TCB_RX_DDP_BUF1_LEN 29 +#define S_TCB_RX_DDP_BUF1_LEN 10 +#define M_TCB_RX_DDP_BUF1_LEN 0x3fffffULL +#define V_TCB_RX_DDP_BUF1_LEN(x) ((x) << S_TCB_RX_DDP_BUF1_LEN) + +#define W_TCB_RX_DDP_BUF0_TAG 30 +#define S_TCB_RX_DDP_BUF0_TAG 0 +#define M_TCB_RX_DDP_BUF0_TAG 0xffffffffULL +#define V_TCB_RX_DDP_BUF0_TAG(x) ((x) << S_TCB_RX_DDP_BUF0_TAG) + +#define W_TCB_RX_DDP_BUF1_TAG 31 +#define S_TCB_RX_DDP_BUF1_TAG 0 +#define M_TCB_RX_DDP_BUF1_TAG 0xffffffffULL +#define V_TCB_RX_DDP_BUF1_TAG(x) ((x) << S_TCB_RX_DDP_BUF1_TAG) + +#define S_TF_DACK 10 +#define V_TF_DACK(x) ((x) << S_TF_DACK) + +#define S_TF_NAGLE 11 +#define V_TF_NAGLE(x) ((x) << S_TF_NAGLE) + +#define S_TF_RECV_SCALE 12 +#define V_TF_RECV_SCALE(x) ((x) << S_TF_RECV_SCALE) + +#define S_TF_RECV_TSTMP 13 +#define V_TF_RECV_TSTMP(x) ((x) << S_TF_RECV_TSTMP) + +#define S_TF_RECV_SACK 14 +#define V_TF_RECV_SACK(x) ((x) << S_TF_RECV_SACK) + +#define S_TF_TURBO 15 +#define V_TF_TURBO(x) ((x) << S_TF_TURBO) + +#define S_TF_KEEPALIVE 16 +#define V_TF_KEEPALIVE(x) ((x) << S_TF_KEEPALIVE) + +#define S_TF_TCAM_BYPASS 17 +#define V_TF_TCAM_BYPASS(x) ((x) << S_TF_TCAM_BYPASS) + +#define S_TF_CORE_FIN 18 +#define V_TF_CORE_FIN(x) ((x) << S_TF_CORE_FIN) + +#define S_TF_CORE_MORE 19 +#define V_TF_CORE_MORE(x) ((x) << S_TF_CORE_MORE) + +#define S_TF_MIGRATING 20 +#define V_TF_MIGRATING(x) ((x) << S_TF_MIGRATING) + +#define S_TF_ACTIVE_OPEN 21 +#define V_TF_ACTIVE_OPEN(x) ((x) << S_TF_ACTIVE_OPEN) + +#define S_TF_ASK_MODE 22 +#define V_TF_ASK_MODE(x) ((x) << S_TF_ASK_MODE) + +#define S_TF_NON_OFFLOAD 23 +#define V_TF_NON_OFFLOAD(x) ((x) << S_TF_NON_OFFLOAD) + +#define S_TF_MOD_SCHD 24 +#define V_TF_MOD_SCHD(x) ((x) << S_TF_MOD_SCHD) + +#define S_TF_MOD_SCHD_REASON0 25 +#define V_TF_MOD_SCHD_REASON0(x) ((x) << S_TF_MOD_SCHD_REASON0) + +#define S_TF_MOD_SCHD_REASON1 26 +#define V_TF_MOD_SCHD_REASON1(x) ((x) << S_TF_MOD_SCHD_REASON1) + +#define S_TF_MOD_SCHD_RX 27 +#define V_TF_MOD_SCHD_RX(x) ((x) << S_TF_MOD_SCHD_RX) + +#define S_TF_CORE_PUSH 28 +#define V_TF_CORE_PUSH(x) ((x) << S_TF_CORE_PUSH) + +#define S_TF_RCV_COALESCE_ENABLE 29 +#define V_TF_RCV_COALESCE_ENABLE(x) ((x) << S_TF_RCV_COALESCE_ENABLE) + +#define S_TF_RCV_COALESCE_PUSH 30 +#define V_TF_RCV_COALESCE_PUSH(x) ((x) << S_TF_RCV_COALESCE_PUSH) + +#define S_TF_RCV_COALESCE_LAST_PSH 31 +#define V_TF_RCV_COALESCE_LAST_PSH(x) ((x) << S_TF_RCV_COALESCE_LAST_PSH) + +#define S_TF_RCV_COALESCE_HEARTBEAT 32 +#define V_TF_RCV_COALESCE_HEARTBEAT(x) ((x) << S_TF_RCV_COALESCE_HEARTBEAT) + +#define S_TF_HALF_CLOSE 33 +#define V_TF_HALF_CLOSE(x) ((x) << S_TF_HALF_CLOSE) + +#define S_TF_DACK_MSS 34 +#define V_TF_DACK_MSS(x) ((x) << S_TF_DACK_MSS) + +#define S_TF_CCTRL_SEL0 35 +#define V_TF_CCTRL_SEL0(x) ((x) << S_TF_CCTRL_SEL0) + +#define S_TF_CCTRL_SEL1 36 +#define V_TF_CCTRL_SEL1(x) ((x) << S_TF_CCTRL_SEL1) + +#define S_TF_TCP_NEWRENO_FAST_RECOVERY 37 +#define V_TF_TCP_NEWRENO_FAST_RECOVERY(x) ((x) << S_TF_TCP_NEWRENO_FAST_RECOVERY) + +#define 
S_TF_TX_PACE_AUTO 38 +#define V_TF_TX_PACE_AUTO(x) ((x) << S_TF_TX_PACE_AUTO) + +#define S_TF_PEER_FIN_HELD 39 +#define V_TF_PEER_FIN_HELD(x) ((x) << S_TF_PEER_FIN_HELD) + +#define S_TF_CORE_URG 40 +#define V_TF_CORE_URG(x) ((x) << S_TF_CORE_URG) + +#define S_TF_RDMA_ERROR 41 +#define V_TF_RDMA_ERROR(x) ((x) << S_TF_RDMA_ERROR) + +#define S_TF_SSWS_DISABLED 42 +#define V_TF_SSWS_DISABLED(x) ((x) << S_TF_SSWS_DISABLED) + +#define S_TF_DUPACK_COUNT_ODD 43 +#define V_TF_DUPACK_COUNT_ODD(x) ((x) << S_TF_DUPACK_COUNT_ODD) + +#define S_TF_TX_CHANNEL 44 +#define V_TF_TX_CHANNEL(x) ((x) << S_TF_TX_CHANNEL) + +#define S_TF_RX_CHANNEL 45 +#define V_TF_RX_CHANNEL(x) ((x) << S_TF_RX_CHANNEL) + +#define S_TF_TX_PACE_FIXED 46 +#define V_TF_TX_PACE_FIXED(x) ((x) << S_TF_TX_PACE_FIXED) + +#define S_TF_RDMA_FLM_ERROR 47 +#define V_TF_RDMA_FLM_ERROR(x) ((x) << S_TF_RDMA_FLM_ERROR) + +#define S_TF_RX_FLOW_CONTROL_DISABLE 48 +#define V_TF_RX_FLOW_CONTROL_DISABLE(x) ((x) << S_TF_RX_FLOW_CONTROL_DISABLE) + +#endif /* _TCB_DEFS_H */ diff --git a/kernel/drivers/infiniband/hw/cxgb4/Kconfig b/kernel/drivers/infiniband/hw/cxgb4/Kconfig new file mode 100644 index 000000000..23f38cf2c --- /dev/null +++ b/kernel/drivers/infiniband/hw/cxgb4/Kconfig @@ -0,0 +1,18 @@ +config INFINIBAND_CXGB4 + tristate "Chelsio T4/T5 RDMA Driver" + depends on CHELSIO_T4 && INET && (IPV6 || IPV6=n) + select GENERIC_ALLOCATOR + ---help--- + This is an iWARP/RDMA driver for the Chelsio T4 and T5 + 1GbE, 10GbE adapters and T5 40GbE adapter. + + For general information about Chelsio and our products, visit + our website at . + + For customer support, please visit our customer support page at + . + + Please send feedback to . + + To compile this driver as a module, choose M here: the module + will be called iw_cxgb4. diff --git a/kernel/drivers/infiniband/hw/cxgb4/Makefile b/kernel/drivers/infiniband/hw/cxgb4/Makefile new file mode 100644 index 000000000..e11cf7299 --- /dev/null +++ b/kernel/drivers/infiniband/hw/cxgb4/Makefile @@ -0,0 +1,5 @@ +ccflags-y := -Idrivers/net/ethernet/chelsio/cxgb4 + +obj-$(CONFIG_INFINIBAND_CXGB4) += iw_cxgb4.o + +iw_cxgb4-y := device.o cm.o provider.o mem.o cq.o qp.o resource.o ev.o id_table.o diff --git a/kernel/drivers/infiniband/hw/cxgb4/cm.c b/kernel/drivers/infiniband/hw/cxgb4/cm.c new file mode 100644 index 000000000..3ad8dc798 --- /dev/null +++ b/kernel/drivers/infiniband/hw/cxgb4/cm.c @@ -0,0 +1,4053 @@ +/* + * Copyright (c) 2009-2014 Chelsio, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +#include + +#include "iw_cxgb4.h" + +static char *states[] = { + "idle", + "listen", + "connecting", + "mpa_wait_req", + "mpa_req_sent", + "mpa_req_rcvd", + "mpa_rep_sent", + "fpdu_mode", + "aborting", + "closing", + "moribund", + "dead", + NULL, +}; + +static int nocong; +module_param(nocong, int, 0644); +MODULE_PARM_DESC(nocong, "Turn of congestion control (default=0)"); + +static int enable_ecn; +module_param(enable_ecn, int, 0644); +MODULE_PARM_DESC(enable_ecn, "Enable ECN (default=0/disabled)"); + +static int dack_mode = 1; +module_param(dack_mode, int, 0644); +MODULE_PARM_DESC(dack_mode, "Delayed ack mode (default=1)"); + +uint c4iw_max_read_depth = 32; +module_param(c4iw_max_read_depth, int, 0644); +MODULE_PARM_DESC(c4iw_max_read_depth, + "Per-connection max ORD/IRD (default=32)"); + +static int enable_tcp_timestamps; +module_param(enable_tcp_timestamps, int, 0644); +MODULE_PARM_DESC(enable_tcp_timestamps, "Enable tcp timestamps (default=0)"); + +static int enable_tcp_sack; +module_param(enable_tcp_sack, int, 0644); +MODULE_PARM_DESC(enable_tcp_sack, "Enable tcp SACK (default=0)"); + +static int enable_tcp_window_scaling = 1; +module_param(enable_tcp_window_scaling, int, 0644); +MODULE_PARM_DESC(enable_tcp_window_scaling, + "Enable tcp window scaling (default=1)"); + +int c4iw_debug; +module_param(c4iw_debug, int, 0644); +MODULE_PARM_DESC(c4iw_debug, "Enable debug logging (default=0)"); + +static int peer2peer = 1; +module_param(peer2peer, int, 0644); +MODULE_PARM_DESC(peer2peer, "Support peer2peer ULPs (default=1)"); + +static int p2p_type = FW_RI_INIT_P2PTYPE_READ_REQ; +module_param(p2p_type, int, 0644); +MODULE_PARM_DESC(p2p_type, "RDMAP opcode to use for the RTR message: " + "1=RDMA_READ 0=RDMA_WRITE (default 1)"); + +static int ep_timeout_secs = 60; +module_param(ep_timeout_secs, int, 0644); +MODULE_PARM_DESC(ep_timeout_secs, "CM Endpoint operation timeout " + "in seconds (default=60)"); + +static int mpa_rev = 1; +module_param(mpa_rev, int, 0644); +MODULE_PARM_DESC(mpa_rev, "MPA Revision, 0 supports amso1100, " + "1 is RFC0544 spec compliant, 2 is IETF MPA Peer Connect Draft" + " compliant (default=1)"); + +static int markers_enabled; +module_param(markers_enabled, int, 0644); +MODULE_PARM_DESC(markers_enabled, "Enable MPA MARKERS (default(0)=disabled)"); + +static int crc_enabled = 1; +module_param(crc_enabled, int, 0644); +MODULE_PARM_DESC(crc_enabled, "Enable MPA CRC (default(1)=enabled)"); + +static int rcv_win = 256 * 1024; +module_param(rcv_win, int, 0644); +MODULE_PARM_DESC(rcv_win, "TCP receive window in bytes (default=256KB)"); + +static int snd_win = 128 * 1024; +module_param(snd_win, int, 0644); +MODULE_PARM_DESC(snd_win, "TCP send window in bytes (default=128KB)"); + +static struct workqueue_struct *workq; + +static struct sk_buff_head rxq; + +static struct sk_buff *get_skb(struct sk_buff *skb, int len, gfp_t gfp); +static void ep_timeout(unsigned long arg); +static void connect_reply_upcall(struct c4iw_ep *ep, int status); + +static LIST_HEAD(timeout_list); +static spinlock_t timeout_lock; + +static void deref_qp(struct c4iw_ep 
*ep) +{ + c4iw_qp_rem_ref(&ep->com.qp->ibqp); + clear_bit(QP_REFERENCED, &ep->com.flags); +} + +static void ref_qp(struct c4iw_ep *ep) +{ + set_bit(QP_REFERENCED, &ep->com.flags); + c4iw_qp_add_ref(&ep->com.qp->ibqp); +} + +static void start_ep_timer(struct c4iw_ep *ep) +{ + PDBG("%s ep %p\n", __func__, ep); + if (timer_pending(&ep->timer)) { + pr_err("%s timer already started! ep %p\n", + __func__, ep); + return; + } + clear_bit(TIMEOUT, &ep->com.flags); + c4iw_get_ep(&ep->com); + ep->timer.expires = jiffies + ep_timeout_secs * HZ; + ep->timer.data = (unsigned long)ep; + ep->timer.function = ep_timeout; + add_timer(&ep->timer); +} + +static int stop_ep_timer(struct c4iw_ep *ep) +{ + PDBG("%s ep %p stopping\n", __func__, ep); + del_timer_sync(&ep->timer); + if (!test_and_set_bit(TIMEOUT, &ep->com.flags)) { + c4iw_put_ep(&ep->com); + return 0; + } + return 1; +} + +static int c4iw_l2t_send(struct c4iw_rdev *rdev, struct sk_buff *skb, + struct l2t_entry *l2e) +{ + int error = 0; + + if (c4iw_fatal_error(rdev)) { + kfree_skb(skb); + PDBG("%s - device in error state - dropping\n", __func__); + return -EIO; + } + error = cxgb4_l2t_send(rdev->lldi.ports[0], skb, l2e); + if (error < 0) + kfree_skb(skb); + return error < 0 ? error : 0; +} + +int c4iw_ofld_send(struct c4iw_rdev *rdev, struct sk_buff *skb) +{ + int error = 0; + + if (c4iw_fatal_error(rdev)) { + kfree_skb(skb); + PDBG("%s - device in error state - dropping\n", __func__); + return -EIO; + } + error = cxgb4_ofld_send(rdev->lldi.ports[0], skb); + if (error < 0) + kfree_skb(skb); + return error < 0 ? error : 0; +} + +static void release_tid(struct c4iw_rdev *rdev, u32 hwtid, struct sk_buff *skb) +{ + struct cpl_tid_release *req; + + skb = get_skb(skb, sizeof *req, GFP_KERNEL); + if (!skb) + return; + req = (struct cpl_tid_release *) skb_put(skb, sizeof(*req)); + INIT_TP_WR(req, hwtid); + OPCODE_TID(req) = cpu_to_be32(MK_OPCODE_TID(CPL_TID_RELEASE, hwtid)); + set_wr_txq(skb, CPL_PRIORITY_SETUP, 0); + c4iw_ofld_send(rdev, skb); + return; +} + +static void set_emss(struct c4iw_ep *ep, u16 opt) +{ + ep->emss = ep->com.dev->rdev.lldi.mtus[TCPOPT_MSS_G(opt)] - + ((AF_INET == ep->com.remote_addr.ss_family) ? 
+ sizeof(struct iphdr) : sizeof(struct ipv6hdr)) - + sizeof(struct tcphdr); + ep->mss = ep->emss; + if (TCPOPT_TSTAMP_G(opt)) + ep->emss -= round_up(TCPOLEN_TIMESTAMP, 4); + if (ep->emss < 128) + ep->emss = 128; + if (ep->emss & 7) + PDBG("Warning: misaligned mtu idx %u mss %u emss=%u\n", + TCPOPT_MSS_G(opt), ep->mss, ep->emss); + PDBG("%s mss_idx %u mss %u emss=%u\n", __func__, TCPOPT_MSS_G(opt), + ep->mss, ep->emss); +} + +static enum c4iw_ep_state state_read(struct c4iw_ep_common *epc) +{ + enum c4iw_ep_state state; + + mutex_lock(&epc->mutex); + state = epc->state; + mutex_unlock(&epc->mutex); + return state; +} + +static void __state_set(struct c4iw_ep_common *epc, enum c4iw_ep_state new) +{ + epc->state = new; +} + +static void state_set(struct c4iw_ep_common *epc, enum c4iw_ep_state new) +{ + mutex_lock(&epc->mutex); + PDBG("%s - %s -> %s\n", __func__, states[epc->state], states[new]); + __state_set(epc, new); + mutex_unlock(&epc->mutex); + return; +} + +static void *alloc_ep(int size, gfp_t gfp) +{ + struct c4iw_ep_common *epc; + + epc = kzalloc(size, gfp); + if (epc) { + kref_init(&epc->kref); + mutex_init(&epc->mutex); + c4iw_init_wr_wait(&epc->wr_wait); + } + PDBG("%s alloc ep %p\n", __func__, epc); + return epc; +} + +void _c4iw_free_ep(struct kref *kref) +{ + struct c4iw_ep *ep; + + ep = container_of(kref, struct c4iw_ep, com.kref); + PDBG("%s ep %p state %s\n", __func__, ep, states[state_read(&ep->com)]); + if (test_bit(QP_REFERENCED, &ep->com.flags)) + deref_qp(ep); + if (test_bit(RELEASE_RESOURCES, &ep->com.flags)) { + remove_handle(ep->com.dev, &ep->com.dev->hwtid_idr, ep->hwtid); + cxgb4_remove_tid(ep->com.dev->rdev.lldi.tids, 0, ep->hwtid); + dst_release(ep->dst); + cxgb4_l2t_release(ep->l2t); + } + if (test_bit(RELEASE_MAPINFO, &ep->com.flags)) { + print_addr(&ep->com, __func__, "remove_mapinfo/mapping"); + iwpm_remove_mapinfo(&ep->com.local_addr, + &ep->com.mapped_local_addr); + iwpm_remove_mapping(&ep->com.local_addr, RDMA_NL_C4IW); + } + kfree(ep); +} + +static void release_ep_resources(struct c4iw_ep *ep) +{ + set_bit(RELEASE_RESOURCES, &ep->com.flags); + c4iw_put_ep(&ep->com); +} + +static int status2errno(int status) +{ + switch (status) { + case CPL_ERR_NONE: + return 0; + case CPL_ERR_CONN_RESET: + return -ECONNRESET; + case CPL_ERR_ARP_MISS: + return -EHOSTUNREACH; + case CPL_ERR_CONN_TIMEDOUT: + return -ETIMEDOUT; + case CPL_ERR_TCAM_FULL: + return -ENOMEM; + case CPL_ERR_CONN_EXIST: + return -EADDRINUSE; + default: + return -EIO; + } +} + +/* + * Try and reuse skbs already allocated... + */ +static struct sk_buff *get_skb(struct sk_buff *skb, int len, gfp_t gfp) +{ + if (skb && !skb_is_nonlinear(skb) && !skb_cloned(skb)) { + skb_trim(skb, 0); + skb_get(skb); + skb_reset_transport_header(skb); + } else { + skb = alloc_skb(len, gfp); + } + t4_set_arp_err_handler(skb, NULL, NULL); + return skb; +} + +static struct net_device *get_real_dev(struct net_device *egress_dev) +{ + return rdma_vlan_dev_real_dev(egress_dev) ? 
: egress_dev; +} + +static int our_interface(struct c4iw_dev *dev, struct net_device *egress_dev) +{ + int i; + + egress_dev = get_real_dev(egress_dev); + for (i = 0; i < dev->rdev.lldi.nports; i++) + if (dev->rdev.lldi.ports[i] == egress_dev) + return 1; + return 0; +} + +static struct dst_entry *find_route6(struct c4iw_dev *dev, __u8 *local_ip, + __u8 *peer_ip, __be16 local_port, + __be16 peer_port, u8 tos, + __u32 sin6_scope_id) +{ + struct dst_entry *dst = NULL; + + if (IS_ENABLED(CONFIG_IPV6)) { + struct flowi6 fl6; + + memset(&fl6, 0, sizeof(fl6)); + memcpy(&fl6.daddr, peer_ip, 16); + memcpy(&fl6.saddr, local_ip, 16); + if (ipv6_addr_type(&fl6.daddr) & IPV6_ADDR_LINKLOCAL) + fl6.flowi6_oif = sin6_scope_id; + dst = ip6_route_output(&init_net, NULL, &fl6); + if (!dst) + goto out; + if (!our_interface(dev, ip6_dst_idev(dst)->dev) && + !(ip6_dst_idev(dst)->dev->flags & IFF_LOOPBACK)) { + dst_release(dst); + dst = NULL; + } + } + +out: + return dst; +} + +static struct dst_entry *find_route(struct c4iw_dev *dev, __be32 local_ip, + __be32 peer_ip, __be16 local_port, + __be16 peer_port, u8 tos) +{ + struct rtable *rt; + struct flowi4 fl4; + struct neighbour *n; + + rt = ip_route_output_ports(&init_net, &fl4, NULL, peer_ip, local_ip, + peer_port, local_port, IPPROTO_TCP, + tos, 0); + if (IS_ERR(rt)) + return NULL; + n = dst_neigh_lookup(&rt->dst, &peer_ip); + if (!n) + return NULL; + if (!our_interface(dev, n->dev) && + !(n->dev->flags & IFF_LOOPBACK)) { + neigh_release(n); + dst_release(&rt->dst); + return NULL; + } + neigh_release(n); + return &rt->dst; +} + +static void arp_failure_discard(void *handle, struct sk_buff *skb) +{ + PDBG("%s c4iw_dev %p\n", __func__, handle); + kfree_skb(skb); +} + +/* + * Handle an ARP failure for an active open. + */ +static void act_open_req_arp_failure(void *handle, struct sk_buff *skb) +{ + struct c4iw_ep *ep = handle; + + printk(KERN_ERR MOD "ARP failure duing connect\n"); + kfree_skb(skb); + connect_reply_upcall(ep, -EHOSTUNREACH); + state_set(&ep->com, DEAD); + remove_handle(ep->com.dev, &ep->com.dev->atid_idr, ep->atid); + cxgb4_free_atid(ep->com.dev->rdev.lldi.tids, ep->atid); + dst_release(ep->dst); + cxgb4_l2t_release(ep->l2t); + c4iw_put_ep(&ep->com); +} + +/* + * Handle an ARP failure for a CPL_ABORT_REQ. Change it into a no RST variant + * and send it along. 
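+ * (Since ARP failed there is no L2 path to the peer, so sending a RST
+ * would be pointless anyway.)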
+ */ +static void abort_arp_failure(void *handle, struct sk_buff *skb) +{ + struct c4iw_rdev *rdev = handle; + struct cpl_abort_req *req = cplhdr(skb); + + PDBG("%s rdev %p\n", __func__, rdev); + req->cmd = CPL_ABORT_NO_RST; + c4iw_ofld_send(rdev, skb); +} + +static void send_flowc(struct c4iw_ep *ep, struct sk_buff *skb) +{ + unsigned int flowclen = 80; + struct fw_flowc_wr *flowc; + int i; + + skb = get_skb(skb, flowclen, GFP_KERNEL); + flowc = (struct fw_flowc_wr *)__skb_put(skb, flowclen); + + flowc->op_to_nparams = cpu_to_be32(FW_WR_OP_V(FW_FLOWC_WR) | + FW_FLOWC_WR_NPARAMS_V(8)); + flowc->flowid_len16 = cpu_to_be32(FW_WR_LEN16_V(DIV_ROUND_UP(flowclen, + 16)) | FW_WR_FLOWID_V(ep->hwtid)); + + flowc->mnemval[0].mnemonic = FW_FLOWC_MNEM_PFNVFN; + flowc->mnemval[0].val = cpu_to_be32(FW_PFVF_CMD_PFN_V + (ep->com.dev->rdev.lldi.pf)); + flowc->mnemval[1].mnemonic = FW_FLOWC_MNEM_CH; + flowc->mnemval[1].val = cpu_to_be32(ep->tx_chan); + flowc->mnemval[2].mnemonic = FW_FLOWC_MNEM_PORT; + flowc->mnemval[2].val = cpu_to_be32(ep->tx_chan); + flowc->mnemval[3].mnemonic = FW_FLOWC_MNEM_IQID; + flowc->mnemval[3].val = cpu_to_be32(ep->rss_qid); + flowc->mnemval[4].mnemonic = FW_FLOWC_MNEM_SNDNXT; + flowc->mnemval[4].val = cpu_to_be32(ep->snd_seq); + flowc->mnemval[5].mnemonic = FW_FLOWC_MNEM_RCVNXT; + flowc->mnemval[5].val = cpu_to_be32(ep->rcv_seq); + flowc->mnemval[6].mnemonic = FW_FLOWC_MNEM_SNDBUF; + flowc->mnemval[6].val = cpu_to_be32(ep->snd_win); + flowc->mnemval[7].mnemonic = FW_FLOWC_MNEM_MSS; + flowc->mnemval[7].val = cpu_to_be32(ep->emss); + /* Pad WR to 16 byte boundary */ + flowc->mnemval[8].mnemonic = 0; + flowc->mnemval[8].val = 0; + for (i = 0; i < 9; i++) { + flowc->mnemval[i].r4[0] = 0; + flowc->mnemval[i].r4[1] = 0; + flowc->mnemval[i].r4[2] = 0; + } + + set_wr_txq(skb, CPL_PRIORITY_DATA, ep->txq_idx); + c4iw_ofld_send(&ep->com.dev->rdev, skb); +} + +static int send_halfclose(struct c4iw_ep *ep, gfp_t gfp) +{ + struct cpl_close_con_req *req; + struct sk_buff *skb; + int wrlen = roundup(sizeof *req, 16); + + PDBG("%s ep %p tid %u\n", __func__, ep, ep->hwtid); + skb = get_skb(NULL, wrlen, gfp); + if (!skb) { + printk(KERN_ERR MOD "%s - failed to alloc skb\n", __func__); + return -ENOMEM; + } + set_wr_txq(skb, CPL_PRIORITY_DATA, ep->txq_idx); + t4_set_arp_err_handler(skb, NULL, arp_failure_discard); + req = (struct cpl_close_con_req *) skb_put(skb, wrlen); + memset(req, 0, wrlen); + INIT_TP_WR(req, ep->hwtid); + OPCODE_TID(req) = cpu_to_be32(MK_OPCODE_TID(CPL_CLOSE_CON_REQ, + ep->hwtid)); + return c4iw_l2t_send(&ep->com.dev->rdev, skb, ep->l2t); +} + +static int send_abort(struct c4iw_ep *ep, struct sk_buff *skb, gfp_t gfp) +{ + struct cpl_abort_req *req; + int wrlen = roundup(sizeof *req, 16); + + PDBG("%s ep %p tid %u\n", __func__, ep, ep->hwtid); + skb = get_skb(skb, wrlen, gfp); + if (!skb) { + printk(KERN_ERR MOD "%s - failed to alloc skb.\n", + __func__); + return -ENOMEM; + } + set_wr_txq(skb, CPL_PRIORITY_DATA, ep->txq_idx); + t4_set_arp_err_handler(skb, &ep->com.dev->rdev, abort_arp_failure); + req = (struct cpl_abort_req *) skb_put(skb, wrlen); + memset(req, 0, wrlen); + INIT_TP_WR(req, ep->hwtid); + OPCODE_TID(req) = cpu_to_be32(MK_OPCODE_TID(CPL_ABORT_REQ, ep->hwtid)); + req->cmd = CPL_ABORT_SEND_RST; + return c4iw_l2t_send(&ep->com.dev->rdev, skb, ep->l2t); +} + +/* + * c4iw_form_pm_msg - Form a port mapper message with mapping info + */ +static void c4iw_form_pm_msg(struct c4iw_ep *ep, + struct iwpm_sa_data *pm_msg) +{ + memcpy(&pm_msg->loc_addr, &ep->com.local_addr, + 
sizeof(ep->com.local_addr)); + memcpy(&pm_msg->rem_addr, &ep->com.remote_addr, + sizeof(ep->com.remote_addr)); +} + +/* + * c4iw_form_reg_msg - Form a port mapper message with dev info + */ +static void c4iw_form_reg_msg(struct c4iw_dev *dev, + struct iwpm_dev_data *pm_msg) +{ + memcpy(pm_msg->dev_name, dev->ibdev.name, IWPM_DEVNAME_SIZE); + memcpy(pm_msg->if_name, dev->rdev.lldi.ports[0]->name, + IWPM_IFNAME_SIZE); +} + +static void c4iw_record_pm_msg(struct c4iw_ep *ep, + struct iwpm_sa_data *pm_msg) +{ + memcpy(&ep->com.mapped_local_addr, &pm_msg->mapped_loc_addr, + sizeof(ep->com.mapped_local_addr)); + memcpy(&ep->com.mapped_remote_addr, &pm_msg->mapped_rem_addr, + sizeof(ep->com.mapped_remote_addr)); +} + +static int get_remote_addr(struct c4iw_ep *parent_ep, struct c4iw_ep *child_ep) +{ + int ret; + + print_addr(&parent_ep->com, __func__, "get_remote_addr parent_ep "); + print_addr(&child_ep->com, __func__, "get_remote_addr child_ep "); + + ret = iwpm_get_remote_info(&parent_ep->com.mapped_local_addr, + &child_ep->com.mapped_remote_addr, + &child_ep->com.remote_addr, RDMA_NL_C4IW); + if (ret) + PDBG("Unable to find remote peer addr info - err %d\n", ret); + + return ret; +} + +static void best_mtu(const unsigned short *mtus, unsigned short mtu, + unsigned int *idx, int use_ts, int ipv6) +{ + unsigned short hdr_size = (ipv6 ? + sizeof(struct ipv6hdr) : + sizeof(struct iphdr)) + + sizeof(struct tcphdr) + + (use_ts ? + round_up(TCPOLEN_TIMESTAMP, 4) : 0); + unsigned short data_size = mtu - hdr_size; + + cxgb4_best_aligned_mtu(mtus, hdr_size, data_size, 8, idx); +} + +static int send_connect(struct c4iw_ep *ep) +{ + struct cpl_act_open_req *req; + struct cpl_t5_act_open_req *t5_req; + struct cpl_act_open_req6 *req6; + struct cpl_t5_act_open_req6 *t5_req6; + struct sk_buff *skb; + u64 opt0; + u32 opt2; + unsigned int mtu_idx; + int wscale; + int wrlen; + int sizev4 = is_t4(ep->com.dev->rdev.lldi.adapter_type) ? + sizeof(struct cpl_act_open_req) : + sizeof(struct cpl_t5_act_open_req); + int sizev6 = is_t4(ep->com.dev->rdev.lldi.adapter_type) ? + sizeof(struct cpl_act_open_req6) : + sizeof(struct cpl_t5_act_open_req6); + struct sockaddr_in *la = (struct sockaddr_in *) + &ep->com.mapped_local_addr; + struct sockaddr_in *ra = (struct sockaddr_in *) + &ep->com.mapped_remote_addr; + struct sockaddr_in6 *la6 = (struct sockaddr_in6 *) + &ep->com.mapped_local_addr; + struct sockaddr_in6 *ra6 = (struct sockaddr_in6 *) + &ep->com.mapped_remote_addr; + int win; + + wrlen = (ep->com.remote_addr.ss_family == AF_INET) ? + roundup(sizev4, 16) : + roundup(sizev6, 16); + + PDBG("%s ep %p atid %u\n", __func__, ep, ep->atid); + + skb = get_skb(NULL, wrlen, GFP_KERNEL); + if (!skb) { + printk(KERN_ERR MOD "%s - failed to alloc skb.\n", + __func__); + return -ENOMEM; + } + set_wr_txq(skb, CPL_PRIORITY_SETUP, ep->ctrlq_idx); + + best_mtu(ep->com.dev->rdev.lldi.mtus, ep->mtu, &mtu_idx, + enable_tcp_timestamps, + (AF_INET == ep->com.remote_addr.ss_family) ? 0 : 1); + wscale = compute_wscale(rcv_win); + + /* + * Specify the largest window that will fit in opt0. The + * remainder will be specified in the rx_data_ack. + */ + win = ep->rcv_win >> 10; + if (win > RCV_BUFSIZ_M) + win = RCV_BUFSIZ_M; + + opt0 = (nocong ? 
NO_CONG_F : 0) | + KEEP_ALIVE_F | + DELACK_F | + WND_SCALE_V(wscale) | + MSS_IDX_V(mtu_idx) | + L2T_IDX_V(ep->l2t->idx) | + TX_CHAN_V(ep->tx_chan) | + SMAC_SEL_V(ep->smac_idx) | + DSCP_V(ep->tos) | + ULP_MODE_V(ULP_MODE_TCPDDP) | + RCV_BUFSIZ_V(win); + opt2 = RX_CHANNEL_V(0) | + CCTRL_ECN_V(enable_ecn) | + RSS_QUEUE_VALID_F | RSS_QUEUE_V(ep->rss_qid); + if (enable_tcp_timestamps) + opt2 |= TSTAMPS_EN_F; + if (enable_tcp_sack) + opt2 |= SACK_EN_F; + if (wscale && enable_tcp_window_scaling) + opt2 |= WND_SCALE_EN_F; + if (is_t5(ep->com.dev->rdev.lldi.adapter_type)) { + opt2 |= T5_OPT_2_VALID_F; + opt2 |= CONG_CNTRL_V(CONG_ALG_TAHOE); + opt2 |= T5_ISS_F; + } + t4_set_arp_err_handler(skb, ep, act_open_req_arp_failure); + + if (is_t4(ep->com.dev->rdev.lldi.adapter_type)) { + if (ep->com.remote_addr.ss_family == AF_INET) { + req = (struct cpl_act_open_req *) skb_put(skb, wrlen); + INIT_TP_WR(req, 0); + OPCODE_TID(req) = cpu_to_be32( + MK_OPCODE_TID(CPL_ACT_OPEN_REQ, + ((ep->rss_qid << 14) | ep->atid))); + req->local_port = la->sin_port; + req->peer_port = ra->sin_port; + req->local_ip = la->sin_addr.s_addr; + req->peer_ip = ra->sin_addr.s_addr; + req->opt0 = cpu_to_be64(opt0); + req->params = cpu_to_be32(cxgb4_select_ntuple( + ep->com.dev->rdev.lldi.ports[0], + ep->l2t)); + req->opt2 = cpu_to_be32(opt2); + } else { + req6 = (struct cpl_act_open_req6 *)skb_put(skb, wrlen); + + INIT_TP_WR(req6, 0); + OPCODE_TID(req6) = cpu_to_be32( + MK_OPCODE_TID(CPL_ACT_OPEN_REQ6, + ((ep->rss_qid<<14)|ep->atid))); + req6->local_port = la6->sin6_port; + req6->peer_port = ra6->sin6_port; + req6->local_ip_hi = *((__be64 *) + (la6->sin6_addr.s6_addr)); + req6->local_ip_lo = *((__be64 *) + (la6->sin6_addr.s6_addr + 8)); + req6->peer_ip_hi = *((__be64 *) + (ra6->sin6_addr.s6_addr)); + req6->peer_ip_lo = *((__be64 *) + (ra6->sin6_addr.s6_addr + 8)); + req6->opt0 = cpu_to_be64(opt0); + req6->params = cpu_to_be32(cxgb4_select_ntuple( + ep->com.dev->rdev.lldi.ports[0], + ep->l2t)); + req6->opt2 = cpu_to_be32(opt2); + } + } else { + u32 isn = (prandom_u32() & ~7UL) - 1; + + if (peer2peer) + isn += 4; + + if (ep->com.remote_addr.ss_family == AF_INET) { + t5_req = (struct cpl_t5_act_open_req *) + skb_put(skb, wrlen); + INIT_TP_WR(t5_req, 0); + OPCODE_TID(t5_req) = cpu_to_be32( + MK_OPCODE_TID(CPL_ACT_OPEN_REQ, + ((ep->rss_qid << 14) | ep->atid))); + t5_req->local_port = la->sin_port; + t5_req->peer_port = ra->sin_port; + t5_req->local_ip = la->sin_addr.s_addr; + t5_req->peer_ip = ra->sin_addr.s_addr; + t5_req->opt0 = cpu_to_be64(opt0); + t5_req->params = cpu_to_be64(FILTER_TUPLE_V( + cxgb4_select_ntuple( + ep->com.dev->rdev.lldi.ports[0], + ep->l2t))); + t5_req->rsvd = cpu_to_be32(isn); + PDBG("%s snd_isn %u\n", __func__, + be32_to_cpu(t5_req->rsvd)); + t5_req->opt2 = cpu_to_be32(opt2); + } else { + t5_req6 = (struct cpl_t5_act_open_req6 *) + skb_put(skb, wrlen); + INIT_TP_WR(t5_req6, 0); + OPCODE_TID(t5_req6) = cpu_to_be32( + MK_OPCODE_TID(CPL_ACT_OPEN_REQ6, + ((ep->rss_qid<<14)|ep->atid))); + t5_req6->local_port = la6->sin6_port; + t5_req6->peer_port = ra6->sin6_port; + t5_req6->local_ip_hi = *((__be64 *) + (la6->sin6_addr.s6_addr)); + t5_req6->local_ip_lo = *((__be64 *) + (la6->sin6_addr.s6_addr + 8)); + t5_req6->peer_ip_hi = *((__be64 *) + (ra6->sin6_addr.s6_addr)); + t5_req6->peer_ip_lo = *((__be64 *) + (ra6->sin6_addr.s6_addr + 8)); + t5_req6->opt0 = cpu_to_be64(opt0); + t5_req6->params = cpu_to_be64(FILTER_TUPLE_V( + cxgb4_select_ntuple( + ep->com.dev->rdev.lldi.ports[0], + ep->l2t))); + t5_req6->rsvd = 
cpu_to_be32(isn); + PDBG("%s snd_isn %u\n", __func__, + be32_to_cpu(t5_req6->rsvd)); + t5_req6->opt2 = cpu_to_be32(opt2); + } + } + + set_bit(ACT_OPEN_REQ, &ep->com.history); + return c4iw_l2t_send(&ep->com.dev->rdev, skb, ep->l2t); +} + +static void send_mpa_req(struct c4iw_ep *ep, struct sk_buff *skb, + u8 mpa_rev_to_use) +{ + int mpalen, wrlen; + struct fw_ofld_tx_data_wr *req; + struct mpa_message *mpa; + struct mpa_v2_conn_params mpa_v2_params; + + PDBG("%s ep %p tid %u pd_len %d\n", __func__, ep, ep->hwtid, ep->plen); + + BUG_ON(skb_cloned(skb)); + + mpalen = sizeof(*mpa) + ep->plen; + if (mpa_rev_to_use == 2) + mpalen += sizeof(struct mpa_v2_conn_params); + wrlen = roundup(mpalen + sizeof *req, 16); + skb = get_skb(skb, wrlen, GFP_KERNEL); + if (!skb) { + connect_reply_upcall(ep, -ENOMEM); + return; + } + set_wr_txq(skb, CPL_PRIORITY_DATA, ep->txq_idx); + + req = (struct fw_ofld_tx_data_wr *)skb_put(skb, wrlen); + memset(req, 0, wrlen); + req->op_to_immdlen = cpu_to_be32( + FW_WR_OP_V(FW_OFLD_TX_DATA_WR) | + FW_WR_COMPL_F | + FW_WR_IMMDLEN_V(mpalen)); + req->flowid_len16 = cpu_to_be32( + FW_WR_FLOWID_V(ep->hwtid) | + FW_WR_LEN16_V(wrlen >> 4)); + req->plen = cpu_to_be32(mpalen); + req->tunnel_to_proxy = cpu_to_be32( + FW_OFLD_TX_DATA_WR_FLUSH_F | + FW_OFLD_TX_DATA_WR_SHOVE_F); + + mpa = (struct mpa_message *)(req + 1); + memcpy(mpa->key, MPA_KEY_REQ, sizeof(mpa->key)); + mpa->flags = (crc_enabled ? MPA_CRC : 0) | + (markers_enabled ? MPA_MARKERS : 0) | + (mpa_rev_to_use == 2 ? MPA_ENHANCED_RDMA_CONN : 0); + mpa->private_data_size = htons(ep->plen); + mpa->revision = mpa_rev_to_use; + if (mpa_rev_to_use == 1) { + ep->tried_with_mpa_v1 = 1; + ep->retry_with_mpa_v1 = 0; + } + + if (mpa_rev_to_use == 2) { + mpa->private_data_size = htons(ntohs(mpa->private_data_size) + + sizeof (struct mpa_v2_conn_params)); + PDBG("%s initiator ird %u ord %u\n", __func__, ep->ird, + ep->ord); + mpa_v2_params.ird = htons((u16)ep->ird); + mpa_v2_params.ord = htons((u16)ep->ord); + + if (peer2peer) { + mpa_v2_params.ird |= htons(MPA_V2_PEER2PEER_MODEL); + if (p2p_type == FW_RI_INIT_P2PTYPE_RDMA_WRITE) + mpa_v2_params.ord |= + htons(MPA_V2_RDMA_WRITE_RTR); + else if (p2p_type == FW_RI_INIT_P2PTYPE_READ_REQ) + mpa_v2_params.ord |= + htons(MPA_V2_RDMA_READ_RTR); + } + memcpy(mpa->private_data, &mpa_v2_params, + sizeof(struct mpa_v2_conn_params)); + + if (ep->plen) + memcpy(mpa->private_data + + sizeof(struct mpa_v2_conn_params), + ep->mpa_pkt + sizeof(*mpa), ep->plen); + } else + if (ep->plen) + memcpy(mpa->private_data, + ep->mpa_pkt + sizeof(*mpa), ep->plen); + + /* + * Reference the mpa skb. This ensures the data area + * will remain in memory until the hw acks the tx. + * Function fw4_ack() will deref it. 
+ */ + skb_get(skb); + t4_set_arp_err_handler(skb, NULL, arp_failure_discard); + BUG_ON(ep->mpa_skb); + ep->mpa_skb = skb; + c4iw_l2t_send(&ep->com.dev->rdev, skb, ep->l2t); + start_ep_timer(ep); + __state_set(&ep->com, MPA_REQ_SENT); + ep->mpa_attr.initiator = 1; + ep->snd_seq += mpalen; + return; +} + +static int send_mpa_reject(struct c4iw_ep *ep, const void *pdata, u8 plen) +{ + int mpalen, wrlen; + struct fw_ofld_tx_data_wr *req; + struct mpa_message *mpa; + struct sk_buff *skb; + struct mpa_v2_conn_params mpa_v2_params; + + PDBG("%s ep %p tid %u pd_len %d\n", __func__, ep, ep->hwtid, ep->plen); + + mpalen = sizeof(*mpa) + plen; + if (ep->mpa_attr.version == 2 && ep->mpa_attr.enhanced_rdma_conn) + mpalen += sizeof(struct mpa_v2_conn_params); + wrlen = roundup(mpalen + sizeof *req, 16); + + skb = get_skb(NULL, wrlen, GFP_KERNEL); + if (!skb) { + printk(KERN_ERR MOD "%s - cannot alloc skb!\n", __func__); + return -ENOMEM; + } + set_wr_txq(skb, CPL_PRIORITY_DATA, ep->txq_idx); + + req = (struct fw_ofld_tx_data_wr *)skb_put(skb, wrlen); + memset(req, 0, wrlen); + req->op_to_immdlen = cpu_to_be32( + FW_WR_OP_V(FW_OFLD_TX_DATA_WR) | + FW_WR_COMPL_F | + FW_WR_IMMDLEN_V(mpalen)); + req->flowid_len16 = cpu_to_be32( + FW_WR_FLOWID_V(ep->hwtid) | + FW_WR_LEN16_V(wrlen >> 4)); + req->plen = cpu_to_be32(mpalen); + req->tunnel_to_proxy = cpu_to_be32( + FW_OFLD_TX_DATA_WR_FLUSH_F | + FW_OFLD_TX_DATA_WR_SHOVE_F); + + mpa = (struct mpa_message *)(req + 1); + memset(mpa, 0, sizeof(*mpa)); + memcpy(mpa->key, MPA_KEY_REP, sizeof(mpa->key)); + mpa->flags = MPA_REJECT; + mpa->revision = ep->mpa_attr.version; + mpa->private_data_size = htons(plen); + + if (ep->mpa_attr.version == 2 && ep->mpa_attr.enhanced_rdma_conn) { + mpa->flags |= MPA_ENHANCED_RDMA_CONN; + mpa->private_data_size = htons(ntohs(mpa->private_data_size) + + sizeof (struct mpa_v2_conn_params)); + mpa_v2_params.ird = htons(((u16)ep->ird) | + (peer2peer ? MPA_V2_PEER2PEER_MODEL : + 0)); + mpa_v2_params.ord = htons(((u16)ep->ord) | (peer2peer ? + (p2p_type == + FW_RI_INIT_P2PTYPE_RDMA_WRITE ? + MPA_V2_RDMA_WRITE_RTR : p2p_type == + FW_RI_INIT_P2PTYPE_READ_REQ ? + MPA_V2_RDMA_READ_RTR : 0) : 0)); + memcpy(mpa->private_data, &mpa_v2_params, + sizeof(struct mpa_v2_conn_params)); + + if (ep->plen) + memcpy(mpa->private_data + + sizeof(struct mpa_v2_conn_params), pdata, plen); + } else + if (plen) + memcpy(mpa->private_data, pdata, plen); + + /* + * Reference the mpa skb again. This ensures the data area + * will remain in memory until the hw acks the tx. + * Function fw4_ack() will deref it. 
+ */ + skb_get(skb); + set_wr_txq(skb, CPL_PRIORITY_DATA, ep->txq_idx); + t4_set_arp_err_handler(skb, NULL, arp_failure_discard); + BUG_ON(ep->mpa_skb); + ep->mpa_skb = skb; + ep->snd_seq += mpalen; + return c4iw_l2t_send(&ep->com.dev->rdev, skb, ep->l2t); +} + +static int send_mpa_reply(struct c4iw_ep *ep, const void *pdata, u8 plen) +{ + int mpalen, wrlen; + struct fw_ofld_tx_data_wr *req; + struct mpa_message *mpa; + struct sk_buff *skb; + struct mpa_v2_conn_params mpa_v2_params; + + PDBG("%s ep %p tid %u pd_len %d\n", __func__, ep, ep->hwtid, ep->plen); + + mpalen = sizeof(*mpa) + plen; + if (ep->mpa_attr.version == 2 && ep->mpa_attr.enhanced_rdma_conn) + mpalen += sizeof(struct mpa_v2_conn_params); + wrlen = roundup(mpalen + sizeof *req, 16); + + skb = get_skb(NULL, wrlen, GFP_KERNEL); + if (!skb) { + printk(KERN_ERR MOD "%s - cannot alloc skb!\n", __func__); + return -ENOMEM; + } + set_wr_txq(skb, CPL_PRIORITY_DATA, ep->txq_idx); + + req = (struct fw_ofld_tx_data_wr *) skb_put(skb, wrlen); + memset(req, 0, wrlen); + req->op_to_immdlen = cpu_to_be32( + FW_WR_OP_V(FW_OFLD_TX_DATA_WR) | + FW_WR_COMPL_F | + FW_WR_IMMDLEN_V(mpalen)); + req->flowid_len16 = cpu_to_be32( + FW_WR_FLOWID_V(ep->hwtid) | + FW_WR_LEN16_V(wrlen >> 4)); + req->plen = cpu_to_be32(mpalen); + req->tunnel_to_proxy = cpu_to_be32( + FW_OFLD_TX_DATA_WR_FLUSH_F | + FW_OFLD_TX_DATA_WR_SHOVE_F); + + mpa = (struct mpa_message *)(req + 1); + memset(mpa, 0, sizeof(*mpa)); + memcpy(mpa->key, MPA_KEY_REP, sizeof(mpa->key)); + mpa->flags = (ep->mpa_attr.crc_enabled ? MPA_CRC : 0) | + (markers_enabled ? MPA_MARKERS : 0); + mpa->revision = ep->mpa_attr.version; + mpa->private_data_size = htons(plen); + + if (ep->mpa_attr.version == 2 && ep->mpa_attr.enhanced_rdma_conn) { + mpa->flags |= MPA_ENHANCED_RDMA_CONN; + mpa->private_data_size = htons(ntohs(mpa->private_data_size) + + sizeof (struct mpa_v2_conn_params)); + mpa_v2_params.ird = htons((u16)ep->ird); + mpa_v2_params.ord = htons((u16)ep->ord); + if (peer2peer && (ep->mpa_attr.p2p_type != + FW_RI_INIT_P2PTYPE_DISABLED)) { + mpa_v2_params.ird |= htons(MPA_V2_PEER2PEER_MODEL); + + if (p2p_type == FW_RI_INIT_P2PTYPE_RDMA_WRITE) + mpa_v2_params.ord |= + htons(MPA_V2_RDMA_WRITE_RTR); + else if (p2p_type == FW_RI_INIT_P2PTYPE_READ_REQ) + mpa_v2_params.ord |= + htons(MPA_V2_RDMA_READ_RTR); + } + + memcpy(mpa->private_data, &mpa_v2_params, + sizeof(struct mpa_v2_conn_params)); + + if (ep->plen) + memcpy(mpa->private_data + + sizeof(struct mpa_v2_conn_params), pdata, plen); + } else + if (plen) + memcpy(mpa->private_data, pdata, plen); + + /* + * Reference the mpa skb. This ensures the data area + * will remain in memory until the hw acks the tx. + * Function fw4_ack() will deref it. 
+ */ + skb_get(skb); + t4_set_arp_err_handler(skb, NULL, arp_failure_discard); + ep->mpa_skb = skb; + __state_set(&ep->com, MPA_REP_SENT); + ep->snd_seq += mpalen; + return c4iw_l2t_send(&ep->com.dev->rdev, skb, ep->l2t); +} + +static int act_establish(struct c4iw_dev *dev, struct sk_buff *skb) +{ + struct c4iw_ep *ep; + struct cpl_act_establish *req = cplhdr(skb); + unsigned int tid = GET_TID(req); + unsigned int atid = TID_TID_G(ntohl(req->tos_atid)); + struct tid_info *t = dev->rdev.lldi.tids; + + ep = lookup_atid(t, atid); + + PDBG("%s ep %p tid %u snd_isn %u rcv_isn %u\n", __func__, ep, tid, + be32_to_cpu(req->snd_isn), be32_to_cpu(req->rcv_isn)); + + mutex_lock(&ep->com.mutex); + dst_confirm(ep->dst); + + /* setup the hwtid for this connection */ + ep->hwtid = tid; + cxgb4_insert_tid(t, ep, tid); + insert_handle(dev, &dev->hwtid_idr, ep, ep->hwtid); + + ep->snd_seq = be32_to_cpu(req->snd_isn); + ep->rcv_seq = be32_to_cpu(req->rcv_isn); + + set_emss(ep, ntohs(req->tcp_opt)); + + /* dealloc the atid */ + remove_handle(ep->com.dev, &ep->com.dev->atid_idr, atid); + cxgb4_free_atid(t, atid); + set_bit(ACT_ESTAB, &ep->com.history); + + /* start MPA negotiation */ + send_flowc(ep, NULL); + if (ep->retry_with_mpa_v1) + send_mpa_req(ep, skb, 1); + else + send_mpa_req(ep, skb, mpa_rev); + mutex_unlock(&ep->com.mutex); + return 0; +} + +static void close_complete_upcall(struct c4iw_ep *ep, int status) +{ + struct iw_cm_event event; + + PDBG("%s ep %p tid %u\n", __func__, ep, ep->hwtid); + memset(&event, 0, sizeof(event)); + event.event = IW_CM_EVENT_CLOSE; + event.status = status; + if (ep->com.cm_id) { + PDBG("close complete delivered ep %p cm_id %p tid %u\n", + ep, ep->com.cm_id, ep->hwtid); + ep->com.cm_id->event_handler(ep->com.cm_id, &event); + ep->com.cm_id->rem_ref(ep->com.cm_id); + ep->com.cm_id = NULL; + set_bit(CLOSE_UPCALL, &ep->com.history); + } +} + +static int abort_connection(struct c4iw_ep *ep, struct sk_buff *skb, gfp_t gfp) +{ + PDBG("%s ep %p tid %u\n", __func__, ep, ep->hwtid); + __state_set(&ep->com, ABORTING); + set_bit(ABORT_CONN, &ep->com.history); + return send_abort(ep, skb, gfp); +} + +static void peer_close_upcall(struct c4iw_ep *ep) +{ + struct iw_cm_event event; + + PDBG("%s ep %p tid %u\n", __func__, ep, ep->hwtid); + memset(&event, 0, sizeof(event)); + event.event = IW_CM_EVENT_DISCONNECT; + if (ep->com.cm_id) { + PDBG("peer close delivered ep %p cm_id %p tid %u\n", + ep, ep->com.cm_id, ep->hwtid); + ep->com.cm_id->event_handler(ep->com.cm_id, &event); + set_bit(DISCONN_UPCALL, &ep->com.history); + } +} + +static void peer_abort_upcall(struct c4iw_ep *ep) +{ + struct iw_cm_event event; + + PDBG("%s ep %p tid %u\n", __func__, ep, ep->hwtid); + memset(&event, 0, sizeof(event)); + event.event = IW_CM_EVENT_CLOSE; + event.status = -ECONNRESET; + if (ep->com.cm_id) { + PDBG("abort delivered ep %p cm_id %p tid %u\n", ep, + ep->com.cm_id, ep->hwtid); + ep->com.cm_id->event_handler(ep->com.cm_id, &event); + ep->com.cm_id->rem_ref(ep->com.cm_id); + ep->com.cm_id = NULL; + set_bit(ABORT_UPCALL, &ep->com.history); + } +} + +static void connect_reply_upcall(struct c4iw_ep *ep, int status) +{ + struct iw_cm_event event; + + PDBG("%s ep %p tid %u status %d\n", __func__, ep, ep->hwtid, status); + memset(&event, 0, sizeof(event)); + event.event = IW_CM_EVENT_CONNECT_REPLY; + event.status = status; + memcpy(&event.local_addr, &ep->com.local_addr, + sizeof(ep->com.local_addr)); + memcpy(&event.remote_addr, &ep->com.remote_addr, + sizeof(ep->com.remote_addr)); + + if ((status == 0) 
|| (status == -ECONNREFUSED)) { + if (!ep->tried_with_mpa_v1) { + /* this means MPA_v2 is used */ + event.private_data_len = ep->plen - + sizeof(struct mpa_v2_conn_params); + event.private_data = ep->mpa_pkt + + sizeof(struct mpa_message) + + sizeof(struct mpa_v2_conn_params); + } else { + /* this means MPA_v1 is used */ + event.private_data_len = ep->plen; + event.private_data = ep->mpa_pkt + + sizeof(struct mpa_message); + } + } + + PDBG("%s ep %p tid %u status %d\n", __func__, ep, + ep->hwtid, status); + set_bit(CONN_RPL_UPCALL, &ep->com.history); + ep->com.cm_id->event_handler(ep->com.cm_id, &event); + + if (status < 0) { + ep->com.cm_id->rem_ref(ep->com.cm_id); + ep->com.cm_id = NULL; + } +} + +static int connect_request_upcall(struct c4iw_ep *ep) +{ + struct iw_cm_event event; + int ret; + + PDBG("%s ep %p tid %u\n", __func__, ep, ep->hwtid); + memset(&event, 0, sizeof(event)); + event.event = IW_CM_EVENT_CONNECT_REQUEST; + memcpy(&event.local_addr, &ep->com.local_addr, + sizeof(ep->com.local_addr)); + memcpy(&event.remote_addr, &ep->com.remote_addr, + sizeof(ep->com.remote_addr)); + event.provider_data = ep; + if (!ep->tried_with_mpa_v1) { + /* this means MPA_v2 is used */ + event.ord = ep->ord; + event.ird = ep->ird; + event.private_data_len = ep->plen - + sizeof(struct mpa_v2_conn_params); + event.private_data = ep->mpa_pkt + sizeof(struct mpa_message) + + sizeof(struct mpa_v2_conn_params); + } else { + /* this means MPA_v1 is used. Send max supported */ + event.ord = cur_max_read_depth(ep->com.dev); + event.ird = cur_max_read_depth(ep->com.dev); + event.private_data_len = ep->plen; + event.private_data = ep->mpa_pkt + sizeof(struct mpa_message); + } + c4iw_get_ep(&ep->com); + ret = ep->parent_ep->com.cm_id->event_handler(ep->parent_ep->com.cm_id, + &event); + if (ret) + c4iw_put_ep(&ep->com); + set_bit(CONNREQ_UPCALL, &ep->com.history); + c4iw_put_ep(&ep->parent_ep->com); + return ret; +} + +static void established_upcall(struct c4iw_ep *ep) +{ + struct iw_cm_event event; + + PDBG("%s ep %p tid %u\n", __func__, ep, ep->hwtid); + memset(&event, 0, sizeof(event)); + event.event = IW_CM_EVENT_ESTABLISHED; + event.ird = ep->ird; + event.ord = ep->ord; + if (ep->com.cm_id) { + PDBG("%s ep %p tid %u\n", __func__, ep, ep->hwtid); + ep->com.cm_id->event_handler(ep->com.cm_id, &event); + set_bit(ESTAB_UPCALL, &ep->com.history); + } +} + +static int update_rx_credits(struct c4iw_ep *ep, u32 credits) +{ + struct cpl_rx_data_ack *req; + struct sk_buff *skb; + int wrlen = roundup(sizeof *req, 16); + + PDBG("%s ep %p tid %u credits %u\n", __func__, ep, ep->hwtid, credits); + skb = get_skb(NULL, wrlen, GFP_KERNEL); + if (!skb) { + printk(KERN_ERR MOD "update_rx_credits - cannot alloc skb!\n"); + return 0; + } + + /* + * If we couldn't specify the entire rcv window at connection setup + * due to the limit in the number of bits in the RCV_BUFSIZ field, + * then add the overage in to the credits returned. 
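+ * RCV_BUFSIZ is a 1KB-granularity field, so at most RCV_BUFSIZ_M * 1024 bytes
+ * of the receive window could be advertised in opt0; the excess is returned here.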
+ */ + if (ep->rcv_win > RCV_BUFSIZ_M * 1024) + credits += ep->rcv_win - RCV_BUFSIZ_M * 1024; + + req = (struct cpl_rx_data_ack *) skb_put(skb, wrlen); + memset(req, 0, wrlen); + INIT_TP_WR(req, ep->hwtid); + OPCODE_TID(req) = cpu_to_be32(MK_OPCODE_TID(CPL_RX_DATA_ACK, + ep->hwtid)); + req->credit_dack = cpu_to_be32(credits | RX_FORCE_ACK_F | + RX_DACK_CHANGE_F | + RX_DACK_MODE_V(dack_mode)); + set_wr_txq(skb, CPL_PRIORITY_ACK, ep->ctrlq_idx); + c4iw_ofld_send(&ep->com.dev->rdev, skb); + return credits; +} + +#define RELAXED_IRD_NEGOTIATION 1 + +static int process_mpa_reply(struct c4iw_ep *ep, struct sk_buff *skb) +{ + struct mpa_message *mpa; + struct mpa_v2_conn_params *mpa_v2_params; + u16 plen; + u16 resp_ird, resp_ord; + u8 rtr_mismatch = 0, insuff_ird = 0; + struct c4iw_qp_attributes attrs; + enum c4iw_qp_attr_mask mask; + int err; + int disconnect = 0; + + PDBG("%s ep %p tid %u\n", __func__, ep, ep->hwtid); + + /* + * Stop mpa timer. If it expired, then + * we ignore the MPA reply. process_timeout() + * will abort the connection. + */ + if (stop_ep_timer(ep)) + return 0; + + /* + * If we get more than the supported amount of private data + * then we must fail this connection. + */ + if (ep->mpa_pkt_len + skb->len > sizeof(ep->mpa_pkt)) { + err = -EINVAL; + goto err; + } + + /* + * copy the new data into our accumulation buffer. + */ + skb_copy_from_linear_data(skb, &(ep->mpa_pkt[ep->mpa_pkt_len]), + skb->len); + ep->mpa_pkt_len += skb->len; + + /* + * if we don't even have the mpa message, then bail. + */ + if (ep->mpa_pkt_len < sizeof(*mpa)) + return 0; + mpa = (struct mpa_message *) ep->mpa_pkt; + + /* Validate MPA header. */ + if (mpa->revision > mpa_rev) { + printk(KERN_ERR MOD "%s MPA version mismatch. Local = %d," + " Received = %d\n", __func__, mpa_rev, mpa->revision); + err = -EPROTO; + goto err; + } + if (memcmp(mpa->key, MPA_KEY_REP, sizeof(mpa->key))) { + err = -EPROTO; + goto err; + } + + plen = ntohs(mpa->private_data_size); + + /* + * Fail if there's too much private data. + */ + if (plen > MPA_MAX_PRIVATE_DATA) { + err = -EPROTO; + goto err; + } + + /* + * If plen does not account for pkt size + */ + if (ep->mpa_pkt_len > (sizeof(*mpa) + plen)) { + err = -EPROTO; + goto err; + } + + ep->plen = (u8) plen; + + /* + * If we don't have all the pdata yet, then bail. + * We'll continue process when more data arrives. + */ + if (ep->mpa_pkt_len < (sizeof(*mpa) + plen)) + return 0; + + if (mpa->flags & MPA_REJECT) { + err = -ECONNREFUSED; + goto err; + } + + /* + * If we get here we have accumulated the entire mpa + * start reply message including private data. And + * the MPA header is valid. + */ + __state_set(&ep->com, FPDU_MODE); + ep->mpa_attr.crc_enabled = (mpa->flags & MPA_CRC) | crc_enabled ? 1 : 0; + ep->mpa_attr.recv_marker_enabled = markers_enabled; + ep->mpa_attr.xmit_marker_enabled = mpa->flags & MPA_MARKERS ? 1 : 0; + ep->mpa_attr.version = mpa->revision; + ep->mpa_attr.p2p_type = FW_RI_INIT_P2PTYPE_DISABLED; + + if (mpa->revision == 2) { + ep->mpa_attr.enhanced_rdma_conn = + mpa->flags & MPA_ENHANCED_RDMA_CONN ? 1 : 0; + if (ep->mpa_attr.enhanced_rdma_conn) { + mpa_v2_params = (struct mpa_v2_conn_params *) + (ep->mpa_pkt + sizeof(*mpa)); + resp_ird = ntohs(mpa_v2_params->ird) & + MPA_V2_IRD_ORD_MASK; + resp_ord = ntohs(mpa_v2_params->ord) & + MPA_V2_IRD_ORD_MASK; + PDBG("%s responder ird %u ord %u ep ird %u ord %u\n", + __func__, resp_ird, resp_ord, ep->ird, ep->ord); + + /* + * This is a double-check. 
Ideally, below checks are + * not required since ird/ord stuff has been taken + * care of in c4iw_accept_cr + */ + if (ep->ird < resp_ord) { + if (RELAXED_IRD_NEGOTIATION && resp_ord <= + ep->com.dev->rdev.lldi.max_ordird_qp) + ep->ird = resp_ord; + else + insuff_ird = 1; + } else if (ep->ird > resp_ord) { + ep->ird = resp_ord; + } + if (ep->ord > resp_ird) { + if (RELAXED_IRD_NEGOTIATION) + ep->ord = resp_ird; + else + insuff_ird = 1; + } + if (insuff_ird) { + err = -ENOMEM; + ep->ird = resp_ord; + ep->ord = resp_ird; + } + + if (ntohs(mpa_v2_params->ird) & + MPA_V2_PEER2PEER_MODEL) { + if (ntohs(mpa_v2_params->ord) & + MPA_V2_RDMA_WRITE_RTR) + ep->mpa_attr.p2p_type = + FW_RI_INIT_P2PTYPE_RDMA_WRITE; + else if (ntohs(mpa_v2_params->ord) & + MPA_V2_RDMA_READ_RTR) + ep->mpa_attr.p2p_type = + FW_RI_INIT_P2PTYPE_READ_REQ; + } + } + } else if (mpa->revision == 1) + if (peer2peer) + ep->mpa_attr.p2p_type = p2p_type; + + PDBG("%s - crc_enabled=%d, recv_marker_enabled=%d, " + "xmit_marker_enabled=%d, version=%d p2p_type=%d local-p2p_type = " + "%d\n", __func__, ep->mpa_attr.crc_enabled, + ep->mpa_attr.recv_marker_enabled, + ep->mpa_attr.xmit_marker_enabled, ep->mpa_attr.version, + ep->mpa_attr.p2p_type, p2p_type); + + /* + * If responder's RTR does not match with that of initiator, assign + * FW_RI_INIT_P2PTYPE_DISABLED in mpa attributes so that RTR is not + * generated when moving QP to RTS state. + * A TERM message will be sent after QP has moved to RTS state + */ + if ((ep->mpa_attr.version == 2) && peer2peer && + (ep->mpa_attr.p2p_type != p2p_type)) { + ep->mpa_attr.p2p_type = FW_RI_INIT_P2PTYPE_DISABLED; + rtr_mismatch = 1; + } + + attrs.mpa_attr = ep->mpa_attr; + attrs.max_ird = ep->ird; + attrs.max_ord = ep->ord; + attrs.llp_stream_handle = ep; + attrs.next_state = C4IW_QP_STATE_RTS; + + mask = C4IW_QP_ATTR_NEXT_STATE | + C4IW_QP_ATTR_LLP_STREAM_HANDLE | C4IW_QP_ATTR_MPA_ATTR | + C4IW_QP_ATTR_MAX_IRD | C4IW_QP_ATTR_MAX_ORD; + + /* bind QP and TID with INIT_WR */ + err = c4iw_modify_qp(ep->com.qp->rhp, + ep->com.qp, mask, &attrs, 1); + if (err) + goto err; + + /* + * If responder's RTR requirement did not match with what initiator + * supports, generate TERM message + */ + if (rtr_mismatch) { + printk(KERN_ERR "%s: RTR mismatch, sending TERM\n", __func__); + attrs.layer_etype = LAYER_MPA | DDP_LLP; + attrs.ecode = MPA_NOMATCH_RTR; + attrs.next_state = C4IW_QP_STATE_TERMINATE; + attrs.send_term = 1; + err = c4iw_modify_qp(ep->com.qp->rhp, ep->com.qp, + C4IW_QP_ATTR_NEXT_STATE, &attrs, 1); + err = -ENOMEM; + disconnect = 1; + goto out; + } + + /* + * Generate TERM if initiator IRD is not sufficient for responder + * provided ORD. Currently, we do the same behaviour even when + * responder provided IRD is also not sufficient as regards to + * initiator ORD. 
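+ * In that case err is set to -ENOMEM, which connect_reply_upcall() reports
+ * once the TERM has been queued.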
+ */ + if (insuff_ird) { + printk(KERN_ERR "%s: Insufficient IRD, sending TERM\n", + __func__); + attrs.layer_etype = LAYER_MPA | DDP_LLP; + attrs.ecode = MPA_INSUFF_IRD; + attrs.next_state = C4IW_QP_STATE_TERMINATE; + attrs.send_term = 1; + err = c4iw_modify_qp(ep->com.qp->rhp, ep->com.qp, + C4IW_QP_ATTR_NEXT_STATE, &attrs, 1); + err = -ENOMEM; + disconnect = 1; + goto out; + } + goto out; +err: + __state_set(&ep->com, ABORTING); + send_abort(ep, skb, GFP_KERNEL); +out: + connect_reply_upcall(ep, err); + return disconnect; +} + +static void process_mpa_request(struct c4iw_ep *ep, struct sk_buff *skb) +{ + struct mpa_message *mpa; + struct mpa_v2_conn_params *mpa_v2_params; + u16 plen; + + PDBG("%s ep %p tid %u\n", __func__, ep, ep->hwtid); + + /* + * If we get more than the supported amount of private data + * then we must fail this connection. + */ + if (ep->mpa_pkt_len + skb->len > sizeof(ep->mpa_pkt)) { + (void)stop_ep_timer(ep); + abort_connection(ep, skb, GFP_KERNEL); + return; + } + + PDBG("%s enter (%s line %u)\n", __func__, __FILE__, __LINE__); + + /* + * Copy the new data into our accumulation buffer. + */ + skb_copy_from_linear_data(skb, &(ep->mpa_pkt[ep->mpa_pkt_len]), + skb->len); + ep->mpa_pkt_len += skb->len; + + /* + * If we don't even have the mpa message, then bail. + * We'll continue process when more data arrives. + */ + if (ep->mpa_pkt_len < sizeof(*mpa)) + return; + + PDBG("%s enter (%s line %u)\n", __func__, __FILE__, __LINE__); + mpa = (struct mpa_message *) ep->mpa_pkt; + + /* + * Validate MPA Header. + */ + if (mpa->revision > mpa_rev) { + printk(KERN_ERR MOD "%s MPA version mismatch. Local = %d," + " Received = %d\n", __func__, mpa_rev, mpa->revision); + (void)stop_ep_timer(ep); + abort_connection(ep, skb, GFP_KERNEL); + return; + } + + if (memcmp(mpa->key, MPA_KEY_REQ, sizeof(mpa->key))) { + (void)stop_ep_timer(ep); + abort_connection(ep, skb, GFP_KERNEL); + return; + } + + plen = ntohs(mpa->private_data_size); + + /* + * Fail if there's too much private data. + */ + if (plen > MPA_MAX_PRIVATE_DATA) { + (void)stop_ep_timer(ep); + abort_connection(ep, skb, GFP_KERNEL); + return; + } + + /* + * If plen does not account for pkt size + */ + if (ep->mpa_pkt_len > (sizeof(*mpa) + plen)) { + (void)stop_ep_timer(ep); + abort_connection(ep, skb, GFP_KERNEL); + return; + } + ep->plen = (u8) plen; + + /* + * If we don't have all the pdata yet, then bail. + */ + if (ep->mpa_pkt_len < (sizeof(*mpa) + plen)) + return; + + /* + * If we get here we have accumulated the entire mpa + * start reply message including private data. + */ + ep->mpa_attr.initiator = 0; + ep->mpa_attr.crc_enabled = (mpa->flags & MPA_CRC) | crc_enabled ? 1 : 0; + ep->mpa_attr.recv_marker_enabled = markers_enabled; + ep->mpa_attr.xmit_marker_enabled = mpa->flags & MPA_MARKERS ? 1 : 0; + ep->mpa_attr.version = mpa->revision; + if (mpa->revision == 1) + ep->tried_with_mpa_v1 = 1; + ep->mpa_attr.p2p_type = FW_RI_INIT_P2PTYPE_DISABLED; + + if (mpa->revision == 2) { + ep->mpa_attr.enhanced_rdma_conn = + mpa->flags & MPA_ENHANCED_RDMA_CONN ? 
1 : 0; + if (ep->mpa_attr.enhanced_rdma_conn) { + mpa_v2_params = (struct mpa_v2_conn_params *) + (ep->mpa_pkt + sizeof(*mpa)); + ep->ird = ntohs(mpa_v2_params->ird) & + MPA_V2_IRD_ORD_MASK; + ep->ord = ntohs(mpa_v2_params->ord) & + MPA_V2_IRD_ORD_MASK; + PDBG("%s initiator ird %u ord %u\n", __func__, ep->ird, + ep->ord); + if (ntohs(mpa_v2_params->ird) & MPA_V2_PEER2PEER_MODEL) + if (peer2peer) { + if (ntohs(mpa_v2_params->ord) & + MPA_V2_RDMA_WRITE_RTR) + ep->mpa_attr.p2p_type = + FW_RI_INIT_P2PTYPE_RDMA_WRITE; + else if (ntohs(mpa_v2_params->ord) & + MPA_V2_RDMA_READ_RTR) + ep->mpa_attr.p2p_type = + FW_RI_INIT_P2PTYPE_READ_REQ; + } + } + } else if (mpa->revision == 1) + if (peer2peer) + ep->mpa_attr.p2p_type = p2p_type; + + PDBG("%s - crc_enabled=%d, recv_marker_enabled=%d, " + "xmit_marker_enabled=%d, version=%d p2p_type=%d\n", __func__, + ep->mpa_attr.crc_enabled, ep->mpa_attr.recv_marker_enabled, + ep->mpa_attr.xmit_marker_enabled, ep->mpa_attr.version, + ep->mpa_attr.p2p_type); + + /* + * If the endpoint timer already expired, then we ignore + * the start request. process_timeout() will abort + * the connection. + */ + if (!stop_ep_timer(ep)) { + __state_set(&ep->com, MPA_REQ_RCVD); + + /* drive upcall */ + mutex_lock_nested(&ep->parent_ep->com.mutex, + SINGLE_DEPTH_NESTING); + if (ep->parent_ep->com.state != DEAD) { + if (connect_request_upcall(ep)) + abort_connection(ep, skb, GFP_KERNEL); + } else { + abort_connection(ep, skb, GFP_KERNEL); + } + mutex_unlock(&ep->parent_ep->com.mutex); + } + return; +} + +static int rx_data(struct c4iw_dev *dev, struct sk_buff *skb) +{ + struct c4iw_ep *ep; + struct cpl_rx_data *hdr = cplhdr(skb); + unsigned int dlen = ntohs(hdr->len); + unsigned int tid = GET_TID(hdr); + struct tid_info *t = dev->rdev.lldi.tids; + __u8 status = hdr->status; + int disconnect = 0; + + ep = lookup_tid(t, tid); + if (!ep) + return 0; + PDBG("%s ep %p tid %u dlen %u\n", __func__, ep, ep->hwtid, dlen); + skb_pull(skb, sizeof(*hdr)); + skb_trim(skb, dlen); + mutex_lock(&ep->com.mutex); + + /* update RX credits */ + update_rx_credits(ep, dlen); + + switch (ep->com.state) { + case MPA_REQ_SENT: + ep->rcv_seq += dlen; + disconnect = process_mpa_reply(ep, skb); + break; + case MPA_REQ_WAIT: + ep->rcv_seq += dlen; + process_mpa_request(ep, skb); + break; + case FPDU_MODE: { + struct c4iw_qp_attributes attrs; + BUG_ON(!ep->com.qp); + if (status) + pr_err("%s Unexpected streaming data." 
\ + " qpid %u ep %p state %d tid %u status %d\n", + __func__, ep->com.qp->wq.sq.qid, ep, + ep->com.state, ep->hwtid, status); + attrs.next_state = C4IW_QP_STATE_TERMINATE; + c4iw_modify_qp(ep->com.qp->rhp, ep->com.qp, + C4IW_QP_ATTR_NEXT_STATE, &attrs, 1); + disconnect = 1; + break; + } + default: + break; + } + mutex_unlock(&ep->com.mutex); + if (disconnect) + c4iw_ep_disconnect(ep, 0, GFP_KERNEL); + return 0; +} + +static int abort_rpl(struct c4iw_dev *dev, struct sk_buff *skb) +{ + struct c4iw_ep *ep; + struct cpl_abort_rpl_rss *rpl = cplhdr(skb); + int release = 0; + unsigned int tid = GET_TID(rpl); + struct tid_info *t = dev->rdev.lldi.tids; + + ep = lookup_tid(t, tid); + if (!ep) { + printk(KERN_WARNING MOD "Abort rpl to freed endpoint\n"); + return 0; + } + PDBG("%s ep %p tid %u\n", __func__, ep, ep->hwtid); + mutex_lock(&ep->com.mutex); + switch (ep->com.state) { + case ABORTING: + c4iw_wake_up(&ep->com.wr_wait, -ECONNRESET); + __state_set(&ep->com, DEAD); + release = 1; + break; + default: + printk(KERN_ERR "%s ep %p state %d\n", + __func__, ep, ep->com.state); + break; + } + mutex_unlock(&ep->com.mutex); + + if (release) + release_ep_resources(ep); + return 0; +} + +static void send_fw_act_open_req(struct c4iw_ep *ep, unsigned int atid) +{ + struct sk_buff *skb; + struct fw_ofld_connection_wr *req; + unsigned int mtu_idx; + int wscale; + struct sockaddr_in *sin; + int win; + + skb = get_skb(NULL, sizeof(*req), GFP_KERNEL); + req = (struct fw_ofld_connection_wr *)__skb_put(skb, sizeof(*req)); + memset(req, 0, sizeof(*req)); + req->op_compl = htonl(WR_OP_V(FW_OFLD_CONNECTION_WR)); + req->len16_pkd = htonl(FW_WR_LEN16_V(DIV_ROUND_UP(sizeof(*req), 16))); + req->le.filter = cpu_to_be32(cxgb4_select_ntuple( + ep->com.dev->rdev.lldi.ports[0], + ep->l2t)); + sin = (struct sockaddr_in *)&ep->com.mapped_local_addr; + req->le.lport = sin->sin_port; + req->le.u.ipv4.lip = sin->sin_addr.s_addr; + sin = (struct sockaddr_in *)&ep->com.mapped_remote_addr; + req->le.pport = sin->sin_port; + req->le.u.ipv4.pip = sin->sin_addr.s_addr; + req->tcb.t_state_to_astid = + htonl(FW_OFLD_CONNECTION_WR_T_STATE_V(TCP_SYN_SENT) | + FW_OFLD_CONNECTION_WR_ASTID_V(atid)); + req->tcb.cplrxdataack_cplpassacceptrpl = + htons(FW_OFLD_CONNECTION_WR_CPLRXDATAACK_F); + req->tcb.tx_max = (__force __be32) jiffies; + req->tcb.rcv_adv = htons(1); + best_mtu(ep->com.dev->rdev.lldi.mtus, ep->mtu, &mtu_idx, + enable_tcp_timestamps, + (AF_INET == ep->com.remote_addr.ss_family) ? 0 : 1); + wscale = compute_wscale(rcv_win); + + /* + * Specify the largest window that will fit in opt0. The + * remainder will be specified in the rx_data_ack. + */ + win = ep->rcv_win >> 10; + if (win > RCV_BUFSIZ_M) + win = RCV_BUFSIZ_M; + + req->tcb.opt0 = (__force __be64) (TCAM_BYPASS_F | + (nocong ? 
NO_CONG_F : 0) | + KEEP_ALIVE_F | + DELACK_F | + WND_SCALE_V(wscale) | + MSS_IDX_V(mtu_idx) | + L2T_IDX_V(ep->l2t->idx) | + TX_CHAN_V(ep->tx_chan) | + SMAC_SEL_V(ep->smac_idx) | + DSCP_V(ep->tos) | + ULP_MODE_V(ULP_MODE_TCPDDP) | + RCV_BUFSIZ_V(win)); + req->tcb.opt2 = (__force __be32) (PACE_V(1) | + TX_QUEUE_V(ep->com.dev->rdev.lldi.tx_modq[ep->tx_chan]) | + RX_CHANNEL_V(0) | + CCTRL_ECN_V(enable_ecn) | + RSS_QUEUE_VALID_F | RSS_QUEUE_V(ep->rss_qid)); + if (enable_tcp_timestamps) + req->tcb.opt2 |= (__force __be32)TSTAMPS_EN_F; + if (enable_tcp_sack) + req->tcb.opt2 |= (__force __be32)SACK_EN_F; + if (wscale && enable_tcp_window_scaling) + req->tcb.opt2 |= (__force __be32)WND_SCALE_EN_F; + req->tcb.opt0 = cpu_to_be64((__force u64)req->tcb.opt0); + req->tcb.opt2 = cpu_to_be32((__force u32)req->tcb.opt2); + set_wr_txq(skb, CPL_PRIORITY_CONTROL, ep->ctrlq_idx); + set_bit(ACT_OFLD_CONN, &ep->com.history); + c4iw_l2t_send(&ep->com.dev->rdev, skb, ep->l2t); +} + +/* + * Return whether a failed active open has allocated a TID + */ +static inline int act_open_has_tid(int status) +{ + return status != CPL_ERR_TCAM_FULL && status != CPL_ERR_CONN_EXIST && + status != CPL_ERR_ARP_MISS; +} + +/* Returns whether a CPL status conveys negative advice. + */ +static int is_neg_adv(unsigned int status) +{ + return status == CPL_ERR_RTX_NEG_ADVICE || + status == CPL_ERR_PERSIST_NEG_ADVICE || + status == CPL_ERR_KEEPALV_NEG_ADVICE; +} + +static char *neg_adv_str(unsigned int status) +{ + switch (status) { + case CPL_ERR_RTX_NEG_ADVICE: + return "Retransmit timeout"; + case CPL_ERR_PERSIST_NEG_ADVICE: + return "Persist timeout"; + case CPL_ERR_KEEPALV_NEG_ADVICE: + return "Keepalive timeout"; + default: + return "Unknown"; + } +} + +static void set_tcp_window(struct c4iw_ep *ep, struct port_info *pi) +{ + ep->snd_win = snd_win; + ep->rcv_win = rcv_win; + PDBG("%s snd_win %d rcv_win %d\n", __func__, ep->snd_win, ep->rcv_win); +} + +#define ACT_OPEN_RETRY_COUNT 2 + +static int import_ep(struct c4iw_ep *ep, int iptype, __u8 *peer_ip, + struct dst_entry *dst, struct c4iw_dev *cdev, + bool clear_mpa_v1) +{ + struct neighbour *n; + int err, step; + struct net_device *pdev; + + n = dst_neigh_lookup(dst, peer_ip); + if (!n) + return -ENODEV; + + rcu_read_lock(); + err = -ENOMEM; + if (n->dev->flags & IFF_LOOPBACK) { + if (iptype == 4) + pdev = ip_dev_find(&init_net, *(__be32 *)peer_ip); + else if (IS_ENABLED(CONFIG_IPV6)) + for_each_netdev(&init_net, pdev) { + if (ipv6_chk_addr(&init_net, + (struct in6_addr *)peer_ip, + pdev, 1)) + break; + } + else + pdev = NULL; + + if (!pdev) { + err = -ENODEV; + goto out; + } + ep->l2t = cxgb4_l2t_get(cdev->rdev.lldi.l2t, + n, pdev, 0); + if (!ep->l2t) + goto out; + ep->mtu = pdev->mtu; + ep->tx_chan = cxgb4_port_chan(pdev); + ep->smac_idx = (cxgb4_port_viid(pdev) & 0x7F) << 1; + step = cdev->rdev.lldi.ntxq / + cdev->rdev.lldi.nchan; + ep->txq_idx = cxgb4_port_idx(pdev) * step; + step = cdev->rdev.lldi.nrxq / + cdev->rdev.lldi.nchan; + ep->ctrlq_idx = cxgb4_port_idx(pdev); + ep->rss_qid = cdev->rdev.lldi.rxq_ids[ + cxgb4_port_idx(pdev) * step]; + set_tcp_window(ep, (struct port_info *)netdev_priv(pdev)); + dev_put(pdev); + } else { + pdev = get_real_dev(n->dev); + ep->l2t = cxgb4_l2t_get(cdev->rdev.lldi.l2t, + n, pdev, 0); + if (!ep->l2t) + goto out; + ep->mtu = dst_mtu(dst); + ep->tx_chan = cxgb4_port_chan(pdev); + ep->smac_idx = (cxgb4_port_viid(pdev) & 0x7F) << 1; + step = cdev->rdev.lldi.ntxq / + cdev->rdev.lldi.nchan; + ep->txq_idx = cxgb4_port_idx(pdev) * step; + 
ep->ctrlq_idx = cxgb4_port_idx(pdev); + step = cdev->rdev.lldi.nrxq / + cdev->rdev.lldi.nchan; + ep->rss_qid = cdev->rdev.lldi.rxq_ids[ + cxgb4_port_idx(pdev) * step]; + set_tcp_window(ep, (struct port_info *)netdev_priv(pdev)); + + if (clear_mpa_v1) { + ep->retry_with_mpa_v1 = 0; + ep->tried_with_mpa_v1 = 0; + } + } + err = 0; +out: + rcu_read_unlock(); + + neigh_release(n); + + return err; +} + +static int c4iw_reconnect(struct c4iw_ep *ep) +{ + int err = 0; + struct sockaddr_in *laddr = (struct sockaddr_in *) + &ep->com.cm_id->local_addr; + struct sockaddr_in *raddr = (struct sockaddr_in *) + &ep->com.cm_id->remote_addr; + struct sockaddr_in6 *laddr6 = (struct sockaddr_in6 *) + &ep->com.cm_id->local_addr; + struct sockaddr_in6 *raddr6 = (struct sockaddr_in6 *) + &ep->com.cm_id->remote_addr; + int iptype; + __u8 *ra; + + PDBG("%s qp %p cm_id %p\n", __func__, ep->com.qp, ep->com.cm_id); + init_timer(&ep->timer); + + /* + * Allocate an active TID to initiate a TCP connection. + */ + ep->atid = cxgb4_alloc_atid(ep->com.dev->rdev.lldi.tids, ep); + if (ep->atid == -1) { + pr_err("%s - cannot alloc atid.\n", __func__); + err = -ENOMEM; + goto fail2; + } + insert_handle(ep->com.dev, &ep->com.dev->atid_idr, ep, ep->atid); + + /* find a route */ + if (ep->com.cm_id->local_addr.ss_family == AF_INET) { + ep->dst = find_route(ep->com.dev, laddr->sin_addr.s_addr, + raddr->sin_addr.s_addr, laddr->sin_port, + raddr->sin_port, 0); + iptype = 4; + ra = (__u8 *)&raddr->sin_addr; + } else { + ep->dst = find_route6(ep->com.dev, laddr6->sin6_addr.s6_addr, + raddr6->sin6_addr.s6_addr, + laddr6->sin6_port, raddr6->sin6_port, 0, + raddr6->sin6_scope_id); + iptype = 6; + ra = (__u8 *)&raddr6->sin6_addr; + } + if (!ep->dst) { + pr_err("%s - cannot find route.\n", __func__); + err = -EHOSTUNREACH; + goto fail3; + } + err = import_ep(ep, iptype, ra, ep->dst, ep->com.dev, false); + if (err) { + pr_err("%s - cannot alloc l2e.\n", __func__); + goto fail4; + } + + PDBG("%s txq_idx %u tx_chan %u smac_idx %u rss_qid %u l2t_idx %u\n", + __func__, ep->txq_idx, ep->tx_chan, ep->smac_idx, ep->rss_qid, + ep->l2t->idx); + + state_set(&ep->com, CONNECTING); + ep->tos = 0; + + /* send connect request to rnic */ + err = send_connect(ep); + if (!err) + goto out; + + cxgb4_l2t_release(ep->l2t); +fail4: + dst_release(ep->dst); +fail3: + remove_handle(ep->com.dev, &ep->com.dev->atid_idr, ep->atid); + cxgb4_free_atid(ep->com.dev->rdev.lldi.tids, ep->atid); +fail2: + /* + * remember to send notification to upper layer. + * We are in here so the upper layer is not aware that this is + * re-connect attempt and so, upper layer is still waiting for + * response of 1st connect request. 
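+ * Hence the -ECONNRESET connect reply upcall below, issued before the
+ * ep reference is dropped.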
+ */ + connect_reply_upcall(ep, -ECONNRESET); + c4iw_put_ep(&ep->com); +out: + return err; +} + +static int act_open_rpl(struct c4iw_dev *dev, struct sk_buff *skb) +{ + struct c4iw_ep *ep; + struct cpl_act_open_rpl *rpl = cplhdr(skb); + unsigned int atid = TID_TID_G(AOPEN_ATID_G( + ntohl(rpl->atid_status))); + struct tid_info *t = dev->rdev.lldi.tids; + int status = AOPEN_STATUS_G(ntohl(rpl->atid_status)); + struct sockaddr_in *la; + struct sockaddr_in *ra; + struct sockaddr_in6 *la6; + struct sockaddr_in6 *ra6; + + ep = lookup_atid(t, atid); + la = (struct sockaddr_in *)&ep->com.mapped_local_addr; + ra = (struct sockaddr_in *)&ep->com.mapped_remote_addr; + la6 = (struct sockaddr_in6 *)&ep->com.mapped_local_addr; + ra6 = (struct sockaddr_in6 *)&ep->com.mapped_remote_addr; + + PDBG("%s ep %p atid %u status %u errno %d\n", __func__, ep, atid, + status, status2errno(status)); + + if (is_neg_adv(status)) { + PDBG("%s Connection problems for atid %u status %u (%s)\n", + __func__, atid, status, neg_adv_str(status)); + ep->stats.connect_neg_adv++; + mutex_lock(&dev->rdev.stats.lock); + dev->rdev.stats.neg_adv++; + mutex_unlock(&dev->rdev.stats.lock); + return 0; + } + + set_bit(ACT_OPEN_RPL, &ep->com.history); + + /* + * Log interesting failures. + */ + switch (status) { + case CPL_ERR_CONN_RESET: + case CPL_ERR_CONN_TIMEDOUT: + break; + case CPL_ERR_TCAM_FULL: + mutex_lock(&dev->rdev.stats.lock); + dev->rdev.stats.tcam_full++; + mutex_unlock(&dev->rdev.stats.lock); + if (ep->com.local_addr.ss_family == AF_INET && + dev->rdev.lldi.enable_fw_ofld_conn) { + send_fw_act_open_req(ep, + TID_TID_G(AOPEN_ATID_G( + ntohl(rpl->atid_status)))); + return 0; + } + break; + case CPL_ERR_CONN_EXIST: + if (ep->retry_count++ < ACT_OPEN_RETRY_COUNT) { + set_bit(ACT_RETRY_INUSE, &ep->com.history); + remove_handle(ep->com.dev, &ep->com.dev->atid_idr, + atid); + cxgb4_free_atid(t, atid); + dst_release(ep->dst); + cxgb4_l2t_release(ep->l2t); + c4iw_reconnect(ep); + return 0; + } + break; + default: + if (ep->com.local_addr.ss_family == AF_INET) { + pr_info("Active open failure - atid %u status %u errno %d %pI4:%u->%pI4:%u\n", + atid, status, status2errno(status), + &la->sin_addr.s_addr, ntohs(la->sin_port), + &ra->sin_addr.s_addr, ntohs(ra->sin_port)); + } else { + pr_info("Active open failure - atid %u status %u errno %d %pI6:%u->%pI6:%u\n", + atid, status, status2errno(status), + la6->sin6_addr.s6_addr, ntohs(la6->sin6_port), + ra6->sin6_addr.s6_addr, ntohs(ra6->sin6_port)); + } + break; + } + + connect_reply_upcall(ep, status2errno(status)); + state_set(&ep->com, DEAD); + + if (status && act_open_has_tid(status)) + cxgb4_remove_tid(ep->com.dev->rdev.lldi.tids, 0, GET_TID(rpl)); + + remove_handle(ep->com.dev, &ep->com.dev->atid_idr, atid); + cxgb4_free_atid(t, atid); + dst_release(ep->dst); + cxgb4_l2t_release(ep->l2t); + c4iw_put_ep(&ep->com); + + return 0; +} + +static int pass_open_rpl(struct c4iw_dev *dev, struct sk_buff *skb) +{ + struct cpl_pass_open_rpl *rpl = cplhdr(skb); + struct tid_info *t = dev->rdev.lldi.tids; + unsigned int stid = GET_TID(rpl); + struct c4iw_listen_ep *ep = lookup_stid(t, stid); + + if (!ep) { + PDBG("%s stid %d lookup failure!\n", __func__, stid); + goto out; + } + PDBG("%s ep %p status %d error %d\n", __func__, ep, + rpl->status, status2errno(rpl->status)); + c4iw_wake_up(&ep->com.wr_wait, status2errno(rpl->status)); + +out: + return 0; +} + +static int close_listsrv_rpl(struct c4iw_dev *dev, struct sk_buff *skb) +{ + struct cpl_close_listsvr_rpl *rpl = cplhdr(skb); + struct 
tid_info *t = dev->rdev.lldi.tids; + unsigned int stid = GET_TID(rpl); + struct c4iw_listen_ep *ep = lookup_stid(t, stid); + + PDBG("%s ep %p\n", __func__, ep); + c4iw_wake_up(&ep->com.wr_wait, status2errno(rpl->status)); + return 0; +} + +static void accept_cr(struct c4iw_ep *ep, struct sk_buff *skb, + struct cpl_pass_accept_req *req) +{ + struct cpl_pass_accept_rpl *rpl; + unsigned int mtu_idx; + u64 opt0; + u32 opt2; + int wscale; + struct cpl_t5_pass_accept_rpl *rpl5 = NULL; + int win; + + PDBG("%s ep %p tid %u\n", __func__, ep, ep->hwtid); + BUG_ON(skb_cloned(skb)); + + skb_get(skb); + rpl = cplhdr(skb); + if (is_t5(ep->com.dev->rdev.lldi.adapter_type)) { + skb_trim(skb, roundup(sizeof(*rpl5), 16)); + rpl5 = (void *)rpl; + INIT_TP_WR(rpl5, ep->hwtid); + } else { + skb_trim(skb, sizeof(*rpl)); + INIT_TP_WR(rpl, ep->hwtid); + } + OPCODE_TID(rpl) = cpu_to_be32(MK_OPCODE_TID(CPL_PASS_ACCEPT_RPL, + ep->hwtid)); + + best_mtu(ep->com.dev->rdev.lldi.mtus, ep->mtu, &mtu_idx, + enable_tcp_timestamps && req->tcpopt.tstamp, + (AF_INET == ep->com.remote_addr.ss_family) ? 0 : 1); + wscale = compute_wscale(rcv_win); + + /* + * Specify the largest window that will fit in opt0. The + * remainder will be specified in the rx_data_ack. + */ + win = ep->rcv_win >> 10; + if (win > RCV_BUFSIZ_M) + win = RCV_BUFSIZ_M; + opt0 = (nocong ? NO_CONG_F : 0) | + KEEP_ALIVE_F | + DELACK_F | + WND_SCALE_V(wscale) | + MSS_IDX_V(mtu_idx) | + L2T_IDX_V(ep->l2t->idx) | + TX_CHAN_V(ep->tx_chan) | + SMAC_SEL_V(ep->smac_idx) | + DSCP_V(ep->tos >> 2) | + ULP_MODE_V(ULP_MODE_TCPDDP) | + RCV_BUFSIZ_V(win); + opt2 = RX_CHANNEL_V(0) | + RSS_QUEUE_VALID_F | RSS_QUEUE_V(ep->rss_qid); + + if (enable_tcp_timestamps && req->tcpopt.tstamp) + opt2 |= TSTAMPS_EN_F; + if (enable_tcp_sack && req->tcpopt.sack) + opt2 |= SACK_EN_F; + if (wscale && enable_tcp_window_scaling) + opt2 |= WND_SCALE_EN_F; + if (enable_ecn) { + const struct tcphdr *tcph; + u32 hlen = ntohl(req->hdr_len); + + tcph = (const void *)(req + 1) + ETH_HDR_LEN_G(hlen) + + IP_HDR_LEN_G(hlen); + if (tcph->ece && tcph->cwr) + opt2 |= CCTRL_ECN_V(1); + } + if (is_t5(ep->com.dev->rdev.lldi.adapter_type)) { + u32 isn = (prandom_u32() & ~7UL) - 1; + opt2 |= T5_OPT_2_VALID_F; + opt2 |= CONG_CNTRL_V(CONG_ALG_TAHOE); + opt2 |= T5_ISS_F; + rpl5 = (void *)rpl; + memset(&rpl5->iss, 0, roundup(sizeof(*rpl5)-sizeof(*rpl), 16)); + if (peer2peer) + isn += 4; + rpl5->iss = cpu_to_be32(isn); + PDBG("%s iss %u\n", __func__, be32_to_cpu(rpl5->iss)); + } + + rpl->opt0 = cpu_to_be64(opt0); + rpl->opt2 = cpu_to_be32(opt2); + set_wr_txq(skb, CPL_PRIORITY_SETUP, ep->ctrlq_idx); + t4_set_arp_err_handler(skb, NULL, arp_failure_discard); + c4iw_l2t_send(&ep->com.dev->rdev, skb, ep->l2t); + + return; +} + +static void reject_cr(struct c4iw_dev *dev, u32 hwtid, struct sk_buff *skb) +{ + PDBG("%s c4iw_dev %p tid %u\n", __func__, dev, hwtid); + BUG_ON(skb_cloned(skb)); + skb_trim(skb, sizeof(struct cpl_tid_release)); + release_tid(&dev->rdev, hwtid, skb); + return; +} + +static void get_4tuple(struct cpl_pass_accept_req *req, int *iptype, + __u8 *local_ip, __u8 *peer_ip, + __be16 *local_port, __be16 *peer_port) +{ + int eth_len = ETH_HDR_LEN_G(be32_to_cpu(req->hdr_len)); + int ip_len = IP_HDR_LEN_G(be32_to_cpu(req->hdr_len)); + struct iphdr *ip = (struct iphdr *)((u8 *)(req + 1) + eth_len); + struct ipv6hdr *ip6 = (struct ipv6hdr *)((u8 *)(req + 1) + eth_len); + struct tcphdr *tcp = (struct tcphdr *) + ((u8 *)(req + 1) + eth_len + ip_len); + + if (ip->version == 4) { + PDBG("%s saddr 0x%x daddr 0x%x 
sport %u dport %u\n", __func__, + ntohl(ip->saddr), ntohl(ip->daddr), ntohs(tcp->source), + ntohs(tcp->dest)); + *iptype = 4; + memcpy(peer_ip, &ip->saddr, 4); + memcpy(local_ip, &ip->daddr, 4); + } else { + PDBG("%s saddr %pI6 daddr %pI6 sport %u dport %u\n", __func__, + ip6->saddr.s6_addr, ip6->daddr.s6_addr, ntohs(tcp->source), + ntohs(tcp->dest)); + *iptype = 6; + memcpy(peer_ip, ip6->saddr.s6_addr, 16); + memcpy(local_ip, ip6->daddr.s6_addr, 16); + } + *peer_port = tcp->source; + *local_port = tcp->dest; + + return; +} + +static int pass_accept_req(struct c4iw_dev *dev, struct sk_buff *skb) +{ + struct c4iw_ep *child_ep = NULL, *parent_ep; + struct cpl_pass_accept_req *req = cplhdr(skb); + unsigned int stid = PASS_OPEN_TID_G(ntohl(req->tos_stid)); + struct tid_info *t = dev->rdev.lldi.tids; + unsigned int hwtid = GET_TID(req); + struct dst_entry *dst; + __u8 local_ip[16], peer_ip[16]; + __be16 local_port, peer_port; + int err; + u16 peer_mss = ntohs(req->tcpopt.mss); + int iptype; + unsigned short hdrs; + + parent_ep = lookup_stid(t, stid); + if (!parent_ep) { + PDBG("%s connect request on invalid stid %d\n", __func__, stid); + goto reject; + } + + if (state_read(&parent_ep->com) != LISTEN) { + printk(KERN_ERR "%s - listening ep not in LISTEN\n", + __func__); + goto reject; + } + + get_4tuple(req, &iptype, local_ip, peer_ip, &local_port, &peer_port); + + /* Find output route */ + if (iptype == 4) { + PDBG("%s parent ep %p hwtid %u laddr %pI4 raddr %pI4 lport %d rport %d peer_mss %d\n" + , __func__, parent_ep, hwtid, + local_ip, peer_ip, ntohs(local_port), + ntohs(peer_port), peer_mss); + dst = find_route(dev, *(__be32 *)local_ip, *(__be32 *)peer_ip, + local_port, peer_port, + PASS_OPEN_TOS_G(ntohl(req->tos_stid))); + } else { + PDBG("%s parent ep %p hwtid %u laddr %pI6 raddr %pI6 lport %d rport %d peer_mss %d\n" + , __func__, parent_ep, hwtid, + local_ip, peer_ip, ntohs(local_port), + ntohs(peer_port), peer_mss); + dst = find_route6(dev, local_ip, peer_ip, local_port, peer_port, + PASS_OPEN_TOS_G(ntohl(req->tos_stid)), + ((struct sockaddr_in6 *) + &parent_ep->com.local_addr)->sin6_scope_id); + } + if (!dst) { + printk(KERN_ERR MOD "%s - failed to find dst entry!\n", + __func__); + goto reject; + } + + child_ep = alloc_ep(sizeof(*child_ep), GFP_KERNEL); + if (!child_ep) { + printk(KERN_ERR MOD "%s - failed to allocate ep entry!\n", + __func__); + dst_release(dst); + goto reject; + } + + err = import_ep(child_ep, iptype, peer_ip, dst, dev, false); + if (err) { + printk(KERN_ERR MOD "%s - failed to allocate l2t entry!\n", + __func__); + dst_release(dst); + kfree(child_ep); + goto reject; + } + + hdrs = sizeof(struct iphdr) + sizeof(struct tcphdr) + + ((enable_tcp_timestamps && req->tcpopt.tstamp) ? 12 : 0); + if (peer_mss && child_ep->mtu > (peer_mss + hdrs)) + child_ep->mtu = peer_mss + hdrs; + + state_set(&child_ep->com, CONNECTING); + child_ep->com.dev = dev; + child_ep->com.cm_id = NULL; + + /* + * The mapped_local and mapped_remote addresses get setup with + * the actual 4-tuple. The local address will be based on the + * actual local address of the connection, but on the port number + * of the parent listening endpoint. The remote address is + * setup based on a query to the IWPM since we don't know what it + * originally was before mapping. If no mapping was done, then + * mapped_remote == remote, and mapped_local == local. 
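+ * remote_addr is seeded from mapped_remote_addr below; get_remote_addr()
+ * then queries the IWPM for the pre-mapping peer address.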
+ */ + if (iptype == 4) { + struct sockaddr_in *sin = (struct sockaddr_in *) + &child_ep->com.mapped_local_addr; + + sin->sin_family = PF_INET; + sin->sin_port = local_port; + sin->sin_addr.s_addr = *(__be32 *)local_ip; + + sin = (struct sockaddr_in *)&child_ep->com.local_addr; + sin->sin_family = PF_INET; + sin->sin_port = ((struct sockaddr_in *) + &parent_ep->com.local_addr)->sin_port; + sin->sin_addr.s_addr = *(__be32 *)local_ip; + + sin = (struct sockaddr_in *)&child_ep->com.mapped_remote_addr; + sin->sin_family = PF_INET; + sin->sin_port = peer_port; + sin->sin_addr.s_addr = *(__be32 *)peer_ip; + } else { + struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *) + &child_ep->com.mapped_local_addr; + + sin6->sin6_family = PF_INET6; + sin6->sin6_port = local_port; + memcpy(sin6->sin6_addr.s6_addr, local_ip, 16); + + sin6 = (struct sockaddr_in6 *)&child_ep->com.local_addr; + sin6->sin6_family = PF_INET6; + sin6->sin6_port = ((struct sockaddr_in6 *) + &parent_ep->com.local_addr)->sin6_port; + memcpy(sin6->sin6_addr.s6_addr, local_ip, 16); + + sin6 = (struct sockaddr_in6 *)&child_ep->com.mapped_remote_addr; + sin6->sin6_family = PF_INET6; + sin6->sin6_port = peer_port; + memcpy(sin6->sin6_addr.s6_addr, peer_ip, 16); + } + memcpy(&child_ep->com.remote_addr, &child_ep->com.mapped_remote_addr, + sizeof(child_ep->com.remote_addr)); + get_remote_addr(parent_ep, child_ep); + + c4iw_get_ep(&parent_ep->com); + child_ep->parent_ep = parent_ep; + child_ep->tos = PASS_OPEN_TOS_G(ntohl(req->tos_stid)); + child_ep->dst = dst; + child_ep->hwtid = hwtid; + + PDBG("%s tx_chan %u smac_idx %u rss_qid %u\n", __func__, + child_ep->tx_chan, child_ep->smac_idx, child_ep->rss_qid); + + init_timer(&child_ep->timer); + cxgb4_insert_tid(t, child_ep, hwtid); + insert_handle(dev, &dev->hwtid_idr, child_ep, child_ep->hwtid); + accept_cr(child_ep, skb, req); + set_bit(PASS_ACCEPT_REQ, &child_ep->com.history); + goto out; +reject: + reject_cr(dev, hwtid, skb); +out: + return 0; +} + +static int pass_establish(struct c4iw_dev *dev, struct sk_buff *skb) +{ + struct c4iw_ep *ep; + struct cpl_pass_establish *req = cplhdr(skb); + struct tid_info *t = dev->rdev.lldi.tids; + unsigned int tid = GET_TID(req); + + ep = lookup_tid(t, tid); + PDBG("%s ep %p tid %u\n", __func__, ep, ep->hwtid); + ep->snd_seq = be32_to_cpu(req->snd_isn); + ep->rcv_seq = be32_to_cpu(req->rcv_isn); + + PDBG("%s ep %p hwtid %u tcp_opt 0x%02x\n", __func__, ep, tid, + ntohs(req->tcp_opt)); + + set_emss(ep, ntohs(req->tcp_opt)); + + dst_confirm(ep->dst); + state_set(&ep->com, MPA_REQ_WAIT); + start_ep_timer(ep); + send_flowc(ep, skb); + set_bit(PASS_ESTAB, &ep->com.history); + + return 0; +} + +static int peer_close(struct c4iw_dev *dev, struct sk_buff *skb) +{ + struct cpl_peer_close *hdr = cplhdr(skb); + struct c4iw_ep *ep; + struct c4iw_qp_attributes attrs; + int disconnect = 1; + int release = 0; + struct tid_info *t = dev->rdev.lldi.tids; + unsigned int tid = GET_TID(hdr); + int ret; + + ep = lookup_tid(t, tid); + PDBG("%s ep %p tid %u\n", __func__, ep, ep->hwtid); + dst_confirm(ep->dst); + + set_bit(PEER_CLOSE, &ep->com.history); + mutex_lock(&ep->com.mutex); + switch (ep->com.state) { + case MPA_REQ_WAIT: + __state_set(&ep->com, CLOSING); + break; + case MPA_REQ_SENT: + __state_set(&ep->com, CLOSING); + connect_reply_upcall(ep, -ECONNRESET); + break; + case MPA_REQ_RCVD: + + /* + * We're gonna mark this puppy DEAD, but keep + * the reference on it until the ULP accepts or + * rejects the CR. 
Also wake up anyone waiting + * in rdma connection migration (see c4iw_accept_cr()). + */ + __state_set(&ep->com, CLOSING); + PDBG("waking up ep %p tid %u\n", ep, ep->hwtid); + c4iw_wake_up(&ep->com.wr_wait, -ECONNRESET); + break; + case MPA_REP_SENT: + __state_set(&ep->com, CLOSING); + PDBG("waking up ep %p tid %u\n", ep, ep->hwtid); + c4iw_wake_up(&ep->com.wr_wait, -ECONNRESET); + break; + case FPDU_MODE: + start_ep_timer(ep); + __state_set(&ep->com, CLOSING); + attrs.next_state = C4IW_QP_STATE_CLOSING; + ret = c4iw_modify_qp(ep->com.qp->rhp, ep->com.qp, + C4IW_QP_ATTR_NEXT_STATE, &attrs, 1); + if (ret != -ECONNRESET) { + peer_close_upcall(ep); + disconnect = 1; + } + break; + case ABORTING: + disconnect = 0; + break; + case CLOSING: + __state_set(&ep->com, MORIBUND); + disconnect = 0; + break; + case MORIBUND: + (void)stop_ep_timer(ep); + if (ep->com.cm_id && ep->com.qp) { + attrs.next_state = C4IW_QP_STATE_IDLE; + c4iw_modify_qp(ep->com.qp->rhp, ep->com.qp, + C4IW_QP_ATTR_NEXT_STATE, &attrs, 1); + } + close_complete_upcall(ep, 0); + __state_set(&ep->com, DEAD); + release = 1; + disconnect = 0; + break; + case DEAD: + disconnect = 0; + break; + default: + BUG_ON(1); + } + mutex_unlock(&ep->com.mutex); + if (disconnect) + c4iw_ep_disconnect(ep, 0, GFP_KERNEL); + if (release) + release_ep_resources(ep); + return 0; +} + +static int peer_abort(struct c4iw_dev *dev, struct sk_buff *skb) +{ + struct cpl_abort_req_rss *req = cplhdr(skb); + struct c4iw_ep *ep; + struct cpl_abort_rpl *rpl; + struct sk_buff *rpl_skb; + struct c4iw_qp_attributes attrs; + int ret; + int release = 0; + struct tid_info *t = dev->rdev.lldi.tids; + unsigned int tid = GET_TID(req); + + ep = lookup_tid(t, tid); + if (is_neg_adv(req->status)) { + PDBG("%s Negative advice on abort- tid %u status %d (%s)\n", + __func__, ep->hwtid, req->status, + neg_adv_str(req->status)); + ep->stats.abort_neg_adv++; + mutex_lock(&dev->rdev.stats.lock); + dev->rdev.stats.neg_adv++; + mutex_unlock(&dev->rdev.stats.lock); + return 0; + } + PDBG("%s ep %p tid %u state %u\n", __func__, ep, ep->hwtid, + ep->com.state); + set_bit(PEER_ABORT, &ep->com.history); + + /* + * Wake up any threads in rdma_init() or rdma_fini(). + * However, this is not needed if com state is just + * MPA_REQ_SENT + */ + if (ep->com.state != MPA_REQ_SENT) + c4iw_wake_up(&ep->com.wr_wait, -ECONNRESET); + + mutex_lock(&ep->com.mutex); + switch (ep->com.state) { + case CONNECTING: + break; + case MPA_REQ_WAIT: + (void)stop_ep_timer(ep); + break; + case MPA_REQ_SENT: + (void)stop_ep_timer(ep); + if (mpa_rev == 1 || (mpa_rev == 2 && ep->tried_with_mpa_v1)) + connect_reply_upcall(ep, -ECONNRESET); + else { + /* + * we just don't send notification upwards because we + * want to retry with mpa_v1 without upper layers even + * knowing it. + * + * do some housekeeping so as to re-initiate the + * connection + */ + PDBG("%s: mpa_rev=%d. 
Retrying with mpav1\n", __func__, + mpa_rev); + ep->retry_with_mpa_v1 = 1; + } + break; + case MPA_REP_SENT: + break; + case MPA_REQ_RCVD: + break; + case MORIBUND: + case CLOSING: + stop_ep_timer(ep); + /*FALLTHROUGH*/ + case FPDU_MODE: + if (ep->com.cm_id && ep->com.qp) { + attrs.next_state = C4IW_QP_STATE_ERROR; + ret = c4iw_modify_qp(ep->com.qp->rhp, + ep->com.qp, C4IW_QP_ATTR_NEXT_STATE, + &attrs, 1); + if (ret) + printk(KERN_ERR MOD + "%s - qp <- error failed!\n", + __func__); + } + peer_abort_upcall(ep); + break; + case ABORTING: + break; + case DEAD: + PDBG("%s PEER_ABORT IN DEAD STATE!!!!\n", __func__); + mutex_unlock(&ep->com.mutex); + return 0; + default: + BUG_ON(1); + break; + } + dst_confirm(ep->dst); + if (ep->com.state != ABORTING) { + __state_set(&ep->com, DEAD); + /* we don't release if we want to retry with mpa_v1 */ + if (!ep->retry_with_mpa_v1) + release = 1; + } + mutex_unlock(&ep->com.mutex); + + rpl_skb = get_skb(skb, sizeof(*rpl), GFP_KERNEL); + if (!rpl_skb) { + printk(KERN_ERR MOD "%s - cannot allocate skb!\n", + __func__); + release = 1; + goto out; + } + set_wr_txq(skb, CPL_PRIORITY_DATA, ep->txq_idx); + rpl = (struct cpl_abort_rpl *) skb_put(rpl_skb, sizeof(*rpl)); + INIT_TP_WR(rpl, ep->hwtid); + OPCODE_TID(rpl) = cpu_to_be32(MK_OPCODE_TID(CPL_ABORT_RPL, ep->hwtid)); + rpl->cmd = CPL_ABORT_NO_RST; + c4iw_ofld_send(&ep->com.dev->rdev, rpl_skb); +out: + if (release) + release_ep_resources(ep); + else if (ep->retry_with_mpa_v1) { + remove_handle(ep->com.dev, &ep->com.dev->hwtid_idr, ep->hwtid); + cxgb4_remove_tid(ep->com.dev->rdev.lldi.tids, 0, ep->hwtid); + dst_release(ep->dst); + cxgb4_l2t_release(ep->l2t); + c4iw_reconnect(ep); + } + + return 0; +} + +static int close_con_rpl(struct c4iw_dev *dev, struct sk_buff *skb) +{ + struct c4iw_ep *ep; + struct c4iw_qp_attributes attrs; + struct cpl_close_con_rpl *rpl = cplhdr(skb); + int release = 0; + struct tid_info *t = dev->rdev.lldi.tids; + unsigned int tid = GET_TID(rpl); + + ep = lookup_tid(t, tid); + + PDBG("%s ep %p tid %u\n", __func__, ep, ep->hwtid); + BUG_ON(!ep); + + /* The cm_id may be null if we failed to connect */ + mutex_lock(&ep->com.mutex); + switch (ep->com.state) { + case CLOSING: + __state_set(&ep->com, MORIBUND); + break; + case MORIBUND: + (void)stop_ep_timer(ep); + if ((ep->com.cm_id) && (ep->com.qp)) { + attrs.next_state = C4IW_QP_STATE_IDLE; + c4iw_modify_qp(ep->com.qp->rhp, + ep->com.qp, + C4IW_QP_ATTR_NEXT_STATE, + &attrs, 1); + } + close_complete_upcall(ep, 0); + __state_set(&ep->com, DEAD); + release = 1; + break; + case ABORTING: + case DEAD: + break; + default: + BUG_ON(1); + break; + } + mutex_unlock(&ep->com.mutex); + if (release) + release_ep_resources(ep); + return 0; +} + +static int terminate(struct c4iw_dev *dev, struct sk_buff *skb) +{ + struct cpl_rdma_terminate *rpl = cplhdr(skb); + struct tid_info *t = dev->rdev.lldi.tids; + unsigned int tid = GET_TID(rpl); + struct c4iw_ep *ep; + struct c4iw_qp_attributes attrs; + + ep = lookup_tid(t, tid); + BUG_ON(!ep); + + if (ep && ep->com.qp) { + printk(KERN_WARNING MOD "TERM received tid %u qpid %u\n", tid, + ep->com.qp->wq.sq.qid); + attrs.next_state = C4IW_QP_STATE_TERMINATE; + c4iw_modify_qp(ep->com.qp->rhp, ep->com.qp, + C4IW_QP_ATTR_NEXT_STATE, &attrs, 1); + } else + printk(KERN_WARNING MOD "TERM received tid %u no ep/qp\n", tid); + + return 0; +} + +/* + * Upcall from the adapter indicating data has been transmitted. + * For us its just the single MPA request or reply. We can now free + * the skb holding the mpa message. 
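+ * The kfree_skb() below drops the extra reference taken with skb_get()
+ * when the MPA message was sent.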
+ */ +static int fw4_ack(struct c4iw_dev *dev, struct sk_buff *skb) +{ + struct c4iw_ep *ep; + struct cpl_fw4_ack *hdr = cplhdr(skb); + u8 credits = hdr->credits; + unsigned int tid = GET_TID(hdr); + struct tid_info *t = dev->rdev.lldi.tids; + + + ep = lookup_tid(t, tid); + PDBG("%s ep %p tid %u credits %u\n", __func__, ep, ep->hwtid, credits); + if (credits == 0) { + PDBG("%s 0 credit ack ep %p tid %u state %u\n", + __func__, ep, ep->hwtid, state_read(&ep->com)); + return 0; + } + + dst_confirm(ep->dst); + if (ep->mpa_skb) { + PDBG("%s last streaming msg ack ep %p tid %u state %u " + "initiator %u freeing skb\n", __func__, ep, ep->hwtid, + state_read(&ep->com), ep->mpa_attr.initiator ? 1 : 0); + kfree_skb(ep->mpa_skb); + ep->mpa_skb = NULL; + } + return 0; +} + +int c4iw_reject_cr(struct iw_cm_id *cm_id, const void *pdata, u8 pdata_len) +{ + int err = 0; + int disconnect = 0; + struct c4iw_ep *ep = to_ep(cm_id); + PDBG("%s ep %p tid %u\n", __func__, ep, ep->hwtid); + + mutex_lock(&ep->com.mutex); + if (ep->com.state == DEAD) { + mutex_unlock(&ep->com.mutex); + c4iw_put_ep(&ep->com); + return -ECONNRESET; + } + set_bit(ULP_REJECT, &ep->com.history); + BUG_ON(ep->com.state != MPA_REQ_RCVD); + if (mpa_rev == 0) + abort_connection(ep, NULL, GFP_KERNEL); + else { + err = send_mpa_reject(ep, pdata, pdata_len); + disconnect = 1; + } + mutex_unlock(&ep->com.mutex); + if (disconnect) + err = c4iw_ep_disconnect(ep, 0, GFP_KERNEL); + c4iw_put_ep(&ep->com); + return 0; +} + +int c4iw_accept_cr(struct iw_cm_id *cm_id, struct iw_cm_conn_param *conn_param) +{ + int err; + struct c4iw_qp_attributes attrs; + enum c4iw_qp_attr_mask mask; + struct c4iw_ep *ep = to_ep(cm_id); + struct c4iw_dev *h = to_c4iw_dev(cm_id->device); + struct c4iw_qp *qp = get_qhp(h, conn_param->qpn); + + PDBG("%s ep %p tid %u\n", __func__, ep, ep->hwtid); + + mutex_lock(&ep->com.mutex); + if (ep->com.state == DEAD) { + err = -ECONNRESET; + goto err; + } + + BUG_ON(ep->com.state != MPA_REQ_RCVD); + BUG_ON(!qp); + + set_bit(ULP_ACCEPT, &ep->com.history); + if ((conn_param->ord > cur_max_read_depth(ep->com.dev)) || + (conn_param->ird > cur_max_read_depth(ep->com.dev))) { + abort_connection(ep, NULL, GFP_KERNEL); + err = -EINVAL; + goto err; + } + + if (ep->mpa_attr.version == 2 && ep->mpa_attr.enhanced_rdma_conn) { + if (conn_param->ord > ep->ird) { + if (RELAXED_IRD_NEGOTIATION) { + ep->ord = ep->ird; + } else { + ep->ird = conn_param->ird; + ep->ord = conn_param->ord; + send_mpa_reject(ep, conn_param->private_data, + conn_param->private_data_len); + abort_connection(ep, NULL, GFP_KERNEL); + err = -ENOMEM; + goto err; + } + } + if (conn_param->ird < ep->ord) { + if (RELAXED_IRD_NEGOTIATION && + ep->ord <= h->rdev.lldi.max_ordird_qp) { + conn_param->ird = ep->ord; + } else { + abort_connection(ep, NULL, GFP_KERNEL); + err = -ENOMEM; + goto err; + } + } + } + ep->ird = conn_param->ird; + ep->ord = conn_param->ord; + + if (ep->mpa_attr.version == 1) { + if (peer2peer && ep->ird == 0) + ep->ird = 1; + } else { + if (peer2peer && + (ep->mpa_attr.p2p_type != FW_RI_INIT_P2PTYPE_DISABLED) && + (p2p_type == FW_RI_INIT_P2PTYPE_READ_REQ) && ep->ord == 0) + ep->ird = 1; + } + + PDBG("%s %d ird %d ord %d\n", __func__, __LINE__, ep->ird, ep->ord); + + cm_id->add_ref(cm_id); + ep->com.cm_id = cm_id; + ep->com.qp = qp; + ref_qp(ep); + + /* bind QP to EP and move to RTS */ + attrs.mpa_attr = ep->mpa_attr; + attrs.max_ird = ep->ird; + attrs.max_ord = ep->ord; + attrs.llp_stream_handle = ep; + attrs.next_state = C4IW_QP_STATE_RTS; + + /* bind QP and 
TID with INIT_WR */ + mask = C4IW_QP_ATTR_NEXT_STATE | + C4IW_QP_ATTR_LLP_STREAM_HANDLE | + C4IW_QP_ATTR_MPA_ATTR | + C4IW_QP_ATTR_MAX_IRD | + C4IW_QP_ATTR_MAX_ORD; + + err = c4iw_modify_qp(ep->com.qp->rhp, + ep->com.qp, mask, &attrs, 1); + if (err) + goto err1; + err = send_mpa_reply(ep, conn_param->private_data, + conn_param->private_data_len); + if (err) + goto err1; + + __state_set(&ep->com, FPDU_MODE); + established_upcall(ep); + mutex_unlock(&ep->com.mutex); + c4iw_put_ep(&ep->com); + return 0; +err1: + ep->com.cm_id = NULL; + abort_connection(ep, NULL, GFP_KERNEL); + cm_id->rem_ref(cm_id); +err: + mutex_unlock(&ep->com.mutex); + c4iw_put_ep(&ep->com); + return err; +} + +static int pick_local_ipaddrs(struct c4iw_dev *dev, struct iw_cm_id *cm_id) +{ + struct in_device *ind; + int found = 0; + struct sockaddr_in *laddr = (struct sockaddr_in *)&cm_id->local_addr; + struct sockaddr_in *raddr = (struct sockaddr_in *)&cm_id->remote_addr; + + ind = in_dev_get(dev->rdev.lldi.ports[0]); + if (!ind) + return -EADDRNOTAVAIL; + for_primary_ifa(ind) { + laddr->sin_addr.s_addr = ifa->ifa_address; + raddr->sin_addr.s_addr = ifa->ifa_address; + found = 1; + break; + } + endfor_ifa(ind); + in_dev_put(ind); + return found ? 0 : -EADDRNOTAVAIL; +} + +static int get_lladdr(struct net_device *dev, struct in6_addr *addr, + unsigned char banned_flags) +{ + struct inet6_dev *idev; + int err = -EADDRNOTAVAIL; + + rcu_read_lock(); + idev = __in6_dev_get(dev); + if (idev != NULL) { + struct inet6_ifaddr *ifp; + + read_lock_bh(&idev->lock); + list_for_each_entry(ifp, &idev->addr_list, if_list) { + if (ifp->scope == IFA_LINK && + !(ifp->flags & banned_flags)) { + memcpy(addr, &ifp->addr, 16); + err = 0; + break; + } + } + read_unlock_bh(&idev->lock); + } + rcu_read_unlock(); + return err; +} + +static int pick_local_ip6addrs(struct c4iw_dev *dev, struct iw_cm_id *cm_id) +{ + struct in6_addr uninitialized_var(addr); + struct sockaddr_in6 *la6 = (struct sockaddr_in6 *)&cm_id->local_addr; + struct sockaddr_in6 *ra6 = (struct sockaddr_in6 *)&cm_id->remote_addr; + + if (get_lladdr(dev->rdev.lldi.ports[0], &addr, IFA_F_TENTATIVE)) { + memcpy(la6->sin6_addr.s6_addr, &addr, 16); + memcpy(ra6->sin6_addr.s6_addr, &addr, 16); + return 0; + } + return -EADDRNOTAVAIL; +} + +int c4iw_connect(struct iw_cm_id *cm_id, struct iw_cm_conn_param *conn_param) +{ + struct c4iw_dev *dev = to_c4iw_dev(cm_id->device); + struct c4iw_ep *ep; + int err = 0; + struct sockaddr_in *laddr; + struct sockaddr_in *raddr; + struct sockaddr_in6 *laddr6; + struct sockaddr_in6 *raddr6; + struct iwpm_dev_data pm_reg_msg; + struct iwpm_sa_data pm_msg; + __u8 *ra; + int iptype; + int iwpm_err = 0; + + if ((conn_param->ord > cur_max_read_depth(dev)) || + (conn_param->ird > cur_max_read_depth(dev))) { + err = -EINVAL; + goto out; + } + ep = alloc_ep(sizeof(*ep), GFP_KERNEL); + if (!ep) { + printk(KERN_ERR MOD "%s - cannot alloc ep.\n", __func__); + err = -ENOMEM; + goto out; + } + init_timer(&ep->timer); + ep->plen = conn_param->private_data_len; + if (ep->plen) + memcpy(ep->mpa_pkt + sizeof(struct mpa_message), + conn_param->private_data, ep->plen); + ep->ird = conn_param->ird; + ep->ord = conn_param->ord; + + if (peer2peer && ep->ord == 0) + ep->ord = 1; + + cm_id->add_ref(cm_id); + ep->com.dev = dev; + ep->com.cm_id = cm_id; + ep->com.qp = get_qhp(dev, conn_param->qpn); + if (!ep->com.qp) { + PDBG("%s qpn 0x%x not found!\n", __func__, conn_param->qpn); + err = -EINVAL; + goto fail1; + } + ref_qp(ep); + PDBG("%s qpn 0x%x qp %p cm_id %p\n", __func__, 
conn_param->qpn, + ep->com.qp, cm_id); + + /* + * Allocate an active TID to initiate a TCP connection. + */ + ep->atid = cxgb4_alloc_atid(dev->rdev.lldi.tids, ep); + if (ep->atid == -1) { + printk(KERN_ERR MOD "%s - cannot alloc atid.\n", __func__); + err = -ENOMEM; + goto fail1; + } + insert_handle(dev, &dev->atid_idr, ep, ep->atid); + + memcpy(&ep->com.local_addr, &cm_id->local_addr, + sizeof(ep->com.local_addr)); + memcpy(&ep->com.remote_addr, &cm_id->remote_addr, + sizeof(ep->com.remote_addr)); + + /* No port mapper available, go with the specified peer information */ + memcpy(&ep->com.mapped_local_addr, &cm_id->local_addr, + sizeof(ep->com.mapped_local_addr)); + memcpy(&ep->com.mapped_remote_addr, &cm_id->remote_addr, + sizeof(ep->com.mapped_remote_addr)); + + c4iw_form_reg_msg(dev, &pm_reg_msg); + iwpm_err = iwpm_register_pid(&pm_reg_msg, RDMA_NL_C4IW); + if (iwpm_err) { + PDBG("%s: Port Mapper reg pid fail (err = %d).\n", + __func__, iwpm_err); + } + if (iwpm_valid_pid() && !iwpm_err) { + c4iw_form_pm_msg(ep, &pm_msg); + iwpm_err = iwpm_add_and_query_mapping(&pm_msg, RDMA_NL_C4IW); + if (iwpm_err) + PDBG("%s: Port Mapper query fail (err = %d).\n", + __func__, iwpm_err); + else + c4iw_record_pm_msg(ep, &pm_msg); + } + if (iwpm_create_mapinfo(&ep->com.local_addr, + &ep->com.mapped_local_addr, RDMA_NL_C4IW)) { + iwpm_remove_mapping(&ep->com.local_addr, RDMA_NL_C4IW); + err = -ENOMEM; + goto fail1; + } + print_addr(&ep->com, __func__, "add_query/create_mapinfo"); + set_bit(RELEASE_MAPINFO, &ep->com.flags); + + laddr = (struct sockaddr_in *)&ep->com.mapped_local_addr; + raddr = (struct sockaddr_in *)&ep->com.mapped_remote_addr; + laddr6 = (struct sockaddr_in6 *)&ep->com.mapped_local_addr; + raddr6 = (struct sockaddr_in6 *) &ep->com.mapped_remote_addr; + + if (cm_id->remote_addr.ss_family == AF_INET) { + iptype = 4; + ra = (__u8 *)&raddr->sin_addr; + + /* + * Handle loopback requests to INADDR_ANY. + */ + if ((__force int)raddr->sin_addr.s_addr == INADDR_ANY) { + err = pick_local_ipaddrs(dev, cm_id); + if (err) + goto fail1; + } + + /* find a route */ + PDBG("%s saddr %pI4 sport 0x%x raddr %pI4 rport 0x%x\n", + __func__, &laddr->sin_addr, ntohs(laddr->sin_port), + ra, ntohs(raddr->sin_port)); + ep->dst = find_route(dev, laddr->sin_addr.s_addr, + raddr->sin_addr.s_addr, laddr->sin_port, + raddr->sin_port, 0); + } else { + iptype = 6; + ra = (__u8 *)&raddr6->sin6_addr; + + /* + * Handle loopback requests to INADDR_ANY. 
+ */ + if (ipv6_addr_type(&raddr6->sin6_addr) == IPV6_ADDR_ANY) { + err = pick_local_ip6addrs(dev, cm_id); + if (err) + goto fail1; + } + + /* find a route */ + PDBG("%s saddr %pI6 sport 0x%x raddr %pI6 rport 0x%x\n", + __func__, laddr6->sin6_addr.s6_addr, + ntohs(laddr6->sin6_port), + raddr6->sin6_addr.s6_addr, ntohs(raddr6->sin6_port)); + ep->dst = find_route6(dev, laddr6->sin6_addr.s6_addr, + raddr6->sin6_addr.s6_addr, + laddr6->sin6_port, raddr6->sin6_port, 0, + raddr6->sin6_scope_id); + } + if (!ep->dst) { + printk(KERN_ERR MOD "%s - cannot find route.\n", __func__); + err = -EHOSTUNREACH; + goto fail2; + } + + err = import_ep(ep, iptype, ra, ep->dst, ep->com.dev, true); + if (err) { + printk(KERN_ERR MOD "%s - cannot alloc l2e.\n", __func__); + goto fail3; + } + + PDBG("%s txq_idx %u tx_chan %u smac_idx %u rss_qid %u l2t_idx %u\n", + __func__, ep->txq_idx, ep->tx_chan, ep->smac_idx, ep->rss_qid, + ep->l2t->idx); + + state_set(&ep->com, CONNECTING); + ep->tos = 0; + + /* send connect request to rnic */ + err = send_connect(ep); + if (!err) + goto out; + + cxgb4_l2t_release(ep->l2t); +fail3: + dst_release(ep->dst); +fail2: + remove_handle(ep->com.dev, &ep->com.dev->atid_idr, ep->atid); + cxgb4_free_atid(ep->com.dev->rdev.lldi.tids, ep->atid); +fail1: + cm_id->rem_ref(cm_id); + c4iw_put_ep(&ep->com); +out: + return err; +} + +static int create_server6(struct c4iw_dev *dev, struct c4iw_listen_ep *ep) +{ + int err; + struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *) + &ep->com.mapped_local_addr; + + c4iw_init_wr_wait(&ep->com.wr_wait); + err = cxgb4_create_server6(ep->com.dev->rdev.lldi.ports[0], + ep->stid, &sin6->sin6_addr, + sin6->sin6_port, + ep->com.dev->rdev.lldi.rxq_ids[0]); + if (!err) + err = c4iw_wait_for_reply(&ep->com.dev->rdev, + &ep->com.wr_wait, + 0, 0, __func__); + else if (err > 0) + err = net_xmit_errno(err); + if (err) + pr_err("cxgb4_create_server6/filter failed err %d stid %d laddr %pI6 lport %d\n", + err, ep->stid, + sin6->sin6_addr.s6_addr, ntohs(sin6->sin6_port)); + return err; +} + +static int create_server4(struct c4iw_dev *dev, struct c4iw_listen_ep *ep) +{ + int err; + struct sockaddr_in *sin = (struct sockaddr_in *) + &ep->com.mapped_local_addr; + + if (dev->rdev.lldi.enable_fw_ofld_conn) { + do { + err = cxgb4_create_server_filter( + ep->com.dev->rdev.lldi.ports[0], ep->stid, + sin->sin_addr.s_addr, sin->sin_port, 0, + ep->com.dev->rdev.lldi.rxq_ids[0], 0, 0); + if (err == -EBUSY) { + set_current_state(TASK_UNINTERRUPTIBLE); + schedule_timeout(usecs_to_jiffies(100)); + } + } while (err == -EBUSY); + } else { + c4iw_init_wr_wait(&ep->com.wr_wait); + err = cxgb4_create_server(ep->com.dev->rdev.lldi.ports[0], + ep->stid, sin->sin_addr.s_addr, sin->sin_port, + 0, ep->com.dev->rdev.lldi.rxq_ids[0]); + if (!err) + err = c4iw_wait_for_reply(&ep->com.dev->rdev, + &ep->com.wr_wait, + 0, 0, __func__); + else if (err > 0) + err = net_xmit_errno(err); + } + if (err) + pr_err("cxgb4_create_server/filter failed err %d stid %d laddr %pI4 lport %d\n" + , err, ep->stid, + &sin->sin_addr, ntohs(sin->sin_port)); + return err; +} + +int c4iw_create_listen(struct iw_cm_id *cm_id, int backlog) +{ + int err = 0; + struct c4iw_dev *dev = to_c4iw_dev(cm_id->device); + struct c4iw_listen_ep *ep; + struct iwpm_dev_data pm_reg_msg; + struct iwpm_sa_data pm_msg; + int iwpm_err = 0; + + might_sleep(); + + ep = alloc_ep(sizeof(*ep), GFP_KERNEL); + if (!ep) { + printk(KERN_ERR MOD "%s - cannot alloc ep.\n", __func__); + err = -ENOMEM; + goto fail1; + } + PDBG("%s ep %p\n", __func__, ep); 
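/*
 * For illustration of the filter-based listen path above: when
 * enable_fw_ofld_conn is set, create_server4() keeps retrying
 * cxgb4_create_server_filter() while it returns -EBUSY, sleeping
 * roughly 100us between attempts.  A minimal, generic sketch of that
 * retry pattern (the helper name and the "op" callback are hypothetical
 * illustrations, not part of this driver) could look like:
 *
 *	static int retry_while_busy(int (*op)(void *arg), void *arg)
 *	{
 *		int err;
 *
 *		do {
 *			err = op(arg);
 *			if (err == -EBUSY) {
 *				set_current_state(TASK_UNINTERRUPTIBLE);
 *				schedule_timeout(usecs_to_jiffies(100));
 *			}
 *		} while (err == -EBUSY);
 *		return err;
 *	}
 */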
+ cm_id->add_ref(cm_id); + ep->com.cm_id = cm_id; + ep->com.dev = dev; + ep->backlog = backlog; + memcpy(&ep->com.local_addr, &cm_id->local_addr, + sizeof(ep->com.local_addr)); + + /* + * Allocate a server TID. + */ + if (dev->rdev.lldi.enable_fw_ofld_conn && + ep->com.local_addr.ss_family == AF_INET) + ep->stid = cxgb4_alloc_sftid(dev->rdev.lldi.tids, + cm_id->local_addr.ss_family, ep); + else + ep->stid = cxgb4_alloc_stid(dev->rdev.lldi.tids, + cm_id->local_addr.ss_family, ep); + + if (ep->stid == -1) { + printk(KERN_ERR MOD "%s - cannot alloc stid.\n", __func__); + err = -ENOMEM; + goto fail2; + } + insert_handle(dev, &dev->stid_idr, ep, ep->stid); + + /* No port mapper available, go with the specified info */ + memcpy(&ep->com.mapped_local_addr, &cm_id->local_addr, + sizeof(ep->com.mapped_local_addr)); + + c4iw_form_reg_msg(dev, &pm_reg_msg); + iwpm_err = iwpm_register_pid(&pm_reg_msg, RDMA_NL_C4IW); + if (iwpm_err) { + PDBG("%s: Port Mapper reg pid fail (err = %d).\n", + __func__, iwpm_err); + } + if (iwpm_valid_pid() && !iwpm_err) { + memcpy(&pm_msg.loc_addr, &ep->com.local_addr, + sizeof(ep->com.local_addr)); + iwpm_err = iwpm_add_mapping(&pm_msg, RDMA_NL_C4IW); + if (iwpm_err) + PDBG("%s: Port Mapper query fail (err = %d).\n", + __func__, iwpm_err); + else + memcpy(&ep->com.mapped_local_addr, + &pm_msg.mapped_loc_addr, + sizeof(ep->com.mapped_local_addr)); + } + if (iwpm_create_mapinfo(&ep->com.local_addr, + &ep->com.mapped_local_addr, RDMA_NL_C4IW)) { + err = -ENOMEM; + goto fail3; + } + print_addr(&ep->com, __func__, "add_mapping/create_mapinfo"); + + set_bit(RELEASE_MAPINFO, &ep->com.flags); + state_set(&ep->com, LISTEN); + if (ep->com.local_addr.ss_family == AF_INET) + err = create_server4(dev, ep); + else + err = create_server6(dev, ep); + if (!err) { + cm_id->provider_data = ep; + goto out; + } + +fail3: + cxgb4_free_stid(ep->com.dev->rdev.lldi.tids, ep->stid, + ep->com.local_addr.ss_family); +fail2: + cm_id->rem_ref(cm_id); + c4iw_put_ep(&ep->com); +fail1: +out: + return err; +} + +int c4iw_destroy_listen(struct iw_cm_id *cm_id) +{ + int err; + struct c4iw_listen_ep *ep = to_listen_ep(cm_id); + + PDBG("%s ep %p\n", __func__, ep); + + might_sleep(); + state_set(&ep->com, DEAD); + if (ep->com.dev->rdev.lldi.enable_fw_ofld_conn && + ep->com.local_addr.ss_family == AF_INET) { + err = cxgb4_remove_server_filter( + ep->com.dev->rdev.lldi.ports[0], ep->stid, + ep->com.dev->rdev.lldi.rxq_ids[0], 0); + } else { + c4iw_init_wr_wait(&ep->com.wr_wait); + err = cxgb4_remove_server( + ep->com.dev->rdev.lldi.ports[0], ep->stid, + ep->com.dev->rdev.lldi.rxq_ids[0], 0); + if (err) + goto done; + err = c4iw_wait_for_reply(&ep->com.dev->rdev, &ep->com.wr_wait, + 0, 0, __func__); + } + remove_handle(ep->com.dev, &ep->com.dev->stid_idr, ep->stid); + cxgb4_free_stid(ep->com.dev->rdev.lldi.tids, ep->stid, + ep->com.local_addr.ss_family); +done: + cm_id->rem_ref(cm_id); + c4iw_put_ep(&ep->com); + return err; +} + +int c4iw_ep_disconnect(struct c4iw_ep *ep, int abrupt, gfp_t gfp) +{ + int ret = 0; + int close = 0; + int fatal = 0; + struct c4iw_rdev *rdev; + + mutex_lock(&ep->com.mutex); + + PDBG("%s ep %p state %s, abrupt %d\n", __func__, ep, + states[ep->com.state], abrupt); + + rdev = &ep->com.dev->rdev; + if (c4iw_fatal_error(rdev)) { + fatal = 1; + close_complete_upcall(ep, -EIO); + ep->com.state = DEAD; + } + switch (ep->com.state) { + case MPA_REQ_WAIT: + case MPA_REQ_SENT: + case MPA_REQ_RCVD: + case MPA_REP_SENT: + case FPDU_MODE: + close = 1; + if (abrupt) + ep->com.state = ABORTING; + 
else { + ep->com.state = CLOSING; + start_ep_timer(ep); + } + set_bit(CLOSE_SENT, &ep->com.flags); + break; + case CLOSING: + if (!test_and_set_bit(CLOSE_SENT, &ep->com.flags)) { + close = 1; + if (abrupt) { + (void)stop_ep_timer(ep); + ep->com.state = ABORTING; + } else + ep->com.state = MORIBUND; + } + break; + case MORIBUND: + case ABORTING: + case DEAD: + PDBG("%s ignoring disconnect ep %p state %u\n", + __func__, ep, ep->com.state); + break; + default: + BUG(); + break; + } + + if (close) { + if (abrupt) { + set_bit(EP_DISC_ABORT, &ep->com.history); + close_complete_upcall(ep, -ECONNRESET); + ret = send_abort(ep, NULL, gfp); + } else { + set_bit(EP_DISC_CLOSE, &ep->com.history); + ret = send_halfclose(ep, gfp); + } + if (ret) + fatal = 1; + } + mutex_unlock(&ep->com.mutex); + if (fatal) + release_ep_resources(ep); + return ret; +} + +static void active_ofld_conn_reply(struct c4iw_dev *dev, struct sk_buff *skb, + struct cpl_fw6_msg_ofld_connection_wr_rpl *req) +{ + struct c4iw_ep *ep; + int atid = be32_to_cpu(req->tid); + + ep = (struct c4iw_ep *)lookup_atid(dev->rdev.lldi.tids, + (__force u32) req->tid); + if (!ep) + return; + + switch (req->retval) { + case FW_ENOMEM: + set_bit(ACT_RETRY_NOMEM, &ep->com.history); + if (ep->retry_count++ < ACT_OPEN_RETRY_COUNT) { + send_fw_act_open_req(ep, atid); + return; + } + case FW_EADDRINUSE: + set_bit(ACT_RETRY_INUSE, &ep->com.history); + if (ep->retry_count++ < ACT_OPEN_RETRY_COUNT) { + send_fw_act_open_req(ep, atid); + return; + } + break; + default: + pr_info("%s unexpected ofld conn wr retval %d\n", + __func__, req->retval); + break; + } + pr_err("active ofld_connect_wr failure %d atid %d\n", + req->retval, atid); + mutex_lock(&dev->rdev.stats.lock); + dev->rdev.stats.act_ofld_conn_fails++; + mutex_unlock(&dev->rdev.stats.lock); + connect_reply_upcall(ep, status2errno(req->retval)); + state_set(&ep->com, DEAD); + remove_handle(dev, &dev->atid_idr, atid); + cxgb4_free_atid(dev->rdev.lldi.tids, atid); + dst_release(ep->dst); + cxgb4_l2t_release(ep->l2t); + c4iw_put_ep(&ep->com); +} + +static void passive_ofld_conn_reply(struct c4iw_dev *dev, struct sk_buff *skb, + struct cpl_fw6_msg_ofld_connection_wr_rpl *req) +{ + struct sk_buff *rpl_skb; + struct cpl_pass_accept_req *cpl; + int ret; + + rpl_skb = (struct sk_buff *)(unsigned long)req->cookie; + BUG_ON(!rpl_skb); + if (req->retval) { + PDBG("%s passive open failure %d\n", __func__, req->retval); + mutex_lock(&dev->rdev.stats.lock); + dev->rdev.stats.pas_ofld_conn_fails++; + mutex_unlock(&dev->rdev.stats.lock); + kfree_skb(rpl_skb); + } else { + cpl = (struct cpl_pass_accept_req *)cplhdr(rpl_skb); + OPCODE_TID(cpl) = htonl(MK_OPCODE_TID(CPL_PASS_ACCEPT_REQ, + (__force u32) htonl( + (__force u32) req->tid))); + ret = pass_accept_req(dev, rpl_skb); + if (!ret) + kfree_skb(rpl_skb); + } + return; +} + +static int deferred_fw6_msg(struct c4iw_dev *dev, struct sk_buff *skb) +{ + struct cpl_fw6_msg *rpl = cplhdr(skb); + struct cpl_fw6_msg_ofld_connection_wr_rpl *req; + + switch (rpl->type) { + case FW6_TYPE_CQE: + c4iw_ev_dispatch(dev, (struct t4_cqe *)&rpl->data[0]); + break; + case FW6_TYPE_OFLD_CONNECTION_WR_RPL: + req = (struct cpl_fw6_msg_ofld_connection_wr_rpl *)rpl->data; + switch (req->t_state) { + case TCP_SYN_SENT: + active_ofld_conn_reply(dev, skb, req); + break; + case TCP_SYN_RECV: + passive_ofld_conn_reply(dev, skb, req); + break; + default: + pr_err("%s unexpected ofld conn wr state %d\n", + __func__, req->t_state); + break; + } + break; + } + return 0; +} + +static void 
build_cpl_pass_accept_req(struct sk_buff *skb, int stid , u8 tos) +{ + u32 l2info; + u16 vlantag, len, hdr_len, eth_hdr_len; + u8 intf; + struct cpl_rx_pkt *cpl = cplhdr(skb); + struct cpl_pass_accept_req *req; + struct tcp_options_received tmp_opt; + struct c4iw_dev *dev; + + dev = *((struct c4iw_dev **) (skb->cb + sizeof(void *))); + /* Store values from cpl_rx_pkt in temporary location. */ + vlantag = (__force u16) cpl->vlan; + len = (__force u16) cpl->len; + l2info = (__force u32) cpl->l2info; + hdr_len = (__force u16) cpl->hdr_len; + intf = cpl->iff; + + __skb_pull(skb, sizeof(*req) + sizeof(struct rss_header)); + + /* + * We need to parse the TCP options from SYN packet. + * to generate cpl_pass_accept_req. + */ + memset(&tmp_opt, 0, sizeof(tmp_opt)); + tcp_clear_options(&tmp_opt); + tcp_parse_options(skb, &tmp_opt, 0, NULL); + + req = (struct cpl_pass_accept_req *)__skb_push(skb, sizeof(*req)); + memset(req, 0, sizeof(*req)); + req->l2info = cpu_to_be16(SYN_INTF_V(intf) | + SYN_MAC_IDX_V(RX_MACIDX_G( + (__force int) htonl(l2info))) | + SYN_XACT_MATCH_F); + eth_hdr_len = is_t4(dev->rdev.lldi.adapter_type) ? + RX_ETHHDR_LEN_G((__force int)htonl(l2info)) : + RX_T5_ETHHDR_LEN_G((__force int)htonl(l2info)); + req->hdr_len = cpu_to_be32(SYN_RX_CHAN_V(RX_CHAN_G( + (__force int) htonl(l2info))) | + TCP_HDR_LEN_V(RX_TCPHDR_LEN_G( + (__force int) htons(hdr_len))) | + IP_HDR_LEN_V(RX_IPHDR_LEN_G( + (__force int) htons(hdr_len))) | + ETH_HDR_LEN_V(RX_ETHHDR_LEN_G(eth_hdr_len))); + req->vlan = (__force __be16) vlantag; + req->len = (__force __be16) len; + req->tos_stid = cpu_to_be32(PASS_OPEN_TID_V(stid) | + PASS_OPEN_TOS_V(tos)); + req->tcpopt.mss = htons(tmp_opt.mss_clamp); + if (tmp_opt.wscale_ok) + req->tcpopt.wsf = tmp_opt.snd_wscale; + req->tcpopt.tstamp = tmp_opt.saw_tstamp; + if (tmp_opt.sack_ok) + req->tcpopt.sack = 1; + OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_PASS_ACCEPT_REQ, 0)); + return; +} + +static void send_fw_pass_open_req(struct c4iw_dev *dev, struct sk_buff *skb, + __be32 laddr, __be16 lport, + __be32 raddr, __be16 rport, + u32 rcv_isn, u32 filter, u16 window, + u32 rss_qid, u8 port_id) +{ + struct sk_buff *req_skb; + struct fw_ofld_connection_wr *req; + struct cpl_pass_accept_req *cpl = cplhdr(skb); + int ret; + + req_skb = alloc_skb(sizeof(struct fw_ofld_connection_wr), GFP_KERNEL); + req = (struct fw_ofld_connection_wr *)__skb_put(req_skb, sizeof(*req)); + memset(req, 0, sizeof(*req)); + req->op_compl = htonl(WR_OP_V(FW_OFLD_CONNECTION_WR) | FW_WR_COMPL_F); + req->len16_pkd = htonl(FW_WR_LEN16_V(DIV_ROUND_UP(sizeof(*req), 16))); + req->le.version_cpl = htonl(FW_OFLD_CONNECTION_WR_CPL_F); + req->le.filter = (__force __be32) filter; + req->le.lport = lport; + req->le.pport = rport; + req->le.u.ipv4.lip = laddr; + req->le.u.ipv4.pip = raddr; + req->tcb.rcv_nxt = htonl(rcv_isn + 1); + req->tcb.rcv_adv = htons(window); + req->tcb.t_state_to_astid = + htonl(FW_OFLD_CONNECTION_WR_T_STATE_V(TCP_SYN_RECV) | + FW_OFLD_CONNECTION_WR_RCV_SCALE_V(cpl->tcpopt.wsf) | + FW_OFLD_CONNECTION_WR_ASTID_V( + PASS_OPEN_TID_G(ntohl(cpl->tos_stid)))); + + /* + * We store the qid in opt2 which will be used by the firmware + * to send us the wr response. + */ + req->tcb.opt2 = htonl(RSS_QUEUE_V(rss_qid)); + + /* + * We initialize the MSS index in TCB to 0xF. + * So that when driver sends cpl_pass_accept_rpl + * TCB picks up the correct value. If this was 0 + * TP will ignore any value > 0 for MSS index. 
+ */ + req->tcb.opt0 = cpu_to_be64(MSS_IDX_V(0xF)); + req->cookie = (uintptr_t)skb; + + set_wr_txq(req_skb, CPL_PRIORITY_CONTROL, port_id); + ret = cxgb4_ofld_send(dev->rdev.lldi.ports[0], req_skb); + if (ret < 0) { + pr_err("%s - cxgb4_ofld_send error %d - dropping\n", __func__, + ret); + kfree_skb(skb); + kfree_skb(req_skb); + } +} + +/* + * Handler for CPL_RX_PKT message. Need to handle cpl_rx_pkt + * messages when a filter is being used instead of server to + * redirect a syn packet. When packets hit filter they are redirected + * to the offload queue and driver tries to establish the connection + * using firmware work request. + */ +static int rx_pkt(struct c4iw_dev *dev, struct sk_buff *skb) +{ + int stid; + unsigned int filter; + struct ethhdr *eh = NULL; + struct vlan_ethhdr *vlan_eh = NULL; + struct iphdr *iph; + struct tcphdr *tcph; + struct rss_header *rss = (void *)skb->data; + struct cpl_rx_pkt *cpl = (void *)skb->data; + struct cpl_pass_accept_req *req = (void *)(rss + 1); + struct l2t_entry *e; + struct dst_entry *dst; + struct c4iw_ep *lep; + u16 window; + struct port_info *pi; + struct net_device *pdev; + u16 rss_qid, eth_hdr_len; + int step; + u32 tx_chan; + struct neighbour *neigh; + + /* Drop all non-SYN packets */ + if (!(cpl->l2info & cpu_to_be32(RXF_SYN_F))) + goto reject; + + /* + * Drop all packets which did not hit the filter. + * Unlikely to happen. + */ + if (!(rss->filter_hit && rss->filter_tid)) + goto reject; + + /* + * Calculate the server tid from filter hit index from cpl_rx_pkt. + */ + stid = (__force int) cpu_to_be32((__force u32) rss->hash_val); + + lep = (struct c4iw_ep *)lookup_stid(dev->rdev.lldi.tids, stid); + if (!lep) { + PDBG("%s connect request on invalid stid %d\n", __func__, stid); + goto reject; + } + + eth_hdr_len = is_t4(dev->rdev.lldi.adapter_type) ? 
+ RX_ETHHDR_LEN_G(htonl(cpl->l2info)) : + RX_T5_ETHHDR_LEN_G(htonl(cpl->l2info)); + if (eth_hdr_len == ETH_HLEN) { + eh = (struct ethhdr *)(req + 1); + iph = (struct iphdr *)(eh + 1); + } else { + vlan_eh = (struct vlan_ethhdr *)(req + 1); + iph = (struct iphdr *)(vlan_eh + 1); + skb->vlan_tci = ntohs(cpl->vlan); + } + + if (iph->version != 0x4) + goto reject; + + tcph = (struct tcphdr *)(iph + 1); + skb_set_network_header(skb, (void *)iph - (void *)rss); + skb_set_transport_header(skb, (void *)tcph - (void *)rss); + skb_get(skb); + + PDBG("%s lip 0x%x lport %u pip 0x%x pport %u tos %d\n", __func__, + ntohl(iph->daddr), ntohs(tcph->dest), ntohl(iph->saddr), + ntohs(tcph->source), iph->tos); + + dst = find_route(dev, iph->daddr, iph->saddr, tcph->dest, tcph->source, + iph->tos); + if (!dst) { + pr_err("%s - failed to find dst entry!\n", + __func__); + goto reject; + } + neigh = dst_neigh_lookup_skb(dst, skb); + + if (!neigh) { + pr_err("%s - failed to allocate neigh!\n", + __func__); + goto free_dst; + } + + if (neigh->dev->flags & IFF_LOOPBACK) { + pdev = ip_dev_find(&init_net, iph->daddr); + e = cxgb4_l2t_get(dev->rdev.lldi.l2t, neigh, + pdev, 0); + pi = (struct port_info *)netdev_priv(pdev); + tx_chan = cxgb4_port_chan(pdev); + dev_put(pdev); + } else { + pdev = get_real_dev(neigh->dev); + e = cxgb4_l2t_get(dev->rdev.lldi.l2t, neigh, + pdev, 0); + pi = (struct port_info *)netdev_priv(pdev); + tx_chan = cxgb4_port_chan(pdev); + } + neigh_release(neigh); + if (!e) { + pr_err("%s - failed to allocate l2t entry!\n", + __func__); + goto free_dst; + } + + step = dev->rdev.lldi.nrxq / dev->rdev.lldi.nchan; + rss_qid = dev->rdev.lldi.rxq_ids[pi->port_id * step]; + window = (__force u16) htons((__force u16)tcph->window); + + /* Calcuate filter portion for LE region. */ + filter = (__force unsigned int) cpu_to_be32(cxgb4_select_ntuple( + dev->rdev.lldi.ports[0], + e)); + + /* + * Synthesize the cpl_pass_accept_req. We have everything except the + * TID. Once firmware sends a reply with TID we update the TID field + * in cpl and pass it through the regular cpl_pass_accept_req path. + */ + build_cpl_pass_accept_req(skb, stid, iph->tos); + send_fw_pass_open_req(dev, skb, iph->daddr, tcph->dest, iph->saddr, + tcph->source, ntohl(tcph->seq), filter, window, + rss_qid, pi->port_id); + cxgb4_l2t_release(e); +free_dst: + dst_release(dst); +reject: + return 0; +} + +/* + * These are the real handlers that are called from a + * work queue. 
+ */ +static c4iw_handler_func work_handlers[NUM_CPL_CMDS] = { + [CPL_ACT_ESTABLISH] = act_establish, + [CPL_ACT_OPEN_RPL] = act_open_rpl, + [CPL_RX_DATA] = rx_data, + [CPL_ABORT_RPL_RSS] = abort_rpl, + [CPL_ABORT_RPL] = abort_rpl, + [CPL_PASS_OPEN_RPL] = pass_open_rpl, + [CPL_CLOSE_LISTSRV_RPL] = close_listsrv_rpl, + [CPL_PASS_ACCEPT_REQ] = pass_accept_req, + [CPL_PASS_ESTABLISH] = pass_establish, + [CPL_PEER_CLOSE] = peer_close, + [CPL_ABORT_REQ_RSS] = peer_abort, + [CPL_CLOSE_CON_RPL] = close_con_rpl, + [CPL_RDMA_TERMINATE] = terminate, + [CPL_FW4_ACK] = fw4_ack, + [CPL_FW6_MSG] = deferred_fw6_msg, + [CPL_RX_PKT] = rx_pkt +}; + +static void process_timeout(struct c4iw_ep *ep) +{ + struct c4iw_qp_attributes attrs; + int abort = 1; + + mutex_lock(&ep->com.mutex); + PDBG("%s ep %p tid %u state %d\n", __func__, ep, ep->hwtid, + ep->com.state); + set_bit(TIMEDOUT, &ep->com.history); + switch (ep->com.state) { + case MPA_REQ_SENT: + __state_set(&ep->com, ABORTING); + connect_reply_upcall(ep, -ETIMEDOUT); + break; + case MPA_REQ_WAIT: + __state_set(&ep->com, ABORTING); + break; + case CLOSING: + case MORIBUND: + if (ep->com.cm_id && ep->com.qp) { + attrs.next_state = C4IW_QP_STATE_ERROR; + c4iw_modify_qp(ep->com.qp->rhp, + ep->com.qp, C4IW_QP_ATTR_NEXT_STATE, + &attrs, 1); + } + __state_set(&ep->com, ABORTING); + close_complete_upcall(ep, -ETIMEDOUT); + break; + case ABORTING: + case DEAD: + + /* + * These states are expected if the ep timed out at the same + * time as another thread was calling stop_ep_timer(). + * So we silently do nothing for these states. + */ + abort = 0; + break; + default: + WARN(1, "%s unexpected state ep %p tid %u state %u\n", + __func__, ep, ep->hwtid, ep->com.state); + abort = 0; + } + if (abort) + abort_connection(ep, NULL, GFP_KERNEL); + mutex_unlock(&ep->com.mutex); + c4iw_put_ep(&ep->com); +} + +static void process_timedout_eps(void) +{ + struct c4iw_ep *ep; + + spin_lock_irq(&timeout_lock); + while (!list_empty(&timeout_list)) { + struct list_head *tmp; + + tmp = timeout_list.next; + list_del(tmp); + tmp->next = NULL; + tmp->prev = NULL; + spin_unlock_irq(&timeout_lock); + ep = list_entry(tmp, struct c4iw_ep, entry); + process_timeout(ep); + spin_lock_irq(&timeout_lock); + } + spin_unlock_irq(&timeout_lock); +} + +static void process_work(struct work_struct *work) +{ + struct sk_buff *skb = NULL; + struct c4iw_dev *dev; + struct cpl_act_establish *rpl; + unsigned int opcode; + int ret; + + process_timedout_eps(); + while ((skb = skb_dequeue(&rxq))) { + rpl = cplhdr(skb); + dev = *((struct c4iw_dev **) (skb->cb + sizeof(void *))); + opcode = rpl->ot.opcode; + + BUG_ON(!work_handlers[opcode]); + ret = work_handlers[opcode](dev, skb); + if (!ret) + kfree_skb(skb); + process_timedout_eps(); + } +} + +static DECLARE_WORK(skb_work, process_work); + +static void ep_timeout(unsigned long arg) +{ + struct c4iw_ep *ep = (struct c4iw_ep *)arg; + int kickit = 0; + + spin_lock(&timeout_lock); + if (!test_and_set_bit(TIMEOUT, &ep->com.flags)) { + /* + * Only insert if it is not already on the list. + */ + if (!ep->entry.next) { + list_add_tail(&ep->entry, &timeout_list); + kickit = 1; + } + } + spin_unlock(&timeout_lock); + if (kickit) + queue_work(workq, &skb_work); +} + +/* + * All the CM events are handled on a work queue to have a safe context. + */ +static int sched(struct c4iw_dev *dev, struct sk_buff *skb) +{ + + /* + * Save dev in the skb->cb area. 
+ */ + *((struct c4iw_dev **) (skb->cb + sizeof(void *))) = dev; + + /* + * Queue the skb and schedule the worker thread. + */ + skb_queue_tail(&rxq, skb); + queue_work(workq, &skb_work); + return 0; +} + +static int set_tcb_rpl(struct c4iw_dev *dev, struct sk_buff *skb) +{ + struct cpl_set_tcb_rpl *rpl = cplhdr(skb); + + if (rpl->status != CPL_ERR_NONE) { + printk(KERN_ERR MOD "Unexpected SET_TCB_RPL status %u " + "for tid %u\n", rpl->status, GET_TID(rpl)); + } + kfree_skb(skb); + return 0; +} + +static int fw6_msg(struct c4iw_dev *dev, struct sk_buff *skb) +{ + struct cpl_fw6_msg *rpl = cplhdr(skb); + struct c4iw_wr_wait *wr_waitp; + int ret; + + PDBG("%s type %u\n", __func__, rpl->type); + + switch (rpl->type) { + case FW6_TYPE_WR_RPL: + ret = (int)((be64_to_cpu(rpl->data[0]) >> 8) & 0xff); + wr_waitp = (struct c4iw_wr_wait *)(__force unsigned long) rpl->data[1]; + PDBG("%s wr_waitp %p ret %u\n", __func__, wr_waitp, ret); + if (wr_waitp) + c4iw_wake_up(wr_waitp, ret ? -ret : 0); + kfree_skb(skb); + break; + case FW6_TYPE_CQE: + case FW6_TYPE_OFLD_CONNECTION_WR_RPL: + sched(dev, skb); + break; + default: + printk(KERN_ERR MOD "%s unexpected fw6 msg type %u\n", __func__, + rpl->type); + kfree_skb(skb); + break; + } + return 0; +} + +static int peer_abort_intr(struct c4iw_dev *dev, struct sk_buff *skb) +{ + struct cpl_abort_req_rss *req = cplhdr(skb); + struct c4iw_ep *ep; + struct tid_info *t = dev->rdev.lldi.tids; + unsigned int tid = GET_TID(req); + + ep = lookup_tid(t, tid); + if (!ep) { + printk(KERN_WARNING MOD + "Abort on non-existent endpoint, tid %d\n", tid); + kfree_skb(skb); + return 0; + } + if (is_neg_adv(req->status)) { + PDBG("%s Negative advice on abort- tid %u status %d (%s)\n", + __func__, ep->hwtid, req->status, + neg_adv_str(req->status)); + ep->stats.abort_neg_adv++; + dev->rdev.stats.neg_adv++; + kfree_skb(skb); + return 0; + } + PDBG("%s ep %p tid %u state %u\n", __func__, ep, ep->hwtid, + ep->com.state); + + /* + * Wake up any threads in rdma_init() or rdma_fini(). + * However, if we are on MPAv2 and want to retry with MPAv1 + * then, don't wake up yet. + */ + if (mpa_rev == 2 && !ep->tried_with_mpa_v1) { + if (ep->com.state != MPA_REQ_SENT) + c4iw_wake_up(&ep->com.wr_wait, -ECONNRESET); + } else + c4iw_wake_up(&ep->com.wr_wait, -ECONNRESET); + sched(dev, skb); + return 0; +} + +/* + * Most upcalls from the T4 Core go to sched() to + * schedule the processing on a work queue. 
+ */ +c4iw_handler_func c4iw_handlers[NUM_CPL_CMDS] = { + [CPL_ACT_ESTABLISH] = sched, + [CPL_ACT_OPEN_RPL] = sched, + [CPL_RX_DATA] = sched, + [CPL_ABORT_RPL_RSS] = sched, + [CPL_ABORT_RPL] = sched, + [CPL_PASS_OPEN_RPL] = sched, + [CPL_CLOSE_LISTSRV_RPL] = sched, + [CPL_PASS_ACCEPT_REQ] = sched, + [CPL_PASS_ESTABLISH] = sched, + [CPL_PEER_CLOSE] = sched, + [CPL_CLOSE_CON_RPL] = sched, + [CPL_ABORT_REQ_RSS] = peer_abort_intr, + [CPL_RDMA_TERMINATE] = sched, + [CPL_FW4_ACK] = sched, + [CPL_SET_TCB_RPL] = set_tcb_rpl, + [CPL_FW6_MSG] = fw6_msg, + [CPL_RX_PKT] = sched +}; + +int __init c4iw_cm_init(void) +{ + spin_lock_init(&timeout_lock); + skb_queue_head_init(&rxq); + + workq = create_singlethread_workqueue("iw_cxgb4"); + if (!workq) + return -ENOMEM; + + return 0; +} + +void c4iw_cm_term(void) +{ + WARN_ON(!list_empty(&timeout_list)); + flush_workqueue(workq); + destroy_workqueue(workq); +} diff --git a/kernel/drivers/infiniband/hw/cxgb4/cq.c b/kernel/drivers/infiniband/hw/cxgb4/cq.c new file mode 100644 index 000000000..68ddb3710 --- /dev/null +++ b/kernel/drivers/infiniband/hw/cxgb4/cq.c @@ -0,0 +1,1015 @@ +/* + * Copyright (c) 2009-2010 Chelsio, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include "iw_cxgb4.h" + +static int destroy_cq(struct c4iw_rdev *rdev, struct t4_cq *cq, + struct c4iw_dev_ucontext *uctx) +{ + struct fw_ri_res_wr *res_wr; + struct fw_ri_res *res; + int wr_len; + struct c4iw_wr_wait wr_wait; + struct sk_buff *skb; + int ret; + + wr_len = sizeof *res_wr + sizeof *res; + skb = alloc_skb(wr_len, GFP_KERNEL); + if (!skb) + return -ENOMEM; + set_wr_txq(skb, CPL_PRIORITY_CONTROL, 0); + + res_wr = (struct fw_ri_res_wr *)__skb_put(skb, wr_len); + memset(res_wr, 0, wr_len); + res_wr->op_nres = cpu_to_be32( + FW_WR_OP_V(FW_RI_RES_WR) | + FW_RI_RES_WR_NRES_V(1) | + FW_WR_COMPL_F); + res_wr->len16_pkd = cpu_to_be32(DIV_ROUND_UP(wr_len, 16)); + res_wr->cookie = (uintptr_t)&wr_wait; + res = res_wr->res; + res->u.cq.restype = FW_RI_RES_TYPE_CQ; + res->u.cq.op = FW_RI_RES_OP_RESET; + res->u.cq.iqid = cpu_to_be32(cq->cqid); + + c4iw_init_wr_wait(&wr_wait); + ret = c4iw_ofld_send(rdev, skb); + if (!ret) { + ret = c4iw_wait_for_reply(rdev, &wr_wait, 0, 0, __func__); + } + + kfree(cq->sw_queue); + dma_free_coherent(&(rdev->lldi.pdev->dev), + cq->memsize, cq->queue, + dma_unmap_addr(cq, mapping)); + c4iw_put_cqid(rdev, cq->cqid, uctx); + return ret; +} + +static int create_cq(struct c4iw_rdev *rdev, struct t4_cq *cq, + struct c4iw_dev_ucontext *uctx) +{ + struct fw_ri_res_wr *res_wr; + struct fw_ri_res *res; + int wr_len; + int user = (uctx != &rdev->uctx); + struct c4iw_wr_wait wr_wait; + int ret; + struct sk_buff *skb; + + cq->cqid = c4iw_get_cqid(rdev, uctx); + if (!cq->cqid) { + ret = -ENOMEM; + goto err1; + } + + if (!user) { + cq->sw_queue = kzalloc(cq->memsize, GFP_KERNEL); + if (!cq->sw_queue) { + ret = -ENOMEM; + goto err2; + } + } + cq->queue = dma_alloc_coherent(&rdev->lldi.pdev->dev, cq->memsize, + &cq->dma_addr, GFP_KERNEL); + if (!cq->queue) { + ret = -ENOMEM; + goto err3; + } + dma_unmap_addr_set(cq, mapping, cq->dma_addr); + memset(cq->queue, 0, cq->memsize); + + /* build fw_ri_res_wr */ + wr_len = sizeof *res_wr + sizeof *res; + + skb = alloc_skb(wr_len, GFP_KERNEL); + if (!skb) { + ret = -ENOMEM; + goto err4; + } + set_wr_txq(skb, CPL_PRIORITY_CONTROL, 0); + + res_wr = (struct fw_ri_res_wr *)__skb_put(skb, wr_len); + memset(res_wr, 0, wr_len); + res_wr->op_nres = cpu_to_be32( + FW_WR_OP_V(FW_RI_RES_WR) | + FW_RI_RES_WR_NRES_V(1) | + FW_WR_COMPL_F); + res_wr->len16_pkd = cpu_to_be32(DIV_ROUND_UP(wr_len, 16)); + res_wr->cookie = (uintptr_t)&wr_wait; + res = res_wr->res; + res->u.cq.restype = FW_RI_RES_TYPE_CQ; + res->u.cq.op = FW_RI_RES_OP_WRITE; + res->u.cq.iqid = cpu_to_be32(cq->cqid); + res->u.cq.iqandst_to_iqandstindex = cpu_to_be32( + FW_RI_RES_WR_IQANUS_V(0) | + FW_RI_RES_WR_IQANUD_V(1) | + FW_RI_RES_WR_IQANDST_F | + FW_RI_RES_WR_IQANDSTINDEX_V( + rdev->lldi.ciq_ids[cq->vector])); + res->u.cq.iqdroprss_to_iqesize = cpu_to_be16( + FW_RI_RES_WR_IQDROPRSS_F | + FW_RI_RES_WR_IQPCIECH_V(2) | + FW_RI_RES_WR_IQINTCNTTHRESH_V(0) | + FW_RI_RES_WR_IQO_F | + FW_RI_RES_WR_IQESIZE_V(1)); + res->u.cq.iqsize = cpu_to_be16(cq->size); + res->u.cq.iqaddr = cpu_to_be64(cq->dma_addr); + + c4iw_init_wr_wait(&wr_wait); + + ret = c4iw_ofld_send(rdev, skb); + if (ret) + goto err4; + PDBG("%s wait_event wr_wait %p\n", __func__, &wr_wait); + ret = c4iw_wait_for_reply(rdev, &wr_wait, 0, 0, __func__); + if (ret) + goto err4; + + cq->gen = 1; + cq->rdev = rdev; + if (user) { + u32 off = (cq->cqid << rdev->cqshift) & PAGE_MASK; + + cq->ugts = (u64)rdev->bar2_pa + off; + } else if (is_t4(rdev->lldi.adapter_type)) { + cq->gts = rdev->lldi.gts_reg; + cq->qid_mask = -1U; + } 
else { + u32 off = ((cq->cqid << rdev->cqshift) & PAGE_MASK) + 12; + + cq->gts = rdev->bar2_kva + off; + cq->qid_mask = rdev->qpmask; + } + return 0; +err4: + dma_free_coherent(&rdev->lldi.pdev->dev, cq->memsize, cq->queue, + dma_unmap_addr(cq, mapping)); +err3: + kfree(cq->sw_queue); +err2: + c4iw_put_cqid(rdev, cq->cqid, uctx); +err1: + return ret; +} + +static void insert_recv_cqe(struct t4_wq *wq, struct t4_cq *cq) +{ + struct t4_cqe cqe; + + PDBG("%s wq %p cq %p sw_cidx %u sw_pidx %u\n", __func__, + wq, cq, cq->sw_cidx, cq->sw_pidx); + memset(&cqe, 0, sizeof(cqe)); + cqe.header = cpu_to_be32(CQE_STATUS_V(T4_ERR_SWFLUSH) | + CQE_OPCODE_V(FW_RI_SEND) | + CQE_TYPE_V(0) | + CQE_SWCQE_V(1) | + CQE_QPID_V(wq->sq.qid)); + cqe.bits_type_ts = cpu_to_be64(CQE_GENBIT_V((u64)cq->gen)); + cq->sw_queue[cq->sw_pidx] = cqe; + t4_swcq_produce(cq); +} + +int c4iw_flush_rq(struct t4_wq *wq, struct t4_cq *cq, int count) +{ + int flushed = 0; + int in_use = wq->rq.in_use - count; + + BUG_ON(in_use < 0); + PDBG("%s wq %p cq %p rq.in_use %u skip count %u\n", __func__, + wq, cq, wq->rq.in_use, count); + while (in_use--) { + insert_recv_cqe(wq, cq); + flushed++; + } + return flushed; +} + +static void insert_sq_cqe(struct t4_wq *wq, struct t4_cq *cq, + struct t4_swsqe *swcqe) +{ + struct t4_cqe cqe; + + PDBG("%s wq %p cq %p sw_cidx %u sw_pidx %u\n", __func__, + wq, cq, cq->sw_cidx, cq->sw_pidx); + memset(&cqe, 0, sizeof(cqe)); + cqe.header = cpu_to_be32(CQE_STATUS_V(T4_ERR_SWFLUSH) | + CQE_OPCODE_V(swcqe->opcode) | + CQE_TYPE_V(1) | + CQE_SWCQE_V(1) | + CQE_QPID_V(wq->sq.qid)); + CQE_WRID_SQ_IDX(&cqe) = swcqe->idx; + cqe.bits_type_ts = cpu_to_be64(CQE_GENBIT_V((u64)cq->gen)); + cq->sw_queue[cq->sw_pidx] = cqe; + t4_swcq_produce(cq); +} + +static void advance_oldest_read(struct t4_wq *wq); + +int c4iw_flush_sq(struct c4iw_qp *qhp) +{ + int flushed = 0; + struct t4_wq *wq = &qhp->wq; + struct c4iw_cq *chp = to_c4iw_cq(qhp->ibqp.send_cq); + struct t4_cq *cq = &chp->cq; + int idx; + struct t4_swsqe *swsqe; + + if (wq->sq.flush_cidx == -1) + wq->sq.flush_cidx = wq->sq.cidx; + idx = wq->sq.flush_cidx; + BUG_ON(idx >= wq->sq.size); + while (idx != wq->sq.pidx) { + swsqe = &wq->sq.sw_sq[idx]; + BUG_ON(swsqe->flushed); + swsqe->flushed = 1; + insert_sq_cqe(wq, cq, swsqe); + if (wq->sq.oldest_read == swsqe) { + BUG_ON(swsqe->opcode != FW_RI_READ_REQ); + advance_oldest_read(wq); + } + flushed++; + if (++idx == wq->sq.size) + idx = 0; + } + wq->sq.flush_cidx += flushed; + if (wq->sq.flush_cidx >= wq->sq.size) + wq->sq.flush_cidx -= wq->sq.size; + return flushed; +} + +static void flush_completed_wrs(struct t4_wq *wq, struct t4_cq *cq) +{ + struct t4_swsqe *swsqe; + int cidx; + + if (wq->sq.flush_cidx == -1) + wq->sq.flush_cidx = wq->sq.cidx; + cidx = wq->sq.flush_cidx; + BUG_ON(cidx > wq->sq.size); + + while (cidx != wq->sq.pidx) { + swsqe = &wq->sq.sw_sq[cidx]; + if (!swsqe->signaled) { + if (++cidx == wq->sq.size) + cidx = 0; + } else if (swsqe->complete) { + + BUG_ON(swsqe->flushed); + + /* + * Insert this completed cqe into the swcq. 
+ */ + PDBG("%s moving cqe into swcq sq idx %u cq idx %u\n", + __func__, cidx, cq->sw_pidx); + swsqe->cqe.header |= htonl(CQE_SWCQE_V(1)); + cq->sw_queue[cq->sw_pidx] = swsqe->cqe; + t4_swcq_produce(cq); + swsqe->flushed = 1; + if (++cidx == wq->sq.size) + cidx = 0; + wq->sq.flush_cidx = cidx; + } else + break; + } +} + +static void create_read_req_cqe(struct t4_wq *wq, struct t4_cqe *hw_cqe, + struct t4_cqe *read_cqe) +{ + read_cqe->u.scqe.cidx = wq->sq.oldest_read->idx; + read_cqe->len = htonl(wq->sq.oldest_read->read_len); + read_cqe->header = htonl(CQE_QPID_V(CQE_QPID(hw_cqe)) | + CQE_SWCQE_V(SW_CQE(hw_cqe)) | + CQE_OPCODE_V(FW_RI_READ_REQ) | + CQE_TYPE_V(1)); + read_cqe->bits_type_ts = hw_cqe->bits_type_ts; +} + +static void advance_oldest_read(struct t4_wq *wq) +{ + + u32 rptr = wq->sq.oldest_read - wq->sq.sw_sq + 1; + + if (rptr == wq->sq.size) + rptr = 0; + while (rptr != wq->sq.pidx) { + wq->sq.oldest_read = &wq->sq.sw_sq[rptr]; + + if (wq->sq.oldest_read->opcode == FW_RI_READ_REQ) + return; + if (++rptr == wq->sq.size) + rptr = 0; + } + wq->sq.oldest_read = NULL; +} + +/* + * Move all CQEs from the HWCQ into the SWCQ. + * Deal with out-of-order and/or completions that complete + * prior unsignalled WRs. + */ +void c4iw_flush_hw_cq(struct c4iw_cq *chp) +{ + struct t4_cqe *hw_cqe, *swcqe, read_cqe; + struct c4iw_qp *qhp; + struct t4_swsqe *swsqe; + int ret; + + PDBG("%s cqid 0x%x\n", __func__, chp->cq.cqid); + ret = t4_next_hw_cqe(&chp->cq, &hw_cqe); + + /* + * This logic is similar to poll_cq(), but not quite the same + * unfortunately. Need to move pertinent HW CQEs to the SW CQ but + * also do any translation magic that poll_cq() normally does. + */ + while (!ret) { + qhp = get_qhp(chp->rhp, CQE_QPID(hw_cqe)); + + /* + * drop CQEs with no associated QP + */ + if (qhp == NULL) + goto next_cqe; + + if (CQE_OPCODE(hw_cqe) == FW_RI_TERMINATE) + goto next_cqe; + + if (CQE_OPCODE(hw_cqe) == FW_RI_READ_RESP) { + + /* If we have reached here because of async + * event or other error, and have egress error + * then drop + */ + if (CQE_TYPE(hw_cqe) == 1) + goto next_cqe; + + /* drop peer2peer RTR reads. + */ + if (CQE_WRID_STAG(hw_cqe) == 1) + goto next_cqe; + + /* + * Eat completions for unsignaled read WRs. + */ + if (!qhp->wq.sq.oldest_read->signaled) { + advance_oldest_read(&qhp->wq); + goto next_cqe; + } + + /* + * Don't write to the HWCQ, create a new read req CQE + * in local memory and move it into the swcq. + */ + create_read_req_cqe(&qhp->wq, hw_cqe, &read_cqe); + hw_cqe = &read_cqe; + advance_oldest_read(&qhp->wq); + } + + /* if its a SQ completion, then do the magic to move all the + * unsignaled and now in-order completions into the swcq. 
+ */ + if (SQ_TYPE(hw_cqe)) { + swsqe = &qhp->wq.sq.sw_sq[CQE_WRID_SQ_IDX(hw_cqe)]; + swsqe->cqe = *hw_cqe; + swsqe->complete = 1; + flush_completed_wrs(&qhp->wq, &chp->cq); + } else { + swcqe = &chp->cq.sw_queue[chp->cq.sw_pidx]; + *swcqe = *hw_cqe; + swcqe->header |= cpu_to_be32(CQE_SWCQE_V(1)); + t4_swcq_produce(&chp->cq); + } +next_cqe: + t4_hwcq_consume(&chp->cq); + ret = t4_next_hw_cqe(&chp->cq, &hw_cqe); + } +} + +static int cqe_completes_wr(struct t4_cqe *cqe, struct t4_wq *wq) +{ + if (CQE_OPCODE(cqe) == FW_RI_TERMINATE) + return 0; + + if ((CQE_OPCODE(cqe) == FW_RI_RDMA_WRITE) && RQ_TYPE(cqe)) + return 0; + + if ((CQE_OPCODE(cqe) == FW_RI_READ_RESP) && SQ_TYPE(cqe)) + return 0; + + if (CQE_SEND_OPCODE(cqe) && RQ_TYPE(cqe) && t4_rq_empty(wq)) + return 0; + return 1; +} + +void c4iw_count_rcqes(struct t4_cq *cq, struct t4_wq *wq, int *count) +{ + struct t4_cqe *cqe; + u32 ptr; + + *count = 0; + PDBG("%s count zero %d\n", __func__, *count); + ptr = cq->sw_cidx; + while (ptr != cq->sw_pidx) { + cqe = &cq->sw_queue[ptr]; + if (RQ_TYPE(cqe) && (CQE_OPCODE(cqe) != FW_RI_READ_RESP) && + (CQE_QPID(cqe) == wq->sq.qid) && cqe_completes_wr(cqe, wq)) + (*count)++; + if (++ptr == cq->size) + ptr = 0; + } + PDBG("%s cq %p count %d\n", __func__, cq, *count); +} + +/* + * poll_cq + * + * Caller must: + * check the validity of the first CQE, + * supply the wq assicated with the qpid. + * + * credit: cq credit to return to sge. + * cqe_flushed: 1 iff the CQE is flushed. + * cqe: copy of the polled CQE. + * + * return value: + * 0 CQE returned ok. + * -EAGAIN CQE skipped, try again. + * -EOVERFLOW CQ overflow detected. + */ +static int poll_cq(struct t4_wq *wq, struct t4_cq *cq, struct t4_cqe *cqe, + u8 *cqe_flushed, u64 *cookie, u32 *credit) +{ + int ret = 0; + struct t4_cqe *hw_cqe, read_cqe; + + *cqe_flushed = 0; + *credit = 0; + ret = t4_next_cqe(cq, &hw_cqe); + if (ret) + return ret; + + PDBG("%s CQE OVF %u qpid 0x%0x genbit %u type %u status 0x%0x" + " opcode 0x%0x len 0x%0x wrid_hi_stag 0x%x wrid_low_msn 0x%x\n", + __func__, CQE_OVFBIT(hw_cqe), CQE_QPID(hw_cqe), + CQE_GENBIT(hw_cqe), CQE_TYPE(hw_cqe), CQE_STATUS(hw_cqe), + CQE_OPCODE(hw_cqe), CQE_LEN(hw_cqe), CQE_WRID_HI(hw_cqe), + CQE_WRID_LOW(hw_cqe)); + + /* + * skip cqe's not affiliated with a QP. + */ + if (wq == NULL) { + ret = -EAGAIN; + goto skip_cqe; + } + + /* + * skip hw cqe's if the wq is flushed. + */ + if (wq->flushed && !SW_CQE(hw_cqe)) { + ret = -EAGAIN; + goto skip_cqe; + } + + /* + * skip TERMINATE cqes... + */ + if (CQE_OPCODE(hw_cqe) == FW_RI_TERMINATE) { + ret = -EAGAIN; + goto skip_cqe; + } + + /* + * Gotta tweak READ completions: + * 1) the cqe doesn't contain the sq_wptr from the wr. + * 2) opcode not reflected from the wr. + * 3) read_len not reflected from the wr. + * 4) cq_type is RQ_TYPE not SQ_TYPE. + */ + if (RQ_TYPE(hw_cqe) && (CQE_OPCODE(hw_cqe) == FW_RI_READ_RESP)) { + + /* If we have reached here because of async + * event or other error, and have egress error + * then drop + */ + if (CQE_TYPE(hw_cqe) == 1) { + if (CQE_STATUS(hw_cqe)) + t4_set_wq_in_error(wq); + ret = -EAGAIN; + goto skip_cqe; + } + + /* If this is an unsolicited read response, then the read + * was generated by the kernel driver as part of peer-2-peer + * connection setup. So ignore the completion. + */ + if (CQE_WRID_STAG(hw_cqe) == 1) { + if (CQE_STATUS(hw_cqe)) + t4_set_wq_in_error(wq); + ret = -EAGAIN; + goto skip_cqe; + } + + /* + * Eat completions for unsignaled read WRs. 
+ */ + if (!wq->sq.oldest_read->signaled) { + advance_oldest_read(wq); + ret = -EAGAIN; + goto skip_cqe; + } + + /* + * Don't write to the HWCQ, so create a new read req CQE + * in local memory. + */ + create_read_req_cqe(wq, hw_cqe, &read_cqe); + hw_cqe = &read_cqe; + advance_oldest_read(wq); + } + + if (CQE_STATUS(hw_cqe) || t4_wq_in_error(wq)) { + *cqe_flushed = (CQE_STATUS(hw_cqe) == T4_ERR_SWFLUSH); + t4_set_wq_in_error(wq); + } + + /* + * RECV completion. + */ + if (RQ_TYPE(hw_cqe)) { + + /* + * HW only validates 4 bits of MSN. So we must validate that + * the MSN in the SEND is the next expected MSN. If its not, + * then we complete this with T4_ERR_MSN and mark the wq in + * error. + */ + + if (t4_rq_empty(wq)) { + t4_set_wq_in_error(wq); + ret = -EAGAIN; + goto skip_cqe; + } + if (unlikely((CQE_WRID_MSN(hw_cqe) != (wq->rq.msn)))) { + t4_set_wq_in_error(wq); + hw_cqe->header |= htonl(CQE_STATUS_V(T4_ERR_MSN)); + goto proc_cqe; + } + goto proc_cqe; + } + + /* + * If we get here its a send completion. + * + * Handle out of order completion. These get stuffed + * in the SW SQ. Then the SW SQ is walked to move any + * now in-order completions into the SW CQ. This handles + * 2 cases: + * 1) reaping unsignaled WRs when the first subsequent + * signaled WR is completed. + * 2) out of order read completions. + */ + if (!SW_CQE(hw_cqe) && (CQE_WRID_SQ_IDX(hw_cqe) != wq->sq.cidx)) { + struct t4_swsqe *swsqe; + + PDBG("%s out of order completion going in sw_sq at idx %u\n", + __func__, CQE_WRID_SQ_IDX(hw_cqe)); + swsqe = &wq->sq.sw_sq[CQE_WRID_SQ_IDX(hw_cqe)]; + swsqe->cqe = *hw_cqe; + swsqe->complete = 1; + ret = -EAGAIN; + goto flush_wq; + } + +proc_cqe: + *cqe = *hw_cqe; + + /* + * Reap the associated WR(s) that are freed up with this + * completion. + */ + if (SQ_TYPE(hw_cqe)) { + int idx = CQE_WRID_SQ_IDX(hw_cqe); + BUG_ON(idx >= wq->sq.size); + + /* + * Account for any unsignaled completions completed by + * this signaled completion. In this case, cidx points + * to the first unsignaled one, and idx points to the + * signaled one. So adjust in_use based on this delta. + * if this is not completing any unsigned wrs, then the + * delta will be 0. Handle wrapping also! + */ + if (idx < wq->sq.cidx) + wq->sq.in_use -= wq->sq.size + idx - wq->sq.cidx; + else + wq->sq.in_use -= idx - wq->sq.cidx; + BUG_ON(wq->sq.in_use <= 0 && wq->sq.in_use >= wq->sq.size); + + wq->sq.cidx = (uint16_t)idx; + PDBG("%s completing sq idx %u\n", __func__, wq->sq.cidx); + *cookie = wq->sq.sw_sq[wq->sq.cidx].wr_id; + if (c4iw_wr_log) + c4iw_log_wr_stats(wq, hw_cqe); + t4_sq_consume(wq); + } else { + PDBG("%s completing rq idx %u\n", __func__, wq->rq.cidx); + *cookie = wq->rq.sw_rq[wq->rq.cidx].wr_id; + BUG_ON(t4_rq_empty(wq)); + if (c4iw_wr_log) + c4iw_log_wr_stats(wq, hw_cqe); + t4_rq_consume(wq); + goto skip_cqe; + } + +flush_wq: + /* + * Flush any completed cqes that are now in-order. + */ + flush_completed_wrs(wq, cq); + +skip_cqe: + if (SW_CQE(hw_cqe)) { + PDBG("%s cq %p cqid 0x%x skip sw cqe cidx %u\n", + __func__, cq, cq->cqid, cq->sw_cidx); + t4_swcq_consume(cq); + } else { + PDBG("%s cq %p cqid 0x%x skip hw cqe cidx %u\n", + __func__, cq, cq->cqid, cq->cidx); + t4_hwcq_consume(cq); + } + return ret; +} + +/* + * Get one cq entry from c4iw and map it to openib. 
+ * + * Returns: + * 0 cqe returned + * -ENODATA EMPTY; + * -EAGAIN caller must try again + * any other -errno fatal error + */ +static int c4iw_poll_cq_one(struct c4iw_cq *chp, struct ib_wc *wc) +{ + struct c4iw_qp *qhp = NULL; + struct t4_cqe uninitialized_var(cqe), *rd_cqe; + struct t4_wq *wq; + u32 credit = 0; + u8 cqe_flushed; + u64 cookie = 0; + int ret; + + ret = t4_next_cqe(&chp->cq, &rd_cqe); + + if (ret) + return ret; + + qhp = get_qhp(chp->rhp, CQE_QPID(rd_cqe)); + if (!qhp) + wq = NULL; + else { + spin_lock(&qhp->lock); + wq = &(qhp->wq); + } + ret = poll_cq(wq, &(chp->cq), &cqe, &cqe_flushed, &cookie, &credit); + if (ret) + goto out; + + wc->wr_id = cookie; + wc->qp = &qhp->ibqp; + wc->vendor_err = CQE_STATUS(&cqe); + wc->wc_flags = 0; + + PDBG("%s qpid 0x%x type %d opcode %d status 0x%x len %u wrid hi 0x%x " + "lo 0x%x cookie 0x%llx\n", __func__, CQE_QPID(&cqe), + CQE_TYPE(&cqe), CQE_OPCODE(&cqe), CQE_STATUS(&cqe), CQE_LEN(&cqe), + CQE_WRID_HI(&cqe), CQE_WRID_LOW(&cqe), (unsigned long long)cookie); + + if (CQE_TYPE(&cqe) == 0) { + if (!CQE_STATUS(&cqe)) + wc->byte_len = CQE_LEN(&cqe); + else + wc->byte_len = 0; + wc->opcode = IB_WC_RECV; + if (CQE_OPCODE(&cqe) == FW_RI_SEND_WITH_INV || + CQE_OPCODE(&cqe) == FW_RI_SEND_WITH_SE_INV) { + wc->ex.invalidate_rkey = CQE_WRID_STAG(&cqe); + wc->wc_flags |= IB_WC_WITH_INVALIDATE; + } + } else { + switch (CQE_OPCODE(&cqe)) { + case FW_RI_RDMA_WRITE: + wc->opcode = IB_WC_RDMA_WRITE; + break; + case FW_RI_READ_REQ: + wc->opcode = IB_WC_RDMA_READ; + wc->byte_len = CQE_LEN(&cqe); + break; + case FW_RI_SEND_WITH_INV: + case FW_RI_SEND_WITH_SE_INV: + wc->opcode = IB_WC_SEND; + wc->wc_flags |= IB_WC_WITH_INVALIDATE; + break; + case FW_RI_SEND: + case FW_RI_SEND_WITH_SE: + wc->opcode = IB_WC_SEND; + break; + case FW_RI_BIND_MW: + wc->opcode = IB_WC_BIND_MW; + break; + + case FW_RI_LOCAL_INV: + wc->opcode = IB_WC_LOCAL_INV; + break; + case FW_RI_FAST_REGISTER: + wc->opcode = IB_WC_FAST_REG_MR; + break; + default: + printk(KERN_ERR MOD "Unexpected opcode %d " + "in the CQE received for QPID=0x%0x\n", + CQE_OPCODE(&cqe), CQE_QPID(&cqe)); + ret = -EINVAL; + goto out; + } + } + + if (cqe_flushed) + wc->status = IB_WC_WR_FLUSH_ERR; + else { + + switch (CQE_STATUS(&cqe)) { + case T4_ERR_SUCCESS: + wc->status = IB_WC_SUCCESS; + break; + case T4_ERR_STAG: + wc->status = IB_WC_LOC_ACCESS_ERR; + break; + case T4_ERR_PDID: + wc->status = IB_WC_LOC_PROT_ERR; + break; + case T4_ERR_QPID: + case T4_ERR_ACCESS: + wc->status = IB_WC_LOC_ACCESS_ERR; + break; + case T4_ERR_WRAP: + wc->status = IB_WC_GENERAL_ERR; + break; + case T4_ERR_BOUND: + wc->status = IB_WC_LOC_LEN_ERR; + break; + case T4_ERR_INVALIDATE_SHARED_MR: + case T4_ERR_INVALIDATE_MR_WITH_MW_BOUND: + wc->status = IB_WC_MW_BIND_ERR; + break; + case T4_ERR_CRC: + case T4_ERR_MARKER: + case T4_ERR_PDU_LEN_ERR: + case T4_ERR_OUT_OF_RQE: + case T4_ERR_DDP_VERSION: + case T4_ERR_RDMA_VERSION: + case T4_ERR_DDP_QUEUE_NUM: + case T4_ERR_MSN: + case T4_ERR_TBIT: + case T4_ERR_MO: + case T4_ERR_MSN_RANGE: + case T4_ERR_IRD_OVERFLOW: + case T4_ERR_OPCODE: + case T4_ERR_INTERNAL_ERR: + wc->status = IB_WC_FATAL_ERR; + break; + case T4_ERR_SWFLUSH: + wc->status = IB_WC_WR_FLUSH_ERR; + break; + default: + printk(KERN_ERR MOD + "Unexpected cqe_status 0x%x for QPID=0x%0x\n", + CQE_STATUS(&cqe), CQE_QPID(&cqe)); + ret = -EINVAL; + } + } +out: + if (wq) + spin_unlock(&qhp->lock); + return ret; +} + +int c4iw_poll_cq(struct ib_cq *ibcq, int num_entries, struct ib_wc *wc) +{ + struct c4iw_cq *chp; + unsigned long flags; 
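/*
 * The loop below retries c4iw_poll_cq_one() while it returns -EAGAIN
 * (a CQE was skipped) and stops on any other error; -ENODATA ("CQ
 * empty") is folded into success, so the function returns the number
 * of entries actually polled.  For illustration, a verbs consumer
 * draining this CQ might loop as follows (handle_completion() is a
 * hypothetical callback, not part of this patch):
 *
 *	struct ib_wc wc[16];
 *	int i, n;
 *
 *	do {
 *		n = ib_poll_cq(cq, ARRAY_SIZE(wc), wc);
 *		for (i = 0; i < n; i++)
 *			handle_completion(&wc[i]);
 *	} while (n > 0);
 */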
+ int npolled; + int err = 0; + + chp = to_c4iw_cq(ibcq); + + spin_lock_irqsave(&chp->lock, flags); + for (npolled = 0; npolled < num_entries; ++npolled) { + do { + err = c4iw_poll_cq_one(chp, wc + npolled); + } while (err == -EAGAIN); + if (err) + break; + } + spin_unlock_irqrestore(&chp->lock, flags); + return !err || err == -ENODATA ? npolled : err; +} + +int c4iw_destroy_cq(struct ib_cq *ib_cq) +{ + struct c4iw_cq *chp; + struct c4iw_ucontext *ucontext; + + PDBG("%s ib_cq %p\n", __func__, ib_cq); + chp = to_c4iw_cq(ib_cq); + + remove_handle(chp->rhp, &chp->rhp->cqidr, chp->cq.cqid); + atomic_dec(&chp->refcnt); + wait_event(chp->wait, !atomic_read(&chp->refcnt)); + + ucontext = ib_cq->uobject ? to_c4iw_ucontext(ib_cq->uobject->context) + : NULL; + destroy_cq(&chp->rhp->rdev, &chp->cq, + ucontext ? &ucontext->uctx : &chp->cq.rdev->uctx); + kfree(chp); + return 0; +} + +struct ib_cq *c4iw_create_cq(struct ib_device *ibdev, int entries, + int vector, struct ib_ucontext *ib_context, + struct ib_udata *udata) +{ + struct c4iw_dev *rhp; + struct c4iw_cq *chp; + struct c4iw_create_cq_resp uresp; + struct c4iw_ucontext *ucontext = NULL; + int ret; + size_t memsize, hwentries; + struct c4iw_mm_entry *mm, *mm2; + + PDBG("%s ib_dev %p entries %d\n", __func__, ibdev, entries); + + rhp = to_c4iw_dev(ibdev); + + if (vector >= rhp->rdev.lldi.nciq) + return ERR_PTR(-EINVAL); + + chp = kzalloc(sizeof(*chp), GFP_KERNEL); + if (!chp) + return ERR_PTR(-ENOMEM); + + if (ib_context) + ucontext = to_c4iw_ucontext(ib_context); + + /* account for the status page. */ + entries++; + + /* IQ needs one extra entry to differentiate full vs empty. */ + entries++; + + /* + * entries must be multiple of 16 for HW. + */ + entries = roundup(entries, 16); + + /* + * Make actual HW queue 2x to avoid cdix_inc overflows. + */ + hwentries = min(entries * 2, rhp->rdev.hw_queue.t4_max_iq_size); + + /* + * Make HW queue at least 64 entries so GTS updates aren't too + * frequent. + */ + if (hwentries < 64) + hwentries = 64; + + memsize = hwentries * sizeof *chp->cq.queue; + + /* + * memsize must be a multiple of the page size if its a user cq. + */ + if (ucontext) + memsize = roundup(memsize, PAGE_SIZE); + chp->cq.size = hwentries; + chp->cq.memsize = memsize; + chp->cq.vector = vector; + + ret = create_cq(&rhp->rdev, &chp->cq, + ucontext ? 
&ucontext->uctx : &rhp->rdev.uctx); + if (ret) + goto err1; + + chp->rhp = rhp; + chp->cq.size--; /* status page */ + chp->ibcq.cqe = entries - 2; + spin_lock_init(&chp->lock); + spin_lock_init(&chp->comp_handler_lock); + atomic_set(&chp->refcnt, 1); + init_waitqueue_head(&chp->wait); + ret = insert_handle(rhp, &rhp->cqidr, chp, chp->cq.cqid); + if (ret) + goto err2; + + if (ucontext) { + mm = kmalloc(sizeof *mm, GFP_KERNEL); + if (!mm) + goto err3; + mm2 = kmalloc(sizeof *mm2, GFP_KERNEL); + if (!mm2) + goto err4; + + uresp.qid_mask = rhp->rdev.cqmask; + uresp.cqid = chp->cq.cqid; + uresp.size = chp->cq.size; + uresp.memsize = chp->cq.memsize; + spin_lock(&ucontext->mmap_lock); + uresp.key = ucontext->key; + ucontext->key += PAGE_SIZE; + uresp.gts_key = ucontext->key; + ucontext->key += PAGE_SIZE; + spin_unlock(&ucontext->mmap_lock); + ret = ib_copy_to_udata(udata, &uresp, + sizeof(uresp) - sizeof(uresp.reserved)); + if (ret) + goto err5; + + mm->key = uresp.key; + mm->addr = virt_to_phys(chp->cq.queue); + mm->len = chp->cq.memsize; + insert_mmap(ucontext, mm); + + mm2->key = uresp.gts_key; + mm2->addr = chp->cq.ugts; + mm2->len = PAGE_SIZE; + insert_mmap(ucontext, mm2); + } + PDBG("%s cqid 0x%0x chp %p size %u memsize %zu, dma_addr 0x%0llx\n", + __func__, chp->cq.cqid, chp, chp->cq.size, + chp->cq.memsize, (unsigned long long) chp->cq.dma_addr); + return &chp->ibcq; +err5: + kfree(mm2); +err4: + kfree(mm); +err3: + remove_handle(rhp, &rhp->cqidr, chp->cq.cqid); +err2: + destroy_cq(&chp->rhp->rdev, &chp->cq, + ucontext ? &ucontext->uctx : &rhp->rdev.uctx); +err1: + kfree(chp); + return ERR_PTR(ret); +} + +int c4iw_resize_cq(struct ib_cq *cq, int cqe, struct ib_udata *udata) +{ + return -ENOSYS; +} + +int c4iw_arm_cq(struct ib_cq *ibcq, enum ib_cq_notify_flags flags) +{ + struct c4iw_cq *chp; + int ret; + unsigned long flag; + + chp = to_c4iw_cq(ibcq); + spin_lock_irqsave(&chp->lock, flag); + ret = t4_arm_cq(&chp->cq, + (flags & IB_CQ_SOLICITED_MASK) == IB_CQ_SOLICITED); + spin_unlock_irqrestore(&chp->lock, flag); + if (ret && !(flags & IB_CQ_REPORT_MISSED_EVENTS)) + ret = 0; + return ret; +} diff --git a/kernel/drivers/infiniband/hw/cxgb4/device.c b/kernel/drivers/infiniband/hw/cxgb4/device.c new file mode 100644 index 000000000..7e895d714 --- /dev/null +++ b/kernel/drivers/infiniband/hw/cxgb4/device.c @@ -0,0 +1,1564 @@ +/* + * Copyright (c) 2009-2010 Chelsio, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include +#include +#include +#include +#include + +#include + +#include "iw_cxgb4.h" + +#define DRV_VERSION "0.1" + +MODULE_AUTHOR("Steve Wise"); +MODULE_DESCRIPTION("Chelsio T4/T5 RDMA Driver"); +MODULE_LICENSE("Dual BSD/GPL"); +MODULE_VERSION(DRV_VERSION); + +static int allow_db_fc_on_t5; +module_param(allow_db_fc_on_t5, int, 0644); +MODULE_PARM_DESC(allow_db_fc_on_t5, + "Allow DB Flow Control on T5 (default = 0)"); + +static int allow_db_coalescing_on_t5; +module_param(allow_db_coalescing_on_t5, int, 0644); +MODULE_PARM_DESC(allow_db_coalescing_on_t5, + "Allow DB Coalescing on T5 (default = 0)"); + +int c4iw_wr_log = 0; +module_param(c4iw_wr_log, int, 0444); +MODULE_PARM_DESC(c4iw_wr_log, "Enables logging of work request timing data."); + +static int c4iw_wr_log_size_order = 12; +module_param(c4iw_wr_log_size_order, int, 0444); +MODULE_PARM_DESC(c4iw_wr_log_size_order, + "Number of entries (log2) in the work request timing log."); + +struct uld_ctx { + struct list_head entry; + struct cxgb4_lld_info lldi; + struct c4iw_dev *dev; +}; + +static LIST_HEAD(uld_ctx_list); +static DEFINE_MUTEX(dev_mutex); + +#define DB_FC_RESUME_SIZE 64 +#define DB_FC_RESUME_DELAY 1 +#define DB_FC_DRAIN_THRESH 0 + +static struct dentry *c4iw_debugfs_root; + +struct c4iw_debugfs_data { + struct c4iw_dev *devp; + char *buf; + int bufsize; + int pos; +}; + +/* registered cxgb4 netlink callbacks */ +static struct ibnl_client_cbs c4iw_nl_cb_table[] = { + [RDMA_NL_IWPM_REG_PID] = {.dump = iwpm_register_pid_cb}, + [RDMA_NL_IWPM_ADD_MAPPING] = {.dump = iwpm_add_mapping_cb}, + [RDMA_NL_IWPM_QUERY_MAPPING] = {.dump = iwpm_add_and_query_mapping_cb}, + [RDMA_NL_IWPM_HANDLE_ERR] = {.dump = iwpm_mapping_error_cb}, + [RDMA_NL_IWPM_REMOTE_INFO] = {.dump = iwpm_remote_info_cb}, + [RDMA_NL_IWPM_MAPINFO] = {.dump = iwpm_mapping_info_cb}, + [RDMA_NL_IWPM_MAPINFO_NUM] = {.dump = iwpm_ack_mapping_info_cb} +}; + +static int count_idrs(int id, void *p, void *data) +{ + int *countp = data; + + *countp = *countp + 1; + return 0; +} + +static ssize_t debugfs_read(struct file *file, char __user *buf, size_t count, + loff_t *ppos) +{ + struct c4iw_debugfs_data *d = file->private_data; + + return simple_read_from_buffer(buf, count, ppos, d->buf, d->pos); +} + +void c4iw_log_wr_stats(struct t4_wq *wq, struct t4_cqe *cqe) +{ + struct wr_log_entry le; + int idx; + + if (!wq->rdev->wr_log) + return; + + idx = (atomic_inc_return(&wq->rdev->wr_log_idx) - 1) & + (wq->rdev->wr_log_size - 1); + le.poll_sge_ts = cxgb4_read_sge_timestamp(wq->rdev->lldi.ports[0]); + getnstimeofday(&le.poll_host_ts); + le.valid = 1; + le.cqe_sge_ts = CQE_TS(cqe); + if (SQ_TYPE(cqe)) { + le.qid = wq->sq.qid; + le.opcode = CQE_OPCODE(cqe); + le.post_host_ts = wq->sq.sw_sq[wq->sq.cidx].host_ts; + le.post_sge_ts = wq->sq.sw_sq[wq->sq.cidx].sge_ts; + le.wr_id = CQE_WRID_SQ_IDX(cqe); + } else { + le.qid = wq->rq.qid; + le.opcode = FW_RI_RECEIVE; + le.post_host_ts = wq->rq.sw_rq[wq->rq.cidx].host_ts; + le.post_sge_ts = wq->rq.sw_rq[wq->rq.cidx].sge_ts; + le.wr_id = CQE_WRID_MSN(cqe); + } + wq->rdev->wr_log[idx] = le; +} + +static int wr_log_show(struct seq_file *seq, void *v) +{ + struct c4iw_dev *dev = seq->private; + struct timespec prev_ts = {0, 0}; + struct wr_log_entry *lep; + int prev_ts_set 
= 0; + int idx, end; + +#define ts2ns(ts) div64_u64((ts) * dev->rdev.lldi.cclk_ps, 1000) + + idx = atomic_read(&dev->rdev.wr_log_idx) & + (dev->rdev.wr_log_size - 1); + end = idx - 1; + if (end < 0) + end = dev->rdev.wr_log_size - 1; + lep = &dev->rdev.wr_log[idx]; + while (idx != end) { + if (lep->valid) { + if (!prev_ts_set) { + prev_ts_set = 1; + prev_ts = lep->poll_host_ts; + } + seq_printf(seq, "%04u: sec %lu nsec %lu qid %u opcode " + "%u %s 0x%x host_wr_delta sec %lu nsec %lu " + "post_sge_ts 0x%llx cqe_sge_ts 0x%llx " + "poll_sge_ts 0x%llx post_poll_delta_ns %llu " + "cqe_poll_delta_ns %llu\n", + idx, + timespec_sub(lep->poll_host_ts, + prev_ts).tv_sec, + timespec_sub(lep->poll_host_ts, + prev_ts).tv_nsec, + lep->qid, lep->opcode, + lep->opcode == FW_RI_RECEIVE ? + "msn" : "wrid", + lep->wr_id, + timespec_sub(lep->poll_host_ts, + lep->post_host_ts).tv_sec, + timespec_sub(lep->poll_host_ts, + lep->post_host_ts).tv_nsec, + lep->post_sge_ts, lep->cqe_sge_ts, + lep->poll_sge_ts, + ts2ns(lep->poll_sge_ts - lep->post_sge_ts), + ts2ns(lep->poll_sge_ts - lep->cqe_sge_ts)); + prev_ts = lep->poll_host_ts; + } + idx++; + if (idx > (dev->rdev.wr_log_size - 1)) + idx = 0; + lep = &dev->rdev.wr_log[idx]; + } +#undef ts2ns + return 0; +} + +static int wr_log_open(struct inode *inode, struct file *file) +{ + return single_open(file, wr_log_show, inode->i_private); +} + +static ssize_t wr_log_clear(struct file *file, const char __user *buf, + size_t count, loff_t *pos) +{ + struct c4iw_dev *dev = ((struct seq_file *)file->private_data)->private; + int i; + + if (dev->rdev.wr_log) + for (i = 0; i < dev->rdev.wr_log_size; i++) + dev->rdev.wr_log[i].valid = 0; + return count; +} + +static const struct file_operations wr_log_debugfs_fops = { + .owner = THIS_MODULE, + .open = wr_log_open, + .release = single_release, + .read = seq_read, + .llseek = seq_lseek, + .write = wr_log_clear, +}; + +static int dump_qp(int id, void *p, void *data) +{ + struct c4iw_qp *qp = p; + struct c4iw_debugfs_data *qpd = data; + int space; + int cc; + + if (id != qp->wq.sq.qid) + return 0; + + space = qpd->bufsize - qpd->pos - 1; + if (space == 0) + return 1; + + if (qp->ep) { + if (qp->ep->com.local_addr.ss_family == AF_INET) { + struct sockaddr_in *lsin = (struct sockaddr_in *) + &qp->ep->com.local_addr; + struct sockaddr_in *rsin = (struct sockaddr_in *) + &qp->ep->com.remote_addr; + struct sockaddr_in *mapped_lsin = (struct sockaddr_in *) + &qp->ep->com.mapped_local_addr; + struct sockaddr_in *mapped_rsin = (struct sockaddr_in *) + &qp->ep->com.mapped_remote_addr; + + cc = snprintf(qpd->buf + qpd->pos, space, + "rc qp sq id %u rq id %u state %u " + "onchip %u ep tid %u state %u " + "%pI4:%u/%u->%pI4:%u/%u\n", + qp->wq.sq.qid, qp->wq.rq.qid, + (int)qp->attr.state, + qp->wq.sq.flags & T4_SQ_ONCHIP, + qp->ep->hwtid, (int)qp->ep->com.state, + &lsin->sin_addr, ntohs(lsin->sin_port), + ntohs(mapped_lsin->sin_port), + &rsin->sin_addr, ntohs(rsin->sin_port), + ntohs(mapped_rsin->sin_port)); + } else { + struct sockaddr_in6 *lsin6 = (struct sockaddr_in6 *) + &qp->ep->com.local_addr; + struct sockaddr_in6 *rsin6 = (struct sockaddr_in6 *) + &qp->ep->com.remote_addr; + struct sockaddr_in6 *mapped_lsin6 = + (struct sockaddr_in6 *) + &qp->ep->com.mapped_local_addr; + struct sockaddr_in6 *mapped_rsin6 = + (struct sockaddr_in6 *) + &qp->ep->com.mapped_remote_addr; + + cc = snprintf(qpd->buf + qpd->pos, space, + "rc qp sq id %u rq id %u state %u " + "onchip %u ep tid %u state %u " + "%pI6:%u/%u->%pI6:%u/%u\n", + qp->wq.sq.qid, 
qp->wq.rq.qid, + (int)qp->attr.state, + qp->wq.sq.flags & T4_SQ_ONCHIP, + qp->ep->hwtid, (int)qp->ep->com.state, + &lsin6->sin6_addr, + ntohs(lsin6->sin6_port), + ntohs(mapped_lsin6->sin6_port), + &rsin6->sin6_addr, + ntohs(rsin6->sin6_port), + ntohs(mapped_rsin6->sin6_port)); + } + } else + cc = snprintf(qpd->buf + qpd->pos, space, + "qp sq id %u rq id %u state %u onchip %u\n", + qp->wq.sq.qid, qp->wq.rq.qid, + (int)qp->attr.state, + qp->wq.sq.flags & T4_SQ_ONCHIP); + if (cc < space) + qpd->pos += cc; + return 0; +} + +static int qp_release(struct inode *inode, struct file *file) +{ + struct c4iw_debugfs_data *qpd = file->private_data; + if (!qpd) { + printk(KERN_INFO "%s null qpd?\n", __func__); + return 0; + } + vfree(qpd->buf); + kfree(qpd); + return 0; +} + +static int qp_open(struct inode *inode, struct file *file) +{ + struct c4iw_debugfs_data *qpd; + int ret = 0; + int count = 1; + + qpd = kmalloc(sizeof *qpd, GFP_KERNEL); + if (!qpd) { + ret = -ENOMEM; + goto out; + } + qpd->devp = inode->i_private; + qpd->pos = 0; + + spin_lock_irq(&qpd->devp->lock); + idr_for_each(&qpd->devp->qpidr, count_idrs, &count); + spin_unlock_irq(&qpd->devp->lock); + + qpd->bufsize = count * 128; + qpd->buf = vmalloc(qpd->bufsize); + if (!qpd->buf) { + ret = -ENOMEM; + goto err1; + } + + spin_lock_irq(&qpd->devp->lock); + idr_for_each(&qpd->devp->qpidr, dump_qp, qpd); + spin_unlock_irq(&qpd->devp->lock); + + qpd->buf[qpd->pos++] = 0; + file->private_data = qpd; + goto out; +err1: + kfree(qpd); +out: + return ret; +} + +static const struct file_operations qp_debugfs_fops = { + .owner = THIS_MODULE, + .open = qp_open, + .release = qp_release, + .read = debugfs_read, + .llseek = default_llseek, +}; + +static int dump_stag(int id, void *p, void *data) +{ + struct c4iw_debugfs_data *stagd = data; + int space; + int cc; + struct fw_ri_tpte tpte; + int ret; + + space = stagd->bufsize - stagd->pos - 1; + if (space == 0) + return 1; + + ret = cxgb4_read_tpte(stagd->devp->rdev.lldi.ports[0], (u32)id<<8, + (__be32 *)&tpte); + if (ret) { + dev_err(&stagd->devp->rdev.lldi.pdev->dev, + "%s cxgb4_read_tpte err %d\n", __func__, ret); + return ret; + } + cc = snprintf(stagd->buf + stagd->pos, space, + "stag: idx 0x%x valid %d key 0x%x state %d pdid %d " + "perm 0x%x ps %d len 0x%llx va 0x%llx\n", + (u32)id<<8, + FW_RI_TPTE_VALID_G(ntohl(tpte.valid_to_pdid)), + FW_RI_TPTE_STAGKEY_G(ntohl(tpte.valid_to_pdid)), + FW_RI_TPTE_STAGSTATE_G(ntohl(tpte.valid_to_pdid)), + FW_RI_TPTE_PDID_G(ntohl(tpte.valid_to_pdid)), + FW_RI_TPTE_PERM_G(ntohl(tpte.locread_to_qpid)), + FW_RI_TPTE_PS_G(ntohl(tpte.locread_to_qpid)), + ((u64)ntohl(tpte.len_hi) << 32) | ntohl(tpte.len_lo), + ((u64)ntohl(tpte.va_hi) << 32) | ntohl(tpte.va_lo_fbo)); + if (cc < space) + stagd->pos += cc; + return 0; +} + +static int stag_release(struct inode *inode, struct file *file) +{ + struct c4iw_debugfs_data *stagd = file->private_data; + if (!stagd) { + printk(KERN_INFO "%s null stagd?\n", __func__); + return 0; + } + vfree(stagd->buf); + kfree(stagd); + return 0; +} + +static int stag_open(struct inode *inode, struct file *file) +{ + struct c4iw_debugfs_data *stagd; + int ret = 0; + int count = 1; + + stagd = kmalloc(sizeof *stagd, GFP_KERNEL); + if (!stagd) { + ret = -ENOMEM; + goto out; + } + stagd->devp = inode->i_private; + stagd->pos = 0; + + spin_lock_irq(&stagd->devp->lock); + idr_for_each(&stagd->devp->mmidr, count_idrs, &count); + spin_unlock_irq(&stagd->devp->lock); + + stagd->bufsize = count * 256; + stagd->buf = vmalloc(stagd->bufsize); + if 
(!stagd->buf) { + ret = -ENOMEM; + goto err1; + } + + spin_lock_irq(&stagd->devp->lock); + idr_for_each(&stagd->devp->mmidr, dump_stag, stagd); + spin_unlock_irq(&stagd->devp->lock); + + stagd->buf[stagd->pos++] = 0; + file->private_data = stagd; + goto out; +err1: + kfree(stagd); +out: + return ret; +} + +static const struct file_operations stag_debugfs_fops = { + .owner = THIS_MODULE, + .open = stag_open, + .release = stag_release, + .read = debugfs_read, + .llseek = default_llseek, +}; + +static char *db_state_str[] = {"NORMAL", "FLOW_CONTROL", "RECOVERY", "STOPPED"}; + +static int stats_show(struct seq_file *seq, void *v) +{ + struct c4iw_dev *dev = seq->private; + + seq_printf(seq, " Object: %10s %10s %10s %10s\n", "Total", "Current", + "Max", "Fail"); + seq_printf(seq, " PDID: %10llu %10llu %10llu %10llu\n", + dev->rdev.stats.pd.total, dev->rdev.stats.pd.cur, + dev->rdev.stats.pd.max, dev->rdev.stats.pd.fail); + seq_printf(seq, " QID: %10llu %10llu %10llu %10llu\n", + dev->rdev.stats.qid.total, dev->rdev.stats.qid.cur, + dev->rdev.stats.qid.max, dev->rdev.stats.qid.fail); + seq_printf(seq, " TPTMEM: %10llu %10llu %10llu %10llu\n", + dev->rdev.stats.stag.total, dev->rdev.stats.stag.cur, + dev->rdev.stats.stag.max, dev->rdev.stats.stag.fail); + seq_printf(seq, " PBLMEM: %10llu %10llu %10llu %10llu\n", + dev->rdev.stats.pbl.total, dev->rdev.stats.pbl.cur, + dev->rdev.stats.pbl.max, dev->rdev.stats.pbl.fail); + seq_printf(seq, " RQTMEM: %10llu %10llu %10llu %10llu\n", + dev->rdev.stats.rqt.total, dev->rdev.stats.rqt.cur, + dev->rdev.stats.rqt.max, dev->rdev.stats.rqt.fail); + seq_printf(seq, " OCQPMEM: %10llu %10llu %10llu %10llu\n", + dev->rdev.stats.ocqp.total, dev->rdev.stats.ocqp.cur, + dev->rdev.stats.ocqp.max, dev->rdev.stats.ocqp.fail); + seq_printf(seq, " DB FULL: %10llu\n", dev->rdev.stats.db_full); + seq_printf(seq, " DB EMPTY: %10llu\n", dev->rdev.stats.db_empty); + seq_printf(seq, " DB DROP: %10llu\n", dev->rdev.stats.db_drop); + seq_printf(seq, " DB State: %s Transitions %llu FC Interruptions %llu\n", + db_state_str[dev->db_state], + dev->rdev.stats.db_state_transitions, + dev->rdev.stats.db_fc_interruptions); + seq_printf(seq, "TCAM_FULL: %10llu\n", dev->rdev.stats.tcam_full); + seq_printf(seq, "ACT_OFLD_CONN_FAILS: %10llu\n", + dev->rdev.stats.act_ofld_conn_fails); + seq_printf(seq, "PAS_OFLD_CONN_FAILS: %10llu\n", + dev->rdev.stats.pas_ofld_conn_fails); + seq_printf(seq, "NEG_ADV_RCVD: %10llu\n", dev->rdev.stats.neg_adv); + seq_printf(seq, "AVAILABLE IRD: %10u\n", dev->avail_ird); + return 0; +} + +static int stats_open(struct inode *inode, struct file *file) +{ + return single_open(file, stats_show, inode->i_private); +} + +static ssize_t stats_clear(struct file *file, const char __user *buf, + size_t count, loff_t *pos) +{ + struct c4iw_dev *dev = ((struct seq_file *)file->private_data)->private; + + mutex_lock(&dev->rdev.stats.lock); + dev->rdev.stats.pd.max = 0; + dev->rdev.stats.pd.fail = 0; + dev->rdev.stats.qid.max = 0; + dev->rdev.stats.qid.fail = 0; + dev->rdev.stats.stag.max = 0; + dev->rdev.stats.stag.fail = 0; + dev->rdev.stats.pbl.max = 0; + dev->rdev.stats.pbl.fail = 0; + dev->rdev.stats.rqt.max = 0; + dev->rdev.stats.rqt.fail = 0; + dev->rdev.stats.ocqp.max = 0; + dev->rdev.stats.ocqp.fail = 0; + dev->rdev.stats.db_full = 0; + dev->rdev.stats.db_empty = 0; + dev->rdev.stats.db_drop = 0; + dev->rdev.stats.db_state_transitions = 0; + dev->rdev.stats.tcam_full = 0; + dev->rdev.stats.act_ofld_conn_fails = 0; + dev->rdev.stats.pas_ofld_conn_fails = 0; + 
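+	/*
+	 * Only the max/fail watermarks and the doorbell/TCAM/offload-connection
+	 * counters are cleared here; the total capacities, the current-usage
+	 * gauges, db_fc_interruptions and neg_adv are left untouched as written.
+	 */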
mutex_unlock(&dev->rdev.stats.lock); + return count; +} + +static const struct file_operations stats_debugfs_fops = { + .owner = THIS_MODULE, + .open = stats_open, + .release = single_release, + .read = seq_read, + .llseek = seq_lseek, + .write = stats_clear, +}; + +static int dump_ep(int id, void *p, void *data) +{ + struct c4iw_ep *ep = p; + struct c4iw_debugfs_data *epd = data; + int space; + int cc; + + space = epd->bufsize - epd->pos - 1; + if (space == 0) + return 1; + + if (ep->com.local_addr.ss_family == AF_INET) { + struct sockaddr_in *lsin = (struct sockaddr_in *) + &ep->com.local_addr; + struct sockaddr_in *rsin = (struct sockaddr_in *) + &ep->com.remote_addr; + struct sockaddr_in *mapped_lsin = (struct sockaddr_in *) + &ep->com.mapped_local_addr; + struct sockaddr_in *mapped_rsin = (struct sockaddr_in *) + &ep->com.mapped_remote_addr; + + cc = snprintf(epd->buf + epd->pos, space, + "ep %p cm_id %p qp %p state %d flags 0x%lx " + "history 0x%lx hwtid %d atid %d " + "conn_na %u abort_na %u " + "%pI4:%d/%d <-> %pI4:%d/%d\n", + ep, ep->com.cm_id, ep->com.qp, + (int)ep->com.state, ep->com.flags, + ep->com.history, ep->hwtid, ep->atid, + ep->stats.connect_neg_adv, + ep->stats.abort_neg_adv, + &lsin->sin_addr, ntohs(lsin->sin_port), + ntohs(mapped_lsin->sin_port), + &rsin->sin_addr, ntohs(rsin->sin_port), + ntohs(mapped_rsin->sin_port)); + } else { + struct sockaddr_in6 *lsin6 = (struct sockaddr_in6 *) + &ep->com.local_addr; + struct sockaddr_in6 *rsin6 = (struct sockaddr_in6 *) + &ep->com.remote_addr; + struct sockaddr_in6 *mapped_lsin6 = (struct sockaddr_in6 *) + &ep->com.mapped_local_addr; + struct sockaddr_in6 *mapped_rsin6 = (struct sockaddr_in6 *) + &ep->com.mapped_remote_addr; + + cc = snprintf(epd->buf + epd->pos, space, + "ep %p cm_id %p qp %p state %d flags 0x%lx " + "history 0x%lx hwtid %d atid %d " + "conn_na %u abort_na %u " + "%pI6:%d/%d <-> %pI6:%d/%d\n", + ep, ep->com.cm_id, ep->com.qp, + (int)ep->com.state, ep->com.flags, + ep->com.history, ep->hwtid, ep->atid, + ep->stats.connect_neg_adv, + ep->stats.abort_neg_adv, + &lsin6->sin6_addr, ntohs(lsin6->sin6_port), + ntohs(mapped_lsin6->sin6_port), + &rsin6->sin6_addr, ntohs(rsin6->sin6_port), + ntohs(mapped_rsin6->sin6_port)); + } + if (cc < space) + epd->pos += cc; + return 0; +} + +static int dump_listen_ep(int id, void *p, void *data) +{ + struct c4iw_listen_ep *ep = p; + struct c4iw_debugfs_data *epd = data; + int space; + int cc; + + space = epd->bufsize - epd->pos - 1; + if (space == 0) + return 1; + + if (ep->com.local_addr.ss_family == AF_INET) { + struct sockaddr_in *lsin = (struct sockaddr_in *) + &ep->com.local_addr; + struct sockaddr_in *mapped_lsin = (struct sockaddr_in *) + &ep->com.mapped_local_addr; + + cc = snprintf(epd->buf + epd->pos, space, + "ep %p cm_id %p state %d flags 0x%lx stid %d " + "backlog %d %pI4:%d/%d\n", + ep, ep->com.cm_id, (int)ep->com.state, + ep->com.flags, ep->stid, ep->backlog, + &lsin->sin_addr, ntohs(lsin->sin_port), + ntohs(mapped_lsin->sin_port)); + } else { + struct sockaddr_in6 *lsin6 = (struct sockaddr_in6 *) + &ep->com.local_addr; + struct sockaddr_in6 *mapped_lsin6 = (struct sockaddr_in6 *) + &ep->com.mapped_local_addr; + + cc = snprintf(epd->buf + epd->pos, space, + "ep %p cm_id %p state %d flags 0x%lx stid %d " + "backlog %d %pI6:%d/%d\n", + ep, ep->com.cm_id, (int)ep->com.state, + ep->com.flags, ep->stid, ep->backlog, + &lsin6->sin6_addr, ntohs(lsin6->sin6_port), + ntohs(mapped_lsin6->sin6_port)); + } + if (cc < space) + epd->pos += cc; + return 0; +} + +static int 
ep_release(struct inode *inode, struct file *file) +{ + struct c4iw_debugfs_data *epd = file->private_data; + if (!epd) { + pr_info("%s null qpd?\n", __func__); + return 0; + } + vfree(epd->buf); + kfree(epd); + return 0; +} + +static int ep_open(struct inode *inode, struct file *file) +{ + struct c4iw_debugfs_data *epd; + int ret = 0; + int count = 1; + + epd = kmalloc(sizeof(*epd), GFP_KERNEL); + if (!epd) { + ret = -ENOMEM; + goto out; + } + epd->devp = inode->i_private; + epd->pos = 0; + + spin_lock_irq(&epd->devp->lock); + idr_for_each(&epd->devp->hwtid_idr, count_idrs, &count); + idr_for_each(&epd->devp->atid_idr, count_idrs, &count); + idr_for_each(&epd->devp->stid_idr, count_idrs, &count); + spin_unlock_irq(&epd->devp->lock); + + epd->bufsize = count * 240; + epd->buf = vmalloc(epd->bufsize); + if (!epd->buf) { + ret = -ENOMEM; + goto err1; + } + + spin_lock_irq(&epd->devp->lock); + idr_for_each(&epd->devp->hwtid_idr, dump_ep, epd); + idr_for_each(&epd->devp->atid_idr, dump_ep, epd); + idr_for_each(&epd->devp->stid_idr, dump_listen_ep, epd); + spin_unlock_irq(&epd->devp->lock); + + file->private_data = epd; + goto out; +err1: + kfree(epd); +out: + return ret; +} + +static const struct file_operations ep_debugfs_fops = { + .owner = THIS_MODULE, + .open = ep_open, + .release = ep_release, + .read = debugfs_read, +}; + +static int setup_debugfs(struct c4iw_dev *devp) +{ + if (!devp->debugfs_root) + return -1; + + debugfs_create_file_size("qps", S_IWUSR, devp->debugfs_root, + (void *)devp, &qp_debugfs_fops, 4096); + + debugfs_create_file_size("stags", S_IWUSR, devp->debugfs_root, + (void *)devp, &stag_debugfs_fops, 4096); + + debugfs_create_file_size("stats", S_IWUSR, devp->debugfs_root, + (void *)devp, &stats_debugfs_fops, 4096); + + debugfs_create_file_size("eps", S_IWUSR, devp->debugfs_root, + (void *)devp, &ep_debugfs_fops, 4096); + + if (c4iw_wr_log) + debugfs_create_file_size("wr_log", S_IWUSR, devp->debugfs_root, + (void *)devp, &wr_log_debugfs_fops, 4096); + return 0; +} + +void c4iw_release_dev_ucontext(struct c4iw_rdev *rdev, + struct c4iw_dev_ucontext *uctx) +{ + struct list_head *pos, *nxt; + struct c4iw_qid_list *entry; + + mutex_lock(&uctx->lock); + list_for_each_safe(pos, nxt, &uctx->qpids) { + entry = list_entry(pos, struct c4iw_qid_list, entry); + list_del_init(&entry->entry); + if (!(entry->qid & rdev->qpmask)) { + c4iw_put_resource(&rdev->resource.qid_table, + entry->qid); + mutex_lock(&rdev->stats.lock); + rdev->stats.qid.cur -= rdev->qpmask + 1; + mutex_unlock(&rdev->stats.lock); + } + kfree(entry); + } + + list_for_each_safe(pos, nxt, &uctx->qpids) { + entry = list_entry(pos, struct c4iw_qid_list, entry); + list_del_init(&entry->entry); + kfree(entry); + } + mutex_unlock(&uctx->lock); +} + +void c4iw_init_dev_ucontext(struct c4iw_rdev *rdev, + struct c4iw_dev_ucontext *uctx) +{ + INIT_LIST_HEAD(&uctx->qpids); + INIT_LIST_HEAD(&uctx->cqids); + mutex_init(&uctx->lock); +} + +/* Caller takes care of locking if needed */ +static int c4iw_rdev_open(struct c4iw_rdev *rdev) +{ + int err; + + c4iw_init_dev_ucontext(rdev, &rdev->uctx); + + /* + * This implementation assumes udb_density == ucq_density! Eventually + * we might need to support this but for now fail the open. Also the + * cqid and qpid range must match for now. 
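+ * (Equal densities also keep the qpshift/qpmask and cqshift/cqmask values
+ * computed below identical for the QP and CQ user doorbell mappings.)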
+ */ + if (rdev->lldi.udb_density != rdev->lldi.ucq_density) { + pr_err(MOD "%s: unsupported udb/ucq densities %u/%u\n", + pci_name(rdev->lldi.pdev), rdev->lldi.udb_density, + rdev->lldi.ucq_density); + err = -EINVAL; + goto err1; + } + if (rdev->lldi.vr->qp.start != rdev->lldi.vr->cq.start || + rdev->lldi.vr->qp.size != rdev->lldi.vr->cq.size) { + pr_err(MOD "%s: unsupported qp and cq id ranges " + "qp start %u size %u cq start %u size %u\n", + pci_name(rdev->lldi.pdev), rdev->lldi.vr->qp.start, + rdev->lldi.vr->qp.size, rdev->lldi.vr->cq.size, + rdev->lldi.vr->cq.size); + err = -EINVAL; + goto err1; + } + + /* + * qpshift is the number of bits to shift the qpid left in order + * to get the correct address of the doorbell for that qp. + */ + rdev->qpshift = PAGE_SHIFT - ilog2(rdev->lldi.udb_density); + rdev->qpmask = rdev->lldi.udb_density - 1; + rdev->cqshift = PAGE_SHIFT - ilog2(rdev->lldi.ucq_density); + rdev->cqmask = rdev->lldi.ucq_density - 1; + PDBG("%s dev %s stag start 0x%0x size 0x%0x num stags %d " + "pbl start 0x%0x size 0x%0x rq start 0x%0x size 0x%0x " + "qp qid start %u size %u cq qid start %u size %u\n", + __func__, pci_name(rdev->lldi.pdev), rdev->lldi.vr->stag.start, + rdev->lldi.vr->stag.size, c4iw_num_stags(rdev), + rdev->lldi.vr->pbl.start, + rdev->lldi.vr->pbl.size, rdev->lldi.vr->rq.start, + rdev->lldi.vr->rq.size, + rdev->lldi.vr->qp.start, + rdev->lldi.vr->qp.size, + rdev->lldi.vr->cq.start, + rdev->lldi.vr->cq.size); + PDBG("udb len 0x%x udb base %p db_reg %p gts_reg %p qpshift %lu " + "qpmask 0x%x cqshift %lu cqmask 0x%x\n", + (unsigned)pci_resource_len(rdev->lldi.pdev, 2), + (void *)pci_resource_start(rdev->lldi.pdev, 2), + rdev->lldi.db_reg, + rdev->lldi.gts_reg, + rdev->qpshift, rdev->qpmask, + rdev->cqshift, rdev->cqmask); + + if (c4iw_num_stags(rdev) == 0) { + err = -EINVAL; + goto err1; + } + + rdev->stats.pd.total = T4_MAX_NUM_PD; + rdev->stats.stag.total = rdev->lldi.vr->stag.size; + rdev->stats.pbl.total = rdev->lldi.vr->pbl.size; + rdev->stats.rqt.total = rdev->lldi.vr->rq.size; + rdev->stats.ocqp.total = rdev->lldi.vr->ocq.size; + rdev->stats.qid.total = rdev->lldi.vr->qp.size; + + err = c4iw_init_resource(rdev, c4iw_num_stags(rdev), T4_MAX_NUM_PD); + if (err) { + printk(KERN_ERR MOD "error %d initializing resources\n", err); + goto err1; + } + err = c4iw_pblpool_create(rdev); + if (err) { + printk(KERN_ERR MOD "error %d initializing pbl pool\n", err); + goto err2; + } + err = c4iw_rqtpool_create(rdev); + if (err) { + printk(KERN_ERR MOD "error %d initializing rqt pool\n", err); + goto err3; + } + err = c4iw_ocqp_pool_create(rdev); + if (err) { + printk(KERN_ERR MOD "error %d initializing ocqp pool\n", err); + goto err4; + } + rdev->status_page = (struct t4_dev_status_page *) + __get_free_page(GFP_KERNEL); + if (!rdev->status_page) { + pr_err(MOD "error allocating status page\n"); + goto err4; + } + + if (c4iw_wr_log) { + rdev->wr_log = kzalloc((1 << c4iw_wr_log_size_order) * + sizeof(*rdev->wr_log), GFP_KERNEL); + if (rdev->wr_log) { + rdev->wr_log_size = 1 << c4iw_wr_log_size_order; + atomic_set(&rdev->wr_log_idx, 0); + } else { + pr_err(MOD "error allocating wr_log. 
Logging disabled\n"); + } + } + + rdev->status_page->db_off = 0; + + return 0; +err4: + c4iw_rqtpool_destroy(rdev); +err3: + c4iw_pblpool_destroy(rdev); +err2: + c4iw_destroy_resource(&rdev->resource); +err1: + return err; +} + +static void c4iw_rdev_close(struct c4iw_rdev *rdev) +{ + kfree(rdev->wr_log); + free_page((unsigned long)rdev->status_page); + c4iw_pblpool_destroy(rdev); + c4iw_rqtpool_destroy(rdev); + c4iw_destroy_resource(&rdev->resource); +} + +static void c4iw_dealloc(struct uld_ctx *ctx) +{ + c4iw_rdev_close(&ctx->dev->rdev); + idr_destroy(&ctx->dev->cqidr); + idr_destroy(&ctx->dev->qpidr); + idr_destroy(&ctx->dev->mmidr); + idr_destroy(&ctx->dev->hwtid_idr); + idr_destroy(&ctx->dev->stid_idr); + idr_destroy(&ctx->dev->atid_idr); + if (ctx->dev->rdev.bar2_kva) + iounmap(ctx->dev->rdev.bar2_kva); + if (ctx->dev->rdev.oc_mw_kva) + iounmap(ctx->dev->rdev.oc_mw_kva); + ib_dealloc_device(&ctx->dev->ibdev); + ctx->dev = NULL; +} + +static void c4iw_remove(struct uld_ctx *ctx) +{ + PDBG("%s c4iw_dev %p\n", __func__, ctx->dev); + c4iw_unregister_device(ctx->dev); + c4iw_dealloc(ctx); +} + +static int rdma_supported(const struct cxgb4_lld_info *infop) +{ + return infop->vr->stag.size > 0 && infop->vr->pbl.size > 0 && + infop->vr->rq.size > 0 && infop->vr->qp.size > 0 && + infop->vr->cq.size > 0; +} + +static struct c4iw_dev *c4iw_alloc(const struct cxgb4_lld_info *infop) +{ + struct c4iw_dev *devp; + int ret; + + if (!rdma_supported(infop)) { + printk(KERN_INFO MOD "%s: RDMA not supported on this device.\n", + pci_name(infop->pdev)); + return ERR_PTR(-ENOSYS); + } + if (!ocqp_supported(infop)) + pr_info("%s: On-Chip Queues not supported on this device.\n", + pci_name(infop->pdev)); + + devp = (struct c4iw_dev *)ib_alloc_device(sizeof(*devp)); + if (!devp) { + printk(KERN_ERR MOD "Cannot allocate ib device\n"); + return ERR_PTR(-ENOMEM); + } + devp->rdev.lldi = *infop; + + /* init various hw-queue params based on lld info */ + PDBG("%s: Ing. padding boundary is %d, egrsstatuspagesize = %d\n", + __func__, devp->rdev.lldi.sge_ingpadboundary, + devp->rdev.lldi.sge_egrstatuspagesize); + + devp->rdev.hw_queue.t4_eq_status_entries = + devp->rdev.lldi.sge_ingpadboundary > 64 ? 2 : 1; + devp->rdev.hw_queue.t4_max_eq_size = 65520; + devp->rdev.hw_queue.t4_max_iq_size = 65520; + devp->rdev.hw_queue.t4_max_rq_size = 8192 - + devp->rdev.hw_queue.t4_eq_status_entries - 1; + devp->rdev.hw_queue.t4_max_sq_size = + devp->rdev.hw_queue.t4_max_eq_size - + devp->rdev.hw_queue.t4_eq_status_entries - 1; + devp->rdev.hw_queue.t4_max_qp_depth = + devp->rdev.hw_queue.t4_max_rq_size; + devp->rdev.hw_queue.t4_max_cq_depth = + devp->rdev.hw_queue.t4_max_iq_size - 2; + devp->rdev.hw_queue.t4_stat_len = + devp->rdev.lldi.sge_egrstatuspagesize; + + /* + * For T5 devices, we map all of BAR2 with WC. + * For T4 devices with onchip qp mem, we map only that part + * of BAR2 with WC. 
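+ * (WC here refers to the write-combining mappings set up with ioremap_wc()
+ * below, which allows the CPU to merge doorbell writes.)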
+ */ + devp->rdev.bar2_pa = pci_resource_start(devp->rdev.lldi.pdev, 2); + if (is_t5(devp->rdev.lldi.adapter_type)) { + devp->rdev.bar2_kva = ioremap_wc(devp->rdev.bar2_pa, + pci_resource_len(devp->rdev.lldi.pdev, 2)); + if (!devp->rdev.bar2_kva) { + pr_err(MOD "Unable to ioremap BAR2\n"); + ib_dealloc_device(&devp->ibdev); + return ERR_PTR(-EINVAL); + } + } else if (ocqp_supported(infop)) { + devp->rdev.oc_mw_pa = + pci_resource_start(devp->rdev.lldi.pdev, 2) + + pci_resource_len(devp->rdev.lldi.pdev, 2) - + roundup_pow_of_two(devp->rdev.lldi.vr->ocq.size); + devp->rdev.oc_mw_kva = ioremap_wc(devp->rdev.oc_mw_pa, + devp->rdev.lldi.vr->ocq.size); + if (!devp->rdev.oc_mw_kva) { + pr_err(MOD "Unable to ioremap onchip mem\n"); + ib_dealloc_device(&devp->ibdev); + return ERR_PTR(-EINVAL); + } + } + + PDBG(KERN_INFO MOD "ocq memory: " + "hw_start 0x%x size %u mw_pa 0x%lx mw_kva %p\n", + devp->rdev.lldi.vr->ocq.start, devp->rdev.lldi.vr->ocq.size, + devp->rdev.oc_mw_pa, devp->rdev.oc_mw_kva); + + ret = c4iw_rdev_open(&devp->rdev); + if (ret) { + printk(KERN_ERR MOD "Unable to open CXIO rdev err %d\n", ret); + ib_dealloc_device(&devp->ibdev); + return ERR_PTR(ret); + } + + idr_init(&devp->cqidr); + idr_init(&devp->qpidr); + idr_init(&devp->mmidr); + idr_init(&devp->hwtid_idr); + idr_init(&devp->stid_idr); + idr_init(&devp->atid_idr); + spin_lock_init(&devp->lock); + mutex_init(&devp->rdev.stats.lock); + mutex_init(&devp->db_mutex); + INIT_LIST_HEAD(&devp->db_fc_list); + devp->avail_ird = devp->rdev.lldi.max_ird_adapter; + + if (c4iw_debugfs_root) { + devp->debugfs_root = debugfs_create_dir( + pci_name(devp->rdev.lldi.pdev), + c4iw_debugfs_root); + setup_debugfs(devp); + } + + + return devp; +} + +static void *c4iw_uld_add(const struct cxgb4_lld_info *infop) +{ + struct uld_ctx *ctx; + static int vers_printed; + int i; + + if (!vers_printed++) + pr_info("Chelsio T4/T5 RDMA Driver - version %s\n", + DRV_VERSION); + + ctx = kzalloc(sizeof *ctx, GFP_KERNEL); + if (!ctx) { + ctx = ERR_PTR(-ENOMEM); + goto out; + } + ctx->lldi = *infop; + + PDBG("%s found device %s nchan %u nrxq %u ntxq %u nports %u\n", + __func__, pci_name(ctx->lldi.pdev), + ctx->lldi.nchan, ctx->lldi.nrxq, + ctx->lldi.ntxq, ctx->lldi.nports); + + mutex_lock(&dev_mutex); + list_add_tail(&ctx->entry, &uld_ctx_list); + mutex_unlock(&dev_mutex); + + for (i = 0; i < ctx->lldi.nrxq; i++) + PDBG("rxqid[%u] %u\n", i, ctx->lldi.rxq_ids[i]); +out: + return ctx; +} + +static inline struct sk_buff *copy_gl_to_skb_pkt(const struct pkt_gl *gl, + const __be64 *rsp, + u32 pktshift) +{ + struct sk_buff *skb; + + /* + * Allocate space for cpl_pass_accept_req which will be synthesized by + * driver. Once the driver synthesizes the request the skb will go + * through the regular cpl_pass_accept_req processing. + * The math here assumes sizeof cpl_pass_accept_req >= sizeof + * cpl_rx_pkt. 
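+ * pktshift (the adapter's sge_pktshift) bytes at the start of the gather
+ * list are skipped when the packet data is copied below.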
+ */ + skb = alloc_skb(gl->tot_len + sizeof(struct cpl_pass_accept_req) + + sizeof(struct rss_header) - pktshift, GFP_ATOMIC); + if (unlikely(!skb)) + return NULL; + + __skb_put(skb, gl->tot_len + sizeof(struct cpl_pass_accept_req) + + sizeof(struct rss_header) - pktshift); + + /* + * This skb will contain: + * rss_header from the rspq descriptor (1 flit) + * cpl_rx_pkt struct from the rspq descriptor (2 flits) + * space for the difference between the size of an + * rx_pkt and pass_accept_req cpl (1 flit) + * the packet data from the gl + */ + skb_copy_to_linear_data(skb, rsp, sizeof(struct cpl_pass_accept_req) + + sizeof(struct rss_header)); + skb_copy_to_linear_data_offset(skb, sizeof(struct rss_header) + + sizeof(struct cpl_pass_accept_req), + gl->va + pktshift, + gl->tot_len - pktshift); + return skb; +} + +static inline int recv_rx_pkt(struct c4iw_dev *dev, const struct pkt_gl *gl, + const __be64 *rsp) +{ + unsigned int opcode = *(u8 *)rsp; + struct sk_buff *skb; + + if (opcode != CPL_RX_PKT) + goto out; + + skb = copy_gl_to_skb_pkt(gl , rsp, dev->rdev.lldi.sge_pktshift); + if (skb == NULL) + goto out; + + if (c4iw_handlers[opcode] == NULL) { + pr_info("%s no handler opcode 0x%x...\n", __func__, + opcode); + kfree_skb(skb); + goto out; + } + c4iw_handlers[opcode](dev, skb); + return 1; +out: + return 0; +} + +static int c4iw_uld_rx_handler(void *handle, const __be64 *rsp, + const struct pkt_gl *gl) +{ + struct uld_ctx *ctx = handle; + struct c4iw_dev *dev = ctx->dev; + struct sk_buff *skb; + u8 opcode; + + if (gl == NULL) { + /* omit RSS and rsp_ctrl at end of descriptor */ + unsigned int len = 64 - sizeof(struct rsp_ctrl) - 8; + + skb = alloc_skb(256, GFP_ATOMIC); + if (!skb) + goto nomem; + __skb_put(skb, len); + skb_copy_to_linear_data(skb, &rsp[1], len); + } else if (gl == CXGB4_MSG_AN) { + const struct rsp_ctrl *rc = (void *)rsp; + + u32 qid = be32_to_cpu(rc->pldbuflen_qid); + c4iw_ev_handler(dev, qid); + return 0; + } else if (unlikely(*(u8 *)rsp != *(u8 *)gl->va)) { + if (recv_rx_pkt(dev, gl, rsp)) + return 0; + + pr_info("%s: unexpected FL contents at %p, " \ + "RSS %#llx, FL %#llx, len %u\n", + pci_name(ctx->lldi.pdev), gl->va, + (unsigned long long)be64_to_cpu(*rsp), + (unsigned long long)be64_to_cpu( + *(__force __be64 *)gl->va), + gl->tot_len); + + return 0; + } else { + skb = cxgb4_pktgl_to_skb(gl, 128, 128); + if (unlikely(!skb)) + goto nomem; + } + + opcode = *(u8 *)rsp; + if (c4iw_handlers[opcode]) { + c4iw_handlers[opcode](dev, skb); + } else { + pr_info("%s no handler opcode 0x%x...\n", __func__, + opcode); + kfree_skb(skb); + } + + return 0; +nomem: + return -1; +} + +static int c4iw_uld_state_change(void *handle, enum cxgb4_state new_state) +{ + struct uld_ctx *ctx = handle; + + PDBG("%s new_state %u\n", __func__, new_state); + switch (new_state) { + case CXGB4_STATE_UP: + printk(KERN_INFO MOD "%s: Up\n", pci_name(ctx->lldi.pdev)); + if (!ctx->dev) { + int ret; + + ctx->dev = c4iw_alloc(&ctx->lldi); + if (IS_ERR(ctx->dev)) { + printk(KERN_ERR MOD + "%s: initialization failed: %ld\n", + pci_name(ctx->lldi.pdev), + PTR_ERR(ctx->dev)); + ctx->dev = NULL; + break; + } + ret = c4iw_register_device(ctx->dev); + if (ret) { + printk(KERN_ERR MOD + "%s: RDMA registration failed: %d\n", + pci_name(ctx->lldi.pdev), ret); + c4iw_dealloc(ctx); + } + } + break; + case CXGB4_STATE_DOWN: + printk(KERN_INFO MOD "%s: Down\n", + pci_name(ctx->lldi.pdev)); + if (ctx->dev) + c4iw_remove(ctx); + break; + case CXGB4_STATE_START_RECOVERY: + printk(KERN_INFO MOD "%s: Fatal Error\n", + 
pci_name(ctx->lldi.pdev)); + if (ctx->dev) { + struct ib_event event; + + ctx->dev->rdev.flags |= T4_FATAL_ERROR; + memset(&event, 0, sizeof event); + event.event = IB_EVENT_DEVICE_FATAL; + event.device = &ctx->dev->ibdev; + ib_dispatch_event(&event); + c4iw_remove(ctx); + } + break; + case CXGB4_STATE_DETACH: + printk(KERN_INFO MOD "%s: Detach\n", + pci_name(ctx->lldi.pdev)); + if (ctx->dev) + c4iw_remove(ctx); + break; + } + return 0; +} + +static int disable_qp_db(int id, void *p, void *data) +{ + struct c4iw_qp *qp = p; + + t4_disable_wq_db(&qp->wq); + return 0; +} + +static void stop_queues(struct uld_ctx *ctx) +{ + unsigned long flags; + + spin_lock_irqsave(&ctx->dev->lock, flags); + ctx->dev->rdev.stats.db_state_transitions++; + ctx->dev->db_state = STOPPED; + if (ctx->dev->rdev.flags & T4_STATUS_PAGE_DISABLED) + idr_for_each(&ctx->dev->qpidr, disable_qp_db, NULL); + else + ctx->dev->rdev.status_page->db_off = 1; + spin_unlock_irqrestore(&ctx->dev->lock, flags); +} + +static int enable_qp_db(int id, void *p, void *data) +{ + struct c4iw_qp *qp = p; + + t4_enable_wq_db(&qp->wq); + return 0; +} + +static void resume_rc_qp(struct c4iw_qp *qp) +{ + spin_lock(&qp->lock); + t4_ring_sq_db(&qp->wq, qp->wq.sq.wq_pidx_inc, + is_t5(qp->rhp->rdev.lldi.adapter_type), NULL); + qp->wq.sq.wq_pidx_inc = 0; + t4_ring_rq_db(&qp->wq, qp->wq.rq.wq_pidx_inc, + is_t5(qp->rhp->rdev.lldi.adapter_type), NULL); + qp->wq.rq.wq_pidx_inc = 0; + spin_unlock(&qp->lock); +} + +static void resume_a_chunk(struct uld_ctx *ctx) +{ + int i; + struct c4iw_qp *qp; + + for (i = 0; i < DB_FC_RESUME_SIZE; i++) { + qp = list_first_entry(&ctx->dev->db_fc_list, struct c4iw_qp, + db_fc_entry); + list_del_init(&qp->db_fc_entry); + resume_rc_qp(qp); + if (list_empty(&ctx->dev->db_fc_list)) + break; + } +} + +static void resume_queues(struct uld_ctx *ctx) +{ + spin_lock_irq(&ctx->dev->lock); + if (ctx->dev->db_state != STOPPED) + goto out; + ctx->dev->db_state = FLOW_CONTROL; + while (1) { + if (list_empty(&ctx->dev->db_fc_list)) { + WARN_ON(ctx->dev->db_state != FLOW_CONTROL); + ctx->dev->db_state = NORMAL; + ctx->dev->rdev.stats.db_state_transitions++; + if (ctx->dev->rdev.flags & T4_STATUS_PAGE_DISABLED) { + idr_for_each(&ctx->dev->qpidr, enable_qp_db, + NULL); + } else { + ctx->dev->rdev.status_page->db_off = 0; + } + break; + } else { + if (cxgb4_dbfifo_count(ctx->dev->rdev.lldi.ports[0], 1) + < (ctx->dev->rdev.lldi.dbfifo_int_thresh << + DB_FC_DRAIN_THRESH)) { + resume_a_chunk(ctx); + } + if (!list_empty(&ctx->dev->db_fc_list)) { + spin_unlock_irq(&ctx->dev->lock); + if (DB_FC_RESUME_DELAY) { + set_current_state(TASK_UNINTERRUPTIBLE); + schedule_timeout(DB_FC_RESUME_DELAY); + } + spin_lock_irq(&ctx->dev->lock); + if (ctx->dev->db_state != FLOW_CONTROL) + break; + } + } + } +out: + if (ctx->dev->db_state != NORMAL) + ctx->dev->rdev.stats.db_fc_interruptions++; + spin_unlock_irq(&ctx->dev->lock); +} + +struct qp_list { + unsigned idx; + struct c4iw_qp **qps; +}; + +static int add_and_ref_qp(int id, void *p, void *data) +{ + struct qp_list *qp_listp = data; + struct c4iw_qp *qp = p; + + c4iw_qp_add_ref(&qp->ibqp); + qp_listp->qps[qp_listp->idx++] = qp; + return 0; +} + +static int count_qps(int id, void *p, void *data) +{ + unsigned *countp = data; + (*countp)++; + return 0; +} + +static void deref_qps(struct qp_list *qp_list) +{ + int idx; + + for (idx = 0; idx < qp_list->idx; idx++) + c4iw_qp_rem_ref(&qp_list->qps[idx]->ibqp); +} + +static void recover_lost_dbs(struct uld_ctx *ctx, struct qp_list *qp_list) +{ + int idx; + int 
ret; + + for (idx = 0; idx < qp_list->idx; idx++) { + struct c4iw_qp *qp = qp_list->qps[idx]; + + spin_lock_irq(&qp->rhp->lock); + spin_lock(&qp->lock); + ret = cxgb4_sync_txq_pidx(qp->rhp->rdev.lldi.ports[0], + qp->wq.sq.qid, + t4_sq_host_wq_pidx(&qp->wq), + t4_sq_wq_size(&qp->wq)); + if (ret) { + pr_err(MOD "%s: Fatal error - " + "DB overflow recovery failed - " + "error syncing SQ qid %u\n", + pci_name(ctx->lldi.pdev), qp->wq.sq.qid); + spin_unlock(&qp->lock); + spin_unlock_irq(&qp->rhp->lock); + return; + } + qp->wq.sq.wq_pidx_inc = 0; + + ret = cxgb4_sync_txq_pidx(qp->rhp->rdev.lldi.ports[0], + qp->wq.rq.qid, + t4_rq_host_wq_pidx(&qp->wq), + t4_rq_wq_size(&qp->wq)); + + if (ret) { + pr_err(MOD "%s: Fatal error - " + "DB overflow recovery failed - " + "error syncing RQ qid %u\n", + pci_name(ctx->lldi.pdev), qp->wq.rq.qid); + spin_unlock(&qp->lock); + spin_unlock_irq(&qp->rhp->lock); + return; + } + qp->wq.rq.wq_pidx_inc = 0; + spin_unlock(&qp->lock); + spin_unlock_irq(&qp->rhp->lock); + + /* Wait for the dbfifo to drain */ + while (cxgb4_dbfifo_count(qp->rhp->rdev.lldi.ports[0], 1) > 0) { + set_current_state(TASK_UNINTERRUPTIBLE); + schedule_timeout(usecs_to_jiffies(10)); + } + } +} + +static void recover_queues(struct uld_ctx *ctx) +{ + int count = 0; + struct qp_list qp_list; + int ret; + + /* slow everybody down */ + set_current_state(TASK_UNINTERRUPTIBLE); + schedule_timeout(usecs_to_jiffies(1000)); + + /* flush the SGE contexts */ + ret = cxgb4_flush_eq_cache(ctx->dev->rdev.lldi.ports[0]); + if (ret) { + printk(KERN_ERR MOD "%s: Fatal error - DB overflow recovery failed\n", + pci_name(ctx->lldi.pdev)); + return; + } + + /* Count active queues so we can build a list of queues to recover */ + spin_lock_irq(&ctx->dev->lock); + WARN_ON(ctx->dev->db_state != STOPPED); + ctx->dev->db_state = RECOVERY; + idr_for_each(&ctx->dev->qpidr, count_qps, &count); + + qp_list.qps = kzalloc(count * sizeof *qp_list.qps, GFP_ATOMIC); + if (!qp_list.qps) { + printk(KERN_ERR MOD "%s: Fatal error - DB overflow recovery failed\n", + pci_name(ctx->lldi.pdev)); + spin_unlock_irq(&ctx->dev->lock); + return; + } + qp_list.idx = 0; + + /* add and ref each qp so it doesn't get freed */ + idr_for_each(&ctx->dev->qpidr, add_and_ref_qp, &qp_list); + + spin_unlock_irq(&ctx->dev->lock); + + /* now traverse the list in a safe context to recover the db state*/ + recover_lost_dbs(ctx, &qp_list); + + /* we're almost done! deref the qps and clean up */ + deref_qps(&qp_list); + kfree(qp_list.qps); + + spin_lock_irq(&ctx->dev->lock); + WARN_ON(ctx->dev->db_state != RECOVERY); + ctx->dev->db_state = STOPPED; + spin_unlock_irq(&ctx->dev->lock); +} + +static int c4iw_uld_control(void *handle, enum cxgb4_control control, ...) 
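+/* ULD control hook: the cxgb4 lower-level driver reports doorbell FIFO
+ * events here; DB_FULL/DB_EMPTY/DB_DROP drive the stop/resume/recover
+ * handling implemented above.
+ */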
+{ + struct uld_ctx *ctx = handle; + + switch (control) { + case CXGB4_CONTROL_DB_FULL: + stop_queues(ctx); + ctx->dev->rdev.stats.db_full++; + break; + case CXGB4_CONTROL_DB_EMPTY: + resume_queues(ctx); + mutex_lock(&ctx->dev->rdev.stats.lock); + ctx->dev->rdev.stats.db_empty++; + mutex_unlock(&ctx->dev->rdev.stats.lock); + break; + case CXGB4_CONTROL_DB_DROP: + recover_queues(ctx); + mutex_lock(&ctx->dev->rdev.stats.lock); + ctx->dev->rdev.stats.db_drop++; + mutex_unlock(&ctx->dev->rdev.stats.lock); + break; + default: + printk(KERN_WARNING MOD "%s: unknown control cmd %u\n", + pci_name(ctx->lldi.pdev), control); + break; + } + return 0; +} + +static struct cxgb4_uld_info c4iw_uld_info = { + .name = DRV_NAME, + .add = c4iw_uld_add, + .rx_handler = c4iw_uld_rx_handler, + .state_change = c4iw_uld_state_change, + .control = c4iw_uld_control, +}; + +static int __init c4iw_init_module(void) +{ + int err; + + err = c4iw_cm_init(); + if (err) + return err; + + c4iw_debugfs_root = debugfs_create_dir(DRV_NAME, NULL); + if (!c4iw_debugfs_root) + printk(KERN_WARNING MOD + "could not create debugfs entry, continuing\n"); + + if (ibnl_add_client(RDMA_NL_C4IW, RDMA_NL_IWPM_NUM_OPS, + c4iw_nl_cb_table)) + pr_err("%s[%u]: Failed to add netlink callback\n" + , __func__, __LINE__); + + err = iwpm_init(RDMA_NL_C4IW); + if (err) { + pr_err("port mapper initialization failed with %d\n", err); + ibnl_remove_client(RDMA_NL_C4IW); + c4iw_cm_term(); + debugfs_remove_recursive(c4iw_debugfs_root); + return err; + } + + cxgb4_register_uld(CXGB4_ULD_RDMA, &c4iw_uld_info); + + return 0; +} + +static void __exit c4iw_exit_module(void) +{ + struct uld_ctx *ctx, *tmp; + + mutex_lock(&dev_mutex); + list_for_each_entry_safe(ctx, tmp, &uld_ctx_list, entry) { + if (ctx->dev) + c4iw_remove(ctx); + kfree(ctx); + } + mutex_unlock(&dev_mutex); + cxgb4_unregister_uld(CXGB4_ULD_RDMA); + iwpm_exit(RDMA_NL_C4IW); + ibnl_remove_client(RDMA_NL_C4IW); + c4iw_cm_term(); + debugfs_remove_recursive(c4iw_debugfs_root); +} + +module_init(c4iw_init_module); +module_exit(c4iw_exit_module); diff --git a/kernel/drivers/infiniband/hw/cxgb4/ev.c b/kernel/drivers/infiniband/hw/cxgb4/ev.c new file mode 100644 index 000000000..bdfac2ccb --- /dev/null +++ b/kernel/drivers/infiniband/hw/cxgb4/ev.c @@ -0,0 +1,244 @@ +/* + * Copyright (c) 2009-2010 Chelsio, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include +#include +#include + +#include "iw_cxgb4.h" + +static void print_tpte(struct c4iw_dev *dev, u32 stag) +{ + int ret; + struct fw_ri_tpte tpte; + + ret = cxgb4_read_tpte(dev->rdev.lldi.ports[0], stag, + (__be32 *)&tpte); + if (ret) { + dev_err(&dev->rdev.lldi.pdev->dev, + "%s cxgb4_read_tpte err %d\n", __func__, ret); + return; + } + PDBG("stag idx 0x%x valid %d key 0x%x state %d pdid %d " + "perm 0x%x ps %d len 0x%llx va 0x%llx\n", + stag & 0xffffff00, + FW_RI_TPTE_VALID_G(ntohl(tpte.valid_to_pdid)), + FW_RI_TPTE_STAGKEY_G(ntohl(tpte.valid_to_pdid)), + FW_RI_TPTE_STAGSTATE_G(ntohl(tpte.valid_to_pdid)), + FW_RI_TPTE_PDID_G(ntohl(tpte.valid_to_pdid)), + FW_RI_TPTE_PERM_G(ntohl(tpte.locread_to_qpid)), + FW_RI_TPTE_PS_G(ntohl(tpte.locread_to_qpid)), + ((u64)ntohl(tpte.len_hi) << 32) | ntohl(tpte.len_lo), + ((u64)ntohl(tpte.va_hi) << 32) | ntohl(tpte.va_lo_fbo)); +} + +static void dump_err_cqe(struct c4iw_dev *dev, struct t4_cqe *err_cqe) +{ + __be64 *p = (void *)err_cqe; + + dev_err(&dev->rdev.lldi.pdev->dev, + "AE qpid %d opcode %d status 0x%x " + "type %d len 0x%x wrid.hi 0x%x wrid.lo 0x%x\n", + CQE_QPID(err_cqe), CQE_OPCODE(err_cqe), + CQE_STATUS(err_cqe), CQE_TYPE(err_cqe), ntohl(err_cqe->len), + CQE_WRID_HI(err_cqe), CQE_WRID_LOW(err_cqe)); + + PDBG("%016llx %016llx %016llx %016llx\n", + be64_to_cpu(p[0]), be64_to_cpu(p[1]), be64_to_cpu(p[2]), + be64_to_cpu(p[3])); + + /* + * Ingress WRITE and READ_RESP errors provide + * the offending stag, so parse and log it. + */ + if (RQ_TYPE(err_cqe) && (CQE_OPCODE(err_cqe) == FW_RI_RDMA_WRITE || + CQE_OPCODE(err_cqe) == FW_RI_READ_RESP)) + print_tpte(dev, CQE_WRID_STAG(err_cqe)); +} + +static void post_qp_event(struct c4iw_dev *dev, struct c4iw_cq *chp, + struct c4iw_qp *qhp, + struct t4_cqe *err_cqe, + enum ib_event_type ib_event) +{ + struct ib_event event; + struct c4iw_qp_attributes attrs; + unsigned long flag; + + dump_err_cqe(dev, err_cqe); + + if (qhp->attr.state == C4IW_QP_STATE_RTS) { + attrs.next_state = C4IW_QP_STATE_TERMINATE; + c4iw_modify_qp(qhp->rhp, qhp, C4IW_QP_ATTR_NEXT_STATE, + &attrs, 0); + } + + event.event = ib_event; + event.device = chp->ibcq.device; + if (ib_event == IB_EVENT_CQ_ERR) + event.element.cq = &chp->ibcq; + else + event.element.qp = &qhp->ibqp; + if (qhp->ibqp.event_handler) + (*qhp->ibqp.event_handler)(&event, qhp->ibqp.qp_context); + + spin_lock_irqsave(&chp->comp_handler_lock, flag); + (*chp->ibcq.comp_handler)(&chp->ibcq, chp->ibcq.cq_context); + spin_unlock_irqrestore(&chp->comp_handler_lock, flag); +} + +void c4iw_ev_dispatch(struct c4iw_dev *dev, struct t4_cqe *err_cqe) +{ + struct c4iw_cq *chp; + struct c4iw_qp *qhp; + u32 cqid; + + spin_lock_irq(&dev->lock); + qhp = get_qhp(dev, CQE_QPID(err_cqe)); + if (!qhp) { + printk(KERN_ERR MOD "BAD AE qpid 0x%x opcode %d " + "status 0x%x type %d wrid.hi 0x%x wrid.lo 0x%x\n", + CQE_QPID(err_cqe), + CQE_OPCODE(err_cqe), CQE_STATUS(err_cqe), + CQE_TYPE(err_cqe), CQE_WRID_HI(err_cqe), + CQE_WRID_LOW(err_cqe)); + spin_unlock_irq(&dev->lock); + goto out; + } + + if (SQ_TYPE(err_cqe)) + cqid = qhp->attr.scq; + else + cqid = qhp->attr.rcq; + chp = get_chp(dev, cqid); + if (!chp) { + printk(KERN_ERR MOD "BAD AE cqid 0x%x qpid 0x%x opcode %d " + "status 0x%x type %d wrid.hi 0x%x wrid.lo 
0x%x\n", + cqid, CQE_QPID(err_cqe), + CQE_OPCODE(err_cqe), CQE_STATUS(err_cqe), + CQE_TYPE(err_cqe), CQE_WRID_HI(err_cqe), + CQE_WRID_LOW(err_cqe)); + spin_unlock_irq(&dev->lock); + goto out; + } + + c4iw_qp_add_ref(&qhp->ibqp); + atomic_inc(&chp->refcnt); + spin_unlock_irq(&dev->lock); + + /* Bad incoming write */ + if (RQ_TYPE(err_cqe) && + (CQE_OPCODE(err_cqe) == FW_RI_RDMA_WRITE)) { + post_qp_event(dev, chp, qhp, err_cqe, IB_EVENT_QP_REQ_ERR); + goto done; + } + + switch (CQE_STATUS(err_cqe)) { + + /* Completion Events */ + case T4_ERR_SUCCESS: + printk(KERN_ERR MOD "AE with status 0!\n"); + break; + + case T4_ERR_STAG: + case T4_ERR_PDID: + case T4_ERR_QPID: + case T4_ERR_ACCESS: + case T4_ERR_WRAP: + case T4_ERR_BOUND: + case T4_ERR_INVALIDATE_SHARED_MR: + case T4_ERR_INVALIDATE_MR_WITH_MW_BOUND: + post_qp_event(dev, chp, qhp, err_cqe, IB_EVENT_QP_ACCESS_ERR); + break; + + /* Device Fatal Errors */ + case T4_ERR_ECC: + case T4_ERR_ECC_PSTAG: + case T4_ERR_INTERNAL_ERR: + post_qp_event(dev, chp, qhp, err_cqe, IB_EVENT_DEVICE_FATAL); + break; + + /* QP Fatal Errors */ + case T4_ERR_OUT_OF_RQE: + case T4_ERR_PBL_ADDR_BOUND: + case T4_ERR_CRC: + case T4_ERR_MARKER: + case T4_ERR_PDU_LEN_ERR: + case T4_ERR_DDP_VERSION: + case T4_ERR_RDMA_VERSION: + case T4_ERR_OPCODE: + case T4_ERR_DDP_QUEUE_NUM: + case T4_ERR_MSN: + case T4_ERR_TBIT: + case T4_ERR_MO: + case T4_ERR_MSN_GAP: + case T4_ERR_MSN_RANGE: + case T4_ERR_RQE_ADDR_BOUND: + case T4_ERR_IRD_OVERFLOW: + post_qp_event(dev, chp, qhp, err_cqe, IB_EVENT_QP_FATAL); + break; + + default: + printk(KERN_ERR MOD "Unknown T4 status 0x%x QPID 0x%x\n", + CQE_STATUS(err_cqe), qhp->wq.sq.qid); + post_qp_event(dev, chp, qhp, err_cqe, IB_EVENT_QP_FATAL); + break; + } +done: + if (atomic_dec_and_test(&chp->refcnt)) + wake_up(&chp->wait); + c4iw_qp_rem_ref(&qhp->ibqp); +out: + return; +} + +int c4iw_ev_handler(struct c4iw_dev *dev, u32 qid) +{ + struct c4iw_cq *chp; + unsigned long flag; + + spin_lock_irqsave(&dev->lock, flag); + chp = get_chp(dev, qid); + if (chp) { + atomic_inc(&chp->refcnt); + spin_unlock_irqrestore(&dev->lock, flag); + t4_clear_cq_armed(&chp->cq); + spin_lock_irqsave(&chp->comp_handler_lock, flag); + (*chp->ibcq.comp_handler)(&chp->ibcq, chp->ibcq.cq_context); + spin_unlock_irqrestore(&chp->comp_handler_lock, flag); + if (atomic_dec_and_test(&chp->refcnt)) + wake_up(&chp->wait); + } else { + PDBG("%s unknown cqid 0x%x\n", __func__, qid); + spin_unlock_irqrestore(&dev->lock, flag); + } + return 0; +} diff --git a/kernel/drivers/infiniband/hw/cxgb4/id_table.c b/kernel/drivers/infiniband/hw/cxgb4/id_table.c new file mode 100644 index 000000000..0161ae6ad --- /dev/null +++ b/kernel/drivers/infiniband/hw/cxgb4/id_table.c @@ -0,0 +1,112 @@ +/* + * Copyright (c) 2011 Chelsio Communications. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. 
+ * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include +#include +#include "iw_cxgb4.h" + +#define RANDOM_SKIP 16 + +/* + * Trivial bitmap-based allocator. If the random flag is set, the + * allocator is designed to: + * - pseudo-randomize the id returned such that it is not trivially predictable. + * - avoid reuse of recently used id (at the expense of predictability) + */ +u32 c4iw_id_alloc(struct c4iw_id_table *alloc) +{ + unsigned long flags; + u32 obj; + + spin_lock_irqsave(&alloc->lock, flags); + + obj = find_next_zero_bit(alloc->table, alloc->max, alloc->last); + if (obj >= alloc->max) + obj = find_first_zero_bit(alloc->table, alloc->max); + + if (obj < alloc->max) { + if (alloc->flags & C4IW_ID_TABLE_F_RANDOM) + alloc->last += prandom_u32() % RANDOM_SKIP; + else + alloc->last = obj + 1; + if (alloc->last >= alloc->max) + alloc->last = 0; + set_bit(obj, alloc->table); + obj += alloc->start; + } else + obj = -1; + + spin_unlock_irqrestore(&alloc->lock, flags); + return obj; +} + +void c4iw_id_free(struct c4iw_id_table *alloc, u32 obj) +{ + unsigned long flags; + + obj -= alloc->start; + BUG_ON((int)obj < 0); + + spin_lock_irqsave(&alloc->lock, flags); + clear_bit(obj, alloc->table); + spin_unlock_irqrestore(&alloc->lock, flags); +} + +int c4iw_id_table_alloc(struct c4iw_id_table *alloc, u32 start, u32 num, + u32 reserved, u32 flags) +{ + int i; + + alloc->start = start; + alloc->flags = flags; + if (flags & C4IW_ID_TABLE_F_RANDOM) + alloc->last = prandom_u32() % RANDOM_SKIP; + else + alloc->last = 0; + alloc->max = num; + spin_lock_init(&alloc->lock); + alloc->table = kmalloc(BITS_TO_LONGS(num) * sizeof(long), + GFP_KERNEL); + if (!alloc->table) + return -ENOMEM; + + bitmap_zero(alloc->table, num); + if (!(alloc->flags & C4IW_ID_TABLE_F_EMPTY)) + for (i = 0; i < reserved; ++i) + set_bit(i, alloc->table); + + return 0; +} + +void c4iw_id_table_free(struct c4iw_id_table *alloc) +{ + kfree(alloc->table); +} diff --git a/kernel/drivers/infiniband/hw/cxgb4/iw_cxgb4.h b/kernel/drivers/infiniband/hw/cxgb4/iw_cxgb4.h new file mode 100644 index 000000000..97bb5550a --- /dev/null +++ b/kernel/drivers/infiniband/hw/cxgb4/iw_cxgb4.h @@ -0,0 +1,1042 @@ +/* + * Copyright (c) 2009-2010 Chelsio, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. 
+ * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef __IW_CXGB4_H__ +#define __IW_CXGB4_H__ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include + +#include +#include +#include +#include + +#include "cxgb4.h" +#include "cxgb4_uld.h" +#include "l2t.h" +#include "user.h" + +#define DRV_NAME "iw_cxgb4" +#define MOD DRV_NAME ":" + +extern int c4iw_debug; +#define PDBG(fmt, args...) \ +do { \ + if (c4iw_debug) \ + printk(MOD fmt, ## args); \ +} while (0) + +#include "t4.h" + +#define PBL_OFF(rdev_p, a) ((a) - (rdev_p)->lldi.vr->pbl.start) +#define RQT_OFF(rdev_p, a) ((a) - (rdev_p)->lldi.vr->rq.start) + +static inline void *cplhdr(struct sk_buff *skb) +{ + return skb->data; +} + +#define C4IW_ID_TABLE_F_RANDOM 1 /* Pseudo-randomize the id's returned */ +#define C4IW_ID_TABLE_F_EMPTY 2 /* Table is initially empty */ + +struct c4iw_id_table { + u32 flags; + u32 start; /* logical minimal id */ + u32 last; /* hint for find */ + u32 max; + spinlock_t lock; + unsigned long *table; +}; + +struct c4iw_resource { + struct c4iw_id_table tpt_table; + struct c4iw_id_table qid_table; + struct c4iw_id_table pdid_table; +}; + +struct c4iw_qid_list { + struct list_head entry; + u32 qid; +}; + +struct c4iw_dev_ucontext { + struct list_head qpids; + struct list_head cqids; + struct mutex lock; +}; + +enum c4iw_rdev_flags { + T4_FATAL_ERROR = (1<<0), + T4_STATUS_PAGE_DISABLED = (1<<1), +}; + +struct c4iw_stat { + u64 total; + u64 cur; + u64 max; + u64 fail; +}; + +struct c4iw_stats { + struct mutex lock; + struct c4iw_stat qid; + struct c4iw_stat pd; + struct c4iw_stat stag; + struct c4iw_stat pbl; + struct c4iw_stat rqt; + struct c4iw_stat ocqp; + u64 db_full; + u64 db_empty; + u64 db_drop; + u64 db_state_transitions; + u64 db_fc_interruptions; + u64 tcam_full; + u64 act_ofld_conn_fails; + u64 pas_ofld_conn_fails; + u64 neg_adv; +}; + +struct c4iw_hw_queue { + int t4_eq_status_entries; + int t4_max_eq_size; + int t4_max_iq_size; + int t4_max_rq_size; + int t4_max_sq_size; + int t4_max_qp_depth; + int t4_max_cq_depth; + int t4_stat_len; +}; + +struct wr_log_entry { + struct timespec post_host_ts; + struct timespec poll_host_ts; + u64 post_sge_ts; + u64 cqe_sge_ts; + u64 poll_sge_ts; + u16 qid; + u16 wr_id; + u8 opcode; + u8 valid; +}; + +struct c4iw_rdev { + struct c4iw_resource resource; + unsigned long qpshift; + u32 qpmask; + unsigned long cqshift; + u32 cqmask; + struct c4iw_dev_ucontext uctx; + struct gen_pool *pbl_pool; + struct gen_pool *rqt_pool; + struct gen_pool *ocqp_pool; + u32 flags; + struct cxgb4_lld_info lldi; + unsigned long bar2_pa; + void __iomem *bar2_kva; + unsigned long oc_mw_pa; + void __iomem *oc_mw_kva; + struct c4iw_stats stats; + struct c4iw_hw_queue hw_queue; + struct t4_dev_status_page *status_page; + 
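+	/* optional work-request timing log; allocated in c4iw_rdev_open() when
+	 * the c4iw_wr_log module parameter is set
+	 */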
atomic_t wr_log_idx; + struct wr_log_entry *wr_log; + int wr_log_size; +}; + +static inline int c4iw_fatal_error(struct c4iw_rdev *rdev) +{ + return rdev->flags & T4_FATAL_ERROR; +} + +static inline int c4iw_num_stags(struct c4iw_rdev *rdev) +{ + return (int)(rdev->lldi.vr->stag.size >> 5); +} + +#define C4IW_WR_TO (60*HZ) + +struct c4iw_wr_wait { + struct completion completion; + int ret; +}; + +static inline void c4iw_init_wr_wait(struct c4iw_wr_wait *wr_waitp) +{ + wr_waitp->ret = 0; + init_completion(&wr_waitp->completion); +} + +static inline void c4iw_wake_up(struct c4iw_wr_wait *wr_waitp, int ret) +{ + wr_waitp->ret = ret; + complete(&wr_waitp->completion); +} + +static inline int c4iw_wait_for_reply(struct c4iw_rdev *rdev, + struct c4iw_wr_wait *wr_waitp, + u32 hwtid, u32 qpid, + const char *func) +{ + int ret; + + if (c4iw_fatal_error(rdev)) { + wr_waitp->ret = -EIO; + goto out; + } + + ret = wait_for_completion_timeout(&wr_waitp->completion, C4IW_WR_TO); + if (!ret) { + PDBG("%s - Device %s not responding (disabling device) - tid %u qpid %u\n", + func, pci_name(rdev->lldi.pdev), hwtid, qpid); + rdev->flags |= T4_FATAL_ERROR; + wr_waitp->ret = -EIO; + } +out: + if (wr_waitp->ret) + PDBG("%s: FW reply %d tid %u qpid %u\n", + pci_name(rdev->lldi.pdev), wr_waitp->ret, hwtid, qpid); + return wr_waitp->ret; +} + +enum db_state { + NORMAL = 0, + FLOW_CONTROL = 1, + RECOVERY = 2, + STOPPED = 3 +}; + +struct c4iw_dev { + struct ib_device ibdev; + struct c4iw_rdev rdev; + u32 device_cap_flags; + struct idr cqidr; + struct idr qpidr; + struct idr mmidr; + spinlock_t lock; + struct mutex db_mutex; + struct dentry *debugfs_root; + enum db_state db_state; + struct idr hwtid_idr; + struct idr atid_idr; + struct idr stid_idr; + struct list_head db_fc_list; + u32 avail_ird; +}; + +static inline struct c4iw_dev *to_c4iw_dev(struct ib_device *ibdev) +{ + return container_of(ibdev, struct c4iw_dev, ibdev); +} + +static inline struct c4iw_dev *rdev_to_c4iw_dev(struct c4iw_rdev *rdev) +{ + return container_of(rdev, struct c4iw_dev, rdev); +} + +static inline struct c4iw_cq *get_chp(struct c4iw_dev *rhp, u32 cqid) +{ + return idr_find(&rhp->cqidr, cqid); +} + +static inline struct c4iw_qp *get_qhp(struct c4iw_dev *rhp, u32 qpid) +{ + return idr_find(&rhp->qpidr, qpid); +} + +static inline struct c4iw_mr *get_mhp(struct c4iw_dev *rhp, u32 mmid) +{ + return idr_find(&rhp->mmidr, mmid); +} + +static inline int _insert_handle(struct c4iw_dev *rhp, struct idr *idr, + void *handle, u32 id, int lock) +{ + int ret; + + if (lock) { + idr_preload(GFP_KERNEL); + spin_lock_irq(&rhp->lock); + } + + ret = idr_alloc(idr, handle, id, id + 1, GFP_ATOMIC); + + if (lock) { + spin_unlock_irq(&rhp->lock); + idr_preload_end(); + } + + BUG_ON(ret == -ENOSPC); + return ret < 0 ? 
ret : 0; +} + +static inline int insert_handle(struct c4iw_dev *rhp, struct idr *idr, + void *handle, u32 id) +{ + return _insert_handle(rhp, idr, handle, id, 1); +} + +static inline int insert_handle_nolock(struct c4iw_dev *rhp, struct idr *idr, + void *handle, u32 id) +{ + return _insert_handle(rhp, idr, handle, id, 0); +} + +static inline void _remove_handle(struct c4iw_dev *rhp, struct idr *idr, + u32 id, int lock) +{ + if (lock) + spin_lock_irq(&rhp->lock); + idr_remove(idr, id); + if (lock) + spin_unlock_irq(&rhp->lock); +} + +static inline void remove_handle(struct c4iw_dev *rhp, struct idr *idr, u32 id) +{ + _remove_handle(rhp, idr, id, 1); +} + +static inline void remove_handle_nolock(struct c4iw_dev *rhp, + struct idr *idr, u32 id) +{ + _remove_handle(rhp, idr, id, 0); +} + +extern uint c4iw_max_read_depth; + +static inline int cur_max_read_depth(struct c4iw_dev *dev) +{ + return min(dev->rdev.lldi.max_ordird_qp, c4iw_max_read_depth); +} + +struct c4iw_pd { + struct ib_pd ibpd; + u32 pdid; + struct c4iw_dev *rhp; +}; + +static inline struct c4iw_pd *to_c4iw_pd(struct ib_pd *ibpd) +{ + return container_of(ibpd, struct c4iw_pd, ibpd); +} + +struct tpt_attributes { + u64 len; + u64 va_fbo; + enum fw_ri_mem_perms perms; + u32 stag; + u32 pdid; + u32 qpid; + u32 pbl_addr; + u32 pbl_size; + u32 state:1; + u32 type:2; + u32 rsvd:1; + u32 remote_invaliate_disable:1; + u32 zbva:1; + u32 mw_bind_enable:1; + u32 page_size:5; +}; + +struct c4iw_mr { + struct ib_mr ibmr; + struct ib_umem *umem; + struct c4iw_dev *rhp; + u64 kva; + struct tpt_attributes attr; +}; + +static inline struct c4iw_mr *to_c4iw_mr(struct ib_mr *ibmr) +{ + return container_of(ibmr, struct c4iw_mr, ibmr); +} + +struct c4iw_mw { + struct ib_mw ibmw; + struct c4iw_dev *rhp; + u64 kva; + struct tpt_attributes attr; +}; + +static inline struct c4iw_mw *to_c4iw_mw(struct ib_mw *ibmw) +{ + return container_of(ibmw, struct c4iw_mw, ibmw); +} + +struct c4iw_fr_page_list { + struct ib_fast_reg_page_list ibpl; + DEFINE_DMA_UNMAP_ADDR(mapping); + dma_addr_t dma_addr; + struct c4iw_dev *dev; + int pll_len; +}; + +static inline struct c4iw_fr_page_list *to_c4iw_fr_page_list( + struct ib_fast_reg_page_list *ibpl) +{ + return container_of(ibpl, struct c4iw_fr_page_list, ibpl); +} + +struct c4iw_cq { + struct ib_cq ibcq; + struct c4iw_dev *rhp; + struct t4_cq cq; + spinlock_t lock; + spinlock_t comp_handler_lock; + atomic_t refcnt; + wait_queue_head_t wait; +}; + +static inline struct c4iw_cq *to_c4iw_cq(struct ib_cq *ibcq) +{ + return container_of(ibcq, struct c4iw_cq, ibcq); +} + +struct c4iw_mpa_attributes { + u8 initiator; + u8 recv_marker_enabled; + u8 xmit_marker_enabled; + u8 crc_enabled; + u8 enhanced_rdma_conn; + u8 version; + u8 p2p_type; +}; + +struct c4iw_qp_attributes { + u32 scq; + u32 rcq; + u32 sq_num_entries; + u32 rq_num_entries; + u32 sq_max_sges; + u32 sq_max_sges_rdma_write; + u32 rq_max_sges; + u32 state; + u8 enable_rdma_read; + u8 enable_rdma_write; + u8 enable_bind; + u8 enable_mmid0_fastreg; + u32 max_ord; + u32 max_ird; + u32 pd; + u32 next_state; + char terminate_buffer[52]; + u32 terminate_msg_len; + u8 is_terminate_local; + struct c4iw_mpa_attributes mpa_attr; + struct c4iw_ep *llp_stream_handle; + u8 layer_etype; + u8 ecode; + u16 sq_db_inc; + u16 rq_db_inc; + u8 send_term; +}; + +struct c4iw_qp { + struct ib_qp ibqp; + struct list_head db_fc_entry; + struct c4iw_dev *rhp; + struct c4iw_ep *ep; + struct c4iw_qp_attributes attr; + struct t4_wq wq; + spinlock_t lock; + struct mutex mutex; + atomic_t refcnt; 
+ wait_queue_head_t wait; + struct timer_list timer; + int sq_sig_all; +}; + +static inline struct c4iw_qp *to_c4iw_qp(struct ib_qp *ibqp) +{ + return container_of(ibqp, struct c4iw_qp, ibqp); +} + +struct c4iw_ucontext { + struct ib_ucontext ibucontext; + struct c4iw_dev_ucontext uctx; + u32 key; + spinlock_t mmap_lock; + struct list_head mmaps; +}; + +static inline struct c4iw_ucontext *to_c4iw_ucontext(struct ib_ucontext *c) +{ + return container_of(c, struct c4iw_ucontext, ibucontext); +} + +struct c4iw_mm_entry { + struct list_head entry; + u64 addr; + u32 key; + unsigned len; +}; + +static inline struct c4iw_mm_entry *remove_mmap(struct c4iw_ucontext *ucontext, + u32 key, unsigned len) +{ + struct list_head *pos, *nxt; + struct c4iw_mm_entry *mm; + + spin_lock(&ucontext->mmap_lock); + list_for_each_safe(pos, nxt, &ucontext->mmaps) { + + mm = list_entry(pos, struct c4iw_mm_entry, entry); + if (mm->key == key && mm->len == len) { + list_del_init(&mm->entry); + spin_unlock(&ucontext->mmap_lock); + PDBG("%s key 0x%x addr 0x%llx len %d\n", __func__, + key, (unsigned long long) mm->addr, mm->len); + return mm; + } + } + spin_unlock(&ucontext->mmap_lock); + return NULL; +} + +static inline void insert_mmap(struct c4iw_ucontext *ucontext, + struct c4iw_mm_entry *mm) +{ + spin_lock(&ucontext->mmap_lock); + PDBG("%s key 0x%x addr 0x%llx len %d\n", __func__, + mm->key, (unsigned long long) mm->addr, mm->len); + list_add_tail(&mm->entry, &ucontext->mmaps); + spin_unlock(&ucontext->mmap_lock); +} + +enum c4iw_qp_attr_mask { + C4IW_QP_ATTR_NEXT_STATE = 1 << 0, + C4IW_QP_ATTR_SQ_DB = 1<<1, + C4IW_QP_ATTR_RQ_DB = 1<<2, + C4IW_QP_ATTR_ENABLE_RDMA_READ = 1 << 7, + C4IW_QP_ATTR_ENABLE_RDMA_WRITE = 1 << 8, + C4IW_QP_ATTR_ENABLE_RDMA_BIND = 1 << 9, + C4IW_QP_ATTR_MAX_ORD = 1 << 11, + C4IW_QP_ATTR_MAX_IRD = 1 << 12, + C4IW_QP_ATTR_LLP_STREAM_HANDLE = 1 << 22, + C4IW_QP_ATTR_STREAM_MSG_BUFFER = 1 << 23, + C4IW_QP_ATTR_MPA_ATTR = 1 << 24, + C4IW_QP_ATTR_QP_CONTEXT_ACTIVATE = 1 << 25, + C4IW_QP_ATTR_VALID_MODIFY = (C4IW_QP_ATTR_ENABLE_RDMA_READ | + C4IW_QP_ATTR_ENABLE_RDMA_WRITE | + C4IW_QP_ATTR_MAX_ORD | + C4IW_QP_ATTR_MAX_IRD | + C4IW_QP_ATTR_LLP_STREAM_HANDLE | + C4IW_QP_ATTR_STREAM_MSG_BUFFER | + C4IW_QP_ATTR_MPA_ATTR | + C4IW_QP_ATTR_QP_CONTEXT_ACTIVATE) +}; + +int c4iw_modify_qp(struct c4iw_dev *rhp, + struct c4iw_qp *qhp, + enum c4iw_qp_attr_mask mask, + struct c4iw_qp_attributes *attrs, + int internal); + +enum c4iw_qp_state { + C4IW_QP_STATE_IDLE, + C4IW_QP_STATE_RTS, + C4IW_QP_STATE_ERROR, + C4IW_QP_STATE_TERMINATE, + C4IW_QP_STATE_CLOSING, + C4IW_QP_STATE_TOT +}; + +static inline int c4iw_convert_state(enum ib_qp_state ib_state) +{ + switch (ib_state) { + case IB_QPS_RESET: + case IB_QPS_INIT: + return C4IW_QP_STATE_IDLE; + case IB_QPS_RTS: + return C4IW_QP_STATE_RTS; + case IB_QPS_SQD: + return C4IW_QP_STATE_CLOSING; + case IB_QPS_SQE: + return C4IW_QP_STATE_TERMINATE; + case IB_QPS_ERR: + return C4IW_QP_STATE_ERROR; + default: + return -1; + } +} + +static inline int to_ib_qp_state(int c4iw_qp_state) +{ + switch (c4iw_qp_state) { + case C4IW_QP_STATE_IDLE: + return IB_QPS_INIT; + case C4IW_QP_STATE_RTS: + return IB_QPS_RTS; + case C4IW_QP_STATE_CLOSING: + return IB_QPS_SQD; + case C4IW_QP_STATE_TERMINATE: + return IB_QPS_SQE; + case C4IW_QP_STATE_ERROR: + return IB_QPS_ERR; + } + return IB_QPS_ERR; +} + +static inline u32 c4iw_ib_to_tpt_access(int a) +{ + return (a & IB_ACCESS_REMOTE_WRITE ? FW_RI_MEM_ACCESS_REM_WRITE : 0) | + (a & IB_ACCESS_REMOTE_READ ? 
FW_RI_MEM_ACCESS_REM_READ : 0) | + (a & IB_ACCESS_LOCAL_WRITE ? FW_RI_MEM_ACCESS_LOCAL_WRITE : 0) | + FW_RI_MEM_ACCESS_LOCAL_READ; +} + +static inline u32 c4iw_ib_to_tpt_bind_access(int acc) +{ + return (acc & IB_ACCESS_REMOTE_WRITE ? FW_RI_MEM_ACCESS_REM_WRITE : 0) | + (acc & IB_ACCESS_REMOTE_READ ? FW_RI_MEM_ACCESS_REM_READ : 0); +} + +enum c4iw_mmid_state { + C4IW_STAG_STATE_VALID, + C4IW_STAG_STATE_INVALID +}; + +#define C4IW_NODE_DESC "cxgb4 Chelsio Communications" + +#define MPA_KEY_REQ "MPA ID Req Frame" +#define MPA_KEY_REP "MPA ID Rep Frame" + +#define MPA_MAX_PRIVATE_DATA 256 +#define MPA_ENHANCED_RDMA_CONN 0x10 +#define MPA_REJECT 0x20 +#define MPA_CRC 0x40 +#define MPA_MARKERS 0x80 +#define MPA_FLAGS_MASK 0xE0 + +#define MPA_V2_PEER2PEER_MODEL 0x8000 +#define MPA_V2_ZERO_LEN_FPDU_RTR 0x4000 +#define MPA_V2_RDMA_WRITE_RTR 0x8000 +#define MPA_V2_RDMA_READ_RTR 0x4000 +#define MPA_V2_IRD_ORD_MASK 0x3FFF + +#define c4iw_put_ep(ep) { \ + PDBG("put_ep (via %s:%u) ep %p refcnt %d\n", __func__, __LINE__, \ + ep, atomic_read(&((ep)->kref.refcount))); \ + WARN_ON(atomic_read(&((ep)->kref.refcount)) < 1); \ + kref_put(&((ep)->kref), _c4iw_free_ep); \ +} + +#define c4iw_get_ep(ep) { \ + PDBG("get_ep (via %s:%u) ep %p, refcnt %d\n", __func__, __LINE__, \ + ep, atomic_read(&((ep)->kref.refcount))); \ + kref_get(&((ep)->kref)); \ +} +void _c4iw_free_ep(struct kref *kref); + +struct mpa_message { + u8 key[16]; + u8 flags; + u8 revision; + __be16 private_data_size; + u8 private_data[0]; +}; + +struct mpa_v2_conn_params { + __be16 ird; + __be16 ord; +}; + +struct terminate_message { + u8 layer_etype; + u8 ecode; + __be16 hdrct_rsvd; + u8 len_hdrs[0]; +}; + +#define TERM_MAX_LENGTH (sizeof(struct terminate_message) + 2 + 18 + 28) + +enum c4iw_layers_types { + LAYER_RDMAP = 0x00, + LAYER_DDP = 0x10, + LAYER_MPA = 0x20, + RDMAP_LOCAL_CATA = 0x00, + RDMAP_REMOTE_PROT = 0x01, + RDMAP_REMOTE_OP = 0x02, + DDP_LOCAL_CATA = 0x00, + DDP_TAGGED_ERR = 0x01, + DDP_UNTAGGED_ERR = 0x02, + DDP_LLP = 0x03 +}; + +enum c4iw_rdma_ecodes { + RDMAP_INV_STAG = 0x00, + RDMAP_BASE_BOUNDS = 0x01, + RDMAP_ACC_VIOL = 0x02, + RDMAP_STAG_NOT_ASSOC = 0x03, + RDMAP_TO_WRAP = 0x04, + RDMAP_INV_VERS = 0x05, + RDMAP_INV_OPCODE = 0x06, + RDMAP_STREAM_CATA = 0x07, + RDMAP_GLOBAL_CATA = 0x08, + RDMAP_CANT_INV_STAG = 0x09, + RDMAP_UNSPECIFIED = 0xff +}; + +enum c4iw_ddp_ecodes { + DDPT_INV_STAG = 0x00, + DDPT_BASE_BOUNDS = 0x01, + DDPT_STAG_NOT_ASSOC = 0x02, + DDPT_TO_WRAP = 0x03, + DDPT_INV_VERS = 0x04, + DDPU_INV_QN = 0x01, + DDPU_INV_MSN_NOBUF = 0x02, + DDPU_INV_MSN_RANGE = 0x03, + DDPU_INV_MO = 0x04, + DDPU_MSG_TOOBIG = 0x05, + DDPU_INV_VERS = 0x06 +}; + +enum c4iw_mpa_ecodes { + MPA_CRC_ERR = 0x02, + MPA_MARKER_ERR = 0x03, + MPA_LOCAL_CATA = 0x05, + MPA_INSUFF_IRD = 0x06, + MPA_NOMATCH_RTR = 0x07, +}; + +enum c4iw_ep_state { + IDLE = 0, + LISTEN, + CONNECTING, + MPA_REQ_WAIT, + MPA_REQ_SENT, + MPA_REQ_RCVD, + MPA_REP_SENT, + FPDU_MODE, + ABORTING, + CLOSING, + MORIBUND, + DEAD, +}; + +enum c4iw_ep_flags { + PEER_ABORT_IN_PROGRESS = 0, + ABORT_REQ_IN_PROGRESS = 1, + RELEASE_RESOURCES = 2, + CLOSE_SENT = 3, + TIMEOUT = 4, + QP_REFERENCED = 5, + RELEASE_MAPINFO = 6, +}; + +enum c4iw_ep_history { + ACT_OPEN_REQ = 0, + ACT_OFLD_CONN = 1, + ACT_OPEN_RPL = 2, + ACT_ESTAB = 3, + PASS_ACCEPT_REQ = 4, + PASS_ESTAB = 5, + ABORT_UPCALL = 6, + ESTAB_UPCALL = 7, + CLOSE_UPCALL = 8, + ULP_ACCEPT = 9, + ULP_REJECT = 10, + TIMEDOUT = 11, + PEER_ABORT = 12, + PEER_CLOSE = 13, + CONNREQ_UPCALL = 14, + ABORT_CONN = 15, + DISCONN_UPCALL = 16, + 
EP_DISC_CLOSE = 17, + EP_DISC_ABORT = 18, + CONN_RPL_UPCALL = 19, + ACT_RETRY_NOMEM = 20, + ACT_RETRY_INUSE = 21 +}; + +struct c4iw_ep_common { + struct iw_cm_id *cm_id; + struct c4iw_qp *qp; + struct c4iw_dev *dev; + enum c4iw_ep_state state; + struct kref kref; + struct mutex mutex; + struct sockaddr_storage local_addr; + struct sockaddr_storage remote_addr; + struct sockaddr_storage mapped_local_addr; + struct sockaddr_storage mapped_remote_addr; + struct c4iw_wr_wait wr_wait; + unsigned long flags; + unsigned long history; +}; + +struct c4iw_listen_ep { + struct c4iw_ep_common com; + unsigned int stid; + int backlog; +}; + +struct c4iw_ep_stats { + unsigned connect_neg_adv; + unsigned abort_neg_adv; +}; + +struct c4iw_ep { + struct c4iw_ep_common com; + struct c4iw_ep *parent_ep; + struct timer_list timer; + struct list_head entry; + unsigned int atid; + u32 hwtid; + u32 snd_seq; + u32 rcv_seq; + struct l2t_entry *l2t; + struct dst_entry *dst; + struct sk_buff *mpa_skb; + struct c4iw_mpa_attributes mpa_attr; + u8 mpa_pkt[sizeof(struct mpa_message) + MPA_MAX_PRIVATE_DATA]; + unsigned int mpa_pkt_len; + u32 ird; + u32 ord; + u32 smac_idx; + u32 tx_chan; + u32 mtu; + u16 mss; + u16 emss; + u16 plen; + u16 rss_qid; + u16 txq_idx; + u16 ctrlq_idx; + u8 tos; + u8 retry_with_mpa_v1; + u8 tried_with_mpa_v1; + unsigned int retry_count; + int snd_win; + int rcv_win; + struct c4iw_ep_stats stats; +}; + +static inline void print_addr(struct c4iw_ep_common *epc, const char *func, + const char *msg) +{ + +#define SINA(a) (&(((struct sockaddr_in *)(a))->sin_addr.s_addr)) +#define SINP(a) ntohs(((struct sockaddr_in *)(a))->sin_port) +#define SIN6A(a) (&(((struct sockaddr_in6 *)(a))->sin6_addr)) +#define SIN6P(a) ntohs(((struct sockaddr_in6 *)(a))->sin6_port) + + if (c4iw_debug) { + switch (epc->local_addr.ss_family) { + case AF_INET: + PDBG("%s %s %pI4:%u/%u <-> %pI4:%u/%u\n", + func, msg, SINA(&epc->local_addr), + SINP(&epc->local_addr), + SINP(&epc->mapped_local_addr), + SINA(&epc->remote_addr), + SINP(&epc->remote_addr), + SINP(&epc->mapped_remote_addr)); + break; + case AF_INET6: + PDBG("%s %s %pI6:%u/%u <-> %pI6:%u/%u\n", + func, msg, SIN6A(&epc->local_addr), + SIN6P(&epc->local_addr), + SIN6P(&epc->mapped_local_addr), + SIN6A(&epc->remote_addr), + SIN6P(&epc->remote_addr), + SIN6P(&epc->mapped_remote_addr)); + break; + default: + break; + } + } +#undef SINA +#undef SINP +#undef SIN6A +#undef SIN6P +} + +static inline struct c4iw_ep *to_ep(struct iw_cm_id *cm_id) +{ + return cm_id->provider_data; +} + +static inline struct c4iw_listen_ep *to_listen_ep(struct iw_cm_id *cm_id) +{ + return cm_id->provider_data; +} + +static inline int compute_wscale(int win) +{ + int wscale = 0; + + while (wscale < 14 && (65535<vr->ocq.size > 0; +#else + return 0; +#endif +} + +u32 c4iw_id_alloc(struct c4iw_id_table *alloc); +void c4iw_id_free(struct c4iw_id_table *alloc, u32 obj); +int c4iw_id_table_alloc(struct c4iw_id_table *alloc, u32 start, u32 num, + u32 reserved, u32 flags); +void c4iw_id_table_free(struct c4iw_id_table *alloc); + +typedef int (*c4iw_handler_func)(struct c4iw_dev *dev, struct sk_buff *skb); + +int c4iw_ep_redirect(void *ctx, struct dst_entry *old, struct dst_entry *new, + struct l2t_entry *l2t); +void c4iw_put_qpid(struct c4iw_rdev *rdev, u32 qpid, + struct c4iw_dev_ucontext *uctx); +u32 c4iw_get_resource(struct c4iw_id_table *id_table); +void c4iw_put_resource(struct c4iw_id_table *id_table, u32 entry); +int c4iw_init_resource(struct c4iw_rdev *rdev, u32 nr_tpt, u32 nr_pdid); +int 
c4iw_init_ctrl_qp(struct c4iw_rdev *rdev); +int c4iw_pblpool_create(struct c4iw_rdev *rdev); +int c4iw_rqtpool_create(struct c4iw_rdev *rdev); +int c4iw_ocqp_pool_create(struct c4iw_rdev *rdev); +void c4iw_pblpool_destroy(struct c4iw_rdev *rdev); +void c4iw_rqtpool_destroy(struct c4iw_rdev *rdev); +void c4iw_ocqp_pool_destroy(struct c4iw_rdev *rdev); +void c4iw_destroy_resource(struct c4iw_resource *rscp); +int c4iw_destroy_ctrl_qp(struct c4iw_rdev *rdev); +int c4iw_register_device(struct c4iw_dev *dev); +void c4iw_unregister_device(struct c4iw_dev *dev); +int __init c4iw_cm_init(void); +void c4iw_cm_term(void); +void c4iw_release_dev_ucontext(struct c4iw_rdev *rdev, + struct c4iw_dev_ucontext *uctx); +void c4iw_init_dev_ucontext(struct c4iw_rdev *rdev, + struct c4iw_dev_ucontext *uctx); +int c4iw_poll_cq(struct ib_cq *ibcq, int num_entries, struct ib_wc *wc); +int c4iw_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr, + struct ib_send_wr **bad_wr); +int c4iw_post_receive(struct ib_qp *ibqp, struct ib_recv_wr *wr, + struct ib_recv_wr **bad_wr); +int c4iw_bind_mw(struct ib_qp *qp, struct ib_mw *mw, + struct ib_mw_bind *mw_bind); +int c4iw_connect(struct iw_cm_id *cm_id, struct iw_cm_conn_param *conn_param); +int c4iw_create_listen(struct iw_cm_id *cm_id, int backlog); +int c4iw_destroy_listen(struct iw_cm_id *cm_id); +int c4iw_accept_cr(struct iw_cm_id *cm_id, struct iw_cm_conn_param *conn_param); +int c4iw_reject_cr(struct iw_cm_id *cm_id, const void *pdata, u8 pdata_len); +void c4iw_qp_add_ref(struct ib_qp *qp); +void c4iw_qp_rem_ref(struct ib_qp *qp); +void c4iw_free_fastreg_pbl(struct ib_fast_reg_page_list *page_list); +struct ib_fast_reg_page_list *c4iw_alloc_fastreg_pbl( + struct ib_device *device, + int page_list_len); +struct ib_mr *c4iw_alloc_fast_reg_mr(struct ib_pd *pd, int pbl_depth); +int c4iw_dealloc_mw(struct ib_mw *mw); +struct ib_mw *c4iw_alloc_mw(struct ib_pd *pd, enum ib_mw_type type); +struct ib_mr *c4iw_reg_user_mr(struct ib_pd *pd, u64 start, + u64 length, u64 virt, int acc, + struct ib_udata *udata); +struct ib_mr *c4iw_get_dma_mr(struct ib_pd *pd, int acc); +struct ib_mr *c4iw_register_phys_mem(struct ib_pd *pd, + struct ib_phys_buf *buffer_list, + int num_phys_buf, + int acc, + u64 *iova_start); +int c4iw_reregister_phys_mem(struct ib_mr *mr, + int mr_rereg_mask, + struct ib_pd *pd, + struct ib_phys_buf *buffer_list, + int num_phys_buf, + int acc, u64 *iova_start); +int c4iw_dereg_mr(struct ib_mr *ib_mr); +int c4iw_destroy_cq(struct ib_cq *ib_cq); +struct ib_cq *c4iw_create_cq(struct ib_device *ibdev, int entries, + int vector, + struct ib_ucontext *ib_context, + struct ib_udata *udata); +int c4iw_resize_cq(struct ib_cq *cq, int cqe, struct ib_udata *udata); +int c4iw_arm_cq(struct ib_cq *ibcq, enum ib_cq_notify_flags flags); +int c4iw_destroy_qp(struct ib_qp *ib_qp); +struct ib_qp *c4iw_create_qp(struct ib_pd *pd, + struct ib_qp_init_attr *attrs, + struct ib_udata *udata); +int c4iw_ib_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, + int attr_mask, struct ib_udata *udata); +int c4iw_ib_query_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, + int attr_mask, struct ib_qp_init_attr *init_attr); +struct ib_qp *c4iw_get_qp(struct ib_device *dev, int qpn); +u32 c4iw_rqtpool_alloc(struct c4iw_rdev *rdev, int size); +void c4iw_rqtpool_free(struct c4iw_rdev *rdev, u32 addr, int size); +u32 c4iw_pblpool_alloc(struct c4iw_rdev *rdev, int size); +void c4iw_pblpool_free(struct c4iw_rdev *rdev, u32 addr, int size); +u32 c4iw_ocqp_pool_alloc(struct c4iw_rdev *rdev, int 
size); +void c4iw_ocqp_pool_free(struct c4iw_rdev *rdev, u32 addr, int size); +int c4iw_ofld_send(struct c4iw_rdev *rdev, struct sk_buff *skb); +void c4iw_flush_hw_cq(struct c4iw_cq *chp); +void c4iw_count_rcqes(struct t4_cq *cq, struct t4_wq *wq, int *count); +int c4iw_ep_disconnect(struct c4iw_ep *ep, int abrupt, gfp_t gfp); +int c4iw_flush_rq(struct t4_wq *wq, struct t4_cq *cq, int count); +int c4iw_flush_sq(struct c4iw_qp *qhp); +int c4iw_ev_handler(struct c4iw_dev *rnicp, u32 qid); +u16 c4iw_rqes_posted(struct c4iw_qp *qhp); +int c4iw_post_terminate(struct c4iw_qp *qhp, struct t4_cqe *err_cqe); +u32 c4iw_get_cqid(struct c4iw_rdev *rdev, struct c4iw_dev_ucontext *uctx); +void c4iw_put_cqid(struct c4iw_rdev *rdev, u32 qid, + struct c4iw_dev_ucontext *uctx); +u32 c4iw_get_qpid(struct c4iw_rdev *rdev, struct c4iw_dev_ucontext *uctx); +void c4iw_put_qpid(struct c4iw_rdev *rdev, u32 qid, + struct c4iw_dev_ucontext *uctx); +void c4iw_ev_dispatch(struct c4iw_dev *dev, struct t4_cqe *err_cqe); + +extern struct cxgb4_client t4c_client; +extern c4iw_handler_func c4iw_handlers[NUM_CPL_CMDS]; +extern void c4iw_log_wr_stats(struct t4_wq *wq, struct t4_cqe *cqe); +extern int c4iw_wr_log; +extern int db_fc_threshold; +extern int db_coalescing_threshold; +extern int use_dsgl; + + +#endif diff --git a/kernel/drivers/infiniband/hw/cxgb4/mem.c b/kernel/drivers/infiniband/hw/cxgb4/mem.c new file mode 100644 index 000000000..cff815b91 --- /dev/null +++ b/kernel/drivers/infiniband/hw/cxgb4/mem.c @@ -0,0 +1,979 @@ +/* + * Copyright (c) 2009-2010 Chelsio, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include +#include +#include +#include + +#include "iw_cxgb4.h" + +int use_dsgl = 0; +module_param(use_dsgl, int, 0644); +MODULE_PARM_DESC(use_dsgl, "Use DSGL for PBL/FastReg (default=0)"); + +#define T4_ULPTX_MIN_IO 32 +#define C4IW_MAX_INLINE_SIZE 96 +#define T4_ULPTX_MAX_DMA 1024 +#define C4IW_INLINE_THRESHOLD 128 + +static int inline_threshold = C4IW_INLINE_THRESHOLD; +module_param(inline_threshold, int, 0644); +MODULE_PARM_DESC(inline_threshold, "inline vs dsgl threshold (default=128)"); + +static int mr_exceeds_hw_limits(struct c4iw_dev *dev, u64 length) +{ + return (is_t4(dev->rdev.lldi.adapter_type) || + is_t5(dev->rdev.lldi.adapter_type)) && + length >= 8*1024*1024*1024ULL; +} + +static int _c4iw_write_mem_dma_aligned(struct c4iw_rdev *rdev, u32 addr, + u32 len, dma_addr_t data, int wait) +{ + struct sk_buff *skb; + struct ulp_mem_io *req; + struct ulptx_sgl *sgl; + u8 wr_len; + int ret = 0; + struct c4iw_wr_wait wr_wait; + + addr &= 0x7FFFFFF; + + if (wait) + c4iw_init_wr_wait(&wr_wait); + wr_len = roundup(sizeof(*req) + sizeof(*sgl), 16); + + skb = alloc_skb(wr_len, GFP_KERNEL); + if (!skb) + return -ENOMEM; + set_wr_txq(skb, CPL_PRIORITY_CONTROL, 0); + + req = (struct ulp_mem_io *)__skb_put(skb, wr_len); + memset(req, 0, wr_len); + INIT_ULPTX_WR(req, wr_len, 0, 0); + req->wr.wr_hi = cpu_to_be32(FW_WR_OP_V(FW_ULPTX_WR) | + (wait ? FW_WR_COMPL_F : 0)); + req->wr.wr_lo = wait ? (__force __be64)(unsigned long) &wr_wait : 0L; + req->wr.wr_mid = cpu_to_be32(FW_WR_LEN16_V(DIV_ROUND_UP(wr_len, 16))); + req->cmd = cpu_to_be32(ULPTX_CMD_V(ULP_TX_MEM_WRITE)); + req->cmd |= cpu_to_be32(T5_ULP_MEMIO_ORDER_V(1)); + req->dlen = cpu_to_be32(ULP_MEMIO_DATA_LEN_V(len>>5)); + req->len16 = cpu_to_be32(DIV_ROUND_UP(wr_len-sizeof(req->wr), 16)); + req->lock_addr = cpu_to_be32(ULP_MEMIO_ADDR_V(addr)); + + sgl = (struct ulptx_sgl *)(req + 1); + sgl->cmd_nsge = cpu_to_be32(ULPTX_CMD_V(ULP_TX_SC_DSGL) | + ULPTX_NSGE_V(1)); + sgl->len0 = cpu_to_be32(len); + sgl->addr0 = cpu_to_be64(data); + + ret = c4iw_ofld_send(rdev, skb); + if (ret) + return ret; + if (wait) + ret = c4iw_wait_for_reply(rdev, &wr_wait, 0, 0, __func__); + return ret; +} + +static int _c4iw_write_mem_inline(struct c4iw_rdev *rdev, u32 addr, u32 len, + void *data) +{ + struct sk_buff *skb; + struct ulp_mem_io *req; + struct ulptx_idata *sc; + u8 wr_len, *to_dp, *from_dp; + int copy_len, num_wqe, i, ret = 0; + struct c4iw_wr_wait wr_wait; + __be32 cmd = cpu_to_be32(ULPTX_CMD_V(ULP_TX_MEM_WRITE)); + + if (is_t4(rdev->lldi.adapter_type)) + cmd |= cpu_to_be32(ULP_MEMIO_ORDER_F); + else + cmd |= cpu_to_be32(T5_ULP_MEMIO_IMM_F); + + addr &= 0x7FFFFFF; + PDBG("%s addr 0x%x len %u\n", __func__, addr, len); + num_wqe = DIV_ROUND_UP(len, C4IW_MAX_INLINE_SIZE); + c4iw_init_wr_wait(&wr_wait); + for (i = 0; i < num_wqe; i++) { + + copy_len = len > C4IW_MAX_INLINE_SIZE ? 
C4IW_MAX_INLINE_SIZE : + len; + wr_len = roundup(sizeof *req + sizeof *sc + + roundup(copy_len, T4_ULPTX_MIN_IO), 16); + + skb = alloc_skb(wr_len, GFP_KERNEL); + if (!skb) + return -ENOMEM; + set_wr_txq(skb, CPL_PRIORITY_CONTROL, 0); + + req = (struct ulp_mem_io *)__skb_put(skb, wr_len); + memset(req, 0, wr_len); + INIT_ULPTX_WR(req, wr_len, 0, 0); + + if (i == (num_wqe-1)) { + req->wr.wr_hi = cpu_to_be32(FW_WR_OP_V(FW_ULPTX_WR) | + FW_WR_COMPL_F); + req->wr.wr_lo = (__force __be64)&wr_wait; + } else + req->wr.wr_hi = cpu_to_be32(FW_WR_OP_V(FW_ULPTX_WR)); + req->wr.wr_mid = cpu_to_be32( + FW_WR_LEN16_V(DIV_ROUND_UP(wr_len, 16))); + + req->cmd = cmd; + req->dlen = cpu_to_be32(ULP_MEMIO_DATA_LEN_V( + DIV_ROUND_UP(copy_len, T4_ULPTX_MIN_IO))); + req->len16 = cpu_to_be32(DIV_ROUND_UP(wr_len-sizeof(req->wr), + 16)); + req->lock_addr = cpu_to_be32(ULP_MEMIO_ADDR_V(addr + i * 3)); + + sc = (struct ulptx_idata *)(req + 1); + sc->cmd_more = cpu_to_be32(ULPTX_CMD_V(ULP_TX_SC_IMM)); + sc->len = cpu_to_be32(roundup(copy_len, T4_ULPTX_MIN_IO)); + + to_dp = (u8 *)(sc + 1); + from_dp = (u8 *)data + i * C4IW_MAX_INLINE_SIZE; + if (data) + memcpy(to_dp, from_dp, copy_len); + else + memset(to_dp, 0, copy_len); + if (copy_len % T4_ULPTX_MIN_IO) + memset(to_dp + copy_len, 0, T4_ULPTX_MIN_IO - + (copy_len % T4_ULPTX_MIN_IO)); + ret = c4iw_ofld_send(rdev, skb); + if (ret) + return ret; + len -= C4IW_MAX_INLINE_SIZE; + } + + ret = c4iw_wait_for_reply(rdev, &wr_wait, 0, 0, __func__); + return ret; +} + +static int _c4iw_write_mem_dma(struct c4iw_rdev *rdev, u32 addr, u32 len, void *data) +{ + u32 remain = len; + u32 dmalen; + int ret = 0; + dma_addr_t daddr; + dma_addr_t save; + + daddr = dma_map_single(&rdev->lldi.pdev->dev, data, len, DMA_TO_DEVICE); + if (dma_mapping_error(&rdev->lldi.pdev->dev, daddr)) + return -1; + save = daddr; + + while (remain > inline_threshold) { + if (remain < T4_ULPTX_MAX_DMA) { + if (remain & ~T4_ULPTX_MIN_IO) + dmalen = remain & ~(T4_ULPTX_MIN_IO-1); + else + dmalen = remain; + } else + dmalen = T4_ULPTX_MAX_DMA; + remain -= dmalen; + ret = _c4iw_write_mem_dma_aligned(rdev, addr, dmalen, daddr, + !remain); + if (ret) + goto out; + addr += dmalen >> 5; + data += dmalen; + daddr += dmalen; + } + if (remain) + ret = _c4iw_write_mem_inline(rdev, addr, remain, data); +out: + dma_unmap_single(&rdev->lldi.pdev->dev, save, len, DMA_TO_DEVICE); + return ret; +} + +/* + * write len bytes of data into addr (32B aligned address) + * If data is NULL, clear len byte of memory to zero. + */ +static int write_adapter_mem(struct c4iw_rdev *rdev, u32 addr, u32 len, + void *data) +{ + if (is_t5(rdev->lldi.adapter_type) && use_dsgl) { + if (len > inline_threshold) { + if (_c4iw_write_mem_dma(rdev, addr, len, data)) { + printk_ratelimited(KERN_WARNING + "%s: dma map" + " failure (non fatal)\n", + pci_name(rdev->lldi.pdev)); + return _c4iw_write_mem_inline(rdev, addr, len, + data); + } else + return 0; + } else + return _c4iw_write_mem_inline(rdev, addr, len, data); + } else + return _c4iw_write_mem_inline(rdev, addr, len, data); +} + +/* + * Build and write a TPT entry. 
+ * IN: stag key, pdid, perm, bind_enabled, zbva, to, len, page_size, + * pbl_size and pbl_addr + * OUT: stag index + */ +static int write_tpt_entry(struct c4iw_rdev *rdev, u32 reset_tpt_entry, + u32 *stag, u8 stag_state, u32 pdid, + enum fw_ri_stag_type type, enum fw_ri_mem_perms perm, + int bind_enabled, u32 zbva, u64 to, + u64 len, u8 page_size, u32 pbl_size, u32 pbl_addr) +{ + int err; + struct fw_ri_tpte tpt; + u32 stag_idx; + static atomic_t key; + + if (c4iw_fatal_error(rdev)) + return -EIO; + + stag_state = stag_state > 0; + stag_idx = (*stag) >> 8; + + if ((!reset_tpt_entry) && (*stag == T4_STAG_UNSET)) { + stag_idx = c4iw_get_resource(&rdev->resource.tpt_table); + if (!stag_idx) { + mutex_lock(&rdev->stats.lock); + rdev->stats.stag.fail++; + mutex_unlock(&rdev->stats.lock); + return -ENOMEM; + } + mutex_lock(&rdev->stats.lock); + rdev->stats.stag.cur += 32; + if (rdev->stats.stag.cur > rdev->stats.stag.max) + rdev->stats.stag.max = rdev->stats.stag.cur; + mutex_unlock(&rdev->stats.lock); + *stag = (stag_idx << 8) | (atomic_inc_return(&key) & 0xff); + } + PDBG("%s stag_state 0x%0x type 0x%0x pdid 0x%0x, stag_idx 0x%x\n", + __func__, stag_state, type, pdid, stag_idx); + + /* write TPT entry */ + if (reset_tpt_entry) + memset(&tpt, 0, sizeof(tpt)); + else { + tpt.valid_to_pdid = cpu_to_be32(FW_RI_TPTE_VALID_F | + FW_RI_TPTE_STAGKEY_V((*stag & FW_RI_TPTE_STAGKEY_M)) | + FW_RI_TPTE_STAGSTATE_V(stag_state) | + FW_RI_TPTE_STAGTYPE_V(type) | FW_RI_TPTE_PDID_V(pdid)); + tpt.locread_to_qpid = cpu_to_be32(FW_RI_TPTE_PERM_V(perm) | + (bind_enabled ? FW_RI_TPTE_MWBINDEN_F : 0) | + FW_RI_TPTE_ADDRTYPE_V((zbva ? FW_RI_ZERO_BASED_TO : + FW_RI_VA_BASED_TO))| + FW_RI_TPTE_PS_V(page_size)); + tpt.nosnoop_pbladdr = !pbl_size ? 0 : cpu_to_be32( + FW_RI_TPTE_PBLADDR_V(PBL_OFF(rdev, pbl_addr)>>3)); + tpt.len_lo = cpu_to_be32((u32)(len & 0xffffffffUL)); + tpt.va_hi = cpu_to_be32((u32)(to >> 32)); + tpt.va_lo_fbo = cpu_to_be32((u32)(to & 0xffffffffUL)); + tpt.dca_mwbcnt_pstag = cpu_to_be32(0); + tpt.len_hi = cpu_to_be32((u32)(len >> 32)); + } + err = write_adapter_mem(rdev, stag_idx + + (rdev->lldi.vr->stag.start >> 5), + sizeof(tpt), &tpt); + + if (reset_tpt_entry) { + c4iw_put_resource(&rdev->resource.tpt_table, stag_idx); + mutex_lock(&rdev->stats.lock); + rdev->stats.stag.cur -= 32; + mutex_unlock(&rdev->stats.lock); + } + return err; +} + +static int write_pbl(struct c4iw_rdev *rdev, __be64 *pbl, + u32 pbl_addr, u32 pbl_size) +{ + int err; + + PDBG("%s *pdb_addr 0x%x, pbl_base 0x%x, pbl_size %d\n", + __func__, pbl_addr, rdev->lldi.vr->pbl.start, + pbl_size); + + err = write_adapter_mem(rdev, pbl_addr >> 5, pbl_size << 3, pbl); + return err; +} + +static int dereg_mem(struct c4iw_rdev *rdev, u32 stag, u32 pbl_size, + u32 pbl_addr) +{ + return write_tpt_entry(rdev, 1, &stag, 0, 0, 0, 0, 0, 0, 0UL, 0, 0, + pbl_size, pbl_addr); +} + +static int allocate_window(struct c4iw_rdev *rdev, u32 * stag, u32 pdid) +{ + *stag = T4_STAG_UNSET; + return write_tpt_entry(rdev, 0, stag, 0, pdid, FW_RI_STAG_MW, 0, 0, 0, + 0UL, 0, 0, 0, 0); +} + +static int deallocate_window(struct c4iw_rdev *rdev, u32 stag) +{ + return write_tpt_entry(rdev, 1, &stag, 0, 0, 0, 0, 0, 0, 0UL, 0, 0, 0, + 0); +} + +static int allocate_stag(struct c4iw_rdev *rdev, u32 *stag, u32 pdid, + u32 pbl_size, u32 pbl_addr) +{ + *stag = T4_STAG_UNSET; + return write_tpt_entry(rdev, 0, stag, 0, pdid, FW_RI_STAG_NSMR, 0, 0, 0, + 0UL, 0, 0, pbl_size, pbl_addr); +} + +static int finish_mem_reg(struct c4iw_mr *mhp, u32 stag) +{ + u32 mmid; + + 
mhp->attr.state = 1; + mhp->attr.stag = stag; + mmid = stag >> 8; + mhp->ibmr.rkey = mhp->ibmr.lkey = stag; + PDBG("%s mmid 0x%x mhp %p\n", __func__, mmid, mhp); + return insert_handle(mhp->rhp, &mhp->rhp->mmidr, mhp, mmid); +} + +static int register_mem(struct c4iw_dev *rhp, struct c4iw_pd *php, + struct c4iw_mr *mhp, int shift) +{ + u32 stag = T4_STAG_UNSET; + int ret; + + ret = write_tpt_entry(&rhp->rdev, 0, &stag, 1, mhp->attr.pdid, + FW_RI_STAG_NSMR, mhp->attr.len ? + mhp->attr.perms : 0, + mhp->attr.mw_bind_enable, mhp->attr.zbva, + mhp->attr.va_fbo, mhp->attr.len ? + mhp->attr.len : -1, shift - 12, + mhp->attr.pbl_size, mhp->attr.pbl_addr); + if (ret) + return ret; + + ret = finish_mem_reg(mhp, stag); + if (ret) + dereg_mem(&rhp->rdev, mhp->attr.stag, mhp->attr.pbl_size, + mhp->attr.pbl_addr); + return ret; +} + +static int reregister_mem(struct c4iw_dev *rhp, struct c4iw_pd *php, + struct c4iw_mr *mhp, int shift, int npages) +{ + u32 stag; + int ret; + + if (npages > mhp->attr.pbl_size) + return -ENOMEM; + + stag = mhp->attr.stag; + ret = write_tpt_entry(&rhp->rdev, 0, &stag, 1, mhp->attr.pdid, + FW_RI_STAG_NSMR, mhp->attr.perms, + mhp->attr.mw_bind_enable, mhp->attr.zbva, + mhp->attr.va_fbo, mhp->attr.len, shift - 12, + mhp->attr.pbl_size, mhp->attr.pbl_addr); + if (ret) + return ret; + + ret = finish_mem_reg(mhp, stag); + if (ret) + dereg_mem(&rhp->rdev, mhp->attr.stag, mhp->attr.pbl_size, + mhp->attr.pbl_addr); + + return ret; +} + +static int alloc_pbl(struct c4iw_mr *mhp, int npages) +{ + mhp->attr.pbl_addr = c4iw_pblpool_alloc(&mhp->rhp->rdev, + npages << 3); + + if (!mhp->attr.pbl_addr) + return -ENOMEM; + + mhp->attr.pbl_size = npages; + + return 0; +} + +static int build_phys_page_list(struct ib_phys_buf *buffer_list, + int num_phys_buf, u64 *iova_start, + u64 *total_size, int *npages, + int *shift, __be64 **page_list) +{ + u64 mask; + int i, j, n; + + mask = 0; + *total_size = 0; + for (i = 0; i < num_phys_buf; ++i) { + if (i != 0 && buffer_list[i].addr & ~PAGE_MASK) + return -EINVAL; + if (i != 0 && i != num_phys_buf - 1 && + (buffer_list[i].size & ~PAGE_MASK)) + return -EINVAL; + *total_size += buffer_list[i].size; + if (i > 0) + mask |= buffer_list[i].addr; + else + mask |= buffer_list[i].addr & PAGE_MASK; + if (i != num_phys_buf - 1) + mask |= buffer_list[i].addr + buffer_list[i].size; + else + mask |= (buffer_list[i].addr + buffer_list[i].size + + PAGE_SIZE - 1) & PAGE_MASK; + } + + if (*total_size > 0xFFFFFFFFULL) + return -ENOMEM; + + /* Find largest page shift we can use to cover buffers */ + for (*shift = PAGE_SHIFT; *shift < 27; ++(*shift)) + if ((1ULL << *shift) & mask) + break; + + buffer_list[0].size += buffer_list[0].addr & ((1ULL << *shift) - 1); + buffer_list[0].addr &= ~0ull << *shift; + + *npages = 0; + for (i = 0; i < num_phys_buf; ++i) + *npages += (buffer_list[i].size + + (1ULL << *shift) - 1) >> *shift; + + if (!*npages) + return -EINVAL; + + *page_list = kmalloc(sizeof(u64) * *npages, GFP_KERNEL); + if (!*page_list) + return -ENOMEM; + + n = 0; + for (i = 0; i < num_phys_buf; ++i) + for (j = 0; + j < (buffer_list[i].size + (1ULL << *shift) - 1) >> *shift; + ++j) + (*page_list)[n++] = cpu_to_be64(buffer_list[i].addr + + ((u64) j << *shift)); + + PDBG("%s va 0x%llx mask 0x%llx shift %d len %lld pbl_size %d\n", + __func__, (unsigned long long)*iova_start, + (unsigned long long)mask, *shift, (unsigned long long)*total_size, + *npages); + + return 0; + +} + +int c4iw_reregister_phys_mem(struct ib_mr *mr, int mr_rereg_mask, + struct ib_pd *pd, struct 
ib_phys_buf *buffer_list, + int num_phys_buf, int acc, u64 *iova_start) +{ + + struct c4iw_mr mh, *mhp; + struct c4iw_pd *php; + struct c4iw_dev *rhp; + __be64 *page_list = NULL; + int shift = 0; + u64 total_size; + int npages; + int ret; + + PDBG("%s ib_mr %p ib_pd %p\n", __func__, mr, pd); + + /* There can be no memory windows */ + if (atomic_read(&mr->usecnt)) + return -EINVAL; + + mhp = to_c4iw_mr(mr); + rhp = mhp->rhp; + php = to_c4iw_pd(mr->pd); + + /* make sure we are on the same adapter */ + if (rhp != php->rhp) + return -EINVAL; + + memcpy(&mh, mhp, sizeof *mhp); + + if (mr_rereg_mask & IB_MR_REREG_PD) + php = to_c4iw_pd(pd); + if (mr_rereg_mask & IB_MR_REREG_ACCESS) { + mh.attr.perms = c4iw_ib_to_tpt_access(acc); + mh.attr.mw_bind_enable = (acc & IB_ACCESS_MW_BIND) == + IB_ACCESS_MW_BIND; + } + if (mr_rereg_mask & IB_MR_REREG_TRANS) { + ret = build_phys_page_list(buffer_list, num_phys_buf, + iova_start, + &total_size, &npages, + &shift, &page_list); + if (ret) + return ret; + } + + if (mr_exceeds_hw_limits(rhp, total_size)) { + kfree(page_list); + return -EINVAL; + } + + ret = reregister_mem(rhp, php, &mh, shift, npages); + kfree(page_list); + if (ret) + return ret; + if (mr_rereg_mask & IB_MR_REREG_PD) + mhp->attr.pdid = php->pdid; + if (mr_rereg_mask & IB_MR_REREG_ACCESS) + mhp->attr.perms = c4iw_ib_to_tpt_access(acc); + if (mr_rereg_mask & IB_MR_REREG_TRANS) { + mhp->attr.zbva = 0; + mhp->attr.va_fbo = *iova_start; + mhp->attr.page_size = shift - 12; + mhp->attr.len = (u32) total_size; + mhp->attr.pbl_size = npages; + } + + return 0; +} + +struct ib_mr *c4iw_register_phys_mem(struct ib_pd *pd, + struct ib_phys_buf *buffer_list, + int num_phys_buf, int acc, u64 *iova_start) +{ + __be64 *page_list; + int shift; + u64 total_size; + int npages; + struct c4iw_dev *rhp; + struct c4iw_pd *php; + struct c4iw_mr *mhp; + int ret; + + PDBG("%s ib_pd %p\n", __func__, pd); + php = to_c4iw_pd(pd); + rhp = php->rhp; + + mhp = kzalloc(sizeof(*mhp), GFP_KERNEL); + if (!mhp) + return ERR_PTR(-ENOMEM); + + mhp->rhp = rhp; + + /* First check that we have enough alignment */ + if ((*iova_start & ~PAGE_MASK) != (buffer_list[0].addr & ~PAGE_MASK)) { + ret = -EINVAL; + goto err; + } + + if (num_phys_buf > 1 && + ((buffer_list[0].addr + buffer_list[0].size) & ~PAGE_MASK)) { + ret = -EINVAL; + goto err; + } + + ret = build_phys_page_list(buffer_list, num_phys_buf, iova_start, + &total_size, &npages, &shift, + &page_list); + if (ret) + goto err; + + if (mr_exceeds_hw_limits(rhp, total_size)) { + kfree(page_list); + ret = -EINVAL; + goto err; + } + + ret = alloc_pbl(mhp, npages); + if (ret) { + kfree(page_list); + goto err; + } + + ret = write_pbl(&mhp->rhp->rdev, page_list, mhp->attr.pbl_addr, + npages); + kfree(page_list); + if (ret) + goto err_pbl; + + mhp->attr.pdid = php->pdid; + mhp->attr.zbva = 0; + + mhp->attr.perms = c4iw_ib_to_tpt_access(acc); + mhp->attr.va_fbo = *iova_start; + mhp->attr.page_size = shift - 12; + + mhp->attr.len = (u32) total_size; + mhp->attr.pbl_size = npages; + ret = register_mem(rhp, php, mhp, shift); + if (ret) + goto err_pbl; + + return &mhp->ibmr; + +err_pbl: + c4iw_pblpool_free(&mhp->rhp->rdev, mhp->attr.pbl_addr, + mhp->attr.pbl_size << 3); + +err: + kfree(mhp); + return ERR_PTR(ret); + +} + +struct ib_mr *c4iw_get_dma_mr(struct ib_pd *pd, int acc) +{ + struct c4iw_dev *rhp; + struct c4iw_pd *php; + struct c4iw_mr *mhp; + int ret; + u32 stag = T4_STAG_UNSET; + + PDBG("%s ib_pd %p\n", __func__, pd); + php = to_c4iw_pd(pd); + rhp = php->rhp; + + mhp = 
kzalloc(sizeof(*mhp), GFP_KERNEL); + if (!mhp) + return ERR_PTR(-ENOMEM); + + mhp->rhp = rhp; + mhp->attr.pdid = php->pdid; + mhp->attr.perms = c4iw_ib_to_tpt_access(acc); + mhp->attr.mw_bind_enable = (acc&IB_ACCESS_MW_BIND) == IB_ACCESS_MW_BIND; + mhp->attr.zbva = 0; + mhp->attr.va_fbo = 0; + mhp->attr.page_size = 0; + mhp->attr.len = ~0ULL; + mhp->attr.pbl_size = 0; + + ret = write_tpt_entry(&rhp->rdev, 0, &stag, 1, php->pdid, + FW_RI_STAG_NSMR, mhp->attr.perms, + mhp->attr.mw_bind_enable, 0, 0, ~0ULL, 0, 0, 0); + if (ret) + goto err1; + + ret = finish_mem_reg(mhp, stag); + if (ret) + goto err2; + return &mhp->ibmr; +err2: + dereg_mem(&rhp->rdev, mhp->attr.stag, mhp->attr.pbl_size, + mhp->attr.pbl_addr); +err1: + kfree(mhp); + return ERR_PTR(ret); +} + +struct ib_mr *c4iw_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, + u64 virt, int acc, struct ib_udata *udata) +{ + __be64 *pages; + int shift, n, len; + int i, k, entry; + int err = 0; + struct scatterlist *sg; + struct c4iw_dev *rhp; + struct c4iw_pd *php; + struct c4iw_mr *mhp; + + PDBG("%s ib_pd %p\n", __func__, pd); + + if (length == ~0ULL) + return ERR_PTR(-EINVAL); + + if ((length + start) < start) + return ERR_PTR(-EINVAL); + + php = to_c4iw_pd(pd); + rhp = php->rhp; + + if (mr_exceeds_hw_limits(rhp, length)) + return ERR_PTR(-EINVAL); + + mhp = kzalloc(sizeof(*mhp), GFP_KERNEL); + if (!mhp) + return ERR_PTR(-ENOMEM); + + mhp->rhp = rhp; + + mhp->umem = ib_umem_get(pd->uobject->context, start, length, acc, 0); + if (IS_ERR(mhp->umem)) { + err = PTR_ERR(mhp->umem); + kfree(mhp); + return ERR_PTR(err); + } + + shift = ffs(mhp->umem->page_size) - 1; + + n = mhp->umem->nmap; + err = alloc_pbl(mhp, n); + if (err) + goto err; + + pages = (__be64 *) __get_free_page(GFP_KERNEL); + if (!pages) { + err = -ENOMEM; + goto err_pbl; + } + + i = n = 0; + + for_each_sg(mhp->umem->sg_head.sgl, sg, mhp->umem->nmap, entry) { + len = sg_dma_len(sg) >> shift; + for (k = 0; k < len; ++k) { + pages[i++] = cpu_to_be64(sg_dma_address(sg) + + mhp->umem->page_size * k); + if (i == PAGE_SIZE / sizeof *pages) { + err = write_pbl(&mhp->rhp->rdev, + pages, + mhp->attr.pbl_addr + (n << 3), i); + if (err) + goto pbl_done; + n += i; + i = 0; + } + } + } + + if (i) + err = write_pbl(&mhp->rhp->rdev, pages, + mhp->attr.pbl_addr + (n << 3), i); + +pbl_done: + free_page((unsigned long) pages); + if (err) + goto err_pbl; + + mhp->attr.pdid = php->pdid; + mhp->attr.zbva = 0; + mhp->attr.perms = c4iw_ib_to_tpt_access(acc); + mhp->attr.va_fbo = virt; + mhp->attr.page_size = shift - 12; + mhp->attr.len = length; + + err = register_mem(rhp, php, mhp, shift); + if (err) + goto err_pbl; + + return &mhp->ibmr; + +err_pbl: + c4iw_pblpool_free(&mhp->rhp->rdev, mhp->attr.pbl_addr, + mhp->attr.pbl_size << 3); + +err: + ib_umem_release(mhp->umem); + kfree(mhp); + return ERR_PTR(err); +} + +struct ib_mw *c4iw_alloc_mw(struct ib_pd *pd, enum ib_mw_type type) +{ + struct c4iw_dev *rhp; + struct c4iw_pd *php; + struct c4iw_mw *mhp; + u32 mmid; + u32 stag = 0; + int ret; + + if (type != IB_MW_TYPE_1) + return ERR_PTR(-EINVAL); + + php = to_c4iw_pd(pd); + rhp = php->rhp; + mhp = kzalloc(sizeof(*mhp), GFP_KERNEL); + if (!mhp) + return ERR_PTR(-ENOMEM); + ret = allocate_window(&rhp->rdev, &stag, php->pdid); + if (ret) { + kfree(mhp); + return ERR_PTR(ret); + } + mhp->rhp = rhp; + mhp->attr.pdid = php->pdid; + mhp->attr.type = FW_RI_STAG_MW; + mhp->attr.stag = stag; + mmid = (stag) >> 8; + mhp->ibmw.rkey = stag; + if (insert_handle(rhp, &rhp->mmidr, mhp, mmid)) { + 
deallocate_window(&rhp->rdev, mhp->attr.stag); + kfree(mhp); + return ERR_PTR(-ENOMEM); + } + PDBG("%s mmid 0x%x mhp %p stag 0x%x\n", __func__, mmid, mhp, stag); + return &(mhp->ibmw); +} + +int c4iw_dealloc_mw(struct ib_mw *mw) +{ + struct c4iw_dev *rhp; + struct c4iw_mw *mhp; + u32 mmid; + + mhp = to_c4iw_mw(mw); + rhp = mhp->rhp; + mmid = (mw->rkey) >> 8; + remove_handle(rhp, &rhp->mmidr, mmid); + deallocate_window(&rhp->rdev, mhp->attr.stag); + kfree(mhp); + PDBG("%s ib_mw %p mmid 0x%x ptr %p\n", __func__, mw, mmid, mhp); + return 0; +} + +struct ib_mr *c4iw_alloc_fast_reg_mr(struct ib_pd *pd, int pbl_depth) +{ + struct c4iw_dev *rhp; + struct c4iw_pd *php; + struct c4iw_mr *mhp; + u32 mmid; + u32 stag = 0; + int ret = 0; + + php = to_c4iw_pd(pd); + rhp = php->rhp; + mhp = kzalloc(sizeof(*mhp), GFP_KERNEL); + if (!mhp) { + ret = -ENOMEM; + goto err; + } + + mhp->rhp = rhp; + ret = alloc_pbl(mhp, pbl_depth); + if (ret) + goto err1; + mhp->attr.pbl_size = pbl_depth; + ret = allocate_stag(&rhp->rdev, &stag, php->pdid, + mhp->attr.pbl_size, mhp->attr.pbl_addr); + if (ret) + goto err2; + mhp->attr.pdid = php->pdid; + mhp->attr.type = FW_RI_STAG_NSMR; + mhp->attr.stag = stag; + mhp->attr.state = 1; + mmid = (stag) >> 8; + mhp->ibmr.rkey = mhp->ibmr.lkey = stag; + if (insert_handle(rhp, &rhp->mmidr, mhp, mmid)) { + ret = -ENOMEM; + goto err3; + } + + PDBG("%s mmid 0x%x mhp %p stag 0x%x\n", __func__, mmid, mhp, stag); + return &(mhp->ibmr); +err3: + dereg_mem(&rhp->rdev, stag, mhp->attr.pbl_size, + mhp->attr.pbl_addr); +err2: + c4iw_pblpool_free(&mhp->rhp->rdev, mhp->attr.pbl_addr, + mhp->attr.pbl_size << 3); +err1: + kfree(mhp); +err: + return ERR_PTR(ret); +} + +struct ib_fast_reg_page_list *c4iw_alloc_fastreg_pbl(struct ib_device *device, + int page_list_len) +{ + struct c4iw_fr_page_list *c4pl; + struct c4iw_dev *dev = to_c4iw_dev(device); + dma_addr_t dma_addr; + int pll_len = roundup(page_list_len * sizeof(u64), 32); + + c4pl = kmalloc(sizeof(*c4pl), GFP_KERNEL); + if (!c4pl) + return ERR_PTR(-ENOMEM); + + c4pl->ibpl.page_list = dma_alloc_coherent(&dev->rdev.lldi.pdev->dev, + pll_len, &dma_addr, + GFP_KERNEL); + if (!c4pl->ibpl.page_list) { + kfree(c4pl); + return ERR_PTR(-ENOMEM); + } + dma_unmap_addr_set(c4pl, mapping, dma_addr); + c4pl->dma_addr = dma_addr; + c4pl->dev = dev; + c4pl->pll_len = pll_len; + + PDBG("%s c4pl %p pll_len %u page_list %p dma_addr %pad\n", + __func__, c4pl, c4pl->pll_len, c4pl->ibpl.page_list, + &c4pl->dma_addr); + + return &c4pl->ibpl; +} + +void c4iw_free_fastreg_pbl(struct ib_fast_reg_page_list *ibpl) +{ + struct c4iw_fr_page_list *c4pl = to_c4iw_fr_page_list(ibpl); + + PDBG("%s c4pl %p pll_len %u page_list %p dma_addr %pad\n", + __func__, c4pl, c4pl->pll_len, c4pl->ibpl.page_list, + &c4pl->dma_addr); + + dma_free_coherent(&c4pl->dev->rdev.lldi.pdev->dev, + c4pl->pll_len, + c4pl->ibpl.page_list, dma_unmap_addr(c4pl, mapping)); + kfree(c4pl); +} + +int c4iw_dereg_mr(struct ib_mr *ib_mr) +{ + struct c4iw_dev *rhp; + struct c4iw_mr *mhp; + u32 mmid; + + PDBG("%s ib_mr %p\n", __func__, ib_mr); + /* There can be no memory windows */ + if (atomic_read(&ib_mr->usecnt)) + return -EINVAL; + + mhp = to_c4iw_mr(ib_mr); + rhp = mhp->rhp; + mmid = mhp->attr.stag >> 8; + remove_handle(rhp, &rhp->mmidr, mmid); + dereg_mem(&rhp->rdev, mhp->attr.stag, mhp->attr.pbl_size, + mhp->attr.pbl_addr); + if (mhp->attr.pbl_size) + c4iw_pblpool_free(&mhp->rhp->rdev, mhp->attr.pbl_addr, + mhp->attr.pbl_size << 3); + if (mhp->kva) + kfree((void *) (unsigned long) mhp->kva); + if 
(mhp->umem) + ib_umem_release(mhp->umem); + PDBG("%s mmid 0x%x ptr %p\n", __func__, mmid, mhp); + kfree(mhp); + return 0; +} diff --git a/kernel/drivers/infiniband/hw/cxgb4/provider.c b/kernel/drivers/infiniband/hw/cxgb4/provider.c new file mode 100644 index 000000000..66bd6a2ad --- /dev/null +++ b/kernel/drivers/infiniband/hw/cxgb4/provider.c @@ -0,0 +1,588 @@ +/* + * Copyright (c) 2009-2010 Chelsio, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include +#include +#include +#include +#include + +#include "iw_cxgb4.h" + +static int fastreg_support = 1; +module_param(fastreg_support, int, 0644); +MODULE_PARM_DESC(fastreg_support, "Advertise fastreg support (default=1)"); + +static struct ib_ah *c4iw_ah_create(struct ib_pd *pd, + struct ib_ah_attr *ah_attr) +{ + return ERR_PTR(-ENOSYS); +} + +static int c4iw_ah_destroy(struct ib_ah *ah) +{ + return -ENOSYS; +} + +static int c4iw_multicast_attach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid) +{ + return -ENOSYS; +} + +static int c4iw_multicast_detach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid) +{ + return -ENOSYS; +} + +static int c4iw_process_mad(struct ib_device *ibdev, int mad_flags, + u8 port_num, struct ib_wc *in_wc, + struct ib_grh *in_grh, struct ib_mad *in_mad, + struct ib_mad *out_mad) +{ + return -ENOSYS; +} + +static int c4iw_dealloc_ucontext(struct ib_ucontext *context) +{ + struct c4iw_dev *rhp = to_c4iw_dev(context->device); + struct c4iw_ucontext *ucontext = to_c4iw_ucontext(context); + struct c4iw_mm_entry *mm, *tmp; + + PDBG("%s context %p\n", __func__, context); + list_for_each_entry_safe(mm, tmp, &ucontext->mmaps, entry) + kfree(mm); + c4iw_release_dev_ucontext(&rhp->rdev, &ucontext->uctx); + kfree(ucontext); + return 0; +} + +static struct ib_ucontext *c4iw_alloc_ucontext(struct ib_device *ibdev, + struct ib_udata *udata) +{ + struct c4iw_ucontext *context; + struct c4iw_dev *rhp = to_c4iw_dev(ibdev); + static int warned; + struct c4iw_alloc_ucontext_resp uresp; + int ret = 0; + struct c4iw_mm_entry *mm = NULL; + + PDBG("%s ibdev %p\n", __func__, ibdev); + context = 
kzalloc(sizeof(*context), GFP_KERNEL); + if (!context) { + ret = -ENOMEM; + goto err; + } + + c4iw_init_dev_ucontext(&rhp->rdev, &context->uctx); + INIT_LIST_HEAD(&context->mmaps); + spin_lock_init(&context->mmap_lock); + + if (udata->outlen < sizeof(uresp) - sizeof(uresp.reserved)) { + if (!warned++) + pr_err(MOD "Warning - downlevel libcxgb4 (non-fatal), device status page disabled."); + rhp->rdev.flags |= T4_STATUS_PAGE_DISABLED; + } else { + mm = kmalloc(sizeof(*mm), GFP_KERNEL); + if (!mm) { + ret = -ENOMEM; + goto err_free; + } + + uresp.status_page_size = PAGE_SIZE; + + spin_lock(&context->mmap_lock); + uresp.status_page_key = context->key; + context->key += PAGE_SIZE; + spin_unlock(&context->mmap_lock); + + ret = ib_copy_to_udata(udata, &uresp, + sizeof(uresp) - sizeof(uresp.reserved)); + if (ret) + goto err_mm; + + mm->key = uresp.status_page_key; + mm->addr = virt_to_phys(rhp->rdev.status_page); + mm->len = PAGE_SIZE; + insert_mmap(context, mm); + } + return &context->ibucontext; +err_mm: + kfree(mm); +err_free: + kfree(context); +err: + return ERR_PTR(ret); +} + +static int c4iw_mmap(struct ib_ucontext *context, struct vm_area_struct *vma) +{ + int len = vma->vm_end - vma->vm_start; + u32 key = vma->vm_pgoff << PAGE_SHIFT; + struct c4iw_rdev *rdev; + int ret = 0; + struct c4iw_mm_entry *mm; + struct c4iw_ucontext *ucontext; + u64 addr; + + PDBG("%s pgoff 0x%lx key 0x%x len %d\n", __func__, vma->vm_pgoff, + key, len); + + if (vma->vm_start & (PAGE_SIZE-1)) + return -EINVAL; + + rdev = &(to_c4iw_dev(context->device)->rdev); + ucontext = to_c4iw_ucontext(context); + + mm = remove_mmap(ucontext, key, len); + if (!mm) + return -EINVAL; + addr = mm->addr; + kfree(mm); + + if ((addr >= pci_resource_start(rdev->lldi.pdev, 0)) && + (addr < (pci_resource_start(rdev->lldi.pdev, 0) + + pci_resource_len(rdev->lldi.pdev, 0)))) { + + /* + * MA_SYNC register... + */ + vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot); + ret = io_remap_pfn_range(vma, vma->vm_start, + addr >> PAGE_SHIFT, + len, vma->vm_page_prot); + } else if ((addr >= pci_resource_start(rdev->lldi.pdev, 2)) && + (addr < (pci_resource_start(rdev->lldi.pdev, 2) + + pci_resource_len(rdev->lldi.pdev, 2)))) { + + /* + * Map user DB or OCQP memory... + */ + if (addr >= rdev->oc_mw_pa) + vma->vm_page_prot = t4_pgprot_wc(vma->vm_page_prot); + else { + if (is_t5(rdev->lldi.adapter_type)) + vma->vm_page_prot = + t4_pgprot_wc(vma->vm_page_prot); + else + vma->vm_page_prot = + pgprot_noncached(vma->vm_page_prot); + } + ret = io_remap_pfn_range(vma, vma->vm_start, + addr >> PAGE_SHIFT, + len, vma->vm_page_prot); + } else { + + /* + * Map WQ or CQ contig dma memory... 
+ */ + ret = remap_pfn_range(vma, vma->vm_start, + addr >> PAGE_SHIFT, + len, vma->vm_page_prot); + } + + return ret; +} + +static int c4iw_deallocate_pd(struct ib_pd *pd) +{ + struct c4iw_dev *rhp; + struct c4iw_pd *php; + + php = to_c4iw_pd(pd); + rhp = php->rhp; + PDBG("%s ibpd %p pdid 0x%x\n", __func__, pd, php->pdid); + c4iw_put_resource(&rhp->rdev.resource.pdid_table, php->pdid); + mutex_lock(&rhp->rdev.stats.lock); + rhp->rdev.stats.pd.cur--; + mutex_unlock(&rhp->rdev.stats.lock); + kfree(php); + return 0; +} + +static struct ib_pd *c4iw_allocate_pd(struct ib_device *ibdev, + struct ib_ucontext *context, + struct ib_udata *udata) +{ + struct c4iw_pd *php; + u32 pdid; + struct c4iw_dev *rhp; + + PDBG("%s ibdev %p\n", __func__, ibdev); + rhp = (struct c4iw_dev *) ibdev; + pdid = c4iw_get_resource(&rhp->rdev.resource.pdid_table); + if (!pdid) + return ERR_PTR(-EINVAL); + php = kzalloc(sizeof(*php), GFP_KERNEL); + if (!php) { + c4iw_put_resource(&rhp->rdev.resource.pdid_table, pdid); + return ERR_PTR(-ENOMEM); + } + php->pdid = pdid; + php->rhp = rhp; + if (context) { + if (ib_copy_to_udata(udata, &php->pdid, sizeof(u32))) { + c4iw_deallocate_pd(&php->ibpd); + return ERR_PTR(-EFAULT); + } + } + mutex_lock(&rhp->rdev.stats.lock); + rhp->rdev.stats.pd.cur++; + if (rhp->rdev.stats.pd.cur > rhp->rdev.stats.pd.max) + rhp->rdev.stats.pd.max = rhp->rdev.stats.pd.cur; + mutex_unlock(&rhp->rdev.stats.lock); + PDBG("%s pdid 0x%0x ptr 0x%p\n", __func__, pdid, php); + return &php->ibpd; +} + +static int c4iw_query_pkey(struct ib_device *ibdev, u8 port, u16 index, + u16 *pkey) +{ + PDBG("%s ibdev %p\n", __func__, ibdev); + *pkey = 0; + return 0; +} + +static int c4iw_query_gid(struct ib_device *ibdev, u8 port, int index, + union ib_gid *gid) +{ + struct c4iw_dev *dev; + + PDBG("%s ibdev %p, port %d, index %d, gid %p\n", + __func__, ibdev, port, index, gid); + dev = to_c4iw_dev(ibdev); + BUG_ON(port == 0); + memset(&(gid->raw[0]), 0, sizeof(gid->raw)); + memcpy(&(gid->raw[0]), dev->rdev.lldi.ports[port-1]->dev_addr, 6); + return 0; +} + +static int c4iw_query_device(struct ib_device *ibdev, + struct ib_device_attr *props) +{ + + struct c4iw_dev *dev; + PDBG("%s ibdev %p\n", __func__, ibdev); + + dev = to_c4iw_dev(ibdev); + memset(props, 0, sizeof *props); + memcpy(&props->sys_image_guid, dev->rdev.lldi.ports[0]->dev_addr, 6); + props->hw_ver = CHELSIO_CHIP_RELEASE(dev->rdev.lldi.adapter_type); + props->fw_ver = dev->rdev.lldi.fw_vers; + props->device_cap_flags = dev->device_cap_flags; + props->page_size_cap = T4_PAGESIZE_MASK; + props->vendor_id = (u32)dev->rdev.lldi.pdev->vendor; + props->vendor_part_id = (u32)dev->rdev.lldi.pdev->device; + props->max_mr_size = T4_MAX_MR_SIZE; + props->max_qp = dev->rdev.lldi.vr->qp.size / 2; + props->max_qp_wr = dev->rdev.hw_queue.t4_max_qp_depth; + props->max_sge = T4_MAX_RECV_SGE; + props->max_sge_rd = 1; + props->max_res_rd_atom = dev->rdev.lldi.max_ird_adapter; + props->max_qp_rd_atom = min(dev->rdev.lldi.max_ordird_qp, + c4iw_max_read_depth); + props->max_qp_init_rd_atom = props->max_qp_rd_atom; + props->max_cq = dev->rdev.lldi.vr->qp.size; + props->max_cqe = dev->rdev.hw_queue.t4_max_cq_depth; + props->max_mr = c4iw_num_stags(&dev->rdev); + props->max_pd = T4_MAX_NUM_PD; + props->local_ca_ack_delay = 0; + props->max_fast_reg_page_list_len = t4_max_fr_depth(use_dsgl); + + return 0; +} + +static int c4iw_query_port(struct ib_device *ibdev, u8 port, + struct ib_port_attr *props) +{ + struct c4iw_dev *dev; + struct net_device *netdev; + struct in_device *inetdev; 
+ + PDBG("%s ibdev %p\n", __func__, ibdev); + + dev = to_c4iw_dev(ibdev); + netdev = dev->rdev.lldi.ports[port-1]; + + memset(props, 0, sizeof(struct ib_port_attr)); + props->max_mtu = IB_MTU_4096; + if (netdev->mtu >= 4096) + props->active_mtu = IB_MTU_4096; + else if (netdev->mtu >= 2048) + props->active_mtu = IB_MTU_2048; + else if (netdev->mtu >= 1024) + props->active_mtu = IB_MTU_1024; + else if (netdev->mtu >= 512) + props->active_mtu = IB_MTU_512; + else + props->active_mtu = IB_MTU_256; + + if (!netif_carrier_ok(netdev)) + props->state = IB_PORT_DOWN; + else { + inetdev = in_dev_get(netdev); + if (inetdev) { + if (inetdev->ifa_list) + props->state = IB_PORT_ACTIVE; + else + props->state = IB_PORT_INIT; + in_dev_put(inetdev); + } else + props->state = IB_PORT_INIT; + } + + props->port_cap_flags = + IB_PORT_CM_SUP | + IB_PORT_SNMP_TUNNEL_SUP | + IB_PORT_REINIT_SUP | + IB_PORT_DEVICE_MGMT_SUP | + IB_PORT_VENDOR_CLASS_SUP | IB_PORT_BOOT_MGMT_SUP; + props->gid_tbl_len = 1; + props->pkey_tbl_len = 1; + props->active_width = 2; + props->active_speed = IB_SPEED_DDR; + props->max_msg_sz = -1; + + return 0; +} + +static ssize_t show_rev(struct device *dev, struct device_attribute *attr, + char *buf) +{ + struct c4iw_dev *c4iw_dev = container_of(dev, struct c4iw_dev, + ibdev.dev); + PDBG("%s dev 0x%p\n", __func__, dev); + return sprintf(buf, "%d\n", + CHELSIO_CHIP_RELEASE(c4iw_dev->rdev.lldi.adapter_type)); +} + +static ssize_t show_fw_ver(struct device *dev, struct device_attribute *attr, + char *buf) +{ + struct c4iw_dev *c4iw_dev = container_of(dev, struct c4iw_dev, + ibdev.dev); + PDBG("%s dev 0x%p\n", __func__, dev); + + return sprintf(buf, "%u.%u.%u.%u\n", + FW_HDR_FW_VER_MAJOR_G(c4iw_dev->rdev.lldi.fw_vers), + FW_HDR_FW_VER_MINOR_G(c4iw_dev->rdev.lldi.fw_vers), + FW_HDR_FW_VER_MICRO_G(c4iw_dev->rdev.lldi.fw_vers), + FW_HDR_FW_VER_BUILD_G(c4iw_dev->rdev.lldi.fw_vers)); +} + +static ssize_t show_hca(struct device *dev, struct device_attribute *attr, + char *buf) +{ + struct c4iw_dev *c4iw_dev = container_of(dev, struct c4iw_dev, + ibdev.dev); + struct ethtool_drvinfo info; + struct net_device *lldev = c4iw_dev->rdev.lldi.ports[0]; + + PDBG("%s dev 0x%p\n", __func__, dev); + lldev->ethtool_ops->get_drvinfo(lldev, &info); + return sprintf(buf, "%s\n", info.driver); +} + +static ssize_t show_board(struct device *dev, struct device_attribute *attr, + char *buf) +{ + struct c4iw_dev *c4iw_dev = container_of(dev, struct c4iw_dev, + ibdev.dev); + PDBG("%s dev 0x%p\n", __func__, dev); + return sprintf(buf, "%x.%x\n", c4iw_dev->rdev.lldi.pdev->vendor, + c4iw_dev->rdev.lldi.pdev->device); +} + +static int c4iw_get_mib(struct ib_device *ibdev, + union rdma_protocol_stats *stats) +{ + struct tp_tcp_stats v4, v6; + struct c4iw_dev *c4iw_dev = to_c4iw_dev(ibdev); + + cxgb4_get_tcp_stats(c4iw_dev->rdev.lldi.pdev, &v4, &v6); + memset(stats, 0, sizeof *stats); + stats->iw.tcpInSegs = v4.tcpInSegs + v6.tcpInSegs; + stats->iw.tcpOutSegs = v4.tcpOutSegs + v6.tcpOutSegs; + stats->iw.tcpRetransSegs = v4.tcpRetransSegs + v6.tcpRetransSegs; + stats->iw.tcpOutRsts = v4.tcpOutRsts + v6.tcpOutSegs; + + return 0; +} + +static DEVICE_ATTR(hw_rev, S_IRUGO, show_rev, NULL); +static DEVICE_ATTR(fw_ver, S_IRUGO, show_fw_ver, NULL); +static DEVICE_ATTR(hca_type, S_IRUGO, show_hca, NULL); +static DEVICE_ATTR(board_id, S_IRUGO, show_board, NULL); + +static struct device_attribute *c4iw_class_attributes[] = { + &dev_attr_hw_rev, + &dev_attr_fw_ver, + &dev_attr_hca_type, + &dev_attr_board_id, +}; + +int 
c4iw_register_device(struct c4iw_dev *dev) +{ + int ret; + int i; + + PDBG("%s c4iw_dev %p\n", __func__, dev); + BUG_ON(!dev->rdev.lldi.ports[0]); + strlcpy(dev->ibdev.name, "cxgb4_%d", IB_DEVICE_NAME_MAX); + memset(&dev->ibdev.node_guid, 0, sizeof(dev->ibdev.node_guid)); + memcpy(&dev->ibdev.node_guid, dev->rdev.lldi.ports[0]->dev_addr, 6); + dev->ibdev.owner = THIS_MODULE; + dev->device_cap_flags = IB_DEVICE_LOCAL_DMA_LKEY | IB_DEVICE_MEM_WINDOW; + if (fastreg_support) + dev->device_cap_flags |= IB_DEVICE_MEM_MGT_EXTENSIONS; + dev->ibdev.local_dma_lkey = 0; + dev->ibdev.uverbs_cmd_mask = + (1ull << IB_USER_VERBS_CMD_GET_CONTEXT) | + (1ull << IB_USER_VERBS_CMD_QUERY_DEVICE) | + (1ull << IB_USER_VERBS_CMD_QUERY_PORT) | + (1ull << IB_USER_VERBS_CMD_ALLOC_PD) | + (1ull << IB_USER_VERBS_CMD_DEALLOC_PD) | + (1ull << IB_USER_VERBS_CMD_REG_MR) | + (1ull << IB_USER_VERBS_CMD_DEREG_MR) | + (1ull << IB_USER_VERBS_CMD_CREATE_COMP_CHANNEL) | + (1ull << IB_USER_VERBS_CMD_CREATE_CQ) | + (1ull << IB_USER_VERBS_CMD_DESTROY_CQ) | + (1ull << IB_USER_VERBS_CMD_REQ_NOTIFY_CQ) | + (1ull << IB_USER_VERBS_CMD_CREATE_QP) | + (1ull << IB_USER_VERBS_CMD_MODIFY_QP) | + (1ull << IB_USER_VERBS_CMD_QUERY_QP) | + (1ull << IB_USER_VERBS_CMD_POLL_CQ) | + (1ull << IB_USER_VERBS_CMD_DESTROY_QP) | + (1ull << IB_USER_VERBS_CMD_POST_SEND) | + (1ull << IB_USER_VERBS_CMD_POST_RECV); + dev->ibdev.node_type = RDMA_NODE_RNIC; + memcpy(dev->ibdev.node_desc, C4IW_NODE_DESC, sizeof(C4IW_NODE_DESC)); + dev->ibdev.phys_port_cnt = dev->rdev.lldi.nports; + dev->ibdev.num_comp_vectors = dev->rdev.lldi.nciq; + dev->ibdev.dma_device = &(dev->rdev.lldi.pdev->dev); + dev->ibdev.query_device = c4iw_query_device; + dev->ibdev.query_port = c4iw_query_port; + dev->ibdev.query_pkey = c4iw_query_pkey; + dev->ibdev.query_gid = c4iw_query_gid; + dev->ibdev.alloc_ucontext = c4iw_alloc_ucontext; + dev->ibdev.dealloc_ucontext = c4iw_dealloc_ucontext; + dev->ibdev.mmap = c4iw_mmap; + dev->ibdev.alloc_pd = c4iw_allocate_pd; + dev->ibdev.dealloc_pd = c4iw_deallocate_pd; + dev->ibdev.create_ah = c4iw_ah_create; + dev->ibdev.destroy_ah = c4iw_ah_destroy; + dev->ibdev.create_qp = c4iw_create_qp; + dev->ibdev.modify_qp = c4iw_ib_modify_qp; + dev->ibdev.query_qp = c4iw_ib_query_qp; + dev->ibdev.destroy_qp = c4iw_destroy_qp; + dev->ibdev.create_cq = c4iw_create_cq; + dev->ibdev.destroy_cq = c4iw_destroy_cq; + dev->ibdev.resize_cq = c4iw_resize_cq; + dev->ibdev.poll_cq = c4iw_poll_cq; + dev->ibdev.get_dma_mr = c4iw_get_dma_mr; + dev->ibdev.reg_phys_mr = c4iw_register_phys_mem; + dev->ibdev.rereg_phys_mr = c4iw_reregister_phys_mem; + dev->ibdev.reg_user_mr = c4iw_reg_user_mr; + dev->ibdev.dereg_mr = c4iw_dereg_mr; + dev->ibdev.alloc_mw = c4iw_alloc_mw; + dev->ibdev.bind_mw = c4iw_bind_mw; + dev->ibdev.dealloc_mw = c4iw_dealloc_mw; + dev->ibdev.alloc_fast_reg_mr = c4iw_alloc_fast_reg_mr; + dev->ibdev.alloc_fast_reg_page_list = c4iw_alloc_fastreg_pbl; + dev->ibdev.free_fast_reg_page_list = c4iw_free_fastreg_pbl; + dev->ibdev.attach_mcast = c4iw_multicast_attach; + dev->ibdev.detach_mcast = c4iw_multicast_detach; + dev->ibdev.process_mad = c4iw_process_mad; + dev->ibdev.req_notify_cq = c4iw_arm_cq; + dev->ibdev.post_send = c4iw_post_send; + dev->ibdev.post_recv = c4iw_post_receive; + dev->ibdev.get_protocol_stats = c4iw_get_mib; + dev->ibdev.uverbs_abi_ver = C4IW_UVERBS_ABI_VERSION; + + dev->ibdev.iwcm = kmalloc(sizeof(struct iw_cm_verbs), GFP_KERNEL); + if (!dev->ibdev.iwcm) + return -ENOMEM; + + dev->ibdev.iwcm->connect = c4iw_connect; + dev->ibdev.iwcm->accept = 
c4iw_accept_cr; + dev->ibdev.iwcm->reject = c4iw_reject_cr; + dev->ibdev.iwcm->create_listen = c4iw_create_listen; + dev->ibdev.iwcm->destroy_listen = c4iw_destroy_listen; + dev->ibdev.iwcm->add_ref = c4iw_qp_add_ref; + dev->ibdev.iwcm->rem_ref = c4iw_qp_rem_ref; + dev->ibdev.iwcm->get_qp = c4iw_get_qp; + + ret = ib_register_device(&dev->ibdev, NULL); + if (ret) + goto bail1; + + for (i = 0; i < ARRAY_SIZE(c4iw_class_attributes); ++i) { + ret = device_create_file(&dev->ibdev.dev, + c4iw_class_attributes[i]); + if (ret) + goto bail2; + } + return 0; +bail2: + ib_unregister_device(&dev->ibdev); +bail1: + kfree(dev->ibdev.iwcm); + return ret; +} + +void c4iw_unregister_device(struct c4iw_dev *dev) +{ + int i; + + PDBG("%s c4iw_dev %p\n", __func__, dev); + for (i = 0; i < ARRAY_SIZE(c4iw_class_attributes); ++i) + device_remove_file(&dev->ibdev.dev, + c4iw_class_attributes[i]); + ib_unregister_device(&dev->ibdev); + kfree(dev->ibdev.iwcm); + return; +} diff --git a/kernel/drivers/infiniband/hw/cxgb4/qp.c b/kernel/drivers/infiniband/hw/cxgb4/qp.c new file mode 100644 index 000000000..389ced335 --- /dev/null +++ b/kernel/drivers/infiniband/hw/cxgb4/qp.c @@ -0,0 +1,1886 @@ +/* + * Copyright (c) 2009-2010 Chelsio, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include + +#include "iw_cxgb4.h" + +static int db_delay_usecs = 1; +module_param(db_delay_usecs, int, 0644); +MODULE_PARM_DESC(db_delay_usecs, "Usecs to delay awaiting db fifo to drain"); + +static int ocqp_support = 1; +module_param(ocqp_support, int, 0644); +MODULE_PARM_DESC(ocqp_support, "Support on-chip SQs (default=1)"); + +int db_fc_threshold = 1000; +module_param(db_fc_threshold, int, 0644); +MODULE_PARM_DESC(db_fc_threshold, + "QP count/threshold that triggers" + " automatic db flow control mode (default = 1000)"); + +int db_coalescing_threshold; +module_param(db_coalescing_threshold, int, 0644); +MODULE_PARM_DESC(db_coalescing_threshold, + "QP count/threshold that triggers" + " disabling db coalescing (default = 0)"); + +static int max_fr_immd = T4_MAX_FR_IMMD; +module_param(max_fr_immd, int, 0644); +MODULE_PARM_DESC(max_fr_immd, "fastreg threshold for using DSGL instead of immedate"); + +static int alloc_ird(struct c4iw_dev *dev, u32 ird) +{ + int ret = 0; + + spin_lock_irq(&dev->lock); + if (ird <= dev->avail_ird) + dev->avail_ird -= ird; + else + ret = -ENOMEM; + spin_unlock_irq(&dev->lock); + + if (ret) + dev_warn(&dev->rdev.lldi.pdev->dev, + "device IRD resources exhausted\n"); + + return ret; +} + +static void free_ird(struct c4iw_dev *dev, int ird) +{ + spin_lock_irq(&dev->lock); + dev->avail_ird += ird; + spin_unlock_irq(&dev->lock); +} + +static void set_state(struct c4iw_qp *qhp, enum c4iw_qp_state state) +{ + unsigned long flag; + spin_lock_irqsave(&qhp->lock, flag); + qhp->attr.state = state; + spin_unlock_irqrestore(&qhp->lock, flag); +} + +static void dealloc_oc_sq(struct c4iw_rdev *rdev, struct t4_sq *sq) +{ + c4iw_ocqp_pool_free(rdev, sq->dma_addr, sq->memsize); +} + +static void dealloc_host_sq(struct c4iw_rdev *rdev, struct t4_sq *sq) +{ + dma_free_coherent(&(rdev->lldi.pdev->dev), sq->memsize, sq->queue, + pci_unmap_addr(sq, mapping)); +} + +static void dealloc_sq(struct c4iw_rdev *rdev, struct t4_sq *sq) +{ + if (t4_sq_onchip(sq)) + dealloc_oc_sq(rdev, sq); + else + dealloc_host_sq(rdev, sq); +} + +static int alloc_oc_sq(struct c4iw_rdev *rdev, struct t4_sq *sq) +{ + if (!ocqp_support || !ocqp_supported(&rdev->lldi)) + return -ENOSYS; + sq->dma_addr = c4iw_ocqp_pool_alloc(rdev, sq->memsize); + if (!sq->dma_addr) + return -ENOMEM; + sq->phys_addr = rdev->oc_mw_pa + sq->dma_addr - + rdev->lldi.vr->ocq.start; + sq->queue = (__force union t4_wr *)(rdev->oc_mw_kva + sq->dma_addr - + rdev->lldi.vr->ocq.start); + sq->flags |= T4_SQ_ONCHIP; + return 0; +} + +static int alloc_host_sq(struct c4iw_rdev *rdev, struct t4_sq *sq) +{ + sq->queue = dma_alloc_coherent(&(rdev->lldi.pdev->dev), sq->memsize, + &(sq->dma_addr), GFP_KERNEL); + if (!sq->queue) + return -ENOMEM; + sq->phys_addr = virt_to_phys(sq->queue); + pci_unmap_addr_set(sq, mapping, sq->dma_addr); + return 0; +} + +static int alloc_sq(struct c4iw_rdev *rdev, struct t4_sq *sq, int user) +{ + int ret = -ENOSYS; + if (user) + ret = alloc_oc_sq(rdev, sq); + if (ret) + ret = alloc_host_sq(rdev, sq); + return ret; +} + +static int destroy_qp(struct c4iw_rdev *rdev, struct t4_wq *wq, + struct c4iw_dev_ucontext *uctx) +{ + /* + * uP clears EQ contexts when the connection exits rdma mode, + * so no need to post a RESET WR for these EQs. 
+ */ + dma_free_coherent(&(rdev->lldi.pdev->dev), + wq->rq.memsize, wq->rq.queue, + dma_unmap_addr(&wq->rq, mapping)); + dealloc_sq(rdev, &wq->sq); + c4iw_rqtpool_free(rdev, wq->rq.rqt_hwaddr, wq->rq.rqt_size); + kfree(wq->rq.sw_rq); + kfree(wq->sq.sw_sq); + c4iw_put_qpid(rdev, wq->rq.qid, uctx); + c4iw_put_qpid(rdev, wq->sq.qid, uctx); + return 0; +} + +static int create_qp(struct c4iw_rdev *rdev, struct t4_wq *wq, + struct t4_cq *rcq, struct t4_cq *scq, + struct c4iw_dev_ucontext *uctx) +{ + int user = (uctx != &rdev->uctx); + struct fw_ri_res_wr *res_wr; + struct fw_ri_res *res; + int wr_len; + struct c4iw_wr_wait wr_wait; + struct sk_buff *skb; + int ret = 0; + int eqsize; + + wq->sq.qid = c4iw_get_qpid(rdev, uctx); + if (!wq->sq.qid) + return -ENOMEM; + + wq->rq.qid = c4iw_get_qpid(rdev, uctx); + if (!wq->rq.qid) { + ret = -ENOMEM; + goto free_sq_qid; + } + + if (!user) { + wq->sq.sw_sq = kzalloc(wq->sq.size * sizeof *wq->sq.sw_sq, + GFP_KERNEL); + if (!wq->sq.sw_sq) { + ret = -ENOMEM; + goto free_rq_qid; + } + + wq->rq.sw_rq = kzalloc(wq->rq.size * sizeof *wq->rq.sw_rq, + GFP_KERNEL); + if (!wq->rq.sw_rq) { + ret = -ENOMEM; + goto free_sw_sq; + } + } + + /* + * RQT must be a power of 2 and at least 16 deep. + */ + wq->rq.rqt_size = roundup_pow_of_two(max_t(u16, wq->rq.size, 16)); + wq->rq.rqt_hwaddr = c4iw_rqtpool_alloc(rdev, wq->rq.rqt_size); + if (!wq->rq.rqt_hwaddr) { + ret = -ENOMEM; + goto free_sw_rq; + } + + ret = alloc_sq(rdev, &wq->sq, user); + if (ret) + goto free_hwaddr; + memset(wq->sq.queue, 0, wq->sq.memsize); + dma_unmap_addr_set(&wq->sq, mapping, wq->sq.dma_addr); + + wq->rq.queue = dma_alloc_coherent(&(rdev->lldi.pdev->dev), + wq->rq.memsize, &(wq->rq.dma_addr), + GFP_KERNEL); + if (!wq->rq.queue) { + ret = -ENOMEM; + goto free_sq; + } + PDBG("%s sq base va 0x%p pa 0x%llx rq base va 0x%p pa 0x%llx\n", + __func__, wq->sq.queue, + (unsigned long long)virt_to_phys(wq->sq.queue), + wq->rq.queue, + (unsigned long long)virt_to_phys(wq->rq.queue)); + memset(wq->rq.queue, 0, wq->rq.memsize); + dma_unmap_addr_set(&wq->rq, mapping, wq->rq.dma_addr); + + wq->db = rdev->lldi.db_reg; + wq->gts = rdev->lldi.gts_reg; + if (user || is_t5(rdev->lldi.adapter_type)) { + u32 off; + + off = (wq->sq.qid << rdev->qpshift) & PAGE_MASK; + if (user) { + wq->sq.udb = (u64 __iomem *)(rdev->bar2_pa + off); + } else { + off += 128 * (wq->sq.qid & rdev->qpmask) + 8; + wq->sq.udb = (u64 __iomem *)(rdev->bar2_kva + off); + } + off = (wq->rq.qid << rdev->qpshift) & PAGE_MASK; + if (user) { + wq->rq.udb = (u64 __iomem *)(rdev->bar2_pa + off); + } else { + off += 128 * (wq->rq.qid & rdev->qpmask) + 8; + wq->rq.udb = (u64 __iomem *)(rdev->bar2_kva + off); + } + } + wq->rdev = rdev; + wq->rq.msn = 1; + + /* build fw_ri_res_wr */ + wr_len = sizeof *res_wr + 2 * sizeof *res; + + skb = alloc_skb(wr_len, GFP_KERNEL); + if (!skb) { + ret = -ENOMEM; + goto free_dma; + } + set_wr_txq(skb, CPL_PRIORITY_CONTROL, 0); + + res_wr = (struct fw_ri_res_wr *)__skb_put(skb, wr_len); + memset(res_wr, 0, wr_len); + res_wr->op_nres = cpu_to_be32( + FW_WR_OP_V(FW_RI_RES_WR) | + FW_RI_RES_WR_NRES_V(2) | + FW_WR_COMPL_F); + res_wr->len16_pkd = cpu_to_be32(DIV_ROUND_UP(wr_len, 16)); + res_wr->cookie = (uintptr_t)&wr_wait; + res = res_wr->res; + res->u.sqrq.restype = FW_RI_RES_TYPE_SQ; + res->u.sqrq.op = FW_RI_RES_OP_WRITE; + + /* + * eqsize is the number of 64B entries plus the status page size. 
+ */ + eqsize = wq->sq.size * T4_SQ_NUM_SLOTS + + rdev->hw_queue.t4_eq_status_entries; + + res->u.sqrq.fetchszm_to_iqid = cpu_to_be32( + FW_RI_RES_WR_HOSTFCMODE_V(0) | /* no host cidx updates */ + FW_RI_RES_WR_CPRIO_V(0) | /* don't keep in chip cache */ + FW_RI_RES_WR_PCIECHN_V(0) | /* set by uP at ri_init time */ + (t4_sq_onchip(&wq->sq) ? FW_RI_RES_WR_ONCHIP_F : 0) | + FW_RI_RES_WR_IQID_V(scq->cqid)); + res->u.sqrq.dcaen_to_eqsize = cpu_to_be32( + FW_RI_RES_WR_DCAEN_V(0) | + FW_RI_RES_WR_DCACPU_V(0) | + FW_RI_RES_WR_FBMIN_V(2) | + FW_RI_RES_WR_FBMAX_V(2) | + FW_RI_RES_WR_CIDXFTHRESHO_V(0) | + FW_RI_RES_WR_CIDXFTHRESH_V(0) | + FW_RI_RES_WR_EQSIZE_V(eqsize)); + res->u.sqrq.eqid = cpu_to_be32(wq->sq.qid); + res->u.sqrq.eqaddr = cpu_to_be64(wq->sq.dma_addr); + res++; + res->u.sqrq.restype = FW_RI_RES_TYPE_RQ; + res->u.sqrq.op = FW_RI_RES_OP_WRITE; + + /* + * eqsize is the number of 64B entries plus the status page size. + */ + eqsize = wq->rq.size * T4_RQ_NUM_SLOTS + + rdev->hw_queue.t4_eq_status_entries; + res->u.sqrq.fetchszm_to_iqid = cpu_to_be32( + FW_RI_RES_WR_HOSTFCMODE_V(0) | /* no host cidx updates */ + FW_RI_RES_WR_CPRIO_V(0) | /* don't keep in chip cache */ + FW_RI_RES_WR_PCIECHN_V(0) | /* set by uP at ri_init time */ + FW_RI_RES_WR_IQID_V(rcq->cqid)); + res->u.sqrq.dcaen_to_eqsize = cpu_to_be32( + FW_RI_RES_WR_DCAEN_V(0) | + FW_RI_RES_WR_DCACPU_V(0) | + FW_RI_RES_WR_FBMIN_V(2) | + FW_RI_RES_WR_FBMAX_V(2) | + FW_RI_RES_WR_CIDXFTHRESHO_V(0) | + FW_RI_RES_WR_CIDXFTHRESH_V(0) | + FW_RI_RES_WR_EQSIZE_V(eqsize)); + res->u.sqrq.eqid = cpu_to_be32(wq->rq.qid); + res->u.sqrq.eqaddr = cpu_to_be64(wq->rq.dma_addr); + + c4iw_init_wr_wait(&wr_wait); + + ret = c4iw_ofld_send(rdev, skb); + if (ret) + goto free_dma; + ret = c4iw_wait_for_reply(rdev, &wr_wait, 0, wq->sq.qid, __func__); + if (ret) + goto free_dma; + + PDBG("%s sqid 0x%x rqid 0x%x kdb 0x%p squdb 0x%lx rqudb 0x%lx\n", + __func__, wq->sq.qid, wq->rq.qid, wq->db, + (__force unsigned long) wq->sq.udb, + (__force unsigned long) wq->rq.udb); + + return 0; +free_dma: + dma_free_coherent(&(rdev->lldi.pdev->dev), + wq->rq.memsize, wq->rq.queue, + dma_unmap_addr(&wq->rq, mapping)); +free_sq: + dealloc_sq(rdev, &wq->sq); +free_hwaddr: + c4iw_rqtpool_free(rdev, wq->rq.rqt_hwaddr, wq->rq.rqt_size); +free_sw_rq: + kfree(wq->rq.sw_rq); +free_sw_sq: + kfree(wq->sq.sw_sq); +free_rq_qid: + c4iw_put_qpid(rdev, wq->rq.qid, uctx); +free_sq_qid: + c4iw_put_qpid(rdev, wq->sq.qid, uctx); + return ret; +} + +static int build_immd(struct t4_sq *sq, struct fw_ri_immd *immdp, + struct ib_send_wr *wr, int max, u32 *plenp) +{ + u8 *dstp, *srcp; + u32 plen = 0; + int i; + int rem, len; + + dstp = (u8 *)immdp->data; + for (i = 0; i < wr->num_sge; i++) { + if ((plen + wr->sg_list[i].length) > max) + return -EMSGSIZE; + srcp = (u8 *)(unsigned long)wr->sg_list[i].addr; + plen += wr->sg_list[i].length; + rem = wr->sg_list[i].length; + while (rem) { + if (dstp == (u8 *)&sq->queue[sq->size]) + dstp = (u8 *)sq->queue; + if (rem <= (u8 *)&sq->queue[sq->size] - dstp) + len = rem; + else + len = (u8 *)&sq->queue[sq->size] - dstp; + memcpy(dstp, srcp, len); + dstp += len; + srcp += len; + rem -= len; + } + } + len = roundup(plen + sizeof *immdp, 16) - (plen + sizeof *immdp); + if (len) + memset(dstp, 0, len); + immdp->op = FW_RI_DATA_IMMD; + immdp->r1 = 0; + immdp->r2 = 0; + immdp->immdlen = cpu_to_be32(plen); + *plenp = plen; + return 0; +} + +static int build_isgl(__be64 *queue_start, __be64 *queue_end, + struct fw_ri_isgl *isglp, struct ib_sge *sg_list, + int num_sge, u32 
*plenp) + +{ + int i; + u32 plen = 0; + __be64 *flitp = (__be64 *)isglp->sge; + + for (i = 0; i < num_sge; i++) { + if ((plen + sg_list[i].length) < plen) + return -EMSGSIZE; + plen += sg_list[i].length; + *flitp = cpu_to_be64(((u64)sg_list[i].lkey << 32) | + sg_list[i].length); + if (++flitp == queue_end) + flitp = queue_start; + *flitp = cpu_to_be64(sg_list[i].addr); + if (++flitp == queue_end) + flitp = queue_start; + } + *flitp = (__force __be64)0; + isglp->op = FW_RI_DATA_ISGL; + isglp->r1 = 0; + isglp->nsge = cpu_to_be16(num_sge); + isglp->r2 = 0; + if (plenp) + *plenp = plen; + return 0; +} + +static int build_rdma_send(struct t4_sq *sq, union t4_wr *wqe, + struct ib_send_wr *wr, u8 *len16) +{ + u32 plen; + int size; + int ret; + + if (wr->num_sge > T4_MAX_SEND_SGE) + return -EINVAL; + switch (wr->opcode) { + case IB_WR_SEND: + if (wr->send_flags & IB_SEND_SOLICITED) + wqe->send.sendop_pkd = cpu_to_be32( + FW_RI_SEND_WR_SENDOP_V(FW_RI_SEND_WITH_SE)); + else + wqe->send.sendop_pkd = cpu_to_be32( + FW_RI_SEND_WR_SENDOP_V(FW_RI_SEND)); + wqe->send.stag_inv = 0; + break; + case IB_WR_SEND_WITH_INV: + if (wr->send_flags & IB_SEND_SOLICITED) + wqe->send.sendop_pkd = cpu_to_be32( + FW_RI_SEND_WR_SENDOP_V(FW_RI_SEND_WITH_SE_INV)); + else + wqe->send.sendop_pkd = cpu_to_be32( + FW_RI_SEND_WR_SENDOP_V(FW_RI_SEND_WITH_INV)); + wqe->send.stag_inv = cpu_to_be32(wr->ex.invalidate_rkey); + break; + + default: + return -EINVAL; + } + wqe->send.r3 = 0; + wqe->send.r4 = 0; + + plen = 0; + if (wr->num_sge) { + if (wr->send_flags & IB_SEND_INLINE) { + ret = build_immd(sq, wqe->send.u.immd_src, wr, + T4_MAX_SEND_INLINE, &plen); + if (ret) + return ret; + size = sizeof wqe->send + sizeof(struct fw_ri_immd) + + plen; + } else { + ret = build_isgl((__be64 *)sq->queue, + (__be64 *)&sq->queue[sq->size], + wqe->send.u.isgl_src, + wr->sg_list, wr->num_sge, &plen); + if (ret) + return ret; + size = sizeof wqe->send + sizeof(struct fw_ri_isgl) + + wr->num_sge * sizeof(struct fw_ri_sge); + } + } else { + wqe->send.u.immd_src[0].op = FW_RI_DATA_IMMD; + wqe->send.u.immd_src[0].r1 = 0; + wqe->send.u.immd_src[0].r2 = 0; + wqe->send.u.immd_src[0].immdlen = 0; + size = sizeof wqe->send + sizeof(struct fw_ri_immd); + plen = 0; + } + *len16 = DIV_ROUND_UP(size, 16); + wqe->send.plen = cpu_to_be32(plen); + return 0; +} + +static int build_rdma_write(struct t4_sq *sq, union t4_wr *wqe, + struct ib_send_wr *wr, u8 *len16) +{ + u32 plen; + int size; + int ret; + + if (wr->num_sge > T4_MAX_SEND_SGE) + return -EINVAL; + wqe->write.r2 = 0; + wqe->write.stag_sink = cpu_to_be32(wr->wr.rdma.rkey); + wqe->write.to_sink = cpu_to_be64(wr->wr.rdma.remote_addr); + if (wr->num_sge) { + if (wr->send_flags & IB_SEND_INLINE) { + ret = build_immd(sq, wqe->write.u.immd_src, wr, + T4_MAX_WRITE_INLINE, &plen); + if (ret) + return ret; + size = sizeof wqe->write + sizeof(struct fw_ri_immd) + + plen; + } else { + ret = build_isgl((__be64 *)sq->queue, + (__be64 *)&sq->queue[sq->size], + wqe->write.u.isgl_src, + wr->sg_list, wr->num_sge, &plen); + if (ret) + return ret; + size = sizeof wqe->write + sizeof(struct fw_ri_isgl) + + wr->num_sge * sizeof(struct fw_ri_sge); + } + } else { + wqe->write.u.immd_src[0].op = FW_RI_DATA_IMMD; + wqe->write.u.immd_src[0].r1 = 0; + wqe->write.u.immd_src[0].r2 = 0; + wqe->write.u.immd_src[0].immdlen = 0; + size = sizeof wqe->write + sizeof(struct fw_ri_immd); + plen = 0; + } + *len16 = DIV_ROUND_UP(size, 16); + wqe->write.plen = cpu_to_be32(plen); + return 0; +} + +static int build_rdma_read(union t4_wr *wqe, 
struct ib_send_wr *wr, u8 *len16) +{ + if (wr->num_sge > 1) + return -EINVAL; + if (wr->num_sge) { + wqe->read.stag_src = cpu_to_be32(wr->wr.rdma.rkey); + wqe->read.to_src_hi = cpu_to_be32((u32)(wr->wr.rdma.remote_addr + >> 32)); + wqe->read.to_src_lo = cpu_to_be32((u32)wr->wr.rdma.remote_addr); + wqe->read.stag_sink = cpu_to_be32(wr->sg_list[0].lkey); + wqe->read.plen = cpu_to_be32(wr->sg_list[0].length); + wqe->read.to_sink_hi = cpu_to_be32((u32)(wr->sg_list[0].addr + >> 32)); + wqe->read.to_sink_lo = cpu_to_be32((u32)(wr->sg_list[0].addr)); + } else { + wqe->read.stag_src = cpu_to_be32(2); + wqe->read.to_src_hi = 0; + wqe->read.to_src_lo = 0; + wqe->read.stag_sink = cpu_to_be32(2); + wqe->read.plen = 0; + wqe->read.to_sink_hi = 0; + wqe->read.to_sink_lo = 0; + } + wqe->read.r2 = 0; + wqe->read.r5 = 0; + *len16 = DIV_ROUND_UP(sizeof wqe->read, 16); + return 0; +} + +static int build_rdma_recv(struct c4iw_qp *qhp, union t4_recv_wr *wqe, + struct ib_recv_wr *wr, u8 *len16) +{ + int ret; + + ret = build_isgl((__be64 *)qhp->wq.rq.queue, + (__be64 *)&qhp->wq.rq.queue[qhp->wq.rq.size], + &wqe->recv.isgl, wr->sg_list, wr->num_sge, NULL); + if (ret) + return ret; + *len16 = DIV_ROUND_UP(sizeof wqe->recv + + wr->num_sge * sizeof(struct fw_ri_sge), 16); + return 0; +} + +static int build_fastreg(struct t4_sq *sq, union t4_wr *wqe, + struct ib_send_wr *wr, u8 *len16, u8 t5dev) +{ + + struct fw_ri_immd *imdp; + __be64 *p; + int i; + int pbllen = roundup(wr->wr.fast_reg.page_list_len * sizeof(u64), 32); + int rem; + + if (wr->wr.fast_reg.page_list_len > + t4_max_fr_depth(use_dsgl)) + return -EINVAL; + + wqe->fr.qpbinde_to_dcacpu = 0; + wqe->fr.pgsz_shift = wr->wr.fast_reg.page_shift - 12; + wqe->fr.addr_type = FW_RI_VA_BASED_TO; + wqe->fr.mem_perms = c4iw_ib_to_tpt_access(wr->wr.fast_reg.access_flags); + wqe->fr.len_hi = 0; + wqe->fr.len_lo = cpu_to_be32(wr->wr.fast_reg.length); + wqe->fr.stag = cpu_to_be32(wr->wr.fast_reg.rkey); + wqe->fr.va_hi = cpu_to_be32(wr->wr.fast_reg.iova_start >> 32); + wqe->fr.va_lo_fbo = cpu_to_be32(wr->wr.fast_reg.iova_start & + 0xffffffff); + + if (t5dev && use_dsgl && (pbllen > max_fr_immd)) { + struct c4iw_fr_page_list *c4pl = + to_c4iw_fr_page_list(wr->wr.fast_reg.page_list); + struct fw_ri_dsgl *sglp; + + for (i = 0; i < wr->wr.fast_reg.page_list_len; i++) { + wr->wr.fast_reg.page_list->page_list[i] = (__force u64) + cpu_to_be64((u64) + wr->wr.fast_reg.page_list->page_list[i]); + } + + sglp = (struct fw_ri_dsgl *)(&wqe->fr + 1); + sglp->op = FW_RI_DATA_DSGL; + sglp->r1 = 0; + sglp->nsge = cpu_to_be16(1); + sglp->addr0 = cpu_to_be64(c4pl->dma_addr); + sglp->len0 = cpu_to_be32(pbllen); + + *len16 = DIV_ROUND_UP(sizeof(wqe->fr) + sizeof(*sglp), 16); + } else { + imdp = (struct fw_ri_immd *)(&wqe->fr + 1); + imdp->op = FW_RI_DATA_IMMD; + imdp->r1 = 0; + imdp->r2 = 0; + imdp->immdlen = cpu_to_be32(pbllen); + p = (__be64 *)(imdp + 1); + rem = pbllen; + for (i = 0; i < wr->wr.fast_reg.page_list_len; i++) { + *p = cpu_to_be64( + (u64)wr->wr.fast_reg.page_list->page_list[i]); + rem -= sizeof(*p); + if (++p == (__be64 *)&sq->queue[sq->size]) + p = (__be64 *)sq->queue; + } + BUG_ON(rem < 0); + while (rem) { + *p = 0; + rem -= sizeof(*p); + if (++p == (__be64 *)&sq->queue[sq->size]) + p = (__be64 *)sq->queue; + } + *len16 = DIV_ROUND_UP(sizeof(wqe->fr) + sizeof(*imdp) + + pbllen, 16); + } + return 0; +} + +static int build_inv_stag(union t4_wr *wqe, struct ib_send_wr *wr, + u8 *len16) +{ + wqe->inv.stag_inv = cpu_to_be32(wr->ex.invalidate_rkey); + wqe->inv.r2 = 0; + *len16 = 
DIV_ROUND_UP(sizeof wqe->inv, 16); + return 0; +} + +void c4iw_qp_add_ref(struct ib_qp *qp) +{ + PDBG("%s ib_qp %p\n", __func__, qp); + atomic_inc(&(to_c4iw_qp(qp)->refcnt)); +} + +void c4iw_qp_rem_ref(struct ib_qp *qp) +{ + PDBG("%s ib_qp %p\n", __func__, qp); + if (atomic_dec_and_test(&(to_c4iw_qp(qp)->refcnt))) + wake_up(&(to_c4iw_qp(qp)->wait)); +} + +static void add_to_fc_list(struct list_head *head, struct list_head *entry) +{ + if (list_empty(entry)) + list_add_tail(entry, head); +} + +static int ring_kernel_sq_db(struct c4iw_qp *qhp, u16 inc) +{ + unsigned long flags; + + spin_lock_irqsave(&qhp->rhp->lock, flags); + spin_lock(&qhp->lock); + if (qhp->rhp->db_state == NORMAL) + t4_ring_sq_db(&qhp->wq, inc, + is_t5(qhp->rhp->rdev.lldi.adapter_type), NULL); + else { + add_to_fc_list(&qhp->rhp->db_fc_list, &qhp->db_fc_entry); + qhp->wq.sq.wq_pidx_inc += inc; + } + spin_unlock(&qhp->lock); + spin_unlock_irqrestore(&qhp->rhp->lock, flags); + return 0; +} + +static int ring_kernel_rq_db(struct c4iw_qp *qhp, u16 inc) +{ + unsigned long flags; + + spin_lock_irqsave(&qhp->rhp->lock, flags); + spin_lock(&qhp->lock); + if (qhp->rhp->db_state == NORMAL) + t4_ring_rq_db(&qhp->wq, inc, + is_t5(qhp->rhp->rdev.lldi.adapter_type), NULL); + else { + add_to_fc_list(&qhp->rhp->db_fc_list, &qhp->db_fc_entry); + qhp->wq.rq.wq_pidx_inc += inc; + } + spin_unlock(&qhp->lock); + spin_unlock_irqrestore(&qhp->rhp->lock, flags); + return 0; +} + +int c4iw_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr, + struct ib_send_wr **bad_wr) +{ + int err = 0; + u8 len16 = 0; + enum fw_wr_opcodes fw_opcode = 0; + enum fw_ri_wr_flags fw_flags; + struct c4iw_qp *qhp; + union t4_wr *wqe = NULL; + u32 num_wrs; + struct t4_swsqe *swsqe; + unsigned long flag; + u16 idx = 0; + + qhp = to_c4iw_qp(ibqp); + spin_lock_irqsave(&qhp->lock, flag); + if (t4_wq_in_error(&qhp->wq)) { + spin_unlock_irqrestore(&qhp->lock, flag); + return -EINVAL; + } + num_wrs = t4_sq_avail(&qhp->wq); + if (num_wrs == 0) { + spin_unlock_irqrestore(&qhp->lock, flag); + return -ENOMEM; + } + while (wr) { + if (num_wrs == 0) { + err = -ENOMEM; + *bad_wr = wr; + break; + } + wqe = (union t4_wr *)((u8 *)qhp->wq.sq.queue + + qhp->wq.sq.wq_pidx * T4_EQ_ENTRY_SIZE); + + fw_flags = 0; + if (wr->send_flags & IB_SEND_SOLICITED) + fw_flags |= FW_RI_SOLICITED_EVENT_FLAG; + if (wr->send_flags & IB_SEND_SIGNALED || qhp->sq_sig_all) + fw_flags |= FW_RI_COMPLETION_FLAG; + swsqe = &qhp->wq.sq.sw_sq[qhp->wq.sq.pidx]; + switch (wr->opcode) { + case IB_WR_SEND_WITH_INV: + case IB_WR_SEND: + if (wr->send_flags & IB_SEND_FENCE) + fw_flags |= FW_RI_READ_FENCE_FLAG; + fw_opcode = FW_RI_SEND_WR; + if (wr->opcode == IB_WR_SEND) + swsqe->opcode = FW_RI_SEND; + else + swsqe->opcode = FW_RI_SEND_WITH_INV; + err = build_rdma_send(&qhp->wq.sq, wqe, wr, &len16); + break; + case IB_WR_RDMA_WRITE: + fw_opcode = FW_RI_RDMA_WRITE_WR; + swsqe->opcode = FW_RI_RDMA_WRITE; + err = build_rdma_write(&qhp->wq.sq, wqe, wr, &len16); + break; + case IB_WR_RDMA_READ: + case IB_WR_RDMA_READ_WITH_INV: + fw_opcode = FW_RI_RDMA_READ_WR; + swsqe->opcode = FW_RI_READ_REQ; + if (wr->opcode == IB_WR_RDMA_READ_WITH_INV) + fw_flags = FW_RI_RDMA_READ_INVALIDATE; + else + fw_flags = 0; + err = build_rdma_read(wqe, wr, &len16); + if (err) + break; + swsqe->read_len = wr->sg_list[0].length; + if (!qhp->wq.sq.oldest_read) + qhp->wq.sq.oldest_read = swsqe; + break; + case IB_WR_FAST_REG_MR: + fw_opcode = FW_RI_FR_NSMR_WR; + swsqe->opcode = FW_RI_FAST_REGISTER; + err = build_fastreg(&qhp->wq.sq, wqe, wr, &len16, + 
is_t5( + qhp->rhp->rdev.lldi.adapter_type) ? + 1 : 0); + break; + case IB_WR_LOCAL_INV: + if (wr->send_flags & IB_SEND_FENCE) + fw_flags |= FW_RI_LOCAL_FENCE_FLAG; + fw_opcode = FW_RI_INV_LSTAG_WR; + swsqe->opcode = FW_RI_LOCAL_INV; + err = build_inv_stag(wqe, wr, &len16); + break; + default: + PDBG("%s post of type=%d TBD!\n", __func__, + wr->opcode); + err = -EINVAL; + } + if (err) { + *bad_wr = wr; + break; + } + swsqe->idx = qhp->wq.sq.pidx; + swsqe->complete = 0; + swsqe->signaled = (wr->send_flags & IB_SEND_SIGNALED) || + qhp->sq_sig_all; + swsqe->flushed = 0; + swsqe->wr_id = wr->wr_id; + if (c4iw_wr_log) { + swsqe->sge_ts = cxgb4_read_sge_timestamp( + qhp->rhp->rdev.lldi.ports[0]); + getnstimeofday(&swsqe->host_ts); + } + + init_wr_hdr(wqe, qhp->wq.sq.pidx, fw_opcode, fw_flags, len16); + + PDBG("%s cookie 0x%llx pidx 0x%x opcode 0x%x read_len %u\n", + __func__, (unsigned long long)wr->wr_id, qhp->wq.sq.pidx, + swsqe->opcode, swsqe->read_len); + wr = wr->next; + num_wrs--; + t4_sq_produce(&qhp->wq, len16); + idx += DIV_ROUND_UP(len16*16, T4_EQ_ENTRY_SIZE); + } + if (!qhp->rhp->rdev.status_page->db_off) { + t4_ring_sq_db(&qhp->wq, idx, + is_t5(qhp->rhp->rdev.lldi.adapter_type), wqe); + spin_unlock_irqrestore(&qhp->lock, flag); + } else { + spin_unlock_irqrestore(&qhp->lock, flag); + ring_kernel_sq_db(qhp, idx); + } + return err; +} + +int c4iw_post_receive(struct ib_qp *ibqp, struct ib_recv_wr *wr, + struct ib_recv_wr **bad_wr) +{ + int err = 0; + struct c4iw_qp *qhp; + union t4_recv_wr *wqe = NULL; + u32 num_wrs; + u8 len16 = 0; + unsigned long flag; + u16 idx = 0; + + qhp = to_c4iw_qp(ibqp); + spin_lock_irqsave(&qhp->lock, flag); + if (t4_wq_in_error(&qhp->wq)) { + spin_unlock_irqrestore(&qhp->lock, flag); + return -EINVAL; + } + num_wrs = t4_rq_avail(&qhp->wq); + if (num_wrs == 0) { + spin_unlock_irqrestore(&qhp->lock, flag); + return -ENOMEM; + } + while (wr) { + if (wr->num_sge > T4_MAX_RECV_SGE) { + err = -EINVAL; + *bad_wr = wr; + break; + } + wqe = (union t4_recv_wr *)((u8 *)qhp->wq.rq.queue + + qhp->wq.rq.wq_pidx * + T4_EQ_ENTRY_SIZE); + if (num_wrs) + err = build_rdma_recv(qhp, wqe, wr, &len16); + else + err = -ENOMEM; + if (err) { + *bad_wr = wr; + break; + } + + qhp->wq.rq.sw_rq[qhp->wq.rq.pidx].wr_id = wr->wr_id; + if (c4iw_wr_log) { + qhp->wq.rq.sw_rq[qhp->wq.rq.pidx].sge_ts = + cxgb4_read_sge_timestamp( + qhp->rhp->rdev.lldi.ports[0]); + getnstimeofday( + &qhp->wq.rq.sw_rq[qhp->wq.rq.pidx].host_ts); + } + + wqe->recv.opcode = FW_RI_RECV_WR; + wqe->recv.r1 = 0; + wqe->recv.wrid = qhp->wq.rq.pidx; + wqe->recv.r2[0] = 0; + wqe->recv.r2[1] = 0; + wqe->recv.r2[2] = 0; + wqe->recv.len16 = len16; + PDBG("%s cookie 0x%llx pidx %u\n", __func__, + (unsigned long long) wr->wr_id, qhp->wq.rq.pidx); + t4_rq_produce(&qhp->wq, len16); + idx += DIV_ROUND_UP(len16*16, T4_EQ_ENTRY_SIZE); + wr = wr->next; + num_wrs--; + } + if (!qhp->rhp->rdev.status_page->db_off) { + t4_ring_rq_db(&qhp->wq, idx, + is_t5(qhp->rhp->rdev.lldi.adapter_type), wqe); + spin_unlock_irqrestore(&qhp->lock, flag); + } else { + spin_unlock_irqrestore(&qhp->lock, flag); + ring_kernel_rq_db(qhp, idx); + } + return err; +} + +int c4iw_bind_mw(struct ib_qp *qp, struct ib_mw *mw, struct ib_mw_bind *mw_bind) +{ + return -ENOSYS; +} + +static inline void build_term_codes(struct t4_cqe *err_cqe, u8 *layer_type, + u8 *ecode) +{ + int status; + int tagged; + int opcode; + int rqtype; + int send_inv; + + if (!err_cqe) { + *layer_type = LAYER_RDMAP|DDP_LOCAL_CATA; + *ecode = 0; + return; + } + + status = CQE_STATUS(err_cqe); + 
opcode = CQE_OPCODE(err_cqe); + rqtype = RQ_TYPE(err_cqe); + send_inv = (opcode == FW_RI_SEND_WITH_INV) || + (opcode == FW_RI_SEND_WITH_SE_INV); + tagged = (opcode == FW_RI_RDMA_WRITE) || + (rqtype && (opcode == FW_RI_READ_RESP)); + + switch (status) { + case T4_ERR_STAG: + if (send_inv) { + *layer_type = LAYER_RDMAP|RDMAP_REMOTE_OP; + *ecode = RDMAP_CANT_INV_STAG; + } else { + *layer_type = LAYER_RDMAP|RDMAP_REMOTE_PROT; + *ecode = RDMAP_INV_STAG; + } + break; + case T4_ERR_PDID: + *layer_type = LAYER_RDMAP|RDMAP_REMOTE_PROT; + if ((opcode == FW_RI_SEND_WITH_INV) || + (opcode == FW_RI_SEND_WITH_SE_INV)) + *ecode = RDMAP_CANT_INV_STAG; + else + *ecode = RDMAP_STAG_NOT_ASSOC; + break; + case T4_ERR_QPID: + *layer_type = LAYER_RDMAP|RDMAP_REMOTE_PROT; + *ecode = RDMAP_STAG_NOT_ASSOC; + break; + case T4_ERR_ACCESS: + *layer_type = LAYER_RDMAP|RDMAP_REMOTE_PROT; + *ecode = RDMAP_ACC_VIOL; + break; + case T4_ERR_WRAP: + *layer_type = LAYER_RDMAP|RDMAP_REMOTE_PROT; + *ecode = RDMAP_TO_WRAP; + break; + case T4_ERR_BOUND: + if (tagged) { + *layer_type = LAYER_DDP|DDP_TAGGED_ERR; + *ecode = DDPT_BASE_BOUNDS; + } else { + *layer_type = LAYER_RDMAP|RDMAP_REMOTE_PROT; + *ecode = RDMAP_BASE_BOUNDS; + } + break; + case T4_ERR_INVALIDATE_SHARED_MR: + case T4_ERR_INVALIDATE_MR_WITH_MW_BOUND: + *layer_type = LAYER_RDMAP|RDMAP_REMOTE_OP; + *ecode = RDMAP_CANT_INV_STAG; + break; + case T4_ERR_ECC: + case T4_ERR_ECC_PSTAG: + case T4_ERR_INTERNAL_ERR: + *layer_type = LAYER_RDMAP|RDMAP_LOCAL_CATA; + *ecode = 0; + break; + case T4_ERR_OUT_OF_RQE: + *layer_type = LAYER_DDP|DDP_UNTAGGED_ERR; + *ecode = DDPU_INV_MSN_NOBUF; + break; + case T4_ERR_PBL_ADDR_BOUND: + *layer_type = LAYER_DDP|DDP_TAGGED_ERR; + *ecode = DDPT_BASE_BOUNDS; + break; + case T4_ERR_CRC: + *layer_type = LAYER_MPA|DDP_LLP; + *ecode = MPA_CRC_ERR; + break; + case T4_ERR_MARKER: + *layer_type = LAYER_MPA|DDP_LLP; + *ecode = MPA_MARKER_ERR; + break; + case T4_ERR_PDU_LEN_ERR: + *layer_type = LAYER_DDP|DDP_UNTAGGED_ERR; + *ecode = DDPU_MSG_TOOBIG; + break; + case T4_ERR_DDP_VERSION: + if (tagged) { + *layer_type = LAYER_DDP|DDP_TAGGED_ERR; + *ecode = DDPT_INV_VERS; + } else { + *layer_type = LAYER_DDP|DDP_UNTAGGED_ERR; + *ecode = DDPU_INV_VERS; + } + break; + case T4_ERR_RDMA_VERSION: + *layer_type = LAYER_RDMAP|RDMAP_REMOTE_OP; + *ecode = RDMAP_INV_VERS; + break; + case T4_ERR_OPCODE: + *layer_type = LAYER_RDMAP|RDMAP_REMOTE_OP; + *ecode = RDMAP_INV_OPCODE; + break; + case T4_ERR_DDP_QUEUE_NUM: + *layer_type = LAYER_DDP|DDP_UNTAGGED_ERR; + *ecode = DDPU_INV_QN; + break; + case T4_ERR_MSN: + case T4_ERR_MSN_GAP: + case T4_ERR_MSN_RANGE: + case T4_ERR_IRD_OVERFLOW: + *layer_type = LAYER_DDP|DDP_UNTAGGED_ERR; + *ecode = DDPU_INV_MSN_RANGE; + break; + case T4_ERR_TBIT: + *layer_type = LAYER_DDP|DDP_LOCAL_CATA; + *ecode = 0; + break; + case T4_ERR_MO: + *layer_type = LAYER_DDP|DDP_UNTAGGED_ERR; + *ecode = DDPU_INV_MO; + break; + default: + *layer_type = LAYER_RDMAP|DDP_LOCAL_CATA; + *ecode = 0; + break; + } +} + +static void post_terminate(struct c4iw_qp *qhp, struct t4_cqe *err_cqe, + gfp_t gfp) +{ + struct fw_ri_wr *wqe; + struct sk_buff *skb; + struct terminate_message *term; + + PDBG("%s qhp %p qid 0x%x tid %u\n", __func__, qhp, qhp->wq.sq.qid, + qhp->ep->hwtid); + + skb = alloc_skb(sizeof *wqe, gfp); + if (!skb) + return; + set_wr_txq(skb, CPL_PRIORITY_DATA, qhp->ep->txq_idx); + + wqe = (struct fw_ri_wr *)__skb_put(skb, sizeof(*wqe)); + memset(wqe, 0, sizeof *wqe); + wqe->op_compl = cpu_to_be32(FW_WR_OP_V(FW_RI_INIT_WR)); + wqe->flowid_len16 = 
cpu_to_be32( + FW_WR_FLOWID_V(qhp->ep->hwtid) | + FW_WR_LEN16_V(DIV_ROUND_UP(sizeof(*wqe), 16))); + + wqe->u.terminate.type = FW_RI_TYPE_TERMINATE; + wqe->u.terminate.immdlen = cpu_to_be32(sizeof *term); + term = (struct terminate_message *)wqe->u.terminate.termmsg; + if (qhp->attr.layer_etype == (LAYER_MPA|DDP_LLP)) { + term->layer_etype = qhp->attr.layer_etype; + term->ecode = qhp->attr.ecode; + } else + build_term_codes(err_cqe, &term->layer_etype, &term->ecode); + c4iw_ofld_send(&qhp->rhp->rdev, skb); +} + +/* + * Assumes qhp lock is held. + */ +static void __flush_qp(struct c4iw_qp *qhp, struct c4iw_cq *rchp, + struct c4iw_cq *schp) +{ + int count; + int rq_flushed, sq_flushed; + unsigned long flag; + + PDBG("%s qhp %p rchp %p schp %p\n", __func__, qhp, rchp, schp); + + /* locking hierarchy: cq lock first, then qp lock. */ + spin_lock_irqsave(&rchp->lock, flag); + spin_lock(&qhp->lock); + + if (qhp->wq.flushed) { + spin_unlock(&qhp->lock); + spin_unlock_irqrestore(&rchp->lock, flag); + return; + } + qhp->wq.flushed = 1; + + c4iw_flush_hw_cq(rchp); + c4iw_count_rcqes(&rchp->cq, &qhp->wq, &count); + rq_flushed = c4iw_flush_rq(&qhp->wq, &rchp->cq, count); + spin_unlock(&qhp->lock); + spin_unlock_irqrestore(&rchp->lock, flag); + + /* locking hierarchy: cq lock first, then qp lock. */ + spin_lock_irqsave(&schp->lock, flag); + spin_lock(&qhp->lock); + if (schp != rchp) + c4iw_flush_hw_cq(schp); + sq_flushed = c4iw_flush_sq(qhp); + spin_unlock(&qhp->lock); + spin_unlock_irqrestore(&schp->lock, flag); + + if (schp == rchp) { + if (t4_clear_cq_armed(&rchp->cq) && + (rq_flushed || sq_flushed)) { + spin_lock_irqsave(&rchp->comp_handler_lock, flag); + (*rchp->ibcq.comp_handler)(&rchp->ibcq, + rchp->ibcq.cq_context); + spin_unlock_irqrestore(&rchp->comp_handler_lock, flag); + } + } else { + if (t4_clear_cq_armed(&rchp->cq) && rq_flushed) { + spin_lock_irqsave(&rchp->comp_handler_lock, flag); + (*rchp->ibcq.comp_handler)(&rchp->ibcq, + rchp->ibcq.cq_context); + spin_unlock_irqrestore(&rchp->comp_handler_lock, flag); + } + if (t4_clear_cq_armed(&schp->cq) && sq_flushed) { + spin_lock_irqsave(&schp->comp_handler_lock, flag); + (*schp->ibcq.comp_handler)(&schp->ibcq, + schp->ibcq.cq_context); + spin_unlock_irqrestore(&schp->comp_handler_lock, flag); + } + } +} + +static void flush_qp(struct c4iw_qp *qhp) +{ + struct c4iw_cq *rchp, *schp; + unsigned long flag; + + rchp = to_c4iw_cq(qhp->ibqp.recv_cq); + schp = to_c4iw_cq(qhp->ibqp.send_cq); + + t4_set_wq_in_error(&qhp->wq); + if (qhp->ibqp.uobject) { + t4_set_cq_in_error(&rchp->cq); + spin_lock_irqsave(&rchp->comp_handler_lock, flag); + (*rchp->ibcq.comp_handler)(&rchp->ibcq, rchp->ibcq.cq_context); + spin_unlock_irqrestore(&rchp->comp_handler_lock, flag); + if (schp != rchp) { + t4_set_cq_in_error(&schp->cq); + spin_lock_irqsave(&schp->comp_handler_lock, flag); + (*schp->ibcq.comp_handler)(&schp->ibcq, + schp->ibcq.cq_context); + spin_unlock_irqrestore(&schp->comp_handler_lock, flag); + } + return; + } + __flush_qp(qhp, rchp, schp); +} + +static int rdma_fini(struct c4iw_dev *rhp, struct c4iw_qp *qhp, + struct c4iw_ep *ep) +{ + struct fw_ri_wr *wqe; + int ret; + struct sk_buff *skb; + + PDBG("%s qhp %p qid 0x%x tid %u\n", __func__, qhp, qhp->wq.sq.qid, + ep->hwtid); + + skb = alloc_skb(sizeof *wqe, GFP_KERNEL); + if (!skb) + return -ENOMEM; + set_wr_txq(skb, CPL_PRIORITY_DATA, ep->txq_idx); + + wqe = (struct fw_ri_wr *)__skb_put(skb, sizeof(*wqe)); + memset(wqe, 0, sizeof *wqe); + wqe->op_compl = cpu_to_be32( + FW_WR_OP_V(FW_RI_INIT_WR) | + 
FW_WR_COMPL_F); + wqe->flowid_len16 = cpu_to_be32( + FW_WR_FLOWID_V(ep->hwtid) | + FW_WR_LEN16_V(DIV_ROUND_UP(sizeof(*wqe), 16))); + wqe->cookie = (uintptr_t)&ep->com.wr_wait; + + wqe->u.fini.type = FW_RI_TYPE_FINI; + ret = c4iw_ofld_send(&rhp->rdev, skb); + if (ret) + goto out; + + ret = c4iw_wait_for_reply(&rhp->rdev, &ep->com.wr_wait, qhp->ep->hwtid, + qhp->wq.sq.qid, __func__); +out: + PDBG("%s ret %d\n", __func__, ret); + return ret; +} + +static void build_rtr_msg(u8 p2p_type, struct fw_ri_init *init) +{ + PDBG("%s p2p_type = %d\n", __func__, p2p_type); + memset(&init->u, 0, sizeof init->u); + switch (p2p_type) { + case FW_RI_INIT_P2PTYPE_RDMA_WRITE: + init->u.write.opcode = FW_RI_RDMA_WRITE_WR; + init->u.write.stag_sink = cpu_to_be32(1); + init->u.write.to_sink = cpu_to_be64(1); + init->u.write.u.immd_src[0].op = FW_RI_DATA_IMMD; + init->u.write.len16 = DIV_ROUND_UP(sizeof init->u.write + + sizeof(struct fw_ri_immd), + 16); + break; + case FW_RI_INIT_P2PTYPE_READ_REQ: + init->u.write.opcode = FW_RI_RDMA_READ_WR; + init->u.read.stag_src = cpu_to_be32(1); + init->u.read.to_src_lo = cpu_to_be32(1); + init->u.read.stag_sink = cpu_to_be32(1); + init->u.read.to_sink_lo = cpu_to_be32(1); + init->u.read.len16 = DIV_ROUND_UP(sizeof init->u.read, 16); + break; + } +} + +static int rdma_init(struct c4iw_dev *rhp, struct c4iw_qp *qhp) +{ + struct fw_ri_wr *wqe; + int ret; + struct sk_buff *skb; + + PDBG("%s qhp %p qid 0x%x tid %u ird %u ord %u\n", __func__, qhp, + qhp->wq.sq.qid, qhp->ep->hwtid, qhp->ep->ird, qhp->ep->ord); + + skb = alloc_skb(sizeof *wqe, GFP_KERNEL); + if (!skb) { + ret = -ENOMEM; + goto out; + } + ret = alloc_ird(rhp, qhp->attr.max_ird); + if (ret) { + qhp->attr.max_ird = 0; + kfree_skb(skb); + goto out; + } + set_wr_txq(skb, CPL_PRIORITY_DATA, qhp->ep->txq_idx); + + wqe = (struct fw_ri_wr *)__skb_put(skb, sizeof(*wqe)); + memset(wqe, 0, sizeof *wqe); + wqe->op_compl = cpu_to_be32( + FW_WR_OP_V(FW_RI_INIT_WR) | + FW_WR_COMPL_F); + wqe->flowid_len16 = cpu_to_be32( + FW_WR_FLOWID_V(qhp->ep->hwtid) | + FW_WR_LEN16_V(DIV_ROUND_UP(sizeof(*wqe), 16))); + + wqe->cookie = (uintptr_t)&qhp->ep->com.wr_wait; + + wqe->u.init.type = FW_RI_TYPE_INIT; + wqe->u.init.mpareqbit_p2ptype = + FW_RI_WR_MPAREQBIT_V(qhp->attr.mpa_attr.initiator) | + FW_RI_WR_P2PTYPE_V(qhp->attr.mpa_attr.p2p_type); + wqe->u.init.mpa_attrs = FW_RI_MPA_IETF_ENABLE; + if (qhp->attr.mpa_attr.recv_marker_enabled) + wqe->u.init.mpa_attrs |= FW_RI_MPA_RX_MARKER_ENABLE; + if (qhp->attr.mpa_attr.xmit_marker_enabled) + wqe->u.init.mpa_attrs |= FW_RI_MPA_TX_MARKER_ENABLE; + if (qhp->attr.mpa_attr.crc_enabled) + wqe->u.init.mpa_attrs |= FW_RI_MPA_CRC_ENABLE; + + wqe->u.init.qp_caps = FW_RI_QP_RDMA_READ_ENABLE | + FW_RI_QP_RDMA_WRITE_ENABLE | + FW_RI_QP_BIND_ENABLE; + if (!qhp->ibqp.uobject) + wqe->u.init.qp_caps |= FW_RI_QP_FAST_REGISTER_ENABLE | + FW_RI_QP_STAG0_ENABLE; + wqe->u.init.nrqe = cpu_to_be16(t4_rqes_posted(&qhp->wq)); + wqe->u.init.pdid = cpu_to_be32(qhp->attr.pd); + wqe->u.init.qpid = cpu_to_be32(qhp->wq.sq.qid); + wqe->u.init.sq_eqid = cpu_to_be32(qhp->wq.sq.qid); + wqe->u.init.rq_eqid = cpu_to_be32(qhp->wq.rq.qid); + wqe->u.init.scqid = cpu_to_be32(qhp->attr.scq); + wqe->u.init.rcqid = cpu_to_be32(qhp->attr.rcq); + wqe->u.init.ord_max = cpu_to_be32(qhp->attr.max_ord); + wqe->u.init.ird_max = cpu_to_be32(qhp->attr.max_ird); + wqe->u.init.iss = cpu_to_be32(qhp->ep->snd_seq); + wqe->u.init.irs = cpu_to_be32(qhp->ep->rcv_seq); + wqe->u.init.hwrqsize = cpu_to_be32(qhp->wq.rq.rqt_size); + wqe->u.init.hwrqaddr = 
cpu_to_be32(qhp->wq.rq.rqt_hwaddr - + rhp->rdev.lldi.vr->rq.start); + if (qhp->attr.mpa_attr.initiator) + build_rtr_msg(qhp->attr.mpa_attr.p2p_type, &wqe->u.init); + + ret = c4iw_ofld_send(&rhp->rdev, skb); + if (ret) + goto err1; + + ret = c4iw_wait_for_reply(&rhp->rdev, &qhp->ep->com.wr_wait, + qhp->ep->hwtid, qhp->wq.sq.qid, __func__); + if (!ret) + goto out; +err1: + free_ird(rhp, qhp->attr.max_ird); +out: + PDBG("%s ret %d\n", __func__, ret); + return ret; +} + +int c4iw_modify_qp(struct c4iw_dev *rhp, struct c4iw_qp *qhp, + enum c4iw_qp_attr_mask mask, + struct c4iw_qp_attributes *attrs, + int internal) +{ + int ret = 0; + struct c4iw_qp_attributes newattr = qhp->attr; + int disconnect = 0; + int terminate = 0; + int abort = 0; + int free = 0; + struct c4iw_ep *ep = NULL; + + PDBG("%s qhp %p sqid 0x%x rqid 0x%x ep %p state %d -> %d\n", __func__, + qhp, qhp->wq.sq.qid, qhp->wq.rq.qid, qhp->ep, qhp->attr.state, + (mask & C4IW_QP_ATTR_NEXT_STATE) ? attrs->next_state : -1); + + mutex_lock(&qhp->mutex); + + /* Process attr changes if in IDLE */ + if (mask & C4IW_QP_ATTR_VALID_MODIFY) { + if (qhp->attr.state != C4IW_QP_STATE_IDLE) { + ret = -EIO; + goto out; + } + if (mask & C4IW_QP_ATTR_ENABLE_RDMA_READ) + newattr.enable_rdma_read = attrs->enable_rdma_read; + if (mask & C4IW_QP_ATTR_ENABLE_RDMA_WRITE) + newattr.enable_rdma_write = attrs->enable_rdma_write; + if (mask & C4IW_QP_ATTR_ENABLE_RDMA_BIND) + newattr.enable_bind = attrs->enable_bind; + if (mask & C4IW_QP_ATTR_MAX_ORD) { + if (attrs->max_ord > c4iw_max_read_depth) { + ret = -EINVAL; + goto out; + } + newattr.max_ord = attrs->max_ord; + } + if (mask & C4IW_QP_ATTR_MAX_IRD) { + if (attrs->max_ird > cur_max_read_depth(rhp)) { + ret = -EINVAL; + goto out; + } + newattr.max_ird = attrs->max_ird; + } + qhp->attr = newattr; + } + + if (mask & C4IW_QP_ATTR_SQ_DB) { + ret = ring_kernel_sq_db(qhp, attrs->sq_db_inc); + goto out; + } + if (mask & C4IW_QP_ATTR_RQ_DB) { + ret = ring_kernel_rq_db(qhp, attrs->rq_db_inc); + goto out; + } + + if (!(mask & C4IW_QP_ATTR_NEXT_STATE)) + goto out; + if (qhp->attr.state == attrs->next_state) + goto out; + + switch (qhp->attr.state) { + case C4IW_QP_STATE_IDLE: + switch (attrs->next_state) { + case C4IW_QP_STATE_RTS: + if (!(mask & C4IW_QP_ATTR_LLP_STREAM_HANDLE)) { + ret = -EINVAL; + goto out; + } + if (!(mask & C4IW_QP_ATTR_MPA_ATTR)) { + ret = -EINVAL; + goto out; + } + qhp->attr.mpa_attr = attrs->mpa_attr; + qhp->attr.llp_stream_handle = attrs->llp_stream_handle; + qhp->ep = qhp->attr.llp_stream_handle; + set_state(qhp, C4IW_QP_STATE_RTS); + + /* + * Ref the endpoint here and deref when we + * disassociate the endpoint from the QP. This + * happens in CLOSING->IDLE transition or *->ERROR + * transition. 
+ */ + c4iw_get_ep(&qhp->ep->com); + ret = rdma_init(rhp, qhp); + if (ret) + goto err; + break; + case C4IW_QP_STATE_ERROR: + set_state(qhp, C4IW_QP_STATE_ERROR); + flush_qp(qhp); + break; + default: + ret = -EINVAL; + goto out; + } + break; + case C4IW_QP_STATE_RTS: + switch (attrs->next_state) { + case C4IW_QP_STATE_CLOSING: + BUG_ON(atomic_read(&qhp->ep->com.kref.refcount) < 2); + t4_set_wq_in_error(&qhp->wq); + set_state(qhp, C4IW_QP_STATE_CLOSING); + ep = qhp->ep; + if (!internal) { + abort = 0; + disconnect = 1; + c4iw_get_ep(&qhp->ep->com); + } + ret = rdma_fini(rhp, qhp, ep); + if (ret) + goto err; + break; + case C4IW_QP_STATE_TERMINATE: + t4_set_wq_in_error(&qhp->wq); + set_state(qhp, C4IW_QP_STATE_TERMINATE); + qhp->attr.layer_etype = attrs->layer_etype; + qhp->attr.ecode = attrs->ecode; + ep = qhp->ep; + if (!internal) { + c4iw_get_ep(&qhp->ep->com); + terminate = 1; + disconnect = 1; + } else { + terminate = qhp->attr.send_term; + ret = rdma_fini(rhp, qhp, ep); + if (ret) + goto err; + } + break; + case C4IW_QP_STATE_ERROR: + t4_set_wq_in_error(&qhp->wq); + set_state(qhp, C4IW_QP_STATE_ERROR); + if (!internal) { + abort = 1; + disconnect = 1; + ep = qhp->ep; + c4iw_get_ep(&qhp->ep->com); + } + goto err; + break; + default: + ret = -EINVAL; + goto out; + } + break; + case C4IW_QP_STATE_CLOSING: + if (!internal) { + ret = -EINVAL; + goto out; + } + switch (attrs->next_state) { + case C4IW_QP_STATE_IDLE: + flush_qp(qhp); + set_state(qhp, C4IW_QP_STATE_IDLE); + qhp->attr.llp_stream_handle = NULL; + c4iw_put_ep(&qhp->ep->com); + qhp->ep = NULL; + wake_up(&qhp->wait); + break; + case C4IW_QP_STATE_ERROR: + goto err; + default: + ret = -EINVAL; + goto err; + } + break; + case C4IW_QP_STATE_ERROR: + if (attrs->next_state != C4IW_QP_STATE_IDLE) { + ret = -EINVAL; + goto out; + } + if (!t4_sq_empty(&qhp->wq) || !t4_rq_empty(&qhp->wq)) { + ret = -EINVAL; + goto out; + } + set_state(qhp, C4IW_QP_STATE_IDLE); + break; + case C4IW_QP_STATE_TERMINATE: + if (!internal) { + ret = -EINVAL; + goto out; + } + goto err; + break; + default: + printk(KERN_ERR "%s in a bad state %d\n", + __func__, qhp->attr.state); + ret = -EINVAL; + goto err; + break; + } + goto out; +err: + PDBG("%s disassociating ep %p qpid 0x%x\n", __func__, qhp->ep, + qhp->wq.sq.qid); + + /* disassociate the LLP connection */ + qhp->attr.llp_stream_handle = NULL; + if (!ep) + ep = qhp->ep; + qhp->ep = NULL; + set_state(qhp, C4IW_QP_STATE_ERROR); + free = 1; + abort = 1; + BUG_ON(!ep); + flush_qp(qhp); + wake_up(&qhp->wait); +out: + mutex_unlock(&qhp->mutex); + + if (terminate) + post_terminate(qhp, NULL, internal ? GFP_ATOMIC : GFP_KERNEL); + + /* + * If disconnect is 1, then we need to initiate a disconnect + * on the EP. This can be a normal close (RTS->CLOSING) or + * an abnormal close (RTS/CLOSING->ERROR). + */ + if (disconnect) { + c4iw_ep_disconnect(ep, abort, internal ? GFP_ATOMIC : + GFP_KERNEL); + c4iw_put_ep(&ep->com); + } + + /* + * If free is 1, then we've disassociated the EP from the QP + * and we need to dereference the EP. 
+ */ + if (free) + c4iw_put_ep(&ep->com); + PDBG("%s exit state %d\n", __func__, qhp->attr.state); + return ret; +} + +int c4iw_destroy_qp(struct ib_qp *ib_qp) +{ + struct c4iw_dev *rhp; + struct c4iw_qp *qhp; + struct c4iw_qp_attributes attrs; + struct c4iw_ucontext *ucontext; + + qhp = to_c4iw_qp(ib_qp); + rhp = qhp->rhp; + + attrs.next_state = C4IW_QP_STATE_ERROR; + if (qhp->attr.state == C4IW_QP_STATE_TERMINATE) + c4iw_modify_qp(rhp, qhp, C4IW_QP_ATTR_NEXT_STATE, &attrs, 1); + else + c4iw_modify_qp(rhp, qhp, C4IW_QP_ATTR_NEXT_STATE, &attrs, 0); + wait_event(qhp->wait, !qhp->ep); + + remove_handle(rhp, &rhp->qpidr, qhp->wq.sq.qid); + atomic_dec(&qhp->refcnt); + wait_event(qhp->wait, !atomic_read(&qhp->refcnt)); + + spin_lock_irq(&rhp->lock); + if (!list_empty(&qhp->db_fc_entry)) + list_del_init(&qhp->db_fc_entry); + spin_unlock_irq(&rhp->lock); + free_ird(rhp, qhp->attr.max_ird); + + ucontext = ib_qp->uobject ? + to_c4iw_ucontext(ib_qp->uobject->context) : NULL; + destroy_qp(&rhp->rdev, &qhp->wq, + ucontext ? &ucontext->uctx : &rhp->rdev.uctx); + + PDBG("%s ib_qp %p qpid 0x%0x\n", __func__, ib_qp, qhp->wq.sq.qid); + kfree(qhp); + return 0; +} + +struct ib_qp *c4iw_create_qp(struct ib_pd *pd, struct ib_qp_init_attr *attrs, + struct ib_udata *udata) +{ + struct c4iw_dev *rhp; + struct c4iw_qp *qhp; + struct c4iw_pd *php; + struct c4iw_cq *schp; + struct c4iw_cq *rchp; + struct c4iw_create_qp_resp uresp; + unsigned int sqsize, rqsize; + struct c4iw_ucontext *ucontext; + int ret; + struct c4iw_mm_entry *mm1, *mm2, *mm3, *mm4, *mm5 = NULL; + + PDBG("%s ib_pd %p\n", __func__, pd); + + if (attrs->qp_type != IB_QPT_RC) + return ERR_PTR(-EINVAL); + + php = to_c4iw_pd(pd); + rhp = php->rhp; + schp = get_chp(rhp, ((struct c4iw_cq *)attrs->send_cq)->cq.cqid); + rchp = get_chp(rhp, ((struct c4iw_cq *)attrs->recv_cq)->cq.cqid); + if (!schp || !rchp) + return ERR_PTR(-EINVAL); + + if (attrs->cap.max_inline_data > T4_MAX_SEND_INLINE) + return ERR_PTR(-EINVAL); + + if (attrs->cap.max_recv_wr > rhp->rdev.hw_queue.t4_max_rq_size) + return ERR_PTR(-E2BIG); + rqsize = attrs->cap.max_recv_wr + 1; + if (rqsize < 8) + rqsize = 8; + + if (attrs->cap.max_send_wr > rhp->rdev.hw_queue.t4_max_sq_size) + return ERR_PTR(-E2BIG); + sqsize = attrs->cap.max_send_wr + 1; + if (sqsize < 8) + sqsize = 8; + + ucontext = pd->uobject ? to_c4iw_ucontext(pd->uobject->context) : NULL; + + qhp = kzalloc(sizeof(*qhp), GFP_KERNEL); + if (!qhp) + return ERR_PTR(-ENOMEM); + qhp->wq.sq.size = sqsize; + qhp->wq.sq.memsize = + (sqsize + rhp->rdev.hw_queue.t4_eq_status_entries) * + sizeof(*qhp->wq.sq.queue) + 16 * sizeof(__be64); + qhp->wq.sq.flush_cidx = -1; + qhp->wq.rq.size = rqsize; + qhp->wq.rq.memsize = + (rqsize + rhp->rdev.hw_queue.t4_eq_status_entries) * + sizeof(*qhp->wq.rq.queue); + + if (ucontext) { + qhp->wq.sq.memsize = roundup(qhp->wq.sq.memsize, PAGE_SIZE); + qhp->wq.rq.memsize = roundup(qhp->wq.rq.memsize, PAGE_SIZE); + } + + ret = create_qp(&rhp->rdev, &qhp->wq, &schp->cq, &rchp->cq, + ucontext ? 
&ucontext->uctx : &rhp->rdev.uctx); + if (ret) + goto err1; + + attrs->cap.max_recv_wr = rqsize - 1; + attrs->cap.max_send_wr = sqsize - 1; + attrs->cap.max_inline_data = T4_MAX_SEND_INLINE; + + qhp->rhp = rhp; + qhp->attr.pd = php->pdid; + qhp->attr.scq = ((struct c4iw_cq *) attrs->send_cq)->cq.cqid; + qhp->attr.rcq = ((struct c4iw_cq *) attrs->recv_cq)->cq.cqid; + qhp->attr.sq_num_entries = attrs->cap.max_send_wr; + qhp->attr.rq_num_entries = attrs->cap.max_recv_wr; + qhp->attr.sq_max_sges = attrs->cap.max_send_sge; + qhp->attr.sq_max_sges_rdma_write = attrs->cap.max_send_sge; + qhp->attr.rq_max_sges = attrs->cap.max_recv_sge; + qhp->attr.state = C4IW_QP_STATE_IDLE; + qhp->attr.next_state = C4IW_QP_STATE_IDLE; + qhp->attr.enable_rdma_read = 1; + qhp->attr.enable_rdma_write = 1; + qhp->attr.enable_bind = 1; + qhp->attr.max_ord = 0; + qhp->attr.max_ird = 0; + qhp->sq_sig_all = attrs->sq_sig_type == IB_SIGNAL_ALL_WR; + spin_lock_init(&qhp->lock); + mutex_init(&qhp->mutex); + init_waitqueue_head(&qhp->wait); + atomic_set(&qhp->refcnt, 1); + + ret = insert_handle(rhp, &rhp->qpidr, qhp, qhp->wq.sq.qid); + if (ret) + goto err2; + + if (udata) { + mm1 = kmalloc(sizeof *mm1, GFP_KERNEL); + if (!mm1) { + ret = -ENOMEM; + goto err3; + } + mm2 = kmalloc(sizeof *mm2, GFP_KERNEL); + if (!mm2) { + ret = -ENOMEM; + goto err4; + } + mm3 = kmalloc(sizeof *mm3, GFP_KERNEL); + if (!mm3) { + ret = -ENOMEM; + goto err5; + } + mm4 = kmalloc(sizeof *mm4, GFP_KERNEL); + if (!mm4) { + ret = -ENOMEM; + goto err6; + } + if (t4_sq_onchip(&qhp->wq.sq)) { + mm5 = kmalloc(sizeof *mm5, GFP_KERNEL); + if (!mm5) { + ret = -ENOMEM; + goto err7; + } + uresp.flags = C4IW_QPF_ONCHIP; + } else + uresp.flags = 0; + uresp.qid_mask = rhp->rdev.qpmask; + uresp.sqid = qhp->wq.sq.qid; + uresp.sq_size = qhp->wq.sq.size; + uresp.sq_memsize = qhp->wq.sq.memsize; + uresp.rqid = qhp->wq.rq.qid; + uresp.rq_size = qhp->wq.rq.size; + uresp.rq_memsize = qhp->wq.rq.memsize; + spin_lock(&ucontext->mmap_lock); + if (mm5) { + uresp.ma_sync_key = ucontext->key; + ucontext->key += PAGE_SIZE; + } else { + uresp.ma_sync_key = 0; + } + uresp.sq_key = ucontext->key; + ucontext->key += PAGE_SIZE; + uresp.rq_key = ucontext->key; + ucontext->key += PAGE_SIZE; + uresp.sq_db_gts_key = ucontext->key; + ucontext->key += PAGE_SIZE; + uresp.rq_db_gts_key = ucontext->key; + ucontext->key += PAGE_SIZE; + spin_unlock(&ucontext->mmap_lock); + ret = ib_copy_to_udata(udata, &uresp, sizeof uresp); + if (ret) + goto err8; + mm1->key = uresp.sq_key; + mm1->addr = qhp->wq.sq.phys_addr; + mm1->len = PAGE_ALIGN(qhp->wq.sq.memsize); + insert_mmap(ucontext, mm1); + mm2->key = uresp.rq_key; + mm2->addr = virt_to_phys(qhp->wq.rq.queue); + mm2->len = PAGE_ALIGN(qhp->wq.rq.memsize); + insert_mmap(ucontext, mm2); + mm3->key = uresp.sq_db_gts_key; + mm3->addr = (__force unsigned long)qhp->wq.sq.udb; + mm3->len = PAGE_SIZE; + insert_mmap(ucontext, mm3); + mm4->key = uresp.rq_db_gts_key; + mm4->addr = (__force unsigned long)qhp->wq.rq.udb; + mm4->len = PAGE_SIZE; + insert_mmap(ucontext, mm4); + if (mm5) { + mm5->key = uresp.ma_sync_key; + mm5->addr = (pci_resource_start(rhp->rdev.lldi.pdev, 0) + + PCIE_MA_SYNC_A) & PAGE_MASK; + mm5->len = PAGE_SIZE; + insert_mmap(ucontext, mm5); + } + } + qhp->ibqp.qp_num = qhp->wq.sq.qid; + init_timer(&(qhp->timer)); + INIT_LIST_HEAD(&qhp->db_fc_entry); + PDBG("%s sq id %u size %u memsize %zu num_entries %u " + "rq id %u size %u memsize %zu num_entries %u\n", __func__, + qhp->wq.sq.qid, qhp->wq.sq.size, qhp->wq.sq.memsize, + 
attrs->cap.max_send_wr, qhp->wq.rq.qid, qhp->wq.rq.size, + qhp->wq.rq.memsize, attrs->cap.max_recv_wr); + return &qhp->ibqp; +err8: + kfree(mm5); +err7: + kfree(mm4); +err6: + kfree(mm3); +err5: + kfree(mm2); +err4: + kfree(mm1); +err3: + remove_handle(rhp, &rhp->qpidr, qhp->wq.sq.qid); +err2: + destroy_qp(&rhp->rdev, &qhp->wq, + ucontext ? &ucontext->uctx : &rhp->rdev.uctx); +err1: + kfree(qhp); + return ERR_PTR(ret); +} + +int c4iw_ib_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, + int attr_mask, struct ib_udata *udata) +{ + struct c4iw_dev *rhp; + struct c4iw_qp *qhp; + enum c4iw_qp_attr_mask mask = 0; + struct c4iw_qp_attributes attrs; + + PDBG("%s ib_qp %p\n", __func__, ibqp); + + /* iwarp does not support the RTR state */ + if ((attr_mask & IB_QP_STATE) && (attr->qp_state == IB_QPS_RTR)) + attr_mask &= ~IB_QP_STATE; + + /* Make sure we still have something left to do */ + if (!attr_mask) + return 0; + + memset(&attrs, 0, sizeof attrs); + qhp = to_c4iw_qp(ibqp); + rhp = qhp->rhp; + + attrs.next_state = c4iw_convert_state(attr->qp_state); + attrs.enable_rdma_read = (attr->qp_access_flags & + IB_ACCESS_REMOTE_READ) ? 1 : 0; + attrs.enable_rdma_write = (attr->qp_access_flags & + IB_ACCESS_REMOTE_WRITE) ? 1 : 0; + attrs.enable_bind = (attr->qp_access_flags & IB_ACCESS_MW_BIND) ? 1 : 0; + + + mask |= (attr_mask & IB_QP_STATE) ? C4IW_QP_ATTR_NEXT_STATE : 0; + mask |= (attr_mask & IB_QP_ACCESS_FLAGS) ? + (C4IW_QP_ATTR_ENABLE_RDMA_READ | + C4IW_QP_ATTR_ENABLE_RDMA_WRITE | + C4IW_QP_ATTR_ENABLE_RDMA_BIND) : 0; + + /* + * Use SQ_PSN and RQ_PSN to pass in IDX_INC values for + * ringing the queue db when we're in DB_FULL mode. + * Only allow this on T4 devices. + */ + attrs.sq_db_inc = attr->sq_psn; + attrs.rq_db_inc = attr->rq_psn; + mask |= (attr_mask & IB_QP_SQ_PSN) ? C4IW_QP_ATTR_SQ_DB : 0; + mask |= (attr_mask & IB_QP_RQ_PSN) ? C4IW_QP_ATTR_RQ_DB : 0; + if (is_t5(to_c4iw_qp(ibqp)->rhp->rdev.lldi.adapter_type) && + (mask & (C4IW_QP_ATTR_SQ_DB|C4IW_QP_ATTR_RQ_DB))) + return -EINVAL; + + return c4iw_modify_qp(rhp, qhp, mask, &attrs, 0); +} + +struct ib_qp *c4iw_get_qp(struct ib_device *dev, int qpn) +{ + PDBG("%s ib_dev %p qpn 0x%x\n", __func__, dev, qpn); + return (struct ib_qp *)get_qhp(to_c4iw_dev(dev), qpn); +} + +int c4iw_ib_query_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, + int attr_mask, struct ib_qp_init_attr *init_attr) +{ + struct c4iw_qp *qhp = to_c4iw_qp(ibqp); + + memset(attr, 0, sizeof *attr); + memset(init_attr, 0, sizeof *init_attr); + attr->qp_state = to_ib_qp_state(qhp->attr.state); + init_attr->cap.max_send_wr = qhp->attr.sq_num_entries; + init_attr->cap.max_recv_wr = qhp->attr.rq_num_entries; + init_attr->cap.max_send_sge = qhp->attr.sq_max_sges; + init_attr->cap.max_recv_sge = qhp->attr.sq_max_sges; + init_attr->cap.max_inline_data = T4_MAX_SEND_INLINE; + init_attr->sq_sig_type = qhp->sq_sig_all ? IB_SIGNAL_ALL_WR : 0; + return 0; +} diff --git a/kernel/drivers/infiniband/hw/cxgb4/resource.c b/kernel/drivers/infiniband/hw/cxgb4/resource.c new file mode 100644 index 000000000..67df71a70 --- /dev/null +++ b/kernel/drivers/infiniband/hw/cxgb4/resource.c @@ -0,0 +1,453 @@ +/* + * Copyright (c) 2009-2010 Chelsio, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. 
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +/* Crude resource management */ +#include +#include +#include +#include "iw_cxgb4.h" + +static int c4iw_init_qid_table(struct c4iw_rdev *rdev) +{ + u32 i; + + if (c4iw_id_table_alloc(&rdev->resource.qid_table, + rdev->lldi.vr->qp.start, + rdev->lldi.vr->qp.size, + rdev->lldi.vr->qp.size, 0)) + return -ENOMEM; + + for (i = rdev->lldi.vr->qp.start; + i < rdev->lldi.vr->qp.start + rdev->lldi.vr->qp.size; i++) + if (!(i & rdev->qpmask)) + c4iw_id_free(&rdev->resource.qid_table, i); + return 0; +} + +/* nr_* must be power of 2 */ +int c4iw_init_resource(struct c4iw_rdev *rdev, u32 nr_tpt, u32 nr_pdid) +{ + int err = 0; + err = c4iw_id_table_alloc(&rdev->resource.tpt_table, 0, nr_tpt, 1, + C4IW_ID_TABLE_F_RANDOM); + if (err) + goto tpt_err; + err = c4iw_init_qid_table(rdev); + if (err) + goto qid_err; + err = c4iw_id_table_alloc(&rdev->resource.pdid_table, 0, + nr_pdid, 1, 0); + if (err) + goto pdid_err; + return 0; + pdid_err: + c4iw_id_table_free(&rdev->resource.qid_table); + qid_err: + c4iw_id_table_free(&rdev->resource.tpt_table); + tpt_err: + return -ENOMEM; +} + +/* + * returns 0 if no resource available + */ +u32 c4iw_get_resource(struct c4iw_id_table *id_table) +{ + u32 entry; + entry = c4iw_id_alloc(id_table); + if (entry == (u32)(-1)) + return 0; + return entry; +} + +void c4iw_put_resource(struct c4iw_id_table *id_table, u32 entry) +{ + PDBG("%s entry 0x%x\n", __func__, entry); + c4iw_id_free(id_table, entry); +} + +u32 c4iw_get_cqid(struct c4iw_rdev *rdev, struct c4iw_dev_ucontext *uctx) +{ + struct c4iw_qid_list *entry; + u32 qid; + int i; + + mutex_lock(&uctx->lock); + if (!list_empty(&uctx->cqids)) { + entry = list_entry(uctx->cqids.next, struct c4iw_qid_list, + entry); + list_del(&entry->entry); + qid = entry->qid; + kfree(entry); + } else { + qid = c4iw_get_resource(&rdev->resource.qid_table); + if (!qid) + goto out; + mutex_lock(&rdev->stats.lock); + rdev->stats.qid.cur += rdev->qpmask + 1; + mutex_unlock(&rdev->stats.lock); + for (i = qid+1; i & rdev->qpmask; i++) { + entry = kmalloc(sizeof *entry, GFP_KERNEL); + if (!entry) + goto out; + entry->qid = i; + list_add_tail(&entry->entry, &uctx->cqids); + } + + /* + * now put the same ids on the qp list since they all + * map to the same db/gts page. 
+ */ + entry = kmalloc(sizeof *entry, GFP_KERNEL); + if (!entry) + goto out; + entry->qid = qid; + list_add_tail(&entry->entry, &uctx->qpids); + for (i = qid+1; i & rdev->qpmask; i++) { + entry = kmalloc(sizeof *entry, GFP_KERNEL); + if (!entry) + goto out; + entry->qid = i; + list_add_tail(&entry->entry, &uctx->qpids); + } + } +out: + mutex_unlock(&uctx->lock); + PDBG("%s qid 0x%x\n", __func__, qid); + mutex_lock(&rdev->stats.lock); + if (rdev->stats.qid.cur > rdev->stats.qid.max) + rdev->stats.qid.max = rdev->stats.qid.cur; + mutex_unlock(&rdev->stats.lock); + return qid; +} + +void c4iw_put_cqid(struct c4iw_rdev *rdev, u32 qid, + struct c4iw_dev_ucontext *uctx) +{ + struct c4iw_qid_list *entry; + + entry = kmalloc(sizeof *entry, GFP_KERNEL); + if (!entry) + return; + PDBG("%s qid 0x%x\n", __func__, qid); + entry->qid = qid; + mutex_lock(&uctx->lock); + list_add_tail(&entry->entry, &uctx->cqids); + mutex_unlock(&uctx->lock); +} + +u32 c4iw_get_qpid(struct c4iw_rdev *rdev, struct c4iw_dev_ucontext *uctx) +{ + struct c4iw_qid_list *entry; + u32 qid; + int i; + + mutex_lock(&uctx->lock); + if (!list_empty(&uctx->qpids)) { + entry = list_entry(uctx->qpids.next, struct c4iw_qid_list, + entry); + list_del(&entry->entry); + qid = entry->qid; + kfree(entry); + } else { + qid = c4iw_get_resource(&rdev->resource.qid_table); + if (!qid) { + mutex_lock(&rdev->stats.lock); + rdev->stats.qid.fail++; + mutex_unlock(&rdev->stats.lock); + goto out; + } + mutex_lock(&rdev->stats.lock); + rdev->stats.qid.cur += rdev->qpmask + 1; + mutex_unlock(&rdev->stats.lock); + for (i = qid+1; i & rdev->qpmask; i++) { + entry = kmalloc(sizeof *entry, GFP_KERNEL); + if (!entry) + goto out; + entry->qid = i; + list_add_tail(&entry->entry, &uctx->qpids); + } + + /* + * now put the same ids on the cq list since they all + * map to the same db/gts page. + */ + entry = kmalloc(sizeof *entry, GFP_KERNEL); + if (!entry) + goto out; + entry->qid = qid; + list_add_tail(&entry->entry, &uctx->cqids); + for (i = qid; i & rdev->qpmask; i++) { + entry = kmalloc(sizeof *entry, GFP_KERNEL); + if (!entry) + goto out; + entry->qid = i; + list_add_tail(&entry->entry, &uctx->cqids); + } + } +out: + mutex_unlock(&uctx->lock); + PDBG("%s qid 0x%x\n", __func__, qid); + mutex_lock(&rdev->stats.lock); + if (rdev->stats.qid.cur > rdev->stats.qid.max) + rdev->stats.qid.max = rdev->stats.qid.cur; + mutex_unlock(&rdev->stats.lock); + return qid; +} + +void c4iw_put_qpid(struct c4iw_rdev *rdev, u32 qid, + struct c4iw_dev_ucontext *uctx) +{ + struct c4iw_qid_list *entry; + + entry = kmalloc(sizeof *entry, GFP_KERNEL); + if (!entry) + return; + PDBG("%s qid 0x%x\n", __func__, qid); + entry->qid = qid; + mutex_lock(&uctx->lock); + list_add_tail(&entry->entry, &uctx->qpids); + mutex_unlock(&uctx->lock); +} + +void c4iw_destroy_resource(struct c4iw_resource *rscp) +{ + c4iw_id_table_free(&rscp->tpt_table); + c4iw_id_table_free(&rscp->qid_table); + c4iw_id_table_free(&rscp->pdid_table); +} + +/* + * PBL Memory Manager. Uses Linux generic allocator. 
+ */ + +#define MIN_PBL_SHIFT 8 /* 256B == min PBL size (32 entries) */ + +u32 c4iw_pblpool_alloc(struct c4iw_rdev *rdev, int size) +{ + unsigned long addr = gen_pool_alloc(rdev->pbl_pool, size); + PDBG("%s addr 0x%x size %d\n", __func__, (u32)addr, size); + mutex_lock(&rdev->stats.lock); + if (addr) { + rdev->stats.pbl.cur += roundup(size, 1 << MIN_PBL_SHIFT); + if (rdev->stats.pbl.cur > rdev->stats.pbl.max) + rdev->stats.pbl.max = rdev->stats.pbl.cur; + } else + rdev->stats.pbl.fail++; + mutex_unlock(&rdev->stats.lock); + return (u32)addr; +} + +void c4iw_pblpool_free(struct c4iw_rdev *rdev, u32 addr, int size) +{ + PDBG("%s addr 0x%x size %d\n", __func__, addr, size); + mutex_lock(&rdev->stats.lock); + rdev->stats.pbl.cur -= roundup(size, 1 << MIN_PBL_SHIFT); + mutex_unlock(&rdev->stats.lock); + gen_pool_free(rdev->pbl_pool, (unsigned long)addr, size); +} + +int c4iw_pblpool_create(struct c4iw_rdev *rdev) +{ + unsigned pbl_start, pbl_chunk, pbl_top; + + rdev->pbl_pool = gen_pool_create(MIN_PBL_SHIFT, -1); + if (!rdev->pbl_pool) + return -ENOMEM; + + pbl_start = rdev->lldi.vr->pbl.start; + pbl_chunk = rdev->lldi.vr->pbl.size; + pbl_top = pbl_start + pbl_chunk; + + while (pbl_start < pbl_top) { + pbl_chunk = min(pbl_top - pbl_start + 1, pbl_chunk); + if (gen_pool_add(rdev->pbl_pool, pbl_start, pbl_chunk, -1)) { + PDBG("%s failed to add PBL chunk (%x/%x)\n", + __func__, pbl_start, pbl_chunk); + if (pbl_chunk <= 1024 << MIN_PBL_SHIFT) { + printk(KERN_WARNING MOD + "Failed to add all PBL chunks (%x/%x)\n", + pbl_start, + pbl_top - pbl_start); + return 0; + } + pbl_chunk >>= 1; + } else { + PDBG("%s added PBL chunk (%x/%x)\n", + __func__, pbl_start, pbl_chunk); + pbl_start += pbl_chunk; + } + } + + return 0; +} + +void c4iw_pblpool_destroy(struct c4iw_rdev *rdev) +{ + gen_pool_destroy(rdev->pbl_pool); +} + +/* + * RQT Memory Manager. Uses Linux generic allocator. 
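+ * Callers pass the RQT size in entries; the helpers below convert
+ * to bytes with "size << 6" (64 bytes per RQT entry) and carve the
+ * chunks out of lldi.vr->rq with a 1KB (MIN_RQT_SHIFT) minimum.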
+ */ + +#define MIN_RQT_SHIFT 10 /* 1KB == min RQT size (16 entries) */ + +u32 c4iw_rqtpool_alloc(struct c4iw_rdev *rdev, int size) +{ + unsigned long addr = gen_pool_alloc(rdev->rqt_pool, size << 6); + PDBG("%s addr 0x%x size %d\n", __func__, (u32)addr, size << 6); + if (!addr) + pr_warn_ratelimited(MOD "%s: Out of RQT memory\n", + pci_name(rdev->lldi.pdev)); + mutex_lock(&rdev->stats.lock); + if (addr) { + rdev->stats.rqt.cur += roundup(size << 6, 1 << MIN_RQT_SHIFT); + if (rdev->stats.rqt.cur > rdev->stats.rqt.max) + rdev->stats.rqt.max = rdev->stats.rqt.cur; + } else + rdev->stats.rqt.fail++; + mutex_unlock(&rdev->stats.lock); + return (u32)addr; +} + +void c4iw_rqtpool_free(struct c4iw_rdev *rdev, u32 addr, int size) +{ + PDBG("%s addr 0x%x size %d\n", __func__, addr, size << 6); + mutex_lock(&rdev->stats.lock); + rdev->stats.rqt.cur -= roundup(size << 6, 1 << MIN_RQT_SHIFT); + mutex_unlock(&rdev->stats.lock); + gen_pool_free(rdev->rqt_pool, (unsigned long)addr, size << 6); +} + +int c4iw_rqtpool_create(struct c4iw_rdev *rdev) +{ + unsigned rqt_start, rqt_chunk, rqt_top; + + rdev->rqt_pool = gen_pool_create(MIN_RQT_SHIFT, -1); + if (!rdev->rqt_pool) + return -ENOMEM; + + rqt_start = rdev->lldi.vr->rq.start; + rqt_chunk = rdev->lldi.vr->rq.size; + rqt_top = rqt_start + rqt_chunk; + + while (rqt_start < rqt_top) { + rqt_chunk = min(rqt_top - rqt_start + 1, rqt_chunk); + if (gen_pool_add(rdev->rqt_pool, rqt_start, rqt_chunk, -1)) { + PDBG("%s failed to add RQT chunk (%x/%x)\n", + __func__, rqt_start, rqt_chunk); + if (rqt_chunk <= 1024 << MIN_RQT_SHIFT) { + printk(KERN_WARNING MOD + "Failed to add all RQT chunks (%x/%x)\n", + rqt_start, rqt_top - rqt_start); + return 0; + } + rqt_chunk >>= 1; + } else { + PDBG("%s added RQT chunk (%x/%x)\n", + __func__, rqt_start, rqt_chunk); + rqt_start += rqt_chunk; + } + } + return 0; +} + +void c4iw_rqtpool_destroy(struct c4iw_rdev *rdev) +{ + gen_pool_destroy(rdev->rqt_pool); +} + +/* + * On-Chip QP Memory. 
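+ * Carved from the adapter region in lldi.vr->ocq in MIN_OCQP_SHIFT
+ * (4KB) granules. An empty region leaves the pool empty, so callers
+ * must be prepared for c4iw_ocqp_pool_alloc() to fail.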
+ */ +#define MIN_OCQP_SHIFT 12 /* 4KB == min ocqp size */ + +u32 c4iw_ocqp_pool_alloc(struct c4iw_rdev *rdev, int size) +{ + unsigned long addr = gen_pool_alloc(rdev->ocqp_pool, size); + PDBG("%s addr 0x%x size %d\n", __func__, (u32)addr, size); + if (addr) { + mutex_lock(&rdev->stats.lock); + rdev->stats.ocqp.cur += roundup(size, 1 << MIN_OCQP_SHIFT); + if (rdev->stats.ocqp.cur > rdev->stats.ocqp.max) + rdev->stats.ocqp.max = rdev->stats.ocqp.cur; + mutex_unlock(&rdev->stats.lock); + } + return (u32)addr; +} + +void c4iw_ocqp_pool_free(struct c4iw_rdev *rdev, u32 addr, int size) +{ + PDBG("%s addr 0x%x size %d\n", __func__, addr, size); + mutex_lock(&rdev->stats.lock); + rdev->stats.ocqp.cur -= roundup(size, 1 << MIN_OCQP_SHIFT); + mutex_unlock(&rdev->stats.lock); + gen_pool_free(rdev->ocqp_pool, (unsigned long)addr, size); +} + +int c4iw_ocqp_pool_create(struct c4iw_rdev *rdev) +{ + unsigned start, chunk, top; + + rdev->ocqp_pool = gen_pool_create(MIN_OCQP_SHIFT, -1); + if (!rdev->ocqp_pool) + return -ENOMEM; + + start = rdev->lldi.vr->ocq.start; + chunk = rdev->lldi.vr->ocq.size; + top = start + chunk; + + while (start < top) { + chunk = min(top - start + 1, chunk); + if (gen_pool_add(rdev->ocqp_pool, start, chunk, -1)) { + PDBG("%s failed to add OCQP chunk (%x/%x)\n", + __func__, start, chunk); + if (chunk <= 1024 << MIN_OCQP_SHIFT) { + printk(KERN_WARNING MOD + "Failed to add all OCQP chunks (%x/%x)\n", + start, top - start); + return 0; + } + chunk >>= 1; + } else { + PDBG("%s added OCQP chunk (%x/%x)\n", + __func__, start, chunk); + start += chunk; + } + } + return 0; +} + +void c4iw_ocqp_pool_destroy(struct c4iw_rdev *rdev) +{ + gen_pool_destroy(rdev->ocqp_pool); +} diff --git a/kernel/drivers/infiniband/hw/cxgb4/t4.h b/kernel/drivers/infiniband/hw/cxgb4/t4.h new file mode 100644 index 000000000..7f2a6c244 --- /dev/null +++ b/kernel/drivers/infiniband/hw/cxgb4/t4.h @@ -0,0 +1,685 @@ +/* + * Copyright (c) 2009-2010 Chelsio, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#ifndef __T4_H__ +#define __T4_H__ + +#include "t4_hw.h" +#include "t4_regs.h" +#include "t4_msg.h" +#include "t4fw_ri_api.h" + +#define T4_MAX_NUM_PD 65536 +#define T4_MAX_MR_SIZE (~0ULL) +#define T4_PAGESIZE_MASK 0xffff000 /* 4KB-128MB */ +#define T4_STAG_UNSET 0xffffffff +#define T4_FW_MAJ 0 +#define PCIE_MA_SYNC_A 0x30b4 + +struct t4_status_page { + __be32 rsvd1; /* flit 0 - hw owns */ + __be16 rsvd2; + __be16 qid; + __be16 cidx; + __be16 pidx; + u8 qp_err; /* flit 1 - sw owns */ + u8 db_off; + u8 pad; + u16 host_wq_pidx; + u16 host_cidx; + u16 host_pidx; +}; + +#define T4_EQ_ENTRY_SIZE 64 + +#define T4_SQ_NUM_SLOTS 5 +#define T4_SQ_NUM_BYTES (T4_EQ_ENTRY_SIZE * T4_SQ_NUM_SLOTS) +#define T4_MAX_SEND_SGE ((T4_SQ_NUM_BYTES - sizeof(struct fw_ri_send_wr) - \ + sizeof(struct fw_ri_isgl)) / sizeof(struct fw_ri_sge)) +#define T4_MAX_SEND_INLINE ((T4_SQ_NUM_BYTES - sizeof(struct fw_ri_send_wr) - \ + sizeof(struct fw_ri_immd))) +#define T4_MAX_WRITE_INLINE ((T4_SQ_NUM_BYTES - \ + sizeof(struct fw_ri_rdma_write_wr) - \ + sizeof(struct fw_ri_immd))) +#define T4_MAX_WRITE_SGE ((T4_SQ_NUM_BYTES - \ + sizeof(struct fw_ri_rdma_write_wr) - \ + sizeof(struct fw_ri_isgl)) / sizeof(struct fw_ri_sge)) +#define T4_MAX_FR_IMMD ((T4_SQ_NUM_BYTES - sizeof(struct fw_ri_fr_nsmr_wr) - \ + sizeof(struct fw_ri_immd)) & ~31UL) +#define T4_MAX_FR_IMMD_DEPTH (T4_MAX_FR_IMMD / sizeof(u64)) +#define T4_MAX_FR_DSGL 1024 +#define T4_MAX_FR_DSGL_DEPTH (T4_MAX_FR_DSGL / sizeof(u64)) + +static inline int t4_max_fr_depth(int use_dsgl) +{ + return use_dsgl ? T4_MAX_FR_DSGL_DEPTH : T4_MAX_FR_IMMD_DEPTH; +} + +#define T4_RQ_NUM_SLOTS 2 +#define T4_RQ_NUM_BYTES (T4_EQ_ENTRY_SIZE * T4_RQ_NUM_SLOTS) +#define T4_MAX_RECV_SGE 4 + +union t4_wr { + struct fw_ri_res_wr res; + struct fw_ri_wr ri; + struct fw_ri_rdma_write_wr write; + struct fw_ri_send_wr send; + struct fw_ri_rdma_read_wr read; + struct fw_ri_bind_mw_wr bind; + struct fw_ri_fr_nsmr_wr fr; + struct fw_ri_inv_lstag_wr inv; + struct t4_status_page status; + __be64 flits[T4_EQ_ENTRY_SIZE / sizeof(__be64) * T4_SQ_NUM_SLOTS]; +}; + +union t4_recv_wr { + struct fw_ri_recv_wr recv; + struct t4_status_page status; + __be64 flits[T4_EQ_ENTRY_SIZE / sizeof(__be64) * T4_RQ_NUM_SLOTS]; +}; + +static inline void init_wr_hdr(union t4_wr *wqe, u16 wrid, + enum fw_wr_opcodes opcode, u8 flags, u8 len16) +{ + wqe->send.opcode = (u8)opcode; + wqe->send.flags = flags; + wqe->send.wrid = wrid; + wqe->send.r1[0] = 0; + wqe->send.r1[1] = 0; + wqe->send.r1[2] = 0; + wqe->send.len16 = len16; +} + +/* CQE/AE status codes */ +#define T4_ERR_SUCCESS 0x0 +#define T4_ERR_STAG 0x1 /* STAG invalid: either the */ + /* STAG is offlimt, being 0, */ + /* or STAG_key mismatch */ +#define T4_ERR_PDID 0x2 /* PDID mismatch */ +#define T4_ERR_QPID 0x3 /* QPID mismatch */ +#define T4_ERR_ACCESS 0x4 /* Invalid access right */ +#define T4_ERR_WRAP 0x5 /* Wrap error */ +#define T4_ERR_BOUND 0x6 /* base and bounds voilation */ +#define T4_ERR_INVALIDATE_SHARED_MR 0x7 /* attempt to invalidate a */ + /* shared memory region */ +#define T4_ERR_INVALIDATE_MR_WITH_MW_BOUND 0x8 /* attempt to invalidate a */ + /* shared memory region */ +#define T4_ERR_ECC 0x9 /* ECC error detected */ +#define T4_ERR_ECC_PSTAG 0xA /* ECC error detected when */ + /* reading PSTAG for a MW */ + /* Invalidate */ +#define T4_ERR_PBL_ADDR_BOUND 0xB /* pbl addr out of bounds: */ + /* software error */ +#define T4_ERR_SWFLUSH 0xC /* SW FLUSHED */ +#define T4_ERR_CRC 0x10 /* CRC error */ +#define T4_ERR_MARKER 0x11 /* Marker error */ 
+#define T4_ERR_PDU_LEN_ERR 0x12 /* invalid PDU length */ +#define T4_ERR_OUT_OF_RQE 0x13 /* out of RQE */ +#define T4_ERR_DDP_VERSION 0x14 /* wrong DDP version */ +#define T4_ERR_RDMA_VERSION 0x15 /* wrong RDMA version */ +#define T4_ERR_OPCODE 0x16 /* invalid rdma opcode */ +#define T4_ERR_DDP_QUEUE_NUM 0x17 /* invalid ddp queue number */ +#define T4_ERR_MSN 0x18 /* MSN error */ +#define T4_ERR_TBIT 0x19 /* tag bit not set correctly */ +#define T4_ERR_MO 0x1A /* MO not 0 for TERMINATE */ + /* or READ_REQ */ +#define T4_ERR_MSN_GAP 0x1B +#define T4_ERR_MSN_RANGE 0x1C +#define T4_ERR_IRD_OVERFLOW 0x1D +#define T4_ERR_RQE_ADDR_BOUND 0x1E /* RQE addr out of bounds: */ + /* software error */ +#define T4_ERR_INTERNAL_ERR 0x1F /* internal error (opcode */ + /* mismatch) */ +/* + * CQE defs + */ +struct t4_cqe { + __be32 header; + __be32 len; + union { + struct { + __be32 stag; + __be32 msn; + } rcqe; + struct { + u32 nada1; + u16 nada2; + u16 cidx; + } scqe; + struct { + __be32 wrid_hi; + __be32 wrid_low; + } gen; + } u; + __be64 reserved; + __be64 bits_type_ts; +}; + +/* macros for flit 0 of the cqe */ + +#define CQE_QPID_S 12 +#define CQE_QPID_M 0xFFFFF +#define CQE_QPID_G(x) ((((x) >> CQE_QPID_S)) & CQE_QPID_M) +#define CQE_QPID_V(x) ((x)<> CQE_SWCQE_S)) & CQE_SWCQE_M) +#define CQE_SWCQE_V(x) ((x)<> CQE_STATUS_S)) & CQE_STATUS_M) +#define CQE_STATUS_V(x) ((x)<> CQE_TYPE_S)) & CQE_TYPE_M) +#define CQE_TYPE_V(x) ((x)<> CQE_OPCODE_S)) & CQE_OPCODE_M) +#define CQE_OPCODE_V(x) ((x)<header))) +#define CQE_QPID(x) (CQE_QPID_G(be32_to_cpu((x)->header))) +#define CQE_TYPE(x) (CQE_TYPE_G(be32_to_cpu((x)->header))) +#define SQ_TYPE(x) (CQE_TYPE((x))) +#define RQ_TYPE(x) (!CQE_TYPE((x))) +#define CQE_STATUS(x) (CQE_STATUS_G(be32_to_cpu((x)->header))) +#define CQE_OPCODE(x) (CQE_OPCODE_G(be32_to_cpu((x)->header))) + +#define CQE_SEND_OPCODE(x)( \ + (CQE_OPCODE_G(be32_to_cpu((x)->header)) == FW_RI_SEND) || \ + (CQE_OPCODE_G(be32_to_cpu((x)->header)) == FW_RI_SEND_WITH_SE) || \ + (CQE_OPCODE_G(be32_to_cpu((x)->header)) == FW_RI_SEND_WITH_INV) || \ + (CQE_OPCODE_G(be32_to_cpu((x)->header)) == FW_RI_SEND_WITH_SE_INV)) + +#define CQE_LEN(x) (be32_to_cpu((x)->len)) + +/* used for RQ completion processing */ +#define CQE_WRID_STAG(x) (be32_to_cpu((x)->u.rcqe.stag)) +#define CQE_WRID_MSN(x) (be32_to_cpu((x)->u.rcqe.msn)) + +/* used for SQ completion processing */ +#define CQE_WRID_SQ_IDX(x) ((x)->u.scqe.cidx) + +/* generic accessor macros */ +#define CQE_WRID_HI(x) (be32_to_cpu((x)->u.gen.wrid_hi)) +#define CQE_WRID_LOW(x) (be32_to_cpu((x)->u.gen.wrid_low)) + +/* macros for flit 3 of the cqe */ +#define CQE_GENBIT_S 63 +#define CQE_GENBIT_M 0x1 +#define CQE_GENBIT_G(x) (((x) >> CQE_GENBIT_S) & CQE_GENBIT_M) +#define CQE_GENBIT_V(x) ((x)<> CQE_OVFBIT_S)) & CQE_OVFBIT_M) + +#define CQE_IQTYPE_S 60 +#define CQE_IQTYPE_M 0x3 +#define CQE_IQTYPE_G(x) ((((x) >> CQE_IQTYPE_S)) & CQE_IQTYPE_M) + +#define CQE_TS_M 0x0fffffffffffffffULL +#define CQE_TS_G(x) ((x) & CQE_TS_M) + +#define CQE_OVFBIT(x) ((unsigned)CQE_OVFBIT_G(be64_to_cpu((x)->bits_type_ts))) +#define CQE_GENBIT(x) ((unsigned)CQE_GENBIT_G(be64_to_cpu((x)->bits_type_ts))) +#define CQE_TS(x) (CQE_TS_G(be64_to_cpu((x)->bits_type_ts))) + +struct t4_swsqe { + u64 wr_id; + struct t4_cqe cqe; + int read_len; + int opcode; + int complete; + int signaled; + u16 idx; + int flushed; + struct timespec host_ts; + u64 sge_ts; +}; + +static inline pgprot_t t4_pgprot_wc(pgprot_t prot) +{ +#if defined(__i386__) || defined(__x86_64__) || defined(CONFIG_PPC64) + return 
pgprot_writecombine(prot); +#else + return pgprot_noncached(prot); +#endif +} + +enum { + T4_SQ_ONCHIP = (1<<0), +}; + +struct t4_sq { + union t4_wr *queue; + dma_addr_t dma_addr; + DEFINE_DMA_UNMAP_ADDR(mapping); + unsigned long phys_addr; + struct t4_swsqe *sw_sq; + struct t4_swsqe *oldest_read; + u64 __iomem *udb; + size_t memsize; + u32 qid; + u16 in_use; + u16 size; + u16 cidx; + u16 pidx; + u16 wq_pidx; + u16 wq_pidx_inc; + u16 flags; + short flush_cidx; +}; + +struct t4_swrqe { + u64 wr_id; + struct timespec host_ts; + u64 sge_ts; +}; + +struct t4_rq { + union t4_recv_wr *queue; + dma_addr_t dma_addr; + DEFINE_DMA_UNMAP_ADDR(mapping); + struct t4_swrqe *sw_rq; + u64 __iomem *udb; + size_t memsize; + u32 qid; + u32 msn; + u32 rqt_hwaddr; + u16 rqt_size; + u16 in_use; + u16 size; + u16 cidx; + u16 pidx; + u16 wq_pidx; + u16 wq_pidx_inc; +}; + +struct t4_wq { + struct t4_sq sq; + struct t4_rq rq; + void __iomem *db; + void __iomem *gts; + struct c4iw_rdev *rdev; + int flushed; +}; + +static inline int t4_rqes_posted(struct t4_wq *wq) +{ + return wq->rq.in_use; +} + +static inline int t4_rq_empty(struct t4_wq *wq) +{ + return wq->rq.in_use == 0; +} + +static inline int t4_rq_full(struct t4_wq *wq) +{ + return wq->rq.in_use == (wq->rq.size - 1); +} + +static inline u32 t4_rq_avail(struct t4_wq *wq) +{ + return wq->rq.size - 1 - wq->rq.in_use; +} + +static inline void t4_rq_produce(struct t4_wq *wq, u8 len16) +{ + wq->rq.in_use++; + if (++wq->rq.pidx == wq->rq.size) + wq->rq.pidx = 0; + wq->rq.wq_pidx += DIV_ROUND_UP(len16*16, T4_EQ_ENTRY_SIZE); + if (wq->rq.wq_pidx >= wq->rq.size * T4_RQ_NUM_SLOTS) + wq->rq.wq_pidx %= wq->rq.size * T4_RQ_NUM_SLOTS; +} + +static inline void t4_rq_consume(struct t4_wq *wq) +{ + wq->rq.in_use--; + wq->rq.msn++; + if (++wq->rq.cidx == wq->rq.size) + wq->rq.cidx = 0; +} + +static inline u16 t4_rq_host_wq_pidx(struct t4_wq *wq) +{ + return wq->rq.queue[wq->rq.size].status.host_wq_pidx; +} + +static inline u16 t4_rq_wq_size(struct t4_wq *wq) +{ + return wq->rq.size * T4_RQ_NUM_SLOTS; +} + +static inline int t4_sq_onchip(struct t4_sq *sq) +{ + return sq->flags & T4_SQ_ONCHIP; +} + +static inline int t4_sq_empty(struct t4_wq *wq) +{ + return wq->sq.in_use == 0; +} + +static inline int t4_sq_full(struct t4_wq *wq) +{ + return wq->sq.in_use == (wq->sq.size - 1); +} + +static inline u32 t4_sq_avail(struct t4_wq *wq) +{ + return wq->sq.size - 1 - wq->sq.in_use; +} + +static inline void t4_sq_produce(struct t4_wq *wq, u8 len16) +{ + wq->sq.in_use++; + if (++wq->sq.pidx == wq->sq.size) + wq->sq.pidx = 0; + wq->sq.wq_pidx += DIV_ROUND_UP(len16*16, T4_EQ_ENTRY_SIZE); + if (wq->sq.wq_pidx >= wq->sq.size * T4_SQ_NUM_SLOTS) + wq->sq.wq_pidx %= wq->sq.size * T4_SQ_NUM_SLOTS; +} + +static inline void t4_sq_consume(struct t4_wq *wq) +{ + BUG_ON(wq->sq.in_use < 1); + if (wq->sq.cidx == wq->sq.flush_cidx) + wq->sq.flush_cidx = -1; + wq->sq.in_use--; + if (++wq->sq.cidx == wq->sq.size) + wq->sq.cidx = 0; +} + +static inline u16 t4_sq_host_wq_pidx(struct t4_wq *wq) +{ + return wq->sq.queue[wq->sq.size].status.host_wq_pidx; +} + +static inline u16 t4_sq_wq_size(struct t4_wq *wq) +{ + return wq->sq.size * T4_SQ_NUM_SLOTS; +} + +/* This function copies 64 byte coalesced work request to memory + * mapped BAR2 space. For coalesced WRs, the SGE fetches data + * from the FIFO instead of from Host. 
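+ * Concretely this is eight 64-bit writeq()s (T4_EQ_ENTRY_SIZE bytes)
+ * into the write-combined user doorbell page; the ring helpers below
+ * pass wq->sq.udb + 7 / wq->rq.udb + 7 as the destination.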
+ */ +static inline void pio_copy(u64 __iomem *dst, u64 *src) +{ + int count = 8; + + while (count) { + writeq(*src, dst); + src++; + dst++; + count--; + } +} + +static inline void t4_ring_sq_db(struct t4_wq *wq, u16 inc, u8 t5, + union t4_wr *wqe) +{ + + /* Flush host queue memory writes. */ + wmb(); + if (t5) { + if (inc == 1 && wqe) { + PDBG("%s: WC wq->sq.pidx = %d\n", + __func__, wq->sq.pidx); + pio_copy(wq->sq.udb + 7, (void *)wqe); + } else { + PDBG("%s: DB wq->sq.pidx = %d\n", + __func__, wq->sq.pidx); + writel(PIDX_T5_V(inc), wq->sq.udb); + } + + /* Flush user doorbell area writes. */ + wmb(); + return; + } + writel(QID_V(wq->sq.qid) | PIDX_V(inc), wq->db); +} + +static inline void t4_ring_rq_db(struct t4_wq *wq, u16 inc, u8 t5, + union t4_recv_wr *wqe) +{ + + /* Flush host queue memory writes. */ + wmb(); + if (t5) { + if (inc == 1 && wqe) { + PDBG("%s: WC wq->rq.pidx = %d\n", + __func__, wq->rq.pidx); + pio_copy(wq->rq.udb + 7, (void *)wqe); + } else { + PDBG("%s: DB wq->rq.pidx = %d\n", + __func__, wq->rq.pidx); + writel(PIDX_T5_V(inc), wq->rq.udb); + } + + /* Flush user doorbell area writes. */ + wmb(); + return; + } + writel(QID_V(wq->rq.qid) | PIDX_V(inc), wq->db); +} + +static inline int t4_wq_in_error(struct t4_wq *wq) +{ + return wq->rq.queue[wq->rq.size].status.qp_err; +} + +static inline void t4_set_wq_in_error(struct t4_wq *wq) +{ + wq->rq.queue[wq->rq.size].status.qp_err = 1; +} + +static inline void t4_disable_wq_db(struct t4_wq *wq) +{ + wq->rq.queue[wq->rq.size].status.db_off = 1; +} + +static inline void t4_enable_wq_db(struct t4_wq *wq) +{ + wq->rq.queue[wq->rq.size].status.db_off = 0; +} + +static inline int t4_wq_db_enabled(struct t4_wq *wq) +{ + return !wq->rq.queue[wq->rq.size].status.db_off; +} + +enum t4_cq_flags { + CQ_ARMED = 1, +}; + +struct t4_cq { + struct t4_cqe *queue; + dma_addr_t dma_addr; + DEFINE_DMA_UNMAP_ADDR(mapping); + struct t4_cqe *sw_queue; + void __iomem *gts; + struct c4iw_rdev *rdev; + u64 ugts; + size_t memsize; + __be64 bits_type_ts; + u32 cqid; + u32 qid_mask; + int vector; + u16 size; /* including status page */ + u16 cidx; + u16 sw_pidx; + u16 sw_cidx; + u16 sw_in_use; + u16 cidx_inc; + u8 gen; + u8 error; + unsigned long flags; +}; + +static inline int t4_clear_cq_armed(struct t4_cq *cq) +{ + return test_and_clear_bit(CQ_ARMED, &cq->flags); +} + +static inline int t4_arm_cq(struct t4_cq *cq, int se) +{ + u32 val; + + set_bit(CQ_ARMED, &cq->flags); + while (cq->cidx_inc > CIDXINC_M) { + val = SEINTARM_V(0) | CIDXINC_V(CIDXINC_M) | TIMERREG_V(7) | + INGRESSQID_V(cq->cqid & cq->qid_mask); + writel(val, cq->gts); + cq->cidx_inc -= CIDXINC_M; + } + val = SEINTARM_V(se) | CIDXINC_V(cq->cidx_inc) | TIMERREG_V(6) | + INGRESSQID_V(cq->cqid & cq->qid_mask); + writel(val, cq->gts); + cq->cidx_inc = 0; + return 0; +} + +static inline void t4_swcq_produce(struct t4_cq *cq) +{ + cq->sw_in_use++; + if (cq->sw_in_use == cq->size) { + PDBG("%s cxgb4 sw cq overflow cqid %u\n", __func__, cq->cqid); + cq->error = 1; + BUG_ON(1); + } + if (++cq->sw_pidx == cq->size) + cq->sw_pidx = 0; +} + +static inline void t4_swcq_consume(struct t4_cq *cq) +{ + BUG_ON(cq->sw_in_use < 1); + cq->sw_in_use--; + if (++cq->sw_cidx == cq->size) + cq->sw_cidx = 0; +} + +static inline void t4_hwcq_consume(struct t4_cq *cq) +{ + cq->bits_type_ts = cq->queue[cq->cidx].bits_type_ts; + if (++cq->cidx_inc == (cq->size >> 4) || cq->cidx_inc == CIDXINC_M) { + u32 val; + + val = SEINTARM_V(0) | CIDXINC_V(cq->cidx_inc) | TIMERREG_V(7) | + INGRESSQID_V(cq->cqid & cq->qid_mask); + 
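+ /*
+ * Return the accumulated cidx credits to the hardware IQ without
+ * re-arming it (SEINTARM is 0 here; arming is done by t4_arm_cq()).
+ */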
writel(val, cq->gts); + cq->cidx_inc = 0; + } + if (++cq->cidx == cq->size) { + cq->cidx = 0; + cq->gen ^= 1; + } +} + +static inline int t4_valid_cqe(struct t4_cq *cq, struct t4_cqe *cqe) +{ + return (CQE_GENBIT(cqe) == cq->gen); +} + +static inline int t4_next_hw_cqe(struct t4_cq *cq, struct t4_cqe **cqe) +{ + int ret; + u16 prev_cidx; + + if (cq->cidx == 0) + prev_cidx = cq->size - 1; + else + prev_cidx = cq->cidx - 1; + + if (cq->queue[prev_cidx].bits_type_ts != cq->bits_type_ts) { + ret = -EOVERFLOW; + cq->error = 1; + printk(KERN_ERR MOD "cq overflow cqid %u\n", cq->cqid); + BUG_ON(1); + } else if (t4_valid_cqe(cq, &cq->queue[cq->cidx])) { + + /* Ensure CQE is flushed to memory */ + rmb(); + *cqe = &cq->queue[cq->cidx]; + ret = 0; + } else + ret = -ENODATA; + return ret; +} + +static inline struct t4_cqe *t4_next_sw_cqe(struct t4_cq *cq) +{ + if (cq->sw_in_use == cq->size) { + PDBG("%s cxgb4 sw cq overflow cqid %u\n", __func__, cq->cqid); + cq->error = 1; + BUG_ON(1); + return NULL; + } + if (cq->sw_in_use) + return &cq->sw_queue[cq->sw_cidx]; + return NULL; +} + +static inline int t4_next_cqe(struct t4_cq *cq, struct t4_cqe **cqe) +{ + int ret = 0; + + if (cq->error) + ret = -ENODATA; + else if (cq->sw_in_use) + *cqe = &cq->sw_queue[cq->sw_cidx]; + else + ret = t4_next_hw_cqe(cq, cqe); + return ret; +} + +static inline int t4_cq_in_error(struct t4_cq *cq) +{ + return ((struct t4_status_page *)&cq->queue[cq->size])->qp_err; +} + +static inline void t4_set_cq_in_error(struct t4_cq *cq) +{ + ((struct t4_status_page *)&cq->queue[cq->size])->qp_err = 1; +} +#endif + +struct t4_dev_status_page { + u8 db_off; +}; diff --git a/kernel/drivers/infiniband/hw/cxgb4/t4fw_ri_api.h b/kernel/drivers/infiniband/hw/cxgb4/t4fw_ri_api.h new file mode 100644 index 000000000..343e8daf2 --- /dev/null +++ b/kernel/drivers/infiniband/hw/cxgb4/t4fw_ri_api.h @@ -0,0 +1,855 @@ +/* + * Copyright (c) 2009-2010 Chelsio, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef _T4FW_RI_API_H_ +#define _T4FW_RI_API_H_ + +#include "t4fw_api.h" + +enum fw_ri_wr_opcode { + FW_RI_RDMA_WRITE = 0x0, /* IETF RDMAP v1.0 ... 
*/ + FW_RI_READ_REQ = 0x1, + FW_RI_READ_RESP = 0x2, + FW_RI_SEND = 0x3, + FW_RI_SEND_WITH_INV = 0x4, + FW_RI_SEND_WITH_SE = 0x5, + FW_RI_SEND_WITH_SE_INV = 0x6, + FW_RI_TERMINATE = 0x7, + FW_RI_RDMA_INIT = 0x8, /* CHELSIO RI specific ... */ + FW_RI_BIND_MW = 0x9, + FW_RI_FAST_REGISTER = 0xa, + FW_RI_LOCAL_INV = 0xb, + FW_RI_QP_MODIFY = 0xc, + FW_RI_BYPASS = 0xd, + FW_RI_RECEIVE = 0xe, + + FW_RI_SGE_EC_CR_RETURN = 0xf +}; + +enum fw_ri_wr_flags { + FW_RI_COMPLETION_FLAG = 0x01, + FW_RI_NOTIFICATION_FLAG = 0x02, + FW_RI_SOLICITED_EVENT_FLAG = 0x04, + FW_RI_READ_FENCE_FLAG = 0x08, + FW_RI_LOCAL_FENCE_FLAG = 0x10, + FW_RI_RDMA_READ_INVALIDATE = 0x20 +}; + +enum fw_ri_mpa_attrs { + FW_RI_MPA_RX_MARKER_ENABLE = 0x01, + FW_RI_MPA_TX_MARKER_ENABLE = 0x02, + FW_RI_MPA_CRC_ENABLE = 0x04, + FW_RI_MPA_IETF_ENABLE = 0x08 +}; + +enum fw_ri_qp_caps { + FW_RI_QP_RDMA_READ_ENABLE = 0x01, + FW_RI_QP_RDMA_WRITE_ENABLE = 0x02, + FW_RI_QP_BIND_ENABLE = 0x04, + FW_RI_QP_FAST_REGISTER_ENABLE = 0x08, + FW_RI_QP_STAG0_ENABLE = 0x10 +}; + +enum fw_ri_addr_type { + FW_RI_ZERO_BASED_TO = 0x00, + FW_RI_VA_BASED_TO = 0x01 +}; + +enum fw_ri_mem_perms { + FW_RI_MEM_ACCESS_REM_WRITE = 0x01, + FW_RI_MEM_ACCESS_REM_READ = 0x02, + FW_RI_MEM_ACCESS_REM = 0x03, + FW_RI_MEM_ACCESS_LOCAL_WRITE = 0x04, + FW_RI_MEM_ACCESS_LOCAL_READ = 0x08, + FW_RI_MEM_ACCESS_LOCAL = 0x0C +}; + +enum fw_ri_stag_type { + FW_RI_STAG_NSMR = 0x00, + FW_RI_STAG_SMR = 0x01, + FW_RI_STAG_MW = 0x02, + FW_RI_STAG_MW_RELAXED = 0x03 +}; + +enum fw_ri_data_op { + FW_RI_DATA_IMMD = 0x81, + FW_RI_DATA_DSGL = 0x82, + FW_RI_DATA_ISGL = 0x83 +}; + +enum fw_ri_sgl_depth { + FW_RI_SGL_DEPTH_MAX_SQ = 16, + FW_RI_SGL_DEPTH_MAX_RQ = 4 +}; + +struct fw_ri_dsge_pair { + __be32 len[2]; + __be64 addr[2]; +}; + +struct fw_ri_dsgl { + __u8 op; + __u8 r1; + __be16 nsge; + __be32 len0; + __be64 addr0; +#ifndef C99_NOT_SUPPORTED + struct fw_ri_dsge_pair sge[0]; +#endif +}; + +struct fw_ri_sge { + __be32 stag; + __be32 len; + __be64 to; +}; + +struct fw_ri_isgl { + __u8 op; + __u8 r1; + __be16 nsge; + __be32 r2; +#ifndef C99_NOT_SUPPORTED + struct fw_ri_sge sge[0]; +#endif +}; + +struct fw_ri_immd { + __u8 op; + __u8 r1; + __be16 r2; + __be32 immdlen; +#ifndef C99_NOT_SUPPORTED + __u8 data[0]; +#endif +}; + +struct fw_ri_tpte { + __be32 valid_to_pdid; + __be32 locread_to_qpid; + __be32 nosnoop_pbladdr; + __be32 len_lo; + __be32 va_hi; + __be32 va_lo_fbo; + __be32 dca_mwbcnt_pstag; + __be32 len_hi; +}; + +#define FW_RI_TPTE_VALID_S 31 +#define FW_RI_TPTE_VALID_M 0x1 +#define FW_RI_TPTE_VALID_V(x) ((x) << FW_RI_TPTE_VALID_S) +#define FW_RI_TPTE_VALID_G(x) \ + (((x) >> FW_RI_TPTE_VALID_S) & FW_RI_TPTE_VALID_M) +#define FW_RI_TPTE_VALID_F FW_RI_TPTE_VALID_V(1U) + +#define FW_RI_TPTE_STAGKEY_S 23 +#define FW_RI_TPTE_STAGKEY_M 0xff +#define FW_RI_TPTE_STAGKEY_V(x) ((x) << FW_RI_TPTE_STAGKEY_S) +#define FW_RI_TPTE_STAGKEY_G(x) \ + (((x) >> FW_RI_TPTE_STAGKEY_S) & FW_RI_TPTE_STAGKEY_M) + +#define FW_RI_TPTE_STAGSTATE_S 22 +#define FW_RI_TPTE_STAGSTATE_M 0x1 +#define FW_RI_TPTE_STAGSTATE_V(x) ((x) << FW_RI_TPTE_STAGSTATE_S) +#define FW_RI_TPTE_STAGSTATE_G(x) \ + (((x) >> FW_RI_TPTE_STAGSTATE_S) & FW_RI_TPTE_STAGSTATE_M) +#define FW_RI_TPTE_STAGSTATE_F FW_RI_TPTE_STAGSTATE_V(1U) + +#define FW_RI_TPTE_STAGTYPE_S 20 +#define FW_RI_TPTE_STAGTYPE_M 0x3 +#define FW_RI_TPTE_STAGTYPE_V(x) ((x) << FW_RI_TPTE_STAGTYPE_S) +#define FW_RI_TPTE_STAGTYPE_G(x) \ + (((x) >> FW_RI_TPTE_STAGTYPE_S) & FW_RI_TPTE_STAGTYPE_M) + +#define FW_RI_TPTE_PDID_S 0 +#define FW_RI_TPTE_PDID_M 0xfffff +#define 
FW_RI_TPTE_PDID_V(x) ((x) << FW_RI_TPTE_PDID_S) +#define FW_RI_TPTE_PDID_G(x) \ + (((x) >> FW_RI_TPTE_PDID_S) & FW_RI_TPTE_PDID_M) + +#define FW_RI_TPTE_PERM_S 28 +#define FW_RI_TPTE_PERM_M 0xf +#define FW_RI_TPTE_PERM_V(x) ((x) << FW_RI_TPTE_PERM_S) +#define FW_RI_TPTE_PERM_G(x) \ + (((x) >> FW_RI_TPTE_PERM_S) & FW_RI_TPTE_PERM_M) + +#define FW_RI_TPTE_REMINVDIS_S 27 +#define FW_RI_TPTE_REMINVDIS_M 0x1 +#define FW_RI_TPTE_REMINVDIS_V(x) ((x) << FW_RI_TPTE_REMINVDIS_S) +#define FW_RI_TPTE_REMINVDIS_G(x) \ + (((x) >> FW_RI_TPTE_REMINVDIS_S) & FW_RI_TPTE_REMINVDIS_M) +#define FW_RI_TPTE_REMINVDIS_F FW_RI_TPTE_REMINVDIS_V(1U) + +#define FW_RI_TPTE_ADDRTYPE_S 26 +#define FW_RI_TPTE_ADDRTYPE_M 1 +#define FW_RI_TPTE_ADDRTYPE_V(x) ((x) << FW_RI_TPTE_ADDRTYPE_S) +#define FW_RI_TPTE_ADDRTYPE_G(x) \ + (((x) >> FW_RI_TPTE_ADDRTYPE_S) & FW_RI_TPTE_ADDRTYPE_M) +#define FW_RI_TPTE_ADDRTYPE_F FW_RI_TPTE_ADDRTYPE_V(1U) + +#define FW_RI_TPTE_MWBINDEN_S 25 +#define FW_RI_TPTE_MWBINDEN_M 0x1 +#define FW_RI_TPTE_MWBINDEN_V(x) ((x) << FW_RI_TPTE_MWBINDEN_S) +#define FW_RI_TPTE_MWBINDEN_G(x) \ + (((x) >> FW_RI_TPTE_MWBINDEN_S) & FW_RI_TPTE_MWBINDEN_M) +#define FW_RI_TPTE_MWBINDEN_F FW_RI_TPTE_MWBINDEN_V(1U) + +#define FW_RI_TPTE_PS_S 20 +#define FW_RI_TPTE_PS_M 0x1f +#define FW_RI_TPTE_PS_V(x) ((x) << FW_RI_TPTE_PS_S) +#define FW_RI_TPTE_PS_G(x) \ + (((x) >> FW_RI_TPTE_PS_S) & FW_RI_TPTE_PS_M) + +#define FW_RI_TPTE_QPID_S 0 +#define FW_RI_TPTE_QPID_M 0xfffff +#define FW_RI_TPTE_QPID_V(x) ((x) << FW_RI_TPTE_QPID_S) +#define FW_RI_TPTE_QPID_G(x) \ + (((x) >> FW_RI_TPTE_QPID_S) & FW_RI_TPTE_QPID_M) + +#define FW_RI_TPTE_NOSNOOP_S 30 +#define FW_RI_TPTE_NOSNOOP_M 0x1 +#define FW_RI_TPTE_NOSNOOP_V(x) ((x) << FW_RI_TPTE_NOSNOOP_S) +#define FW_RI_TPTE_NOSNOOP_G(x) \ + (((x) >> FW_RI_TPTE_NOSNOOP_S) & FW_RI_TPTE_NOSNOOP_M) +#define FW_RI_TPTE_NOSNOOP_F FW_RI_TPTE_NOSNOOP_V(1U) + +#define FW_RI_TPTE_PBLADDR_S 0 +#define FW_RI_TPTE_PBLADDR_M 0x1fffffff +#define FW_RI_TPTE_PBLADDR_V(x) ((x) << FW_RI_TPTE_PBLADDR_S) +#define FW_RI_TPTE_PBLADDR_G(x) \ + (((x) >> FW_RI_TPTE_PBLADDR_S) & FW_RI_TPTE_PBLADDR_M) + +#define FW_RI_TPTE_DCA_S 24 +#define FW_RI_TPTE_DCA_M 0x1f +#define FW_RI_TPTE_DCA_V(x) ((x) << FW_RI_TPTE_DCA_S) +#define FW_RI_TPTE_DCA_G(x) \ + (((x) >> FW_RI_TPTE_DCA_S) & FW_RI_TPTE_DCA_M) + +#define FW_RI_TPTE_MWBCNT_PSTAG_S 0 +#define FW_RI_TPTE_MWBCNT_PSTAG_M 0xffffff +#define FW_RI_TPTE_MWBCNT_PSTAT_V(x) \ + ((x) << FW_RI_TPTE_MWBCNT_PSTAG_S) +#define FW_RI_TPTE_MWBCNT_PSTAG_G(x) \ + (((x) >> FW_RI_TPTE_MWBCNT_PSTAG_S) & FW_RI_TPTE_MWBCNT_PSTAG_M) + +enum fw_ri_res_type { + FW_RI_RES_TYPE_SQ, + FW_RI_RES_TYPE_RQ, + FW_RI_RES_TYPE_CQ, +}; + +enum fw_ri_res_op { + FW_RI_RES_OP_WRITE, + FW_RI_RES_OP_RESET, +}; + +struct fw_ri_res { + union fw_ri_restype { + struct fw_ri_res_sqrq { + __u8 restype; + __u8 op; + __be16 r3; + __be32 eqid; + __be32 r4[2]; + __be32 fetchszm_to_iqid; + __be32 dcaen_to_eqsize; + __be64 eqaddr; + } sqrq; + struct fw_ri_res_cq { + __u8 restype; + __u8 op; + __be16 r3; + __be32 iqid; + __be32 r4[2]; + __be32 iqandst_to_iqandstindex; + __be16 iqdroprss_to_iqesize; + __be16 iqsize; + __be64 iqaddr; + __be32 iqns_iqro; + __be32 r6_lo; + __be64 r7; + } cq; + } u; +}; + +struct fw_ri_res_wr { + __be32 op_nres; + __be32 len16_pkd; + __u64 cookie; +#ifndef C99_NOT_SUPPORTED + struct fw_ri_res res[0]; +#endif +}; + +#define FW_RI_RES_WR_NRES_S 0 +#define FW_RI_RES_WR_NRES_M 0xff +#define FW_RI_RES_WR_NRES_V(x) ((x) << FW_RI_RES_WR_NRES_S) +#define FW_RI_RES_WR_NRES_G(x) \ + (((x) >> 
FW_RI_RES_WR_NRES_S) & FW_RI_RES_WR_NRES_M) + +#define FW_RI_RES_WR_FETCHSZM_S 26 +#define FW_RI_RES_WR_FETCHSZM_M 0x1 +#define FW_RI_RES_WR_FETCHSZM_V(x) ((x) << FW_RI_RES_WR_FETCHSZM_S) +#define FW_RI_RES_WR_FETCHSZM_G(x) \ + (((x) >> FW_RI_RES_WR_FETCHSZM_S) & FW_RI_RES_WR_FETCHSZM_M) +#define FW_RI_RES_WR_FETCHSZM_F FW_RI_RES_WR_FETCHSZM_V(1U) + +#define FW_RI_RES_WR_STATUSPGNS_S 25 +#define FW_RI_RES_WR_STATUSPGNS_M 0x1 +#define FW_RI_RES_WR_STATUSPGNS_V(x) ((x) << FW_RI_RES_WR_STATUSPGNS_S) +#define FW_RI_RES_WR_STATUSPGNS_G(x) \ + (((x) >> FW_RI_RES_WR_STATUSPGNS_S) & FW_RI_RES_WR_STATUSPGNS_M) +#define FW_RI_RES_WR_STATUSPGNS_F FW_RI_RES_WR_STATUSPGNS_V(1U) + +#define FW_RI_RES_WR_STATUSPGRO_S 24 +#define FW_RI_RES_WR_STATUSPGRO_M 0x1 +#define FW_RI_RES_WR_STATUSPGRO_V(x) ((x) << FW_RI_RES_WR_STATUSPGRO_S) +#define FW_RI_RES_WR_STATUSPGRO_G(x) \ + (((x) >> FW_RI_RES_WR_STATUSPGRO_S) & FW_RI_RES_WR_STATUSPGRO_M) +#define FW_RI_RES_WR_STATUSPGRO_F FW_RI_RES_WR_STATUSPGRO_V(1U) + +#define FW_RI_RES_WR_FETCHNS_S 23 +#define FW_RI_RES_WR_FETCHNS_M 0x1 +#define FW_RI_RES_WR_FETCHNS_V(x) ((x) << FW_RI_RES_WR_FETCHNS_S) +#define FW_RI_RES_WR_FETCHNS_G(x) \ + (((x) >> FW_RI_RES_WR_FETCHNS_S) & FW_RI_RES_WR_FETCHNS_M) +#define FW_RI_RES_WR_FETCHNS_F FW_RI_RES_WR_FETCHNS_V(1U) + +#define FW_RI_RES_WR_FETCHRO_S 22 +#define FW_RI_RES_WR_FETCHRO_M 0x1 +#define FW_RI_RES_WR_FETCHRO_V(x) ((x) << FW_RI_RES_WR_FETCHRO_S) +#define FW_RI_RES_WR_FETCHRO_G(x) \ + (((x) >> FW_RI_RES_WR_FETCHRO_S) & FW_RI_RES_WR_FETCHRO_M) +#define FW_RI_RES_WR_FETCHRO_F FW_RI_RES_WR_FETCHRO_V(1U) + +#define FW_RI_RES_WR_HOSTFCMODE_S 20 +#define FW_RI_RES_WR_HOSTFCMODE_M 0x3 +#define FW_RI_RES_WR_HOSTFCMODE_V(x) ((x) << FW_RI_RES_WR_HOSTFCMODE_S) +#define FW_RI_RES_WR_HOSTFCMODE_G(x) \ + (((x) >> FW_RI_RES_WR_HOSTFCMODE_S) & FW_RI_RES_WR_HOSTFCMODE_M) + +#define FW_RI_RES_WR_CPRIO_S 19 +#define FW_RI_RES_WR_CPRIO_M 0x1 +#define FW_RI_RES_WR_CPRIO_V(x) ((x) << FW_RI_RES_WR_CPRIO_S) +#define FW_RI_RES_WR_CPRIO_G(x) \ + (((x) >> FW_RI_RES_WR_CPRIO_S) & FW_RI_RES_WR_CPRIO_M) +#define FW_RI_RES_WR_CPRIO_F FW_RI_RES_WR_CPRIO_V(1U) + +#define FW_RI_RES_WR_ONCHIP_S 18 +#define FW_RI_RES_WR_ONCHIP_M 0x1 +#define FW_RI_RES_WR_ONCHIP_V(x) ((x) << FW_RI_RES_WR_ONCHIP_S) +#define FW_RI_RES_WR_ONCHIP_G(x) \ + (((x) >> FW_RI_RES_WR_ONCHIP_S) & FW_RI_RES_WR_ONCHIP_M) +#define FW_RI_RES_WR_ONCHIP_F FW_RI_RES_WR_ONCHIP_V(1U) + +#define FW_RI_RES_WR_PCIECHN_S 16 +#define FW_RI_RES_WR_PCIECHN_M 0x3 +#define FW_RI_RES_WR_PCIECHN_V(x) ((x) << FW_RI_RES_WR_PCIECHN_S) +#define FW_RI_RES_WR_PCIECHN_G(x) \ + (((x) >> FW_RI_RES_WR_PCIECHN_S) & FW_RI_RES_WR_PCIECHN_M) + +#define FW_RI_RES_WR_IQID_S 0 +#define FW_RI_RES_WR_IQID_M 0xffff +#define FW_RI_RES_WR_IQID_V(x) ((x) << FW_RI_RES_WR_IQID_S) +#define FW_RI_RES_WR_IQID_G(x) \ + (((x) >> FW_RI_RES_WR_IQID_S) & FW_RI_RES_WR_IQID_M) + +#define FW_RI_RES_WR_DCAEN_S 31 +#define FW_RI_RES_WR_DCAEN_M 0x1 +#define FW_RI_RES_WR_DCAEN_V(x) ((x) << FW_RI_RES_WR_DCAEN_S) +#define FW_RI_RES_WR_DCAEN_G(x) \ + (((x) >> FW_RI_RES_WR_DCAEN_S) & FW_RI_RES_WR_DCAEN_M) +#define FW_RI_RES_WR_DCAEN_F FW_RI_RES_WR_DCAEN_V(1U) + +#define FW_RI_RES_WR_DCACPU_S 26 +#define FW_RI_RES_WR_DCACPU_M 0x1f +#define FW_RI_RES_WR_DCACPU_V(x) ((x) << FW_RI_RES_WR_DCACPU_S) +#define FW_RI_RES_WR_DCACPU_G(x) \ + (((x) >> FW_RI_RES_WR_DCACPU_S) & FW_RI_RES_WR_DCACPU_M) + +#define FW_RI_RES_WR_FBMIN_S 23 +#define FW_RI_RES_WR_FBMIN_M 0x7 +#define FW_RI_RES_WR_FBMIN_V(x) ((x) << FW_RI_RES_WR_FBMIN_S) +#define FW_RI_RES_WR_FBMIN_G(x) \ + 
(((x) >> FW_RI_RES_WR_FBMIN_S) & FW_RI_RES_WR_FBMIN_M) + +#define FW_RI_RES_WR_FBMAX_S 20 +#define FW_RI_RES_WR_FBMAX_M 0x7 +#define FW_RI_RES_WR_FBMAX_V(x) ((x) << FW_RI_RES_WR_FBMAX_S) +#define FW_RI_RES_WR_FBMAX_G(x) \ + (((x) >> FW_RI_RES_WR_FBMAX_S) & FW_RI_RES_WR_FBMAX_M) + +#define FW_RI_RES_WR_CIDXFTHRESHO_S 19 +#define FW_RI_RES_WR_CIDXFTHRESHO_M 0x1 +#define FW_RI_RES_WR_CIDXFTHRESHO_V(x) ((x) << FW_RI_RES_WR_CIDXFTHRESHO_S) +#define FW_RI_RES_WR_CIDXFTHRESHO_G(x) \ + (((x) >> FW_RI_RES_WR_CIDXFTHRESHO_S) & FW_RI_RES_WR_CIDXFTHRESHO_M) +#define FW_RI_RES_WR_CIDXFTHRESHO_F FW_RI_RES_WR_CIDXFTHRESHO_V(1U) + +#define FW_RI_RES_WR_CIDXFTHRESH_S 16 +#define FW_RI_RES_WR_CIDXFTHRESH_M 0x7 +#define FW_RI_RES_WR_CIDXFTHRESH_V(x) ((x) << FW_RI_RES_WR_CIDXFTHRESH_S) +#define FW_RI_RES_WR_CIDXFTHRESH_G(x) \ + (((x) >> FW_RI_RES_WR_CIDXFTHRESH_S) & FW_RI_RES_WR_CIDXFTHRESH_M) + +#define FW_RI_RES_WR_EQSIZE_S 0 +#define FW_RI_RES_WR_EQSIZE_M 0xffff +#define FW_RI_RES_WR_EQSIZE_V(x) ((x) << FW_RI_RES_WR_EQSIZE_S) +#define FW_RI_RES_WR_EQSIZE_G(x) \ + (((x) >> FW_RI_RES_WR_EQSIZE_S) & FW_RI_RES_WR_EQSIZE_M) + +#define FW_RI_RES_WR_IQANDST_S 15 +#define FW_RI_RES_WR_IQANDST_M 0x1 +#define FW_RI_RES_WR_IQANDST_V(x) ((x) << FW_RI_RES_WR_IQANDST_S) +#define FW_RI_RES_WR_IQANDST_G(x) \ + (((x) >> FW_RI_RES_WR_IQANDST_S) & FW_RI_RES_WR_IQANDST_M) +#define FW_RI_RES_WR_IQANDST_F FW_RI_RES_WR_IQANDST_V(1U) + +#define FW_RI_RES_WR_IQANUS_S 14 +#define FW_RI_RES_WR_IQANUS_M 0x1 +#define FW_RI_RES_WR_IQANUS_V(x) ((x) << FW_RI_RES_WR_IQANUS_S) +#define FW_RI_RES_WR_IQANUS_G(x) \ + (((x) >> FW_RI_RES_WR_IQANUS_S) & FW_RI_RES_WR_IQANUS_M) +#define FW_RI_RES_WR_IQANUS_F FW_RI_RES_WR_IQANUS_V(1U) + +#define FW_RI_RES_WR_IQANUD_S 12 +#define FW_RI_RES_WR_IQANUD_M 0x3 +#define FW_RI_RES_WR_IQANUD_V(x) ((x) << FW_RI_RES_WR_IQANUD_S) +#define FW_RI_RES_WR_IQANUD_G(x) \ + (((x) >> FW_RI_RES_WR_IQANUD_S) & FW_RI_RES_WR_IQANUD_M) + +#define FW_RI_RES_WR_IQANDSTINDEX_S 0 +#define FW_RI_RES_WR_IQANDSTINDEX_M 0xfff +#define FW_RI_RES_WR_IQANDSTINDEX_V(x) ((x) << FW_RI_RES_WR_IQANDSTINDEX_S) +#define FW_RI_RES_WR_IQANDSTINDEX_G(x) \ + (((x) >> FW_RI_RES_WR_IQANDSTINDEX_S) & FW_RI_RES_WR_IQANDSTINDEX_M) + +#define FW_RI_RES_WR_IQDROPRSS_S 15 +#define FW_RI_RES_WR_IQDROPRSS_M 0x1 +#define FW_RI_RES_WR_IQDROPRSS_V(x) ((x) << FW_RI_RES_WR_IQDROPRSS_S) +#define FW_RI_RES_WR_IQDROPRSS_G(x) \ + (((x) >> FW_RI_RES_WR_IQDROPRSS_S) & FW_RI_RES_WR_IQDROPRSS_M) +#define FW_RI_RES_WR_IQDROPRSS_F FW_RI_RES_WR_IQDROPRSS_V(1U) + +#define FW_RI_RES_WR_IQGTSMODE_S 14 +#define FW_RI_RES_WR_IQGTSMODE_M 0x1 +#define FW_RI_RES_WR_IQGTSMODE_V(x) ((x) << FW_RI_RES_WR_IQGTSMODE_S) +#define FW_RI_RES_WR_IQGTSMODE_G(x) \ + (((x) >> FW_RI_RES_WR_IQGTSMODE_S) & FW_RI_RES_WR_IQGTSMODE_M) +#define FW_RI_RES_WR_IQGTSMODE_F FW_RI_RES_WR_IQGTSMODE_V(1U) + +#define FW_RI_RES_WR_IQPCIECH_S 12 +#define FW_RI_RES_WR_IQPCIECH_M 0x3 +#define FW_RI_RES_WR_IQPCIECH_V(x) ((x) << FW_RI_RES_WR_IQPCIECH_S) +#define FW_RI_RES_WR_IQPCIECH_G(x) \ + (((x) >> FW_RI_RES_WR_IQPCIECH_S) & FW_RI_RES_WR_IQPCIECH_M) + +#define FW_RI_RES_WR_IQDCAEN_S 11 +#define FW_RI_RES_WR_IQDCAEN_M 0x1 +#define FW_RI_RES_WR_IQDCAEN_V(x) ((x) << FW_RI_RES_WR_IQDCAEN_S) +#define FW_RI_RES_WR_IQDCAEN_G(x) \ + (((x) >> FW_RI_RES_WR_IQDCAEN_S) & FW_RI_RES_WR_IQDCAEN_M) +#define FW_RI_RES_WR_IQDCAEN_F FW_RI_RES_WR_IQDCAEN_V(1U) + +#define FW_RI_RES_WR_IQDCACPU_S 6 +#define FW_RI_RES_WR_IQDCACPU_M 0x1f +#define FW_RI_RES_WR_IQDCACPU_V(x) ((x) << FW_RI_RES_WR_IQDCACPU_S) +#define 
FW_RI_RES_WR_IQDCACPU_G(x) \ + (((x) >> FW_RI_RES_WR_IQDCACPU_S) & FW_RI_RES_WR_IQDCACPU_M) + +#define FW_RI_RES_WR_IQINTCNTTHRESH_S 4 +#define FW_RI_RES_WR_IQINTCNTTHRESH_M 0x3 +#define FW_RI_RES_WR_IQINTCNTTHRESH_V(x) \ + ((x) << FW_RI_RES_WR_IQINTCNTTHRESH_S) +#define FW_RI_RES_WR_IQINTCNTTHRESH_G(x) \ + (((x) >> FW_RI_RES_WR_IQINTCNTTHRESH_S) & FW_RI_RES_WR_IQINTCNTTHRESH_M) + +#define FW_RI_RES_WR_IQO_S 3 +#define FW_RI_RES_WR_IQO_M 0x1 +#define FW_RI_RES_WR_IQO_V(x) ((x) << FW_RI_RES_WR_IQO_S) +#define FW_RI_RES_WR_IQO_G(x) \ + (((x) >> FW_RI_RES_WR_IQO_S) & FW_RI_RES_WR_IQO_M) +#define FW_RI_RES_WR_IQO_F FW_RI_RES_WR_IQO_V(1U) + +#define FW_RI_RES_WR_IQCPRIO_S 2 +#define FW_RI_RES_WR_IQCPRIO_M 0x1 +#define FW_RI_RES_WR_IQCPRIO_V(x) ((x) << FW_RI_RES_WR_IQCPRIO_S) +#define FW_RI_RES_WR_IQCPRIO_G(x) \ + (((x) >> FW_RI_RES_WR_IQCPRIO_S) & FW_RI_RES_WR_IQCPRIO_M) +#define FW_RI_RES_WR_IQCPRIO_F FW_RI_RES_WR_IQCPRIO_V(1U) + +#define FW_RI_RES_WR_IQESIZE_S 0 +#define FW_RI_RES_WR_IQESIZE_M 0x3 +#define FW_RI_RES_WR_IQESIZE_V(x) ((x) << FW_RI_RES_WR_IQESIZE_S) +#define FW_RI_RES_WR_IQESIZE_G(x) \ + (((x) >> FW_RI_RES_WR_IQESIZE_S) & FW_RI_RES_WR_IQESIZE_M) + +#define FW_RI_RES_WR_IQNS_S 31 +#define FW_RI_RES_WR_IQNS_M 0x1 +#define FW_RI_RES_WR_IQNS_V(x) ((x) << FW_RI_RES_WR_IQNS_S) +#define FW_RI_RES_WR_IQNS_G(x) \ + (((x) >> FW_RI_RES_WR_IQNS_S) & FW_RI_RES_WR_IQNS_M) +#define FW_RI_RES_WR_IQNS_F FW_RI_RES_WR_IQNS_V(1U) + +#define FW_RI_RES_WR_IQRO_S 30 +#define FW_RI_RES_WR_IQRO_M 0x1 +#define FW_RI_RES_WR_IQRO_V(x) ((x) << FW_RI_RES_WR_IQRO_S) +#define FW_RI_RES_WR_IQRO_G(x) \ + (((x) >> FW_RI_RES_WR_IQRO_S) & FW_RI_RES_WR_IQRO_M) +#define FW_RI_RES_WR_IQRO_F FW_RI_RES_WR_IQRO_V(1U) + +struct fw_ri_rdma_write_wr { + __u8 opcode; + __u8 flags; + __u16 wrid; + __u8 r1[3]; + __u8 len16; + __be64 r2; + __be32 plen; + __be32 stag_sink; + __be64 to_sink; +#ifndef C99_NOT_SUPPORTED + union { + struct fw_ri_immd immd_src[0]; + struct fw_ri_isgl isgl_src[0]; + } u; +#endif +}; + +struct fw_ri_send_wr { + __u8 opcode; + __u8 flags; + __u16 wrid; + __u8 r1[3]; + __u8 len16; + __be32 sendop_pkd; + __be32 stag_inv; + __be32 plen; + __be32 r3; + __be64 r4; +#ifndef C99_NOT_SUPPORTED + union { + struct fw_ri_immd immd_src[0]; + struct fw_ri_isgl isgl_src[0]; + } u; +#endif +}; + +#define FW_RI_SEND_WR_SENDOP_S 0 +#define FW_RI_SEND_WR_SENDOP_M 0xf +#define FW_RI_SEND_WR_SENDOP_V(x) ((x) << FW_RI_SEND_WR_SENDOP_S) +#define FW_RI_SEND_WR_SENDOP_G(x) \ + (((x) >> FW_RI_SEND_WR_SENDOP_S) & FW_RI_SEND_WR_SENDOP_M) + +struct fw_ri_rdma_read_wr { + __u8 opcode; + __u8 flags; + __u16 wrid; + __u8 r1[3]; + __u8 len16; + __be64 r2; + __be32 stag_sink; + __be32 to_sink_hi; + __be32 to_sink_lo; + __be32 plen; + __be32 stag_src; + __be32 to_src_hi; + __be32 to_src_lo; + __be32 r5; +}; + +struct fw_ri_recv_wr { + __u8 opcode; + __u8 r1; + __u16 wrid; + __u8 r2[3]; + __u8 len16; + struct fw_ri_isgl isgl; +}; + +struct fw_ri_bind_mw_wr { + __u8 opcode; + __u8 flags; + __u16 wrid; + __u8 r1[3]; + __u8 len16; + __u8 qpbinde_to_dcacpu; + __u8 pgsz_shift; + __u8 addr_type; + __u8 mem_perms; + __be32 stag_mr; + __be32 stag_mw; + __be32 r3; + __be64 len_mw; + __be64 va_fbo; + __be64 r4; +}; + +#define FW_RI_BIND_MW_WR_QPBINDE_S 6 +#define FW_RI_BIND_MW_WR_QPBINDE_M 0x1 +#define FW_RI_BIND_MW_WR_QPBINDE_V(x) ((x) << FW_RI_BIND_MW_WR_QPBINDE_S) +#define FW_RI_BIND_MW_WR_QPBINDE_G(x) \ + (((x) >> FW_RI_BIND_MW_WR_QPBINDE_S) & FW_RI_BIND_MW_WR_QPBINDE_M) +#define FW_RI_BIND_MW_WR_QPBINDE_F FW_RI_BIND_MW_WR_QPBINDE_V(1U) + 
+#define FW_RI_BIND_MW_WR_NS_S 5 +#define FW_RI_BIND_MW_WR_NS_M 0x1 +#define FW_RI_BIND_MW_WR_NS_V(x) ((x) << FW_RI_BIND_MW_WR_NS_S) +#define FW_RI_BIND_MW_WR_NS_G(x) \ + (((x) >> FW_RI_BIND_MW_WR_NS_S) & FW_RI_BIND_MW_WR_NS_M) +#define FW_RI_BIND_MW_WR_NS_F FW_RI_BIND_MW_WR_NS_V(1U) + +#define FW_RI_BIND_MW_WR_DCACPU_S 0 +#define FW_RI_BIND_MW_WR_DCACPU_M 0x1f +#define FW_RI_BIND_MW_WR_DCACPU_V(x) ((x) << FW_RI_BIND_MW_WR_DCACPU_S) +#define FW_RI_BIND_MW_WR_DCACPU_G(x) \ + (((x) >> FW_RI_BIND_MW_WR_DCACPU_S) & FW_RI_BIND_MW_WR_DCACPU_M) + +struct fw_ri_fr_nsmr_wr { + __u8 opcode; + __u8 flags; + __u16 wrid; + __u8 r1[3]; + __u8 len16; + __u8 qpbinde_to_dcacpu; + __u8 pgsz_shift; + __u8 addr_type; + __u8 mem_perms; + __be32 stag; + __be32 len_hi; + __be32 len_lo; + __be32 va_hi; + __be32 va_lo_fbo; +}; + +#define FW_RI_FR_NSMR_WR_QPBINDE_S 6 +#define FW_RI_FR_NSMR_WR_QPBINDE_M 0x1 +#define FW_RI_FR_NSMR_WR_QPBINDE_V(x) ((x) << FW_RI_FR_NSMR_WR_QPBINDE_S) +#define FW_RI_FR_NSMR_WR_QPBINDE_G(x) \ + (((x) >> FW_RI_FR_NSMR_WR_QPBINDE_S) & FW_RI_FR_NSMR_WR_QPBINDE_M) +#define FW_RI_FR_NSMR_WR_QPBINDE_F FW_RI_FR_NSMR_WR_QPBINDE_V(1U) + +#define FW_RI_FR_NSMR_WR_NS_S 5 +#define FW_RI_FR_NSMR_WR_NS_M 0x1 +#define FW_RI_FR_NSMR_WR_NS_V(x) ((x) << FW_RI_FR_NSMR_WR_NS_S) +#define FW_RI_FR_NSMR_WR_NS_G(x) \ + (((x) >> FW_RI_FR_NSMR_WR_NS_S) & FW_RI_FR_NSMR_WR_NS_M) +#define FW_RI_FR_NSMR_WR_NS_F FW_RI_FR_NSMR_WR_NS_V(1U) + +#define FW_RI_FR_NSMR_WR_DCACPU_S 0 +#define FW_RI_FR_NSMR_WR_DCACPU_M 0x1f +#define FW_RI_FR_NSMR_WR_DCACPU_V(x) ((x) << FW_RI_FR_NSMR_WR_DCACPU_S) +#define FW_RI_FR_NSMR_WR_DCACPU_G(x) \ + (((x) >> FW_RI_FR_NSMR_WR_DCACPU_S) & FW_RI_FR_NSMR_WR_DCACPU_M) + +struct fw_ri_inv_lstag_wr { + __u8 opcode; + __u8 flags; + __u16 wrid; + __u8 r1[3]; + __u8 len16; + __be32 r2; + __be32 stag_inv; +}; + +enum fw_ri_type { + FW_RI_TYPE_INIT, + FW_RI_TYPE_FINI, + FW_RI_TYPE_TERMINATE +}; + +enum fw_ri_init_p2ptype { + FW_RI_INIT_P2PTYPE_RDMA_WRITE = FW_RI_RDMA_WRITE, + FW_RI_INIT_P2PTYPE_READ_REQ = FW_RI_READ_REQ, + FW_RI_INIT_P2PTYPE_SEND = FW_RI_SEND, + FW_RI_INIT_P2PTYPE_SEND_WITH_INV = FW_RI_SEND_WITH_INV, + FW_RI_INIT_P2PTYPE_SEND_WITH_SE = FW_RI_SEND_WITH_SE, + FW_RI_INIT_P2PTYPE_SEND_WITH_SE_INV = FW_RI_SEND_WITH_SE_INV, + FW_RI_INIT_P2PTYPE_DISABLED = 0xf, +}; + +struct fw_ri_wr { + __be32 op_compl; + __be32 flowid_len16; + __u64 cookie; + union fw_ri { + struct fw_ri_init { + __u8 type; + __u8 mpareqbit_p2ptype; + __u8 r4[2]; + __u8 mpa_attrs; + __u8 qp_caps; + __be16 nrqe; + __be32 pdid; + __be32 qpid; + __be32 sq_eqid; + __be32 rq_eqid; + __be32 scqid; + __be32 rcqid; + __be32 ord_max; + __be32 ird_max; + __be32 iss; + __be32 irs; + __be32 hwrqsize; + __be32 hwrqaddr; + __be64 r5; + union fw_ri_init_p2p { + struct fw_ri_rdma_write_wr write; + struct fw_ri_rdma_read_wr read; + struct fw_ri_send_wr send; + } u; + } init; + struct fw_ri_fini { + __u8 type; + __u8 r3[7]; + __be64 r4; + } fini; + struct fw_ri_terminate { + __u8 type; + __u8 r3[3]; + __be32 immdlen; + __u8 termmsg[40]; + } terminate; + } u; +}; + +#define FW_RI_WR_MPAREQBIT_S 7 +#define FW_RI_WR_MPAREQBIT_M 0x1 +#define FW_RI_WR_MPAREQBIT_V(x) ((x) << FW_RI_WR_MPAREQBIT_S) +#define FW_RI_WR_MPAREQBIT_G(x) \ + (((x) >> FW_RI_WR_MPAREQBIT_S) & FW_RI_WR_MPAREQBIT_M) +#define FW_RI_WR_MPAREQBIT_F FW_RI_WR_MPAREQBIT_V(1U) + +#define FW_RI_WR_P2PTYPE_S 0 +#define FW_RI_WR_P2PTYPE_M 0xf +#define FW_RI_WR_P2PTYPE_V(x) ((x) << FW_RI_WR_P2PTYPE_S) +#define FW_RI_WR_P2PTYPE_G(x) \ + (((x) >> FW_RI_WR_P2PTYPE_S) & FW_RI_WR_P2PTYPE_M) 
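+/*
+ * Field accessor convention used throughout this header: _S is the bit
+ * shift, _M the post-shift mask, _V(x) places a value into the field,
+ * _G(x) extracts it, and _F is the pre-shifted single-bit flag. As an
+ * illustrative (not driver-verbatim) example, the mpareqbit_p2ptype
+ * byte of struct fw_ri_wr's init message could be built as
+ *     FW_RI_WR_MPAREQBIT_V(1) | FW_RI_WR_P2PTYPE_V(p2p_type)
+ * where p2p_type is one of enum fw_ri_init_p2ptype.
+ */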
+ +struct tcp_options { + __be16 mss; + __u8 wsf; +#if defined(__LITTLE_ENDIAN_BITFIELD) + __u8:4; + __u8 unknown:1; + __u8:1; + __u8 sack:1; + __u8 tstamp:1; +#else + __u8 tstamp:1; + __u8 sack:1; + __u8:1; + __u8 unknown:1; + __u8:4; +#endif +}; + +struct cpl_pass_accept_req { + union opcode_tid ot; + __be16 rsvd; + __be16 len; + __be32 hdr_len; + __be16 vlan; + __be16 l2info; + __be32 tos_stid; + struct tcp_options tcpopt; +}; + +/* cpl_pass_accept_req.hdr_len fields */ +#define SYN_RX_CHAN_S 0 +#define SYN_RX_CHAN_M 0xF +#define SYN_RX_CHAN_V(x) ((x) << SYN_RX_CHAN_S) +#define SYN_RX_CHAN_G(x) (((x) >> SYN_RX_CHAN_S) & SYN_RX_CHAN_M) + +#define TCP_HDR_LEN_S 10 +#define TCP_HDR_LEN_M 0x3F +#define TCP_HDR_LEN_V(x) ((x) << TCP_HDR_LEN_S) +#define TCP_HDR_LEN_G(x) (((x) >> TCP_HDR_LEN_S) & TCP_HDR_LEN_M) + +#define IP_HDR_LEN_S 16 +#define IP_HDR_LEN_M 0x3FF +#define IP_HDR_LEN_V(x) ((x) << IP_HDR_LEN_S) +#define IP_HDR_LEN_G(x) (((x) >> IP_HDR_LEN_S) & IP_HDR_LEN_M) + +#define ETH_HDR_LEN_S 26 +#define ETH_HDR_LEN_M 0x1F +#define ETH_HDR_LEN_V(x) ((x) << ETH_HDR_LEN_S) +#define ETH_HDR_LEN_G(x) (((x) >> ETH_HDR_LEN_S) & ETH_HDR_LEN_M) + +/* cpl_pass_accept_req.l2info fields */ +#define SYN_MAC_IDX_S 0 +#define SYN_MAC_IDX_M 0x1FF +#define SYN_MAC_IDX_V(x) ((x) << SYN_MAC_IDX_S) +#define SYN_MAC_IDX_G(x) (((x) >> SYN_MAC_IDX_S) & SYN_MAC_IDX_M) + +#define SYN_XACT_MATCH_S 9 +#define SYN_XACT_MATCH_V(x) ((x) << SYN_XACT_MATCH_S) +#define SYN_XACT_MATCH_F SYN_XACT_MATCH_V(1U) + +#define SYN_INTF_S 12 +#define SYN_INTF_M 0xF +#define SYN_INTF_V(x) ((x) << SYN_INTF_S) +#define SYN_INTF_G(x) (((x) >> SYN_INTF_S) & SYN_INTF_M) + +struct ulptx_idata { + __be32 cmd_more; + __be32 len; +}; + +#define ULPTX_NSGE_S 0 +#define ULPTX_NSGE_M 0xFFFF +#define ULPTX_NSGE_V(x) ((x) << ULPTX_NSGE_S) + +#define RX_DACK_MODE_S 29 +#define RX_DACK_MODE_M 0x3 +#define RX_DACK_MODE_V(x) ((x) << RX_DACK_MODE_S) +#define RX_DACK_MODE_G(x) (((x) >> RX_DACK_MODE_S) & RX_DACK_MODE_M) + +#define RX_DACK_CHANGE_S 31 +#define RX_DACK_CHANGE_V(x) ((x) << RX_DACK_CHANGE_S) +#define RX_DACK_CHANGE_F RX_DACK_CHANGE_V(1U) + +enum { /* TCP congestion control algorithms */ + CONG_ALG_RENO, + CONG_ALG_TAHOE, + CONG_ALG_NEWRENO, + CONG_ALG_HIGHSPEED +}; + +#define CONG_CNTRL_S 14 +#define CONG_CNTRL_M 0x3 +#define CONG_CNTRL_V(x) ((x) << CONG_CNTRL_S) +#define CONG_CNTRL_G(x) (((x) >> CONG_CNTRL_S) & CONG_CNTRL_M) + +#define T5_ISS_S 18 +#define T5_ISS_V(x) ((x) << T5_ISS_S) +#define T5_ISS_F T5_ISS_V(1U) + +#endif /* _T4FW_RI_API_H_ */ diff --git a/kernel/drivers/infiniband/hw/cxgb4/user.h b/kernel/drivers/infiniband/hw/cxgb4/user.h new file mode 100644 index 000000000..cbd0ce170 --- /dev/null +++ b/kernel/drivers/infiniband/hw/cxgb4/user.h @@ -0,0 +1,80 @@ +/* + * Copyright (c) 2009-2010 Chelsio, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. 
+ * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef __C4IW_USER_H__ +#define __C4IW_USER_H__ + +#define C4IW_UVERBS_ABI_VERSION 2 + +/* + * Make sure that all structs defined in this file remain laid out so + * that they pack the same way on 32-bit and 64-bit architectures (to + * avoid incompatibility between 32-bit userspace and 64-bit kernels). + * In particular do not use pointer types -- pass pointers in __u64 + * instead. + */ +struct c4iw_create_cq_resp { + __u64 key; + __u64 gts_key; + __u64 memsize; + __u32 cqid; + __u32 size; + __u32 qid_mask; + __u32 reserved; /* explicit padding (optional for i386) */ +}; + + +enum { + C4IW_QPF_ONCHIP = (1<<0) +}; + +struct c4iw_create_qp_resp { + __u64 ma_sync_key; + __u64 sq_key; + __u64 rq_key; + __u64 sq_db_gts_key; + __u64 rq_db_gts_key; + __u64 sq_memsize; + __u64 rq_memsize; + __u32 sqid; + __u32 rqid; + __u32 sq_size; + __u32 rq_size; + __u32 qid_mask; + __u32 flags; +}; + +struct c4iw_alloc_ucontext_resp { + __u64 status_page_key; + __u32 status_page_size; + __u32 reserved; /* explicit padding (optional for i386) */ +}; +#endif diff --git a/kernel/drivers/infiniband/hw/ehca/Kconfig b/kernel/drivers/infiniband/hw/ehca/Kconfig new file mode 100644 index 000000000..59f807d8d --- /dev/null +++ b/kernel/drivers/infiniband/hw/ehca/Kconfig @@ -0,0 +1,9 @@ +config INFINIBAND_EHCA + tristate "eHCA support" + depends on IBMEBUS + ---help--- + This driver supports the IBM pSeries eHCA InfiniBand adapter. + + To compile the driver as a module, choose M here. The module + will be called ib_ehca. + diff --git a/kernel/drivers/infiniband/hw/ehca/Makefile b/kernel/drivers/infiniband/hw/ehca/Makefile new file mode 100644 index 000000000..74d284e46 --- /dev/null +++ b/kernel/drivers/infiniband/hw/ehca/Makefile @@ -0,0 +1,16 @@ +# Authors: Heiko J Schick +# Christoph Raisch +# Joachim Fenkes +# +# Copyright (c) 2005 IBM Corporation +# +# All rights reserved. +# +# This source code is distributed under a dual license of GPL v2.0 and OpenIB BSD. + +obj-$(CONFIG_INFINIBAND_EHCA) += ib_ehca.o + +ib_ehca-objs = ehca_main.o ehca_hca.o ehca_mcast.o ehca_pd.o ehca_av.o ehca_eq.o \ + ehca_cq.o ehca_qp.o ehca_sqp.o ehca_mrmw.o ehca_reqs.o ehca_irq.o \ + ehca_uverbs.o ipz_pt_fn.o hcp_if.o hcp_phyp.o + diff --git a/kernel/drivers/infiniband/hw/ehca/ehca_av.c b/kernel/drivers/infiniband/hw/ehca/ehca_av.c new file mode 100644 index 000000000..465926319 --- /dev/null +++ b/kernel/drivers/infiniband/hw/ehca/ehca_av.c @@ -0,0 +1,277 @@ +/* + * IBM eServer eHCA Infiniband device driver for Linux on POWER + * + * address vector functions + * + * Authors: Hoang-Nam Nguyen + * Khadija Souissi + * Reinhard Ernst + * Christoph Raisch + * + * Copyright (c) 2005 IBM Corporation + * + * All rights reserved. + * + * This source code is distributed under a dual license of GPL v2.0 and OpenIB + * BSD. 
+ * + * OpenIB BSD License + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials + * provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR + * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER + * IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#include + +#include "ehca_tools.h" +#include "ehca_iverbs.h" +#include "hcp_if.h" + +static struct kmem_cache *av_cache; + +int ehca_calc_ipd(struct ehca_shca *shca, int port, + enum ib_rate path_rate, u32 *ipd) +{ + int path = ib_rate_to_mult(path_rate); + int link, ret; + struct ib_port_attr pa; + + if (path_rate == IB_RATE_PORT_CURRENT) { + *ipd = 0; + return 0; + } + + if (unlikely(path < 0)) { + ehca_err(&shca->ib_device, "Invalid static rate! 
path_rate=%x", + path_rate); + return -EINVAL; + } + + ret = ehca_query_port(&shca->ib_device, port, &pa); + if (unlikely(ret < 0)) { + ehca_err(&shca->ib_device, "Failed to query port ret=%i", ret); + return ret; + } + + link = ib_width_enum_to_int(pa.active_width) * pa.active_speed; + + if (path >= link) + /* no need to throttle if path faster than link */ + *ipd = 0; + else + /* IPD = round((link / path) - 1) */ + *ipd = ((link + (path >> 1)) / path) - 1; + + return 0; +} + +struct ib_ah *ehca_create_ah(struct ib_pd *pd, struct ib_ah_attr *ah_attr) +{ + int ret; + struct ehca_av *av; + struct ehca_shca *shca = container_of(pd->device, struct ehca_shca, + ib_device); + + av = kmem_cache_alloc(av_cache, GFP_KERNEL); + if (!av) { + ehca_err(pd->device, "Out of memory pd=%p ah_attr=%p", + pd, ah_attr); + return ERR_PTR(-ENOMEM); + } + + av->av.sl = ah_attr->sl; + av->av.dlid = ah_attr->dlid; + av->av.slid_path_bits = ah_attr->src_path_bits; + + if (ehca_static_rate < 0) { + u32 ipd; + if (ehca_calc_ipd(shca, ah_attr->port_num, + ah_attr->static_rate, &ipd)) { + ret = -EINVAL; + goto create_ah_exit1; + } + av->av.ipd = ipd; + } else + av->av.ipd = ehca_static_rate; + + av->av.lnh = ah_attr->ah_flags; + av->av.grh.word_0 = EHCA_BMASK_SET(GRH_IPVERSION_MASK, 6); + av->av.grh.word_0 |= EHCA_BMASK_SET(GRH_TCLASS_MASK, + ah_attr->grh.traffic_class); + av->av.grh.word_0 |= EHCA_BMASK_SET(GRH_FLOWLABEL_MASK, + ah_attr->grh.flow_label); + av->av.grh.word_0 |= EHCA_BMASK_SET(GRH_HOPLIMIT_MASK, + ah_attr->grh.hop_limit); + av->av.grh.word_0 |= EHCA_BMASK_SET(GRH_NEXTHEADER_MASK, 0x1B); + /* set sgid in grh.word_1 */ + if (ah_attr->ah_flags & IB_AH_GRH) { + int rc; + struct ib_port_attr port_attr; + union ib_gid gid; + memset(&port_attr, 0, sizeof(port_attr)); + rc = ehca_query_port(pd->device, ah_attr->port_num, + &port_attr); + if (rc) { /* invalid port number */ + ret = -EINVAL; + ehca_err(pd->device, "Invalid port number " + "ehca_query_port() returned %x " + "pd=%p ah_attr=%p", rc, pd, ah_attr); + goto create_ah_exit1; + } + memset(&gid, 0, sizeof(gid)); + rc = ehca_query_gid(pd->device, + ah_attr->port_num, + ah_attr->grh.sgid_index, &gid); + if (rc) { + ret = -EINVAL; + ehca_err(pd->device, "Failed to retrieve sgid " + "ehca_query_gid() returned %x " + "pd=%p ah_attr=%p", rc, pd, ah_attr); + goto create_ah_exit1; + } + memcpy(&av->av.grh.word_1, &gid, sizeof(gid)); + } + av->av.pmtu = shca->max_mtu; + + /* dgid comes in grh.word_3 */ + memcpy(&av->av.grh.word_3, &ah_attr->grh.dgid, + sizeof(ah_attr->grh.dgid)); + + return &av->ib_ah; + +create_ah_exit1: + kmem_cache_free(av_cache, av); + + return ERR_PTR(ret); +} + +int ehca_modify_ah(struct ib_ah *ah, struct ib_ah_attr *ah_attr) +{ + struct ehca_av *av; + struct ehca_ud_av new_ehca_av; + struct ehca_shca *shca = container_of(ah->pd->device, struct ehca_shca, + ib_device); + + memset(&new_ehca_av, 0, sizeof(new_ehca_av)); + new_ehca_av.sl = ah_attr->sl; + new_ehca_av.dlid = ah_attr->dlid; + new_ehca_av.slid_path_bits = ah_attr->src_path_bits; + new_ehca_av.ipd = ah_attr->static_rate; + new_ehca_av.lnh = EHCA_BMASK_SET(GRH_FLAG_MASK, + (ah_attr->ah_flags & IB_AH_GRH) > 0); + new_ehca_av.grh.word_0 = EHCA_BMASK_SET(GRH_TCLASS_MASK, + ah_attr->grh.traffic_class); + new_ehca_av.grh.word_0 |= EHCA_BMASK_SET(GRH_FLOWLABEL_MASK, + ah_attr->grh.flow_label); + new_ehca_av.grh.word_0 |= EHCA_BMASK_SET(GRH_HOPLIMIT_MASK, + ah_attr->grh.hop_limit); + new_ehca_av.grh.word_0 |= EHCA_BMASK_SET(GRH_NEXTHEADER_MASK, 0x1b); + + /* set sgid in grh.word_1 */ + if 
(ah_attr->ah_flags & IB_AH_GRH) { + int rc; + struct ib_port_attr port_attr; + union ib_gid gid; + memset(&port_attr, 0, sizeof(port_attr)); + rc = ehca_query_port(ah->device, ah_attr->port_num, + &port_attr); + if (rc) { /* invalid port number */ + ehca_err(ah->device, "Invalid port number " + "ehca_query_port() returned %x " + "ah=%p ah_attr=%p port_num=%x", + rc, ah, ah_attr, ah_attr->port_num); + return -EINVAL; + } + memset(&gid, 0, sizeof(gid)); + rc = ehca_query_gid(ah->device, + ah_attr->port_num, + ah_attr->grh.sgid_index, &gid); + if (rc) { + ehca_err(ah->device, "Failed to retrieve sgid " + "ehca_query_gid() returned %x " + "ah=%p ah_attr=%p port_num=%x " + "sgid_index=%x", + rc, ah, ah_attr, ah_attr->port_num, + ah_attr->grh.sgid_index); + return -EINVAL; + } + memcpy(&new_ehca_av.grh.word_1, &gid, sizeof(gid)); + } + + new_ehca_av.pmtu = shca->max_mtu; + + memcpy(&new_ehca_av.grh.word_3, &ah_attr->grh.dgid, + sizeof(ah_attr->grh.dgid)); + + av = container_of(ah, struct ehca_av, ib_ah); + av->av = new_ehca_av; + + return 0; +} + +int ehca_query_ah(struct ib_ah *ah, struct ib_ah_attr *ah_attr) +{ + struct ehca_av *av = container_of(ah, struct ehca_av, ib_ah); + + memcpy(&ah_attr->grh.dgid, &av->av.grh.word_3, + sizeof(ah_attr->grh.dgid)); + ah_attr->sl = av->av.sl; + + ah_attr->dlid = av->av.dlid; + + ah_attr->src_path_bits = av->av.slid_path_bits; + ah_attr->static_rate = av->av.ipd; + ah_attr->ah_flags = EHCA_BMASK_GET(GRH_FLAG_MASK, av->av.lnh); + ah_attr->grh.traffic_class = EHCA_BMASK_GET(GRH_TCLASS_MASK, + av->av.grh.word_0); + ah_attr->grh.hop_limit = EHCA_BMASK_GET(GRH_HOPLIMIT_MASK, + av->av.grh.word_0); + ah_attr->grh.flow_label = EHCA_BMASK_GET(GRH_FLOWLABEL_MASK, + av->av.grh.word_0); + + return 0; +} + +int ehca_destroy_ah(struct ib_ah *ah) +{ + kmem_cache_free(av_cache, container_of(ah, struct ehca_av, ib_ah)); + + return 0; +} + +int ehca_init_av_cache(void) +{ + av_cache = kmem_cache_create("ehca_cache_av", + sizeof(struct ehca_av), 0, + SLAB_HWCACHE_ALIGN, + NULL); + if (!av_cache) + return -ENOMEM; + return 0; +} + +void ehca_cleanup_av_cache(void) +{ + if (av_cache) + kmem_cache_destroy(av_cache); +} diff --git a/kernel/drivers/infiniband/hw/ehca/ehca_classes.h b/kernel/drivers/infiniband/hw/ehca/ehca_classes.h new file mode 100644 index 000000000..bd45e0f39 --- /dev/null +++ b/kernel/drivers/infiniband/hw/ehca/ehca_classes.h @@ -0,0 +1,482 @@ +/* + * IBM eServer eHCA Infiniband device driver for Linux on POWER + * + * Struct definition for eHCA internal structures + * + * Authors: Heiko J Schick + * Christoph Raisch + * Joachim Fenkes + * + * Copyright (c) 2005 IBM Corporation + * + * All rights reserved. + * + * This source code is distributed under a dual license of GPL v2.0 and OpenIB + * BSD. + * + * OpenIB BSD License + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials + * provided with the distribution. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR + * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER + * IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef __EHCA_CLASSES_H__ +#define __EHCA_CLASSES_H__ + +struct ehca_module; +struct ehca_qp; +struct ehca_cq; +struct ehca_eq; +struct ehca_mr; +struct ehca_mw; +struct ehca_pd; +struct ehca_av; + +#include +#include + +#include +#include + +#ifdef CONFIG_PPC64 +#include "ehca_classes_pSeries.h" +#endif +#include "ipz_pt_fn.h" +#include "ehca_qes.h" +#include "ehca_irq.h" + +#define EHCA_EQE_CACHE_SIZE 20 +#define EHCA_MAX_NUM_QUEUES 0xffff + +struct ehca_eqe_cache_entry { + struct ehca_eqe *eqe; + struct ehca_cq *cq; +}; + +struct ehca_eq { + u32 length; + struct ipz_queue ipz_queue; + struct ipz_eq_handle ipz_eq_handle; + struct work_struct work; + struct h_galpas galpas; + int is_initialized; + struct ehca_pfeq pf; + spinlock_t spinlock; + struct tasklet_struct interrupt_task; + u32 ist; + spinlock_t irq_spinlock; + struct ehca_eqe_cache_entry eqe_cache[EHCA_EQE_CACHE_SIZE]; +}; + +struct ehca_sma_attr { + u16 lid, lmc, sm_sl, sm_lid; + u16 pkey_tbl_len, pkeys[16]; +}; + +struct ehca_sport { + struct ib_cq *ibcq_aqp1; + struct ib_qp *ibqp_sqp[2]; + /* lock to serialze modify_qp() calls for sqp in normal + * and irq path (when event PORT_ACTIVE is received first time) + */ + spinlock_t mod_sqp_lock; + enum ib_port_state port_state; + struct ehca_sma_attr saved_attr; + u32 pma_qp_nr; +}; + +#define HCA_CAP_MR_PGSIZE_4K 0x80000000 +#define HCA_CAP_MR_PGSIZE_64K 0x40000000 +#define HCA_CAP_MR_PGSIZE_1M 0x20000000 +#define HCA_CAP_MR_PGSIZE_16M 0x10000000 + +struct ehca_shca { + struct ib_device ib_device; + struct platform_device *ofdev; + u8 num_ports; + int hw_level; + struct list_head shca_list; + struct ipz_adapter_handle ipz_hca_handle; + struct ehca_sport sport[2]; + struct ehca_eq eq; + struct ehca_eq neq; + struct ehca_mr *maxmr; + struct ehca_pd *pd; + struct h_galpas galpas; + struct mutex modify_mutex; + u64 hca_cap; + /* MR pgsize: bit 0-3 means 4K, 64K, 1M, 16M respectively */ + u32 hca_cap_mr_pgsize; + int max_mtu; + int max_num_qps; + int max_num_cqs; + atomic_t num_cqs; + atomic_t num_qps; +}; + +struct ehca_pd { + struct ib_pd ib_pd; + struct ipz_pd fw_pd; + /* small queue mgmt */ + struct mutex lock; + struct list_head free[2]; + struct list_head full[2]; +}; + +enum ehca_ext_qp_type { + EQPT_NORMAL = 0, + EQPT_LLQP = 1, + EQPT_SRQBASE = 2, + EQPT_SRQ = 3, +}; + +/* struct to cache modify_qp()'s parms for GSI/SMI qp */ +struct ehca_mod_qp_parm { + int mask; + struct ib_qp_attr attr; +}; + +#define EHCA_MOD_QP_PARM_MAX 4 + +#define QMAP_IDX_MASK 0xFFFFULL + +/* struct for tracking if cqes have been reported to the application */ +struct ehca_qmap_entry { + u16 app_wr_id; + u8 reported; + u8 cqe_req; +}; + +struct ehca_queue_map { + struct ehca_qmap_entry *map; + unsigned int 
entries; + unsigned int tail; + unsigned int left_to_poll; + unsigned int next_wqe_idx; /* Idx to first wqe to be flushed */ +}; + +/* function to calculate the next index for the qmap */ +static inline unsigned int next_index(unsigned int cur_index, unsigned int limit) +{ + unsigned int temp = cur_index + 1; + return (temp == limit) ? 0 : temp; +} + +struct ehca_qp { + union { + struct ib_qp ib_qp; + struct ib_srq ib_srq; + }; + u32 qp_type; + enum ehca_ext_qp_type ext_type; + enum ib_qp_state state; + struct ipz_queue ipz_squeue; + struct ehca_queue_map sq_map; + struct ipz_queue ipz_rqueue; + struct ehca_queue_map rq_map; + struct h_galpas galpas; + u32 qkey; + u32 real_qp_num; + u32 token; + spinlock_t spinlock_s; + spinlock_t spinlock_r; + u32 sq_max_inline_data_size; + struct ipz_qp_handle ipz_qp_handle; + struct ehca_pfqp pf; + struct ib_qp_init_attr init_attr; + struct ehca_cq *send_cq; + struct ehca_cq *recv_cq; + unsigned int sqerr_purgeflag; + struct hlist_node list_entries; + /* array to cache modify_qp()'s parms for GSI/SMI qp */ + struct ehca_mod_qp_parm *mod_qp_parm; + int mod_qp_parm_idx; + /* mmap counter for resources mapped into user space */ + u32 mm_count_squeue; + u32 mm_count_rqueue; + u32 mm_count_galpa; + /* unsolicited ack circumvention */ + int unsol_ack_circ; + int mtu_shift; + u32 message_count; + u32 packet_count; + atomic_t nr_events; /* events seen */ + wait_queue_head_t wait_completion; + int mig_armed; + struct list_head sq_err_node; + struct list_head rq_err_node; +}; + +#define IS_SRQ(qp) (qp->ext_type == EQPT_SRQ) +#define HAS_SQ(qp) (qp->ext_type != EQPT_SRQ) +#define HAS_RQ(qp) (qp->ext_type != EQPT_SRQBASE) + +/* must be power of 2 */ +#define QP_HASHTAB_LEN 8 + +struct ehca_cq { + struct ib_cq ib_cq; + struct ipz_queue ipz_queue; + struct h_galpas galpas; + spinlock_t spinlock; + u32 cq_number; + u32 token; + u32 nr_of_entries; + struct ipz_cq_handle ipz_cq_handle; + struct ehca_pfcq pf; + spinlock_t cb_lock; + struct hlist_head qp_hashtab[QP_HASHTAB_LEN]; + struct list_head entry; + u32 nr_callbacks; /* #events assigned to cpu by scaling code */ + atomic_t nr_events; /* #events seen */ + wait_queue_head_t wait_completion; + spinlock_t task_lock; + /* mmap counter for resources mapped into user space */ + u32 mm_count_queue; + u32 mm_count_galpa; + struct list_head sqp_err_list; + struct list_head rqp_err_list; +}; + +enum ehca_mr_flag { + EHCA_MR_FLAG_FMR = 0x80000000, /* FMR, created with ehca_alloc_fmr */ + EHCA_MR_FLAG_MAXMR = 0x40000000, /* max-MR */ +}; + +struct ehca_mr { + union { + struct ib_mr ib_mr; /* must always be first in ehca_mr */ + struct ib_fmr ib_fmr; /* must always be first in ehca_mr */ + } ib; + struct ib_umem *umem; + spinlock_t mrlock; + + enum ehca_mr_flag flags; + u32 num_kpages; /* number of kernel pages */ + u32 num_hwpages; /* number of hw pages to form MR */ + u64 hwpage_size; /* hw page size used for this MR */ + int acl; /* ACL (stored here for usage in reregister) */ + u64 *start; /* virtual start address (stored here for */ + /* usage in reregister) */ + u64 size; /* size (stored here for usage in reregister) */ + u32 fmr_page_size; /* page size for FMR */ + u32 fmr_max_pages; /* max pages for FMR */ + u32 fmr_max_maps; /* max outstanding maps for FMR */ + u32 fmr_map_cnt; /* map counter for FMR */ + /* fw specific data */ + struct ipz_mrmw_handle ipz_mr_handle; /* MR handle for h-calls */ + struct h_galpas galpas; +}; + +struct ehca_mw { + struct ib_mw ib_mw; /* gen2 mw, must always be first in ehca_mw */ + 
spinlock_t mwlock; + + u8 never_bound; /* indication MW was never bound */ + struct ipz_mrmw_handle ipz_mw_handle; /* MW handle for h-calls */ + struct h_galpas galpas; +}; + +enum ehca_mr_pgi_type { + EHCA_MR_PGI_PHYS = 1, /* type of ehca_reg_phys_mr, + * ehca_rereg_phys_mr, + * ehca_reg_internal_maxmr */ + EHCA_MR_PGI_USER = 2, /* type of ehca_reg_user_mr */ + EHCA_MR_PGI_FMR = 3 /* type of ehca_map_phys_fmr */ +}; + +struct ehca_mr_pginfo { + enum ehca_mr_pgi_type type; + u64 num_kpages; + u64 kpage_cnt; + u64 hwpage_size; /* hw page size used for this MR */ + u64 num_hwpages; /* number of hw pages */ + u64 hwpage_cnt; /* counter for hw pages */ + u64 next_hwpage; /* next hw page in buffer/chunk/listelem */ + + union { + struct { /* type EHCA_MR_PGI_PHYS section */ + int num_phys_buf; + struct ib_phys_buf *phys_buf_array; + u64 next_buf; + } phy; + struct { /* type EHCA_MR_PGI_USER section */ + struct ib_umem *region; + struct scatterlist *next_sg; + u64 next_nmap; + } usr; + struct { /* type EHCA_MR_PGI_FMR section */ + u64 fmr_pgsize; + u64 *page_list; + u64 next_listelem; + } fmr; + } u; +}; + +/* output parameters for MR/FMR hipz calls */ +struct ehca_mr_hipzout_parms { + struct ipz_mrmw_handle handle; + u32 lkey; + u32 rkey; + u64 len; + u64 vaddr; + u32 acl; +}; + +/* output parameters for MW hipz calls */ +struct ehca_mw_hipzout_parms { + struct ipz_mrmw_handle handle; + u32 rkey; +}; + +struct ehca_av { + struct ib_ah ib_ah; + struct ehca_ud_av av; +}; + +struct ehca_ucontext { + struct ib_ucontext ib_ucontext; +}; + +int ehca_init_pd_cache(void); +void ehca_cleanup_pd_cache(void); +int ehca_init_cq_cache(void); +void ehca_cleanup_cq_cache(void); +int ehca_init_qp_cache(void); +void ehca_cleanup_qp_cache(void); +int ehca_init_av_cache(void); +void ehca_cleanup_av_cache(void); +int ehca_init_mrmw_cache(void); +void ehca_cleanup_mrmw_cache(void); +int ehca_init_small_qp_cache(void); +void ehca_cleanup_small_qp_cache(void); + +extern rwlock_t ehca_qp_idr_lock; +extern rwlock_t ehca_cq_idr_lock; +extern struct idr ehca_qp_idr; +extern struct idr ehca_cq_idr; +extern spinlock_t shca_list_lock; + +extern int ehca_static_rate; +extern int ehca_port_act_time; +extern bool ehca_use_hp_mr; +extern bool ehca_scaling_code; +extern int ehca_lock_hcalls; +extern int ehca_nr_ports; +extern int ehca_max_cq; +extern int ehca_max_qp; + +struct ipzu_queue_resp { + u32 qe_size; /* queue entry size */ + u32 act_nr_of_sg; + u32 queue_length; /* queue length allocated in bytes */ + u32 pagesize; + u32 toggle_state; + u32 offset; /* save offset within a page for small_qp */ +}; + +struct ehca_create_cq_resp { + u32 cq_number; + u32 token; + struct ipzu_queue_resp ipz_queue; + u32 fw_handle_ofs; + u32 dummy; +}; + +struct ehca_create_qp_resp { + u32 qp_num; + u32 token; + u32 qp_type; + u32 ext_type; + u32 qkey; + /* qp_num assigned by ehca: sqp0/1 may have got different numbers */ + u32 real_qp_num; + u32 fw_handle_ofs; + u32 dummy; + struct ipzu_queue_resp ipz_squeue; + struct ipzu_queue_resp ipz_rqueue; +}; + +struct ehca_alloc_cq_parms { + u32 nr_cqe; + u32 act_nr_of_entries; + u32 act_pages; + struct ipz_eq_handle eq_handle; +}; + +enum ehca_service_type { + ST_RC = 0, + ST_UC = 1, + ST_RD = 2, + ST_UD = 3, +}; + +enum ehca_ll_comp_flags { + LLQP_SEND_COMP = 0x20, + LLQP_RECV_COMP = 0x40, + LLQP_COMP_MASK = 0x60, +}; + +struct ehca_alloc_queue_parms { + /* input parameters */ + int max_wr; + int max_sge; + int page_size; + int is_small; + + /* output parameters */ + u16 act_nr_wqes; + u8 
act_nr_sges; + u32 queue_size; /* bytes for small queues, pages otherwise */ +}; + +struct ehca_alloc_qp_parms { + struct ehca_alloc_queue_parms squeue; + struct ehca_alloc_queue_parms rqueue; + + /* input parameters */ + enum ehca_service_type servicetype; + int qp_storage; + int sigtype; + enum ehca_ext_qp_type ext_type; + enum ehca_ll_comp_flags ll_comp_flags; + int ud_av_l_key_ctl; + + u32 token; + struct ipz_eq_handle eq_handle; + struct ipz_pd pd; + struct ipz_cq_handle send_cq_handle, recv_cq_handle; + + u32 srq_qpn, srq_token, srq_limit; + + /* output parameters */ + u32 real_qp_num; + struct ipz_qp_handle qp_handle; + struct h_galpas galpas; +}; + +int ehca_cq_assign_qp(struct ehca_cq *cq, struct ehca_qp *qp); +int ehca_cq_unassign_qp(struct ehca_cq *cq, unsigned int qp_num); +struct ehca_qp *ehca_cq_get_qp(struct ehca_cq *cq, int qp_num); + +#endif diff --git a/kernel/drivers/infiniband/hw/ehca/ehca_classes_pSeries.h b/kernel/drivers/infiniband/hw/ehca/ehca_classes_pSeries.h new file mode 100644 index 000000000..689c35786 --- /dev/null +++ b/kernel/drivers/infiniband/hw/ehca/ehca_classes_pSeries.h @@ -0,0 +1,208 @@ +/* + * IBM eServer eHCA Infiniband device driver for Linux on POWER + * + * pSeries interface definitions + * + * Authors: Waleri Fomin + * Christoph Raisch + * + * Copyright (c) 2005 IBM Corporation + * + * All rights reserved. + * + * This source code is distributed under a dual license of GPL v2.0 and OpenIB + * BSD. + * + * OpenIB BSD License + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials + * provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR + * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER + * IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#ifndef __EHCA_CLASSES_PSERIES_H__ +#define __EHCA_CLASSES_PSERIES_H__ + +#include "hcp_phyp.h" +#include "ipz_pt_fn.h" + + +struct ehca_pfqp { + struct ipz_qpt sqpt; + struct ipz_qpt rqpt; +}; + +struct ehca_pfcq { + struct ipz_qpt qpt; + u32 cqnr; +}; + +struct ehca_pfeq { + struct ipz_qpt qpt; + struct h_galpa galpa; + u32 eqnr; +}; + +struct ipz_adapter_handle { + u64 handle; +}; + +struct ipz_cq_handle { + u64 handle; +}; + +struct ipz_eq_handle { + u64 handle; +}; + +struct ipz_qp_handle { + u64 handle; +}; +struct ipz_mrmw_handle { + u64 handle; +}; + +struct ipz_pd { + u32 value; +}; + +struct hcp_modify_qp_control_block { + u32 qkey; /* 00 */ + u32 rdd; /* reliable datagram domain */ + u32 send_psn; /* 02 */ + u32 receive_psn; /* 03 */ + u32 prim_phys_port; /* 04 */ + u32 alt_phys_port; /* 05 */ + u32 prim_p_key_idx; /* 06 */ + u32 alt_p_key_idx; /* 07 */ + u32 rdma_atomic_ctrl; /* 08 */ + u32 qp_state; /* 09 */ + u32 reserved_10; /* 10 */ + u32 rdma_nr_atomic_resp_res; /* 11 */ + u32 path_migration_state; /* 12 */ + u32 rdma_atomic_outst_dest_qp; /* 13 */ + u32 dest_qp_nr; /* 14 */ + u32 min_rnr_nak_timer_field; /* 15 */ + u32 service_level; /* 16 */ + u32 send_grh_flag; /* 17 */ + u32 retry_count; /* 18 */ + u32 timeout; /* 19 */ + u32 path_mtu; /* 20 */ + u32 max_static_rate; /* 21 */ + u32 dlid; /* 22 */ + u32 rnr_retry_count; /* 23 */ + u32 source_path_bits; /* 24 */ + u32 traffic_class; /* 25 */ + u32 hop_limit; /* 26 */ + u32 source_gid_idx; /* 27 */ + u32 flow_label; /* 28 */ + u32 reserved_29; /* 29 */ + union { /* 30 */ + u64 dw[2]; + u8 byte[16]; + } dest_gid; + u32 service_level_al; /* 34 */ + u32 send_grh_flag_al; /* 35 */ + u32 retry_count_al; /* 36 */ + u32 timeout_al; /* 37 */ + u32 max_static_rate_al; /* 38 */ + u32 dlid_al; /* 39 */ + u32 rnr_retry_count_al; /* 40 */ + u32 source_path_bits_al; /* 41 */ + u32 traffic_class_al; /* 42 */ + u32 hop_limit_al; /* 43 */ + u32 source_gid_idx_al; /* 44 */ + u32 flow_label_al; /* 45 */ + u32 reserved_46; /* 46 */ + u32 reserved_47; /* 47 */ + union { /* 48 */ + u64 dw[2]; + u8 byte[16]; + } dest_gid_al; + u32 max_nr_outst_send_wr; /* 52 */ + u32 max_nr_outst_recv_wr; /* 53 */ + u32 disable_ete_credit_check; /* 54 */ + u32 qp_number; /* 55 */ + u64 send_queue_handle; /* 56 */ + u64 recv_queue_handle; /* 58 */ + u32 actual_nr_sges_in_sq_wqe; /* 60 */ + u32 actual_nr_sges_in_rq_wqe; /* 61 */ + u32 qp_enable; /* 62 */ + u32 curr_srq_limit; /* 63 */ + u64 qp_aff_asyn_ev_log_reg; /* 64 */ + u64 shared_rq_hndl; /* 66 */ + u64 trigg_doorbell_qp_hndl; /* 68 */ + u32 reserved_70_127[58]; /* 70 */ +}; + +#define MQPCB_MASK_QKEY EHCA_BMASK_IBM( 0, 0) +#define MQPCB_MASK_SEND_PSN EHCA_BMASK_IBM( 2, 2) +#define MQPCB_MASK_RECEIVE_PSN EHCA_BMASK_IBM( 3, 3) +#define MQPCB_MASK_PRIM_PHYS_PORT EHCA_BMASK_IBM( 4, 4) +#define MQPCB_PRIM_PHYS_PORT EHCA_BMASK_IBM(24, 31) +#define MQPCB_MASK_ALT_PHYS_PORT EHCA_BMASK_IBM( 5, 5) +#define MQPCB_MASK_PRIM_P_KEY_IDX EHCA_BMASK_IBM( 6, 6) +#define MQPCB_PRIM_P_KEY_IDX EHCA_BMASK_IBM(24, 31) +#define MQPCB_MASK_ALT_P_KEY_IDX EHCA_BMASK_IBM( 7, 7) +#define MQPCB_MASK_RDMA_ATOMIC_CTRL EHCA_BMASK_IBM( 8, 8) +#define MQPCB_MASK_QP_STATE EHCA_BMASK_IBM( 9, 9) +#define MQPCB_MASK_RDMA_NR_ATOMIC_RESP_RES EHCA_BMASK_IBM(11, 11) +#define MQPCB_MASK_PATH_MIGRATION_STATE EHCA_BMASK_IBM(12, 12) +#define MQPCB_MASK_RDMA_ATOMIC_OUTST_DEST_QP EHCA_BMASK_IBM(13, 13) +#define MQPCB_MASK_DEST_QP_NR EHCA_BMASK_IBM(14, 14) +#define MQPCB_MASK_MIN_RNR_NAK_TIMER_FIELD EHCA_BMASK_IBM(15, 15) +#define 
MQPCB_MASK_SERVICE_LEVEL EHCA_BMASK_IBM(16, 16) +#define MQPCB_MASK_SEND_GRH_FLAG EHCA_BMASK_IBM(17, 17) +#define MQPCB_MASK_RETRY_COUNT EHCA_BMASK_IBM(18, 18) +#define MQPCB_MASK_TIMEOUT EHCA_BMASK_IBM(19, 19) +#define MQPCB_MASK_PATH_MTU EHCA_BMASK_IBM(20, 20) +#define MQPCB_MASK_MAX_STATIC_RATE EHCA_BMASK_IBM(21, 21) +#define MQPCB_MASK_DLID EHCA_BMASK_IBM(22, 22) +#define MQPCB_MASK_RNR_RETRY_COUNT EHCA_BMASK_IBM(23, 23) +#define MQPCB_MASK_SOURCE_PATH_BITS EHCA_BMASK_IBM(24, 24) +#define MQPCB_MASK_TRAFFIC_CLASS EHCA_BMASK_IBM(25, 25) +#define MQPCB_MASK_HOP_LIMIT EHCA_BMASK_IBM(26, 26) +#define MQPCB_MASK_SOURCE_GID_IDX EHCA_BMASK_IBM(27, 27) +#define MQPCB_MASK_FLOW_LABEL EHCA_BMASK_IBM(28, 28) +#define MQPCB_MASK_DEST_GID EHCA_BMASK_IBM(30, 30) +#define MQPCB_MASK_SERVICE_LEVEL_AL EHCA_BMASK_IBM(31, 31) +#define MQPCB_MASK_SEND_GRH_FLAG_AL EHCA_BMASK_IBM(32, 32) +#define MQPCB_MASK_RETRY_COUNT_AL EHCA_BMASK_IBM(33, 33) +#define MQPCB_MASK_TIMEOUT_AL EHCA_BMASK_IBM(34, 34) +#define MQPCB_MASK_MAX_STATIC_RATE_AL EHCA_BMASK_IBM(35, 35) +#define MQPCB_MASK_DLID_AL EHCA_BMASK_IBM(36, 36) +#define MQPCB_MASK_RNR_RETRY_COUNT_AL EHCA_BMASK_IBM(37, 37) +#define MQPCB_MASK_SOURCE_PATH_BITS_AL EHCA_BMASK_IBM(38, 38) +#define MQPCB_MASK_TRAFFIC_CLASS_AL EHCA_BMASK_IBM(39, 39) +#define MQPCB_MASK_HOP_LIMIT_AL EHCA_BMASK_IBM(40, 40) +#define MQPCB_MASK_SOURCE_GID_IDX_AL EHCA_BMASK_IBM(41, 41) +#define MQPCB_MASK_FLOW_LABEL_AL EHCA_BMASK_IBM(42, 42) +#define MQPCB_MASK_DEST_GID_AL EHCA_BMASK_IBM(44, 44) +#define MQPCB_MASK_MAX_NR_OUTST_SEND_WR EHCA_BMASK_IBM(45, 45) +#define MQPCB_MASK_MAX_NR_OUTST_RECV_WR EHCA_BMASK_IBM(46, 46) +#define MQPCB_MASK_DISABLE_ETE_CREDIT_CHECK EHCA_BMASK_IBM(47, 47) +#define MQPCB_MASK_QP_ENABLE EHCA_BMASK_IBM(48, 48) +#define MQPCB_MASK_CURR_SRQ_LIMIT EHCA_BMASK_IBM(49, 49) +#define MQPCB_MASK_QP_AFF_ASYN_EV_LOG_REG EHCA_BMASK_IBM(50, 50) +#define MQPCB_MASK_SHARED_RQ_HNDL EHCA_BMASK_IBM(51, 51) + +#endif /* __EHCA_CLASSES_PSERIES_H__ */ diff --git a/kernel/drivers/infiniband/hw/ehca/ehca_cq.c b/kernel/drivers/infiniband/hw/ehca/ehca_cq.c new file mode 100644 index 000000000..8cc837537 --- /dev/null +++ b/kernel/drivers/infiniband/hw/ehca/ehca_cq.c @@ -0,0 +1,392 @@ +/* + * IBM eServer eHCA Infiniband device driver for Linux on POWER + * + * Completion queue handling + * + * Authors: Waleri Fomin + * Khadija Souissi + * Reinhard Ernst + * Heiko J Schick + * Hoang-Nam Nguyen + * + * + * Copyright (c) 2005 IBM Corporation + * + * All rights reserved. + * + * This source code is distributed under a dual license of GPL v2.0 and OpenIB + * BSD. + * + * OpenIB BSD License + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials + * provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR + * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER + * IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#include + +#include "ehca_iverbs.h" +#include "ehca_classes.h" +#include "ehca_irq.h" +#include "hcp_if.h" + +static struct kmem_cache *cq_cache; + +int ehca_cq_assign_qp(struct ehca_cq *cq, struct ehca_qp *qp) +{ + unsigned int qp_num = qp->real_qp_num; + unsigned int key = qp_num & (QP_HASHTAB_LEN-1); + unsigned long flags; + + spin_lock_irqsave(&cq->spinlock, flags); + hlist_add_head(&qp->list_entries, &cq->qp_hashtab[key]); + spin_unlock_irqrestore(&cq->spinlock, flags); + + ehca_dbg(cq->ib_cq.device, "cq_num=%x real_qp_num=%x", + cq->cq_number, qp_num); + + return 0; +} + +int ehca_cq_unassign_qp(struct ehca_cq *cq, unsigned int real_qp_num) +{ + int ret = -EINVAL; + unsigned int key = real_qp_num & (QP_HASHTAB_LEN-1); + struct hlist_node *iter; + struct ehca_qp *qp; + unsigned long flags; + + spin_lock_irqsave(&cq->spinlock, flags); + hlist_for_each(iter, &cq->qp_hashtab[key]) { + qp = hlist_entry(iter, struct ehca_qp, list_entries); + if (qp->real_qp_num == real_qp_num) { + hlist_del(iter); + ehca_dbg(cq->ib_cq.device, + "removed qp from cq .cq_num=%x real_qp_num=%x", + cq->cq_number, real_qp_num); + ret = 0; + break; + } + } + spin_unlock_irqrestore(&cq->spinlock, flags); + if (ret) + ehca_err(cq->ib_cq.device, + "qp not found cq_num=%x real_qp_num=%x", + cq->cq_number, real_qp_num); + + return ret; +} + +struct ehca_qp *ehca_cq_get_qp(struct ehca_cq *cq, int real_qp_num) +{ + struct ehca_qp *ret = NULL; + unsigned int key = real_qp_num & (QP_HASHTAB_LEN-1); + struct hlist_node *iter; + struct ehca_qp *qp; + hlist_for_each(iter, &cq->qp_hashtab[key]) { + qp = hlist_entry(iter, struct ehca_qp, list_entries); + if (qp->real_qp_num == real_qp_num) { + ret = qp; + break; + } + } + return ret; +} + +struct ib_cq *ehca_create_cq(struct ib_device *device, int cqe, int comp_vector, + struct ib_ucontext *context, + struct ib_udata *udata) +{ + static const u32 additional_cqe = 20; + struct ib_cq *cq; + struct ehca_cq *my_cq; + struct ehca_shca *shca = + container_of(device, struct ehca_shca, ib_device); + struct ipz_adapter_handle adapter_handle; + struct ehca_alloc_cq_parms param; /* h_call's out parameters */ + struct h_galpa gal; + void *vpage; + u32 counter; + u64 rpage, cqx_fec, h_ret; + int ipz_rc, i; + unsigned long flags; + + if (cqe >= 0xFFFFFFFF - 64 - additional_cqe) + return ERR_PTR(-EINVAL); + + if (!atomic_add_unless(&shca->num_cqs, 1, shca->max_num_cqs)) { + ehca_err(device, "Unable to create CQ, max number of %i " + "CQs reached.", shca->max_num_cqs); + ehca_err(device, "To increase the maximum number of CQs " + "use the number_of_cqs module parameter.\n"); + return ERR_PTR(-ENOSPC); + } + + my_cq = kmem_cache_zalloc(cq_cache, GFP_KERNEL); + if (!my_cq) { + ehca_err(device, "Out of memory for ehca_cq struct device=%p", + device); + atomic_dec(&shca->num_cqs); + return ERR_PTR(-ENOMEM); + } + + memset(¶m, 0, sizeof(struct ehca_alloc_cq_parms)); + + spin_lock_init(&my_cq->spinlock); + spin_lock_init(&my_cq->cb_lock); + 
spin_lock_init(&my_cq->task_lock); + atomic_set(&my_cq->nr_events, 0); + init_waitqueue_head(&my_cq->wait_completion); + + cq = &my_cq->ib_cq; + + adapter_handle = shca->ipz_hca_handle; + param.eq_handle = shca->eq.ipz_eq_handle; + + idr_preload(GFP_KERNEL); + write_lock_irqsave(&ehca_cq_idr_lock, flags); + my_cq->token = idr_alloc(&ehca_cq_idr, my_cq, 0, 0x2000000, GFP_NOWAIT); + write_unlock_irqrestore(&ehca_cq_idr_lock, flags); + idr_preload_end(); + + if (my_cq->token < 0) { + cq = ERR_PTR(-ENOMEM); + ehca_err(device, "Can't allocate new idr entry. device=%p", + device); + goto create_cq_exit1; + } + + /* + * CQs maximum depth is 4GB-64, but we need additional 20 as buffer + * for receiving errors CQEs. + */ + param.nr_cqe = cqe + additional_cqe; + h_ret = hipz_h_alloc_resource_cq(adapter_handle, my_cq, ¶m); + + if (h_ret != H_SUCCESS) { + ehca_err(device, "hipz_h_alloc_resource_cq() failed " + "h_ret=%lli device=%p", h_ret, device); + cq = ERR_PTR(ehca2ib_return_code(h_ret)); + goto create_cq_exit2; + } + + ipz_rc = ipz_queue_ctor(NULL, &my_cq->ipz_queue, param.act_pages, + EHCA_PAGESIZE, sizeof(struct ehca_cqe), 0, 0); + if (!ipz_rc) { + ehca_err(device, "ipz_queue_ctor() failed ipz_rc=%i device=%p", + ipz_rc, device); + cq = ERR_PTR(-EINVAL); + goto create_cq_exit3; + } + + for (counter = 0; counter < param.act_pages; counter++) { + vpage = ipz_qpageit_get_inc(&my_cq->ipz_queue); + if (!vpage) { + ehca_err(device, "ipz_qpageit_get_inc() " + "returns NULL device=%p", device); + cq = ERR_PTR(-EAGAIN); + goto create_cq_exit4; + } + rpage = __pa(vpage); + + h_ret = hipz_h_register_rpage_cq(adapter_handle, + my_cq->ipz_cq_handle, + &my_cq->pf, + 0, + 0, + rpage, + 1, + my_cq->galpas. + kernel); + + if (h_ret < H_SUCCESS) { + ehca_err(device, "hipz_h_register_rpage_cq() failed " + "ehca_cq=%p cq_num=%x h_ret=%lli counter=%i " + "act_pages=%i", my_cq, my_cq->cq_number, + h_ret, counter, param.act_pages); + cq = ERR_PTR(-EINVAL); + goto create_cq_exit4; + } + + if (counter == (param.act_pages - 1)) { + vpage = ipz_qpageit_get_inc(&my_cq->ipz_queue); + if ((h_ret != H_SUCCESS) || vpage) { + ehca_err(device, "Registration of pages not " + "complete ehca_cq=%p cq_num=%x " + "h_ret=%lli", my_cq, my_cq->cq_number, + h_ret); + cq = ERR_PTR(-EAGAIN); + goto create_cq_exit4; + } + } else { + if (h_ret != H_PAGE_REGISTERED) { + ehca_err(device, "Registration of page failed " + "ehca_cq=%p cq_num=%x h_ret=%lli " + "counter=%i act_pages=%i", + my_cq, my_cq->cq_number, + h_ret, counter, param.act_pages); + cq = ERR_PTR(-ENOMEM); + goto create_cq_exit4; + } + } + } + + ipz_qeit_reset(&my_cq->ipz_queue); + + gal = my_cq->galpas.kernel; + cqx_fec = hipz_galpa_load(gal, CQTEMM_OFFSET(cqx_fec)); + ehca_dbg(device, "ehca_cq=%p cq_num=%x CQX_FEC=%llx", + my_cq, my_cq->cq_number, cqx_fec); + + my_cq->ib_cq.cqe = my_cq->nr_of_entries = + param.act_nr_of_entries - additional_cqe; + my_cq->cq_number = (my_cq->ipz_cq_handle.handle) & 0xffff; + + for (i = 0; i < QP_HASHTAB_LEN; i++) + INIT_HLIST_HEAD(&my_cq->qp_hashtab[i]); + + INIT_LIST_HEAD(&my_cq->sqp_err_list); + INIT_LIST_HEAD(&my_cq->rqp_err_list); + + if (context) { + struct ipz_queue *ipz_queue = &my_cq->ipz_queue; + struct ehca_create_cq_resp resp; + memset(&resp, 0, sizeof(resp)); + resp.cq_number = my_cq->cq_number; + resp.token = my_cq->token; + resp.ipz_queue.qe_size = ipz_queue->qe_size; + resp.ipz_queue.act_nr_of_sg = ipz_queue->act_nr_of_sg; + resp.ipz_queue.queue_length = ipz_queue->queue_length; + resp.ipz_queue.pagesize = ipz_queue->pagesize; + 
resp.ipz_queue.toggle_state = ipz_queue->toggle_state; + resp.fw_handle_ofs = (u32) + (my_cq->galpas.user.fw_handle & (PAGE_SIZE - 1)); + if (ib_copy_to_udata(udata, &resp, sizeof(resp))) { + ehca_err(device, "Copy to udata failed."); + cq = ERR_PTR(-EFAULT); + goto create_cq_exit4; + } + } + + return cq; + +create_cq_exit4: + ipz_queue_dtor(NULL, &my_cq->ipz_queue); + +create_cq_exit3: + h_ret = hipz_h_destroy_cq(adapter_handle, my_cq, 1); + if (h_ret != H_SUCCESS) + ehca_err(device, "hipz_h_destroy_cq() failed ehca_cq=%p " + "cq_num=%x h_ret=%lli", my_cq, my_cq->cq_number, h_ret); + +create_cq_exit2: + write_lock_irqsave(&ehca_cq_idr_lock, flags); + idr_remove(&ehca_cq_idr, my_cq->token); + write_unlock_irqrestore(&ehca_cq_idr_lock, flags); + +create_cq_exit1: + kmem_cache_free(cq_cache, my_cq); + + atomic_dec(&shca->num_cqs); + return cq; +} + +int ehca_destroy_cq(struct ib_cq *cq) +{ + u64 h_ret; + struct ehca_cq *my_cq = container_of(cq, struct ehca_cq, ib_cq); + int cq_num = my_cq->cq_number; + struct ib_device *device = cq->device; + struct ehca_shca *shca = container_of(device, struct ehca_shca, + ib_device); + struct ipz_adapter_handle adapter_handle = shca->ipz_hca_handle; + unsigned long flags; + + if (cq->uobject) { + if (my_cq->mm_count_galpa || my_cq->mm_count_queue) { + ehca_err(device, "Resources still referenced in " + "user space cq_num=%x", my_cq->cq_number); + return -EINVAL; + } + } + + /* + * remove the CQ from the idr first to make sure + * no more interrupt tasklets will touch this CQ + */ + write_lock_irqsave(&ehca_cq_idr_lock, flags); + idr_remove(&ehca_cq_idr, my_cq->token); + write_unlock_irqrestore(&ehca_cq_idr_lock, flags); + + /* now wait until all pending events have completed */ + wait_event(my_cq->wait_completion, !atomic_read(&my_cq->nr_events)); + + /* nobody's using our CQ any longer -- we can destroy it */ + h_ret = hipz_h_destroy_cq(adapter_handle, my_cq, 0); + if (h_ret == H_R_STATE) { + /* cq in err: read err data and destroy it forcibly */ + ehca_dbg(device, "ehca_cq=%p cq_num=%x resource=%llx in err " + "state. 
Try to delete it forcibly.", + my_cq, cq_num, my_cq->ipz_cq_handle.handle); + ehca_error_data(shca, my_cq, my_cq->ipz_cq_handle.handle); + h_ret = hipz_h_destroy_cq(adapter_handle, my_cq, 1); + if (h_ret == H_SUCCESS) + ehca_dbg(device, "cq_num=%x deleted successfully.", + cq_num); + } + if (h_ret != H_SUCCESS) { + ehca_err(device, "hipz_h_destroy_cq() failed h_ret=%lli " + "ehca_cq=%p cq_num=%x", h_ret, my_cq, cq_num); + return ehca2ib_return_code(h_ret); + } + ipz_queue_dtor(NULL, &my_cq->ipz_queue); + kmem_cache_free(cq_cache, my_cq); + + atomic_dec(&shca->num_cqs); + return 0; +} + +int ehca_resize_cq(struct ib_cq *cq, int cqe, struct ib_udata *udata) +{ + /* TODO: proper resize needs to be done */ + ehca_err(cq->device, "not implemented yet"); + + return -EFAULT; +} + +int ehca_init_cq_cache(void) +{ + cq_cache = kmem_cache_create("ehca_cache_cq", + sizeof(struct ehca_cq), 0, + SLAB_HWCACHE_ALIGN, + NULL); + if (!cq_cache) + return -ENOMEM; + return 0; +} + +void ehca_cleanup_cq_cache(void) +{ + if (cq_cache) + kmem_cache_destroy(cq_cache); +} diff --git a/kernel/drivers/infiniband/hw/ehca/ehca_eq.c b/kernel/drivers/infiniband/hw/ehca/ehca_eq.c new file mode 100644 index 000000000..90da6747d --- /dev/null +++ b/kernel/drivers/infiniband/hw/ehca/ehca_eq.c @@ -0,0 +1,189 @@ +/* + * IBM eServer eHCA Infiniband device driver for Linux on POWER + * + * Event queue handling + * + * Authors: Waleri Fomin + * Khadija Souissi + * Reinhard Ernst + * Heiko J Schick + * Hoang-Nam Nguyen + * + * + * Copyright (c) 2005 IBM Corporation + * + * All rights reserved. + * + * This source code is distributed under a dual license of GPL v2.0 and OpenIB + * BSD. + * + * OpenIB BSD License + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials + * provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR + * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER + * IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include "ehca_classes.h" +#include "ehca_irq.h" +#include "ehca_iverbs.h" +#include "ehca_qes.h" +#include "hcp_if.h" +#include "ipz_pt_fn.h" + +int ehca_create_eq(struct ehca_shca *shca, + struct ehca_eq *eq, + const enum ehca_eq_type type, const u32 length) +{ + int ret; + u64 h_ret; + u32 nr_pages; + u32 i; + void *vpage; + struct ib_device *ib_dev = &shca->ib_device; + + spin_lock_init(&eq->spinlock); + spin_lock_init(&eq->irq_spinlock); + eq->is_initialized = 0; + + if (type != EHCA_EQ && type != EHCA_NEQ) { + ehca_err(ib_dev, "Invalid EQ type %x. eq=%p", type, eq); + return -EINVAL; + } + if (!length) { + ehca_err(ib_dev, "EQ length must not be zero. eq=%p", eq); + return -EINVAL; + } + + h_ret = hipz_h_alloc_resource_eq(shca->ipz_hca_handle, + &eq->pf, + type, + length, + &eq->ipz_eq_handle, + &eq->length, + &nr_pages, &eq->ist); + + if (h_ret != H_SUCCESS) { + ehca_err(ib_dev, "Can't allocate EQ/NEQ. eq=%p", eq); + return -EINVAL; + } + + ret = ipz_queue_ctor(NULL, &eq->ipz_queue, nr_pages, + EHCA_PAGESIZE, sizeof(struct ehca_eqe), 0, 0); + if (!ret) { + ehca_err(ib_dev, "Can't allocate EQ pages eq=%p", eq); + goto create_eq_exit1; + } + + for (i = 0; i < nr_pages; i++) { + u64 rpage; + + vpage = ipz_qpageit_get_inc(&eq->ipz_queue); + if (!vpage) + goto create_eq_exit2; + + rpage = __pa(vpage); + h_ret = hipz_h_register_rpage_eq(shca->ipz_hca_handle, + eq->ipz_eq_handle, + &eq->pf, + 0, 0, rpage, 1); + + if (i == (nr_pages - 1)) { + /* last page */ + vpage = ipz_qpageit_get_inc(&eq->ipz_queue); + if (h_ret != H_SUCCESS || vpage) + goto create_eq_exit2; + } else { + if (h_ret != H_PAGE_REGISTERED) + goto create_eq_exit2; + } + } + + ipz_qeit_reset(&eq->ipz_queue); + + /* register interrupt handlers and initialize work queues */ + if (type == EHCA_EQ) { + tasklet_init(&eq->interrupt_task, ehca_tasklet_eq, (long)shca); + + ret = ibmebus_request_irq(eq->ist, ehca_interrupt_eq, + 0, "ehca_eq", + (void *)shca); + if (ret < 0) + ehca_err(ib_dev, "Can't map interrupt handler."); + } else if (type == EHCA_NEQ) { + tasklet_init(&eq->interrupt_task, ehca_tasklet_neq, (long)shca); + + ret = ibmebus_request_irq(eq->ist, ehca_interrupt_neq, + 0, "ehca_neq", + (void *)shca); + if (ret < 0) + ehca_err(ib_dev, "Can't map interrupt handler."); + } + + eq->is_initialized = 1; + + return 0; + +create_eq_exit2: + ipz_queue_dtor(NULL, &eq->ipz_queue); + +create_eq_exit1: + hipz_h_destroy_eq(shca->ipz_hca_handle, eq); + + return -EINVAL; +} + +void *ehca_poll_eq(struct ehca_shca *shca, struct ehca_eq *eq) +{ + unsigned long flags; + void *eqe; + + spin_lock_irqsave(&eq->spinlock, flags); + eqe = ipz_eqit_eq_get_inc_valid(&eq->ipz_queue); + spin_unlock_irqrestore(&eq->spinlock, flags); + + return eqe; +} + +int ehca_destroy_eq(struct ehca_shca *shca, struct ehca_eq *eq) +{ + unsigned long flags; + u64 h_ret; + + ibmebus_free_irq(eq->ist, (void *)shca); + + spin_lock_irqsave(&shca_list_lock, flags); + eq->is_initialized = 0; + spin_unlock_irqrestore(&shca_list_lock, flags); + + tasklet_kill(&eq->interrupt_task); + + h_ret = hipz_h_destroy_eq(shca->ipz_hca_handle, eq); + + if (h_ret != H_SUCCESS) { + ehca_err(&shca->ib_device, "Can't free EQ resources."); + return -EINVAL; + } + ipz_queue_dtor(NULL, &eq->ipz_queue); + + return 0; +} diff --git a/kernel/drivers/infiniband/hw/ehca/ehca_hca.c b/kernel/drivers/infiniband/hw/ehca/ehca_hca.c new file mode 100644 index 000000000..9ed4d2588 --- /dev/null +++ b/kernel/drivers/infiniband/hw/ehca/ehca_hca.c @@ -0,0 +1,410 @@ +/* + * IBM eServer eHCA 
Infiniband device driver for Linux on POWER + * + * HCA query functions + * + * Authors: Heiko J Schick + * Christoph Raisch + * + * Copyright (c) 2005 IBM Corporation + * + * All rights reserved. + * + * This source code is distributed under a dual license of GPL v2.0 and OpenIB + * BSD. + * + * OpenIB BSD License + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials + * provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR + * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER + * IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#include + +#include "ehca_tools.h" +#include "ehca_iverbs.h" +#include "hcp_if.h" + +static unsigned int limit_uint(unsigned int value) +{ + return min_t(unsigned int, value, INT_MAX); +} + +int ehca_query_device(struct ib_device *ibdev, struct ib_device_attr *props) +{ + int i, ret = 0; + struct ehca_shca *shca = container_of(ibdev, struct ehca_shca, + ib_device); + struct hipz_query_hca *rblock; + + static const u32 cap_mapping[] = { + IB_DEVICE_RESIZE_MAX_WR, HCA_CAP_WQE_RESIZE, + IB_DEVICE_BAD_PKEY_CNTR, HCA_CAP_BAD_P_KEY_CTR, + IB_DEVICE_BAD_QKEY_CNTR, HCA_CAP_Q_KEY_VIOL_CTR, + IB_DEVICE_RAW_MULTI, HCA_CAP_RAW_PACKET_MCAST, + IB_DEVICE_AUTO_PATH_MIG, HCA_CAP_AUTO_PATH_MIG, + IB_DEVICE_CHANGE_PHY_PORT, HCA_CAP_SQD_RTS_PORT_CHANGE, + IB_DEVICE_UD_AV_PORT_ENFORCE, HCA_CAP_AH_PORT_NR_CHECK, + IB_DEVICE_CURR_QP_STATE_MOD, HCA_CAP_CUR_QP_STATE_MOD, + IB_DEVICE_SHUTDOWN_PORT, HCA_CAP_SHUTDOWN_PORT, + IB_DEVICE_INIT_TYPE, HCA_CAP_INIT_TYPE, + IB_DEVICE_PORT_ACTIVE_EVENT, HCA_CAP_PORT_ACTIVE_EVENT, + }; + + rblock = ehca_alloc_fw_ctrlblock(GFP_KERNEL); + if (!rblock) { + ehca_err(&shca->ib_device, "Can't allocate rblock memory."); + return -ENOMEM; + } + + if (hipz_h_query_hca(shca->ipz_hca_handle, rblock) != H_SUCCESS) { + ehca_err(&shca->ib_device, "Can't query device properties"); + ret = -EINVAL; + goto query_device1; + } + + memset(props, 0, sizeof(struct ib_device_attr)); + props->page_size_cap = shca->hca_cap_mr_pgsize; + props->fw_ver = rblock->hw_ver; + props->max_mr_size = rblock->max_mr_size; + props->vendor_id = rblock->vendor_id >> 8; + props->vendor_part_id = rblock->vendor_part_id >> 16; + props->hw_ver = rblock->hw_ver; + props->max_qp = limit_uint(rblock->max_qp); + props->max_qp_wr = limit_uint(rblock->max_wqes_wq); + props->max_sge = limit_uint(rblock->max_sge); + props->max_sge_rd = limit_uint(rblock->max_sge_rd); + props->max_cq = limit_uint(rblock->max_cq); + 
props->max_cqe = limit_uint(rblock->max_cqe); + props->max_mr = limit_uint(rblock->max_mr); + props->max_mw = limit_uint(rblock->max_mw); + props->max_pd = limit_uint(rblock->max_pd); + props->max_ah = limit_uint(rblock->max_ah); + props->max_ee = limit_uint(rblock->max_rd_ee_context); + props->max_rdd = limit_uint(rblock->max_rd_domain); + props->max_fmr = limit_uint(rblock->max_mr); + props->max_qp_rd_atom = limit_uint(rblock->max_rr_qp); + props->max_ee_rd_atom = limit_uint(rblock->max_rr_ee_context); + props->max_res_rd_atom = limit_uint(rblock->max_rr_hca); + props->max_qp_init_rd_atom = limit_uint(rblock->max_act_wqs_qp); + props->max_ee_init_rd_atom = limit_uint(rblock->max_act_wqs_ee_context); + + if (EHCA_BMASK_GET(HCA_CAP_SRQ, shca->hca_cap)) { + props->max_srq = limit_uint(props->max_qp); + props->max_srq_wr = limit_uint(props->max_qp_wr); + props->max_srq_sge = 3; + } + + props->max_pkeys = 16; + /* Some FW versions say 0 here; insert sensible value in that case */ + props->local_ca_ack_delay = rblock->local_ca_ack_delay ? + min_t(u8, rblock->local_ca_ack_delay, 255) : 12; + props->max_raw_ipv6_qp = limit_uint(rblock->max_raw_ipv6_qp); + props->max_raw_ethy_qp = limit_uint(rblock->max_raw_ethy_qp); + props->max_mcast_grp = limit_uint(rblock->max_mcast_grp); + props->max_mcast_qp_attach = limit_uint(rblock->max_mcast_qp_attach); + props->max_total_mcast_qp_attach + = limit_uint(rblock->max_total_mcast_qp_attach); + + /* translate device capabilities */ + props->device_cap_flags = IB_DEVICE_SYS_IMAGE_GUID | + IB_DEVICE_RC_RNR_NAK_GEN | IB_DEVICE_N_NOTIFY_CQ; + for (i = 0; i < ARRAY_SIZE(cap_mapping); i += 2) + if (rblock->hca_cap_indicators & cap_mapping[i + 1]) + props->device_cap_flags |= cap_mapping[i]; + +query_device1: + ehca_free_fw_ctrlblock(rblock); + + return ret; +} + +static enum ib_mtu map_mtu(struct ehca_shca *shca, u32 fw_mtu) +{ + switch (fw_mtu) { + case 0x1: + return IB_MTU_256; + case 0x2: + return IB_MTU_512; + case 0x3: + return IB_MTU_1024; + case 0x4: + return IB_MTU_2048; + case 0x5: + return IB_MTU_4096; + default: + ehca_err(&shca->ib_device, "Unknown MTU size: %x.", + fw_mtu); + return 0; + } +} + +static u8 map_number_of_vls(struct ehca_shca *shca, u32 vl_cap) +{ + switch (vl_cap) { + case 0x1: + return 1; + case 0x2: + return 2; + case 0x3: + return 4; + case 0x4: + return 8; + case 0x5: + return 15; + default: + ehca_err(&shca->ib_device, "invalid Vl Capability: %x.", + vl_cap); + return 0; + } +} + +int ehca_query_port(struct ib_device *ibdev, + u8 port, struct ib_port_attr *props) +{ + int ret = 0; + u64 h_ret; + struct ehca_shca *shca = container_of(ibdev, struct ehca_shca, + ib_device); + struct hipz_query_port *rblock; + + rblock = ehca_alloc_fw_ctrlblock(GFP_KERNEL); + if (!rblock) { + ehca_err(&shca->ib_device, "Can't allocate rblock memory."); + return -ENOMEM; + } + + h_ret = hipz_h_query_port(shca->ipz_hca_handle, port, rblock); + if (h_ret != H_SUCCESS) { + ehca_err(&shca->ib_device, "Can't query port properties"); + ret = -EINVAL; + goto query_port1; + } + + memset(props, 0, sizeof(struct ib_port_attr)); + + props->active_mtu = props->max_mtu = map_mtu(shca, rblock->max_mtu); + props->port_cap_flags = rblock->capability_mask; + props->gid_tbl_len = rblock->gid_tbl_len; + if (rblock->max_msg_sz) + props->max_msg_sz = rblock->max_msg_sz; + else + props->max_msg_sz = 0x1 << 31; + props->bad_pkey_cntr = rblock->bad_pkey_cntr; + props->qkey_viol_cntr = rblock->qkey_viol_cntr; + props->pkey_tbl_len = rblock->pkey_tbl_len; + props->lid = 
rblock->lid; + props->sm_lid = rblock->sm_lid; + props->lmc = rblock->lmc; + props->sm_sl = rblock->sm_sl; + props->subnet_timeout = rblock->subnet_timeout; + props->init_type_reply = rblock->init_type_reply; + props->max_vl_num = map_number_of_vls(shca, rblock->vl_cap); + + if (rblock->state && rblock->phys_width) { + props->phys_state = rblock->phys_pstate; + props->state = rblock->phys_state; + props->active_width = rblock->phys_width; + props->active_speed = rblock->phys_speed; + } else { + /* old firmware releases don't report physical + * port info, so use default values + */ + props->phys_state = 5; + props->state = rblock->state; + props->active_width = IB_WIDTH_12X; + props->active_speed = IB_SPEED_SDR; + } + +query_port1: + ehca_free_fw_ctrlblock(rblock); + + return ret; +} + +int ehca_query_sma_attr(struct ehca_shca *shca, + u8 port, struct ehca_sma_attr *attr) +{ + int ret = 0; + u64 h_ret; + struct hipz_query_port *rblock; + + rblock = ehca_alloc_fw_ctrlblock(GFP_ATOMIC); + if (!rblock) { + ehca_err(&shca->ib_device, "Can't allocate rblock memory."); + return -ENOMEM; + } + + h_ret = hipz_h_query_port(shca->ipz_hca_handle, port, rblock); + if (h_ret != H_SUCCESS) { + ehca_err(&shca->ib_device, "Can't query port properties"); + ret = -EINVAL; + goto query_sma_attr1; + } + + memset(attr, 0, sizeof(struct ehca_sma_attr)); + + attr->lid = rblock->lid; + attr->lmc = rblock->lmc; + attr->sm_sl = rblock->sm_sl; + attr->sm_lid = rblock->sm_lid; + + attr->pkey_tbl_len = rblock->pkey_tbl_len; + memcpy(attr->pkeys, rblock->pkey_entries, sizeof(attr->pkeys)); + +query_sma_attr1: + ehca_free_fw_ctrlblock(rblock); + + return ret; +} + +int ehca_query_pkey(struct ib_device *ibdev, u8 port, u16 index, u16 *pkey) +{ + int ret = 0; + u64 h_ret; + struct ehca_shca *shca; + struct hipz_query_port *rblock; + + shca = container_of(ibdev, struct ehca_shca, ib_device); + if (index > 16) { + ehca_err(&shca->ib_device, "Invalid index: %x.", index); + return -EINVAL; + } + + rblock = ehca_alloc_fw_ctrlblock(GFP_KERNEL); + if (!rblock) { + ehca_err(&shca->ib_device, "Can't allocate rblock memory."); + return -ENOMEM; + } + + h_ret = hipz_h_query_port(shca->ipz_hca_handle, port, rblock); + if (h_ret != H_SUCCESS) { + ehca_err(&shca->ib_device, "Can't query port properties"); + ret = -EINVAL; + goto query_pkey1; + } + + memcpy(pkey, &rblock->pkey_entries + index, sizeof(u16)); + +query_pkey1: + ehca_free_fw_ctrlblock(rblock); + + return ret; +} + +int ehca_query_gid(struct ib_device *ibdev, u8 port, + int index, union ib_gid *gid) +{ + int ret = 0; + u64 h_ret; + struct ehca_shca *shca = container_of(ibdev, struct ehca_shca, + ib_device); + struct hipz_query_port *rblock; + + if (index < 0 || index > 255) { + ehca_err(&shca->ib_device, "Invalid index: %x.", index); + return -EINVAL; + } + + rblock = ehca_alloc_fw_ctrlblock(GFP_KERNEL); + if (!rblock) { + ehca_err(&shca->ib_device, "Can't allocate rblock memory."); + return -ENOMEM; + } + + h_ret = hipz_h_query_port(shca->ipz_hca_handle, port, rblock); + if (h_ret != H_SUCCESS) { + ehca_err(&shca->ib_device, "Can't query port properties"); + ret = -EINVAL; + goto query_gid1; + } + + memcpy(&gid->raw[0], &rblock->gid_prefix, sizeof(u64)); + memcpy(&gid->raw[8], &rblock->guid_entries[index], sizeof(u64)); + +query_gid1: + ehca_free_fw_ctrlblock(rblock); + + return ret; +} + +static const u32 allowed_port_caps = ( + IB_PORT_SM | IB_PORT_LED_INFO_SUP | IB_PORT_CM_SUP | + IB_PORT_SNMP_TUNNEL_SUP | IB_PORT_DEVICE_MGMT_SUP | + IB_PORT_VENDOR_CLASS_SUP); + +int 
ehca_modify_port(struct ib_device *ibdev, + u8 port, int port_modify_mask, + struct ib_port_modify *props) +{ + int ret = 0; + struct ehca_shca *shca; + struct hipz_query_port *rblock; + u32 cap; + u64 hret; + + shca = container_of(ibdev, struct ehca_shca, ib_device); + if ((props->set_port_cap_mask | props->clr_port_cap_mask) + & ~allowed_port_caps) { + ehca_err(&shca->ib_device, "Non-changeable bits set in masks " + "set=%x clr=%x allowed=%x", props->set_port_cap_mask, + props->clr_port_cap_mask, allowed_port_caps); + return -EINVAL; + } + + if (mutex_lock_interruptible(&shca->modify_mutex)) + return -ERESTARTSYS; + + rblock = ehca_alloc_fw_ctrlblock(GFP_KERNEL); + if (!rblock) { + ehca_err(&shca->ib_device, "Can't allocate rblock memory."); + ret = -ENOMEM; + goto modify_port1; + } + + hret = hipz_h_query_port(shca->ipz_hca_handle, port, rblock); + if (hret != H_SUCCESS) { + ehca_err(&shca->ib_device, "Can't query port properties"); + ret = -EINVAL; + goto modify_port2; + } + + cap = (rblock->capability_mask | props->set_port_cap_mask) + & ~props->clr_port_cap_mask; + + hret = hipz_h_modify_port(shca->ipz_hca_handle, port, + cap, props->init_type, port_modify_mask); + if (hret != H_SUCCESS) { + ehca_err(&shca->ib_device, "Modify port failed h_ret=%lli", + hret); + ret = -EINVAL; + } + +modify_port2: + ehca_free_fw_ctrlblock(rblock); + +modify_port1: + mutex_unlock(&shca->modify_mutex); + + return ret; +} diff --git a/kernel/drivers/infiniband/hw/ehca/ehca_irq.c b/kernel/drivers/infiniband/hw/ehca/ehca_irq.c new file mode 100644 index 000000000..8615d7cf7 --- /dev/null +++ b/kernel/drivers/infiniband/hw/ehca/ehca_irq.c @@ -0,0 +1,870 @@ +/* + * IBM eServer eHCA Infiniband device driver for Linux on POWER + * + * Functions for EQs, NEQs and interrupts + * + * Authors: Heiko J Schick + * Khadija Souissi + * Hoang-Nam Nguyen + * Joachim Fenkes + * + * Copyright (c) 2005 IBM Corporation + * + * All rights reserved. + * + * This source code is distributed under a dual license of GPL v2.0 and OpenIB + * BSD. + * + * OpenIB BSD License + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials + * provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR + * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER + * IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include +#include + +#include "ehca_classes.h" +#include "ehca_irq.h" +#include "ehca_iverbs.h" +#include "ehca_tools.h" +#include "hcp_if.h" +#include "hipz_fns.h" +#include "ipz_pt_fn.h" + +#define EQE_COMPLETION_EVENT EHCA_BMASK_IBM( 1, 1) +#define EQE_CQ_QP_NUMBER EHCA_BMASK_IBM( 8, 31) +#define EQE_EE_IDENTIFIER EHCA_BMASK_IBM( 2, 7) +#define EQE_CQ_NUMBER EHCA_BMASK_IBM( 8, 31) +#define EQE_QP_NUMBER EHCA_BMASK_IBM( 8, 31) +#define EQE_QP_TOKEN EHCA_BMASK_IBM(32, 63) +#define EQE_CQ_TOKEN EHCA_BMASK_IBM(32, 63) + +#define NEQE_COMPLETION_EVENT EHCA_BMASK_IBM( 1, 1) +#define NEQE_EVENT_CODE EHCA_BMASK_IBM( 2, 7) +#define NEQE_PORT_NUMBER EHCA_BMASK_IBM( 8, 15) +#define NEQE_PORT_AVAILABILITY EHCA_BMASK_IBM(16, 16) +#define NEQE_DISRUPTIVE EHCA_BMASK_IBM(16, 16) +#define NEQE_SPECIFIC_EVENT EHCA_BMASK_IBM(16, 23) + +#define ERROR_DATA_LENGTH EHCA_BMASK_IBM(52, 63) +#define ERROR_DATA_TYPE EHCA_BMASK_IBM( 0, 7) + +static void queue_comp_task(struct ehca_cq *__cq); + +static struct ehca_comp_pool *pool; + +static inline void comp_event_callback(struct ehca_cq *cq) +{ + if (!cq->ib_cq.comp_handler) + return; + + spin_lock(&cq->cb_lock); + cq->ib_cq.comp_handler(&cq->ib_cq, cq->ib_cq.cq_context); + spin_unlock(&cq->cb_lock); + + return; +} + +static void print_error_data(struct ehca_shca *shca, void *data, + u64 *rblock, int length) +{ + u64 type = EHCA_BMASK_GET(ERROR_DATA_TYPE, rblock[2]); + u64 resource = rblock[1]; + + switch (type) { + case 0x1: /* Queue Pair */ + { + struct ehca_qp *qp = (struct ehca_qp *)data; + + /* only print error data if AER is set */ + if (rblock[6] == 0) + return; + + ehca_err(&shca->ib_device, + "QP 0x%x (resource=%llx) has errors.", + qp->ib_qp.qp_num, resource); + break; + } + case 0x4: /* Completion Queue */ + { + struct ehca_cq *cq = (struct ehca_cq *)data; + + ehca_err(&shca->ib_device, + "CQ 0x%x (resource=%llx) has errors.", + cq->cq_number, resource); + break; + } + default: + ehca_err(&shca->ib_device, + "Unknown error type: %llx on %s.", + type, shca->ib_device.name); + break; + } + + ehca_err(&shca->ib_device, "Error data is available: %llx.", resource); + ehca_err(&shca->ib_device, "EHCA ----- error data begin " + "---------------------------------------------------"); + ehca_dmp(rblock, length, "resource=%llx", resource); + ehca_err(&shca->ib_device, "EHCA ----- error data end " + "----------------------------------------------------"); + + return; +} + +int ehca_error_data(struct ehca_shca *shca, void *data, + u64 resource) +{ + + unsigned long ret; + u64 *rblock; + unsigned long block_count; + + rblock = ehca_alloc_fw_ctrlblock(GFP_ATOMIC); + if (!rblock) { + ehca_err(&shca->ib_device, "Cannot allocate rblock memory."); + ret = -ENOMEM; + goto error_data1; + } + + /* rblock must be 4K aligned and should be 4K large */ + ret = hipz_h_error_data(shca->ipz_hca_handle, + resource, + rblock, + &block_count); + + if (ret == H_R_STATE) + ehca_err(&shca->ib_device, + "No error data is available: %llx.", resource); + else if (ret == H_SUCCESS) { + int length; + + length = EHCA_BMASK_GET(ERROR_DATA_LENGTH, rblock[0]); + + if (length > EHCA_PAGESIZE) + length = EHCA_PAGESIZE; + + print_error_data(shca, data, rblock, length); + } else + ehca_err(&shca->ib_device, + "Error data could not be fetched: %llx", resource); + + ehca_free_fw_ctrlblock(rblock); + +error_data1: + return ret; + +} + +static void dispatch_qp_event(struct ehca_shca *shca, struct ehca_qp *qp, + enum ib_event_type event_type) +{ + struct ib_event event; + + /* PATH_MIG without 
the QP ever having been armed is false alarm */ + if (event_type == IB_EVENT_PATH_MIG && !qp->mig_armed) + return; + + event.device = &shca->ib_device; + event.event = event_type; + + if (qp->ext_type == EQPT_SRQ) { + if (!qp->ib_srq.event_handler) + return; + + event.element.srq = &qp->ib_srq; + qp->ib_srq.event_handler(&event, qp->ib_srq.srq_context); + } else { + if (!qp->ib_qp.event_handler) + return; + + event.element.qp = &qp->ib_qp; + qp->ib_qp.event_handler(&event, qp->ib_qp.qp_context); + } +} + +static void qp_event_callback(struct ehca_shca *shca, u64 eqe, + enum ib_event_type event_type, int fatal) +{ + struct ehca_qp *qp; + u32 token = EHCA_BMASK_GET(EQE_QP_TOKEN, eqe); + + read_lock(&ehca_qp_idr_lock); + qp = idr_find(&ehca_qp_idr, token); + if (qp) + atomic_inc(&qp->nr_events); + read_unlock(&ehca_qp_idr_lock); + + if (!qp) + return; + + if (fatal) + ehca_error_data(shca, qp, qp->ipz_qp_handle.handle); + + dispatch_qp_event(shca, qp, fatal && qp->ext_type == EQPT_SRQ ? + IB_EVENT_SRQ_ERR : event_type); + + /* + * eHCA only processes one WQE at a time for SRQ base QPs, + * so the last WQE has been processed as soon as the QP enters + * error state. + */ + if (fatal && qp->ext_type == EQPT_SRQBASE) + dispatch_qp_event(shca, qp, IB_EVENT_QP_LAST_WQE_REACHED); + + if (atomic_dec_and_test(&qp->nr_events)) + wake_up(&qp->wait_completion); + return; +} + +static void cq_event_callback(struct ehca_shca *shca, + u64 eqe) +{ + struct ehca_cq *cq; + u32 token = EHCA_BMASK_GET(EQE_CQ_TOKEN, eqe); + + read_lock(&ehca_cq_idr_lock); + cq = idr_find(&ehca_cq_idr, token); + if (cq) + atomic_inc(&cq->nr_events); + read_unlock(&ehca_cq_idr_lock); + + if (!cq) + return; + + ehca_error_data(shca, cq, cq->ipz_cq_handle.handle); + + if (atomic_dec_and_test(&cq->nr_events)) + wake_up(&cq->wait_completion); + + return; +} + +static void parse_identifier(struct ehca_shca *shca, u64 eqe) +{ + u8 identifier = EHCA_BMASK_GET(EQE_EE_IDENTIFIER, eqe); + + switch (identifier) { + case 0x02: /* path migrated */ + qp_event_callback(shca, eqe, IB_EVENT_PATH_MIG, 0); + break; + case 0x03: /* communication established */ + qp_event_callback(shca, eqe, IB_EVENT_COMM_EST, 0); + break; + case 0x04: /* send queue drained */ + qp_event_callback(shca, eqe, IB_EVENT_SQ_DRAINED, 0); + break; + case 0x05: /* QP error */ + case 0x06: /* QP error */ + qp_event_callback(shca, eqe, IB_EVENT_QP_FATAL, 1); + break; + case 0x07: /* CQ error */ + case 0x08: /* CQ error */ + cq_event_callback(shca, eqe); + break; + case 0x09: /* MRMWPTE error */ + ehca_err(&shca->ib_device, "MRMWPTE error."); + break; + case 0x0A: /* port event */ + ehca_err(&shca->ib_device, "Port event."); + break; + case 0x0B: /* MR access error */ + ehca_err(&shca->ib_device, "MR access error."); + break; + case 0x0C: /* EQ error */ + ehca_err(&shca->ib_device, "EQ error."); + break; + case 0x0D: /* P/Q_Key mismatch */ + ehca_err(&shca->ib_device, "P/Q_Key mismatch."); + break; + case 0x10: /* sampling complete */ + ehca_err(&shca->ib_device, "Sampling complete."); + break; + case 0x11: /* unaffiliated access error */ + ehca_err(&shca->ib_device, "Unaffiliated access error."); + break; + case 0x12: /* path migrating */ + ehca_err(&shca->ib_device, "Path migrating."); + break; + case 0x13: /* interface trace stopped */ + ehca_err(&shca->ib_device, "Interface trace stopped."); + break; + case 0x14: /* first error capture info available */ + ehca_info(&shca->ib_device, "First error capture available"); + break; + case 0x15: /* SRQ limit reached */ + 
qp_event_callback(shca, eqe, IB_EVENT_SRQ_LIMIT_REACHED, 0); + break; + default: + ehca_err(&shca->ib_device, "Unknown identifier: %x on %s.", + identifier, shca->ib_device.name); + break; + } + + return; +} + +static void dispatch_port_event(struct ehca_shca *shca, int port_num, + enum ib_event_type type, const char *msg) +{ + struct ib_event event; + + ehca_info(&shca->ib_device, "port %d %s.", port_num, msg); + event.device = &shca->ib_device; + event.event = type; + event.element.port_num = port_num; + ib_dispatch_event(&event); +} + +static void notify_port_conf_change(struct ehca_shca *shca, int port_num) +{ + struct ehca_sma_attr new_attr; + struct ehca_sma_attr *old_attr = &shca->sport[port_num - 1].saved_attr; + + ehca_query_sma_attr(shca, port_num, &new_attr); + + if (new_attr.sm_sl != old_attr->sm_sl || + new_attr.sm_lid != old_attr->sm_lid) + dispatch_port_event(shca, port_num, IB_EVENT_SM_CHANGE, + "SM changed"); + + if (new_attr.lid != old_attr->lid || + new_attr.lmc != old_attr->lmc) + dispatch_port_event(shca, port_num, IB_EVENT_LID_CHANGE, + "LID changed"); + + if (new_attr.pkey_tbl_len != old_attr->pkey_tbl_len || + memcmp(new_attr.pkeys, old_attr->pkeys, + sizeof(u16) * new_attr.pkey_tbl_len)) + dispatch_port_event(shca, port_num, IB_EVENT_PKEY_CHANGE, + "P_Key changed"); + + *old_attr = new_attr; +} + +/* replay modify_qp for sqps -- return 0 if all is well, 1 if AQP1 destroyed */ +static int replay_modify_qp(struct ehca_sport *sport) +{ + int aqp1_destroyed; + unsigned long flags; + + spin_lock_irqsave(&sport->mod_sqp_lock, flags); + + aqp1_destroyed = !sport->ibqp_sqp[IB_QPT_GSI]; + + if (sport->ibqp_sqp[IB_QPT_SMI]) + ehca_recover_sqp(sport->ibqp_sqp[IB_QPT_SMI]); + if (!aqp1_destroyed) + ehca_recover_sqp(sport->ibqp_sqp[IB_QPT_GSI]); + + spin_unlock_irqrestore(&sport->mod_sqp_lock, flags); + + return aqp1_destroyed; +} + +static void parse_ec(struct ehca_shca *shca, u64 eqe) +{ + u8 ec = EHCA_BMASK_GET(NEQE_EVENT_CODE, eqe); + u8 port = EHCA_BMASK_GET(NEQE_PORT_NUMBER, eqe); + u8 spec_event; + struct ehca_sport *sport = &shca->sport[port - 1]; + + switch (ec) { + case 0x30: /* port availability change */ + if (EHCA_BMASK_GET(NEQE_PORT_AVAILABILITY, eqe)) { + /* only replay modify_qp calls in autodetect mode; + * if AQP1 was destroyed, the port is already down + * again and we can drop the event. 
+ */ + if (ehca_nr_ports < 0) + if (replay_modify_qp(sport)) + break; + + sport->port_state = IB_PORT_ACTIVE; + dispatch_port_event(shca, port, IB_EVENT_PORT_ACTIVE, + "is active"); + ehca_query_sma_attr(shca, port, &sport->saved_attr); + } else { + sport->port_state = IB_PORT_DOWN; + dispatch_port_event(shca, port, IB_EVENT_PORT_ERR, + "is inactive"); + } + break; + case 0x31: + /* port configuration change + * disruptive change is caused by + * LID, PKEY or SM change + */ + if (EHCA_BMASK_GET(NEQE_DISRUPTIVE, eqe)) { + ehca_warn(&shca->ib_device, "disruptive port " + "%d configuration change", port); + + sport->port_state = IB_PORT_DOWN; + dispatch_port_event(shca, port, IB_EVENT_PORT_ERR, + "is inactive"); + + sport->port_state = IB_PORT_ACTIVE; + dispatch_port_event(shca, port, IB_EVENT_PORT_ACTIVE, + "is active"); + ehca_query_sma_attr(shca, port, + &sport->saved_attr); + } else + notify_port_conf_change(shca, port); + break; + case 0x32: /* adapter malfunction */ + ehca_err(&shca->ib_device, "Adapter malfunction."); + break; + case 0x33: /* trace stopped */ + ehca_err(&shca->ib_device, "Traced stopped."); + break; + case 0x34: /* util async event */ + spec_event = EHCA_BMASK_GET(NEQE_SPECIFIC_EVENT, eqe); + if (spec_event == 0x80) /* client reregister required */ + dispatch_port_event(shca, port, + IB_EVENT_CLIENT_REREGISTER, + "client reregister req."); + else + ehca_warn(&shca->ib_device, "Unknown util async " + "event %x on port %x", spec_event, port); + break; + default: + ehca_err(&shca->ib_device, "Unknown event code: %x on %s.", + ec, shca->ib_device.name); + break; + } + + return; +} + +static inline void reset_eq_pending(struct ehca_cq *cq) +{ + u64 CQx_EP; + struct h_galpa gal = cq->galpas.kernel; + + hipz_galpa_store_cq(gal, cqx_ep, 0x0); + CQx_EP = hipz_galpa_load(gal, CQTEMM_OFFSET(cqx_ep)); + + return; +} + +irqreturn_t ehca_interrupt_neq(int irq, void *dev_id) +{ + struct ehca_shca *shca = (struct ehca_shca*)dev_id; + + tasklet_hi_schedule(&shca->neq.interrupt_task); + + return IRQ_HANDLED; +} + +void ehca_tasklet_neq(unsigned long data) +{ + struct ehca_shca *shca = (struct ehca_shca*)data; + struct ehca_eqe *eqe; + u64 ret; + + eqe = ehca_poll_eq(shca, &shca->neq); + + while (eqe) { + if (!EHCA_BMASK_GET(NEQE_COMPLETION_EVENT, eqe->entry)) + parse_ec(shca, eqe->entry); + + eqe = ehca_poll_eq(shca, &shca->neq); + } + + ret = hipz_h_reset_event(shca->ipz_hca_handle, + shca->neq.ipz_eq_handle, 0xFFFFFFFFFFFFFFFFL); + + if (ret != H_SUCCESS) + ehca_err(&shca->ib_device, "Can't clear notification events."); + + return; +} + +irqreturn_t ehca_interrupt_eq(int irq, void *dev_id) +{ + struct ehca_shca *shca = (struct ehca_shca*)dev_id; + + tasklet_hi_schedule(&shca->eq.interrupt_task); + + return IRQ_HANDLED; +} + + +static inline void process_eqe(struct ehca_shca *shca, struct ehca_eqe *eqe) +{ + u64 eqe_value; + u32 token; + struct ehca_cq *cq; + + eqe_value = eqe->entry; + ehca_dbg(&shca->ib_device, "eqe_value=%llx", eqe_value); + if (EHCA_BMASK_GET(EQE_COMPLETION_EVENT, eqe_value)) { + ehca_dbg(&shca->ib_device, "Got completion event"); + token = EHCA_BMASK_GET(EQE_CQ_TOKEN, eqe_value); + read_lock(&ehca_cq_idr_lock); + cq = idr_find(&ehca_cq_idr, token); + if (cq) + atomic_inc(&cq->nr_events); + read_unlock(&ehca_cq_idr_lock); + if (cq == NULL) { + ehca_err(&shca->ib_device, + "Invalid eqe for non-existing cq token=%x", + token); + return; + } + reset_eq_pending(cq); + if (ehca_scaling_code) + queue_comp_task(cq); + else { + comp_event_callback(cq); + if 
(atomic_dec_and_test(&cq->nr_events)) + wake_up(&cq->wait_completion); + } + } else { + ehca_dbg(&shca->ib_device, "Got non completion event"); + parse_identifier(shca, eqe_value); + } +} + +void ehca_process_eq(struct ehca_shca *shca, int is_irq) +{ + struct ehca_eq *eq = &shca->eq; + struct ehca_eqe_cache_entry *eqe_cache = eq->eqe_cache; + u64 eqe_value, ret; + int eqe_cnt, i; + int eq_empty = 0; + + spin_lock(&eq->irq_spinlock); + if (is_irq) { + const int max_query_cnt = 100; + int query_cnt = 0; + int int_state = 1; + do { + int_state = hipz_h_query_int_state( + shca->ipz_hca_handle, eq->ist); + query_cnt++; + iosync(); + } while (int_state && query_cnt < max_query_cnt); + if (unlikely((query_cnt == max_query_cnt))) + ehca_dbg(&shca->ib_device, "int_state=%x query_cnt=%x", + int_state, query_cnt); + } + + /* read out all eqes */ + eqe_cnt = 0; + do { + u32 token; + eqe_cache[eqe_cnt].eqe = ehca_poll_eq(shca, eq); + if (!eqe_cache[eqe_cnt].eqe) + break; + eqe_value = eqe_cache[eqe_cnt].eqe->entry; + if (EHCA_BMASK_GET(EQE_COMPLETION_EVENT, eqe_value)) { + token = EHCA_BMASK_GET(EQE_CQ_TOKEN, eqe_value); + read_lock(&ehca_cq_idr_lock); + eqe_cache[eqe_cnt].cq = idr_find(&ehca_cq_idr, token); + if (eqe_cache[eqe_cnt].cq) + atomic_inc(&eqe_cache[eqe_cnt].cq->nr_events); + read_unlock(&ehca_cq_idr_lock); + if (!eqe_cache[eqe_cnt].cq) { + ehca_err(&shca->ib_device, + "Invalid eqe for non-existing cq " + "token=%x", token); + continue; + } + } else + eqe_cache[eqe_cnt].cq = NULL; + eqe_cnt++; + } while (eqe_cnt < EHCA_EQE_CACHE_SIZE); + if (!eqe_cnt) { + if (is_irq) + ehca_dbg(&shca->ib_device, + "No eqe found for irq event"); + goto unlock_irq_spinlock; + } else if (!is_irq) { + ret = hipz_h_eoi(eq->ist); + if (ret != H_SUCCESS) + ehca_err(&shca->ib_device, + "bad return code EOI -rc = %lld\n", ret); + ehca_dbg(&shca->ib_device, "deadman found %x eqe", eqe_cnt); + } + if (unlikely(eqe_cnt == EHCA_EQE_CACHE_SIZE)) + ehca_dbg(&shca->ib_device, "too many eqes for one irq event"); + /* enable irq for new packets */ + for (i = 0; i < eqe_cnt; i++) { + if (eq->eqe_cache[i].cq) + reset_eq_pending(eq->eqe_cache[i].cq); + } + /* check eq */ + spin_lock(&eq->spinlock); + eq_empty = (!ipz_eqit_eq_peek_valid(&shca->eq.ipz_queue)); + spin_unlock(&eq->spinlock); + /* call completion handler for cached eqes */ + for (i = 0; i < eqe_cnt; i++) + if (eq->eqe_cache[i].cq) { + if (ehca_scaling_code) + queue_comp_task(eq->eqe_cache[i].cq); + else { + struct ehca_cq *cq = eq->eqe_cache[i].cq; + comp_event_callback(cq); + if (atomic_dec_and_test(&cq->nr_events)) + wake_up(&cq->wait_completion); + } + } else { + ehca_dbg(&shca->ib_device, "Got non completion event"); + parse_identifier(shca, eq->eqe_cache[i].eqe->entry); + } + /* poll eq if not empty */ + if (eq_empty) + goto unlock_irq_spinlock; + do { + struct ehca_eqe *eqe; + eqe = ehca_poll_eq(shca, &shca->eq); + if (!eqe) + break; + process_eqe(shca, eqe); + } while (1); + +unlock_irq_spinlock: + spin_unlock(&eq->irq_spinlock); +} + +void ehca_tasklet_eq(unsigned long data) +{ + ehca_process_eq((struct ehca_shca*)data, 1); +} + +static int find_next_online_cpu(struct ehca_comp_pool *pool) +{ + int cpu; + unsigned long flags; + + WARN_ON_ONCE(!in_interrupt()); + if (ehca_debug_level >= 3) + ehca_dmp(cpu_online_mask, cpumask_size(), ""); + + spin_lock_irqsave(&pool->last_cpu_lock, flags); + do { + cpu = cpumask_next(pool->last_cpu, cpu_online_mask); + if (cpu >= nr_cpu_ids) + cpu = cpumask_first(cpu_online_mask); + pool->last_cpu = cpu; + } while 
(!per_cpu_ptr(pool->cpu_comp_tasks, cpu)->active); + spin_unlock_irqrestore(&pool->last_cpu_lock, flags); + + return cpu; +} + +static void __queue_comp_task(struct ehca_cq *__cq, + struct ehca_cpu_comp_task *cct, + struct task_struct *thread) +{ + unsigned long flags; + + spin_lock_irqsave(&cct->task_lock, flags); + spin_lock(&__cq->task_lock); + + if (__cq->nr_callbacks == 0) { + __cq->nr_callbacks++; + list_add_tail(&__cq->entry, &cct->cq_list); + cct->cq_jobs++; + wake_up_process(thread); + } else + __cq->nr_callbacks++; + + spin_unlock(&__cq->task_lock); + spin_unlock_irqrestore(&cct->task_lock, flags); +} + +static void queue_comp_task(struct ehca_cq *__cq) +{ + int cpu_id; + struct ehca_cpu_comp_task *cct; + struct task_struct *thread; + int cq_jobs; + unsigned long flags; + + cpu_id = find_next_online_cpu(pool); + BUG_ON(!cpu_online(cpu_id)); + + cct = per_cpu_ptr(pool->cpu_comp_tasks, cpu_id); + thread = *per_cpu_ptr(pool->cpu_comp_threads, cpu_id); + BUG_ON(!cct || !thread); + + spin_lock_irqsave(&cct->task_lock, flags); + cq_jobs = cct->cq_jobs; + spin_unlock_irqrestore(&cct->task_lock, flags); + if (cq_jobs > 0) { + cpu_id = find_next_online_cpu(pool); + cct = per_cpu_ptr(pool->cpu_comp_tasks, cpu_id); + thread = *per_cpu_ptr(pool->cpu_comp_threads, cpu_id); + BUG_ON(!cct || !thread); + } + __queue_comp_task(__cq, cct, thread); +} + +static void run_comp_task(struct ehca_cpu_comp_task *cct) +{ + struct ehca_cq *cq; + + while (!list_empty(&cct->cq_list)) { + cq = list_entry(cct->cq_list.next, struct ehca_cq, entry); + spin_unlock_irq(&cct->task_lock); + + comp_event_callback(cq); + if (atomic_dec_and_test(&cq->nr_events)) + wake_up(&cq->wait_completion); + + spin_lock_irq(&cct->task_lock); + spin_lock(&cq->task_lock); + cq->nr_callbacks--; + if (!cq->nr_callbacks) { + list_del_init(cct->cq_list.next); + cct->cq_jobs--; + } + spin_unlock(&cq->task_lock); + } +} + +static void comp_task_park(unsigned int cpu) +{ + struct ehca_cpu_comp_task *cct = per_cpu_ptr(pool->cpu_comp_tasks, cpu); + struct ehca_cpu_comp_task *target; + struct task_struct *thread; + struct ehca_cq *cq, *tmp; + LIST_HEAD(list); + + spin_lock_irq(&cct->task_lock); + cct->cq_jobs = 0; + cct->active = 0; + list_splice_init(&cct->cq_list, &list); + spin_unlock_irq(&cct->task_lock); + + cpu = find_next_online_cpu(pool); + target = per_cpu_ptr(pool->cpu_comp_tasks, cpu); + thread = *per_cpu_ptr(pool->cpu_comp_threads, cpu); + spin_lock_irq(&target->task_lock); + list_for_each_entry_safe(cq, tmp, &list, entry) { + list_del(&cq->entry); + __queue_comp_task(cq, target, thread); + } + spin_unlock_irq(&target->task_lock); +} + +static void comp_task_stop(unsigned int cpu, bool online) +{ + struct ehca_cpu_comp_task *cct = per_cpu_ptr(pool->cpu_comp_tasks, cpu); + + spin_lock_irq(&cct->task_lock); + cct->cq_jobs = 0; + cct->active = 0; + WARN_ON(!list_empty(&cct->cq_list)); + spin_unlock_irq(&cct->task_lock); +} + +static int comp_task_should_run(unsigned int cpu) +{ + struct ehca_cpu_comp_task *cct = per_cpu_ptr(pool->cpu_comp_tasks, cpu); + + return cct->cq_jobs; +} + +static void comp_task(unsigned int cpu) +{ + struct ehca_cpu_comp_task *cct = this_cpu_ptr(pool->cpu_comp_tasks); + int cql_empty; + + spin_lock_irq(&cct->task_lock); + cql_empty = list_empty(&cct->cq_list); + if (!cql_empty) { + __set_current_state(TASK_RUNNING); + run_comp_task(cct); + } + spin_unlock_irq(&cct->task_lock); +} + +static struct smp_hotplug_thread comp_pool_threads = { + .thread_should_run = comp_task_should_run, + .thread_fn = comp_task, + 
.thread_comm = "ehca_comp/%u", + .cleanup = comp_task_stop, + .park = comp_task_park, +}; + +int ehca_create_comp_pool(void) +{ + int cpu, ret = -ENOMEM; + + if (!ehca_scaling_code) + return 0; + + pool = kzalloc(sizeof(struct ehca_comp_pool), GFP_KERNEL); + if (pool == NULL) + return -ENOMEM; + + spin_lock_init(&pool->last_cpu_lock); + pool->last_cpu = cpumask_any(cpu_online_mask); + + pool->cpu_comp_tasks = alloc_percpu(struct ehca_cpu_comp_task); + if (!pool->cpu_comp_tasks) + goto out_pool; + + pool->cpu_comp_threads = alloc_percpu(struct task_struct *); + if (!pool->cpu_comp_threads) + goto out_tasks; + + for_each_present_cpu(cpu) { + struct ehca_cpu_comp_task *cct; + + cct = per_cpu_ptr(pool->cpu_comp_tasks, cpu); + spin_lock_init(&cct->task_lock); + INIT_LIST_HEAD(&cct->cq_list); + } + + comp_pool_threads.store = pool->cpu_comp_threads; + ret = smpboot_register_percpu_thread(&comp_pool_threads); + if (ret) + goto out_threads; + + pr_info("eHCA scaling code enabled\n"); + return ret; + +out_threads: + free_percpu(pool->cpu_comp_threads); +out_tasks: + free_percpu(pool->cpu_comp_tasks); +out_pool: + kfree(pool); + return ret; +} + +void ehca_destroy_comp_pool(void) +{ + if (!ehca_scaling_code) + return; + + smpboot_unregister_percpu_thread(&comp_pool_threads); + + free_percpu(pool->cpu_comp_threads); + free_percpu(pool->cpu_comp_tasks); + kfree(pool); +} diff --git a/kernel/drivers/infiniband/hw/ehca/ehca_irq.h b/kernel/drivers/infiniband/hw/ehca/ehca_irq.h new file mode 100644 index 000000000..5370199f0 --- /dev/null +++ b/kernel/drivers/infiniband/hw/ehca/ehca_irq.h @@ -0,0 +1,77 @@ +/* + * IBM eServer eHCA Infiniband device driver for Linux on POWER + * + * Function definitions and structs for EQs, NEQs and interrupts + * + * Authors: Heiko J Schick + * Khadija Souissi + * + * Copyright (c) 2005 IBM Corporation + * + * All rights reserved. + * + * This source code is distributed under a dual license of GPL v2.0 and OpenIB + * BSD. + * + * OpenIB BSD License + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials + * provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR + * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER + * IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#ifndef __EHCA_IRQ_H +#define __EHCA_IRQ_H + + +struct ehca_shca; + +#include +#include + +int ehca_error_data(struct ehca_shca *shca, void *data, u64 resource); + +irqreturn_t ehca_interrupt_neq(int irq, void *dev_id); +void ehca_tasklet_neq(unsigned long data); + +irqreturn_t ehca_interrupt_eq(int irq, void *dev_id); +void ehca_tasklet_eq(unsigned long data); +void ehca_process_eq(struct ehca_shca *shca, int is_irq); + +struct ehca_cpu_comp_task { + struct list_head cq_list; + spinlock_t task_lock; + int cq_jobs; + int active; +}; + +struct ehca_comp_pool { + struct ehca_cpu_comp_task __percpu *cpu_comp_tasks; + struct task_struct * __percpu *cpu_comp_threads; + int last_cpu; + spinlock_t last_cpu_lock; +}; + +int ehca_create_comp_pool(void); +void ehca_destroy_comp_pool(void); + +#endif diff --git a/kernel/drivers/infiniband/hw/ehca/ehca_iverbs.h b/kernel/drivers/infiniband/hw/ehca/ehca_iverbs.h new file mode 100644 index 000000000..22f79afa7 --- /dev/null +++ b/kernel/drivers/infiniband/hw/ehca/ehca_iverbs.h @@ -0,0 +1,212 @@ +/* + * IBM eServer eHCA Infiniband device driver for Linux on POWER + * + * Function definitions for internal functions + * + * Authors: Heiko J Schick + * Dietmar Decker + * + * Copyright (c) 2005 IBM Corporation + * + * All rights reserved. + * + * This source code is distributed under a dual license of GPL v2.0 and OpenIB + * BSD. + * + * OpenIB BSD License + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials + * provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR + * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER + * IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#ifndef __EHCA_IVERBS_H__ +#define __EHCA_IVERBS_H__ + +#include "ehca_classes.h" + +int ehca_query_device(struct ib_device *ibdev, struct ib_device_attr *props); + +int ehca_query_port(struct ib_device *ibdev, u8 port, + struct ib_port_attr *props); + +int ehca_query_sma_attr(struct ehca_shca *shca, u8 port, + struct ehca_sma_attr *attr); + +int ehca_query_pkey(struct ib_device *ibdev, u8 port, u16 index, u16 * pkey); + +int ehca_query_gid(struct ib_device *ibdev, u8 port, int index, + union ib_gid *gid); + +int ehca_modify_port(struct ib_device *ibdev, u8 port, int port_modify_mask, + struct ib_port_modify *props); + +struct ib_pd *ehca_alloc_pd(struct ib_device *device, + struct ib_ucontext *context, + struct ib_udata *udata); + +int ehca_dealloc_pd(struct ib_pd *pd); + +struct ib_ah *ehca_create_ah(struct ib_pd *pd, struct ib_ah_attr *ah_attr); + +int ehca_modify_ah(struct ib_ah *ah, struct ib_ah_attr *ah_attr); + +int ehca_query_ah(struct ib_ah *ah, struct ib_ah_attr *ah_attr); + +int ehca_destroy_ah(struct ib_ah *ah); + +struct ib_mr *ehca_get_dma_mr(struct ib_pd *pd, int mr_access_flags); + +struct ib_mr *ehca_reg_phys_mr(struct ib_pd *pd, + struct ib_phys_buf *phys_buf_array, + int num_phys_buf, + int mr_access_flags, u64 *iova_start); + +struct ib_mr *ehca_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, + u64 virt, int mr_access_flags, + struct ib_udata *udata); + +int ehca_rereg_phys_mr(struct ib_mr *mr, + int mr_rereg_mask, + struct ib_pd *pd, + struct ib_phys_buf *phys_buf_array, + int num_phys_buf, int mr_access_flags, u64 *iova_start); + +int ehca_query_mr(struct ib_mr *mr, struct ib_mr_attr *mr_attr); + +int ehca_dereg_mr(struct ib_mr *mr); + +struct ib_mw *ehca_alloc_mw(struct ib_pd *pd, enum ib_mw_type type); + +int ehca_bind_mw(struct ib_qp *qp, struct ib_mw *mw, + struct ib_mw_bind *mw_bind); + +int ehca_dealloc_mw(struct ib_mw *mw); + +struct ib_fmr *ehca_alloc_fmr(struct ib_pd *pd, + int mr_access_flags, + struct ib_fmr_attr *fmr_attr); + +int ehca_map_phys_fmr(struct ib_fmr *fmr, + u64 *page_list, int list_len, u64 iova); + +int ehca_unmap_fmr(struct list_head *fmr_list); + +int ehca_dealloc_fmr(struct ib_fmr *fmr); + +enum ehca_eq_type { + EHCA_EQ = 0, /* Event Queue */ + EHCA_NEQ /* Notification Event Queue */ +}; + +int ehca_create_eq(struct ehca_shca *shca, struct ehca_eq *eq, + enum ehca_eq_type type, const u32 length); + +int ehca_destroy_eq(struct ehca_shca *shca, struct ehca_eq *eq); + +void *ehca_poll_eq(struct ehca_shca *shca, struct ehca_eq *eq); + + +struct ib_cq *ehca_create_cq(struct ib_device *device, int cqe, int comp_vector, + struct ib_ucontext *context, + struct ib_udata *udata); + +int ehca_destroy_cq(struct ib_cq *cq); + +int ehca_resize_cq(struct ib_cq *cq, int cqe, struct ib_udata *udata); + +int ehca_poll_cq(struct ib_cq *cq, int num_entries, struct ib_wc *wc); + +int ehca_peek_cq(struct ib_cq *cq, int wc_cnt); + +int ehca_req_notify_cq(struct ib_cq *cq, enum ib_cq_notify_flags notify_flags); + +struct ib_qp *ehca_create_qp(struct ib_pd *pd, + struct ib_qp_init_attr *init_attr, + struct ib_udata *udata); + +int ehca_destroy_qp(struct ib_qp *qp); + +int ehca_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, int attr_mask, + struct ib_udata *udata); + +int ehca_query_qp(struct ib_qp *qp, struct ib_qp_attr *qp_attr, + int qp_attr_mask, struct ib_qp_init_attr *qp_init_attr); + +int ehca_post_send(struct ib_qp *qp, struct ib_send_wr *send_wr, + struct ib_send_wr **bad_send_wr); + +int ehca_post_recv(struct ib_qp *qp, struct 
ib_recv_wr *recv_wr, + struct ib_recv_wr **bad_recv_wr); + +int ehca_post_srq_recv(struct ib_srq *srq, + struct ib_recv_wr *recv_wr, + struct ib_recv_wr **bad_recv_wr); + +struct ib_srq *ehca_create_srq(struct ib_pd *pd, + struct ib_srq_init_attr *init_attr, + struct ib_udata *udata); + +int ehca_modify_srq(struct ib_srq *srq, struct ib_srq_attr *attr, + enum ib_srq_attr_mask attr_mask, struct ib_udata *udata); + +int ehca_query_srq(struct ib_srq *srq, struct ib_srq_attr *srq_attr); + +int ehca_destroy_srq(struct ib_srq *srq); + +u64 ehca_define_sqp(struct ehca_shca *shca, struct ehca_qp *ibqp, + struct ib_qp_init_attr *qp_init_attr); + +int ehca_attach_mcast(struct ib_qp *qp, union ib_gid *gid, u16 lid); + +int ehca_detach_mcast(struct ib_qp *qp, union ib_gid *gid, u16 lid); + +struct ib_ucontext *ehca_alloc_ucontext(struct ib_device *device, + struct ib_udata *udata); + +int ehca_dealloc_ucontext(struct ib_ucontext *context); + +int ehca_mmap(struct ib_ucontext *context, struct vm_area_struct *vma); + +int ehca_process_mad(struct ib_device *ibdev, int mad_flags, u8 port_num, + struct ib_wc *in_wc, struct ib_grh *in_grh, + struct ib_mad *in_mad, + struct ib_mad *out_mad); + +void ehca_poll_eqs(unsigned long data); + +int ehca_calc_ipd(struct ehca_shca *shca, int port, + enum ib_rate path_rate, u32 *ipd); + +void ehca_add_to_err_list(struct ehca_qp *qp, int on_sq); + +#ifdef CONFIG_PPC_64K_PAGES +void *ehca_alloc_fw_ctrlblock(gfp_t flags); +void ehca_free_fw_ctrlblock(void *ptr); +#else +#define ehca_alloc_fw_ctrlblock(flags) ((void *)get_zeroed_page(flags)) +#define ehca_free_fw_ctrlblock(ptr) free_page((unsigned long)(ptr)) +#endif + +void ehca_recover_sqp(struct ib_qp *sqp); + +#endif diff --git a/kernel/drivers/infiniband/hw/ehca/ehca_main.c b/kernel/drivers/infiniband/hw/ehca/ehca_main.c new file mode 100644 index 000000000..cd8d290a0 --- /dev/null +++ b/kernel/drivers/infiniband/hw/ehca/ehca_main.c @@ -0,0 +1,1100 @@ +/* + * IBM eServer eHCA Infiniband device driver for Linux on POWER + * + * module start stop, hca detection + * + * Authors: Heiko J Schick + * Hoang-Nam Nguyen + * Joachim Fenkes + * + * Copyright (c) 2005 IBM Corporation + * + * All rights reserved. + * + * This source code is distributed under a dual license of GPL v2.0 and OpenIB + * BSD. + * + * OpenIB BSD License + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials + * provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR + * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER + * IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#ifdef CONFIG_PPC_64K_PAGES +#include +#endif + +#include +#include +#include "ehca_classes.h" +#include "ehca_iverbs.h" +#include "ehca_mrmw.h" +#include "ehca_tools.h" +#include "hcp_if.h" + +#define HCAD_VERSION "0029" + +MODULE_LICENSE("Dual BSD/GPL"); +MODULE_AUTHOR("Christoph Raisch "); +MODULE_DESCRIPTION("IBM eServer HCA InfiniBand Device Driver"); +MODULE_VERSION(HCAD_VERSION); + +static bool ehca_open_aqp1 = 0; +static int ehca_hw_level = 0; +static bool ehca_poll_all_eqs = 1; + +int ehca_debug_level = 0; +int ehca_nr_ports = -1; +bool ehca_use_hp_mr = 0; +int ehca_port_act_time = 30; +int ehca_static_rate = -1; +bool ehca_scaling_code = 0; +int ehca_lock_hcalls = -1; +int ehca_max_cq = -1; +int ehca_max_qp = -1; + +module_param_named(open_aqp1, ehca_open_aqp1, bool, S_IRUGO); +module_param_named(debug_level, ehca_debug_level, int, S_IRUGO); +module_param_named(hw_level, ehca_hw_level, int, S_IRUGO); +module_param_named(nr_ports, ehca_nr_ports, int, S_IRUGO); +module_param_named(use_hp_mr, ehca_use_hp_mr, bool, S_IRUGO); +module_param_named(port_act_time, ehca_port_act_time, int, S_IRUGO); +module_param_named(poll_all_eqs, ehca_poll_all_eqs, bool, S_IRUGO); +module_param_named(static_rate, ehca_static_rate, int, S_IRUGO); +module_param_named(scaling_code, ehca_scaling_code, bool, S_IRUGO); +module_param_named(lock_hcalls, ehca_lock_hcalls, bint, S_IRUGO); +module_param_named(number_of_cqs, ehca_max_cq, int, S_IRUGO); +module_param_named(number_of_qps, ehca_max_qp, int, S_IRUGO); + +MODULE_PARM_DESC(open_aqp1, + "Open AQP1 on startup (default: no)"); +MODULE_PARM_DESC(debug_level, + "Amount of debug output (0: none (default), 1: traces, " + "2: some dumps, 3: lots)"); +MODULE_PARM_DESC(hw_level, + "Hardware level (0: autosensing (default), " + "0x10..0x14: eHCA, 0x20..0x23: eHCA2)"); +MODULE_PARM_DESC(nr_ports, + "number of connected ports (-1: autodetect (default), " + "1: port one only, 2: two ports)"); +MODULE_PARM_DESC(use_hp_mr, + "Use high performance MRs (default: no)"); +MODULE_PARM_DESC(port_act_time, + "Time to wait for port activation (default: 30 sec)"); +MODULE_PARM_DESC(poll_all_eqs, + "Poll all event queues periodically (default: yes)"); +MODULE_PARM_DESC(static_rate, + "Set permanent static rate (default: no static rate)"); +MODULE_PARM_DESC(scaling_code, + "Enable scaling code (default: no)"); +MODULE_PARM_DESC(lock_hcalls, + "Serialize all hCalls made by the driver " + "(default: autodetect)"); +MODULE_PARM_DESC(number_of_cqs, + "Max number of CQs which can be allocated " + "(default: autodetect)"); +MODULE_PARM_DESC(number_of_qps, + "Max number of QPs which can be allocated " + "(default: autodetect)"); + +DEFINE_RWLOCK(ehca_qp_idr_lock); +DEFINE_RWLOCK(ehca_cq_idr_lock); +DEFINE_IDR(ehca_qp_idr); +DEFINE_IDR(ehca_cq_idr); + +static LIST_HEAD(shca_list); /* list of all registered ehcas */ +DEFINE_SPINLOCK(shca_list_lock); + +static struct timer_list poll_eqs_timer; + +#ifdef CONFIG_PPC_64K_PAGES +static struct kmem_cache 
*ctblk_cache; + +void *ehca_alloc_fw_ctrlblock(gfp_t flags) +{ + void *ret = kmem_cache_zalloc(ctblk_cache, flags); + if (!ret) + ehca_gen_err("Out of memory for ctblk"); + return ret; +} + +void ehca_free_fw_ctrlblock(void *ptr) +{ + if (ptr) + kmem_cache_free(ctblk_cache, ptr); + +} +#endif + +int ehca2ib_return_code(u64 ehca_rc) +{ + switch (ehca_rc) { + case H_SUCCESS: + return 0; + case H_RESOURCE: /* Resource in use */ + case H_BUSY: + return -EBUSY; + case H_NOT_ENOUGH_RESOURCES: /* insufficient resources */ + case H_CONSTRAINED: /* resource constraint */ + case H_NO_MEM: + return -ENOMEM; + default: + return -EINVAL; + } +} + +static int ehca_create_slab_caches(void) +{ + int ret; + + ret = ehca_init_pd_cache(); + if (ret) { + ehca_gen_err("Cannot create PD SLAB cache."); + return ret; + } + + ret = ehca_init_cq_cache(); + if (ret) { + ehca_gen_err("Cannot create CQ SLAB cache."); + goto create_slab_caches2; + } + + ret = ehca_init_qp_cache(); + if (ret) { + ehca_gen_err("Cannot create QP SLAB cache."); + goto create_slab_caches3; + } + + ret = ehca_init_av_cache(); + if (ret) { + ehca_gen_err("Cannot create AV SLAB cache."); + goto create_slab_caches4; + } + + ret = ehca_init_mrmw_cache(); + if (ret) { + ehca_gen_err("Cannot create MR&MW SLAB cache."); + goto create_slab_caches5; + } + + ret = ehca_init_small_qp_cache(); + if (ret) { + ehca_gen_err("Cannot create small queue SLAB cache."); + goto create_slab_caches6; + } + +#ifdef CONFIG_PPC_64K_PAGES + ctblk_cache = kmem_cache_create("ehca_cache_ctblk", + EHCA_PAGESIZE, H_CB_ALIGNMENT, + SLAB_HWCACHE_ALIGN, + NULL); + if (!ctblk_cache) { + ehca_gen_err("Cannot create ctblk SLAB cache."); + ehca_cleanup_small_qp_cache(); + ret = -ENOMEM; + goto create_slab_caches6; + } +#endif + return 0; + +create_slab_caches6: + ehca_cleanup_mrmw_cache(); + +create_slab_caches5: + ehca_cleanup_av_cache(); + +create_slab_caches4: + ehca_cleanup_qp_cache(); + +create_slab_caches3: + ehca_cleanup_cq_cache(); + +create_slab_caches2: + ehca_cleanup_pd_cache(); + + return ret; +} + +static void ehca_destroy_slab_caches(void) +{ + ehca_cleanup_small_qp_cache(); + ehca_cleanup_mrmw_cache(); + ehca_cleanup_av_cache(); + ehca_cleanup_qp_cache(); + ehca_cleanup_cq_cache(); + ehca_cleanup_pd_cache(); +#ifdef CONFIG_PPC_64K_PAGES + if (ctblk_cache) + kmem_cache_destroy(ctblk_cache); +#endif +} + +#define EHCA_HCAAVER EHCA_BMASK_IBM(32, 39) +#define EHCA_REVID EHCA_BMASK_IBM(40, 63) + +static struct cap_descr { + u64 mask; + char *descr; +} hca_cap_descr[] = { + { HCA_CAP_AH_PORT_NR_CHECK, "HCA_CAP_AH_PORT_NR_CHECK" }, + { HCA_CAP_ATOMIC, "HCA_CAP_ATOMIC" }, + { HCA_CAP_AUTO_PATH_MIG, "HCA_CAP_AUTO_PATH_MIG" }, + { HCA_CAP_BAD_P_KEY_CTR, "HCA_CAP_BAD_P_KEY_CTR" }, + { HCA_CAP_SQD_RTS_PORT_CHANGE, "HCA_CAP_SQD_RTS_PORT_CHANGE" }, + { HCA_CAP_CUR_QP_STATE_MOD, "HCA_CAP_CUR_QP_STATE_MOD" }, + { HCA_CAP_INIT_TYPE, "HCA_CAP_INIT_TYPE" }, + { HCA_CAP_PORT_ACTIVE_EVENT, "HCA_CAP_PORT_ACTIVE_EVENT" }, + { HCA_CAP_Q_KEY_VIOL_CTR, "HCA_CAP_Q_KEY_VIOL_CTR" }, + { HCA_CAP_WQE_RESIZE, "HCA_CAP_WQE_RESIZE" }, + { HCA_CAP_RAW_PACKET_MCAST, "HCA_CAP_RAW_PACKET_MCAST" }, + { HCA_CAP_SHUTDOWN_PORT, "HCA_CAP_SHUTDOWN_PORT" }, + { HCA_CAP_RC_LL_QP, "HCA_CAP_RC_LL_QP" }, + { HCA_CAP_SRQ, "HCA_CAP_SRQ" }, + { HCA_CAP_UD_LL_QP, "HCA_CAP_UD_LL_QP" }, + { HCA_CAP_RESIZE_MR, "HCA_CAP_RESIZE_MR" }, + { HCA_CAP_MINI_QP, "HCA_CAP_MINI_QP" }, + { HCA_CAP_H_ALLOC_RES_SYNC, "HCA_CAP_H_ALLOC_RES_SYNC" }, +}; + +static int ehca_sense_attributes(struct ehca_shca *shca) +{ + int i, ret = 0; + 
u64 h_ret; + struct hipz_query_hca *rblock; + struct hipz_query_port *port; + const char *loc_code; + + static const u32 pgsize_map[] = { + HCA_CAP_MR_PGSIZE_4K, 0x1000, + HCA_CAP_MR_PGSIZE_64K, 0x10000, + HCA_CAP_MR_PGSIZE_1M, 0x100000, + HCA_CAP_MR_PGSIZE_16M, 0x1000000, + }; + + ehca_gen_dbg("Probing adapter %s...", + shca->ofdev->dev.of_node->full_name); + loc_code = of_get_property(shca->ofdev->dev.of_node, "ibm,loc-code", + NULL); + if (loc_code) + ehca_gen_dbg(" ... location lode=%s", loc_code); + + rblock = ehca_alloc_fw_ctrlblock(GFP_KERNEL); + if (!rblock) { + ehca_gen_err("Cannot allocate rblock memory."); + return -ENOMEM; + } + + h_ret = hipz_h_query_hca(shca->ipz_hca_handle, rblock); + if (h_ret != H_SUCCESS) { + ehca_gen_err("Cannot query device properties. h_ret=%lli", + h_ret); + ret = -EPERM; + goto sense_attributes1; + } + + if (ehca_nr_ports == 1) + shca->num_ports = 1; + else + shca->num_ports = (u8)rblock->num_ports; + + ehca_gen_dbg(" ... found %x ports", rblock->num_ports); + + if (ehca_hw_level == 0) { + u32 hcaaver; + u32 revid; + + hcaaver = EHCA_BMASK_GET(EHCA_HCAAVER, rblock->hw_ver); + revid = EHCA_BMASK_GET(EHCA_REVID, rblock->hw_ver); + + ehca_gen_dbg(" ... hardware version=%x:%x", hcaaver, revid); + + if (hcaaver == 1) { + if (revid <= 3) + shca->hw_level = 0x10 | (revid + 1); + else + shca->hw_level = 0x14; + } else if (hcaaver == 2) { + if (revid == 0) + shca->hw_level = 0x21; + else if (revid == 0x10) + shca->hw_level = 0x22; + else if (revid == 0x20 || revid == 0x21) + shca->hw_level = 0x23; + } + + if (!shca->hw_level) { + ehca_gen_warn("unknown hardware version" + " - assuming default level"); + shca->hw_level = 0x22; + } + } else + shca->hw_level = ehca_hw_level; + ehca_gen_dbg(" ... hardware level=%x", shca->hw_level); + + shca->hca_cap = rblock->hca_cap_indicators; + ehca_gen_dbg(" ... HCA capabilities:"); + for (i = 0; i < ARRAY_SIZE(hca_cap_descr); i++) + if (EHCA_BMASK_GET(hca_cap_descr[i].mask, shca->hca_cap)) + ehca_gen_dbg(" %s", hca_cap_descr[i].descr); + + /* Autodetect hCall locking -- the "H_ALLOC_RESOURCE synced" flag is + * a firmware property, so it's valid across all adapters + */ + if (ehca_lock_hcalls == -1) + ehca_lock_hcalls = !EHCA_BMASK_GET(HCA_CAP_H_ALLOC_RES_SYNC, + shca->hca_cap); + + /* translate supported MR page sizes; always support 4K */ + shca->hca_cap_mr_pgsize = EHCA_PAGESIZE; + for (i = 0; i < ARRAY_SIZE(pgsize_map); i += 2) + if (rblock->memory_page_size_supported & pgsize_map[i]) + shca->hca_cap_mr_pgsize |= pgsize_map[i + 1]; + + /* Set maximum number of CQs and QPs to calculate EQ size */ + if (shca->max_num_qps == -1) + shca->max_num_qps = min_t(int, rblock->max_qp, + EHCA_MAX_NUM_QUEUES); + else if (shca->max_num_qps < 1 || shca->max_num_qps > rblock->max_qp) { + ehca_gen_warn("The requested number of QPs is out of range " + "(1 - %i) specified by HW. Value is set to %i", + rblock->max_qp, rblock->max_qp); + shca->max_num_qps = rblock->max_qp; + } + + if (shca->max_num_cqs == -1) + shca->max_num_cqs = min_t(int, rblock->max_cq, + EHCA_MAX_NUM_QUEUES); + else if (shca->max_num_cqs < 1 || shca->max_num_cqs > rblock->max_cq) { + ehca_gen_warn("The requested number of CQs is out of range " + "(1 - %i) specified by HW. 
Value is set to %i", + rblock->max_cq, rblock->max_cq); + } + + /* query max MTU from first port -- it's the same for all ports */ + port = (struct hipz_query_port *)rblock; + h_ret = hipz_h_query_port(shca->ipz_hca_handle, 1, port); + if (h_ret != H_SUCCESS) { + ehca_gen_err("Cannot query port properties. h_ret=%lli", + h_ret); + ret = -EPERM; + goto sense_attributes1; + } + + shca->max_mtu = port->max_mtu; + +sense_attributes1: + ehca_free_fw_ctrlblock(rblock); + return ret; +} + +static int init_node_guid(struct ehca_shca *shca) +{ + int ret = 0; + struct hipz_query_hca *rblock; + + rblock = ehca_alloc_fw_ctrlblock(GFP_KERNEL); + if (!rblock) { + ehca_err(&shca->ib_device, "Can't allocate rblock memory."); + return -ENOMEM; + } + + if (hipz_h_query_hca(shca->ipz_hca_handle, rblock) != H_SUCCESS) { + ehca_err(&shca->ib_device, "Can't query device properties"); + ret = -EINVAL; + goto init_node_guid1; + } + + memcpy(&shca->ib_device.node_guid, &rblock->node_guid, sizeof(u64)); + +init_node_guid1: + ehca_free_fw_ctrlblock(rblock); + return ret; +} + +static int ehca_init_device(struct ehca_shca *shca) +{ + int ret; + + ret = init_node_guid(shca); + if (ret) + return ret; + + strlcpy(shca->ib_device.name, "ehca%d", IB_DEVICE_NAME_MAX); + shca->ib_device.owner = THIS_MODULE; + + shca->ib_device.uverbs_abi_ver = 8; + shca->ib_device.uverbs_cmd_mask = + (1ull << IB_USER_VERBS_CMD_GET_CONTEXT) | + (1ull << IB_USER_VERBS_CMD_QUERY_DEVICE) | + (1ull << IB_USER_VERBS_CMD_QUERY_PORT) | + (1ull << IB_USER_VERBS_CMD_ALLOC_PD) | + (1ull << IB_USER_VERBS_CMD_DEALLOC_PD) | + (1ull << IB_USER_VERBS_CMD_REG_MR) | + (1ull << IB_USER_VERBS_CMD_DEREG_MR) | + (1ull << IB_USER_VERBS_CMD_CREATE_COMP_CHANNEL) | + (1ull << IB_USER_VERBS_CMD_CREATE_CQ) | + (1ull << IB_USER_VERBS_CMD_DESTROY_CQ) | + (1ull << IB_USER_VERBS_CMD_CREATE_QP) | + (1ull << IB_USER_VERBS_CMD_MODIFY_QP) | + (1ull << IB_USER_VERBS_CMD_QUERY_QP) | + (1ull << IB_USER_VERBS_CMD_DESTROY_QP) | + (1ull << IB_USER_VERBS_CMD_ATTACH_MCAST) | + (1ull << IB_USER_VERBS_CMD_DETACH_MCAST); + + shca->ib_device.node_type = RDMA_NODE_IB_CA; + shca->ib_device.phys_port_cnt = shca->num_ports; + shca->ib_device.num_comp_vectors = 1; + shca->ib_device.dma_device = &shca->ofdev->dev; + shca->ib_device.query_device = ehca_query_device; + shca->ib_device.query_port = ehca_query_port; + shca->ib_device.query_gid = ehca_query_gid; + shca->ib_device.query_pkey = ehca_query_pkey; + /* shca->in_device.modify_device = ehca_modify_device */ + shca->ib_device.modify_port = ehca_modify_port; + shca->ib_device.alloc_ucontext = ehca_alloc_ucontext; + shca->ib_device.dealloc_ucontext = ehca_dealloc_ucontext; + shca->ib_device.alloc_pd = ehca_alloc_pd; + shca->ib_device.dealloc_pd = ehca_dealloc_pd; + shca->ib_device.create_ah = ehca_create_ah; + /* shca->ib_device.modify_ah = ehca_modify_ah; */ + shca->ib_device.query_ah = ehca_query_ah; + shca->ib_device.destroy_ah = ehca_destroy_ah; + shca->ib_device.create_qp = ehca_create_qp; + shca->ib_device.modify_qp = ehca_modify_qp; + shca->ib_device.query_qp = ehca_query_qp; + shca->ib_device.destroy_qp = ehca_destroy_qp; + shca->ib_device.post_send = ehca_post_send; + shca->ib_device.post_recv = ehca_post_recv; + shca->ib_device.create_cq = ehca_create_cq; + shca->ib_device.destroy_cq = ehca_destroy_cq; + shca->ib_device.resize_cq = ehca_resize_cq; + shca->ib_device.poll_cq = ehca_poll_cq; + /* shca->ib_device.peek_cq = ehca_peek_cq; */ + shca->ib_device.req_notify_cq = ehca_req_notify_cq; + /* shca->ib_device.req_ncomp_notif = 
ehca_req_ncomp_notif; */ + shca->ib_device.get_dma_mr = ehca_get_dma_mr; + shca->ib_device.reg_phys_mr = ehca_reg_phys_mr; + shca->ib_device.reg_user_mr = ehca_reg_user_mr; + shca->ib_device.query_mr = ehca_query_mr; + shca->ib_device.dereg_mr = ehca_dereg_mr; + shca->ib_device.rereg_phys_mr = ehca_rereg_phys_mr; + shca->ib_device.alloc_mw = ehca_alloc_mw; + shca->ib_device.bind_mw = ehca_bind_mw; + shca->ib_device.dealloc_mw = ehca_dealloc_mw; + shca->ib_device.alloc_fmr = ehca_alloc_fmr; + shca->ib_device.map_phys_fmr = ehca_map_phys_fmr; + shca->ib_device.unmap_fmr = ehca_unmap_fmr; + shca->ib_device.dealloc_fmr = ehca_dealloc_fmr; + shca->ib_device.attach_mcast = ehca_attach_mcast; + shca->ib_device.detach_mcast = ehca_detach_mcast; + shca->ib_device.process_mad = ehca_process_mad; + shca->ib_device.mmap = ehca_mmap; + shca->ib_device.dma_ops = &ehca_dma_mapping_ops; + + if (EHCA_BMASK_GET(HCA_CAP_SRQ, shca->hca_cap)) { + shca->ib_device.uverbs_cmd_mask |= + (1ull << IB_USER_VERBS_CMD_CREATE_SRQ) | + (1ull << IB_USER_VERBS_CMD_MODIFY_SRQ) | + (1ull << IB_USER_VERBS_CMD_QUERY_SRQ) | + (1ull << IB_USER_VERBS_CMD_DESTROY_SRQ); + + shca->ib_device.create_srq = ehca_create_srq; + shca->ib_device.modify_srq = ehca_modify_srq; + shca->ib_device.query_srq = ehca_query_srq; + shca->ib_device.destroy_srq = ehca_destroy_srq; + shca->ib_device.post_srq_recv = ehca_post_srq_recv; + } + + return ret; +} + +static int ehca_create_aqp1(struct ehca_shca *shca, u32 port) +{ + struct ehca_sport *sport = &shca->sport[port - 1]; + struct ib_cq *ibcq; + struct ib_qp *ibqp; + struct ib_qp_init_attr qp_init_attr; + int ret; + + if (sport->ibcq_aqp1) { + ehca_err(&shca->ib_device, "AQP1 CQ is already created."); + return -EPERM; + } + + ibcq = ib_create_cq(&shca->ib_device, NULL, NULL, (void *)(-1), 10, 0); + if (IS_ERR(ibcq)) { + ehca_err(&shca->ib_device, "Cannot create AQP1 CQ."); + return PTR_ERR(ibcq); + } + sport->ibcq_aqp1 = ibcq; + + if (sport->ibqp_sqp[IB_QPT_GSI]) { + ehca_err(&shca->ib_device, "AQP1 QP is already created."); + ret = -EPERM; + goto create_aqp1; + } + + memset(&qp_init_attr, 0, sizeof(struct ib_qp_init_attr)); + qp_init_attr.send_cq = ibcq; + qp_init_attr.recv_cq = ibcq; + qp_init_attr.sq_sig_type = IB_SIGNAL_ALL_WR; + qp_init_attr.cap.max_send_wr = 100; + qp_init_attr.cap.max_recv_wr = 100; + qp_init_attr.cap.max_send_sge = 2; + qp_init_attr.cap.max_recv_sge = 1; + qp_init_attr.qp_type = IB_QPT_GSI; + qp_init_attr.port_num = port; + qp_init_attr.qp_context = NULL; + qp_init_attr.event_handler = NULL; + qp_init_attr.srq = NULL; + + ibqp = ib_create_qp(&shca->pd->ib_pd, &qp_init_attr); + if (IS_ERR(ibqp)) { + ehca_err(&shca->ib_device, "Cannot create AQP1 QP."); + ret = PTR_ERR(ibqp); + goto create_aqp1; + } + sport->ibqp_sqp[IB_QPT_GSI] = ibqp; + + return 0; + +create_aqp1: + ib_destroy_cq(sport->ibcq_aqp1); + return ret; +} + +static int ehca_destroy_aqp1(struct ehca_sport *sport) +{ + int ret; + + ret = ib_destroy_qp(sport->ibqp_sqp[IB_QPT_GSI]); + if (ret) { + ehca_gen_err("Cannot destroy AQP1 QP. ret=%i", ret); + return ret; + } + + ret = ib_destroy_cq(sport->ibcq_aqp1); + if (ret) + ehca_gen_err("Cannot destroy AQP1 CQ. 
ret=%i", ret); + + return ret; +} + +static ssize_t ehca_show_debug_level(struct device_driver *ddp, char *buf) +{ + return snprintf(buf, PAGE_SIZE, "%d\n", ehca_debug_level); +} + +static ssize_t ehca_store_debug_level(struct device_driver *ddp, + const char *buf, size_t count) +{ + int value = (*buf) - '0'; + if (value >= 0 && value <= 9) + ehca_debug_level = value; + return 1; +} + +static DRIVER_ATTR(debug_level, S_IRUSR | S_IWUSR, + ehca_show_debug_level, ehca_store_debug_level); + +static struct attribute *ehca_drv_attrs[] = { + &driver_attr_debug_level.attr, + NULL +}; + +static struct attribute_group ehca_drv_attr_grp = { + .attrs = ehca_drv_attrs +}; + +static const struct attribute_group *ehca_drv_attr_groups[] = { + &ehca_drv_attr_grp, + NULL, +}; + +#define EHCA_RESOURCE_ATTR(name) \ +static ssize_t ehca_show_##name(struct device *dev, \ + struct device_attribute *attr, \ + char *buf) \ +{ \ + struct ehca_shca *shca; \ + struct hipz_query_hca *rblock; \ + int data; \ + \ + shca = dev_get_drvdata(dev); \ + \ + rblock = ehca_alloc_fw_ctrlblock(GFP_KERNEL); \ + if (!rblock) { \ + dev_err(dev, "Can't allocate rblock memory.\n"); \ + return 0; \ + } \ + \ + if (hipz_h_query_hca(shca->ipz_hca_handle, rblock) != H_SUCCESS) { \ + dev_err(dev, "Can't query device properties\n"); \ + ehca_free_fw_ctrlblock(rblock); \ + return 0; \ + } \ + \ + data = rblock->name; \ + ehca_free_fw_ctrlblock(rblock); \ + \ + if ((strcmp(#name, "num_ports") == 0) && (ehca_nr_ports == 1)) \ + return snprintf(buf, 256, "1\n"); \ + else \ + return snprintf(buf, 256, "%d\n", data); \ + \ +} \ +static DEVICE_ATTR(name, S_IRUGO, ehca_show_##name, NULL); + +EHCA_RESOURCE_ATTR(num_ports); +EHCA_RESOURCE_ATTR(hw_ver); +EHCA_RESOURCE_ATTR(max_eq); +EHCA_RESOURCE_ATTR(cur_eq); +EHCA_RESOURCE_ATTR(max_cq); +EHCA_RESOURCE_ATTR(cur_cq); +EHCA_RESOURCE_ATTR(max_qp); +EHCA_RESOURCE_ATTR(cur_qp); +EHCA_RESOURCE_ATTR(max_mr); +EHCA_RESOURCE_ATTR(cur_mr); +EHCA_RESOURCE_ATTR(max_mw); +EHCA_RESOURCE_ATTR(cur_mw); +EHCA_RESOURCE_ATTR(max_pd); +EHCA_RESOURCE_ATTR(max_ah); + +static ssize_t ehca_show_adapter_handle(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + struct ehca_shca *shca = dev_get_drvdata(dev); + + return sprintf(buf, "%llx\n", shca->ipz_hca_handle.handle); + +} +static DEVICE_ATTR(adapter_handle, S_IRUGO, ehca_show_adapter_handle, NULL); + +static struct attribute *ehca_dev_attrs[] = { + &dev_attr_adapter_handle.attr, + &dev_attr_num_ports.attr, + &dev_attr_hw_ver.attr, + &dev_attr_max_eq.attr, + &dev_attr_cur_eq.attr, + &dev_attr_max_cq.attr, + &dev_attr_cur_cq.attr, + &dev_attr_max_qp.attr, + &dev_attr_cur_qp.attr, + &dev_attr_max_mr.attr, + &dev_attr_cur_mr.attr, + &dev_attr_max_mw.attr, + &dev_attr_cur_mw.attr, + &dev_attr_max_pd.attr, + &dev_attr_max_ah.attr, + NULL +}; + +static struct attribute_group ehca_dev_attr_grp = { + .attrs = ehca_dev_attrs +}; + +static int ehca_probe(struct platform_device *dev) +{ + struct ehca_shca *shca; + const u64 *handle; + struct ib_pd *ibpd; + int ret, i, eq_size; + unsigned long flags; + + handle = of_get_property(dev->dev.of_node, "ibm,hca-handle", NULL); + if (!handle) { + ehca_gen_err("Cannot get eHCA handle for adapter: %s.", + dev->dev.of_node->full_name); + return -ENODEV; + } + + if (!(*handle)) { + ehca_gen_err("Wrong eHCA handle for adapter: %s.", + dev->dev.of_node->full_name); + return -ENODEV; + } + + shca = (struct ehca_shca *)ib_alloc_device(sizeof(*shca)); + if (!shca) { + ehca_gen_err("Cannot allocate shca memory."); + return -ENOMEM; 
+ } + + mutex_init(&shca->modify_mutex); + atomic_set(&shca->num_cqs, 0); + atomic_set(&shca->num_qps, 0); + shca->max_num_qps = ehca_max_qp; + shca->max_num_cqs = ehca_max_cq; + + for (i = 0; i < ARRAY_SIZE(shca->sport); i++) + spin_lock_init(&shca->sport[i].mod_sqp_lock); + + shca->ofdev = dev; + shca->ipz_hca_handle.handle = *handle; + dev_set_drvdata(&dev->dev, shca); + + ret = ehca_sense_attributes(shca); + if (ret < 0) { + ehca_gen_err("Cannot sense eHCA attributes."); + goto probe1; + } + + ret = ehca_init_device(shca); + if (ret) { + ehca_gen_err("Cannot init ehca device struct"); + goto probe1; + } + + eq_size = 2 * shca->max_num_cqs + 4 * shca->max_num_qps; + /* create event queues */ + ret = ehca_create_eq(shca, &shca->eq, EHCA_EQ, eq_size); + if (ret) { + ehca_err(&shca->ib_device, "Cannot create EQ."); + goto probe1; + } + + ret = ehca_create_eq(shca, &shca->neq, EHCA_NEQ, 513); + if (ret) { + ehca_err(&shca->ib_device, "Cannot create NEQ."); + goto probe3; + } + + /* create internal protection domain */ + ibpd = ehca_alloc_pd(&shca->ib_device, (void *)(-1), NULL); + if (IS_ERR(ibpd)) { + ehca_err(&shca->ib_device, "Cannot create internal PD."); + ret = PTR_ERR(ibpd); + goto probe4; + } + + shca->pd = container_of(ibpd, struct ehca_pd, ib_pd); + shca->pd->ib_pd.device = &shca->ib_device; + + /* create internal max MR */ + ret = ehca_reg_internal_maxmr(shca, shca->pd, &shca->maxmr); + + if (ret) { + ehca_err(&shca->ib_device, "Cannot create internal MR ret=%i", + ret); + goto probe5; + } + + ret = ib_register_device(&shca->ib_device, NULL); + if (ret) { + ehca_err(&shca->ib_device, + "ib_register_device() failed ret=%i", ret); + goto probe6; + } + + /* create AQP1 for port 1 */ + if (ehca_open_aqp1 == 1) { + shca->sport[0].port_state = IB_PORT_DOWN; + ret = ehca_create_aqp1(shca, 1); + if (ret) { + ehca_err(&shca->ib_device, + "Cannot create AQP1 for port 1."); + goto probe7; + } + } + + /* create AQP1 for port 2 */ + if ((ehca_open_aqp1 == 1) && (shca->num_ports == 2)) { + shca->sport[1].port_state = IB_PORT_DOWN; + ret = ehca_create_aqp1(shca, 2); + if (ret) { + ehca_err(&shca->ib_device, + "Cannot create AQP1 for port 2."); + goto probe8; + } + } + + ret = sysfs_create_group(&dev->dev.kobj, &ehca_dev_attr_grp); + if (ret) /* only complain; we can live without attributes */ + ehca_err(&shca->ib_device, + "Cannot create device attributes ret=%d", ret); + + spin_lock_irqsave(&shca_list_lock, flags); + list_add(&shca->shca_list, &shca_list); + spin_unlock_irqrestore(&shca_list_lock, flags); + + return 0; + +probe8: + ret = ehca_destroy_aqp1(&shca->sport[0]); + if (ret) + ehca_err(&shca->ib_device, + "Cannot destroy AQP1 for port 1. ret=%i", ret); + +probe7: + ib_unregister_device(&shca->ib_device); + +probe6: + ret = ehca_dereg_internal_maxmr(shca); + if (ret) + ehca_err(&shca->ib_device, + "Cannot destroy internal MR. ret=%x", ret); + +probe5: + ret = ehca_dealloc_pd(&shca->pd->ib_pd); + if (ret) + ehca_err(&shca->ib_device, + "Cannot destroy internal PD. ret=%x", ret); + +probe4: + ret = ehca_destroy_eq(shca, &shca->neq); + if (ret) + ehca_err(&shca->ib_device, + "Cannot destroy NEQ. ret=%x", ret); + +probe3: + ret = ehca_destroy_eq(shca, &shca->eq); + if (ret) + ehca_err(&shca->ib_device, + "Cannot destroy EQ. 
ret=%x", ret); + +probe1: + ib_dealloc_device(&shca->ib_device); + + return -EINVAL; +} + +static int ehca_remove(struct platform_device *dev) +{ + struct ehca_shca *shca = dev_get_drvdata(&dev->dev); + unsigned long flags; + int ret; + + sysfs_remove_group(&dev->dev.kobj, &ehca_dev_attr_grp); + + if (ehca_open_aqp1 == 1) { + int i; + for (i = 0; i < shca->num_ports; i++) { + ret = ehca_destroy_aqp1(&shca->sport[i]); + if (ret) + ehca_err(&shca->ib_device, + "Cannot destroy AQP1 for port %x " + "ret=%i", ret, i); + } + } + + ib_unregister_device(&shca->ib_device); + + ret = ehca_dereg_internal_maxmr(shca); + if (ret) + ehca_err(&shca->ib_device, + "Cannot destroy internal MR. ret=%i", ret); + + ret = ehca_dealloc_pd(&shca->pd->ib_pd); + if (ret) + ehca_err(&shca->ib_device, + "Cannot destroy internal PD. ret=%i", ret); + + ret = ehca_destroy_eq(shca, &shca->eq); + if (ret) + ehca_err(&shca->ib_device, "Cannot destroy EQ. ret=%i", ret); + + ret = ehca_destroy_eq(shca, &shca->neq); + if (ret) + ehca_err(&shca->ib_device, "Canot destroy NEQ. ret=%i", ret); + + ib_dealloc_device(&shca->ib_device); + + spin_lock_irqsave(&shca_list_lock, flags); + list_del(&shca->shca_list); + spin_unlock_irqrestore(&shca_list_lock, flags); + + return ret; +} + +static struct of_device_id ehca_device_table[] = +{ + { + .name = "lhca", + .compatible = "IBM,lhca", + }, + {}, +}; +MODULE_DEVICE_TABLE(of, ehca_device_table); + +static struct platform_driver ehca_driver = { + .probe = ehca_probe, + .remove = ehca_remove, + .driver = { + .name = "ehca", + .owner = THIS_MODULE, + .groups = ehca_drv_attr_groups, + .of_match_table = ehca_device_table, + }, +}; + +void ehca_poll_eqs(unsigned long data) +{ + struct ehca_shca *shca; + + spin_lock(&shca_list_lock); + list_for_each_entry(shca, &shca_list, shca_list) { + if (shca->eq.is_initialized) { + /* call deadman proc only if eq ptr does not change */ + struct ehca_eq *eq = &shca->eq; + int max = 3; + volatile u64 q_ofs, q_ofs2; + unsigned long flags; + spin_lock_irqsave(&eq->spinlock, flags); + q_ofs = eq->ipz_queue.current_q_offset; + spin_unlock_irqrestore(&eq->spinlock, flags); + do { + spin_lock_irqsave(&eq->spinlock, flags); + q_ofs2 = eq->ipz_queue.current_q_offset; + spin_unlock_irqrestore(&eq->spinlock, flags); + max--; + } while (q_ofs == q_ofs2 && max > 0); + if (q_ofs == q_ofs2) + ehca_process_eq(shca, 0); + } + } + mod_timer(&poll_eqs_timer, round_jiffies(jiffies + HZ)); + spin_unlock(&shca_list_lock); +} + +static int ehca_mem_notifier(struct notifier_block *nb, + unsigned long action, void *data) +{ + static unsigned long ehca_dmem_warn_time; + unsigned long flags; + + switch (action) { + case MEM_CANCEL_OFFLINE: + case MEM_CANCEL_ONLINE: + case MEM_ONLINE: + case MEM_OFFLINE: + return NOTIFY_OK; + case MEM_GOING_ONLINE: + case MEM_GOING_OFFLINE: + /* only ok if no hca is attached to the lpar */ + spin_lock_irqsave(&shca_list_lock, flags); + if (list_empty(&shca_list)) { + spin_unlock_irqrestore(&shca_list_lock, flags); + return NOTIFY_OK; + } else { + spin_unlock_irqrestore(&shca_list_lock, flags); + if (printk_timed_ratelimit(&ehca_dmem_warn_time, + 30 * 1000)) + ehca_gen_err("DMEM operations are not allowed" + "in conjunction with eHCA"); + return NOTIFY_BAD; + } + } + return NOTIFY_OK; +} + +static struct notifier_block ehca_mem_nb = { + .notifier_call = ehca_mem_notifier, +}; + +static int __init ehca_module_init(void) +{ + int ret; + + printk(KERN_INFO "eHCA Infiniband Device Driver " + "(Version " HCAD_VERSION ")\n"); + + ret = 
ehca_create_comp_pool(); + if (ret) { + ehca_gen_err("Cannot create comp pool."); + return ret; + } + + ret = ehca_create_slab_caches(); + if (ret) { + ehca_gen_err("Cannot create SLAB caches"); + ret = -ENOMEM; + goto module_init1; + } + + ret = ehca_create_busmap(); + if (ret) { + ehca_gen_err("Cannot create busmap."); + goto module_init2; + } + + ret = ibmebus_register_driver(&ehca_driver); + if (ret) { + ehca_gen_err("Cannot register eHCA device driver"); + ret = -EINVAL; + goto module_init3; + } + + ret = register_memory_notifier(&ehca_mem_nb); + if (ret) { + ehca_gen_err("Failed registering memory add/remove notifier"); + goto module_init4; + } + + if (ehca_poll_all_eqs != 1) { + ehca_gen_err("WARNING!!!"); + ehca_gen_err("It is possible to lose interrupts."); + } else { + init_timer(&poll_eqs_timer); + poll_eqs_timer.function = ehca_poll_eqs; + poll_eqs_timer.expires = jiffies + HZ; + add_timer(&poll_eqs_timer); + } + + return 0; + +module_init4: + ibmebus_unregister_driver(&ehca_driver); + +module_init3: + ehca_destroy_busmap(); + +module_init2: + ehca_destroy_slab_caches(); + +module_init1: + ehca_destroy_comp_pool(); + return ret; +}; + +static void __exit ehca_module_exit(void) +{ + if (ehca_poll_all_eqs == 1) + del_timer_sync(&poll_eqs_timer); + + ibmebus_unregister_driver(&ehca_driver); + + unregister_memory_notifier(&ehca_mem_nb); + + ehca_destroy_busmap(); + + ehca_destroy_slab_caches(); + + ehca_destroy_comp_pool(); + + idr_destroy(&ehca_cq_idr); + idr_destroy(&ehca_qp_idr); +}; + +module_init(ehca_module_init); +module_exit(ehca_module_exit); diff --git a/kernel/drivers/infiniband/hw/ehca/ehca_mcast.c b/kernel/drivers/infiniband/hw/ehca/ehca_mcast.c new file mode 100644 index 000000000..cec181532 --- /dev/null +++ b/kernel/drivers/infiniband/hw/ehca/ehca_mcast.c @@ -0,0 +1,131 @@ +/* + * IBM eServer eHCA Infiniband device driver for Linux on POWER + * + * mcast functions + * + * Authors: Khadija Souissi + * Waleri Fomin + * Reinhard Ernst + * Hoang-Nam Nguyen + * Heiko J Schick + * + * Copyright (c) 2005 IBM Corporation + * + * All rights reserved. + * + * This source code is distributed under a dual license of GPL v2.0 and OpenIB + * BSD. + * + * OpenIB BSD License + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials + * provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR + * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER + * IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include <linux/module.h> +#include <linux/err.h> +#include "ehca_classes.h" +#include "ehca_tools.h" +#include "ehca_qes.h" +#include "ehca_iverbs.h" +#include "hcp_if.h" + +#define MAX_MC_LID 0xFFFE +#define MIN_MC_LID 0xC000 /* Multicast limits */ +#define EHCA_VALID_MULTICAST_GID(gid) ((gid)[0] == 0xFF) +#define EHCA_VALID_MULTICAST_LID(lid) \ + (((lid) >= MIN_MC_LID) && ((lid) <= MAX_MC_LID)) + +int ehca_attach_mcast(struct ib_qp *ibqp, union ib_gid *gid, u16 lid) +{ + struct ehca_qp *my_qp = container_of(ibqp, struct ehca_qp, ib_qp); + struct ehca_shca *shca = container_of(ibqp->device, struct ehca_shca, + ib_device); + union ib_gid my_gid; + u64 subnet_prefix, interface_id, h_ret; + + if (ibqp->qp_type != IB_QPT_UD) { + ehca_err(ibqp->device, "invalid qp_type=%x", ibqp->qp_type); + return -EINVAL; + } + + if (!(EHCA_VALID_MULTICAST_GID(gid->raw))) { + ehca_err(ibqp->device, "invalid multicast gid"); + return -EINVAL; + } else if ((lid < MIN_MC_LID) || (lid > MAX_MC_LID)) { + ehca_err(ibqp->device, "invalid multicast lid=%x", lid); + return -EINVAL; + } + + memcpy(&my_gid, gid->raw, sizeof(union ib_gid)); + + subnet_prefix = be64_to_cpu(my_gid.global.subnet_prefix); + interface_id = be64_to_cpu(my_gid.global.interface_id); + h_ret = hipz_h_attach_mcqp(shca->ipz_hca_handle, + my_qp->ipz_qp_handle, + my_qp->galpas.kernel, + lid, subnet_prefix, interface_id); + if (h_ret != H_SUCCESS) + ehca_err(ibqp->device, + "ehca_qp=%p qp_num=%x hipz_h_attach_mcqp() failed " + "h_ret=%lli", my_qp, ibqp->qp_num, h_ret); + + return ehca2ib_return_code(h_ret); +} + +int ehca_detach_mcast(struct ib_qp *ibqp, union ib_gid *gid, u16 lid) +{ + struct ehca_qp *my_qp = container_of(ibqp, struct ehca_qp, ib_qp); + struct ehca_shca *shca = container_of(ibqp->pd->device, + struct ehca_shca, ib_device); + union ib_gid my_gid; + u64 subnet_prefix, interface_id, h_ret; + + if (ibqp->qp_type != IB_QPT_UD) { + ehca_err(ibqp->device, "invalid qp_type %x", ibqp->qp_type); + return -EINVAL; + } + + if (!(EHCA_VALID_MULTICAST_GID(gid->raw))) { + ehca_err(ibqp->device, "invalid multicast gid"); + return -EINVAL; + } else if ((lid < MIN_MC_LID) || (lid > MAX_MC_LID)) { + ehca_err(ibqp->device, "invalid multicast lid=%x", lid); + return -EINVAL; + } + + memcpy(&my_gid, gid->raw, sizeof(union ib_gid)); + + subnet_prefix = be64_to_cpu(my_gid.global.subnet_prefix); + interface_id = be64_to_cpu(my_gid.global.interface_id); + h_ret = hipz_h_detach_mcqp(shca->ipz_hca_handle, + my_qp->ipz_qp_handle, + my_qp->galpas.kernel, + lid, subnet_prefix, interface_id); + if (h_ret != H_SUCCESS) + ehca_err(ibqp->device, + "ehca_qp=%p qp_num=%x hipz_h_detach_mcqp() failed " + "h_ret=%lli", my_qp, ibqp->qp_num, h_ret); + + return ehca2ib_return_code(h_ret); +} diff --git a/kernel/drivers/infiniband/hw/ehca/ehca_mrmw.c b/kernel/drivers/infiniband/hw/ehca/ehca_mrmw.c new file mode 100644 index 000000000..f914b3099 --- /dev/null +++ b/kernel/drivers/infiniband/hw/ehca/ehca_mrmw.c @@ -0,0 +1,2593 @@ +/* + * IBM eServer eHCA Infiniband device driver for Linux on POWER + * + * MR/MW functions + * + * Authors: Dietmar Decker + * Christoph Raisch + * Hoang-Nam Nguyen + * + * Copyright (c) 2005 IBM Corporation + * + * All rights reserved. + * + * This source code is distributed under a dual license of GPL v2.0 and OpenIB + * BSD. 
+ * + * OpenIB BSD License + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials + * provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR + * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER + * IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#include <linux/slab.h> +#include <rdma/ib_umem.h> + +#include "ehca_iverbs.h" +#include "ehca_mrmw.h" +#include "hcp_if.h" +#include "hipz_hw.h" + +#define NUM_CHUNKS(length, chunk_size) \ + (((length) + (chunk_size - 1)) / (chunk_size)) + +/* max number of rpages (per hcall register_rpages) */ +#define MAX_RPAGES 512 + +/* DMEM toleration management */ +#define EHCA_SECTSHIFT SECTION_SIZE_BITS +#define EHCA_SECTSIZE (1UL << EHCA_SECTSHIFT) +#define EHCA_HUGEPAGESHIFT 34 +#define EHCA_HUGEPAGE_SIZE (1UL << EHCA_HUGEPAGESHIFT) +#define EHCA_HUGEPAGE_PFN_MASK ((EHCA_HUGEPAGE_SIZE - 1) >> PAGE_SHIFT) +#define EHCA_INVAL_ADDR 0xFFFFFFFFFFFFFFFFULL +#define EHCA_DIR_INDEX_SHIFT 13 /* 8k Entries in 64k block */ +#define EHCA_TOP_INDEX_SHIFT (EHCA_DIR_INDEX_SHIFT * 2) +#define EHCA_MAP_ENTRIES (1 << EHCA_DIR_INDEX_SHIFT) +#define EHCA_TOP_MAP_SIZE (0x10000) /* currently fixed map size */ +#define EHCA_DIR_MAP_SIZE (0x10000) +#define EHCA_ENT_MAP_SIZE (0x10000) +#define EHCA_INDEX_MASK (EHCA_MAP_ENTRIES - 1) + +static unsigned long ehca_mr_len; + +/* + * Memory map data structures + */ +struct ehca_dir_bmap { + u64 ent[EHCA_MAP_ENTRIES]; +}; +struct ehca_top_bmap { + struct ehca_dir_bmap *dir[EHCA_MAP_ENTRIES]; +}; +struct ehca_bmap { + struct ehca_top_bmap *top[EHCA_MAP_ENTRIES]; +}; + +static struct ehca_bmap *ehca_bmap; + +static struct kmem_cache *mr_cache; +static struct kmem_cache *mw_cache; + +enum ehca_mr_pgsize { + EHCA_MR_PGSIZE4K = 0x1000L, + EHCA_MR_PGSIZE64K = 0x10000L, + EHCA_MR_PGSIZE1M = 0x100000L, + EHCA_MR_PGSIZE16M = 0x1000000L +}; + +#define EHCA_MR_PGSHIFT4K 12 +#define EHCA_MR_PGSHIFT64K 16 +#define EHCA_MR_PGSHIFT1M 20 +#define EHCA_MR_PGSHIFT16M 24 + +static u64 ehca_map_vaddr(void *caddr); + +static u32 ehca_encode_hwpage_size(u32 pgsize) +{ + int log = ilog2(pgsize); + WARN_ON(log < 12 || log > 24 || log & 3); + return (log - 12) / 4; +} + +static u64 ehca_get_max_hwpage_size(struct ehca_shca *shca) +{ + return rounddown_pow_of_two(shca->hca_cap_mr_pgsize); +} + +static struct ehca_mr *ehca_mr_new(void) +{ + struct ehca_mr *me; + + me = kmem_cache_zalloc(mr_cache, GFP_KERNEL); + if (me) + spin_lock_init(&me->mrlock); + else + ehca_gen_err("alloc failed"); + + return me; +} + 
+static void ehca_mr_delete(struct ehca_mr *me) +{ + kmem_cache_free(mr_cache, me); +} + +static struct ehca_mw *ehca_mw_new(void) +{ + struct ehca_mw *me; + + me = kmem_cache_zalloc(mw_cache, GFP_KERNEL); + if (me) + spin_lock_init(&me->mwlock); + else + ehca_gen_err("alloc failed"); + + return me; +} + +static void ehca_mw_delete(struct ehca_mw *me) +{ + kmem_cache_free(mw_cache, me); +} + +/*----------------------------------------------------------------------*/ + +struct ib_mr *ehca_get_dma_mr(struct ib_pd *pd, int mr_access_flags) +{ + struct ib_mr *ib_mr; + int ret; + struct ehca_mr *e_maxmr; + struct ehca_pd *e_pd = container_of(pd, struct ehca_pd, ib_pd); + struct ehca_shca *shca = + container_of(pd->device, struct ehca_shca, ib_device); + + if (shca->maxmr) { + e_maxmr = ehca_mr_new(); + if (!e_maxmr) { + ehca_err(&shca->ib_device, "out of memory"); + ib_mr = ERR_PTR(-ENOMEM); + goto get_dma_mr_exit0; + } + + ret = ehca_reg_maxmr(shca, e_maxmr, + (void *)ehca_map_vaddr((void *)(KERNELBASE + PHYSICAL_START)), + mr_access_flags, e_pd, + &e_maxmr->ib.ib_mr.lkey, + &e_maxmr->ib.ib_mr.rkey); + if (ret) { + ehca_mr_delete(e_maxmr); + ib_mr = ERR_PTR(ret); + goto get_dma_mr_exit0; + } + ib_mr = &e_maxmr->ib.ib_mr; + } else { + ehca_err(&shca->ib_device, "no internal max-MR exist!"); + ib_mr = ERR_PTR(-EINVAL); + goto get_dma_mr_exit0; + } + +get_dma_mr_exit0: + if (IS_ERR(ib_mr)) + ehca_err(&shca->ib_device, "h_ret=%li pd=%p mr_access_flags=%x", + PTR_ERR(ib_mr), pd, mr_access_flags); + return ib_mr; +} /* end ehca_get_dma_mr() */ + +/*----------------------------------------------------------------------*/ + +struct ib_mr *ehca_reg_phys_mr(struct ib_pd *pd, + struct ib_phys_buf *phys_buf_array, + int num_phys_buf, + int mr_access_flags, + u64 *iova_start) +{ + struct ib_mr *ib_mr; + int ret; + struct ehca_mr *e_mr; + struct ehca_shca *shca = + container_of(pd->device, struct ehca_shca, ib_device); + struct ehca_pd *e_pd = container_of(pd, struct ehca_pd, ib_pd); + + u64 size; + + if ((num_phys_buf <= 0) || !phys_buf_array) { + ehca_err(pd->device, "bad input values: num_phys_buf=%x " + "phys_buf_array=%p", num_phys_buf, phys_buf_array); + ib_mr = ERR_PTR(-EINVAL); + goto reg_phys_mr_exit0; + } + if (((mr_access_flags & IB_ACCESS_REMOTE_WRITE) && + !(mr_access_flags & IB_ACCESS_LOCAL_WRITE)) || + ((mr_access_flags & IB_ACCESS_REMOTE_ATOMIC) && + !(mr_access_flags & IB_ACCESS_LOCAL_WRITE))) { + /* + * Remote Write Access requires Local Write Access + * Remote Atomic Access requires Local Write Access + */ + ehca_err(pd->device, "bad input values: mr_access_flags=%x", + mr_access_flags); + ib_mr = ERR_PTR(-EINVAL); + goto reg_phys_mr_exit0; + } + + /* check physical buffer list and calculate size */ + ret = ehca_mr_chk_buf_and_calc_size(phys_buf_array, num_phys_buf, + iova_start, &size); + if (ret) { + ib_mr = ERR_PTR(ret); + goto reg_phys_mr_exit0; + } + if ((size == 0) || + (((u64)iova_start + size) < (u64)iova_start)) { + ehca_err(pd->device, "bad input values: size=%llx iova_start=%p", + size, iova_start); + ib_mr = ERR_PTR(-EINVAL); + goto reg_phys_mr_exit0; + } + + e_mr = ehca_mr_new(); + if (!e_mr) { + ehca_err(pd->device, "out of memory"); + ib_mr = ERR_PTR(-ENOMEM); + goto reg_phys_mr_exit0; + } + + /* register MR on HCA */ + if (ehca_mr_is_maxmr(size, iova_start)) { + e_mr->flags |= EHCA_MR_FLAG_MAXMR; + ret = ehca_reg_maxmr(shca, e_mr, iova_start, mr_access_flags, + e_pd, &e_mr->ib.ib_mr.lkey, + &e_mr->ib.ib_mr.rkey); + if (ret) { + ib_mr = ERR_PTR(ret); + goto 
reg_phys_mr_exit1; + } + } else { + struct ehca_mr_pginfo pginfo; + u32 num_kpages; + u32 num_hwpages; + u64 hw_pgsize; + + num_kpages = NUM_CHUNKS(((u64)iova_start % PAGE_SIZE) + size, + PAGE_SIZE); + /* for kernel space we try most possible pgsize */ + hw_pgsize = ehca_get_max_hwpage_size(shca); + num_hwpages = NUM_CHUNKS(((u64)iova_start % hw_pgsize) + size, + hw_pgsize); + memset(&pginfo, 0, sizeof(pginfo)); + pginfo.type = EHCA_MR_PGI_PHYS; + pginfo.num_kpages = num_kpages; + pginfo.hwpage_size = hw_pgsize; + pginfo.num_hwpages = num_hwpages; + pginfo.u.phy.num_phys_buf = num_phys_buf; + pginfo.u.phy.phys_buf_array = phys_buf_array; + pginfo.next_hwpage = + ((u64)iova_start & ~PAGE_MASK) / hw_pgsize; + + ret = ehca_reg_mr(shca, e_mr, iova_start, size, mr_access_flags, + e_pd, &pginfo, &e_mr->ib.ib_mr.lkey, + &e_mr->ib.ib_mr.rkey, EHCA_REG_MR); + if (ret) { + ib_mr = ERR_PTR(ret); + goto reg_phys_mr_exit1; + } + } + + /* successful registration of all pages */ + return &e_mr->ib.ib_mr; + +reg_phys_mr_exit1: + ehca_mr_delete(e_mr); +reg_phys_mr_exit0: + if (IS_ERR(ib_mr)) + ehca_err(pd->device, "h_ret=%li pd=%p phys_buf_array=%p " + "num_phys_buf=%x mr_access_flags=%x iova_start=%p", + PTR_ERR(ib_mr), pd, phys_buf_array, + num_phys_buf, mr_access_flags, iova_start); + return ib_mr; +} /* end ehca_reg_phys_mr() */ + +/*----------------------------------------------------------------------*/ + +struct ib_mr *ehca_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, + u64 virt, int mr_access_flags, + struct ib_udata *udata) +{ + struct ib_mr *ib_mr; + struct ehca_mr *e_mr; + struct ehca_shca *shca = + container_of(pd->device, struct ehca_shca, ib_device); + struct ehca_pd *e_pd = container_of(pd, struct ehca_pd, ib_pd); + struct ehca_mr_pginfo pginfo; + int ret, page_shift; + u32 num_kpages; + u32 num_hwpages; + u64 hwpage_size; + + if (!pd) { + ehca_gen_err("bad pd=%p", pd); + return ERR_PTR(-EFAULT); + } + + if (((mr_access_flags & IB_ACCESS_REMOTE_WRITE) && + !(mr_access_flags & IB_ACCESS_LOCAL_WRITE)) || + ((mr_access_flags & IB_ACCESS_REMOTE_ATOMIC) && + !(mr_access_flags & IB_ACCESS_LOCAL_WRITE))) { + /* + * Remote Write Access requires Local Write Access + * Remote Atomic Access requires Local Write Access + */ + ehca_err(pd->device, "bad input values: mr_access_flags=%x", + mr_access_flags); + ib_mr = ERR_PTR(-EINVAL); + goto reg_user_mr_exit0; + } + + if (length == 0 || virt + length < virt) { + ehca_err(pd->device, "bad input values: length=%llx " + "virt_base=%llx", length, virt); + ib_mr = ERR_PTR(-EINVAL); + goto reg_user_mr_exit0; + } + + e_mr = ehca_mr_new(); + if (!e_mr) { + ehca_err(pd->device, "out of memory"); + ib_mr = ERR_PTR(-ENOMEM); + goto reg_user_mr_exit0; + } + + e_mr->umem = ib_umem_get(pd->uobject->context, start, length, + mr_access_flags, 0); + if (IS_ERR(e_mr->umem)) { + ib_mr = (void *)e_mr->umem; + goto reg_user_mr_exit1; + } + + if (e_mr->umem->page_size != PAGE_SIZE) { + ehca_err(pd->device, "page size not supported, " + "e_mr->umem->page_size=%x", e_mr->umem->page_size); + ib_mr = ERR_PTR(-EINVAL); + goto reg_user_mr_exit2; + } + + /* determine number of MR pages */ + num_kpages = NUM_CHUNKS((virt % PAGE_SIZE) + length, PAGE_SIZE); + /* select proper hw_pgsize */ + page_shift = PAGE_SHIFT; + if (e_mr->umem->hugetlb) { + /* determine page_shift, clamp between 4K and 16M */ + page_shift = (fls64(length - 1) + 3) & ~3; + page_shift = min(max(page_shift, EHCA_MR_PGSHIFT4K), + EHCA_MR_PGSHIFT16M); + } + hwpage_size = 1UL << page_shift; + + /* now that we 
have the desired page size, shift until it's + * supported, too. 4K is always supported, so this terminates. + */ + while (!(hwpage_size & shca->hca_cap_mr_pgsize)) + hwpage_size >>= 4; + +reg_user_mr_fallback: + num_hwpages = NUM_CHUNKS((virt % hwpage_size) + length, hwpage_size); + /* register MR on HCA */ + memset(&pginfo, 0, sizeof(pginfo)); + pginfo.type = EHCA_MR_PGI_USER; + pginfo.hwpage_size = hwpage_size; + pginfo.num_kpages = num_kpages; + pginfo.num_hwpages = num_hwpages; + pginfo.u.usr.region = e_mr->umem; + pginfo.next_hwpage = ib_umem_offset(e_mr->umem) / hwpage_size; + pginfo.u.usr.next_sg = pginfo.u.usr.region->sg_head.sgl; + ret = ehca_reg_mr(shca, e_mr, (u64 *)virt, length, mr_access_flags, + e_pd, &pginfo, &e_mr->ib.ib_mr.lkey, + &e_mr->ib.ib_mr.rkey, EHCA_REG_MR); + if (ret == -EINVAL && pginfo.hwpage_size > PAGE_SIZE) { + ehca_warn(pd->device, "failed to register mr " + "with hwpage_size=%llx", hwpage_size); + ehca_info(pd->device, "try to register mr with " + "kpage_size=%lx", PAGE_SIZE); + /* + * this means kpages are not contiguous for a hw page + * try kernel page size as fallback solution + */ + hwpage_size = PAGE_SIZE; + goto reg_user_mr_fallback; + } + if (ret) { + ib_mr = ERR_PTR(ret); + goto reg_user_mr_exit2; + } + + /* successful registration of all pages */ + return &e_mr->ib.ib_mr; + +reg_user_mr_exit2: + ib_umem_release(e_mr->umem); +reg_user_mr_exit1: + ehca_mr_delete(e_mr); +reg_user_mr_exit0: + if (IS_ERR(ib_mr)) + ehca_err(pd->device, "rc=%li pd=%p mr_access_flags=%x udata=%p", + PTR_ERR(ib_mr), pd, mr_access_flags, udata); + return ib_mr; +} /* end ehca_reg_user_mr() */ + +/*----------------------------------------------------------------------*/ + +int ehca_rereg_phys_mr(struct ib_mr *mr, + int mr_rereg_mask, + struct ib_pd *pd, + struct ib_phys_buf *phys_buf_array, + int num_phys_buf, + int mr_access_flags, + u64 *iova_start) +{ + int ret; + + struct ehca_shca *shca = + container_of(mr->device, struct ehca_shca, ib_device); + struct ehca_mr *e_mr = container_of(mr, struct ehca_mr, ib.ib_mr); + u64 new_size; + u64 *new_start; + u32 new_acl; + struct ehca_pd *new_pd; + u32 tmp_lkey, tmp_rkey; + unsigned long sl_flags; + u32 num_kpages = 0; + u32 num_hwpages = 0; + struct ehca_mr_pginfo pginfo; + + if (!(mr_rereg_mask & IB_MR_REREG_TRANS)) { + /* TODO not supported, because PHYP rereg hCall needs pages */ + ehca_err(mr->device, "rereg without IB_MR_REREG_TRANS not " + "supported yet, mr_rereg_mask=%x", mr_rereg_mask); + ret = -EINVAL; + goto rereg_phys_mr_exit0; + } + + if (mr_rereg_mask & IB_MR_REREG_PD) { + if (!pd) { + ehca_err(mr->device, "rereg with bad pd, pd=%p " + "mr_rereg_mask=%x", pd, mr_rereg_mask); + ret = -EINVAL; + goto rereg_phys_mr_exit0; + } + } + + if ((mr_rereg_mask & + ~(IB_MR_REREG_TRANS | IB_MR_REREG_PD | IB_MR_REREG_ACCESS)) || + (mr_rereg_mask == 0)) { + ret = -EINVAL; + goto rereg_phys_mr_exit0; + } + + /* check other parameters */ + if (e_mr == shca->maxmr) { + /* should be impossible, however reject to be sure */ + ehca_err(mr->device, "rereg internal max-MR impossible, mr=%p " + "shca->maxmr=%p mr->lkey=%x", + mr, shca->maxmr, mr->lkey); + ret = -EINVAL; + goto rereg_phys_mr_exit0; + } + if (mr_rereg_mask & IB_MR_REREG_TRANS) { /* transl., i.e. 
addr/size */ + if (e_mr->flags & EHCA_MR_FLAG_FMR) { + ehca_err(mr->device, "not supported for FMR, mr=%p " + "flags=%x", mr, e_mr->flags); + ret = -EINVAL; + goto rereg_phys_mr_exit0; + } + if (!phys_buf_array || num_phys_buf <= 0) { + ehca_err(mr->device, "bad input values mr_rereg_mask=%x" + " phys_buf_array=%p num_phys_buf=%x", + mr_rereg_mask, phys_buf_array, num_phys_buf); + ret = -EINVAL; + goto rereg_phys_mr_exit0; + } + } + if ((mr_rereg_mask & IB_MR_REREG_ACCESS) && /* change ACL */ + (((mr_access_flags & IB_ACCESS_REMOTE_WRITE) && + !(mr_access_flags & IB_ACCESS_LOCAL_WRITE)) || + ((mr_access_flags & IB_ACCESS_REMOTE_ATOMIC) && + !(mr_access_flags & IB_ACCESS_LOCAL_WRITE)))) { + /* + * Remote Write Access requires Local Write Access + * Remote Atomic Access requires Local Write Access + */ + ehca_err(mr->device, "bad input values: mr_rereg_mask=%x " + "mr_access_flags=%x", mr_rereg_mask, mr_access_flags); + ret = -EINVAL; + goto rereg_phys_mr_exit0; + } + + /* set requested values dependent on rereg request */ + spin_lock_irqsave(&e_mr->mrlock, sl_flags); + new_start = e_mr->start; + new_size = e_mr->size; + new_acl = e_mr->acl; + new_pd = container_of(mr->pd, struct ehca_pd, ib_pd); + + if (mr_rereg_mask & IB_MR_REREG_TRANS) { + u64 hw_pgsize = ehca_get_max_hwpage_size(shca); + + new_start = iova_start; /* change address */ + /* check physical buffer list and calculate size */ + ret = ehca_mr_chk_buf_and_calc_size(phys_buf_array, + num_phys_buf, iova_start, + &new_size); + if (ret) + goto rereg_phys_mr_exit1; + if ((new_size == 0) || + (((u64)iova_start + new_size) < (u64)iova_start)) { + ehca_err(mr->device, "bad input values: new_size=%llx " + "iova_start=%p", new_size, iova_start); + ret = -EINVAL; + goto rereg_phys_mr_exit1; + } + num_kpages = NUM_CHUNKS(((u64)new_start % PAGE_SIZE) + + new_size, PAGE_SIZE); + num_hwpages = NUM_CHUNKS(((u64)new_start % hw_pgsize) + + new_size, hw_pgsize); + memset(&pginfo, 0, sizeof(pginfo)); + pginfo.type = EHCA_MR_PGI_PHYS; + pginfo.num_kpages = num_kpages; + pginfo.hwpage_size = hw_pgsize; + pginfo.num_hwpages = num_hwpages; + pginfo.u.phy.num_phys_buf = num_phys_buf; + pginfo.u.phy.phys_buf_array = phys_buf_array; + pginfo.next_hwpage = + ((u64)iova_start & ~PAGE_MASK) / hw_pgsize; + } + if (mr_rereg_mask & IB_MR_REREG_ACCESS) + new_acl = mr_access_flags; + if (mr_rereg_mask & IB_MR_REREG_PD) + new_pd = container_of(pd, struct ehca_pd, ib_pd); + + ret = ehca_rereg_mr(shca, e_mr, new_start, new_size, new_acl, + new_pd, &pginfo, &tmp_lkey, &tmp_rkey); + if (ret) + goto rereg_phys_mr_exit1; + + /* successful reregistration */ + if (mr_rereg_mask & IB_MR_REREG_PD) + mr->pd = pd; + mr->lkey = tmp_lkey; + mr->rkey = tmp_rkey; + +rereg_phys_mr_exit1: + spin_unlock_irqrestore(&e_mr->mrlock, sl_flags); +rereg_phys_mr_exit0: + if (ret) + ehca_err(mr->device, "ret=%i mr=%p mr_rereg_mask=%x pd=%p " + "phys_buf_array=%p num_phys_buf=%x mr_access_flags=%x " + "iova_start=%p", + ret, mr, mr_rereg_mask, pd, phys_buf_array, + num_phys_buf, mr_access_flags, iova_start); + return ret; +} /* end ehca_rereg_phys_mr() */ + +/*----------------------------------------------------------------------*/ + +int ehca_query_mr(struct ib_mr *mr, struct ib_mr_attr *mr_attr) +{ + int ret = 0; + u64 h_ret; + struct ehca_shca *shca = + container_of(mr->device, struct ehca_shca, ib_device); + struct ehca_mr *e_mr = container_of(mr, struct ehca_mr, ib.ib_mr); + unsigned long sl_flags; + struct ehca_mr_hipzout_parms hipzout; + + if ((e_mr->flags & EHCA_MR_FLAG_FMR)) { + 
ehca_err(mr->device, "not supported for FMR, mr=%p e_mr=%p " + "e_mr->flags=%x", mr, e_mr, e_mr->flags); + ret = -EINVAL; + goto query_mr_exit0; + } + + memset(mr_attr, 0, sizeof(struct ib_mr_attr)); + spin_lock_irqsave(&e_mr->mrlock, sl_flags); + + h_ret = hipz_h_query_mr(shca->ipz_hca_handle, e_mr, &hipzout); + if (h_ret != H_SUCCESS) { + ehca_err(mr->device, "hipz_mr_query failed, h_ret=%lli mr=%p " + "hca_hndl=%llx mr_hndl=%llx lkey=%x", + h_ret, mr, shca->ipz_hca_handle.handle, + e_mr->ipz_mr_handle.handle, mr->lkey); + ret = ehca2ib_return_code(h_ret); + goto query_mr_exit1; + } + mr_attr->pd = mr->pd; + mr_attr->device_virt_addr = hipzout.vaddr; + mr_attr->size = hipzout.len; + mr_attr->lkey = hipzout.lkey; + mr_attr->rkey = hipzout.rkey; + ehca_mrmw_reverse_map_acl(&hipzout.acl, &mr_attr->mr_access_flags); + +query_mr_exit1: + spin_unlock_irqrestore(&e_mr->mrlock, sl_flags); +query_mr_exit0: + if (ret) + ehca_err(mr->device, "ret=%i mr=%p mr_attr=%p", + ret, mr, mr_attr); + return ret; +} /* end ehca_query_mr() */ + +/*----------------------------------------------------------------------*/ + +int ehca_dereg_mr(struct ib_mr *mr) +{ + int ret = 0; + u64 h_ret; + struct ehca_shca *shca = + container_of(mr->device, struct ehca_shca, ib_device); + struct ehca_mr *e_mr = container_of(mr, struct ehca_mr, ib.ib_mr); + + if ((e_mr->flags & EHCA_MR_FLAG_FMR)) { + ehca_err(mr->device, "not supported for FMR, mr=%p e_mr=%p " + "e_mr->flags=%x", mr, e_mr, e_mr->flags); + ret = -EINVAL; + goto dereg_mr_exit0; + } else if (e_mr == shca->maxmr) { + /* should be impossible, however reject to be sure */ + ehca_err(mr->device, "dereg internal max-MR impossible, mr=%p " + "shca->maxmr=%p mr->lkey=%x", + mr, shca->maxmr, mr->lkey); + ret = -EINVAL; + goto dereg_mr_exit0; + } + + /* TODO: BUSY: MR still has bound window(s) */ + h_ret = hipz_h_free_resource_mr(shca->ipz_hca_handle, e_mr); + if (h_ret != H_SUCCESS) { + ehca_err(mr->device, "hipz_free_mr failed, h_ret=%lli shca=%p " + "e_mr=%p hca_hndl=%llx mr_hndl=%llx mr->lkey=%x", + h_ret, shca, e_mr, shca->ipz_hca_handle.handle, + e_mr->ipz_mr_handle.handle, mr->lkey); + ret = ehca2ib_return_code(h_ret); + goto dereg_mr_exit0; + } + + if (e_mr->umem) + ib_umem_release(e_mr->umem); + + /* successful deregistration */ + ehca_mr_delete(e_mr); + +dereg_mr_exit0: + if (ret) + ehca_err(mr->device, "ret=%i mr=%p", ret, mr); + return ret; +} /* end ehca_dereg_mr() */ + +/*----------------------------------------------------------------------*/ + +struct ib_mw *ehca_alloc_mw(struct ib_pd *pd, enum ib_mw_type type) +{ + struct ib_mw *ib_mw; + u64 h_ret; + struct ehca_mw *e_mw; + struct ehca_pd *e_pd = container_of(pd, struct ehca_pd, ib_pd); + struct ehca_shca *shca = + container_of(pd->device, struct ehca_shca, ib_device); + struct ehca_mw_hipzout_parms hipzout; + + if (type != IB_MW_TYPE_1) + return ERR_PTR(-EINVAL); + + e_mw = ehca_mw_new(); + if (!e_mw) { + ib_mw = ERR_PTR(-ENOMEM); + goto alloc_mw_exit0; + } + + h_ret = hipz_h_alloc_resource_mw(shca->ipz_hca_handle, e_mw, + e_pd->fw_pd, &hipzout); + if (h_ret != H_SUCCESS) { + ehca_err(pd->device, "hipz_mw_allocate failed, h_ret=%lli " + "shca=%p hca_hndl=%llx mw=%p", + h_ret, shca, shca->ipz_hca_handle.handle, e_mw); + ib_mw = ERR_PTR(ehca2ib_return_code(h_ret)); + goto alloc_mw_exit1; + } + /* successful MW allocation */ + e_mw->ipz_mw_handle = hipzout.handle; + e_mw->ib_mw.rkey = hipzout.rkey; + return &e_mw->ib_mw; + +alloc_mw_exit1: + ehca_mw_delete(e_mw); +alloc_mw_exit0: + if (IS_ERR(ib_mw)) + 
ehca_err(pd->device, "h_ret=%li pd=%p", PTR_ERR(ib_mw), pd); + return ib_mw; +} /* end ehca_alloc_mw() */ + +/*----------------------------------------------------------------------*/ + +int ehca_bind_mw(struct ib_qp *qp, + struct ib_mw *mw, + struct ib_mw_bind *mw_bind) +{ + /* TODO: not supported up to now */ + ehca_gen_err("bind MW currently not supported by HCAD"); + + return -EPERM; +} /* end ehca_bind_mw() */ + +/*----------------------------------------------------------------------*/ + +int ehca_dealloc_mw(struct ib_mw *mw) +{ + u64 h_ret; + struct ehca_shca *shca = + container_of(mw->device, struct ehca_shca, ib_device); + struct ehca_mw *e_mw = container_of(mw, struct ehca_mw, ib_mw); + + h_ret = hipz_h_free_resource_mw(shca->ipz_hca_handle, e_mw); + if (h_ret != H_SUCCESS) { + ehca_err(mw->device, "hipz_free_mw failed, h_ret=%lli shca=%p " + "mw=%p rkey=%x hca_hndl=%llx mw_hndl=%llx", + h_ret, shca, mw, mw->rkey, shca->ipz_hca_handle.handle, + e_mw->ipz_mw_handle.handle); + return ehca2ib_return_code(h_ret); + } + /* successful deallocation */ + ehca_mw_delete(e_mw); + return 0; +} /* end ehca_dealloc_mw() */ + +/*----------------------------------------------------------------------*/ + +struct ib_fmr *ehca_alloc_fmr(struct ib_pd *pd, + int mr_access_flags, + struct ib_fmr_attr *fmr_attr) +{ + struct ib_fmr *ib_fmr; + struct ehca_shca *shca = + container_of(pd->device, struct ehca_shca, ib_device); + struct ehca_pd *e_pd = container_of(pd, struct ehca_pd, ib_pd); + struct ehca_mr *e_fmr; + int ret; + u32 tmp_lkey, tmp_rkey; + struct ehca_mr_pginfo pginfo; + u64 hw_pgsize; + + /* check other parameters */ + if (((mr_access_flags & IB_ACCESS_REMOTE_WRITE) && + !(mr_access_flags & IB_ACCESS_LOCAL_WRITE)) || + ((mr_access_flags & IB_ACCESS_REMOTE_ATOMIC) && + !(mr_access_flags & IB_ACCESS_LOCAL_WRITE))) { + /* + * Remote Write Access requires Local Write Access + * Remote Atomic Access requires Local Write Access + */ + ehca_err(pd->device, "bad input values: mr_access_flags=%x", + mr_access_flags); + ib_fmr = ERR_PTR(-EINVAL); + goto alloc_fmr_exit0; + } + if (mr_access_flags & IB_ACCESS_MW_BIND) { + ehca_err(pd->device, "bad input values: mr_access_flags=%x", + mr_access_flags); + ib_fmr = ERR_PTR(-EINVAL); + goto alloc_fmr_exit0; + } + if ((fmr_attr->max_pages == 0) || (fmr_attr->max_maps == 0)) { + ehca_err(pd->device, "bad input values: fmr_attr->max_pages=%x " + "fmr_attr->max_maps=%x fmr_attr->page_shift=%x", + fmr_attr->max_pages, fmr_attr->max_maps, + fmr_attr->page_shift); + ib_fmr = ERR_PTR(-EINVAL); + goto alloc_fmr_exit0; + } + + hw_pgsize = 1 << fmr_attr->page_shift; + if (!(hw_pgsize & shca->hca_cap_mr_pgsize)) { + ehca_err(pd->device, "unsupported fmr_attr->page_shift=%x", + fmr_attr->page_shift); + ib_fmr = ERR_PTR(-EINVAL); + goto alloc_fmr_exit0; + } + + e_fmr = ehca_mr_new(); + if (!e_fmr) { + ib_fmr = ERR_PTR(-ENOMEM); + goto alloc_fmr_exit0; + } + e_fmr->flags |= EHCA_MR_FLAG_FMR; + + /* register MR on HCA */ + memset(&pginfo, 0, sizeof(pginfo)); + pginfo.hwpage_size = hw_pgsize; + /* + * pginfo.num_hwpages==0, ie register_rpages() will not be called + * but deferred to map_phys_fmr() + */ + ret = ehca_reg_mr(shca, e_fmr, NULL, + fmr_attr->max_pages * (1 << fmr_attr->page_shift), + mr_access_flags, e_pd, &pginfo, + &tmp_lkey, &tmp_rkey, EHCA_REG_MR); + if (ret) { + ib_fmr = ERR_PTR(ret); + goto alloc_fmr_exit1; + } + + /* successful */ + e_fmr->hwpage_size = hw_pgsize; + e_fmr->fmr_page_size = 1 << fmr_attr->page_shift; + e_fmr->fmr_max_pages = 
fmr_attr->max_pages; + e_fmr->fmr_max_maps = fmr_attr->max_maps; + e_fmr->fmr_map_cnt = 0; + return &e_fmr->ib.ib_fmr; + +alloc_fmr_exit1: + ehca_mr_delete(e_fmr); +alloc_fmr_exit0: + return ib_fmr; +} /* end ehca_alloc_fmr() */ + +/*----------------------------------------------------------------------*/ + +int ehca_map_phys_fmr(struct ib_fmr *fmr, + u64 *page_list, + int list_len, + u64 iova) +{ + int ret; + struct ehca_shca *shca = + container_of(fmr->device, struct ehca_shca, ib_device); + struct ehca_mr *e_fmr = container_of(fmr, struct ehca_mr, ib.ib_fmr); + struct ehca_pd *e_pd = container_of(fmr->pd, struct ehca_pd, ib_pd); + struct ehca_mr_pginfo pginfo; + u32 tmp_lkey, tmp_rkey; + + if (!(e_fmr->flags & EHCA_MR_FLAG_FMR)) { + ehca_err(fmr->device, "not a FMR, e_fmr=%p e_fmr->flags=%x", + e_fmr, e_fmr->flags); + ret = -EINVAL; + goto map_phys_fmr_exit0; + } + ret = ehca_fmr_check_page_list(e_fmr, page_list, list_len); + if (ret) + goto map_phys_fmr_exit0; + if (iova % e_fmr->fmr_page_size) { + /* only whole-numbered pages */ + ehca_err(fmr->device, "bad iova, iova=%llx fmr_page_size=%x", + iova, e_fmr->fmr_page_size); + ret = -EINVAL; + goto map_phys_fmr_exit0; + } + if (e_fmr->fmr_map_cnt >= e_fmr->fmr_max_maps) { + /* HCAD does not limit the maps, however trace this anyway */ + ehca_info(fmr->device, "map limit exceeded, fmr=%p " + "e_fmr->fmr_map_cnt=%x e_fmr->fmr_max_maps=%x", + fmr, e_fmr->fmr_map_cnt, e_fmr->fmr_max_maps); + } + + memset(&pginfo, 0, sizeof(pginfo)); + pginfo.type = EHCA_MR_PGI_FMR; + pginfo.num_kpages = list_len; + pginfo.hwpage_size = e_fmr->hwpage_size; + pginfo.num_hwpages = + list_len * e_fmr->fmr_page_size / pginfo.hwpage_size; + pginfo.u.fmr.page_list = page_list; + pginfo.next_hwpage = + (iova & (e_fmr->fmr_page_size-1)) / pginfo.hwpage_size; + pginfo.u.fmr.fmr_pgsize = e_fmr->fmr_page_size; + + ret = ehca_rereg_mr(shca, e_fmr, (u64 *)iova, + list_len * e_fmr->fmr_page_size, + e_fmr->acl, e_pd, &pginfo, &tmp_lkey, &tmp_rkey); + if (ret) + goto map_phys_fmr_exit0; + + /* successful reregistration */ + e_fmr->fmr_map_cnt++; + e_fmr->ib.ib_fmr.lkey = tmp_lkey; + e_fmr->ib.ib_fmr.rkey = tmp_rkey; + return 0; + +map_phys_fmr_exit0: + if (ret) + ehca_err(fmr->device, "ret=%i fmr=%p page_list=%p list_len=%x " + "iova=%llx", ret, fmr, page_list, list_len, iova); + return ret; +} /* end ehca_map_phys_fmr() */ + +/*----------------------------------------------------------------------*/ + +int ehca_unmap_fmr(struct list_head *fmr_list) +{ + int ret = 0; + struct ib_fmr *ib_fmr; + struct ehca_shca *shca = NULL; + struct ehca_shca *prev_shca; + struct ehca_mr *e_fmr; + u32 num_fmr = 0; + u32 unmap_fmr_cnt = 0; + + /* check all FMR belong to same SHCA, and check internal flag */ + list_for_each_entry(ib_fmr, fmr_list, list) { + prev_shca = shca; + shca = container_of(ib_fmr->device, struct ehca_shca, + ib_device); + e_fmr = container_of(ib_fmr, struct ehca_mr, ib.ib_fmr); + if ((shca != prev_shca) && prev_shca) { + ehca_err(&shca->ib_device, "SHCA mismatch, shca=%p " + "prev_shca=%p e_fmr=%p", + shca, prev_shca, e_fmr); + ret = -EINVAL; + goto unmap_fmr_exit0; + } + if (!(e_fmr->flags & EHCA_MR_FLAG_FMR)) { + ehca_err(&shca->ib_device, "not a FMR, e_fmr=%p " + "e_fmr->flags=%x", e_fmr, e_fmr->flags); + ret = -EINVAL; + goto unmap_fmr_exit0; + } + num_fmr++; + } + + /* loop over all FMRs to unmap */ + list_for_each_entry(ib_fmr, fmr_list, list) { + unmap_fmr_cnt++; + e_fmr = container_of(ib_fmr, struct ehca_mr, ib.ib_fmr); + shca = container_of(ib_fmr->device, 
struct ehca_shca, + ib_device); + ret = ehca_unmap_one_fmr(shca, e_fmr); + if (ret) { + /* unmap failed, stop unmapping of rest of FMRs */ + ehca_err(&shca->ib_device, "unmap of one FMR failed, " + "stop rest, e_fmr=%p num_fmr=%x " + "unmap_fmr_cnt=%x lkey=%x", e_fmr, num_fmr, + unmap_fmr_cnt, e_fmr->ib.ib_fmr.lkey); + goto unmap_fmr_exit0; + } + } + +unmap_fmr_exit0: + if (ret) + ehca_gen_err("ret=%i fmr_list=%p num_fmr=%x unmap_fmr_cnt=%x", + ret, fmr_list, num_fmr, unmap_fmr_cnt); + return ret; +} /* end ehca_unmap_fmr() */ + +/*----------------------------------------------------------------------*/ + +int ehca_dealloc_fmr(struct ib_fmr *fmr) +{ + int ret; + u64 h_ret; + struct ehca_shca *shca = + container_of(fmr->device, struct ehca_shca, ib_device); + struct ehca_mr *e_fmr = container_of(fmr, struct ehca_mr, ib.ib_fmr); + + if (!(e_fmr->flags & EHCA_MR_FLAG_FMR)) { + ehca_err(fmr->device, "not a FMR, e_fmr=%p e_fmr->flags=%x", + e_fmr, e_fmr->flags); + ret = -EINVAL; + goto free_fmr_exit0; + } + + h_ret = hipz_h_free_resource_mr(shca->ipz_hca_handle, e_fmr); + if (h_ret != H_SUCCESS) { + ehca_err(fmr->device, "hipz_free_mr failed, h_ret=%lli e_fmr=%p " + "hca_hndl=%llx fmr_hndl=%llx fmr->lkey=%x", + h_ret, e_fmr, shca->ipz_hca_handle.handle, + e_fmr->ipz_mr_handle.handle, fmr->lkey); + ret = ehca2ib_return_code(h_ret); + goto free_fmr_exit0; + } + /* successful deregistration */ + ehca_mr_delete(e_fmr); + return 0; + +free_fmr_exit0: + if (ret) + ehca_err(&shca->ib_device, "ret=%i fmr=%p", ret, fmr); + return ret; +} /* end ehca_dealloc_fmr() */ + +/*----------------------------------------------------------------------*/ + +static int ehca_reg_bmap_mr_rpages(struct ehca_shca *shca, + struct ehca_mr *e_mr, + struct ehca_mr_pginfo *pginfo); + +int ehca_reg_mr(struct ehca_shca *shca, + struct ehca_mr *e_mr, + u64 *iova_start, + u64 size, + int acl, + struct ehca_pd *e_pd, + struct ehca_mr_pginfo *pginfo, + u32 *lkey, /*OUT*/ + u32 *rkey, /*OUT*/ + enum ehca_reg_type reg_type) +{ + int ret; + u64 h_ret; + u32 hipz_acl; + struct ehca_mr_hipzout_parms hipzout; + + ehca_mrmw_map_acl(acl, &hipz_acl); + ehca_mrmw_set_pgsize_hipz_acl(pginfo->hwpage_size, &hipz_acl); + if (ehca_use_hp_mr == 1) + hipz_acl |= 0x00000001; + + h_ret = hipz_h_alloc_resource_mr(shca->ipz_hca_handle, e_mr, + (u64)iova_start, size, hipz_acl, + e_pd->fw_pd, &hipzout); + if (h_ret != H_SUCCESS) { + ehca_err(&shca->ib_device, "hipz_alloc_mr failed, h_ret=%lli " + "hca_hndl=%llx", h_ret, shca->ipz_hca_handle.handle); + ret = ehca2ib_return_code(h_ret); + goto ehca_reg_mr_exit0; + } + + e_mr->ipz_mr_handle = hipzout.handle; + + if (reg_type == EHCA_REG_BUSMAP_MR) + ret = ehca_reg_bmap_mr_rpages(shca, e_mr, pginfo); + else if (reg_type == EHCA_REG_MR) + ret = ehca_reg_mr_rpages(shca, e_mr, pginfo); + else + ret = -EINVAL; + + if (ret) + goto ehca_reg_mr_exit1; + + /* successful registration */ + e_mr->num_kpages = pginfo->num_kpages; + e_mr->num_hwpages = pginfo->num_hwpages; + e_mr->hwpage_size = pginfo->hwpage_size; + e_mr->start = iova_start; + e_mr->size = size; + e_mr->acl = acl; + *lkey = hipzout.lkey; + *rkey = hipzout.rkey; + return 0; + +ehca_reg_mr_exit1: + h_ret = hipz_h_free_resource_mr(shca->ipz_hca_handle, e_mr); + if (h_ret != H_SUCCESS) { + ehca_err(&shca->ib_device, "h_ret=%lli shca=%p e_mr=%p " + "iova_start=%p size=%llx acl=%x e_pd=%p lkey=%x " + "pginfo=%p num_kpages=%llx num_hwpages=%llx ret=%i", + h_ret, shca, e_mr, iova_start, size, acl, e_pd, + hipzout.lkey, pginfo, pginfo->num_kpages, + 
pginfo->num_hwpages, ret); + ehca_err(&shca->ib_device, "internal error in ehca_reg_mr, " + "not recoverable"); + } +ehca_reg_mr_exit0: + if (ret) + ehca_err(&shca->ib_device, "ret=%i shca=%p e_mr=%p " + "iova_start=%p size=%llx acl=%x e_pd=%p pginfo=%p " + "num_kpages=%llx num_hwpages=%llx", + ret, shca, e_mr, iova_start, size, acl, e_pd, pginfo, + pginfo->num_kpages, pginfo->num_hwpages); + return ret; +} /* end ehca_reg_mr() */ + +/*----------------------------------------------------------------------*/ + +int ehca_reg_mr_rpages(struct ehca_shca *shca, + struct ehca_mr *e_mr, + struct ehca_mr_pginfo *pginfo) +{ + int ret = 0; + u64 h_ret; + u32 rnum; + u64 rpage; + u32 i; + u64 *kpage; + + if (!pginfo->num_hwpages) /* in case of fmr */ + return 0; + + kpage = ehca_alloc_fw_ctrlblock(GFP_KERNEL); + if (!kpage) { + ehca_err(&shca->ib_device, "kpage alloc failed"); + ret = -ENOMEM; + goto ehca_reg_mr_rpages_exit0; + } + + /* max MAX_RPAGES ehca mr pages per register call */ + for (i = 0; i < NUM_CHUNKS(pginfo->num_hwpages, MAX_RPAGES); i++) { + + if (i == NUM_CHUNKS(pginfo->num_hwpages, MAX_RPAGES) - 1) { + rnum = pginfo->num_hwpages % MAX_RPAGES; /* last shot */ + if (rnum == 0) + rnum = MAX_RPAGES; /* last shot is full */ + } else + rnum = MAX_RPAGES; + + ret = ehca_set_pagebuf(pginfo, rnum, kpage); + if (ret) { + ehca_err(&shca->ib_device, "ehca_set_pagebuf " + "bad rc, ret=%i rnum=%x kpage=%p", + ret, rnum, kpage); + goto ehca_reg_mr_rpages_exit1; + } + + if (rnum > 1) { + rpage = __pa(kpage); + if (!rpage) { + ehca_err(&shca->ib_device, "kpage=%p i=%x", + kpage, i); + ret = -EFAULT; + goto ehca_reg_mr_rpages_exit1; + } + } else + rpage = *kpage; + + h_ret = hipz_h_register_rpage_mr( + shca->ipz_hca_handle, e_mr, + ehca_encode_hwpage_size(pginfo->hwpage_size), + 0, rpage, rnum); + + if (i == NUM_CHUNKS(pginfo->num_hwpages, MAX_RPAGES) - 1) { + /* + * check for 'registration complete'==H_SUCCESS + * and for 'page registered'==H_PAGE_REGISTERED + */ + if (h_ret != H_SUCCESS) { + ehca_err(&shca->ib_device, "last " + "hipz_reg_rpage_mr failed, h_ret=%lli " + "e_mr=%p i=%x hca_hndl=%llx mr_hndl=%llx" + " lkey=%x", h_ret, e_mr, i, + shca->ipz_hca_handle.handle, + e_mr->ipz_mr_handle.handle, + e_mr->ib.ib_mr.lkey); + ret = ehca2ib_return_code(h_ret); + break; + } else + ret = 0; + } else if (h_ret != H_PAGE_REGISTERED) { + ehca_err(&shca->ib_device, "hipz_reg_rpage_mr failed, " + "h_ret=%lli e_mr=%p i=%x lkey=%x hca_hndl=%llx " + "mr_hndl=%llx", h_ret, e_mr, i, + e_mr->ib.ib_mr.lkey, + shca->ipz_hca_handle.handle, + e_mr->ipz_mr_handle.handle); + ret = ehca2ib_return_code(h_ret); + break; + } else + ret = 0; + } /* end for(i) */ + + +ehca_reg_mr_rpages_exit1: + ehca_free_fw_ctrlblock(kpage); +ehca_reg_mr_rpages_exit0: + if (ret) + ehca_err(&shca->ib_device, "ret=%i shca=%p e_mr=%p pginfo=%p " + "num_kpages=%llx num_hwpages=%llx", ret, shca, e_mr, + pginfo, pginfo->num_kpages, pginfo->num_hwpages); + return ret; +} /* end ehca_reg_mr_rpages() */ + +/*----------------------------------------------------------------------*/ + +inline int ehca_rereg_mr_rereg1(struct ehca_shca *shca, + struct ehca_mr *e_mr, + u64 *iova_start, + u64 size, + u32 acl, + struct ehca_pd *e_pd, + struct ehca_mr_pginfo *pginfo, + u32 *lkey, /*OUT*/ + u32 *rkey) /*OUT*/ +{ + int ret; + u64 h_ret; + u32 hipz_acl; + u64 *kpage; + u64 rpage; + struct ehca_mr_pginfo pginfo_save; + struct ehca_mr_hipzout_parms hipzout; + + ehca_mrmw_map_acl(acl, &hipz_acl); + ehca_mrmw_set_pgsize_hipz_acl(pginfo->hwpage_size, &hipz_acl); + + 
kpage = ehca_alloc_fw_ctrlblock(GFP_KERNEL); + if (!kpage) { + ehca_err(&shca->ib_device, "kpage alloc failed"); + ret = -ENOMEM; + goto ehca_rereg_mr_rereg1_exit0; + } + + pginfo_save = *pginfo; + ret = ehca_set_pagebuf(pginfo, pginfo->num_hwpages, kpage); + if (ret) { + ehca_err(&shca->ib_device, "set pagebuf failed, e_mr=%p " + "pginfo=%p type=%x num_kpages=%llx num_hwpages=%llx " + "kpage=%p", e_mr, pginfo, pginfo->type, + pginfo->num_kpages, pginfo->num_hwpages, kpage); + goto ehca_rereg_mr_rereg1_exit1; + } + rpage = __pa(kpage); + if (!rpage) { + ehca_err(&shca->ib_device, "kpage=%p", kpage); + ret = -EFAULT; + goto ehca_rereg_mr_rereg1_exit1; + } + h_ret = hipz_h_reregister_pmr(shca->ipz_hca_handle, e_mr, + (u64)iova_start, size, hipz_acl, + e_pd->fw_pd, rpage, &hipzout); + if (h_ret != H_SUCCESS) { + /* + * reregistration unsuccessful, try it again with the 3 hCalls, + * e.g. this is required in case H_MR_CONDITION + * (MW bound or MR is shared) + */ + ehca_warn(&shca->ib_device, "hipz_h_reregister_pmr failed " + "(Rereg1), h_ret=%lli e_mr=%p", h_ret, e_mr); + *pginfo = pginfo_save; + ret = -EAGAIN; + } else if ((u64 *)hipzout.vaddr != iova_start) { + ehca_err(&shca->ib_device, "PHYP changed iova_start in " + "rereg_pmr, iova_start=%p iova_start_out=%llx e_mr=%p " + "mr_handle=%llx lkey=%x lkey_out=%x", iova_start, + hipzout.vaddr, e_mr, e_mr->ipz_mr_handle.handle, + e_mr->ib.ib_mr.lkey, hipzout.lkey); + ret = -EFAULT; + } else { + /* + * successful reregistration + * note: start and start_out are identical for eServer HCAs + */ + e_mr->num_kpages = pginfo->num_kpages; + e_mr->num_hwpages = pginfo->num_hwpages; + e_mr->hwpage_size = pginfo->hwpage_size; + e_mr->start = iova_start; + e_mr->size = size; + e_mr->acl = acl; + *lkey = hipzout.lkey; + *rkey = hipzout.rkey; + } + +ehca_rereg_mr_rereg1_exit1: + ehca_free_fw_ctrlblock(kpage); +ehca_rereg_mr_rereg1_exit0: + if ( ret && (ret != -EAGAIN) ) + ehca_err(&shca->ib_device, "ret=%i lkey=%x rkey=%x " + "pginfo=%p num_kpages=%llx num_hwpages=%llx", + ret, *lkey, *rkey, pginfo, pginfo->num_kpages, + pginfo->num_hwpages); + return ret; +} /* end ehca_rereg_mr_rereg1() */ + +/*----------------------------------------------------------------------*/ + +int ehca_rereg_mr(struct ehca_shca *shca, + struct ehca_mr *e_mr, + u64 *iova_start, + u64 size, + int acl, + struct ehca_pd *e_pd, + struct ehca_mr_pginfo *pginfo, + u32 *lkey, + u32 *rkey) +{ + int ret = 0; + u64 h_ret; + int rereg_1_hcall = 1; /* 1: use hipz_h_reregister_pmr directly */ + int rereg_3_hcall = 0; /* 1: use 3 hipz calls for reregistration */ + + /* first determine reregistration hCall(s) */ + if ((pginfo->num_hwpages > MAX_RPAGES) || + (e_mr->num_hwpages > MAX_RPAGES) || + (pginfo->num_hwpages > e_mr->num_hwpages)) { + ehca_dbg(&shca->ib_device, "Rereg3 case, " + "pginfo->num_hwpages=%llx e_mr->num_hwpages=%x", + pginfo->num_hwpages, e_mr->num_hwpages); + rereg_1_hcall = 0; + rereg_3_hcall = 1; + } + + if (e_mr->flags & EHCA_MR_FLAG_MAXMR) { /* check for max-MR */ + rereg_1_hcall = 0; + rereg_3_hcall = 1; + e_mr->flags &= ~EHCA_MR_FLAG_MAXMR; + ehca_err(&shca->ib_device, "Rereg MR for max-MR! 
e_mr=%p", + e_mr); + } + + if (rereg_1_hcall) { + ret = ehca_rereg_mr_rereg1(shca, e_mr, iova_start, size, + acl, e_pd, pginfo, lkey, rkey); + if (ret) { + if (ret == -EAGAIN) + rereg_3_hcall = 1; + else + goto ehca_rereg_mr_exit0; + } + } + + if (rereg_3_hcall) { + struct ehca_mr save_mr; + + /* first deregister old MR */ + h_ret = hipz_h_free_resource_mr(shca->ipz_hca_handle, e_mr); + if (h_ret != H_SUCCESS) { + ehca_err(&shca->ib_device, "hipz_free_mr failed, " + "h_ret=%lli e_mr=%p hca_hndl=%llx mr_hndl=%llx " + "mr->lkey=%x", + h_ret, e_mr, shca->ipz_hca_handle.handle, + e_mr->ipz_mr_handle.handle, + e_mr->ib.ib_mr.lkey); + ret = ehca2ib_return_code(h_ret); + goto ehca_rereg_mr_exit0; + } + /* clean ehca_mr_t, without changing struct ib_mr and lock */ + save_mr = *e_mr; + ehca_mr_deletenew(e_mr); + + /* set some MR values */ + e_mr->flags = save_mr.flags; + e_mr->hwpage_size = save_mr.hwpage_size; + e_mr->fmr_page_size = save_mr.fmr_page_size; + e_mr->fmr_max_pages = save_mr.fmr_max_pages; + e_mr->fmr_max_maps = save_mr.fmr_max_maps; + e_mr->fmr_map_cnt = save_mr.fmr_map_cnt; + + ret = ehca_reg_mr(shca, e_mr, iova_start, size, acl, + e_pd, pginfo, lkey, rkey, EHCA_REG_MR); + if (ret) { + u32 offset = (u64)(&e_mr->flags) - (u64)e_mr; + memcpy(&e_mr->flags, &(save_mr.flags), + sizeof(struct ehca_mr) - offset); + goto ehca_rereg_mr_exit0; + } + } + +ehca_rereg_mr_exit0: + if (ret) + ehca_err(&shca->ib_device, "ret=%i shca=%p e_mr=%p " + "iova_start=%p size=%llx acl=%x e_pd=%p pginfo=%p " + "num_kpages=%llx lkey=%x rkey=%x rereg_1_hcall=%x " + "rereg_3_hcall=%x", ret, shca, e_mr, iova_start, size, + acl, e_pd, pginfo, pginfo->num_kpages, *lkey, *rkey, + rereg_1_hcall, rereg_3_hcall); + return ret; +} /* end ehca_rereg_mr() */ + +/*----------------------------------------------------------------------*/ + +int ehca_unmap_one_fmr(struct ehca_shca *shca, + struct ehca_mr *e_fmr) +{ + int ret = 0; + u64 h_ret; + struct ehca_pd *e_pd = + container_of(e_fmr->ib.ib_fmr.pd, struct ehca_pd, ib_pd); + struct ehca_mr save_fmr; + u32 tmp_lkey, tmp_rkey; + struct ehca_mr_pginfo pginfo; + struct ehca_mr_hipzout_parms hipzout; + struct ehca_mr save_mr; + + if (e_fmr->fmr_max_pages <= MAX_RPAGES) { + /* + * note: after using rereg hcall with len=0, + * rereg hcall must be used again for registering pages + */ + h_ret = hipz_h_reregister_pmr(shca->ipz_hca_handle, e_fmr, 0, + 0, 0, e_pd->fw_pd, 0, &hipzout); + if (h_ret == H_SUCCESS) { + /* successful reregistration */ + e_fmr->start = NULL; + e_fmr->size = 0; + tmp_lkey = hipzout.lkey; + tmp_rkey = hipzout.rkey; + return 0; + } + /* + * should not happen, because length checked above, + * FMRs are not shared and no MW bound to FMRs + */ + ehca_err(&shca->ib_device, "hipz_reregister_pmr failed " + "(Rereg1), h_ret=%lli e_fmr=%p hca_hndl=%llx " + "mr_hndl=%llx lkey=%x lkey_out=%x", + h_ret, e_fmr, shca->ipz_hca_handle.handle, + e_fmr->ipz_mr_handle.handle, + e_fmr->ib.ib_fmr.lkey, hipzout.lkey); + /* try free and rereg */ + } + + /* first free old FMR */ + h_ret = hipz_h_free_resource_mr(shca->ipz_hca_handle, e_fmr); + if (h_ret != H_SUCCESS) { + ehca_err(&shca->ib_device, "hipz_free_mr failed, " + "h_ret=%lli e_fmr=%p hca_hndl=%llx mr_hndl=%llx " + "lkey=%x", + h_ret, e_fmr, shca->ipz_hca_handle.handle, + e_fmr->ipz_mr_handle.handle, + e_fmr->ib.ib_fmr.lkey); + ret = ehca2ib_return_code(h_ret); + goto ehca_unmap_one_fmr_exit0; + } + /* clean ehca_mr_t, without changing lock */ + save_fmr = *e_fmr; + ehca_mr_deletenew(e_fmr); + + /* set some MR values */ + 
e_fmr->flags = save_fmr.flags; + e_fmr->hwpage_size = save_fmr.hwpage_size; + e_fmr->fmr_page_size = save_fmr.fmr_page_size; + e_fmr->fmr_max_pages = save_fmr.fmr_max_pages; + e_fmr->fmr_max_maps = save_fmr.fmr_max_maps; + e_fmr->fmr_map_cnt = save_fmr.fmr_map_cnt; + e_fmr->acl = save_fmr.acl; + + memset(&pginfo, 0, sizeof(pginfo)); + pginfo.type = EHCA_MR_PGI_FMR; + ret = ehca_reg_mr(shca, e_fmr, NULL, + (e_fmr->fmr_max_pages * e_fmr->fmr_page_size), + e_fmr->acl, e_pd, &pginfo, &tmp_lkey, + &tmp_rkey, EHCA_REG_MR); + if (ret) { + u32 offset = (u64)(&e_fmr->flags) - (u64)e_fmr; + memcpy(&e_fmr->flags, &(save_mr.flags), + sizeof(struct ehca_mr) - offset); + } + +ehca_unmap_one_fmr_exit0: + if (ret) + ehca_err(&shca->ib_device, "ret=%i tmp_lkey=%x tmp_rkey=%x " + "fmr_max_pages=%x", + ret, tmp_lkey, tmp_rkey, e_fmr->fmr_max_pages); + return ret; +} /* end ehca_unmap_one_fmr() */ + +/*----------------------------------------------------------------------*/ + +int ehca_reg_smr(struct ehca_shca *shca, + struct ehca_mr *e_origmr, + struct ehca_mr *e_newmr, + u64 *iova_start, + int acl, + struct ehca_pd *e_pd, + u32 *lkey, /*OUT*/ + u32 *rkey) /*OUT*/ +{ + int ret = 0; + u64 h_ret; + u32 hipz_acl; + struct ehca_mr_hipzout_parms hipzout; + + ehca_mrmw_map_acl(acl, &hipz_acl); + ehca_mrmw_set_pgsize_hipz_acl(e_origmr->hwpage_size, &hipz_acl); + + h_ret = hipz_h_register_smr(shca->ipz_hca_handle, e_newmr, e_origmr, + (u64)iova_start, hipz_acl, e_pd->fw_pd, + &hipzout); + if (h_ret != H_SUCCESS) { + ehca_err(&shca->ib_device, "hipz_reg_smr failed, h_ret=%lli " + "shca=%p e_origmr=%p e_newmr=%p iova_start=%p acl=%x " + "e_pd=%p hca_hndl=%llx mr_hndl=%llx lkey=%x", + h_ret, shca, e_origmr, e_newmr, iova_start, acl, e_pd, + shca->ipz_hca_handle.handle, + e_origmr->ipz_mr_handle.handle, + e_origmr->ib.ib_mr.lkey); + ret = ehca2ib_return_code(h_ret); + goto ehca_reg_smr_exit0; + } + /* successful registration */ + e_newmr->num_kpages = e_origmr->num_kpages; + e_newmr->num_hwpages = e_origmr->num_hwpages; + e_newmr->hwpage_size = e_origmr->hwpage_size; + e_newmr->start = iova_start; + e_newmr->size = e_origmr->size; + e_newmr->acl = acl; + e_newmr->ipz_mr_handle = hipzout.handle; + *lkey = hipzout.lkey; + *rkey = hipzout.rkey; + return 0; + +ehca_reg_smr_exit0: + if (ret) + ehca_err(&shca->ib_device, "ret=%i shca=%p e_origmr=%p " + "e_newmr=%p iova_start=%p acl=%x e_pd=%p", + ret, shca, e_origmr, e_newmr, iova_start, acl, e_pd); + return ret; +} /* end ehca_reg_smr() */ + +/*----------------------------------------------------------------------*/ +static inline void *ehca_calc_sectbase(int top, int dir, int idx) +{ + unsigned long ret = idx; + ret |= dir << EHCA_DIR_INDEX_SHIFT; + ret |= top << EHCA_TOP_INDEX_SHIFT; + return __va(ret << SECTION_SIZE_BITS); +} + +#define ehca_bmap_valid(entry) \ + ((u64)entry != (u64)EHCA_INVAL_ADDR) + +static u64 ehca_reg_mr_section(int top, int dir, int idx, u64 *kpage, + struct ehca_shca *shca, struct ehca_mr *mr, + struct ehca_mr_pginfo *pginfo) +{ + u64 h_ret = 0; + unsigned long page = 0; + u64 rpage = __pa(kpage); + int page_count; + + void *sectbase = ehca_calc_sectbase(top, dir, idx); + if ((unsigned long)sectbase & (pginfo->hwpage_size - 1)) { + ehca_err(&shca->ib_device, "reg_mr_section will probably fail:" + "hwpage_size does not fit to " + "section start address"); + } + page_count = EHCA_SECTSIZE / pginfo->hwpage_size; + + while (page < page_count) { + u64 rnum; + for (rnum = 0; (rnum < MAX_RPAGES) && (page < page_count); + rnum++) { + void *pg = sectbase + 
((page++) * pginfo->hwpage_size); + kpage[rnum] = __pa(pg); + } + + h_ret = hipz_h_register_rpage_mr(shca->ipz_hca_handle, mr, + ehca_encode_hwpage_size(pginfo->hwpage_size), + 0, rpage, rnum); + + if ((h_ret != H_SUCCESS) && (h_ret != H_PAGE_REGISTERED)) { + ehca_err(&shca->ib_device, "register_rpage_mr failed"); + return h_ret; + } + } + return h_ret; +} + +static u64 ehca_reg_mr_sections(int top, int dir, u64 *kpage, + struct ehca_shca *shca, struct ehca_mr *mr, + struct ehca_mr_pginfo *pginfo) +{ + u64 hret = H_SUCCESS; + int idx; + + for (idx = 0; idx < EHCA_MAP_ENTRIES; idx++) { + if (!ehca_bmap_valid(ehca_bmap->top[top]->dir[dir]->ent[idx])) + continue; + + hret = ehca_reg_mr_section(top, dir, idx, kpage, shca, mr, + pginfo); + if ((hret != H_SUCCESS) && (hret != H_PAGE_REGISTERED)) + return hret; + } + return hret; +} + +static u64 ehca_reg_mr_dir_sections(int top, u64 *kpage, struct ehca_shca *shca, + struct ehca_mr *mr, + struct ehca_mr_pginfo *pginfo) +{ + u64 hret = H_SUCCESS; + int dir; + + for (dir = 0; dir < EHCA_MAP_ENTRIES; dir++) { + if (!ehca_bmap_valid(ehca_bmap->top[top]->dir[dir])) + continue; + + hret = ehca_reg_mr_sections(top, dir, kpage, shca, mr, pginfo); + if ((hret != H_SUCCESS) && (hret != H_PAGE_REGISTERED)) + return hret; + } + return hret; +} + +/* register internal max-MR to internal SHCA */ +int ehca_reg_internal_maxmr( + struct ehca_shca *shca, + struct ehca_pd *e_pd, + struct ehca_mr **e_maxmr) /*OUT*/ +{ + int ret; + struct ehca_mr *e_mr; + u64 *iova_start; + u64 size_maxmr; + struct ehca_mr_pginfo pginfo; + struct ib_phys_buf ib_pbuf; + u32 num_kpages; + u32 num_hwpages; + u64 hw_pgsize; + + if (!ehca_bmap) { + ret = -EFAULT; + goto ehca_reg_internal_maxmr_exit0; + } + + e_mr = ehca_mr_new(); + if (!e_mr) { + ehca_err(&shca->ib_device, "out of memory"); + ret = -ENOMEM; + goto ehca_reg_internal_maxmr_exit0; + } + e_mr->flags |= EHCA_MR_FLAG_MAXMR; + + /* register internal max-MR on HCA */ + size_maxmr = ehca_mr_len; + iova_start = (u64 *)ehca_map_vaddr((void *)(KERNELBASE + PHYSICAL_START)); + ib_pbuf.addr = 0; + ib_pbuf.size = size_maxmr; + num_kpages = NUM_CHUNKS(((u64)iova_start % PAGE_SIZE) + size_maxmr, + PAGE_SIZE); + hw_pgsize = ehca_get_max_hwpage_size(shca); + num_hwpages = NUM_CHUNKS(((u64)iova_start % hw_pgsize) + size_maxmr, + hw_pgsize); + + memset(&pginfo, 0, sizeof(pginfo)); + pginfo.type = EHCA_MR_PGI_PHYS; + pginfo.num_kpages = num_kpages; + pginfo.num_hwpages = num_hwpages; + pginfo.hwpage_size = hw_pgsize; + pginfo.u.phy.num_phys_buf = 1; + pginfo.u.phy.phys_buf_array = &ib_pbuf; + + ret = ehca_reg_mr(shca, e_mr, iova_start, size_maxmr, 0, e_pd, + &pginfo, &e_mr->ib.ib_mr.lkey, + &e_mr->ib.ib_mr.rkey, EHCA_REG_BUSMAP_MR); + if (ret) { + ehca_err(&shca->ib_device, "reg of internal max MR failed, " + "e_mr=%p iova_start=%p size_maxmr=%llx num_kpages=%x " + "num_hwpages=%x", e_mr, iova_start, size_maxmr, + num_kpages, num_hwpages); + goto ehca_reg_internal_maxmr_exit1; + } + + /* successful registration of all pages */ + e_mr->ib.ib_mr.device = e_pd->ib_pd.device; + e_mr->ib.ib_mr.pd = &e_pd->ib_pd; + e_mr->ib.ib_mr.uobject = NULL; + atomic_inc(&(e_pd->ib_pd.usecnt)); + atomic_set(&(e_mr->ib.ib_mr.usecnt), 0); + *e_maxmr = e_mr; + return 0; + +ehca_reg_internal_maxmr_exit1: + ehca_mr_delete(e_mr); +ehca_reg_internal_maxmr_exit0: + if (ret) + ehca_err(&shca->ib_device, "ret=%i shca=%p e_pd=%p e_maxmr=%p", + ret, shca, e_pd, e_maxmr); + return ret; +} /* end ehca_reg_internal_maxmr() */ + 
+/*----------------------------------------------------------------------*/ + +int ehca_reg_maxmr(struct ehca_shca *shca, + struct ehca_mr *e_newmr, + u64 *iova_start, + int acl, + struct ehca_pd *e_pd, + u32 *lkey, + u32 *rkey) +{ + u64 h_ret; + struct ehca_mr *e_origmr = shca->maxmr; + u32 hipz_acl; + struct ehca_mr_hipzout_parms hipzout; + + ehca_mrmw_map_acl(acl, &hipz_acl); + ehca_mrmw_set_pgsize_hipz_acl(e_origmr->hwpage_size, &hipz_acl); + + h_ret = hipz_h_register_smr(shca->ipz_hca_handle, e_newmr, e_origmr, + (u64)iova_start, hipz_acl, e_pd->fw_pd, + &hipzout); + if (h_ret != H_SUCCESS) { + ehca_err(&shca->ib_device, "hipz_reg_smr failed, h_ret=%lli " + "e_origmr=%p hca_hndl=%llx mr_hndl=%llx lkey=%x", + h_ret, e_origmr, shca->ipz_hca_handle.handle, + e_origmr->ipz_mr_handle.handle, + e_origmr->ib.ib_mr.lkey); + return ehca2ib_return_code(h_ret); + } + /* successful registration */ + e_newmr->num_kpages = e_origmr->num_kpages; + e_newmr->num_hwpages = e_origmr->num_hwpages; + e_newmr->hwpage_size = e_origmr->hwpage_size; + e_newmr->start = iova_start; + e_newmr->size = e_origmr->size; + e_newmr->acl = acl; + e_newmr->ipz_mr_handle = hipzout.handle; + *lkey = hipzout.lkey; + *rkey = hipzout.rkey; + return 0; +} /* end ehca_reg_maxmr() */ + +/*----------------------------------------------------------------------*/ + +int ehca_dereg_internal_maxmr(struct ehca_shca *shca) +{ + int ret; + struct ehca_mr *e_maxmr; + struct ib_pd *ib_pd; + + if (!shca->maxmr) { + ehca_err(&shca->ib_device, "bad call, shca=%p", shca); + ret = -EINVAL; + goto ehca_dereg_internal_maxmr_exit0; + } + + e_maxmr = shca->maxmr; + ib_pd = e_maxmr->ib.ib_mr.pd; + shca->maxmr = NULL; /* remove internal max-MR indication from SHCA */ + + ret = ehca_dereg_mr(&e_maxmr->ib.ib_mr); + if (ret) { + ehca_err(&shca->ib_device, "dereg internal max-MR failed, " + "ret=%i e_maxmr=%p shca=%p lkey=%x", + ret, e_maxmr, shca, e_maxmr->ib.ib_mr.lkey); + shca->maxmr = e_maxmr; + goto ehca_dereg_internal_maxmr_exit0; + } + + atomic_dec(&ib_pd->usecnt); + +ehca_dereg_internal_maxmr_exit0: + if (ret) + ehca_err(&shca->ib_device, "ret=%i shca=%p shca->maxmr=%p", + ret, shca, shca->maxmr); + return ret; +} /* end ehca_dereg_internal_maxmr() */ + +/*----------------------------------------------------------------------*/ + +/* + * check physical buffer array of MR verbs for validness and + * calculates MR size + */ +int ehca_mr_chk_buf_and_calc_size(struct ib_phys_buf *phys_buf_array, + int num_phys_buf, + u64 *iova_start, + u64 *size) +{ + struct ib_phys_buf *pbuf = phys_buf_array; + u64 size_count = 0; + u32 i; + + if (num_phys_buf == 0) { + ehca_gen_err("bad phys buf array len, num_phys_buf=0"); + return -EINVAL; + } + /* check first buffer */ + if (((u64)iova_start & ~PAGE_MASK) != (pbuf->addr & ~PAGE_MASK)) { + ehca_gen_err("iova_start/addr mismatch, iova_start=%p " + "pbuf->addr=%llx pbuf->size=%llx", + iova_start, pbuf->addr, pbuf->size); + return -EINVAL; + } + if (((pbuf->addr + pbuf->size) % PAGE_SIZE) && + (num_phys_buf > 1)) { + ehca_gen_err("addr/size mismatch in 1st buf, pbuf->addr=%llx " + "pbuf->size=%llx", pbuf->addr, pbuf->size); + return -EINVAL; + } + + for (i = 0; i < num_phys_buf; i++) { + if ((i > 0) && (pbuf->addr % PAGE_SIZE)) { + ehca_gen_err("bad address, i=%x pbuf->addr=%llx " + "pbuf->size=%llx", + i, pbuf->addr, pbuf->size); + return -EINVAL; + } + if (((i > 0) && /* not 1st */ + (i < (num_phys_buf - 1)) && /* not last */ + (pbuf->size % PAGE_SIZE)) || (pbuf->size == 0)) { + ehca_gen_err("bad size, i=%x 
pbuf->size=%llx", + i, pbuf->size); + return -EINVAL; + } + size_count += pbuf->size; + pbuf++; + } + + *size = size_count; + return 0; +} /* end ehca_mr_chk_buf_and_calc_size() */ + +/*----------------------------------------------------------------------*/ + +/* check page list of map FMR verb for validness */ +int ehca_fmr_check_page_list(struct ehca_mr *e_fmr, + u64 *page_list, + int list_len) +{ + u32 i; + u64 *page; + + if ((list_len == 0) || (list_len > e_fmr->fmr_max_pages)) { + ehca_gen_err("bad list_len, list_len=%x " + "e_fmr->fmr_max_pages=%x fmr=%p", + list_len, e_fmr->fmr_max_pages, e_fmr); + return -EINVAL; + } + + /* each page must be aligned */ + page = page_list; + for (i = 0; i < list_len; i++) { + if (*page % e_fmr->fmr_page_size) { + ehca_gen_err("bad page, i=%x *page=%llx page=%p fmr=%p " + "fmr_page_size=%x", i, *page, page, e_fmr, + e_fmr->fmr_page_size); + return -EINVAL; + } + page++; + } + + return 0; +} /* end ehca_fmr_check_page_list() */ + +/*----------------------------------------------------------------------*/ + +/* PAGE_SIZE >= pginfo->hwpage_size */ +static int ehca_set_pagebuf_user1(struct ehca_mr_pginfo *pginfo, + u32 number, + u64 *kpage) +{ + int ret = 0; + u64 pgaddr; + u32 j = 0; + int hwpages_per_kpage = PAGE_SIZE / pginfo->hwpage_size; + struct scatterlist **sg = &pginfo->u.usr.next_sg; + + while (*sg != NULL) { + pgaddr = page_to_pfn(sg_page(*sg)) + << PAGE_SHIFT; + *kpage = pgaddr + (pginfo->next_hwpage * + pginfo->hwpage_size); + if (!(*kpage)) { + ehca_gen_err("pgaddr=%llx " + "sg_dma_address=%llx " + "entry=%llx next_hwpage=%llx", + pgaddr, (u64)sg_dma_address(*sg), + pginfo->u.usr.next_nmap, + pginfo->next_hwpage); + return -EFAULT; + } + (pginfo->hwpage_cnt)++; + (pginfo->next_hwpage)++; + kpage++; + if (pginfo->next_hwpage % hwpages_per_kpage == 0) { + (pginfo->kpage_cnt)++; + (pginfo->u.usr.next_nmap)++; + pginfo->next_hwpage = 0; + *sg = sg_next(*sg); + } + j++; + if (j >= number) + break; + } + + return ret; +} + +/* + * check given pages for contiguous layout + * last page addr is returned in prev_pgaddr for further check + */ +static int ehca_check_kpages_per_ate(struct scatterlist **sg, + int num_pages, + u64 *prev_pgaddr) +{ + for (; *sg && num_pages > 0; *sg = sg_next(*sg), num_pages--) { + u64 pgaddr = page_to_pfn(sg_page(*sg)) << PAGE_SHIFT; + if (ehca_debug_level >= 3) + ehca_gen_dbg("chunk_page=%llx value=%016llx", pgaddr, + *(u64 *)__va(pgaddr)); + if (pgaddr - PAGE_SIZE != *prev_pgaddr) { + ehca_gen_err("uncontiguous page found pgaddr=%llx " + "prev_pgaddr=%llx entries_left_in_hwpage=%x", + pgaddr, *prev_pgaddr, num_pages); + return -EINVAL; + } + *prev_pgaddr = pgaddr; + } + return 0; +} + +/* PAGE_SIZE < pginfo->hwpage_size */ +static int ehca_set_pagebuf_user2(struct ehca_mr_pginfo *pginfo, + u32 number, + u64 *kpage) +{ + int ret = 0; + u64 pgaddr, prev_pgaddr; + u32 j = 0; + int kpages_per_hwpage = pginfo->hwpage_size / PAGE_SIZE; + int nr_kpages = kpages_per_hwpage; + struct scatterlist **sg = &pginfo->u.usr.next_sg; + + while (*sg != NULL) { + + if (nr_kpages == kpages_per_hwpage) { + pgaddr = (page_to_pfn(sg_page(*sg)) + << PAGE_SHIFT); + *kpage = pgaddr; + if (!(*kpage)) { + ehca_gen_err("pgaddr=%llx entry=%llx", + pgaddr, pginfo->u.usr.next_nmap); + ret = -EFAULT; + return ret; + } + /* + * The first page in a hwpage must be aligned; + * the first MR page is exempt from this rule. 
+ */ + if (pgaddr & (pginfo->hwpage_size - 1)) { + if (pginfo->hwpage_cnt) { + ehca_gen_err( + "invalid alignment " + "pgaddr=%llx entry=%llx " + "mr_pgsize=%llx", + pgaddr, pginfo->u.usr.next_nmap, + pginfo->hwpage_size); + ret = -EFAULT; + return ret; + } + /* first MR page */ + pginfo->kpage_cnt = + (pgaddr & + (pginfo->hwpage_size - 1)) >> + PAGE_SHIFT; + nr_kpages -= pginfo->kpage_cnt; + *kpage = pgaddr & + ~(pginfo->hwpage_size - 1); + } + if (ehca_debug_level >= 3) { + u64 val = *(u64 *)__va(pgaddr); + ehca_gen_dbg("kpage=%llx page=%llx " + "value=%016llx", + *kpage, pgaddr, val); + } + prev_pgaddr = pgaddr; + *sg = sg_next(*sg); + pginfo->kpage_cnt++; + pginfo->u.usr.next_nmap++; + nr_kpages--; + if (!nr_kpages) + goto next_kpage; + continue; + } + + ret = ehca_check_kpages_per_ate(sg, nr_kpages, + &prev_pgaddr); + if (ret) + return ret; + pginfo->kpage_cnt += nr_kpages; + pginfo->u.usr.next_nmap += nr_kpages; + +next_kpage: + nr_kpages = kpages_per_hwpage; + (pginfo->hwpage_cnt)++; + kpage++; + j++; + if (j >= number) + break; + } + + return ret; +} + +static int ehca_set_pagebuf_phys(struct ehca_mr_pginfo *pginfo, + u32 number, u64 *kpage) +{ + int ret = 0; + struct ib_phys_buf *pbuf; + u64 num_hw, offs_hw; + u32 i = 0; + + /* loop over desired phys_buf_array entries */ + while (i < number) { + pbuf = pginfo->u.phy.phys_buf_array + pginfo->u.phy.next_buf; + num_hw = NUM_CHUNKS((pbuf->addr % pginfo->hwpage_size) + + pbuf->size, pginfo->hwpage_size); + offs_hw = (pbuf->addr & ~(pginfo->hwpage_size - 1)) / + pginfo->hwpage_size; + while (pginfo->next_hwpage < offs_hw + num_hw) { + /* sanity check */ + if ((pginfo->kpage_cnt >= pginfo->num_kpages) || + (pginfo->hwpage_cnt >= pginfo->num_hwpages)) { + ehca_gen_err("kpage_cnt >= num_kpages, " + "kpage_cnt=%llx num_kpages=%llx " + "hwpage_cnt=%llx " + "num_hwpages=%llx i=%x", + pginfo->kpage_cnt, + pginfo->num_kpages, + pginfo->hwpage_cnt, + pginfo->num_hwpages, i); + return -EFAULT; + } + *kpage = (pbuf->addr & ~(pginfo->hwpage_size - 1)) + + (pginfo->next_hwpage * pginfo->hwpage_size); + if ( !(*kpage) && pbuf->addr ) { + ehca_gen_err("pbuf->addr=%llx pbuf->size=%llx " + "next_hwpage=%llx", pbuf->addr, + pbuf->size, pginfo->next_hwpage); + return -EFAULT; + } + (pginfo->hwpage_cnt)++; + (pginfo->next_hwpage)++; + if (PAGE_SIZE >= pginfo->hwpage_size) { + if (pginfo->next_hwpage % + (PAGE_SIZE / pginfo->hwpage_size) == 0) + (pginfo->kpage_cnt)++; + } else + pginfo->kpage_cnt += pginfo->hwpage_size / + PAGE_SIZE; + kpage++; + i++; + if (i >= number) break; + } + if (pginfo->next_hwpage >= offs_hw + num_hw) { + (pginfo->u.phy.next_buf)++; + pginfo->next_hwpage = 0; + } + } + return ret; +} + +static int ehca_set_pagebuf_fmr(struct ehca_mr_pginfo *pginfo, + u32 number, u64 *kpage) +{ + int ret = 0; + u64 *fmrlist; + u32 i; + + /* loop over desired page_list entries */ + fmrlist = pginfo->u.fmr.page_list + pginfo->u.fmr.next_listelem; + for (i = 0; i < number; i++) { + *kpage = (*fmrlist & ~(pginfo->hwpage_size - 1)) + + pginfo->next_hwpage * pginfo->hwpage_size; + if ( !(*kpage) ) { + ehca_gen_err("*fmrlist=%llx fmrlist=%p " + "next_listelem=%llx next_hwpage=%llx", + *fmrlist, fmrlist, + pginfo->u.fmr.next_listelem, + pginfo->next_hwpage); + return -EFAULT; + } + (pginfo->hwpage_cnt)++; + if (pginfo->u.fmr.fmr_pgsize >= pginfo->hwpage_size) { + if (pginfo->next_hwpage % + (pginfo->u.fmr.fmr_pgsize / + pginfo->hwpage_size) == 0) { + (pginfo->kpage_cnt)++; + (pginfo->u.fmr.next_listelem)++; + fmrlist++; + pginfo->next_hwpage = 0; + } else + 
(pginfo->next_hwpage)++; + } else { + unsigned int cnt_per_hwpage = pginfo->hwpage_size / + pginfo->u.fmr.fmr_pgsize; + unsigned int j; + u64 prev = *kpage; + /* check if adrs are contiguous */ + for (j = 1; j < cnt_per_hwpage; j++) { + u64 p = fmrlist[j] & ~(pginfo->hwpage_size - 1); + if (prev + pginfo->u.fmr.fmr_pgsize != p) { + ehca_gen_err("uncontiguous fmr pages " + "found prev=%llx p=%llx " + "idx=%x", prev, p, i + j); + return -EINVAL; + } + prev = p; + } + pginfo->kpage_cnt += cnt_per_hwpage; + pginfo->u.fmr.next_listelem += cnt_per_hwpage; + fmrlist += cnt_per_hwpage; + } + kpage++; + } + return ret; +} + +/* setup page buffer from page info */ +int ehca_set_pagebuf(struct ehca_mr_pginfo *pginfo, + u32 number, + u64 *kpage) +{ + int ret; + + switch (pginfo->type) { + case EHCA_MR_PGI_PHYS: + ret = ehca_set_pagebuf_phys(pginfo, number, kpage); + break; + case EHCA_MR_PGI_USER: + ret = PAGE_SIZE >= pginfo->hwpage_size ? + ehca_set_pagebuf_user1(pginfo, number, kpage) : + ehca_set_pagebuf_user2(pginfo, number, kpage); + break; + case EHCA_MR_PGI_FMR: + ret = ehca_set_pagebuf_fmr(pginfo, number, kpage); + break; + default: + ehca_gen_err("bad pginfo->type=%x", pginfo->type); + ret = -EFAULT; + break; + } + return ret; +} /* end ehca_set_pagebuf() */ + +/*----------------------------------------------------------------------*/ + +/* + * check MR if it is a max-MR, i.e. uses whole memory + * in case it's a max-MR 1 is returned, else 0 + */ +int ehca_mr_is_maxmr(u64 size, + u64 *iova_start) +{ + /* a MR is treated as max-MR only if it fits following: */ + if ((size == ehca_mr_len) && + (iova_start == (void *)ehca_map_vaddr((void *)(KERNELBASE + PHYSICAL_START)))) { + ehca_gen_dbg("this is a max-MR"); + return 1; + } else + return 0; +} /* end ehca_mr_is_maxmr() */ + +/*----------------------------------------------------------------------*/ + +/* map access control for MR/MW. This routine is used for MR and MW. */ +void ehca_mrmw_map_acl(int ib_acl, + u32 *hipz_acl) +{ + *hipz_acl = 0; + if (ib_acl & IB_ACCESS_REMOTE_READ) + *hipz_acl |= HIPZ_ACCESSCTRL_R_READ; + if (ib_acl & IB_ACCESS_REMOTE_WRITE) + *hipz_acl |= HIPZ_ACCESSCTRL_R_WRITE; + if (ib_acl & IB_ACCESS_REMOTE_ATOMIC) + *hipz_acl |= HIPZ_ACCESSCTRL_R_ATOMIC; + if (ib_acl & IB_ACCESS_LOCAL_WRITE) + *hipz_acl |= HIPZ_ACCESSCTRL_L_WRITE; + if (ib_acl & IB_ACCESS_MW_BIND) + *hipz_acl |= HIPZ_ACCESSCTRL_MW_BIND; +} /* end ehca_mrmw_map_acl() */ + +/*----------------------------------------------------------------------*/ + +/* sets page size in hipz access control for MR/MW. */ +void ehca_mrmw_set_pgsize_hipz_acl(u32 pgsize, u32 *hipz_acl) /*INOUT*/ +{ + *hipz_acl |= (ehca_encode_hwpage_size(pgsize) << 24); +} /* end ehca_mrmw_set_pgsize_hipz_acl() */ + +/*----------------------------------------------------------------------*/ + +/* + * reverse map access control for MR/MW. + * This routine is used for MR and MW. 
+ */ +void ehca_mrmw_reverse_map_acl(const u32 *hipz_acl, + int *ib_acl) /*OUT*/ +{ + *ib_acl = 0; + if (*hipz_acl & HIPZ_ACCESSCTRL_R_READ) + *ib_acl |= IB_ACCESS_REMOTE_READ; + if (*hipz_acl & HIPZ_ACCESSCTRL_R_WRITE) + *ib_acl |= IB_ACCESS_REMOTE_WRITE; + if (*hipz_acl & HIPZ_ACCESSCTRL_R_ATOMIC) + *ib_acl |= IB_ACCESS_REMOTE_ATOMIC; + if (*hipz_acl & HIPZ_ACCESSCTRL_L_WRITE) + *ib_acl |= IB_ACCESS_LOCAL_WRITE; + if (*hipz_acl & HIPZ_ACCESSCTRL_MW_BIND) + *ib_acl |= IB_ACCESS_MW_BIND; +} /* end ehca_mrmw_reverse_map_acl() */ + + +/*----------------------------------------------------------------------*/ + +/* + * MR destructor and constructor + * used in Reregister MR verb, sets all fields in ehca_mr_t to 0, + * except struct ib_mr and spinlock + */ +void ehca_mr_deletenew(struct ehca_mr *mr) +{ + mr->flags = 0; + mr->num_kpages = 0; + mr->num_hwpages = 0; + mr->acl = 0; + mr->start = NULL; + mr->fmr_page_size = 0; + mr->fmr_max_pages = 0; + mr->fmr_max_maps = 0; + mr->fmr_map_cnt = 0; + memset(&mr->ipz_mr_handle, 0, sizeof(mr->ipz_mr_handle)); + memset(&mr->galpas, 0, sizeof(mr->galpas)); +} /* end ehca_mr_deletenew() */ + +int ehca_init_mrmw_cache(void) +{ + mr_cache = kmem_cache_create("ehca_cache_mr", + sizeof(struct ehca_mr), 0, + SLAB_HWCACHE_ALIGN, + NULL); + if (!mr_cache) + return -ENOMEM; + mw_cache = kmem_cache_create("ehca_cache_mw", + sizeof(struct ehca_mw), 0, + SLAB_HWCACHE_ALIGN, + NULL); + if (!mw_cache) { + kmem_cache_destroy(mr_cache); + mr_cache = NULL; + return -ENOMEM; + } + return 0; +} + +void ehca_cleanup_mrmw_cache(void) +{ + if (mr_cache) + kmem_cache_destroy(mr_cache); + if (mw_cache) + kmem_cache_destroy(mw_cache); +} + +static inline int ehca_init_top_bmap(struct ehca_top_bmap *ehca_top_bmap, + int dir) +{ + if (!ehca_bmap_valid(ehca_top_bmap->dir[dir])) { + ehca_top_bmap->dir[dir] = + kmalloc(sizeof(struct ehca_dir_bmap), GFP_KERNEL); + if (!ehca_top_bmap->dir[dir]) + return -ENOMEM; + /* Set map block to 0xFF according to EHCA_INVAL_ADDR */ + memset(ehca_top_bmap->dir[dir], 0xFF, EHCA_ENT_MAP_SIZE); + } + return 0; +} + +static inline int ehca_init_bmap(struct ehca_bmap *ehca_bmap, int top, int dir) +{ + if (!ehca_bmap_valid(ehca_bmap->top[top])) { + ehca_bmap->top[top] = + kmalloc(sizeof(struct ehca_top_bmap), GFP_KERNEL); + if (!ehca_bmap->top[top]) + return -ENOMEM; + /* Set map block to 0xFF according to EHCA_INVAL_ADDR */ + memset(ehca_bmap->top[top], 0xFF, EHCA_DIR_MAP_SIZE); + } + return ehca_init_top_bmap(ehca_bmap->top[top], dir); +} + +static inline int ehca_calc_index(unsigned long i, unsigned long s) +{ + return (i >> s) & EHCA_INDEX_MASK; +} + +void ehca_destroy_busmap(void) +{ + int top, dir; + + if (!ehca_bmap) + return; + + for (top = 0; top < EHCA_MAP_ENTRIES; top++) { + if (!ehca_bmap_valid(ehca_bmap->top[top])) + continue; + for (dir = 0; dir < EHCA_MAP_ENTRIES; dir++) { + if (!ehca_bmap_valid(ehca_bmap->top[top]->dir[dir])) + continue; + + kfree(ehca_bmap->top[top]->dir[dir]); + } + + kfree(ehca_bmap->top[top]); + } + + kfree(ehca_bmap); + ehca_bmap = NULL; +} + +static int ehca_update_busmap(unsigned long pfn, unsigned long nr_pages) +{ + unsigned long i, start_section, end_section; + int top, dir, idx; + + if (!nr_pages) + return 0; + + if (!ehca_bmap) { + ehca_bmap = kmalloc(sizeof(struct ehca_bmap), GFP_KERNEL); + if (!ehca_bmap) + return -ENOMEM; + /* Set map block to 0xFF according to EHCA_INVAL_ADDR */ + memset(ehca_bmap, 0xFF, EHCA_TOP_MAP_SIZE); + } + + start_section = (pfn * PAGE_SIZE) / EHCA_SECTSIZE; + end_section = ((pfn + 
nr_pages) * PAGE_SIZE) / EHCA_SECTSIZE; + for (i = start_section; i < end_section; i++) { + int ret; + top = ehca_calc_index(i, EHCA_TOP_INDEX_SHIFT); + dir = ehca_calc_index(i, EHCA_DIR_INDEX_SHIFT); + idx = i & EHCA_INDEX_MASK; + + ret = ehca_init_bmap(ehca_bmap, top, dir); + if (ret) { + ehca_destroy_busmap(); + return ret; + } + ehca_bmap->top[top]->dir[dir]->ent[idx] = ehca_mr_len; + ehca_mr_len += EHCA_SECTSIZE; + } + return 0; +} + +static int ehca_is_hugepage(unsigned long pfn) +{ + int page_order; + + if (pfn & EHCA_HUGEPAGE_PFN_MASK) + return 0; + + page_order = compound_order(pfn_to_page(pfn)); + if (page_order + PAGE_SHIFT != EHCA_HUGEPAGESHIFT) + return 0; + + return 1; +} + +static int ehca_create_busmap_callback(unsigned long initial_pfn, + unsigned long total_nr_pages, void *arg) +{ + int ret; + unsigned long pfn, start_pfn, end_pfn, nr_pages; + + if ((total_nr_pages * PAGE_SIZE) < EHCA_HUGEPAGE_SIZE) + return ehca_update_busmap(initial_pfn, total_nr_pages); + + /* Given chunk is >= 16GB -> check for hugepages */ + start_pfn = initial_pfn; + end_pfn = initial_pfn + total_nr_pages; + pfn = start_pfn; + + while (pfn < end_pfn) { + if (ehca_is_hugepage(pfn)) { + /* Add mem found in front of the hugepage */ + nr_pages = pfn - start_pfn; + ret = ehca_update_busmap(start_pfn, nr_pages); + if (ret) + return ret; + /* Skip the hugepage */ + pfn += (EHCA_HUGEPAGE_SIZE / PAGE_SIZE); + start_pfn = pfn; + } else + pfn += (EHCA_SECTSIZE / PAGE_SIZE); + } + + /* Add mem found behind the hugepage(s) */ + nr_pages = pfn - start_pfn; + return ehca_update_busmap(start_pfn, nr_pages); +} + +int ehca_create_busmap(void) +{ + int ret; + + ehca_mr_len = 0; + ret = walk_system_ram_range(0, 1ULL << MAX_PHYSMEM_BITS, NULL, + ehca_create_busmap_callback); + return ret; +} + +static int ehca_reg_bmap_mr_rpages(struct ehca_shca *shca, + struct ehca_mr *e_mr, + struct ehca_mr_pginfo *pginfo) +{ + int top; + u64 hret, *kpage; + + kpage = ehca_alloc_fw_ctrlblock(GFP_KERNEL); + if (!kpage) { + ehca_err(&shca->ib_device, "kpage alloc failed"); + return -ENOMEM; + } + for (top = 0; top < EHCA_MAP_ENTRIES; top++) { + if (!ehca_bmap_valid(ehca_bmap->top[top])) + continue; + hret = ehca_reg_mr_dir_sections(top, kpage, shca, e_mr, pginfo); + if ((hret != H_PAGE_REGISTERED) && (hret != H_SUCCESS)) + break; + } + + ehca_free_fw_ctrlblock(kpage); + + if (hret == H_SUCCESS) + return 0; /* Everything is fine */ + else { + ehca_err(&shca->ib_device, "ehca_reg_bmap_mr_rpages failed, " + "h_ret=%lli e_mr=%p top=%x lkey=%x " + "hca_hndl=%llx mr_hndl=%llx", hret, e_mr, top, + e_mr->ib.ib_mr.lkey, + shca->ipz_hca_handle.handle, + e_mr->ipz_mr_handle.handle); + return ehca2ib_return_code(hret); + } +} + +static u64 ehca_map_vaddr(void *caddr) +{ + int top, dir, idx; + unsigned long abs_addr, offset; + u64 entry; + + if (!ehca_bmap) + return EHCA_INVAL_ADDR; + + abs_addr = __pa(caddr); + top = ehca_calc_index(abs_addr, EHCA_TOP_INDEX_SHIFT + EHCA_SECTSHIFT); + if (!ehca_bmap_valid(ehca_bmap->top[top])) + return EHCA_INVAL_ADDR; + + dir = ehca_calc_index(abs_addr, EHCA_DIR_INDEX_SHIFT + EHCA_SECTSHIFT); + if (!ehca_bmap_valid(ehca_bmap->top[top]->dir[dir])) + return EHCA_INVAL_ADDR; + + idx = ehca_calc_index(abs_addr, EHCA_SECTSHIFT); + + entry = ehca_bmap->top[top]->dir[dir]->ent[idx]; + if (ehca_bmap_valid(entry)) { + offset = (unsigned long)caddr & (EHCA_SECTSIZE - 1); + return entry | offset; + } else + return EHCA_INVAL_ADDR; +} + +static int ehca_dma_mapping_error(struct ib_device *dev, u64 dma_addr) +{ + return 
dma_addr == EHCA_INVAL_ADDR; +} + +static u64 ehca_dma_map_single(struct ib_device *dev, void *cpu_addr, + size_t size, enum dma_data_direction direction) +{ + if (cpu_addr) + return ehca_map_vaddr(cpu_addr); + else + return EHCA_INVAL_ADDR; +} + +static void ehca_dma_unmap_single(struct ib_device *dev, u64 addr, size_t size, + enum dma_data_direction direction) +{ + /* This is only a stub; nothing to be done here */ +} + +static u64 ehca_dma_map_page(struct ib_device *dev, struct page *page, + unsigned long offset, size_t size, + enum dma_data_direction direction) +{ + u64 addr; + + if (offset + size > PAGE_SIZE) + return EHCA_INVAL_ADDR; + + addr = ehca_map_vaddr(page_address(page)); + if (!ehca_dma_mapping_error(dev, addr)) + addr += offset; + + return addr; +} + +static void ehca_dma_unmap_page(struct ib_device *dev, u64 addr, size_t size, + enum dma_data_direction direction) +{ + /* This is only a stub; nothing to be done here */ +} + +static int ehca_dma_map_sg(struct ib_device *dev, struct scatterlist *sgl, + int nents, enum dma_data_direction direction) +{ + struct scatterlist *sg; + int i; + + for_each_sg(sgl, sg, nents, i) { + u64 addr; + addr = ehca_map_vaddr(sg_virt(sg)); + if (ehca_dma_mapping_error(dev, addr)) + return 0; + + sg->dma_address = addr; + sg->dma_length = sg->length; + } + return nents; +} + +static void ehca_dma_unmap_sg(struct ib_device *dev, struct scatterlist *sg, + int nents, enum dma_data_direction direction) +{ + /* This is only a stub; nothing to be done here */ +} + +static void ehca_dma_sync_single_for_cpu(struct ib_device *dev, u64 addr, + size_t size, + enum dma_data_direction dir) +{ + dma_sync_single_for_cpu(dev->dma_device, addr, size, dir); +} + +static void ehca_dma_sync_single_for_device(struct ib_device *dev, u64 addr, + size_t size, + enum dma_data_direction dir) +{ + dma_sync_single_for_device(dev->dma_device, addr, size, dir); +} + +static void *ehca_dma_alloc_coherent(struct ib_device *dev, size_t size, + u64 *dma_handle, gfp_t flag) +{ + struct page *p; + void *addr = NULL; + u64 dma_addr; + + p = alloc_pages(flag, get_order(size)); + if (p) { + addr = page_address(p); + dma_addr = ehca_map_vaddr(addr); + if (ehca_dma_mapping_error(dev, dma_addr)) { + free_pages((unsigned long)addr, get_order(size)); + return NULL; + } + if (dma_handle) + *dma_handle = dma_addr; + return addr; + } + return NULL; +} + +static void ehca_dma_free_coherent(struct ib_device *dev, size_t size, + void *cpu_addr, u64 dma_handle) +{ + if (cpu_addr && size) + free_pages((unsigned long)cpu_addr, get_order(size)); +} + + +struct ib_dma_mapping_ops ehca_dma_mapping_ops = { + .mapping_error = ehca_dma_mapping_error, + .map_single = ehca_dma_map_single, + .unmap_single = ehca_dma_unmap_single, + .map_page = ehca_dma_map_page, + .unmap_page = ehca_dma_unmap_page, + .map_sg = ehca_dma_map_sg, + .unmap_sg = ehca_dma_unmap_sg, + .sync_single_for_cpu = ehca_dma_sync_single_for_cpu, + .sync_single_for_device = ehca_dma_sync_single_for_device, + .alloc_coherent = ehca_dma_alloc_coherent, + .free_coherent = ehca_dma_free_coherent, +}; diff --git a/kernel/drivers/infiniband/hw/ehca/ehca_mrmw.h b/kernel/drivers/infiniband/hw/ehca/ehca_mrmw.h new file mode 100644 index 000000000..50d8b5130 --- /dev/null +++ b/kernel/drivers/infiniband/hw/ehca/ehca_mrmw.h @@ -0,0 +1,132 @@ +/* + * IBM eServer eHCA Infiniband device driver for Linux on POWER + * + * MR/MW declarations and inline functions + * + * Authors: Dietmar Decker + * Christoph Raisch + * + * Copyright (c) 2005 IBM Corporation 
+ * + * All rights reserved. + * + * This source code is distributed under a dual license of GPL v2.0 and OpenIB + * BSD. + * + * OpenIB BSD License + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials + * provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR + * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER + * IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef _EHCA_MRMW_H_ +#define _EHCA_MRMW_H_ + +enum ehca_reg_type { + EHCA_REG_MR, + EHCA_REG_BUSMAP_MR +}; + +int ehca_reg_mr(struct ehca_shca *shca, + struct ehca_mr *e_mr, + u64 *iova_start, + u64 size, + int acl, + struct ehca_pd *e_pd, + struct ehca_mr_pginfo *pginfo, + u32 *lkey, + u32 *rkey, + enum ehca_reg_type reg_type); + +int ehca_reg_mr_rpages(struct ehca_shca *shca, + struct ehca_mr *e_mr, + struct ehca_mr_pginfo *pginfo); + +int ehca_rereg_mr(struct ehca_shca *shca, + struct ehca_mr *e_mr, + u64 *iova_start, + u64 size, + int mr_access_flags, + struct ehca_pd *e_pd, + struct ehca_mr_pginfo *pginfo, + u32 *lkey, + u32 *rkey); + +int ehca_unmap_one_fmr(struct ehca_shca *shca, + struct ehca_mr *e_fmr); + +int ehca_reg_smr(struct ehca_shca *shca, + struct ehca_mr *e_origmr, + struct ehca_mr *e_newmr, + u64 *iova_start, + int acl, + struct ehca_pd *e_pd, + u32 *lkey, + u32 *rkey); + +int ehca_reg_internal_maxmr(struct ehca_shca *shca, + struct ehca_pd *e_pd, + struct ehca_mr **maxmr); + +int ehca_reg_maxmr(struct ehca_shca *shca, + struct ehca_mr *e_newmr, + u64 *iova_start, + int acl, + struct ehca_pd *e_pd, + u32 *lkey, + u32 *rkey); + +int ehca_dereg_internal_maxmr(struct ehca_shca *shca); + +int ehca_mr_chk_buf_and_calc_size(struct ib_phys_buf *phys_buf_array, + int num_phys_buf, + u64 *iova_start, + u64 *size); + +int ehca_fmr_check_page_list(struct ehca_mr *e_fmr, + u64 *page_list, + int list_len); + +int ehca_set_pagebuf(struct ehca_mr_pginfo *pginfo, + u32 number, + u64 *kpage); + +int ehca_mr_is_maxmr(u64 size, + u64 *iova_start); + +void ehca_mrmw_map_acl(int ib_acl, + u32 *hipz_acl); + +void ehca_mrmw_set_pgsize_hipz_acl(u32 pgsize, u32 *hipz_acl); + +void ehca_mrmw_reverse_map_acl(const u32 *hipz_acl, + int *ib_acl); + +void ehca_mr_deletenew(struct ehca_mr *mr); + +int ehca_create_busmap(void); + +void ehca_destroy_busmap(void); + +extern struct ib_dma_mapping_ops ehca_dma_mapping_ops; +#endif /*_EHCA_MRMW_H_*/ diff --git a/kernel/drivers/infiniband/hw/ehca/ehca_pd.c b/kernel/drivers/infiniband/hw/ehca/ehca_pd.c new file 
mode 100644 index 000000000..351577a66 --- /dev/null +++ b/kernel/drivers/infiniband/hw/ehca/ehca_pd.c @@ -0,0 +1,124 @@ +/* + * IBM eServer eHCA Infiniband device driver for Linux on POWER + * + * PD functions + * + * Authors: Christoph Raisch + * + * Copyright (c) 2005 IBM Corporation + * + * All rights reserved. + * + * This source code is distributed under a dual license of GPL v2.0 and OpenIB + * BSD. + * + * OpenIB BSD License + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials + * provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR + * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER + * IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#include + +#include "ehca_tools.h" +#include "ehca_iverbs.h" + +static struct kmem_cache *pd_cache; + +struct ib_pd *ehca_alloc_pd(struct ib_device *device, + struct ib_ucontext *context, struct ib_udata *udata) +{ + struct ehca_pd *pd; + int i; + + pd = kmem_cache_zalloc(pd_cache, GFP_KERNEL); + if (!pd) { + ehca_err(device, "device=%p context=%p out of memory", + device, context); + return ERR_PTR(-ENOMEM); + } + + for (i = 0; i < 2; i++) { + INIT_LIST_HEAD(&pd->free[i]); + INIT_LIST_HEAD(&pd->full[i]); + } + mutex_init(&pd->lock); + + /* + * Kernel PD: when device = -1, 0 + * User PD: when context != -1 + */ + if (!context) { + /* + * Kernel PDs after init reuses always + * the one created in ehca_shca_reopen() + */ + struct ehca_shca *shca = container_of(device, struct ehca_shca, + ib_device); + pd->fw_pd.value = shca->pd->fw_pd.value; + } else + pd->fw_pd.value = (u64)pd; + + return &pd->ib_pd; +} + +int ehca_dealloc_pd(struct ib_pd *pd) +{ + struct ehca_pd *my_pd = container_of(pd, struct ehca_pd, ib_pd); + int i, leftovers = 0; + struct ipz_small_queue_page *page, *tmp; + + for (i = 0; i < 2; i++) { + list_splice(&my_pd->full[i], &my_pd->free[i]); + list_for_each_entry_safe(page, tmp, &my_pd->free[i], list) { + leftovers = 1; + free_page(page->page); + kmem_cache_free(small_qp_cache, page); + } + } + + if (leftovers) + ehca_warn(pd->device, + "Some small queue pages were not freed"); + + kmem_cache_free(pd_cache, my_pd); + + return 0; +} + +int ehca_init_pd_cache(void) +{ + pd_cache = kmem_cache_create("ehca_cache_pd", + sizeof(struct ehca_pd), 0, + SLAB_HWCACHE_ALIGN, + NULL); + if (!pd_cache) + return -ENOMEM; + return 0; +} + +void ehca_cleanup_pd_cache(void) +{ + if (pd_cache) + kmem_cache_destroy(pd_cache); +} diff --git 
a/kernel/drivers/infiniband/hw/ehca/ehca_qes.h b/kernel/drivers/infiniband/hw/ehca/ehca_qes.h new file mode 100644 index 000000000..90c4efa67 --- /dev/null +++ b/kernel/drivers/infiniband/hw/ehca/ehca_qes.h @@ -0,0 +1,260 @@ +/* + * IBM eServer eHCA Infiniband device driver for Linux on POWER + * + * Hardware request structures + * + * Authors: Waleri Fomin + * Reinhard Ernst + * Christoph Raisch + * + * Copyright (c) 2005 IBM Corporation + * + * All rights reserved. + * + * This source code is distributed under a dual license of GPL v2.0 and OpenIB + * BSD. + * + * OpenIB BSD License + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials + * provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR + * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER + * IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ */ + + +#ifndef _EHCA_QES_H_ +#define _EHCA_QES_H_ + +#include "ehca_tools.h" + +/* virtual scatter gather entry to specify remote addresses with length */ +struct ehca_vsgentry { + u64 vaddr; + u32 lkey; + u32 length; +}; + +#define GRH_FLAG_MASK EHCA_BMASK_IBM( 7, 7) +#define GRH_IPVERSION_MASK EHCA_BMASK_IBM( 0, 3) +#define GRH_TCLASS_MASK EHCA_BMASK_IBM( 4, 12) +#define GRH_FLOWLABEL_MASK EHCA_BMASK_IBM(13, 31) +#define GRH_PAYLEN_MASK EHCA_BMASK_IBM(32, 47) +#define GRH_NEXTHEADER_MASK EHCA_BMASK_IBM(48, 55) +#define GRH_HOPLIMIT_MASK EHCA_BMASK_IBM(56, 63) + +/* + * Unreliable Datagram Address Vector Format + * see IBTA Vol1 chapter 8.3 Global Routing Header + */ +struct ehca_ud_av { + u8 sl; + u8 lnh; + u16 dlid; + u8 reserved1; + u8 reserved2; + u8 reserved3; + u8 slid_path_bits; + u8 reserved4; + u8 ipd; + u8 reserved5; + u8 pmtu; + u32 reserved6; + u64 reserved7; + union { + struct { + u64 word_0; /* always set to 6 */ + /*should be 0x1B for IB transport */ + u64 word_1; + u64 word_2; + u64 word_3; + u64 word_4; + } grh; + struct { + u32 wd_0; + u32 wd_1; + /* DWord_1 --> SGID */ + + u32 sgid_wd3; + u32 sgid_wd2; + + u32 sgid_wd1; + u32 sgid_wd0; + /* DWord_3 --> DGID */ + + u32 dgid_wd3; + u32 dgid_wd2; + + u32 dgid_wd1; + u32 dgid_wd0; + } grh_l; + }; +}; + +/* maximum number of sg entries allowed in a WQE */ +#define MAX_WQE_SG_ENTRIES 252 + +#define WQE_OPTYPE_SEND 0x80 +#define WQE_OPTYPE_RDMAREAD 0x40 +#define WQE_OPTYPE_RDMAWRITE 0x20 +#define WQE_OPTYPE_CMPSWAP 0x10 +#define WQE_OPTYPE_FETCHADD 0x08 +#define WQE_OPTYPE_BIND 0x04 + +#define WQE_WRFLAG_REQ_SIGNAL_COM 0x80 +#define WQE_WRFLAG_FENCE 0x40 +#define WQE_WRFLAG_IMM_DATA_PRESENT 0x20 +#define WQE_WRFLAG_SOLIC_EVENT 0x10 + +#define WQEF_CACHE_HINT 0x80 +#define WQEF_CACHE_HINT_RD_WR 0x40 +#define WQEF_TIMED_WQE 0x20 +#define WQEF_PURGE 0x08 +#define WQEF_HIGH_NIBBLE 0xF0 + +#define MW_BIND_ACCESSCTRL_R_WRITE 0x40 +#define MW_BIND_ACCESSCTRL_R_READ 0x20 +#define MW_BIND_ACCESSCTRL_R_ATOMIC 0x10 + +struct ehca_wqe { + u64 work_request_id; + u8 optype; + u8 wr_flag; + u16 pkeyi; + u8 wqef; + u8 nr_of_data_seg; + u16 wqe_provided_slid; + u32 destination_qp_number; + u32 resync_psn_sqp; + u32 local_ee_context_qkey; + u32 immediate_data; + union { + struct { + u64 remote_virtual_address; + u32 rkey; + u32 reserved; + u64 atomic_1st_op_dma_len; + u64 atomic_2nd_op; + struct ehca_vsgentry sg_list[MAX_WQE_SG_ENTRIES]; + + } nud; + struct { + u64 ehca_ud_av_ptr; + u64 reserved1; + u64 reserved2; + u64 reserved3; + struct ehca_vsgentry sg_list[MAX_WQE_SG_ENTRIES]; + } ud_avp; + struct { + struct ehca_ud_av ud_av; + struct ehca_vsgentry sg_list[MAX_WQE_SG_ENTRIES - + 2]; + } ud_av; + struct { + u64 reserved0; + u64 reserved1; + u64 reserved2; + u64 reserved3; + struct ehca_vsgentry sg_list[MAX_WQE_SG_ENTRIES]; + } all_rcv; + + struct { + u64 reserved; + u32 rkey; + u32 old_rkey; + u64 reserved1; + u64 reserved2; + u64 virtual_address; + u32 reserved3; + u32 length; + u32 reserved4; + u16 reserved5; + u8 reserved6; + u8 lr_ctl; + u32 lkey; + u32 reserved7; + u64 reserved8; + u64 reserved9; + u64 reserved10; + u64 reserved11; + } bind; + struct { + u64 reserved12; + u64 reserved13; + u32 size; + u32 start; + } inline_data; + } u; + +}; + +#define WC_SEND_RECEIVE EHCA_BMASK_IBM(0, 0) +#define WC_IMM_DATA EHCA_BMASK_IBM(1, 1) +#define WC_GRH_PRESENT EHCA_BMASK_IBM(2, 2) +#define WC_SE_BIT EHCA_BMASK_IBM(3, 3) +#define WC_STATUS_ERROR_BIT 0x80000000 +#define WC_STATUS_REMOTE_ERROR_FLAGS 0x0000F800 +#define WC_STATUS_PURGE_BIT 
0x10 +#define WC_SEND_RECEIVE_BIT 0x80 + +struct ehca_cqe { + u64 work_request_id; + u8 optype; + u8 w_completion_flags; + u16 reserved1; + u32 nr_bytes_transferred; + u32 immediate_data; + u32 local_qp_number; + u8 freed_resource_count; + u8 service_level; + u16 wqe_count; + u32 qp_token; + u32 qkey_ee_token; + u32 remote_qp_number; + u16 dlid; + u16 rlid; + u16 reserved2; + u16 pkey_index; + u32 cqe_timestamp; + u32 wqe_timestamp; + u8 wqe_timestamp_valid; + u8 reserved3; + u8 reserved4; + u8 cqe_flags; + u32 status; +}; + +struct ehca_eqe { + u64 entry; +}; + +struct ehca_mrte { + u64 starting_va; + u64 length; /* length of memory region in bytes*/ + u32 pd; + u8 key_instance; + u8 pagesize; + u8 mr_control; + u8 local_remote_access_ctrl; + u8 reserved[0x20 - 0x18]; + u64 at_pointer[4]; +}; +#endif /*_EHCA_QES_H_*/ diff --git a/kernel/drivers/infiniband/hw/ehca/ehca_qp.c b/kernel/drivers/infiniband/hw/ehca/ehca_qp.c new file mode 100644 index 000000000..2e89356c4 --- /dev/null +++ b/kernel/drivers/infiniband/hw/ehca/ehca_qp.c @@ -0,0 +1,2257 @@ +/* + * IBM eServer eHCA Infiniband device driver for Linux on POWER + * + * QP functions + * + * Authors: Joachim Fenkes + * Stefan Roscher + * Waleri Fomin + * Hoang-Nam Nguyen + * Reinhard Ernst + * Heiko J Schick + * + * Copyright (c) 2005 IBM Corporation + * + * All rights reserved. + * + * This source code is distributed under a dual license of GPL v2.0 and OpenIB + * BSD. + * + * OpenIB BSD License + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials + * provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR + * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER + * IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include + +#include "ehca_classes.h" +#include "ehca_tools.h" +#include "ehca_qes.h" +#include "ehca_iverbs.h" +#include "hcp_if.h" +#include "hipz_fns.h" + +static struct kmem_cache *qp_cache; + +/* + * attributes not supported by query qp + */ +#define QP_ATTR_QUERY_NOT_SUPPORTED (IB_QP_ACCESS_FLAGS | \ + IB_QP_EN_SQD_ASYNC_NOTIFY) + +/* + * ehca (internal) qp state values + */ +enum ehca_qp_state { + EHCA_QPS_RESET = 1, + EHCA_QPS_INIT = 2, + EHCA_QPS_RTR = 3, + EHCA_QPS_RTS = 5, + EHCA_QPS_SQD = 6, + EHCA_QPS_SQE = 8, + EHCA_QPS_ERR = 128 +}; + +/* + * qp state transitions as defined by IB Arch Rel 1.1 page 431 + */ +enum ib_qp_statetrans { + IB_QPST_ANY2RESET, + IB_QPST_ANY2ERR, + IB_QPST_RESET2INIT, + IB_QPST_INIT2RTR, + IB_QPST_INIT2INIT, + IB_QPST_RTR2RTS, + IB_QPST_RTS2SQD, + IB_QPST_RTS2RTS, + IB_QPST_SQD2RTS, + IB_QPST_SQE2RTS, + IB_QPST_SQD2SQD, + IB_QPST_MAX /* nr of transitions, this must be last!!! */ +}; + +/* + * ib2ehca_qp_state maps IB to ehca qp_state + * returns ehca qp state corresponding to given ib qp state + */ +static inline enum ehca_qp_state ib2ehca_qp_state(enum ib_qp_state ib_qp_state) +{ + switch (ib_qp_state) { + case IB_QPS_RESET: + return EHCA_QPS_RESET; + case IB_QPS_INIT: + return EHCA_QPS_INIT; + case IB_QPS_RTR: + return EHCA_QPS_RTR; + case IB_QPS_RTS: + return EHCA_QPS_RTS; + case IB_QPS_SQD: + return EHCA_QPS_SQD; + case IB_QPS_SQE: + return EHCA_QPS_SQE; + case IB_QPS_ERR: + return EHCA_QPS_ERR; + default: + ehca_gen_err("invalid ib_qp_state=%x", ib_qp_state); + return -EINVAL; + } +} + +/* + * ehca2ib_qp_state maps ehca to IB qp_state + * returns ib qp state corresponding to given ehca qp state + */ +static inline enum ib_qp_state ehca2ib_qp_state(enum ehca_qp_state + ehca_qp_state) +{ + switch (ehca_qp_state) { + case EHCA_QPS_RESET: + return IB_QPS_RESET; + case EHCA_QPS_INIT: + return IB_QPS_INIT; + case EHCA_QPS_RTR: + return IB_QPS_RTR; + case EHCA_QPS_RTS: + return IB_QPS_RTS; + case EHCA_QPS_SQD: + return IB_QPS_SQD; + case EHCA_QPS_SQE: + return IB_QPS_SQE; + case EHCA_QPS_ERR: + return IB_QPS_ERR; + default: + ehca_gen_err("invalid ehca_qp_state=%x", ehca_qp_state); + return -EINVAL; + } +} + +/* + * ehca_qp_type used as index for req_attr and opt_attr of + * struct ehca_modqp_statetrans + */ +enum ehca_qp_type { + QPT_RC = 0, + QPT_UC = 1, + QPT_UD = 2, + QPT_SQP = 3, + QPT_MAX +}; + +/* + * ib2ehcaqptype maps Ib to ehca qp_type + * returns ehca qp type corresponding to ib qp type + */ +static inline enum ehca_qp_type ib2ehcaqptype(enum ib_qp_type ibqptype) +{ + switch (ibqptype) { + case IB_QPT_SMI: + case IB_QPT_GSI: + return QPT_SQP; + case IB_QPT_RC: + return QPT_RC; + case IB_QPT_UC: + return QPT_UC; + case IB_QPT_UD: + return QPT_UD; + default: + ehca_gen_err("Invalid ibqptype=%x", ibqptype); + return -EINVAL; + } +} + +static inline enum ib_qp_statetrans get_modqp_statetrans(int ib_fromstate, + int ib_tostate) +{ + int index = -EINVAL; + switch (ib_tostate) { + case IB_QPS_RESET: + index = IB_QPST_ANY2RESET; + break; + case IB_QPS_INIT: + switch (ib_fromstate) { + case IB_QPS_RESET: + index = IB_QPST_RESET2INIT; + break; + case IB_QPS_INIT: + index = IB_QPST_INIT2INIT; + break; + } + break; + case IB_QPS_RTR: + if (ib_fromstate == IB_QPS_INIT) + index = IB_QPST_INIT2RTR; + break; + case IB_QPS_RTS: + switch (ib_fromstate) { + case IB_QPS_RTR: + index = IB_QPST_RTR2RTS; + break; + case IB_QPS_RTS: + index = IB_QPST_RTS2RTS; + break; + case IB_QPS_SQD: + index = IB_QPST_SQD2RTS; + break; + case IB_QPS_SQE: + index = 
IB_QPST_SQE2RTS; + break; + } + break; + case IB_QPS_SQD: + if (ib_fromstate == IB_QPS_RTS) + index = IB_QPST_RTS2SQD; + break; + case IB_QPS_SQE: + break; + case IB_QPS_ERR: + index = IB_QPST_ANY2ERR; + break; + default: + break; + } + return index; +} + +/* + * ibqptype2servicetype returns hcp service type corresponding to given + * ib qp type used by create_qp() + */ +static inline int ibqptype2servicetype(enum ib_qp_type ibqptype) +{ + switch (ibqptype) { + case IB_QPT_SMI: + case IB_QPT_GSI: + return ST_UD; + case IB_QPT_RC: + return ST_RC; + case IB_QPT_UC: + return ST_UC; + case IB_QPT_UD: + return ST_UD; + case IB_QPT_RAW_IPV6: + return -EINVAL; + case IB_QPT_RAW_ETHERTYPE: + return -EINVAL; + default: + ehca_gen_err("Invalid ibqptype=%x", ibqptype); + return -EINVAL; + } +} + +/* + * init userspace queue info from ipz_queue data + */ +static inline void queue2resp(struct ipzu_queue_resp *resp, + struct ipz_queue *queue) +{ + resp->qe_size = queue->qe_size; + resp->act_nr_of_sg = queue->act_nr_of_sg; + resp->queue_length = queue->queue_length; + resp->pagesize = queue->pagesize; + resp->toggle_state = queue->toggle_state; + resp->offset = queue->offset; +} + +/* + * init_qp_queue initializes/constructs r/squeue and registers queue pages. + */ +static inline int init_qp_queue(struct ehca_shca *shca, + struct ehca_pd *pd, + struct ehca_qp *my_qp, + struct ipz_queue *queue, + int q_type, + u64 expected_hret, + struct ehca_alloc_queue_parms *parms, + int wqe_size) +{ + int ret, cnt, ipz_rc, nr_q_pages; + void *vpage; + u64 rpage, h_ret; + struct ib_device *ib_dev = &shca->ib_device; + struct ipz_adapter_handle ipz_hca_handle = shca->ipz_hca_handle; + + if (!parms->queue_size) + return 0; + + if (parms->is_small) { + nr_q_pages = 1; + ipz_rc = ipz_queue_ctor(pd, queue, nr_q_pages, + 128 << parms->page_size, + wqe_size, parms->act_nr_sges, 1); + } else { + nr_q_pages = parms->queue_size; + ipz_rc = ipz_queue_ctor(pd, queue, nr_q_pages, + EHCA_PAGESIZE, wqe_size, + parms->act_nr_sges, 0); + } + + if (!ipz_rc) { + ehca_err(ib_dev, "Cannot allocate page for queue. ipz_rc=%i", + ipz_rc); + return -EBUSY; + } + + /* register queue pages */ + for (cnt = 0; cnt < nr_q_pages; cnt++) { + vpage = ipz_qpageit_get_inc(queue); + if (!vpage) { + ehca_err(ib_dev, "ipz_qpageit_get_inc() " + "failed p_vpage= %p", vpage); + ret = -EINVAL; + goto init_qp_queue1; + } + rpage = __pa(vpage); + + h_ret = hipz_h_register_rpage_qp(ipz_hca_handle, + my_qp->ipz_qp_handle, + NULL, 0, q_type, + rpage, parms->is_small ? 0 : 1, + my_qp->galpas.kernel); + if (cnt == (nr_q_pages - 1)) { /* last page! 
*/ + if (h_ret != expected_hret) { + ehca_err(ib_dev, "hipz_qp_register_rpage() " + "h_ret=%lli", h_ret); + ret = ehca2ib_return_code(h_ret); + goto init_qp_queue1; + } + vpage = ipz_qpageit_get_inc(&my_qp->ipz_rqueue); + if (vpage) { + ehca_err(ib_dev, "ipz_qpageit_get_inc() " + "should not succeed vpage=%p", vpage); + ret = -EINVAL; + goto init_qp_queue1; + } + } else { + if (h_ret != H_PAGE_REGISTERED) { + ehca_err(ib_dev, "hipz_qp_register_rpage() " + "h_ret=%lli", h_ret); + ret = ehca2ib_return_code(h_ret); + goto init_qp_queue1; + } + } + } + + ipz_qeit_reset(queue); + + return 0; + +init_qp_queue1: + ipz_queue_dtor(pd, queue); + return ret; +} + +static inline int ehca_calc_wqe_size(int act_nr_sge, int is_llqp) +{ + if (is_llqp) + return 128 << act_nr_sge; + else + return offsetof(struct ehca_wqe, + u.nud.sg_list[act_nr_sge]); +} + +static void ehca_determine_small_queue(struct ehca_alloc_queue_parms *queue, + int req_nr_sge, int is_llqp) +{ + u32 wqe_size, q_size; + int act_nr_sge = req_nr_sge; + + if (!is_llqp) + /* round up #SGEs so WQE size is a power of 2 */ + for (act_nr_sge = 4; act_nr_sge <= 252; + act_nr_sge = 4 + 2 * act_nr_sge) + if (act_nr_sge >= req_nr_sge) + break; + + wqe_size = ehca_calc_wqe_size(act_nr_sge, is_llqp); + q_size = wqe_size * (queue->max_wr + 1); + + if (q_size <= 512) + queue->page_size = 2; + else if (q_size <= 1024) + queue->page_size = 3; + else + queue->page_size = 0; + + queue->is_small = (queue->page_size != 0); +} + +/* needs to be called with cq->spinlock held */ +void ehca_add_to_err_list(struct ehca_qp *qp, int on_sq) +{ + struct list_head *list, *node; + + /* TODO: support low latency QPs */ + if (qp->ext_type == EQPT_LLQP) + return; + + if (on_sq) { + list = &qp->send_cq->sqp_err_list; + node = &qp->sq_err_node; + } else { + list = &qp->recv_cq->rqp_err_list; + node = &qp->rq_err_node; + } + + if (list_empty(node)) + list_add_tail(node, list); + + return; +} + +static void del_from_err_list(struct ehca_cq *cq, struct list_head *node) +{ + unsigned long flags; + + spin_lock_irqsave(&cq->spinlock, flags); + + if (!list_empty(node)) + list_del_init(node); + + spin_unlock_irqrestore(&cq->spinlock, flags); +} + +static void reset_queue_map(struct ehca_queue_map *qmap) +{ + int i; + + qmap->tail = qmap->entries - 1; + qmap->left_to_poll = 0; + qmap->next_wqe_idx = 0; + for (i = 0; i < qmap->entries; i++) { + qmap->map[i].reported = 1; + qmap->map[i].cqe_req = 0; + } +} + +/* + * Create an ib_qp struct that is either a QP or an SRQ, depending on + * the value of the is_srq parameter. If init_attr and srq_init_attr share + * fields, the field out of init_attr is used. 
+ */ +static struct ehca_qp *internal_create_qp( + struct ib_pd *pd, + struct ib_qp_init_attr *init_attr, + struct ib_srq_init_attr *srq_init_attr, + struct ib_udata *udata, int is_srq) +{ + struct ehca_qp *my_qp, *my_srq = NULL; + struct ehca_pd *my_pd = container_of(pd, struct ehca_pd, ib_pd); + struct ehca_shca *shca = container_of(pd->device, struct ehca_shca, + ib_device); + struct ib_ucontext *context = NULL; + u64 h_ret; + int is_llqp = 0, has_srq = 0, is_user = 0; + int qp_type, max_send_sge, max_recv_sge, ret; + + /* h_call's out parameters */ + struct ehca_alloc_qp_parms parms; + u32 swqe_size = 0, rwqe_size = 0, ib_qp_num; + unsigned long flags; + + if (!atomic_add_unless(&shca->num_qps, 1, shca->max_num_qps)) { + ehca_err(pd->device, "Unable to create QP, max number of %i " + "QPs reached.", shca->max_num_qps); + ehca_err(pd->device, "To increase the maximum number of QPs " + "use the number_of_qps module parameter.\n"); + return ERR_PTR(-ENOSPC); + } + + if (init_attr->create_flags) { + atomic_dec(&shca->num_qps); + return ERR_PTR(-EINVAL); + } + + memset(&parms, 0, sizeof(parms)); + qp_type = init_attr->qp_type; + + if (init_attr->sq_sig_type != IB_SIGNAL_REQ_WR && + init_attr->sq_sig_type != IB_SIGNAL_ALL_WR) { + ehca_err(pd->device, "init_attr->sg_sig_type=%x not allowed", + init_attr->sq_sig_type); + atomic_dec(&shca->num_qps); + return ERR_PTR(-EINVAL); + } + + /* save LLQP info */ + if (qp_type & 0x80) { + is_llqp = 1; + parms.ext_type = EQPT_LLQP; + parms.ll_comp_flags = qp_type & LLQP_COMP_MASK; + } + qp_type &= 0x1F; + init_attr->qp_type &= 0x1F; + + /* handle SRQ base QPs */ + if (init_attr->srq) { + my_srq = container_of(init_attr->srq, struct ehca_qp, ib_srq); + + if (qp_type == IB_QPT_UC) { + ehca_err(pd->device, "UC with SRQ not supported"); + atomic_dec(&shca->num_qps); + return ERR_PTR(-EINVAL); + } + + has_srq = 1; + parms.ext_type = EQPT_SRQBASE; + parms.srq_qpn = my_srq->real_qp_num; + } + + if (is_llqp && has_srq) { + ehca_err(pd->device, "LLQPs can't have an SRQ"); + atomic_dec(&shca->num_qps); + return ERR_PTR(-EINVAL); + } + + /* handle SRQs */ + if (is_srq) { + parms.ext_type = EQPT_SRQ; + parms.srq_limit = srq_init_attr->attr.srq_limit; + if (init_attr->cap.max_recv_sge > 3) { + ehca_err(pd->device, "no more than three SGEs " + "supported for SRQ pd=%p max_sge=%x", + pd, init_attr->cap.max_recv_sge); + atomic_dec(&shca->num_qps); + return ERR_PTR(-EINVAL); + } + } + + /* check QP type */ + if (qp_type != IB_QPT_UD && + qp_type != IB_QPT_UC && + qp_type != IB_QPT_RC && + qp_type != IB_QPT_SMI && + qp_type != IB_QPT_GSI) { + ehca_err(pd->device, "wrong QP Type=%x", qp_type); + atomic_dec(&shca->num_qps); + return ERR_PTR(-EINVAL); + } + + if (is_llqp) { + switch (qp_type) { + case IB_QPT_RC: + if ((init_attr->cap.max_send_wr > 255) || + (init_attr->cap.max_recv_wr > 255)) { + ehca_err(pd->device, + "Invalid Number of max_sq_wr=%x " + "or max_rq_wr=%x for RC LLQP", + init_attr->cap.max_send_wr, + init_attr->cap.max_recv_wr); + atomic_dec(&shca->num_qps); + return ERR_PTR(-EINVAL); + } + break; + case IB_QPT_UD: + if (!EHCA_BMASK_GET(HCA_CAP_UD_LL_QP, shca->hca_cap)) { + ehca_err(pd->device, "UD LLQP not supported " + "by this adapter"); + atomic_dec(&shca->num_qps); + return ERR_PTR(-ENOSYS); + } + if (!(init_attr->cap.max_send_sge <= 5 + && init_attr->cap.max_send_sge >= 1 + && init_attr->cap.max_recv_sge <= 5 + && init_attr->cap.max_recv_sge >= 1)) { + ehca_err(pd->device, + "Invalid Number of max_send_sge=%x " + "or max_recv_sge=%x for UD LLQP", + 
init_attr->cap.max_send_sge, + init_attr->cap.max_recv_sge); + atomic_dec(&shca->num_qps); + return ERR_PTR(-EINVAL); + } else if (init_attr->cap.max_send_wr > 255) { + ehca_err(pd->device, + "Invalid Number of " + "max_send_wr=%x for UD QP_TYPE=%x", + init_attr->cap.max_send_wr, qp_type); + atomic_dec(&shca->num_qps); + return ERR_PTR(-EINVAL); + } + break; + default: + ehca_err(pd->device, "unsupported LL QP Type=%x", + qp_type); + atomic_dec(&shca->num_qps); + return ERR_PTR(-EINVAL); + } + } else { + int max_sge = (qp_type == IB_QPT_UD || qp_type == IB_QPT_SMI + || qp_type == IB_QPT_GSI) ? 250 : 252; + + if (init_attr->cap.max_send_sge > max_sge + || init_attr->cap.max_recv_sge > max_sge) { + ehca_err(pd->device, "Invalid number of SGEs requested " + "send_sge=%x recv_sge=%x max_sge=%x", + init_attr->cap.max_send_sge, + init_attr->cap.max_recv_sge, max_sge); + atomic_dec(&shca->num_qps); + return ERR_PTR(-EINVAL); + } + } + + my_qp = kmem_cache_zalloc(qp_cache, GFP_KERNEL); + if (!my_qp) { + ehca_err(pd->device, "pd=%p not enough memory to alloc qp", pd); + atomic_dec(&shca->num_qps); + return ERR_PTR(-ENOMEM); + } + + if (pd->uobject && udata) { + is_user = 1; + context = pd->uobject->context; + } + + atomic_set(&my_qp->nr_events, 0); + init_waitqueue_head(&my_qp->wait_completion); + spin_lock_init(&my_qp->spinlock_s); + spin_lock_init(&my_qp->spinlock_r); + my_qp->qp_type = qp_type; + my_qp->ext_type = parms.ext_type; + my_qp->state = IB_QPS_RESET; + + if (init_attr->recv_cq) + my_qp->recv_cq = + container_of(init_attr->recv_cq, struct ehca_cq, ib_cq); + if (init_attr->send_cq) + my_qp->send_cq = + container_of(init_attr->send_cq, struct ehca_cq, ib_cq); + + idr_preload(GFP_KERNEL); + write_lock_irqsave(&ehca_qp_idr_lock, flags); + + ret = idr_alloc(&ehca_qp_idr, my_qp, 0, 0x2000000, GFP_NOWAIT); + if (ret >= 0) + my_qp->token = ret; + + write_unlock_irqrestore(&ehca_qp_idr_lock, flags); + idr_preload_end(); + if (ret < 0) { + if (ret == -ENOSPC) { + ret = -EINVAL; + ehca_err(pd->device, "Invalid number of qp"); + } else { + ret = -ENOMEM; + ehca_err(pd->device, "Can't allocate new idr entry."); + } + goto create_qp_exit0; + } + + if (has_srq) + parms.srq_token = my_qp->token; + + parms.servicetype = ibqptype2servicetype(qp_type); + if (parms.servicetype < 0) { + ret = -EINVAL; + ehca_err(pd->device, "Invalid qp_type=%x", qp_type); + goto create_qp_exit1; + } + + /* Always signal by WQE so we can hide circ. 
WQEs */ + parms.sigtype = HCALL_SIGT_BY_WQE; + + /* UD_AV CIRCUMVENTION */ + max_send_sge = init_attr->cap.max_send_sge; + max_recv_sge = init_attr->cap.max_recv_sge; + if (parms.servicetype == ST_UD && !is_llqp) { + max_send_sge += 2; + max_recv_sge += 2; + } + + parms.token = my_qp->token; + parms.eq_handle = shca->eq.ipz_eq_handle; + parms.pd = my_pd->fw_pd; + if (my_qp->send_cq) + parms.send_cq_handle = my_qp->send_cq->ipz_cq_handle; + if (my_qp->recv_cq) + parms.recv_cq_handle = my_qp->recv_cq->ipz_cq_handle; + + parms.squeue.max_wr = init_attr->cap.max_send_wr; + parms.rqueue.max_wr = init_attr->cap.max_recv_wr; + parms.squeue.max_sge = max_send_sge; + parms.rqueue.max_sge = max_recv_sge; + + /* RC QPs need one more SWQE for unsolicited ack circumvention */ + if (qp_type == IB_QPT_RC) + parms.squeue.max_wr++; + + if (EHCA_BMASK_GET(HCA_CAP_MINI_QP, shca->hca_cap)) { + if (HAS_SQ(my_qp)) + ehca_determine_small_queue( + &parms.squeue, max_send_sge, is_llqp); + if (HAS_RQ(my_qp)) + ehca_determine_small_queue( + &parms.rqueue, max_recv_sge, is_llqp); + parms.qp_storage = + (parms.squeue.is_small || parms.rqueue.is_small); + } + + h_ret = hipz_h_alloc_resource_qp(shca->ipz_hca_handle, &parms, is_user); + if (h_ret != H_SUCCESS) { + ehca_err(pd->device, "h_alloc_resource_qp() failed h_ret=%lli", + h_ret); + ret = ehca2ib_return_code(h_ret); + goto create_qp_exit1; + } + + ib_qp_num = my_qp->real_qp_num = parms.real_qp_num; + my_qp->ipz_qp_handle = parms.qp_handle; + my_qp->galpas = parms.galpas; + + swqe_size = ehca_calc_wqe_size(parms.squeue.act_nr_sges, is_llqp); + rwqe_size = ehca_calc_wqe_size(parms.rqueue.act_nr_sges, is_llqp); + + switch (qp_type) { + case IB_QPT_RC: + if (is_llqp) { + parms.squeue.act_nr_sges = 1; + parms.rqueue.act_nr_sges = 1; + } + /* hide the extra WQE */ + parms.squeue.act_nr_wqes--; + break; + case IB_QPT_UD: + case IB_QPT_GSI: + case IB_QPT_SMI: + /* UD circumvention */ + if (is_llqp) { + parms.squeue.act_nr_sges = 1; + parms.rqueue.act_nr_sges = 1; + } else { + parms.squeue.act_nr_sges -= 2; + parms.rqueue.act_nr_sges -= 2; + } + + if (IB_QPT_GSI == qp_type || IB_QPT_SMI == qp_type) { + parms.squeue.act_nr_wqes = init_attr->cap.max_send_wr; + parms.rqueue.act_nr_wqes = init_attr->cap.max_recv_wr; + parms.squeue.act_nr_sges = init_attr->cap.max_send_sge; + parms.rqueue.act_nr_sges = init_attr->cap.max_recv_sge; + ib_qp_num = (qp_type == IB_QPT_SMI) ? 0 : 1; + } + + break; + + default: + break; + } + + /* initialize r/squeue and register queue pages */ + if (HAS_SQ(my_qp)) { + ret = init_qp_queue( + shca, my_pd, my_qp, &my_qp->ipz_squeue, 0, + HAS_RQ(my_qp) ? 
H_PAGE_REGISTERED : H_SUCCESS, + &parms.squeue, swqe_size); + if (ret) { + ehca_err(pd->device, "Couldn't initialize squeue " + "and pages ret=%i", ret); + goto create_qp_exit2; + } + + if (!is_user) { + my_qp->sq_map.entries = my_qp->ipz_squeue.queue_length / + my_qp->ipz_squeue.qe_size; + my_qp->sq_map.map = vmalloc(my_qp->sq_map.entries * + sizeof(struct ehca_qmap_entry)); + if (!my_qp->sq_map.map) { + ehca_err(pd->device, "Couldn't allocate squeue " + "map ret=%i", ret); + goto create_qp_exit3; + } + INIT_LIST_HEAD(&my_qp->sq_err_node); + /* to avoid the generation of bogus flush CQEs */ + reset_queue_map(&my_qp->sq_map); + } + } + + if (HAS_RQ(my_qp)) { + ret = init_qp_queue( + shca, my_pd, my_qp, &my_qp->ipz_rqueue, 1, + H_SUCCESS, &parms.rqueue, rwqe_size); + if (ret) { + ehca_err(pd->device, "Couldn't initialize rqueue " + "and pages ret=%i", ret); + goto create_qp_exit4; + } + if (!is_user) { + my_qp->rq_map.entries = my_qp->ipz_rqueue.queue_length / + my_qp->ipz_rqueue.qe_size; + my_qp->rq_map.map = vmalloc(my_qp->rq_map.entries * + sizeof(struct ehca_qmap_entry)); + if (!my_qp->rq_map.map) { + ehca_err(pd->device, "Couldn't allocate squeue " + "map ret=%i", ret); + goto create_qp_exit5; + } + INIT_LIST_HEAD(&my_qp->rq_err_node); + /* to avoid the generation of bogus flush CQEs */ + reset_queue_map(&my_qp->rq_map); + } + } else if (init_attr->srq && !is_user) { + /* this is a base QP, use the queue map of the SRQ */ + my_qp->rq_map = my_srq->rq_map; + INIT_LIST_HEAD(&my_qp->rq_err_node); + + my_qp->ipz_rqueue = my_srq->ipz_rqueue; + } + + if (is_srq) { + my_qp->ib_srq.pd = &my_pd->ib_pd; + my_qp->ib_srq.device = my_pd->ib_pd.device; + + my_qp->ib_srq.srq_context = init_attr->qp_context; + my_qp->ib_srq.event_handler = init_attr->event_handler; + } else { + my_qp->ib_qp.qp_num = ib_qp_num; + my_qp->ib_qp.pd = &my_pd->ib_pd; + my_qp->ib_qp.device = my_pd->ib_pd.device; + + my_qp->ib_qp.recv_cq = init_attr->recv_cq; + my_qp->ib_qp.send_cq = init_attr->send_cq; + + my_qp->ib_qp.qp_type = qp_type; + my_qp->ib_qp.srq = init_attr->srq; + + my_qp->ib_qp.qp_context = init_attr->qp_context; + my_qp->ib_qp.event_handler = init_attr->event_handler; + } + + init_attr->cap.max_inline_data = 0; /* not supported yet */ + init_attr->cap.max_recv_sge = parms.rqueue.act_nr_sges; + init_attr->cap.max_recv_wr = parms.rqueue.act_nr_wqes; + init_attr->cap.max_send_sge = parms.squeue.act_nr_sges; + init_attr->cap.max_send_wr = parms.squeue.act_nr_wqes; + my_qp->init_attr = *init_attr; + + if (qp_type == IB_QPT_SMI || qp_type == IB_QPT_GSI) { + shca->sport[init_attr->port_num - 1].ibqp_sqp[qp_type] = + &my_qp->ib_qp; + if (ehca_nr_ports < 0) { + /* alloc array to cache subsequent modify qp parms + * for autodetect mode + */ + my_qp->mod_qp_parm = + kzalloc(EHCA_MOD_QP_PARM_MAX * + sizeof(*my_qp->mod_qp_parm), + GFP_KERNEL); + if (!my_qp->mod_qp_parm) { + ehca_err(pd->device, + "Could not alloc mod_qp_parm"); + goto create_qp_exit5; + } + } + } + + /* NOTE: define_apq0() not supported yet */ + if (qp_type == IB_QPT_GSI) { + h_ret = ehca_define_sqp(shca, my_qp, init_attr); + if (h_ret != H_SUCCESS) { + kfree(my_qp->mod_qp_parm); + my_qp->mod_qp_parm = NULL; + /* the QP pointer is no longer valid */ + shca->sport[init_attr->port_num - 1].ibqp_sqp[qp_type] = + NULL; + ret = ehca2ib_return_code(h_ret); + goto create_qp_exit6; + } + } + + if (my_qp->send_cq) { + ret = ehca_cq_assign_qp(my_qp->send_cq, my_qp); + if (ret) { + ehca_err(pd->device, + "Couldn't assign qp to send_cq ret=%i", ret); + goto 
create_qp_exit7; + } + } + + /* copy queues, galpa data to user space */ + if (context && udata) { + struct ehca_create_qp_resp resp; + memset(&resp, 0, sizeof(resp)); + + resp.qp_num = my_qp->real_qp_num; + resp.token = my_qp->token; + resp.qp_type = my_qp->qp_type; + resp.ext_type = my_qp->ext_type; + resp.qkey = my_qp->qkey; + resp.real_qp_num = my_qp->real_qp_num; + + if (HAS_SQ(my_qp)) + queue2resp(&resp.ipz_squeue, &my_qp->ipz_squeue); + if (HAS_RQ(my_qp)) + queue2resp(&resp.ipz_rqueue, &my_qp->ipz_rqueue); + resp.fw_handle_ofs = (u32) + (my_qp->galpas.user.fw_handle & (PAGE_SIZE - 1)); + + if (ib_copy_to_udata(udata, &resp, sizeof resp)) { + ehca_err(pd->device, "Copy to udata failed"); + ret = -EINVAL; + goto create_qp_exit8; + } + } + + return my_qp; + +create_qp_exit8: + ehca_cq_unassign_qp(my_qp->send_cq, my_qp->real_qp_num); + +create_qp_exit7: + kfree(my_qp->mod_qp_parm); + +create_qp_exit6: + if (HAS_RQ(my_qp) && !is_user) + vfree(my_qp->rq_map.map); + +create_qp_exit5: + if (HAS_RQ(my_qp)) + ipz_queue_dtor(my_pd, &my_qp->ipz_rqueue); + +create_qp_exit4: + if (HAS_SQ(my_qp) && !is_user) + vfree(my_qp->sq_map.map); + +create_qp_exit3: + if (HAS_SQ(my_qp)) + ipz_queue_dtor(my_pd, &my_qp->ipz_squeue); + +create_qp_exit2: + hipz_h_destroy_qp(shca->ipz_hca_handle, my_qp); + +create_qp_exit1: + write_lock_irqsave(&ehca_qp_idr_lock, flags); + idr_remove(&ehca_qp_idr, my_qp->token); + write_unlock_irqrestore(&ehca_qp_idr_lock, flags); + +create_qp_exit0: + kmem_cache_free(qp_cache, my_qp); + atomic_dec(&shca->num_qps); + return ERR_PTR(ret); +} + +struct ib_qp *ehca_create_qp(struct ib_pd *pd, + struct ib_qp_init_attr *qp_init_attr, + struct ib_udata *udata) +{ + struct ehca_qp *ret; + + ret = internal_create_qp(pd, qp_init_attr, NULL, udata, 0); + return IS_ERR(ret) ? (struct ib_qp *)ret : &ret->ib_qp; +} + +static int internal_destroy_qp(struct ib_device *dev, struct ehca_qp *my_qp, + struct ib_uobject *uobject); + +struct ib_srq *ehca_create_srq(struct ib_pd *pd, + struct ib_srq_init_attr *srq_init_attr, + struct ib_udata *udata) +{ + struct ib_qp_init_attr qp_init_attr; + struct ehca_qp *my_qp; + struct ib_srq *ret; + struct ehca_shca *shca = container_of(pd->device, struct ehca_shca, + ib_device); + struct hcp_modify_qp_control_block *mqpcb; + u64 hret, update_mask; + + if (srq_init_attr->srq_type != IB_SRQT_BASIC) + return ERR_PTR(-ENOSYS); + + /* For common attributes, internal_create_qp() takes its info + * out of qp_init_attr, so copy all common attrs there. 
+ */ + memset(&qp_init_attr, 0, sizeof(qp_init_attr)); + qp_init_attr.event_handler = srq_init_attr->event_handler; + qp_init_attr.qp_context = srq_init_attr->srq_context; + qp_init_attr.sq_sig_type = IB_SIGNAL_ALL_WR; + qp_init_attr.qp_type = IB_QPT_RC; + qp_init_attr.cap.max_recv_wr = srq_init_attr->attr.max_wr; + qp_init_attr.cap.max_recv_sge = srq_init_attr->attr.max_sge; + + my_qp = internal_create_qp(pd, &qp_init_attr, srq_init_attr, udata, 1); + if (IS_ERR(my_qp)) + return (struct ib_srq *)my_qp; + + /* copy back return values */ + srq_init_attr->attr.max_wr = qp_init_attr.cap.max_recv_wr; + srq_init_attr->attr.max_sge = 3; + + /* drive SRQ into RTR state */ + mqpcb = ehca_alloc_fw_ctrlblock(GFP_KERNEL); + if (!mqpcb) { + ehca_err(pd->device, "Could not get zeroed page for mqpcb " + "ehca_qp=%p qp_num=%x ", my_qp, my_qp->real_qp_num); + ret = ERR_PTR(-ENOMEM); + goto create_srq1; + } + + mqpcb->qp_state = EHCA_QPS_INIT; + mqpcb->prim_phys_port = 1; + update_mask = EHCA_BMASK_SET(MQPCB_MASK_QP_STATE, 1); + hret = hipz_h_modify_qp(shca->ipz_hca_handle, + my_qp->ipz_qp_handle, + &my_qp->pf, + update_mask, + mqpcb, my_qp->galpas.kernel); + if (hret != H_SUCCESS) { + ehca_err(pd->device, "Could not modify SRQ to INIT " + "ehca_qp=%p qp_num=%x h_ret=%lli", + my_qp, my_qp->real_qp_num, hret); + goto create_srq2; + } + + mqpcb->qp_enable = 1; + update_mask = EHCA_BMASK_SET(MQPCB_MASK_QP_ENABLE, 1); + hret = hipz_h_modify_qp(shca->ipz_hca_handle, + my_qp->ipz_qp_handle, + &my_qp->pf, + update_mask, + mqpcb, my_qp->galpas.kernel); + if (hret != H_SUCCESS) { + ehca_err(pd->device, "Could not enable SRQ " + "ehca_qp=%p qp_num=%x h_ret=%lli", + my_qp, my_qp->real_qp_num, hret); + goto create_srq2; + } + + mqpcb->qp_state = EHCA_QPS_RTR; + update_mask = EHCA_BMASK_SET(MQPCB_MASK_QP_STATE, 1); + hret = hipz_h_modify_qp(shca->ipz_hca_handle, + my_qp->ipz_qp_handle, + &my_qp->pf, + update_mask, + mqpcb, my_qp->galpas.kernel); + if (hret != H_SUCCESS) { + ehca_err(pd->device, "Could not modify SRQ to RTR " + "ehca_qp=%p qp_num=%x h_ret=%lli", + my_qp, my_qp->real_qp_num, hret); + goto create_srq2; + } + + ehca_free_fw_ctrlblock(mqpcb); + + return &my_qp->ib_srq; + +create_srq2: + ret = ERR_PTR(ehca2ib_return_code(hret)); + ehca_free_fw_ctrlblock(mqpcb); + +create_srq1: + internal_destroy_qp(pd->device, my_qp, my_qp->ib_srq.uobject); + + return ret; +} + +/* + * prepare_sqe_rts called by internal_modify_qp() at trans sqe -> rts + * set purge bit of bad wqe and subsequent wqes to avoid reentering sqe + * returns total number of bad wqes in bad_wqe_cnt + */ +static int prepare_sqe_rts(struct ehca_qp *my_qp, struct ehca_shca *shca, + int *bad_wqe_cnt) +{ + u64 h_ret; + struct ipz_queue *squeue; + void *bad_send_wqe_p, *bad_send_wqe_v; + u64 q_ofs; + struct ehca_wqe *wqe; + int qp_num = my_qp->ib_qp.qp_num; + + /* get send wqe pointer */ + h_ret = hipz_h_disable_and_get_wqe(shca->ipz_hca_handle, + my_qp->ipz_qp_handle, &my_qp->pf, + &bad_send_wqe_p, NULL, 2); + if (h_ret != H_SUCCESS) { + ehca_err(&shca->ib_device, "hipz_h_disable_and_get_wqe() failed" + " ehca_qp=%p qp_num=%x h_ret=%lli", + my_qp, qp_num, h_ret); + return ehca2ib_return_code(h_ret); + } + bad_send_wqe_p = (void *)((u64)bad_send_wqe_p & (~(1L << 63))); + ehca_dbg(&shca->ib_device, "qp_num=%x bad_send_wqe_p=%p", + qp_num, bad_send_wqe_p); + /* convert wqe pointer to vadr */ + bad_send_wqe_v = __va((u64)bad_send_wqe_p); + if (ehca_debug_level >= 2) + ehca_dmp(bad_send_wqe_v, 32, "qp_num=%x bad_wqe", qp_num); + squeue = &my_qp->ipz_squeue; + 
if (ipz_queue_abs_to_offset(squeue, (u64)bad_send_wqe_p, &q_ofs)) { + ehca_err(&shca->ib_device, "failed to get wqe offset qp_num=%x" + " bad_send_wqe_p=%p", qp_num, bad_send_wqe_p); + return -EFAULT; + } + + /* loop sets wqe's purge bit */ + wqe = (struct ehca_wqe *)ipz_qeit_calc(squeue, q_ofs); + *bad_wqe_cnt = 0; + while (wqe->optype != 0xff && wqe->wqef != 0xff) { + if (ehca_debug_level >= 2) + ehca_dmp(wqe, 32, "qp_num=%x wqe", qp_num); + wqe->nr_of_data_seg = 0; /* suppress data access */ + wqe->wqef = WQEF_PURGE; /* WQE to be purged */ + q_ofs = ipz_queue_advance_offset(squeue, q_ofs); + wqe = (struct ehca_wqe *)ipz_qeit_calc(squeue, q_ofs); + *bad_wqe_cnt = (*bad_wqe_cnt)+1; + } + /* + * bad wqe will be reprocessed and ignored when pol_cq() is called, + * i.e. nr of wqes with flush error status is one less + */ + ehca_dbg(&shca->ib_device, "qp_num=%x flusherr_wqe_cnt=%x", + qp_num, (*bad_wqe_cnt)-1); + wqe->wqef = 0; + + return 0; +} + +static int calc_left_cqes(u64 wqe_p, struct ipz_queue *ipz_queue, + struct ehca_queue_map *qmap) +{ + void *wqe_v; + u64 q_ofs; + u32 wqe_idx; + unsigned int tail_idx; + + /* convert real to abs address */ + wqe_p = wqe_p & (~(1UL << 63)); + + wqe_v = __va(wqe_p); + + if (ipz_queue_abs_to_offset(ipz_queue, wqe_p, &q_ofs)) { + ehca_gen_err("Invalid offset for calculating left cqes " + "wqe_p=%#llx wqe_v=%p\n", wqe_p, wqe_v); + return -EFAULT; + } + + tail_idx = next_index(qmap->tail, qmap->entries); + wqe_idx = q_ofs / ipz_queue->qe_size; + + /* check all processed wqes, whether a cqe is requested or not */ + while (tail_idx != wqe_idx) { + if (qmap->map[tail_idx].cqe_req) + qmap->left_to_poll++; + tail_idx = next_index(tail_idx, qmap->entries); + } + /* save index in queue, where we have to start flushing */ + qmap->next_wqe_idx = wqe_idx; + return 0; +} + +static int check_for_left_cqes(struct ehca_qp *my_qp, struct ehca_shca *shca) +{ + u64 h_ret; + void *send_wqe_p, *recv_wqe_p; + int ret; + unsigned long flags; + int qp_num = my_qp->ib_qp.qp_num; + + /* this hcall is not supported on base QPs */ + if (my_qp->ext_type != EQPT_SRQBASE) { + /* get send and receive wqe pointer */ + h_ret = hipz_h_disable_and_get_wqe(shca->ipz_hca_handle, + my_qp->ipz_qp_handle, &my_qp->pf, + &send_wqe_p, &recv_wqe_p, 4); + if (h_ret != H_SUCCESS) { + ehca_err(&shca->ib_device, "disable_and_get_wqe() " + "failed ehca_qp=%p qp_num=%x h_ret=%lli", + my_qp, qp_num, h_ret); + return ehca2ib_return_code(h_ret); + } + + /* + * acquire lock to ensure that nobody is polling the cq which + * could mean that the qmap->tail pointer is in an + * inconsistent state. 
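+ * Both the send and the receive CQ locks are taken below, one at a
+ * time, while the corresponding queue map is updated.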
+ */ + spin_lock_irqsave(&my_qp->send_cq->spinlock, flags); + ret = calc_left_cqes((u64)send_wqe_p, &my_qp->ipz_squeue, + &my_qp->sq_map); + spin_unlock_irqrestore(&my_qp->send_cq->spinlock, flags); + if (ret) + return ret; + + + spin_lock_irqsave(&my_qp->recv_cq->spinlock, flags); + ret = calc_left_cqes((u64)recv_wqe_p, &my_qp->ipz_rqueue, + &my_qp->rq_map); + spin_unlock_irqrestore(&my_qp->recv_cq->spinlock, flags); + if (ret) + return ret; + } else { + spin_lock_irqsave(&my_qp->send_cq->spinlock, flags); + my_qp->sq_map.left_to_poll = 0; + my_qp->sq_map.next_wqe_idx = next_index(my_qp->sq_map.tail, + my_qp->sq_map.entries); + spin_unlock_irqrestore(&my_qp->send_cq->spinlock, flags); + + spin_lock_irqsave(&my_qp->recv_cq->spinlock, flags); + my_qp->rq_map.left_to_poll = 0; + my_qp->rq_map.next_wqe_idx = next_index(my_qp->rq_map.tail, + my_qp->rq_map.entries); + spin_unlock_irqrestore(&my_qp->recv_cq->spinlock, flags); + } + + /* this assures flush cqes being generated only for pending wqes */ + if ((my_qp->sq_map.left_to_poll == 0) && + (my_qp->rq_map.left_to_poll == 0)) { + spin_lock_irqsave(&my_qp->send_cq->spinlock, flags); + ehca_add_to_err_list(my_qp, 1); + spin_unlock_irqrestore(&my_qp->send_cq->spinlock, flags); + + if (HAS_RQ(my_qp)) { + spin_lock_irqsave(&my_qp->recv_cq->spinlock, flags); + ehca_add_to_err_list(my_qp, 0); + spin_unlock_irqrestore(&my_qp->recv_cq->spinlock, + flags); + } + } + + return 0; +} + +/* + * internal_modify_qp with circumvention to handle aqp0 properly + * smi_reset2init indicates if this is an internal reset-to-init-call for + * smi. This flag must always be zero if called from ehca_modify_qp()! + * This internal func was intorduced to avoid recursion of ehca_modify_qp()! + */ +static int internal_modify_qp(struct ib_qp *ibqp, + struct ib_qp_attr *attr, + int attr_mask, int smi_reset2init) +{ + enum ib_qp_state qp_cur_state, qp_new_state; + int cnt, qp_attr_idx, ret = 0; + enum ib_qp_statetrans statetrans; + struct hcp_modify_qp_control_block *mqpcb; + struct ehca_qp *my_qp = container_of(ibqp, struct ehca_qp, ib_qp); + struct ehca_shca *shca = + container_of(ibqp->pd->device, struct ehca_shca, ib_device); + u64 update_mask; + u64 h_ret; + int bad_wqe_cnt = 0; + int is_user = 0; + int squeue_locked = 0; + unsigned long flags = 0; + + /* do query_qp to obtain current attr values */ + mqpcb = ehca_alloc_fw_ctrlblock(GFP_ATOMIC); + if (!mqpcb) { + ehca_err(ibqp->device, "Could not get zeroed page for mqpcb " + "ehca_qp=%p qp_num=%x ", my_qp, ibqp->qp_num); + return -ENOMEM; + } + + h_ret = hipz_h_query_qp(shca->ipz_hca_handle, + my_qp->ipz_qp_handle, + &my_qp->pf, + mqpcb, my_qp->galpas.kernel); + if (h_ret != H_SUCCESS) { + ehca_err(ibqp->device, "hipz_h_query_qp() failed " + "ehca_qp=%p qp_num=%x h_ret=%lli", + my_qp, ibqp->qp_num, h_ret); + ret = ehca2ib_return_code(h_ret); + goto modify_qp_exit1; + } + if (ibqp->uobject) + is_user = 1; + + qp_cur_state = ehca2ib_qp_state(mqpcb->qp_state); + + if (qp_cur_state == -EINVAL) { /* invalid qp state */ + ret = -EINVAL; + ehca_err(ibqp->device, "Invalid current ehca_qp_state=%x " + "ehca_qp=%p qp_num=%x", + mqpcb->qp_state, my_qp, ibqp->qp_num); + goto modify_qp_exit1; + } + /* + * circumvention to set aqp0 initial state to init + * as expected by IB spec + */ + if (smi_reset2init == 0 && + ibqp->qp_type == IB_QPT_SMI && + qp_cur_state == IB_QPS_RESET && + (attr_mask & IB_QP_STATE) && + attr->qp_state == IB_QPS_INIT) { /* RESET -> INIT */ + struct ib_qp_attr smiqp_attr = { + .qp_state = IB_QPS_INIT, + 
.port_num = my_qp->init_attr.port_num, + .pkey_index = 0, + .qkey = 0 + }; + int smiqp_attr_mask = IB_QP_STATE | IB_QP_PORT | + IB_QP_PKEY_INDEX | IB_QP_QKEY; + int smirc = internal_modify_qp( + ibqp, &smiqp_attr, smiqp_attr_mask, 1); + if (smirc) { + ehca_err(ibqp->device, "SMI RESET -> INIT failed. " + "ehca_modify_qp() rc=%i", smirc); + ret = H_PARAMETER; + goto modify_qp_exit1; + } + qp_cur_state = IB_QPS_INIT; + ehca_dbg(ibqp->device, "SMI RESET -> INIT succeeded"); + } + /* is transmitted current state equal to "real" current state */ + if ((attr_mask & IB_QP_CUR_STATE) && + qp_cur_state != attr->cur_qp_state) { + ret = -EINVAL; + ehca_err(ibqp->device, + "Invalid IB_QP_CUR_STATE attr->curr_qp_state=%x <>" + " actual cur_qp_state=%x. ehca_qp=%p qp_num=%x", + attr->cur_qp_state, qp_cur_state, my_qp, ibqp->qp_num); + goto modify_qp_exit1; + } + + ehca_dbg(ibqp->device, "ehca_qp=%p qp_num=%x current qp_state=%x " + "new qp_state=%x attribute_mask=%x", + my_qp, ibqp->qp_num, qp_cur_state, attr->qp_state, attr_mask); + + qp_new_state = attr_mask & IB_QP_STATE ? attr->qp_state : qp_cur_state; + if (!smi_reset2init && + !ib_modify_qp_is_ok(qp_cur_state, qp_new_state, ibqp->qp_type, + attr_mask, IB_LINK_LAYER_UNSPECIFIED)) { + ret = -EINVAL; + ehca_err(ibqp->device, + "Invalid qp transition new_state=%x cur_state=%x " + "ehca_qp=%p qp_num=%x attr_mask=%x", qp_new_state, + qp_cur_state, my_qp, ibqp->qp_num, attr_mask); + goto modify_qp_exit1; + } + + mqpcb->qp_state = ib2ehca_qp_state(qp_new_state); + if (mqpcb->qp_state) + update_mask = EHCA_BMASK_SET(MQPCB_MASK_QP_STATE, 1); + else { + ret = -EINVAL; + ehca_err(ibqp->device, "Invalid new qp state=%x " + "ehca_qp=%p qp_num=%x", + qp_new_state, my_qp, ibqp->qp_num); + goto modify_qp_exit1; + } + + /* retrieve state transition struct to get req and opt attrs */ + statetrans = get_modqp_statetrans(qp_cur_state, qp_new_state); + if (statetrans < 0) { + ret = -EINVAL; + ehca_err(ibqp->device, " qp_cur_state=%x " + "new_qp_state=%x State_xsition=%x ehca_qp=%p " + "qp_num=%x", qp_cur_state, qp_new_state, + statetrans, my_qp, ibqp->qp_num); + goto modify_qp_exit1; + } + + qp_attr_idx = ib2ehcaqptype(ibqp->qp_type); + + if (qp_attr_idx < 0) { + ret = qp_attr_idx; + ehca_err(ibqp->device, + "Invalid QP type=%x ehca_qp=%p qp_num=%x", + ibqp->qp_type, my_qp, ibqp->qp_num); + goto modify_qp_exit1; + } + + ehca_dbg(ibqp->device, + "ehca_qp=%p qp_num=%x qp_state_xsit=%x", + my_qp, ibqp->qp_num, statetrans); + + /* eHCA2 rev2 and higher require the SEND_GRH_FLAG to be set + * in non-LL UD QPs. 
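+ * The hw_level >= 0x22 test below is how that hardware level is
+ * detected.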
+ */ + if ((my_qp->qp_type == IB_QPT_UD) && + (my_qp->ext_type != EQPT_LLQP) && + (statetrans == IB_QPST_INIT2RTR) && + (shca->hw_level >= 0x22)) { + update_mask |= EHCA_BMASK_SET(MQPCB_MASK_SEND_GRH_FLAG, 1); + mqpcb->send_grh_flag = 1; + } + + /* sqe -> rts: set purge bit of bad wqe before actual trans */ + if ((my_qp->qp_type == IB_QPT_UD || + my_qp->qp_type == IB_QPT_GSI || + my_qp->qp_type == IB_QPT_SMI) && + statetrans == IB_QPST_SQE2RTS) { + /* mark next free wqe if kernel */ + if (!ibqp->uobject) { + struct ehca_wqe *wqe; + /* lock send queue */ + spin_lock_irqsave(&my_qp->spinlock_s, flags); + squeue_locked = 1; + /* mark next free wqe */ + wqe = (struct ehca_wqe *) + ipz_qeit_get(&my_qp->ipz_squeue); + wqe->optype = wqe->wqef = 0xff; + ehca_dbg(ibqp->device, "qp_num=%x next_free_wqe=%p", + ibqp->qp_num, wqe); + } + ret = prepare_sqe_rts(my_qp, shca, &bad_wqe_cnt); + if (ret) { + ehca_err(ibqp->device, "prepare_sqe_rts() failed " + "ehca_qp=%p qp_num=%x ret=%i", + my_qp, ibqp->qp_num, ret); + goto modify_qp_exit2; + } + } + + /* + * enable RDMA_Atomic_Control if reset->init und reliable con + * this is necessary since gen2 does not provide that flag, + * but pHyp requires it + */ + if (statetrans == IB_QPST_RESET2INIT && + (ibqp->qp_type == IB_QPT_RC || ibqp->qp_type == IB_QPT_UC)) { + mqpcb->rdma_atomic_ctrl = 3; + update_mask |= EHCA_BMASK_SET(MQPCB_MASK_RDMA_ATOMIC_CTRL, 1); + } + /* circ. pHyp requires #RDMA/Atomic Resp Res for UC INIT -> RTR */ + if (statetrans == IB_QPST_INIT2RTR && + (ibqp->qp_type == IB_QPT_UC) && + !(attr_mask & IB_QP_MAX_DEST_RD_ATOMIC)) { + mqpcb->rdma_nr_atomic_resp_res = 1; /* default to 1 */ + update_mask |= + EHCA_BMASK_SET(MQPCB_MASK_RDMA_NR_ATOMIC_RESP_RES, 1); + } + + if (attr_mask & IB_QP_PKEY_INDEX) { + if (attr->pkey_index >= 16) { + ret = -EINVAL; + ehca_err(ibqp->device, "Invalid pkey_index=%x. " + "ehca_qp=%p qp_num=%x max_pkey_index=f", + attr->pkey_index, my_qp, ibqp->qp_num); + goto modify_qp_exit2; + } + mqpcb->prim_p_key_idx = attr->pkey_index; + update_mask |= EHCA_BMASK_SET(MQPCB_MASK_PRIM_P_KEY_IDX, 1); + } + if (attr_mask & IB_QP_PORT) { + struct ehca_sport *sport; + struct ehca_qp *aqp1; + if (attr->port_num < 1 || attr->port_num > shca->num_ports) { + ret = -EINVAL; + ehca_err(ibqp->device, "Invalid port=%x. 
" + "ehca_qp=%p qp_num=%x num_ports=%x", + attr->port_num, my_qp, ibqp->qp_num, + shca->num_ports); + goto modify_qp_exit2; + } + sport = &shca->sport[attr->port_num - 1]; + if (!sport->ibqp_sqp[IB_QPT_GSI]) { + /* should not occur */ + ret = -EFAULT; + ehca_err(ibqp->device, "AQP1 was not created for " + "port=%x", attr->port_num); + goto modify_qp_exit2; + } + aqp1 = container_of(sport->ibqp_sqp[IB_QPT_GSI], + struct ehca_qp, ib_qp); + if (ibqp->qp_type != IB_QPT_GSI && + ibqp->qp_type != IB_QPT_SMI && + aqp1->mod_qp_parm) { + /* + * firmware will reject this modify_qp() because + * port is not activated/initialized fully + */ + ret = -EFAULT; + ehca_warn(ibqp->device, "Couldn't modify qp port=%x: " + "either port is being activated (try again) " + "or cabling issue", attr->port_num); + goto modify_qp_exit2; + } + mqpcb->prim_phys_port = attr->port_num; + update_mask |= EHCA_BMASK_SET(MQPCB_MASK_PRIM_PHYS_PORT, 1); + } + if (attr_mask & IB_QP_QKEY) { + mqpcb->qkey = attr->qkey; + update_mask |= EHCA_BMASK_SET(MQPCB_MASK_QKEY, 1); + } + if (attr_mask & IB_QP_AV) { + mqpcb->dlid = attr->ah_attr.dlid; + update_mask |= EHCA_BMASK_SET(MQPCB_MASK_DLID, 1); + mqpcb->source_path_bits = attr->ah_attr.src_path_bits; + update_mask |= EHCA_BMASK_SET(MQPCB_MASK_SOURCE_PATH_BITS, 1); + mqpcb->service_level = attr->ah_attr.sl; + update_mask |= EHCA_BMASK_SET(MQPCB_MASK_SERVICE_LEVEL, 1); + + if (ehca_calc_ipd(shca, mqpcb->prim_phys_port, + attr->ah_attr.static_rate, + &mqpcb->max_static_rate)) { + ret = -EINVAL; + goto modify_qp_exit2; + } + update_mask |= EHCA_BMASK_SET(MQPCB_MASK_MAX_STATIC_RATE, 1); + + /* + * Always supply the GRH flag, even if it's zero, to give the + * hypervisor a clear "yes" or "no" instead of a "perhaps" + */ + update_mask |= EHCA_BMASK_SET(MQPCB_MASK_SEND_GRH_FLAG, 1); + + /* + * only if GRH is TRUE we might consider SOURCE_GID_IDX + * and DEST_GID otherwise phype will return H_ATTR_PARM!!! + */ + if (attr->ah_attr.ah_flags == IB_AH_GRH) { + mqpcb->send_grh_flag = 1; + + mqpcb->source_gid_idx = attr->ah_attr.grh.sgid_index; + update_mask |= + EHCA_BMASK_SET(MQPCB_MASK_SOURCE_GID_IDX, 1); + + for (cnt = 0; cnt < 16; cnt++) + mqpcb->dest_gid.byte[cnt] = + attr->ah_attr.grh.dgid.raw[cnt]; + + update_mask |= EHCA_BMASK_SET(MQPCB_MASK_DEST_GID, 1); + mqpcb->flow_label = attr->ah_attr.grh.flow_label; + update_mask |= EHCA_BMASK_SET(MQPCB_MASK_FLOW_LABEL, 1); + mqpcb->hop_limit = attr->ah_attr.grh.hop_limit; + update_mask |= EHCA_BMASK_SET(MQPCB_MASK_HOP_LIMIT, 1); + mqpcb->traffic_class = attr->ah_attr.grh.traffic_class; + update_mask |= + EHCA_BMASK_SET(MQPCB_MASK_TRAFFIC_CLASS, 1); + } + } + + if (attr_mask & IB_QP_PATH_MTU) { + /* store ld(MTU) */ + my_qp->mtu_shift = attr->path_mtu + 7; + mqpcb->path_mtu = attr->path_mtu; + update_mask |= EHCA_BMASK_SET(MQPCB_MASK_PATH_MTU, 1); + } + if (attr_mask & IB_QP_TIMEOUT) { + mqpcb->timeout = attr->timeout; + update_mask |= EHCA_BMASK_SET(MQPCB_MASK_TIMEOUT, 1); + } + if (attr_mask & IB_QP_RETRY_CNT) { + mqpcb->retry_count = attr->retry_cnt; + update_mask |= EHCA_BMASK_SET(MQPCB_MASK_RETRY_COUNT, 1); + } + if (attr_mask & IB_QP_RNR_RETRY) { + mqpcb->rnr_retry_count = attr->rnr_retry; + update_mask |= EHCA_BMASK_SET(MQPCB_MASK_RNR_RETRY_COUNT, 1); + } + if (attr_mask & IB_QP_RQ_PSN) { + mqpcb->receive_psn = attr->rq_psn; + update_mask |= EHCA_BMASK_SET(MQPCB_MASK_RECEIVE_PSN, 1); + } + if (attr_mask & IB_QP_MAX_DEST_RD_ATOMIC) { + mqpcb->rdma_nr_atomic_resp_res = attr->max_dest_rd_atomic < 3 ? 
+ attr->max_dest_rd_atomic : 2; + update_mask |= + EHCA_BMASK_SET(MQPCB_MASK_RDMA_NR_ATOMIC_RESP_RES, 1); + } + if (attr_mask & IB_QP_MAX_QP_RD_ATOMIC) { + mqpcb->rdma_atomic_outst_dest_qp = attr->max_rd_atomic < 3 ? + attr->max_rd_atomic : 2; + update_mask |= + EHCA_BMASK_SET + (MQPCB_MASK_RDMA_ATOMIC_OUTST_DEST_QP, 1); + } + if (attr_mask & IB_QP_ALT_PATH) { + if (attr->alt_port_num < 1 + || attr->alt_port_num > shca->num_ports) { + ret = -EINVAL; + ehca_err(ibqp->device, "Invalid alt_port=%x. " + "ehca_qp=%p qp_num=%x num_ports=%x", + attr->alt_port_num, my_qp, ibqp->qp_num, + shca->num_ports); + goto modify_qp_exit2; + } + mqpcb->alt_phys_port = attr->alt_port_num; + + if (attr->alt_pkey_index >= 16) { + ret = -EINVAL; + ehca_err(ibqp->device, "Invalid alt_pkey_index=%x. " + "ehca_qp=%p qp_num=%x max_pkey_index=f", + attr->pkey_index, my_qp, ibqp->qp_num); + goto modify_qp_exit2; + } + mqpcb->alt_p_key_idx = attr->alt_pkey_index; + + mqpcb->timeout_al = attr->alt_timeout; + mqpcb->dlid_al = attr->alt_ah_attr.dlid; + mqpcb->source_path_bits_al = attr->alt_ah_attr.src_path_bits; + mqpcb->service_level_al = attr->alt_ah_attr.sl; + + if (ehca_calc_ipd(shca, mqpcb->alt_phys_port, + attr->alt_ah_attr.static_rate, + &mqpcb->max_static_rate_al)) { + ret = -EINVAL; + goto modify_qp_exit2; + } + + /* OpenIB doesn't support alternate retry counts - copy them */ + mqpcb->retry_count_al = mqpcb->retry_count; + mqpcb->rnr_retry_count_al = mqpcb->rnr_retry_count; + + update_mask |= EHCA_BMASK_SET(MQPCB_MASK_ALT_PHYS_PORT, 1) + | EHCA_BMASK_SET(MQPCB_MASK_ALT_P_KEY_IDX, 1) + | EHCA_BMASK_SET(MQPCB_MASK_TIMEOUT_AL, 1) + | EHCA_BMASK_SET(MQPCB_MASK_DLID_AL, 1) + | EHCA_BMASK_SET(MQPCB_MASK_SOURCE_PATH_BITS_AL, 1) + | EHCA_BMASK_SET(MQPCB_MASK_SERVICE_LEVEL_AL, 1) + | EHCA_BMASK_SET(MQPCB_MASK_MAX_STATIC_RATE_AL, 1) + | EHCA_BMASK_SET(MQPCB_MASK_RETRY_COUNT_AL, 1) + | EHCA_BMASK_SET(MQPCB_MASK_RNR_RETRY_COUNT_AL, 1); + + /* + * Always supply the GRH flag, even if it's zero, to give the + * hypervisor a clear "yes" or "no" instead of a "perhaps" + */ + update_mask |= EHCA_BMASK_SET(MQPCB_MASK_SEND_GRH_FLAG_AL, 1); + + /* + * only if GRH is TRUE we might consider SOURCE_GID_IDX + * and DEST_GID otherwise phype will return H_ATTR_PARM!!! 
+ */ + if (attr->alt_ah_attr.ah_flags == IB_AH_GRH) { + mqpcb->send_grh_flag_al = 1; + + for (cnt = 0; cnt < 16; cnt++) + mqpcb->dest_gid_al.byte[cnt] = + attr->alt_ah_attr.grh.dgid.raw[cnt]; + mqpcb->source_gid_idx_al = + attr->alt_ah_attr.grh.sgid_index; + mqpcb->flow_label_al = attr->alt_ah_attr.grh.flow_label; + mqpcb->hop_limit_al = attr->alt_ah_attr.grh.hop_limit; + mqpcb->traffic_class_al = + attr->alt_ah_attr.grh.traffic_class; + + update_mask |= + EHCA_BMASK_SET(MQPCB_MASK_SOURCE_GID_IDX_AL, 1) + | EHCA_BMASK_SET(MQPCB_MASK_DEST_GID_AL, 1) + | EHCA_BMASK_SET(MQPCB_MASK_FLOW_LABEL_AL, 1) + | EHCA_BMASK_SET(MQPCB_MASK_HOP_LIMIT_AL, 1) | + EHCA_BMASK_SET(MQPCB_MASK_TRAFFIC_CLASS_AL, 1); + } + } + + if (attr_mask & IB_QP_MIN_RNR_TIMER) { + mqpcb->min_rnr_nak_timer_field = attr->min_rnr_timer; + update_mask |= + EHCA_BMASK_SET(MQPCB_MASK_MIN_RNR_NAK_TIMER_FIELD, 1); + } + + if (attr_mask & IB_QP_SQ_PSN) { + mqpcb->send_psn = attr->sq_psn; + update_mask |= EHCA_BMASK_SET(MQPCB_MASK_SEND_PSN, 1); + } + + if (attr_mask & IB_QP_DEST_QPN) { + mqpcb->dest_qp_nr = attr->dest_qp_num; + update_mask |= EHCA_BMASK_SET(MQPCB_MASK_DEST_QP_NR, 1); + } + + if (attr_mask & IB_QP_PATH_MIG_STATE) { + if (attr->path_mig_state != IB_MIG_REARM + && attr->path_mig_state != IB_MIG_MIGRATED) { + ret = -EINVAL; + ehca_err(ibqp->device, "Invalid mig_state=%x", + attr->path_mig_state); + goto modify_qp_exit2; + } + mqpcb->path_migration_state = attr->path_mig_state + 1; + if (attr->path_mig_state == IB_MIG_REARM) + my_qp->mig_armed = 1; + update_mask |= + EHCA_BMASK_SET(MQPCB_MASK_PATH_MIGRATION_STATE, 1); + } + + if (attr_mask & IB_QP_CAP) { + mqpcb->max_nr_outst_send_wr = attr->cap.max_send_wr+1; + update_mask |= + EHCA_BMASK_SET(MQPCB_MASK_MAX_NR_OUTST_SEND_WR, 1); + mqpcb->max_nr_outst_recv_wr = attr->cap.max_recv_wr+1; + update_mask |= + EHCA_BMASK_SET(MQPCB_MASK_MAX_NR_OUTST_RECV_WR, 1); + /* no support for max_send/recv_sge yet */ + } + + if (ehca_debug_level >= 2) + ehca_dmp(mqpcb, 4*70, "qp_num=%x", ibqp->qp_num); + + h_ret = hipz_h_modify_qp(shca->ipz_hca_handle, + my_qp->ipz_qp_handle, + &my_qp->pf, + update_mask, + mqpcb, my_qp->galpas.kernel); + + if (h_ret != H_SUCCESS) { + ret = ehca2ib_return_code(h_ret); + ehca_err(ibqp->device, "hipz_h_modify_qp() failed h_ret=%lli " + "ehca_qp=%p qp_num=%x", h_ret, my_qp, ibqp->qp_num); + goto modify_qp_exit2; + } + + if ((my_qp->qp_type == IB_QPT_UD || + my_qp->qp_type == IB_QPT_GSI || + my_qp->qp_type == IB_QPT_SMI) && + statetrans == IB_QPST_SQE2RTS) { + /* doorbell to reprocessing wqes */ + iosync(); /* serialize GAL register access */ + hipz_update_sqa(my_qp, bad_wqe_cnt-1); + ehca_gen_dbg("doorbell for %x wqes", bad_wqe_cnt); + } + + if (statetrans == IB_QPST_RESET2INIT || + statetrans == IB_QPST_INIT2INIT) { + mqpcb->qp_enable = 1; + mqpcb->qp_state = EHCA_QPS_INIT; + update_mask = 0; + update_mask = EHCA_BMASK_SET(MQPCB_MASK_QP_ENABLE, 1); + + h_ret = hipz_h_modify_qp(shca->ipz_hca_handle, + my_qp->ipz_qp_handle, + &my_qp->pf, + update_mask, + mqpcb, + my_qp->galpas.kernel); + + if (h_ret != H_SUCCESS) { + ret = ehca2ib_return_code(h_ret); + ehca_err(ibqp->device, "ENABLE in context of " + "RESET_2_INIT failed! 
Maybe you didn't get " + "a LID h_ret=%lli ehca_qp=%p qp_num=%x", + h_ret, my_qp, ibqp->qp_num); + goto modify_qp_exit2; + } + } + if ((qp_new_state == IB_QPS_ERR) && (qp_cur_state != IB_QPS_ERR) + && !is_user) { + ret = check_for_left_cqes(my_qp, shca); + if (ret) + goto modify_qp_exit2; + } + + if (statetrans == IB_QPST_ANY2RESET) { + ipz_qeit_reset(&my_qp->ipz_rqueue); + ipz_qeit_reset(&my_qp->ipz_squeue); + + if (qp_cur_state == IB_QPS_ERR && !is_user) { + del_from_err_list(my_qp->send_cq, &my_qp->sq_err_node); + + if (HAS_RQ(my_qp)) + del_from_err_list(my_qp->recv_cq, + &my_qp->rq_err_node); + } + if (!is_user) + reset_queue_map(&my_qp->sq_map); + + if (HAS_RQ(my_qp) && !is_user) + reset_queue_map(&my_qp->rq_map); + } + + if (attr_mask & IB_QP_QKEY) + my_qp->qkey = attr->qkey; + +modify_qp_exit2: + if (squeue_locked) { /* this means: sqe -> rts */ + spin_unlock_irqrestore(&my_qp->spinlock_s, flags); + my_qp->sqerr_purgeflag = 1; + } + +modify_qp_exit1: + ehca_free_fw_ctrlblock(mqpcb); + + return ret; +} + +int ehca_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, int attr_mask, + struct ib_udata *udata) +{ + int ret = 0; + + struct ehca_shca *shca = container_of(ibqp->device, struct ehca_shca, + ib_device); + struct ehca_qp *my_qp = container_of(ibqp, struct ehca_qp, ib_qp); + + /* The if-block below caches qp_attr to be modified for GSI and SMI + * qps during the initialization by ib_mad. When the respective port + * is activated, ie we got an event PORT_ACTIVE, we'll replay the + * cached modify calls sequence, see ehca_recover_sqs() below. + * Why that is required: + * 1) If one port is connected, older code requires that port one + * to be connected and module option nr_ports=1 to be given by + * user, which is very inconvenient for end user. + * 2) Firmware accepts modify_qp() only if respective port has become + * active. Older code had a wait loop of 30sec create_qp()/ + * define_aqp1(), which is not appropriate in practice. This + * code now removes that wait loop, see define_aqp1(), and always + * reports all ports to ib_mad resp. users. Only activated ports + * will then usable for the users. 
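+ * The replay of the cached attributes is implemented by
+ * ehca_recover_sqp() further down in this file.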
+ */ + if (ibqp->qp_type == IB_QPT_GSI || ibqp->qp_type == IB_QPT_SMI) { + int port = my_qp->init_attr.port_num; + struct ehca_sport *sport = &shca->sport[port - 1]; + unsigned long flags; + spin_lock_irqsave(&sport->mod_sqp_lock, flags); + /* cache qp_attr only during init */ + if (my_qp->mod_qp_parm) { + struct ehca_mod_qp_parm *p; + if (my_qp->mod_qp_parm_idx >= EHCA_MOD_QP_PARM_MAX) { + ehca_err(&shca->ib_device, + "mod_qp_parm overflow state=%x port=%x" + " type=%x", attr->qp_state, + my_qp->init_attr.port_num, + ibqp->qp_type); + spin_unlock_irqrestore(&sport->mod_sqp_lock, + flags); + return -EINVAL; + } + p = &my_qp->mod_qp_parm[my_qp->mod_qp_parm_idx]; + p->mask = attr_mask; + p->attr = *attr; + my_qp->mod_qp_parm_idx++; + ehca_dbg(&shca->ib_device, + "Saved qp_attr for state=%x port=%x type=%x", + attr->qp_state, my_qp->init_attr.port_num, + ibqp->qp_type); + spin_unlock_irqrestore(&sport->mod_sqp_lock, flags); + goto out; + } + spin_unlock_irqrestore(&sport->mod_sqp_lock, flags); + } + + ret = internal_modify_qp(ibqp, attr, attr_mask, 0); + +out: + if ((ret == 0) && (attr_mask & IB_QP_STATE)) + my_qp->state = attr->qp_state; + + return ret; +} + +void ehca_recover_sqp(struct ib_qp *sqp) +{ + struct ehca_qp *my_sqp = container_of(sqp, struct ehca_qp, ib_qp); + int port = my_sqp->init_attr.port_num; + struct ib_qp_attr attr; + struct ehca_mod_qp_parm *qp_parm; + int i, qp_parm_idx, ret; + unsigned long flags, wr_cnt; + + if (!my_sqp->mod_qp_parm) + return; + ehca_dbg(sqp->device, "SQP port=%x qp_num=%x", port, sqp->qp_num); + + qp_parm = my_sqp->mod_qp_parm; + qp_parm_idx = my_sqp->mod_qp_parm_idx; + for (i = 0; i < qp_parm_idx; i++) { + attr = qp_parm[i].attr; + ret = internal_modify_qp(sqp, &attr, qp_parm[i].mask, 0); + if (ret) { + ehca_err(sqp->device, "Could not modify SQP port=%x " + "qp_num=%x ret=%x", port, sqp->qp_num, ret); + goto free_qp_parm; + } + ehca_dbg(sqp->device, "SQP port=%x qp_num=%x in state=%x", + port, sqp->qp_num, attr.qp_state); + } + + /* re-trigger posted recv wrs */ + wr_cnt = my_sqp->ipz_rqueue.current_q_offset / + my_sqp->ipz_rqueue.qe_size; + if (wr_cnt) { + spin_lock_irqsave(&my_sqp->spinlock_r, flags); + hipz_update_rqa(my_sqp, wr_cnt); + spin_unlock_irqrestore(&my_sqp->spinlock_r, flags); + ehca_dbg(sqp->device, "doorbell port=%x qp_num=%x wr_cnt=%lx", + port, sqp->qp_num, wr_cnt); + } + +free_qp_parm: + kfree(qp_parm); + /* this prevents subsequent calls to modify_qp() to cache qp_attr */ + my_sqp->mod_qp_parm = NULL; +} + +int ehca_query_qp(struct ib_qp *qp, + struct ib_qp_attr *qp_attr, + int qp_attr_mask, struct ib_qp_init_attr *qp_init_attr) +{ + struct ehca_qp *my_qp = container_of(qp, struct ehca_qp, ib_qp); + struct ehca_shca *shca = container_of(qp->device, struct ehca_shca, + ib_device); + struct ipz_adapter_handle adapter_handle = shca->ipz_hca_handle; + struct hcp_modify_qp_control_block *qpcb; + int cnt, ret = 0; + u64 h_ret; + + if (qp_attr_mask & QP_ATTR_QUERY_NOT_SUPPORTED) { + ehca_err(qp->device, "Invalid attribute mask " + "ehca_qp=%p qp_num=%x qp_attr_mask=%x ", + my_qp, qp->qp_num, qp_attr_mask); + return -EINVAL; + } + + qpcb = ehca_alloc_fw_ctrlblock(GFP_KERNEL); + if (!qpcb) { + ehca_err(qp->device, "Out of memory for qpcb " + "ehca_qp=%p qp_num=%x", my_qp, qp->qp_num); + return -ENOMEM; + } + + h_ret = hipz_h_query_qp(adapter_handle, + my_qp->ipz_qp_handle, + &my_qp->pf, + qpcb, my_qp->galpas.kernel); + + if (h_ret != H_SUCCESS) { + ret = ehca2ib_return_code(h_ret); + ehca_err(qp->device, "hipz_h_query_qp() failed " + 
"ehca_qp=%p qp_num=%x h_ret=%lli", + my_qp, qp->qp_num, h_ret); + goto query_qp_exit1; + } + + qp_attr->cur_qp_state = ehca2ib_qp_state(qpcb->qp_state); + qp_attr->qp_state = qp_attr->cur_qp_state; + + if (qp_attr->cur_qp_state == -EINVAL) { + ret = -EINVAL; + ehca_err(qp->device, "Got invalid ehca_qp_state=%x " + "ehca_qp=%p qp_num=%x", + qpcb->qp_state, my_qp, qp->qp_num); + goto query_qp_exit1; + } + + if (qp_attr->qp_state == IB_QPS_SQD) + qp_attr->sq_draining = 1; + + qp_attr->qkey = qpcb->qkey; + qp_attr->path_mtu = qpcb->path_mtu; + qp_attr->path_mig_state = qpcb->path_migration_state - 1; + qp_attr->rq_psn = qpcb->receive_psn; + qp_attr->sq_psn = qpcb->send_psn; + qp_attr->min_rnr_timer = qpcb->min_rnr_nak_timer_field; + qp_attr->cap.max_send_wr = qpcb->max_nr_outst_send_wr-1; + qp_attr->cap.max_recv_wr = qpcb->max_nr_outst_recv_wr-1; + /* UD_AV CIRCUMVENTION */ + if (my_qp->qp_type == IB_QPT_UD) { + qp_attr->cap.max_send_sge = + qpcb->actual_nr_sges_in_sq_wqe - 2; + qp_attr->cap.max_recv_sge = + qpcb->actual_nr_sges_in_rq_wqe - 2; + } else { + qp_attr->cap.max_send_sge = + qpcb->actual_nr_sges_in_sq_wqe; + qp_attr->cap.max_recv_sge = + qpcb->actual_nr_sges_in_rq_wqe; + } + + qp_attr->cap.max_inline_data = my_qp->sq_max_inline_data_size; + qp_attr->dest_qp_num = qpcb->dest_qp_nr; + + qp_attr->pkey_index = qpcb->prim_p_key_idx; + qp_attr->port_num = qpcb->prim_phys_port; + qp_attr->timeout = qpcb->timeout; + qp_attr->retry_cnt = qpcb->retry_count; + qp_attr->rnr_retry = qpcb->rnr_retry_count; + + qp_attr->alt_pkey_index = qpcb->alt_p_key_idx; + qp_attr->alt_port_num = qpcb->alt_phys_port; + qp_attr->alt_timeout = qpcb->timeout_al; + + qp_attr->max_dest_rd_atomic = qpcb->rdma_nr_atomic_resp_res; + qp_attr->max_rd_atomic = qpcb->rdma_atomic_outst_dest_qp; + + /* primary av */ + qp_attr->ah_attr.sl = qpcb->service_level; + + if (qpcb->send_grh_flag) { + qp_attr->ah_attr.ah_flags = IB_AH_GRH; + } + + qp_attr->ah_attr.static_rate = qpcb->max_static_rate; + qp_attr->ah_attr.dlid = qpcb->dlid; + qp_attr->ah_attr.src_path_bits = qpcb->source_path_bits; + qp_attr->ah_attr.port_num = qp_attr->port_num; + + /* primary GRH */ + qp_attr->ah_attr.grh.traffic_class = qpcb->traffic_class; + qp_attr->ah_attr.grh.hop_limit = qpcb->hop_limit; + qp_attr->ah_attr.grh.sgid_index = qpcb->source_gid_idx; + qp_attr->ah_attr.grh.flow_label = qpcb->flow_label; + + for (cnt = 0; cnt < 16; cnt++) + qp_attr->ah_attr.grh.dgid.raw[cnt] = + qpcb->dest_gid.byte[cnt]; + + /* alternate AV */ + qp_attr->alt_ah_attr.sl = qpcb->service_level_al; + if (qpcb->send_grh_flag_al) { + qp_attr->alt_ah_attr.ah_flags = IB_AH_GRH; + } + + qp_attr->alt_ah_attr.static_rate = qpcb->max_static_rate_al; + qp_attr->alt_ah_attr.dlid = qpcb->dlid_al; + qp_attr->alt_ah_attr.src_path_bits = qpcb->source_path_bits_al; + + /* alternate GRH */ + qp_attr->alt_ah_attr.grh.traffic_class = qpcb->traffic_class_al; + qp_attr->alt_ah_attr.grh.hop_limit = qpcb->hop_limit_al; + qp_attr->alt_ah_attr.grh.sgid_index = qpcb->source_gid_idx_al; + qp_attr->alt_ah_attr.grh.flow_label = qpcb->flow_label_al; + + for (cnt = 0; cnt < 16; cnt++) + qp_attr->alt_ah_attr.grh.dgid.raw[cnt] = + qpcb->dest_gid_al.byte[cnt]; + + /* return init attributes given in ehca_create_qp */ + if (qp_init_attr) + *qp_init_attr = my_qp->init_attr; + + if (ehca_debug_level >= 2) + ehca_dmp(qpcb, 4*70, "qp_num=%x", qp->qp_num); + +query_qp_exit1: + ehca_free_fw_ctrlblock(qpcb); + + return ret; +} + +int ehca_modify_srq(struct ib_srq *ibsrq, struct ib_srq_attr *attr, + enum 
ib_srq_attr_mask attr_mask, struct ib_udata *udata) +{ + struct ehca_qp *my_qp = + container_of(ibsrq, struct ehca_qp, ib_srq); + struct ehca_shca *shca = + container_of(ibsrq->pd->device, struct ehca_shca, ib_device); + struct hcp_modify_qp_control_block *mqpcb; + u64 update_mask; + u64 h_ret; + int ret = 0; + + mqpcb = ehca_alloc_fw_ctrlblock(GFP_KERNEL); + if (!mqpcb) { + ehca_err(ibsrq->device, "Could not get zeroed page for mqpcb " + "ehca_qp=%p qp_num=%x ", my_qp, my_qp->real_qp_num); + return -ENOMEM; + } + + update_mask = 0; + if (attr_mask & IB_SRQ_LIMIT) { + attr_mask &= ~IB_SRQ_LIMIT; + update_mask |= + EHCA_BMASK_SET(MQPCB_MASK_CURR_SRQ_LIMIT, 1) + | EHCA_BMASK_SET(MQPCB_MASK_QP_AFF_ASYN_EV_LOG_REG, 1); + mqpcb->curr_srq_limit = attr->srq_limit; + mqpcb->qp_aff_asyn_ev_log_reg = + EHCA_BMASK_SET(QPX_AAELOG_RESET_SRQ_LIMIT, 1); + } + + /* by now, all bits in attr_mask should have been cleared */ + if (attr_mask) { + ehca_err(ibsrq->device, "invalid attribute mask bits set " + "attr_mask=%x", attr_mask); + ret = -EINVAL; + goto modify_srq_exit0; + } + + if (ehca_debug_level >= 2) + ehca_dmp(mqpcb, 4*70, "qp_num=%x", my_qp->real_qp_num); + + h_ret = hipz_h_modify_qp(shca->ipz_hca_handle, my_qp->ipz_qp_handle, + NULL, update_mask, mqpcb, + my_qp->galpas.kernel); + + if (h_ret != H_SUCCESS) { + ret = ehca2ib_return_code(h_ret); + ehca_err(ibsrq->device, "hipz_h_modify_qp() failed h_ret=%lli " + "ehca_qp=%p qp_num=%x", + h_ret, my_qp, my_qp->real_qp_num); + } + +modify_srq_exit0: + ehca_free_fw_ctrlblock(mqpcb); + + return ret; +} + +int ehca_query_srq(struct ib_srq *srq, struct ib_srq_attr *srq_attr) +{ + struct ehca_qp *my_qp = container_of(srq, struct ehca_qp, ib_srq); + struct ehca_shca *shca = container_of(srq->device, struct ehca_shca, + ib_device); + struct ipz_adapter_handle adapter_handle = shca->ipz_hca_handle; + struct hcp_modify_qp_control_block *qpcb; + int ret = 0; + u64 h_ret; + + qpcb = ehca_alloc_fw_ctrlblock(GFP_KERNEL); + if (!qpcb) { + ehca_err(srq->device, "Out of memory for qpcb " + "ehca_qp=%p qp_num=%x", my_qp, my_qp->real_qp_num); + return -ENOMEM; + } + + h_ret = hipz_h_query_qp(adapter_handle, my_qp->ipz_qp_handle, + NULL, qpcb, my_qp->galpas.kernel); + + if (h_ret != H_SUCCESS) { + ret = ehca2ib_return_code(h_ret); + ehca_err(srq->device, "hipz_h_query_qp() failed " + "ehca_qp=%p qp_num=%x h_ret=%lli", + my_qp, my_qp->real_qp_num, h_ret); + goto query_srq_exit1; + } + + srq_attr->max_wr = qpcb->max_nr_outst_recv_wr - 1; + srq_attr->max_sge = 3; + srq_attr->srq_limit = qpcb->curr_srq_limit; + + if (ehca_debug_level >= 2) + ehca_dmp(qpcb, 4*70, "qp_num=%x", my_qp->real_qp_num); + +query_srq_exit1: + ehca_free_fw_ctrlblock(qpcb); + + return ret; +} + +static int internal_destroy_qp(struct ib_device *dev, struct ehca_qp *my_qp, + struct ib_uobject *uobject) +{ + struct ehca_shca *shca = container_of(dev, struct ehca_shca, ib_device); + struct ehca_pd *my_pd = container_of(my_qp->ib_qp.pd, struct ehca_pd, + ib_pd); + struct ehca_sport *sport = &shca->sport[my_qp->init_attr.port_num - 1]; + u32 qp_num = my_qp->real_qp_num; + int ret; + u64 h_ret; + u8 port_num; + int is_user = 0; + enum ib_qp_type qp_type; + unsigned long flags; + + if (uobject) { + is_user = 1; + if (my_qp->mm_count_galpa || + my_qp->mm_count_rqueue || my_qp->mm_count_squeue) { + ehca_err(dev, "Resources still referenced in " + "user space qp_num=%x", qp_num); + return -EINVAL; + } + } + + if (my_qp->send_cq) { + ret = ehca_cq_unassign_qp(my_qp->send_cq, qp_num); + if (ret) { + ehca_err(dev, 
"Couldn't unassign qp from " + "send_cq ret=%i qp_num=%x cq_num=%x", ret, + qp_num, my_qp->send_cq->cq_number); + return ret; + } + } + + write_lock_irqsave(&ehca_qp_idr_lock, flags); + idr_remove(&ehca_qp_idr, my_qp->token); + write_unlock_irqrestore(&ehca_qp_idr_lock, flags); + + /* + * SRQs will never get into an error list and do not have a recv_cq, + * so we need to skip them here. + */ + if (HAS_RQ(my_qp) && !IS_SRQ(my_qp) && !is_user) + del_from_err_list(my_qp->recv_cq, &my_qp->rq_err_node); + + if (HAS_SQ(my_qp) && !is_user) + del_from_err_list(my_qp->send_cq, &my_qp->sq_err_node); + + /* now wait until all pending events have completed */ + wait_event(my_qp->wait_completion, !atomic_read(&my_qp->nr_events)); + + h_ret = hipz_h_destroy_qp(shca->ipz_hca_handle, my_qp); + if (h_ret != H_SUCCESS) { + ehca_err(dev, "hipz_h_destroy_qp() failed h_ret=%lli " + "ehca_qp=%p qp_num=%x", h_ret, my_qp, qp_num); + return ehca2ib_return_code(h_ret); + } + + port_num = my_qp->init_attr.port_num; + qp_type = my_qp->init_attr.qp_type; + + if (qp_type == IB_QPT_SMI || qp_type == IB_QPT_GSI) { + spin_lock_irqsave(&sport->mod_sqp_lock, flags); + kfree(my_qp->mod_qp_parm); + my_qp->mod_qp_parm = NULL; + shca->sport[port_num - 1].ibqp_sqp[qp_type] = NULL; + spin_unlock_irqrestore(&sport->mod_sqp_lock, flags); + } + + /* no support for IB_QPT_SMI yet */ + if (qp_type == IB_QPT_GSI) { + struct ib_event event; + ehca_info(dev, "device %s: port %x is inactive.", + shca->ib_device.name, port_num); + event.device = &shca->ib_device; + event.event = IB_EVENT_PORT_ERR; + event.element.port_num = port_num; + shca->sport[port_num - 1].port_state = IB_PORT_DOWN; + ib_dispatch_event(&event); + } + + if (HAS_RQ(my_qp)) { + ipz_queue_dtor(my_pd, &my_qp->ipz_rqueue); + if (!is_user) + vfree(my_qp->rq_map.map); + } + if (HAS_SQ(my_qp)) { + ipz_queue_dtor(my_pd, &my_qp->ipz_squeue); + if (!is_user) + vfree(my_qp->sq_map.map); + } + kmem_cache_free(qp_cache, my_qp); + atomic_dec(&shca->num_qps); + return 0; +} + +int ehca_destroy_qp(struct ib_qp *qp) +{ + return internal_destroy_qp(qp->device, + container_of(qp, struct ehca_qp, ib_qp), + qp->uobject); +} + +int ehca_destroy_srq(struct ib_srq *srq) +{ + return internal_destroy_qp(srq->device, + container_of(srq, struct ehca_qp, ib_srq), + srq->uobject); +} + +int ehca_init_qp_cache(void) +{ + qp_cache = kmem_cache_create("ehca_cache_qp", + sizeof(struct ehca_qp), 0, + SLAB_HWCACHE_ALIGN, + NULL); + if (!qp_cache) + return -ENOMEM; + return 0; +} + +void ehca_cleanup_qp_cache(void) +{ + if (qp_cache) + kmem_cache_destroy(qp_cache); +} diff --git a/kernel/drivers/infiniband/hw/ehca/ehca_reqs.c b/kernel/drivers/infiniband/hw/ehca/ehca_reqs.c new file mode 100644 index 000000000..47f949843 --- /dev/null +++ b/kernel/drivers/infiniband/hw/ehca/ehca_reqs.c @@ -0,0 +1,953 @@ +/* + * IBM eServer eHCA Infiniband device driver for Linux on POWER + * + * post_send/recv, poll_cq, req_notify + * + * Authors: Hoang-Nam Nguyen + * Waleri Fomin + * Joachim Fenkes + * Reinhard Ernst + * + * Copyright (c) 2005 IBM Corporation + * + * All rights reserved. + * + * This source code is distributed under a dual license of GPL v2.0 and OpenIB + * BSD. + * + * OpenIB BSD License + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. 
+ * + * Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials + * provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR + * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER + * IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + + +#include "ehca_classes.h" +#include "ehca_tools.h" +#include "ehca_qes.h" +#include "ehca_iverbs.h" +#include "hcp_if.h" +#include "hipz_fns.h" + +/* in RC traffic, insert an empty RDMA READ every this many packets */ +#define ACK_CIRC_THRESHOLD 2000000 + +static u64 replace_wr_id(u64 wr_id, u16 idx) +{ + u64 ret; + + ret = wr_id & ~QMAP_IDX_MASK; + ret |= idx & QMAP_IDX_MASK; + + return ret; +} + +static u16 get_app_wr_id(u64 wr_id) +{ + return wr_id & QMAP_IDX_MASK; +} + +static inline int ehca_write_rwqe(struct ipz_queue *ipz_rqueue, + struct ehca_wqe *wqe_p, + struct ib_recv_wr *recv_wr, + u32 rq_map_idx) +{ + u8 cnt_ds; + if (unlikely((recv_wr->num_sge < 0) || + (recv_wr->num_sge > ipz_rqueue->act_nr_of_sg))) { + ehca_gen_err("Invalid number of WQE SGE. 
" + "num_sqe=%x max_nr_of_sg=%x", + recv_wr->num_sge, ipz_rqueue->act_nr_of_sg); + return -EINVAL; /* invalid SG list length */ + } + + /* clear wqe header until sglist */ + memset(wqe_p, 0, offsetof(struct ehca_wqe, u.ud_av.sg_list)); + + wqe_p->work_request_id = replace_wr_id(recv_wr->wr_id, rq_map_idx); + wqe_p->nr_of_data_seg = recv_wr->num_sge; + + for (cnt_ds = 0; cnt_ds < recv_wr->num_sge; cnt_ds++) { + wqe_p->u.all_rcv.sg_list[cnt_ds].vaddr = + recv_wr->sg_list[cnt_ds].addr; + wqe_p->u.all_rcv.sg_list[cnt_ds].lkey = + recv_wr->sg_list[cnt_ds].lkey; + wqe_p->u.all_rcv.sg_list[cnt_ds].length = + recv_wr->sg_list[cnt_ds].length; + } + + if (ehca_debug_level >= 3) { + ehca_gen_dbg("RECEIVE WQE written into ipz_rqueue=%p", + ipz_rqueue); + ehca_dmp(wqe_p, 16*(6 + wqe_p->nr_of_data_seg), "recv wqe"); + } + + return 0; +} + +#if defined(DEBUG_GSI_SEND_WR) + +/* need ib_mad struct */ +#include + +static void trace_send_wr_ud(const struct ib_send_wr *send_wr) +{ + int idx; + int j; + while (send_wr) { + struct ib_mad_hdr *mad_hdr = send_wr->wr.ud.mad_hdr; + struct ib_sge *sge = send_wr->sg_list; + ehca_gen_dbg("send_wr#%x wr_id=%lx num_sge=%x " + "send_flags=%x opcode=%x", idx, send_wr->wr_id, + send_wr->num_sge, send_wr->send_flags, + send_wr->opcode); + if (mad_hdr) { + ehca_gen_dbg("send_wr#%x mad_hdr base_version=%x " + "mgmt_class=%x class_version=%x method=%x " + "status=%x class_specific=%x tid=%lx " + "attr_id=%x resv=%x attr_mod=%x", + idx, mad_hdr->base_version, + mad_hdr->mgmt_class, + mad_hdr->class_version, mad_hdr->method, + mad_hdr->status, mad_hdr->class_specific, + mad_hdr->tid, mad_hdr->attr_id, + mad_hdr->resv, + mad_hdr->attr_mod); + } + for (j = 0; j < send_wr->num_sge; j++) { + u8 *data = __va(sge->addr); + ehca_gen_dbg("send_wr#%x sge#%x addr=%p length=%x " + "lkey=%x", + idx, j, data, sge->length, sge->lkey); + /* assume length is n*16 */ + ehca_dmp(data, sge->length, "send_wr#%x sge#%x", + idx, j); + sge++; + } /* eof for j */ + idx++; + send_wr = send_wr->next; + } /* eof while send_wr */ +} + +#endif /* DEBUG_GSI_SEND_WR */ + +static inline int ehca_write_swqe(struct ehca_qp *qp, + struct ehca_wqe *wqe_p, + const struct ib_send_wr *send_wr, + u32 sq_map_idx, + int hidden) +{ + u32 idx; + u64 dma_length; + struct ehca_av *my_av; + u32 remote_qkey = send_wr->wr.ud.remote_qkey; + struct ehca_qmap_entry *qmap_entry = &qp->sq_map.map[sq_map_idx]; + + if (unlikely((send_wr->num_sge < 0) || + (send_wr->num_sge > qp->ipz_squeue.act_nr_of_sg))) { + ehca_gen_err("Invalid number of WQE SGE. 
" + "num_sqe=%x max_nr_of_sg=%x", + send_wr->num_sge, qp->ipz_squeue.act_nr_of_sg); + return -EINVAL; /* invalid SG list length */ + } + + /* clear wqe header until sglist */ + memset(wqe_p, 0, offsetof(struct ehca_wqe, u.ud_av.sg_list)); + + wqe_p->work_request_id = replace_wr_id(send_wr->wr_id, sq_map_idx); + + qmap_entry->app_wr_id = get_app_wr_id(send_wr->wr_id); + qmap_entry->reported = 0; + qmap_entry->cqe_req = 0; + + switch (send_wr->opcode) { + case IB_WR_SEND: + case IB_WR_SEND_WITH_IMM: + wqe_p->optype = WQE_OPTYPE_SEND; + break; + case IB_WR_RDMA_WRITE: + case IB_WR_RDMA_WRITE_WITH_IMM: + wqe_p->optype = WQE_OPTYPE_RDMAWRITE; + break; + case IB_WR_RDMA_READ: + wqe_p->optype = WQE_OPTYPE_RDMAREAD; + break; + default: + ehca_gen_err("Invalid opcode=%x", send_wr->opcode); + return -EINVAL; /* invalid opcode */ + } + + wqe_p->wqef = (send_wr->opcode) & WQEF_HIGH_NIBBLE; + + wqe_p->wr_flag = 0; + + if ((send_wr->send_flags & IB_SEND_SIGNALED || + qp->init_attr.sq_sig_type == IB_SIGNAL_ALL_WR) + && !hidden) { + wqe_p->wr_flag |= WQE_WRFLAG_REQ_SIGNAL_COM; + qmap_entry->cqe_req = 1; + } + + if (send_wr->opcode == IB_WR_SEND_WITH_IMM || + send_wr->opcode == IB_WR_RDMA_WRITE_WITH_IMM) { + /* this might not work as long as HW does not support it */ + wqe_p->immediate_data = be32_to_cpu(send_wr->ex.imm_data); + wqe_p->wr_flag |= WQE_WRFLAG_IMM_DATA_PRESENT; + } + + wqe_p->nr_of_data_seg = send_wr->num_sge; + + switch (qp->qp_type) { + case IB_QPT_SMI: + case IB_QPT_GSI: + /* no break is intential here */ + case IB_QPT_UD: + /* IB 1.2 spec C10-15 compliance */ + if (send_wr->wr.ud.remote_qkey & 0x80000000) + remote_qkey = qp->qkey; + + wqe_p->destination_qp_number = send_wr->wr.ud.remote_qpn << 8; + wqe_p->local_ee_context_qkey = remote_qkey; + if (unlikely(!send_wr->wr.ud.ah)) { + ehca_gen_err("wr.ud.ah is NULL. qp=%p", qp); + return -EINVAL; + } + if (unlikely(send_wr->wr.ud.remote_qpn == 0)) { + ehca_gen_err("dest QP# is 0. 
qp=%x", qp->real_qp_num); + return -EINVAL; + } + my_av = container_of(send_wr->wr.ud.ah, struct ehca_av, ib_ah); + wqe_p->u.ud_av.ud_av = my_av->av; + + /* + * omitted check of IB_SEND_INLINE + * since HW does not support it + */ + for (idx = 0; idx < send_wr->num_sge; idx++) { + wqe_p->u.ud_av.sg_list[idx].vaddr = + send_wr->sg_list[idx].addr; + wqe_p->u.ud_av.sg_list[idx].lkey = + send_wr->sg_list[idx].lkey; + wqe_p->u.ud_av.sg_list[idx].length = + send_wr->sg_list[idx].length; + } /* eof for idx */ + if (qp->qp_type == IB_QPT_SMI || + qp->qp_type == IB_QPT_GSI) + wqe_p->u.ud_av.ud_av.pmtu = 1; + if (qp->qp_type == IB_QPT_GSI) { + wqe_p->pkeyi = send_wr->wr.ud.pkey_index; +#ifdef DEBUG_GSI_SEND_WR + trace_send_wr_ud(send_wr); +#endif /* DEBUG_GSI_SEND_WR */ + } + break; + + case IB_QPT_UC: + if (send_wr->send_flags & IB_SEND_FENCE) + wqe_p->wr_flag |= WQE_WRFLAG_FENCE; + /* no break is intentional here */ + case IB_QPT_RC: + /* TODO: atomic not implemented */ + wqe_p->u.nud.remote_virtual_address = + send_wr->wr.rdma.remote_addr; + wqe_p->u.nud.rkey = send_wr->wr.rdma.rkey; + + /* + * omitted checking of IB_SEND_INLINE + * since HW does not support it + */ + dma_length = 0; + for (idx = 0; idx < send_wr->num_sge; idx++) { + wqe_p->u.nud.sg_list[idx].vaddr = + send_wr->sg_list[idx].addr; + wqe_p->u.nud.sg_list[idx].lkey = + send_wr->sg_list[idx].lkey; + wqe_p->u.nud.sg_list[idx].length = + send_wr->sg_list[idx].length; + dma_length += send_wr->sg_list[idx].length; + } /* eof idx */ + wqe_p->u.nud.atomic_1st_op_dma_len = dma_length; + + /* unsolicited ack circumvention */ + if (send_wr->opcode == IB_WR_RDMA_READ) { + /* on RDMA read, switch on and reset counters */ + qp->message_count = qp->packet_count = 0; + qp->unsol_ack_circ = 1; + } else + /* else estimate #packets */ + qp->packet_count += (dma_length >> qp->mtu_shift) + 1; + + break; + + default: + ehca_gen_err("Invalid qptype=%x", qp->qp_type); + return -EINVAL; + } + + if (ehca_debug_level >= 3) { + ehca_gen_dbg("SEND WQE written into queue qp=%p ", qp); + ehca_dmp( wqe_p, 16*(6 + wqe_p->nr_of_data_seg), "send wqe"); + } + return 0; +} + +/* map_ib_wc_status converts raw cqe_status to ib_wc_status */ +static inline void map_ib_wc_status(u32 cqe_status, + enum ib_wc_status *wc_status) +{ + if (unlikely(cqe_status & WC_STATUS_ERROR_BIT)) { + switch (cqe_status & 0x3F) { + case 0x01: + case 0x21: + *wc_status = IB_WC_LOC_LEN_ERR; + break; + case 0x02: + case 0x22: + *wc_status = IB_WC_LOC_QP_OP_ERR; + break; + case 0x03: + case 0x23: + *wc_status = IB_WC_LOC_EEC_OP_ERR; + break; + case 0x04: + case 0x24: + *wc_status = IB_WC_LOC_PROT_ERR; + break; + case 0x05: + case 0x25: + *wc_status = IB_WC_WR_FLUSH_ERR; + break; + case 0x06: + *wc_status = IB_WC_MW_BIND_ERR; + break; + case 0x07: /* remote error - look into bits 20:24 */ + switch ((cqe_status + & WC_STATUS_REMOTE_ERROR_FLAGS) >> 11) { + case 0x0: + /* + * PSN Sequence Error! + * couldn't find a matching status! 
+ */ + *wc_status = IB_WC_GENERAL_ERR; + break; + case 0x1: + *wc_status = IB_WC_REM_INV_REQ_ERR; + break; + case 0x2: + *wc_status = IB_WC_REM_ACCESS_ERR; + break; + case 0x3: + *wc_status = IB_WC_REM_OP_ERR; + break; + case 0x4: + *wc_status = IB_WC_REM_INV_RD_REQ_ERR; + break; + } + break; + case 0x08: + *wc_status = IB_WC_RETRY_EXC_ERR; + break; + case 0x09: + *wc_status = IB_WC_RNR_RETRY_EXC_ERR; + break; + case 0x0A: + case 0x2D: + *wc_status = IB_WC_REM_ABORT_ERR; + break; + case 0x0B: + case 0x2E: + *wc_status = IB_WC_INV_EECN_ERR; + break; + case 0x0C: + case 0x2F: + *wc_status = IB_WC_INV_EEC_STATE_ERR; + break; + case 0x0D: + *wc_status = IB_WC_BAD_RESP_ERR; + break; + case 0x10: + /* WQE purged */ + *wc_status = IB_WC_WR_FLUSH_ERR; + break; + default: + *wc_status = IB_WC_FATAL_ERR; + + } + } else + *wc_status = IB_WC_SUCCESS; +} + +static inline int post_one_send(struct ehca_qp *my_qp, + struct ib_send_wr *cur_send_wr, + int hidden) +{ + struct ehca_wqe *wqe_p; + int ret; + u32 sq_map_idx; + u64 start_offset = my_qp->ipz_squeue.current_q_offset; + + /* get pointer next to free WQE */ + wqe_p = ipz_qeit_get_inc(&my_qp->ipz_squeue); + if (unlikely(!wqe_p)) { + /* too many posted work requests: queue overflow */ + ehca_err(my_qp->ib_qp.device, "Too many posted WQEs " + "qp_num=%x", my_qp->ib_qp.qp_num); + return -ENOMEM; + } + + /* + * Get the index of the WQE in the send queue. The same index is used + * for writing into the sq_map. + */ + sq_map_idx = start_offset / my_qp->ipz_squeue.qe_size; + + /* write a SEND WQE into the QUEUE */ + ret = ehca_write_swqe(my_qp, wqe_p, cur_send_wr, sq_map_idx, hidden); + /* + * if something failed, + * reset the free entry pointer to the start value + */ + if (unlikely(ret)) { + my_qp->ipz_squeue.current_q_offset = start_offset; + ehca_err(my_qp->ib_qp.device, "Could not write WQE " + "qp_num=%x", my_qp->ib_qp.qp_num); + return -EINVAL; + } + + return 0; +} + +int ehca_post_send(struct ib_qp *qp, + struct ib_send_wr *send_wr, + struct ib_send_wr **bad_send_wr) +{ + struct ehca_qp *my_qp = container_of(qp, struct ehca_qp, ib_qp); + int wqe_cnt = 0; + int ret = 0; + unsigned long flags; + + /* Reject WR if QP is in RESET, INIT or RTR state */ + if (unlikely(my_qp->state < IB_QPS_RTS)) { + ehca_err(qp->device, "Invalid QP state qp_state=%d qpn=%x", + my_qp->state, qp->qp_num); + ret = -EINVAL; + goto out; + } + + /* LOCK the QUEUE */ + spin_lock_irqsave(&my_qp->spinlock_s, flags); + + /* Send an empty extra RDMA read if: + * 1) there has been an RDMA read on this connection before + * 2) no RDMA read occurred for ACK_CIRC_THRESHOLD link packets + * 3) we can be sure that any previous extra RDMA read has been + * processed so we don't overflow the SQ + */ + if (unlikely(my_qp->unsol_ack_circ && + my_qp->packet_count > ACK_CIRC_THRESHOLD && + my_qp->message_count > my_qp->init_attr.cap.max_send_wr)) { + /* insert an empty RDMA READ to fix up the remote QP state */ + struct ib_send_wr circ_wr; + memset(&circ_wr, 0, sizeof(circ_wr)); + circ_wr.opcode = IB_WR_RDMA_READ; + post_one_send(my_qp, &circ_wr, 1); /* ignore retcode */ + wqe_cnt++; + ehca_dbg(qp->device, "posted circ wr qp_num=%x", qp->qp_num); + my_qp->message_count = my_qp->packet_count = 0; + } + + /* loop processes list of send reqs */ + while (send_wr) { + ret = post_one_send(my_qp, send_wr, 0); + if (unlikely(ret)) { + goto post_send_exit0; + } + wqe_cnt++; + send_wr = send_wr->next; + } + +post_send_exit0: + iosync(); /* serialize GAL register access */ + hipz_update_sqa(my_qp, 
wqe_cnt); + if (unlikely(ret || ehca_debug_level >= 2)) + ehca_dbg(qp->device, "ehca_qp=%p qp_num=%x wqe_cnt=%d ret=%i", + my_qp, qp->qp_num, wqe_cnt, ret); + my_qp->message_count += wqe_cnt; + spin_unlock_irqrestore(&my_qp->spinlock_s, flags); + +out: + if (ret) + *bad_send_wr = send_wr; + return ret; +} + +static int internal_post_recv(struct ehca_qp *my_qp, + struct ib_device *dev, + struct ib_recv_wr *recv_wr, + struct ib_recv_wr **bad_recv_wr) +{ + struct ehca_wqe *wqe_p; + int wqe_cnt = 0; + int ret = 0; + u32 rq_map_idx; + unsigned long flags; + struct ehca_qmap_entry *qmap_entry; + + if (unlikely(!HAS_RQ(my_qp))) { + ehca_err(dev, "QP has no RQ ehca_qp=%p qp_num=%x ext_type=%d", + my_qp, my_qp->real_qp_num, my_qp->ext_type); + ret = -ENODEV; + goto out; + } + + /* LOCK the QUEUE */ + spin_lock_irqsave(&my_qp->spinlock_r, flags); + + /* loop processes list of recv reqs */ + while (recv_wr) { + u64 start_offset = my_qp->ipz_rqueue.current_q_offset; + /* get pointer next to free WQE */ + wqe_p = ipz_qeit_get_inc(&my_qp->ipz_rqueue); + if (unlikely(!wqe_p)) { + /* too many posted work requests: queue overflow */ + ret = -ENOMEM; + ehca_err(dev, "Too many posted WQEs " + "qp_num=%x", my_qp->real_qp_num); + goto post_recv_exit0; + } + /* + * Get the index of the WQE in the recv queue. The same index + * is used for writing into the rq_map. + */ + rq_map_idx = start_offset / my_qp->ipz_rqueue.qe_size; + + /* write a RECV WQE into the QUEUE */ + ret = ehca_write_rwqe(&my_qp->ipz_rqueue, wqe_p, recv_wr, + rq_map_idx); + /* + * if something failed, + * reset the free entry pointer to the start value + */ + if (unlikely(ret)) { + my_qp->ipz_rqueue.current_q_offset = start_offset; + ret = -EINVAL; + ehca_err(dev, "Could not write WQE " + "qp_num=%x", my_qp->real_qp_num); + goto post_recv_exit0; + } + + qmap_entry = &my_qp->rq_map.map[rq_map_idx]; + qmap_entry->app_wr_id = get_app_wr_id(recv_wr->wr_id); + qmap_entry->reported = 0; + qmap_entry->cqe_req = 1; + + wqe_cnt++; + recv_wr = recv_wr->next; + } /* eof for recv_wr */ + +post_recv_exit0: + iosync(); /* serialize GAL register access */ + hipz_update_rqa(my_qp, wqe_cnt); + if (unlikely(ret || ehca_debug_level >= 2)) + ehca_dbg(dev, "ehca_qp=%p qp_num=%x wqe_cnt=%d ret=%i", + my_qp, my_qp->real_qp_num, wqe_cnt, ret); + spin_unlock_irqrestore(&my_qp->spinlock_r, flags); + +out: + if (ret) + *bad_recv_wr = recv_wr; + + return ret; +} + +int ehca_post_recv(struct ib_qp *qp, + struct ib_recv_wr *recv_wr, + struct ib_recv_wr **bad_recv_wr) +{ + struct ehca_qp *my_qp = container_of(qp, struct ehca_qp, ib_qp); + + /* Reject WR if QP is in RESET state */ + if (unlikely(my_qp->state == IB_QPS_RESET)) { + ehca_err(qp->device, "Invalid QP state qp_state=%d qpn=%x", + my_qp->state, qp->qp_num); + *bad_recv_wr = recv_wr; + return -EINVAL; + } + + return internal_post_recv(my_qp, qp->device, recv_wr, bad_recv_wr); +} + +int ehca_post_srq_recv(struct ib_srq *srq, + struct ib_recv_wr *recv_wr, + struct ib_recv_wr **bad_recv_wr) +{ + return internal_post_recv(container_of(srq, struct ehca_qp, ib_srq), + srq->device, recv_wr, bad_recv_wr); +} + +/* + * ib_wc_opcode table converts ehca wc opcode to ib + * Since we use zero to indicate invalid opcode, the actual ib opcode must + * be decremented!!! 
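+ * Example: an optype of 0x80 maps to IB_WC_SEND+1 here and the poll
+ * code subtracts one again; an optype without an entry yields zero,
+ * which becomes -1 after the decrement and is rejected as invalid.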
+ */ +static const u8 ib_wc_opcode[255] = { + [0x01] = IB_WC_RECV+1, + [0x02] = IB_WC_RECV_RDMA_WITH_IMM+1, + [0x04] = IB_WC_BIND_MW+1, + [0x08] = IB_WC_FETCH_ADD+1, + [0x10] = IB_WC_COMP_SWAP+1, + [0x20] = IB_WC_RDMA_WRITE+1, + [0x40] = IB_WC_RDMA_READ+1, + [0x80] = IB_WC_SEND+1 +}; + +/* internal function to poll one entry of cq */ +static inline int ehca_poll_cq_one(struct ib_cq *cq, struct ib_wc *wc) +{ + int ret = 0, qmap_tail_idx; + struct ehca_cq *my_cq = container_of(cq, struct ehca_cq, ib_cq); + struct ehca_cqe *cqe; + struct ehca_qp *my_qp; + struct ehca_qmap_entry *qmap_entry; + struct ehca_queue_map *qmap; + int cqe_count = 0, is_error; + +repoll: + cqe = (struct ehca_cqe *) + ipz_qeit_get_inc_valid(&my_cq->ipz_queue); + if (!cqe) { + ret = -EAGAIN; + if (ehca_debug_level >= 3) + ehca_dbg(cq->device, "Completion queue is empty " + "my_cq=%p cq_num=%x", my_cq, my_cq->cq_number); + goto poll_cq_one_exit0; + } + + /* prevents loads being reordered across this point */ + rmb(); + + cqe_count++; + if (unlikely(cqe->status & WC_STATUS_PURGE_BIT)) { + struct ehca_qp *qp; + int purgeflag; + unsigned long flags; + + qp = ehca_cq_get_qp(my_cq, cqe->local_qp_number); + if (!qp) { + ehca_err(cq->device, "cq_num=%x qp_num=%x " + "could not find qp -> ignore cqe", + my_cq->cq_number, cqe->local_qp_number); + ehca_dmp(cqe, 64, "cq_num=%x qp_num=%x", + my_cq->cq_number, cqe->local_qp_number); + /* ignore this purged cqe */ + goto repoll; + } + spin_lock_irqsave(&qp->spinlock_s, flags); + purgeflag = qp->sqerr_purgeflag; + spin_unlock_irqrestore(&qp->spinlock_s, flags); + + if (purgeflag) { + ehca_dbg(cq->device, + "Got CQE with purged bit qp_num=%x src_qp=%x", + cqe->local_qp_number, cqe->remote_qp_number); + if (ehca_debug_level >= 2) + ehca_dmp(cqe, 64, "qp_num=%x src_qp=%x", + cqe->local_qp_number, + cqe->remote_qp_number); + /* + * ignore this to avoid double cqes of bad wqe + * that caused sqe and turn off purge flag + */ + qp->sqerr_purgeflag = 0; + goto repoll; + } + } + + is_error = cqe->status & WC_STATUS_ERROR_BIT; + + /* trace error CQEs if debug_level >= 1, trace all CQEs if >= 3 */ + if (unlikely(ehca_debug_level >= 3 || (ehca_debug_level && is_error))) { + ehca_dbg(cq->device, + "Received %sCOMPLETION ehca_cq=%p cq_num=%x -----", + is_error ? "ERROR " : "", my_cq, my_cq->cq_number); + ehca_dmp(cqe, 64, "ehca_cq=%p cq_num=%x", + my_cq, my_cq->cq_number); + ehca_dbg(cq->device, + "ehca_cq=%p cq_num=%x -------------------------", + my_cq, my_cq->cq_number); + } + + read_lock(&ehca_qp_idr_lock); + my_qp = idr_find(&ehca_qp_idr, cqe->qp_token); + read_unlock(&ehca_qp_idr_lock); + if (!my_qp) + goto repoll; + wc->qp = &my_qp->ib_qp; + + qmap_tail_idx = get_app_wr_id(cqe->work_request_id); + if (!(cqe->w_completion_flags & WC_SEND_RECEIVE_BIT)) + /* We got a send completion. */ + qmap = &my_qp->sq_map; + else + /* We got a receive completion. 
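WC_SEND_RECEIVE_BIT is set, so the receive queue map is used.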
*/ + qmap = &my_qp->rq_map; + + /* advance the tail pointer */ + qmap->tail = qmap_tail_idx; + + if (is_error) { + /* + * set left_to_poll to 0 because in error state, we will not + * get any additional CQEs + */ + my_qp->sq_map.next_wqe_idx = next_index(my_qp->sq_map.tail, + my_qp->sq_map.entries); + my_qp->sq_map.left_to_poll = 0; + ehca_add_to_err_list(my_qp, 1); + + my_qp->rq_map.next_wqe_idx = next_index(my_qp->rq_map.tail, + my_qp->rq_map.entries); + my_qp->rq_map.left_to_poll = 0; + if (HAS_RQ(my_qp)) + ehca_add_to_err_list(my_qp, 0); + } + + qmap_entry = &qmap->map[qmap_tail_idx]; + if (qmap_entry->reported) { + ehca_warn(cq->device, "Double cqe on qp_num=%#x", + my_qp->real_qp_num); + /* found a double cqe, discard it and read next one */ + goto repoll; + } + + wc->wr_id = replace_wr_id(cqe->work_request_id, qmap_entry->app_wr_id); + qmap_entry->reported = 1; + + /* if left_to_poll is decremented to 0, add the QP to the error list */ + if (qmap->left_to_poll > 0) { + qmap->left_to_poll--; + if ((my_qp->sq_map.left_to_poll == 0) && + (my_qp->rq_map.left_to_poll == 0)) { + ehca_add_to_err_list(my_qp, 1); + if (HAS_RQ(my_qp)) + ehca_add_to_err_list(my_qp, 0); + } + } + + /* eval ib_wc_opcode */ + wc->opcode = ib_wc_opcode[cqe->optype]-1; + if (unlikely(wc->opcode == -1)) { + ehca_err(cq->device, "Invalid cqe->OPType=%x cqe->status=%x " + "ehca_cq=%p cq_num=%x", + cqe->optype, cqe->status, my_cq, my_cq->cq_number); + /* dump cqe for other infos */ + ehca_dmp(cqe, 64, "ehca_cq=%p cq_num=%x", + my_cq, my_cq->cq_number); + /* update also queue adder to throw away this entry!!! */ + goto repoll; + } + + /* eval ib_wc_status */ + if (unlikely(is_error)) { + /* complete with errors */ + map_ib_wc_status(cqe->status, &wc->status); + wc->vendor_err = wc->status; + } else + wc->status = IB_WC_SUCCESS; + + wc->byte_len = cqe->nr_bytes_transferred; + wc->pkey_index = cqe->pkey_index; + wc->slid = cqe->rlid; + wc->dlid_path_bits = cqe->dlid; + wc->src_qp = cqe->remote_qp_number; + /* + * HW has "Immed data present" and "GRH present" in bits 6 and 5. + * SW defines those in bits 1 and 0, so we can just shift and mask. 
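+ * Example: with both flags present, bits 6:5 of w_completion_flags
+ * are 0b11, and (flags >> 5) & 3 == 3, i.e. IB_WC_WITH_IMM | IB_WC_GRH
+ * (assuming the usual bit-1/bit-0 encoding of those two wc_flags).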
+ */ + wc->wc_flags = (cqe->w_completion_flags >> 5) & 3; + wc->ex.imm_data = cpu_to_be32(cqe->immediate_data); + wc->sl = cqe->service_level; + +poll_cq_one_exit0: + if (cqe_count > 0) + hipz_update_feca(my_cq, cqe_count); + + return ret; +} + +static int generate_flush_cqes(struct ehca_qp *my_qp, struct ib_cq *cq, + struct ib_wc *wc, int num_entries, + struct ipz_queue *ipz_queue, int on_sq) +{ + int nr = 0; + struct ehca_wqe *wqe; + u64 offset; + struct ehca_queue_map *qmap; + struct ehca_qmap_entry *qmap_entry; + + if (on_sq) + qmap = &my_qp->sq_map; + else + qmap = &my_qp->rq_map; + + qmap_entry = &qmap->map[qmap->next_wqe_idx]; + + while ((nr < num_entries) && (qmap_entry->reported == 0)) { + /* generate flush CQE */ + + memset(wc, 0, sizeof(*wc)); + + offset = qmap->next_wqe_idx * ipz_queue->qe_size; + wqe = (struct ehca_wqe *)ipz_qeit_calc(ipz_queue, offset); + if (!wqe) { + ehca_err(cq->device, "Invalid wqe offset=%#llx on " + "qp_num=%#x", offset, my_qp->real_qp_num); + return nr; + } + + wc->wr_id = replace_wr_id(wqe->work_request_id, + qmap_entry->app_wr_id); + + if (on_sq) { + switch (wqe->optype) { + case WQE_OPTYPE_SEND: + wc->opcode = IB_WC_SEND; + break; + case WQE_OPTYPE_RDMAWRITE: + wc->opcode = IB_WC_RDMA_WRITE; + break; + case WQE_OPTYPE_RDMAREAD: + wc->opcode = IB_WC_RDMA_READ; + break; + default: + ehca_err(cq->device, "Invalid optype=%x", + wqe->optype); + return nr; + } + } else + wc->opcode = IB_WC_RECV; + + if (wqe->wr_flag & WQE_WRFLAG_IMM_DATA_PRESENT) { + wc->ex.imm_data = wqe->immediate_data; + wc->wc_flags |= IB_WC_WITH_IMM; + } + + wc->status = IB_WC_WR_FLUSH_ERR; + + wc->qp = &my_qp->ib_qp; + + /* mark as reported and advance next_wqe pointer */ + qmap_entry->reported = 1; + qmap->next_wqe_idx = next_index(qmap->next_wqe_idx, + qmap->entries); + qmap_entry = &qmap->map[qmap->next_wqe_idx]; + + wc++; nr++; + } + + return nr; + +} + +int ehca_poll_cq(struct ib_cq *cq, int num_entries, struct ib_wc *wc) +{ + struct ehca_cq *my_cq = container_of(cq, struct ehca_cq, ib_cq); + int nr; + struct ehca_qp *err_qp; + struct ib_wc *current_wc = wc; + int ret = 0; + unsigned long flags; + int entries_left = num_entries; + + if (num_entries < 1) { + ehca_err(cq->device, "Invalid num_entries=%d ehca_cq=%p " + "cq_num=%x", num_entries, my_cq, my_cq->cq_number); + ret = -EINVAL; + goto poll_cq_exit0; + } + + spin_lock_irqsave(&my_cq->spinlock, flags); + + /* generate flush cqes for send queues */ + list_for_each_entry(err_qp, &my_cq->sqp_err_list, sq_err_node) { + nr = generate_flush_cqes(err_qp, cq, current_wc, entries_left, + &err_qp->ipz_squeue, 1); + entries_left -= nr; + current_wc += nr; + + if (entries_left == 0) + break; + } + + /* generate flush cqes for receive queues */ + list_for_each_entry(err_qp, &my_cq->rqp_err_list, rq_err_node) { + nr = generate_flush_cqes(err_qp, cq, current_wc, entries_left, + &err_qp->ipz_rqueue, 0); + entries_left -= nr; + current_wc += nr; + + if (entries_left == 0) + break; + } + + for (nr = 0; nr < entries_left; nr++) { + ret = ehca_poll_cq_one(cq, current_wc); + if (ret) + break; + current_wc++; + } /* eof for nr */ + entries_left -= nr; + + spin_unlock_irqrestore(&my_cq->spinlock, flags); + if (ret == -EAGAIN || !ret) + ret = num_entries - entries_left; + +poll_cq_exit0: + return ret; +} + +int ehca_req_notify_cq(struct ib_cq *cq, enum ib_cq_notify_flags notify_flags) +{ + struct ehca_cq *my_cq = container_of(cq, struct ehca_cq, ib_cq); + int ret = 0; + + switch (notify_flags & IB_CQ_SOLICITED_MASK) { + case IB_CQ_SOLICITED: + 
hipz_set_cqx_n0(my_cq, 1); + break; + case IB_CQ_NEXT_COMP: + hipz_set_cqx_n1(my_cq, 1); + break; + default: + return -EINVAL; + } + + if (notify_flags & IB_CQ_REPORT_MISSED_EVENTS) { + unsigned long spl_flags; + spin_lock_irqsave(&my_cq->spinlock, spl_flags); + ret = ipz_qeit_is_valid(&my_cq->ipz_queue); + spin_unlock_irqrestore(&my_cq->spinlock, spl_flags); + } + + return ret; +} diff --git a/kernel/drivers/infiniband/hw/ehca/ehca_sqp.c b/kernel/drivers/infiniband/hw/ehca/ehca_sqp.c new file mode 100644 index 000000000..dba8f9f8b --- /dev/null +++ b/kernel/drivers/infiniband/hw/ehca/ehca_sqp.c @@ -0,0 +1,237 @@ +/* + * IBM eServer eHCA Infiniband device driver for Linux on POWER + * + * SQP functions + * + * Authors: Khadija Souissi + * Heiko J Schick + * + * Copyright (c) 2005 IBM Corporation + * + * All rights reserved. + * + * This source code is distributed under a dual license of GPL v2.0 and OpenIB + * BSD. + * + * OpenIB BSD License + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials + * provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR + * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER + * IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#include + +#include "ehca_classes.h" +#include "ehca_tools.h" +#include "ehca_iverbs.h" +#include "hcp_if.h" + +#define IB_MAD_STATUS_REDIRECT cpu_to_be16(0x0002) +#define IB_MAD_STATUS_UNSUP_VERSION cpu_to_be16(0x0004) +#define IB_MAD_STATUS_UNSUP_METHOD cpu_to_be16(0x0008) + +#define IB_PMA_CLASS_PORT_INFO cpu_to_be16(0x0001) + +/** + * ehca_define_sqp - Defines special queue pair 1 (GSI QP). When special queue + * pair is created successfully, the corresponding port gets active. + * + * Define Special Queue pair 0 (SMI QP) is still not supported. + * + * @qp_init_attr: Queue pair init attributes with port and queue pair type + */ + +u64 ehca_define_sqp(struct ehca_shca *shca, + struct ehca_qp *ehca_qp, + struct ib_qp_init_attr *qp_init_attr) +{ + u32 pma_qp_nr, bma_qp_nr; + u64 ret; + u8 port = qp_init_attr->port_num; + int counter; + + shca->sport[port - 1].port_state = IB_PORT_DOWN; + + switch (qp_init_attr->qp_type) { + case IB_QPT_SMI: + /* function not supported yet */ + break; + case IB_QPT_GSI: + ret = hipz_h_define_aqp1(shca->ipz_hca_handle, + ehca_qp->ipz_qp_handle, + ehca_qp->galpas.kernel, + (u32) qp_init_attr->port_num, + &pma_qp_nr, &bma_qp_nr); + + if (ret != H_SUCCESS) { + ehca_err(&shca->ib_device, + "Can't define AQP1 for port %x. 
h_ret=%lli", + port, ret); + return ret; + } + shca->sport[port - 1].pma_qp_nr = pma_qp_nr; + ehca_dbg(&shca->ib_device, "port=%x pma_qp_nr=%x", + port, pma_qp_nr); + break; + default: + ehca_err(&shca->ib_device, "invalid qp_type=%x", + qp_init_attr->qp_type); + return H_PARAMETER; + } + + if (ehca_nr_ports < 0) /* autodetect mode */ + return H_SUCCESS; + + for (counter = 0; + shca->sport[port - 1].port_state != IB_PORT_ACTIVE && + counter < ehca_port_act_time; + counter++) { + ehca_dbg(&shca->ib_device, "... wait until port %x is active", + port); + msleep_interruptible(1000); + } + + if (counter == ehca_port_act_time) { + ehca_err(&shca->ib_device, "Port %x is not active.", port); + return H_HARDWARE; + } + + return H_SUCCESS; +} + +struct ib_perf { + struct ib_mad_hdr mad_hdr; + u8 reserved[40]; + u8 data[192]; +} __attribute__ ((packed)); + +/* TC/SL/FL packed into 32 bits, as in ClassPortInfo */ +struct tcslfl { + u32 tc:8; + u32 sl:4; + u32 fl:20; +} __attribute__ ((packed)); + +/* IP Version/TC/FL packed into 32 bits, as in GRH */ +struct vertcfl { + u32 ver:4; + u32 tc:8; + u32 fl:20; +} __attribute__ ((packed)); + +static int ehca_process_perf(struct ib_device *ibdev, u8 port_num, + struct ib_wc *in_wc, struct ib_grh *in_grh, + struct ib_mad *in_mad, struct ib_mad *out_mad) +{ + struct ib_perf *in_perf = (struct ib_perf *)in_mad; + struct ib_perf *out_perf = (struct ib_perf *)out_mad; + struct ib_class_port_info *poi = + (struct ib_class_port_info *)out_perf->data; + struct tcslfl *tcslfl = + (struct tcslfl *)&poi->redirect_tcslfl; + struct ehca_shca *shca = + container_of(ibdev, struct ehca_shca, ib_device); + struct ehca_sport *sport = &shca->sport[port_num - 1]; + + ehca_dbg(ibdev, "method=%x", in_perf->mad_hdr.method); + + *out_mad = *in_mad; + + if (in_perf->mad_hdr.class_version != 1) { + ehca_warn(ibdev, "Unsupported class_version=%x", + in_perf->mad_hdr.class_version); + out_perf->mad_hdr.status = IB_MAD_STATUS_UNSUP_VERSION; + goto perf_reply; + } + + switch (in_perf->mad_hdr.method) { + case IB_MGMT_METHOD_GET: + case IB_MGMT_METHOD_SET: + /* set class port info for redirection */ + out_perf->mad_hdr.attr_id = IB_PMA_CLASS_PORT_INFO; + out_perf->mad_hdr.status = IB_MAD_STATUS_REDIRECT; + memset(poi, 0, sizeof(*poi)); + poi->base_version = 1; + poi->class_version = 1; + poi->resp_time_value = 18; + + /* copy local routing information from WC where applicable */ + tcslfl->sl = in_wc->sl; + poi->redirect_lid = + sport->saved_attr.lid | in_wc->dlid_path_bits; + poi->redirect_qp = sport->pma_qp_nr; + poi->redirect_qkey = IB_QP1_QKEY; + + ehca_query_pkey(ibdev, port_num, in_wc->pkey_index, + &poi->redirect_pkey); + + /* if request was globally routed, copy route info */ + if (in_grh) { + struct vertcfl *vertcfl = + (struct vertcfl *)&in_grh->version_tclass_flow; + memcpy(poi->redirect_gid, in_grh->dgid.raw, + sizeof(poi->redirect_gid)); + tcslfl->tc = vertcfl->tc; + tcslfl->fl = vertcfl->fl; + } else + /* else only fill in default GID */ + ehca_query_gid(ibdev, port_num, 0, + (union ib_gid *)&poi->redirect_gid); + + ehca_dbg(ibdev, "ehca_pma_lid=%x ehca_pma_qp=%x", + sport->saved_attr.lid, sport->pma_qp_nr); + break; + + case IB_MGMT_METHOD_GET_RESP: + return IB_MAD_RESULT_FAILURE; + + default: + out_perf->mad_hdr.status = IB_MAD_STATUS_UNSUP_METHOD; + break; + } + +perf_reply: + out_perf->mad_hdr.method = IB_MGMT_METHOD_GET_RESP; + + return IB_MAD_RESULT_SUCCESS | IB_MAD_RESULT_REPLY; +} + +int ehca_process_mad(struct ib_device *ibdev, int mad_flags, u8 port_num, + struct 
ib_wc *in_wc, struct ib_grh *in_grh, + struct ib_mad *in_mad, struct ib_mad *out_mad) +{ + int ret; + + if (!port_num || port_num > ibdev->phys_port_cnt || !in_wc) + return IB_MAD_RESULT_FAILURE; + + /* accept only pma request */ + if (in_mad->mad_hdr.mgmt_class != IB_MGMT_CLASS_PERF_MGMT) + return IB_MAD_RESULT_SUCCESS; + + ehca_dbg(ibdev, "port_num=%x src_qp=%x", port_num, in_wc->src_qp); + ret = ehca_process_perf(ibdev, port_num, in_wc, in_grh, + in_mad, out_mad); + + return ret; +} diff --git a/kernel/drivers/infiniband/hw/ehca/ehca_tools.h b/kernel/drivers/infiniband/hw/ehca/ehca_tools.h new file mode 100644 index 000000000..d280b12aa --- /dev/null +++ b/kernel/drivers/infiniband/hw/ehca/ehca_tools.h @@ -0,0 +1,155 @@ +/* + * IBM eServer eHCA Infiniband device driver for Linux on POWER + * + * auxiliary functions + * + * Authors: Christoph Raisch + * Hoang-Nam Nguyen + * Khadija Souissi + * Waleri Fomin + * Heiko J Schick + * + * Copyright (c) 2005 IBM Corporation + * + * This source code is distributed under a dual license of GPL v2.0 and OpenIB + * BSD. + * + * OpenIB BSD License + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials + * provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR + * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER + * IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + + +#ifndef EHCA_TOOLS_H +#define EHCA_TOOLS_H + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +extern int ehca_debug_level; + +#define ehca_dbg(ib_dev, format, arg...) \ + do { \ + if (unlikely(ehca_debug_level)) \ + dev_printk(KERN_DEBUG, (ib_dev)->dma_device, \ + "PU%04x EHCA_DBG:%s " format "\n", \ + raw_smp_processor_id(), __func__, \ + ## arg); \ + } while (0) + +#define ehca_info(ib_dev, format, arg...) \ + dev_info((ib_dev)->dma_device, "PU%04x EHCA_INFO:%s " format "\n", \ + raw_smp_processor_id(), __func__, ## arg) + +#define ehca_warn(ib_dev, format, arg...) \ + dev_warn((ib_dev)->dma_device, "PU%04x EHCA_WARN:%s " format "\n", \ + raw_smp_processor_id(), __func__, ## arg) + +#define ehca_err(ib_dev, format, arg...) \ + dev_err((ib_dev)->dma_device, "PU%04x EHCA_ERR:%s " format "\n", \ + raw_smp_processor_id(), __func__, ## arg) + +/* use this one only if no ib_dev available */ +#define ehca_gen_dbg(format, arg...) 
\ + do { \ + if (unlikely(ehca_debug_level)) \ + printk(KERN_DEBUG "PU%04x EHCA_DBG:%s " format "\n", \ + raw_smp_processor_id(), __func__, ## arg); \ + } while (0) + +#define ehca_gen_warn(format, arg...) \ + printk(KERN_INFO "PU%04x EHCA_WARN:%s " format "\n", \ + raw_smp_processor_id(), __func__, ## arg) + +#define ehca_gen_err(format, arg...) \ + printk(KERN_ERR "PU%04x EHCA_ERR:%s " format "\n", \ + raw_smp_processor_id(), __func__, ## arg) + +/** + * ehca_dmp - printk a memory block, whose length is n*8 bytes. + * Each line has the following layout: + * adr=X ofs=Y <8 bytes hex> <8 bytes hex> + */ +#define ehca_dmp(adr, len, format, args...) \ + do { \ + unsigned int x; \ + unsigned int l = (unsigned int)(len); \ + unsigned char *deb = (unsigned char *)(adr); \ + for (x = 0; x < l; x += 16) { \ + printk(KERN_INFO "EHCA_DMP:%s " format \ + " adr=%p ofs=%04x %016llx %016llx\n", \ + __func__, ##args, deb, x, \ + *((u64 *)&deb[0]), *((u64 *)&deb[8])); \ + deb += 16; \ + } \ + } while (0) + +/* define a bitmask, little endian version */ +#define EHCA_BMASK(pos, length) (((pos) << 16) + (length)) + +/* define a bitmask, the ibm way... */ +#define EHCA_BMASK_IBM(from, to) (((63 - to) << 16) + ((to) - (from) + 1)) + +/* internal function, don't use */ +#define EHCA_BMASK_SHIFTPOS(mask) (((mask) >> 16) & 0xffff) + +/* internal function, don't use */ +#define EHCA_BMASK_MASK(mask) (~0ULL >> ((64 - (mask)) & 0xffff)) + +/** + * EHCA_BMASK_SET - return value shifted and masked by mask + * variable|=EHCA_BMASK_SET(MY_MASK,0x4711) ORs the bits in variable + * variable&=~EHCA_BMASK_SET(MY_MASK,-1) clears the bits from the mask + * in variable + */ +#define EHCA_BMASK_SET(mask, value) \ + ((EHCA_BMASK_MASK(mask) & ((u64)(value))) << EHCA_BMASK_SHIFTPOS(mask)) + +/** + * EHCA_BMASK_GET - extract a parameter from value by mask + */ +#define EHCA_BMASK_GET(mask, value) \ + (EHCA_BMASK_MASK(mask) & (((u64)(value)) >> EHCA_BMASK_SHIFTPOS(mask))) + +/* Converts ehca to ib return code */ +int ehca2ib_return_code(u64 ehca_rc); + +#endif /* EHCA_TOOLS_H */ diff --git a/kernel/drivers/infiniband/hw/ehca/ehca_uverbs.c b/kernel/drivers/infiniband/hw/ehca/ehca_uverbs.c new file mode 100644 index 000000000..1a1d5d99f --- /dev/null +++ b/kernel/drivers/infiniband/hw/ehca/ehca_uverbs.c @@ -0,0 +1,309 @@ +/* + * IBM eServer eHCA Infiniband device driver for Linux on POWER + * + * userspace support verbs + * + * Authors: Christoph Raisch + * Hoang-Nam Nguyen + * Heiko J Schick + * + * Copyright (c) 2005 IBM Corporation + * + * All rights reserved. + * + * This source code is distributed under a dual license of GPL v2.0 and OpenIB + * BSD. + * + * OpenIB BSD License + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials + * provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
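The EHCA_BMASK_IBM()/EHCA_BMASK_SET()/EHCA_BMASK_GET() helpers defined in ehca_tools.h above are used throughout hcp_if.c to build hcall register images with IBM bit numbering, where bit 0 is the most significant bit. The stand-alone program below copies the macros from the header (with parenthesization tightened) and shows how a field declared as IBM bits 0-15 ends up in the top 16 bits of a 64-bit register.

#include <assert.h>
#include <stdint.h>

typedef uint64_t u64;

/* Adapted from ehca_tools.h above; IBM bit 0 is the most significant bit. */
#define EHCA_BMASK_IBM(from, to)   (((63 - (to)) << 16) + ((to) - (from) + 1))
#define EHCA_BMASK_SHIFTPOS(mask)  (((mask) >> 16) & 0xffff)
#define EHCA_BMASK_MASK(mask)      (~0ULL >> ((64 - (mask)) & 0xffff))
#define EHCA_BMASK_SET(mask, value) \
        ((EHCA_BMASK_MASK(mask) & ((u64)(value))) << EHCA_BMASK_SHIFTPOS(mask))
#define EHCA_BMASK_GET(mask, value) \
        (EHCA_BMASK_MASK(mask) & (((u64)(value)) >> EHCA_BMASK_SHIFTPOS(mask)))

/* Same field hcp_if.c uses in the r10 register of H_ALLOC_RESOURCE for QPs. */
#define H_ALL_RES_QP_MAX_OUTST_SEND_WR  EHCA_BMASK_IBM(0, 15)

int main(void)
{
        u64 r10 = EHCA_BMASK_SET(H_ALL_RES_QP_MAX_OUTST_SEND_WR, 0x123);

        /* IBM bits 0..15 occupy the 16 most significant bits of the register */
        assert(r10 == 0x0123000000000000ULL);
        assert(EHCA_BMASK_GET(H_ALL_RES_QP_MAX_OUTST_SEND_WR, r10) == 0x123);
        return 0;
}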
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR + * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER + * IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#include + +#include "ehca_classes.h" +#include "ehca_iverbs.h" +#include "ehca_mrmw.h" +#include "ehca_tools.h" +#include "hcp_if.h" + +struct ib_ucontext *ehca_alloc_ucontext(struct ib_device *device, + struct ib_udata *udata) +{ + struct ehca_ucontext *my_context; + + my_context = kzalloc(sizeof *my_context, GFP_KERNEL); + if (!my_context) { + ehca_err(device, "Out of memory device=%p", device); + return ERR_PTR(-ENOMEM); + } + + return &my_context->ib_ucontext; +} + +int ehca_dealloc_ucontext(struct ib_ucontext *context) +{ + kfree(container_of(context, struct ehca_ucontext, ib_ucontext)); + return 0; +} + +static void ehca_mm_open(struct vm_area_struct *vma) +{ + u32 *count = (u32 *)vma->vm_private_data; + if (!count) { + ehca_gen_err("Invalid vma struct vm_start=%lx vm_end=%lx", + vma->vm_start, vma->vm_end); + return; + } + (*count)++; + if (!(*count)) + ehca_gen_err("Use count overflow vm_start=%lx vm_end=%lx", + vma->vm_start, vma->vm_end); + ehca_gen_dbg("vm_start=%lx vm_end=%lx count=%x", + vma->vm_start, vma->vm_end, *count); +} + +static void ehca_mm_close(struct vm_area_struct *vma) +{ + u32 *count = (u32 *)vma->vm_private_data; + if (!count) { + ehca_gen_err("Invalid vma struct vm_start=%lx vm_end=%lx", + vma->vm_start, vma->vm_end); + return; + } + (*count)--; + ehca_gen_dbg("vm_start=%lx vm_end=%lx count=%x", + vma->vm_start, vma->vm_end, *count); +} + +static const struct vm_operations_struct vm_ops = { + .open = ehca_mm_open, + .close = ehca_mm_close, +}; + +static int ehca_mmap_fw(struct vm_area_struct *vma, struct h_galpas *galpas, + u32 *mm_count) +{ + int ret; + u64 vsize, physical; + + vsize = vma->vm_end - vma->vm_start; + if (vsize < EHCA_PAGESIZE) { + ehca_gen_err("invalid vsize=%lx", vma->vm_end - vma->vm_start); + return -EINVAL; + } + + physical = galpas->user.fw_handle; + vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot); + ehca_gen_dbg("vsize=%llx physical=%llx", vsize, physical); + /* VM_IO | VM_DONTEXPAND | VM_DONTDUMP are set by remap_pfn_range() */ + ret = remap_4k_pfn(vma, vma->vm_start, physical >> EHCA_PAGESHIFT, + vma->vm_page_prot); + if (unlikely(ret)) { + ehca_gen_err("remap_pfn_range() failed ret=%i", ret); + return -ENOMEM; + } + + vma->vm_private_data = mm_count; + (*mm_count)++; + vma->vm_ops = &vm_ops; + + return 0; +} + +static int ehca_mmap_queue(struct vm_area_struct *vma, struct ipz_queue *queue, + u32 *mm_count) +{ + int ret; + u64 start, ofs; + struct page *page; + + vma->vm_flags |= VM_DONTEXPAND | VM_DONTDUMP; + start = vma->vm_start; + for (ofs = 0; ofs < queue->queue_length; ofs += PAGE_SIZE) { + u64 virt_addr = (u64)ipz_qeit_calc(queue, ofs); + page = virt_to_page(virt_addr); + ret = vm_insert_page(vma, start, page); + if (unlikely(ret)) { + ehca_gen_err("vm_insert_page() failed rc=%i", ret); + return ret; + } + start += PAGE_SIZE; + } + vma->vm_private_data = mm_count; + (*mm_count)++; + vma->vm_ops = &vm_ops; + + return 0; +} + +static int ehca_mmap_cq(struct vm_area_struct *vma, 
struct ehca_cq *cq, + u32 rsrc_type) +{ + int ret; + + switch (rsrc_type) { + case 0: /* galpa fw handle */ + ehca_dbg(cq->ib_cq.device, "cq_num=%x fw", cq->cq_number); + ret = ehca_mmap_fw(vma, &cq->galpas, &cq->mm_count_galpa); + if (unlikely(ret)) { + ehca_err(cq->ib_cq.device, + "ehca_mmap_fw() failed rc=%i cq_num=%x", + ret, cq->cq_number); + return ret; + } + break; + + case 1: /* cq queue_addr */ + ehca_dbg(cq->ib_cq.device, "cq_num=%x queue", cq->cq_number); + ret = ehca_mmap_queue(vma, &cq->ipz_queue, &cq->mm_count_queue); + if (unlikely(ret)) { + ehca_err(cq->ib_cq.device, + "ehca_mmap_queue() failed rc=%i cq_num=%x", + ret, cq->cq_number); + return ret; + } + break; + + default: + ehca_err(cq->ib_cq.device, "bad resource type=%x cq_num=%x", + rsrc_type, cq->cq_number); + return -EINVAL; + } + + return 0; +} + +static int ehca_mmap_qp(struct vm_area_struct *vma, struct ehca_qp *qp, + u32 rsrc_type) +{ + int ret; + + switch (rsrc_type) { + case 0: /* galpa fw handle */ + ehca_dbg(qp->ib_qp.device, "qp_num=%x fw", qp->ib_qp.qp_num); + ret = ehca_mmap_fw(vma, &qp->galpas, &qp->mm_count_galpa); + if (unlikely(ret)) { + ehca_err(qp->ib_qp.device, + "remap_pfn_range() failed ret=%i qp_num=%x", + ret, qp->ib_qp.qp_num); + return -ENOMEM; + } + break; + + case 1: /* qp rqueue_addr */ + ehca_dbg(qp->ib_qp.device, "qp_num=%x rq", qp->ib_qp.qp_num); + ret = ehca_mmap_queue(vma, &qp->ipz_rqueue, + &qp->mm_count_rqueue); + if (unlikely(ret)) { + ehca_err(qp->ib_qp.device, + "ehca_mmap_queue(rq) failed rc=%i qp_num=%x", + ret, qp->ib_qp.qp_num); + return ret; + } + break; + + case 2: /* qp squeue_addr */ + ehca_dbg(qp->ib_qp.device, "qp_num=%x sq", qp->ib_qp.qp_num); + ret = ehca_mmap_queue(vma, &qp->ipz_squeue, + &qp->mm_count_squeue); + if (unlikely(ret)) { + ehca_err(qp->ib_qp.device, + "ehca_mmap_queue(sq) failed rc=%i qp_num=%x", + ret, qp->ib_qp.qp_num); + return ret; + } + break; + + default: + ehca_err(qp->ib_qp.device, "bad resource type=%x qp=num=%x", + rsrc_type, qp->ib_qp.qp_num); + return -EINVAL; + } + + return 0; +} + +int ehca_mmap(struct ib_ucontext *context, struct vm_area_struct *vma) +{ + u64 fileoffset = vma->vm_pgoff; + u32 idr_handle = fileoffset & 0x1FFFFFF; + u32 q_type = (fileoffset >> 27) & 0x1; /* CQ, QP,... */ + u32 rsrc_type = (fileoffset >> 25) & 0x3; /* sq,rq,cmnd_window */ + u32 ret; + struct ehca_cq *cq; + struct ehca_qp *qp; + struct ib_uobject *uobject; + + switch (q_type) { + case 0: /* CQ */ + read_lock(&ehca_cq_idr_lock); + cq = idr_find(&ehca_cq_idr, idr_handle); + read_unlock(&ehca_cq_idr_lock); + + /* make sure this mmap really belongs to the authorized user */ + if (!cq) + return -EINVAL; + + if (!cq->ib_cq.uobject || cq->ib_cq.uobject->context != context) + return -EINVAL; + + ret = ehca_mmap_cq(vma, cq, rsrc_type); + if (unlikely(ret)) { + ehca_err(cq->ib_cq.device, + "ehca_mmap_cq() failed rc=%i cq_num=%x", + ret, cq->cq_number); + return ret; + } + break; + + case 1: /* QP */ + read_lock(&ehca_qp_idr_lock); + qp = idr_find(&ehca_qp_idr, idr_handle); + read_unlock(&ehca_qp_idr_lock); + + /* make sure this mmap really belongs to the authorized user */ + if (!qp) + return -EINVAL; + + uobject = IS_SRQ(qp) ? 
qp->ib_srq.uobject : qp->ib_qp.uobject; + if (!uobject || uobject->context != context) + return -EINVAL; + + ret = ehca_mmap_qp(vma, qp, rsrc_type); + if (unlikely(ret)) { + ehca_err(qp->ib_qp.device, + "ehca_mmap_qp() failed rc=%i qp_num=%x", + ret, qp->ib_qp.qp_num); + return ret; + } + break; + + default: + ehca_gen_err("bad queue type %x", q_type); + return -EINVAL; + } + + return 0; +} diff --git a/kernel/drivers/infiniband/hw/ehca/hcp_if.c b/kernel/drivers/infiniband/hw/ehca/hcp_if.c new file mode 100644 index 000000000..89517ffb4 --- /dev/null +++ b/kernel/drivers/infiniband/hw/ehca/hcp_if.c @@ -0,0 +1,949 @@ +/* + * IBM eServer eHCA Infiniband device driver for Linux on POWER + * + * Firmware Infiniband Interface code for POWER + * + * Authors: Christoph Raisch + * Hoang-Nam Nguyen + * Joachim Fenkes + * Gerd Bayer + * Waleri Fomin + * + * Copyright (c) 2005 IBM Corporation + * + * All rights reserved. + * + * This source code is distributed under a dual license of GPL v2.0 and OpenIB + * BSD. + * + * OpenIB BSD License + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials + * provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR + * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER + * IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
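For reference, ehca_mmap() above recovers three fields from vma->vm_pgoff: the idr handle in bits 0-24, the resource type in bits 25-26 and the queue type in bit 27. The encoder lives in the userspace verbs library and is not part of this patch, so the helper below is only a hypothetical sketch that mirrors the kernel-side decode.

#include <assert.h>
#include <stdint.h>

/* Layout implied by the decode in ehca_mmap() above (vm_pgoff bits):
 *   bits  0..24  idr handle of the CQ/QP
 *   bits 25..26  resource type (0 = fw galpa, 1 = rqueue, 2 = squeue)
 *   bit  27      queue type   (0 = CQ, 1 = QP)
 */
static uint64_t ehca_build_pgoff(uint32_t q_type, uint32_t rsrc_type,
                                 uint32_t idr_handle)
{
        return ((uint64_t)(q_type & 0x1) << 27) |
               ((uint64_t)(rsrc_type & 0x3) << 25) |
               (idr_handle & 0x1FFFFFF);
}

int main(void)
{
        uint64_t fileoffset = ehca_build_pgoff(1, 2, 0x42);

        /* same masks and shifts as ehca_mmap() */
        assert((fileoffset & 0x1FFFFFF) == 0x42);       /* idr_handle */
        assert(((fileoffset >> 25) & 0x3) == 2);        /* rsrc_type  */
        assert(((fileoffset >> 27) & 0x1) == 1);        /* q_type     */
        return 0;
}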
+ */ + +#include +#include "ehca_tools.h" +#include "hcp_if.h" +#include "hcp_phyp.h" +#include "hipz_fns.h" +#include "ipz_pt_fn.h" + +#define H_ALL_RES_QP_ENHANCED_OPS EHCA_BMASK_IBM(9, 11) +#define H_ALL_RES_QP_PTE_PIN EHCA_BMASK_IBM(12, 12) +#define H_ALL_RES_QP_SERVICE_TYPE EHCA_BMASK_IBM(13, 15) +#define H_ALL_RES_QP_STORAGE EHCA_BMASK_IBM(16, 17) +#define H_ALL_RES_QP_LL_RQ_CQE_POSTING EHCA_BMASK_IBM(18, 18) +#define H_ALL_RES_QP_LL_SQ_CQE_POSTING EHCA_BMASK_IBM(19, 21) +#define H_ALL_RES_QP_SIGNALING_TYPE EHCA_BMASK_IBM(22, 23) +#define H_ALL_RES_QP_UD_AV_LKEY_CTRL EHCA_BMASK_IBM(31, 31) +#define H_ALL_RES_QP_SMALL_SQ_PAGE_SIZE EHCA_BMASK_IBM(32, 35) +#define H_ALL_RES_QP_SMALL_RQ_PAGE_SIZE EHCA_BMASK_IBM(36, 39) +#define H_ALL_RES_QP_RESOURCE_TYPE EHCA_BMASK_IBM(56, 63) + +#define H_ALL_RES_QP_MAX_OUTST_SEND_WR EHCA_BMASK_IBM(0, 15) +#define H_ALL_RES_QP_MAX_OUTST_RECV_WR EHCA_BMASK_IBM(16, 31) +#define H_ALL_RES_QP_MAX_SEND_SGE EHCA_BMASK_IBM(32, 39) +#define H_ALL_RES_QP_MAX_RECV_SGE EHCA_BMASK_IBM(40, 47) + +#define H_ALL_RES_QP_UD_AV_LKEY EHCA_BMASK_IBM(32, 63) +#define H_ALL_RES_QP_SRQ_QP_TOKEN EHCA_BMASK_IBM(0, 31) +#define H_ALL_RES_QP_SRQ_QP_HANDLE EHCA_BMASK_IBM(0, 64) +#define H_ALL_RES_QP_SRQ_LIMIT EHCA_BMASK_IBM(48, 63) +#define H_ALL_RES_QP_SRQ_QPN EHCA_BMASK_IBM(40, 63) + +#define H_ALL_RES_QP_ACT_OUTST_SEND_WR EHCA_BMASK_IBM(16, 31) +#define H_ALL_RES_QP_ACT_OUTST_RECV_WR EHCA_BMASK_IBM(48, 63) +#define H_ALL_RES_QP_ACT_SEND_SGE EHCA_BMASK_IBM(8, 15) +#define H_ALL_RES_QP_ACT_RECV_SGE EHCA_BMASK_IBM(24, 31) + +#define H_ALL_RES_QP_SQUEUE_SIZE_PAGES EHCA_BMASK_IBM(0, 31) +#define H_ALL_RES_QP_RQUEUE_SIZE_PAGES EHCA_BMASK_IBM(32, 63) + +#define H_MP_INIT_TYPE EHCA_BMASK_IBM(44, 47) +#define H_MP_SHUTDOWN EHCA_BMASK_IBM(48, 48) +#define H_MP_RESET_QKEY_CTR EHCA_BMASK_IBM(49, 49) + +#define HCALL4_REGS_FORMAT "r4=%lx r5=%lx r6=%lx r7=%lx" +#define HCALL7_REGS_FORMAT HCALL4_REGS_FORMAT " r8=%lx r9=%lx r10=%lx" +#define HCALL9_REGS_FORMAT HCALL7_REGS_FORMAT " r11=%lx r12=%lx" + +static DEFINE_SPINLOCK(hcall_lock); + +static long ehca_plpar_hcall_norets(unsigned long opcode, + unsigned long arg1, + unsigned long arg2, + unsigned long arg3, + unsigned long arg4, + unsigned long arg5, + unsigned long arg6, + unsigned long arg7) +{ + long ret; + int i, sleep_msecs; + unsigned long flags = 0; + + if (unlikely(ehca_debug_level >= 2)) + ehca_gen_dbg("opcode=%lx " HCALL7_REGS_FORMAT, + opcode, arg1, arg2, arg3, arg4, arg5, arg6, arg7); + + for (i = 0; i < 5; i++) { + /* serialize hCalls to work around firmware issue */ + if (ehca_lock_hcalls) + spin_lock_irqsave(&hcall_lock, flags); + + ret = plpar_hcall_norets(opcode, arg1, arg2, arg3, arg4, + arg5, arg6, arg7); + + if (ehca_lock_hcalls) + spin_unlock_irqrestore(&hcall_lock, flags); + + if (H_IS_LONG_BUSY(ret)) { + sleep_msecs = get_longbusy_msecs(ret); + msleep_interruptible(sleep_msecs); + continue; + } + + if (ret < H_SUCCESS) + ehca_gen_err("opcode=%lx ret=%li " HCALL7_REGS_FORMAT, + opcode, ret, arg1, arg2, arg3, + arg4, arg5, arg6, arg7); + else + if (unlikely(ehca_debug_level >= 2)) + ehca_gen_dbg("opcode=%lx ret=%li", opcode, ret); + + return ret; + } + + return H_BUSY; +} + +static long ehca_plpar_hcall9(unsigned long opcode, + unsigned long *outs, /* array of 9 outputs */ + unsigned long arg1, + unsigned long arg2, + unsigned long arg3, + unsigned long arg4, + unsigned long arg5, + unsigned long arg6, + unsigned long arg7, + unsigned long arg8, + unsigned long arg9) +{ + long ret; + int i, sleep_msecs; + unsigned long 
flags = 0; + + if (unlikely(ehca_debug_level >= 2)) + ehca_gen_dbg("INPUT -- opcode=%lx " HCALL9_REGS_FORMAT, opcode, + arg1, arg2, arg3, arg4, arg5, + arg6, arg7, arg8, arg9); + + for (i = 0; i < 5; i++) { + /* serialize hCalls to work around firmware issue */ + if (ehca_lock_hcalls) + spin_lock_irqsave(&hcall_lock, flags); + + ret = plpar_hcall9(opcode, outs, + arg1, arg2, arg3, arg4, arg5, + arg6, arg7, arg8, arg9); + + if (ehca_lock_hcalls) + spin_unlock_irqrestore(&hcall_lock, flags); + + if (H_IS_LONG_BUSY(ret)) { + sleep_msecs = get_longbusy_msecs(ret); + msleep_interruptible(sleep_msecs); + continue; + } + + if (ret < H_SUCCESS) { + ehca_gen_err("INPUT -- opcode=%lx " HCALL9_REGS_FORMAT, + opcode, arg1, arg2, arg3, arg4, arg5, + arg6, arg7, arg8, arg9); + ehca_gen_err("OUTPUT -- ret=%li " HCALL9_REGS_FORMAT, + ret, outs[0], outs[1], outs[2], outs[3], + outs[4], outs[5], outs[6], outs[7], + outs[8]); + } else if (unlikely(ehca_debug_level >= 2)) + ehca_gen_dbg("OUTPUT -- ret=%li " HCALL9_REGS_FORMAT, + ret, outs[0], outs[1], outs[2], outs[3], + outs[4], outs[5], outs[6], outs[7], + outs[8]); + return ret; + } + + return H_BUSY; +} + +u64 hipz_h_alloc_resource_eq(const struct ipz_adapter_handle adapter_handle, + struct ehca_pfeq *pfeq, + const u32 neq_control, + const u32 number_of_entries, + struct ipz_eq_handle *eq_handle, + u32 *act_nr_of_entries, + u32 *act_pages, + u32 *eq_ist) +{ + u64 ret; + unsigned long outs[PLPAR_HCALL9_BUFSIZE]; + u64 allocate_controls; + + /* resource type */ + allocate_controls = 3ULL; + + /* ISN is associated */ + if (neq_control != 1) + allocate_controls = (1ULL << (63 - 7)) | allocate_controls; + else /* notification event queue */ + allocate_controls = (1ULL << 63) | allocate_controls; + + ret = ehca_plpar_hcall9(H_ALLOC_RESOURCE, outs, + adapter_handle.handle, /* r4 */ + allocate_controls, /* r5 */ + number_of_entries, /* r6 */ + 0, 0, 0, 0, 0, 0); + eq_handle->handle = outs[0]; + *act_nr_of_entries = (u32)outs[3]; + *act_pages = (u32)outs[4]; + *eq_ist = (u32)outs[5]; + + if (ret == H_NOT_ENOUGH_RESOURCES) + ehca_gen_err("Not enough resource - ret=%lli ", ret); + + return ret; +} + +u64 hipz_h_reset_event(const struct ipz_adapter_handle adapter_handle, + struct ipz_eq_handle eq_handle, + const u64 event_mask) +{ + return ehca_plpar_hcall_norets(H_RESET_EVENTS, + adapter_handle.handle, /* r4 */ + eq_handle.handle, /* r5 */ + event_mask, /* r6 */ + 0, 0, 0, 0); +} + +u64 hipz_h_alloc_resource_cq(const struct ipz_adapter_handle adapter_handle, + struct ehca_cq *cq, + struct ehca_alloc_cq_parms *param) +{ + int rc; + u64 ret; + unsigned long outs[PLPAR_HCALL9_BUFSIZE]; + + ret = ehca_plpar_hcall9(H_ALLOC_RESOURCE, outs, + adapter_handle.handle, /* r4 */ + 2, /* r5 */ + param->eq_handle.handle, /* r6 */ + cq->token, /* r7 */ + param->nr_cqe, /* r8 */ + 0, 0, 0, 0); + cq->ipz_cq_handle.handle = outs[0]; + param->act_nr_of_entries = (u32)outs[3]; + param->act_pages = (u32)outs[4]; + + if (ret == H_SUCCESS) { + rc = hcp_galpas_ctor(&cq->galpas, 0, outs[5], outs[6]); + if (rc) { + ehca_gen_err("Could not establish HW access. rc=%d paddr=%#lx", + rc, outs[5]); + + ehca_plpar_hcall_norets(H_FREE_RESOURCE, + adapter_handle.handle, /* r4 */ + cq->ipz_cq_handle.handle, /* r5 */ + 0, 0, 0, 0, 0); + ret = H_NO_MEM; + } + } + + if (ret == H_NOT_ENOUGH_RESOURCES) + ehca_gen_err("Not enough resources. 
ret=%lli", ret); + + return ret; +} + +u64 hipz_h_alloc_resource_qp(const struct ipz_adapter_handle adapter_handle, + struct ehca_alloc_qp_parms *parms, int is_user) +{ + int rc; + u64 ret; + u64 allocate_controls, max_r10_reg, r11, r12; + unsigned long outs[PLPAR_HCALL9_BUFSIZE]; + + allocate_controls = + EHCA_BMASK_SET(H_ALL_RES_QP_ENHANCED_OPS, parms->ext_type) + | EHCA_BMASK_SET(H_ALL_RES_QP_PTE_PIN, 0) + | EHCA_BMASK_SET(H_ALL_RES_QP_SERVICE_TYPE, parms->servicetype) + | EHCA_BMASK_SET(H_ALL_RES_QP_SIGNALING_TYPE, parms->sigtype) + | EHCA_BMASK_SET(H_ALL_RES_QP_STORAGE, parms->qp_storage) + | EHCA_BMASK_SET(H_ALL_RES_QP_SMALL_SQ_PAGE_SIZE, + parms->squeue.page_size) + | EHCA_BMASK_SET(H_ALL_RES_QP_SMALL_RQ_PAGE_SIZE, + parms->rqueue.page_size) + | EHCA_BMASK_SET(H_ALL_RES_QP_LL_RQ_CQE_POSTING, + !!(parms->ll_comp_flags & LLQP_RECV_COMP)) + | EHCA_BMASK_SET(H_ALL_RES_QP_LL_SQ_CQE_POSTING, + !!(parms->ll_comp_flags & LLQP_SEND_COMP)) + | EHCA_BMASK_SET(H_ALL_RES_QP_UD_AV_LKEY_CTRL, + parms->ud_av_l_key_ctl) + | EHCA_BMASK_SET(H_ALL_RES_QP_RESOURCE_TYPE, 1); + + max_r10_reg = + EHCA_BMASK_SET(H_ALL_RES_QP_MAX_OUTST_SEND_WR, + parms->squeue.max_wr + 1) + | EHCA_BMASK_SET(H_ALL_RES_QP_MAX_OUTST_RECV_WR, + parms->rqueue.max_wr + 1) + | EHCA_BMASK_SET(H_ALL_RES_QP_MAX_SEND_SGE, + parms->squeue.max_sge) + | EHCA_BMASK_SET(H_ALL_RES_QP_MAX_RECV_SGE, + parms->rqueue.max_sge); + + r11 = EHCA_BMASK_SET(H_ALL_RES_QP_SRQ_QP_TOKEN, parms->srq_token); + + if (parms->ext_type == EQPT_SRQ) + r12 = EHCA_BMASK_SET(H_ALL_RES_QP_SRQ_LIMIT, parms->srq_limit); + else + r12 = EHCA_BMASK_SET(H_ALL_RES_QP_SRQ_QPN, parms->srq_qpn); + + ret = ehca_plpar_hcall9(H_ALLOC_RESOURCE, outs, + adapter_handle.handle, /* r4 */ + allocate_controls, /* r5 */ + parms->send_cq_handle.handle, + parms->recv_cq_handle.handle, + parms->eq_handle.handle, + ((u64)parms->token << 32) | parms->pd.value, + max_r10_reg, r11, r12); + + parms->qp_handle.handle = outs[0]; + parms->real_qp_num = (u32)outs[1]; + parms->squeue.act_nr_wqes = + (u16)EHCA_BMASK_GET(H_ALL_RES_QP_ACT_OUTST_SEND_WR, outs[2]); + parms->rqueue.act_nr_wqes = + (u16)EHCA_BMASK_GET(H_ALL_RES_QP_ACT_OUTST_RECV_WR, outs[2]); + parms->squeue.act_nr_sges = + (u8)EHCA_BMASK_GET(H_ALL_RES_QP_ACT_SEND_SGE, outs[3]); + parms->rqueue.act_nr_sges = + (u8)EHCA_BMASK_GET(H_ALL_RES_QP_ACT_RECV_SGE, outs[3]); + parms->squeue.queue_size = + (u32)EHCA_BMASK_GET(H_ALL_RES_QP_SQUEUE_SIZE_PAGES, outs[4]); + parms->rqueue.queue_size = + (u32)EHCA_BMASK_GET(H_ALL_RES_QP_RQUEUE_SIZE_PAGES, outs[4]); + + if (ret == H_SUCCESS) { + rc = hcp_galpas_ctor(&parms->galpas, is_user, outs[6], outs[6]); + if (rc) { + ehca_gen_err("Could not establish HW access. rc=%d paddr=%#lx", + rc, outs[6]); + + ehca_plpar_hcall_norets(H_FREE_RESOURCE, + adapter_handle.handle, /* r4 */ + parms->qp_handle.handle, /* r5 */ + 0, 0, 0, 0, 0); + ret = H_NO_MEM; + } + } + + if (ret == H_NOT_ENOUGH_RESOURCES) + ehca_gen_err("Not enough resources. 
ret=%lli", ret); + + return ret; +} + +u64 hipz_h_query_port(const struct ipz_adapter_handle adapter_handle, + const u8 port_id, + struct hipz_query_port *query_port_response_block) +{ + u64 ret; + u64 r_cb = __pa(query_port_response_block); + + if (r_cb & (EHCA_PAGESIZE-1)) { + ehca_gen_err("response block not page aligned"); + return H_PARAMETER; + } + + ret = ehca_plpar_hcall_norets(H_QUERY_PORT, + adapter_handle.handle, /* r4 */ + port_id, /* r5 */ + r_cb, /* r6 */ + 0, 0, 0, 0); + + if (ehca_debug_level >= 2) + ehca_dmp(query_port_response_block, 64, "response_block"); + + return ret; +} + +u64 hipz_h_modify_port(const struct ipz_adapter_handle adapter_handle, + const u8 port_id, const u32 port_cap, + const u8 init_type, const int modify_mask) +{ + u64 port_attributes = port_cap; + + if (modify_mask & IB_PORT_SHUTDOWN) + port_attributes |= EHCA_BMASK_SET(H_MP_SHUTDOWN, 1); + if (modify_mask & IB_PORT_INIT_TYPE) + port_attributes |= EHCA_BMASK_SET(H_MP_INIT_TYPE, init_type); + if (modify_mask & IB_PORT_RESET_QKEY_CNTR) + port_attributes |= EHCA_BMASK_SET(H_MP_RESET_QKEY_CTR, 1); + + return ehca_plpar_hcall_norets(H_MODIFY_PORT, + adapter_handle.handle, /* r4 */ + port_id, /* r5 */ + port_attributes, /* r6 */ + 0, 0, 0, 0); +} + +u64 hipz_h_query_hca(const struct ipz_adapter_handle adapter_handle, + struct hipz_query_hca *query_hca_rblock) +{ + u64 r_cb = __pa(query_hca_rblock); + + if (r_cb & (EHCA_PAGESIZE-1)) { + ehca_gen_err("response_block=%p not page aligned", + query_hca_rblock); + return H_PARAMETER; + } + + return ehca_plpar_hcall_norets(H_QUERY_HCA, + adapter_handle.handle, /* r4 */ + r_cb, /* r5 */ + 0, 0, 0, 0, 0); +} + +u64 hipz_h_register_rpage(const struct ipz_adapter_handle adapter_handle, + const u8 pagesize, + const u8 queue_type, + const u64 resource_handle, + const u64 logical_address_of_page, + u64 count) +{ + return ehca_plpar_hcall_norets(H_REGISTER_RPAGES, + adapter_handle.handle, /* r4 */ + (u64)queue_type | ((u64)pagesize) << 8, + /* r5 */ + resource_handle, /* r6 */ + logical_address_of_page, /* r7 */ + count, /* r8 */ + 0, 0); +} + +u64 hipz_h_register_rpage_eq(const struct ipz_adapter_handle adapter_handle, + const struct ipz_eq_handle eq_handle, + struct ehca_pfeq *pfeq, + const u8 pagesize, + const u8 queue_type, + const u64 logical_address_of_page, + const u64 count) +{ + if (count != 1) { + ehca_gen_err("Ppage counter=%llx", count); + return H_PARAMETER; + } + return hipz_h_register_rpage(adapter_handle, + pagesize, + queue_type, + eq_handle.handle, + logical_address_of_page, count); +} + +u64 hipz_h_query_int_state(const struct ipz_adapter_handle adapter_handle, + u32 ist) +{ + u64 ret; + ret = ehca_plpar_hcall_norets(H_QUERY_INT_STATE, + adapter_handle.handle, /* r4 */ + ist, /* r5 */ + 0, 0, 0, 0, 0); + + if (ret != H_SUCCESS && ret != H_BUSY) + ehca_gen_err("Could not query interrupt state."); + + return ret; +} + +u64 hipz_h_register_rpage_cq(const struct ipz_adapter_handle adapter_handle, + const struct ipz_cq_handle cq_handle, + struct ehca_pfcq *pfcq, + const u8 pagesize, + const u8 queue_type, + const u64 logical_address_of_page, + const u64 count, + const struct h_galpa gal) +{ + if (count != 1) { + ehca_gen_err("Page counter=%llx", count); + return H_PARAMETER; + } + + return hipz_h_register_rpage(adapter_handle, pagesize, queue_type, + cq_handle.handle, logical_address_of_page, + count); +} + +u64 hipz_h_register_rpage_qp(const struct ipz_adapter_handle adapter_handle, + const struct ipz_qp_handle qp_handle, + struct ehca_pfqp *pfqp, + const 
u8 pagesize, + const u8 queue_type, + const u64 logical_address_of_page, + const u64 count, + const struct h_galpa galpa) +{ + if (count > 1) { + ehca_gen_err("Page counter=%llx", count); + return H_PARAMETER; + } + + return hipz_h_register_rpage(adapter_handle, pagesize, queue_type, + qp_handle.handle, logical_address_of_page, + count); +} + +u64 hipz_h_disable_and_get_wqe(const struct ipz_adapter_handle adapter_handle, + const struct ipz_qp_handle qp_handle, + struct ehca_pfqp *pfqp, + void **log_addr_next_sq_wqe2processed, + void **log_addr_next_rq_wqe2processed, + int dis_and_get_function_code) +{ + u64 ret; + unsigned long outs[PLPAR_HCALL9_BUFSIZE]; + + ret = ehca_plpar_hcall9(H_DISABLE_AND_GETC, outs, + adapter_handle.handle, /* r4 */ + dis_and_get_function_code, /* r5 */ + qp_handle.handle, /* r6 */ + 0, 0, 0, 0, 0, 0); + if (log_addr_next_sq_wqe2processed) + *log_addr_next_sq_wqe2processed = (void *)outs[0]; + if (log_addr_next_rq_wqe2processed) + *log_addr_next_rq_wqe2processed = (void *)outs[1]; + + return ret; +} + +u64 hipz_h_modify_qp(const struct ipz_adapter_handle adapter_handle, + const struct ipz_qp_handle qp_handle, + struct ehca_pfqp *pfqp, + const u64 update_mask, + struct hcp_modify_qp_control_block *mqpcb, + struct h_galpa gal) +{ + u64 ret; + unsigned long outs[PLPAR_HCALL9_BUFSIZE]; + ret = ehca_plpar_hcall9(H_MODIFY_QP, outs, + adapter_handle.handle, /* r4 */ + qp_handle.handle, /* r5 */ + update_mask, /* r6 */ + __pa(mqpcb), /* r7 */ + 0, 0, 0, 0, 0); + + if (ret == H_NOT_ENOUGH_RESOURCES) + ehca_gen_err("Insufficient resources ret=%lli", ret); + + return ret; +} + +u64 hipz_h_query_qp(const struct ipz_adapter_handle adapter_handle, + const struct ipz_qp_handle qp_handle, + struct ehca_pfqp *pfqp, + struct hcp_modify_qp_control_block *qqpcb, + struct h_galpa gal) +{ + return ehca_plpar_hcall_norets(H_QUERY_QP, + adapter_handle.handle, /* r4 */ + qp_handle.handle, /* r5 */ + __pa(qqpcb), /* r6 */ + 0, 0, 0, 0); +} + +u64 hipz_h_destroy_qp(const struct ipz_adapter_handle adapter_handle, + struct ehca_qp *qp) +{ + u64 ret; + unsigned long outs[PLPAR_HCALL9_BUFSIZE]; + + ret = hcp_galpas_dtor(&qp->galpas); + if (ret) { + ehca_gen_err("Could not destruct qp->galpas"); + return H_RESOURCE; + } + ret = ehca_plpar_hcall9(H_DISABLE_AND_GETC, outs, + adapter_handle.handle, /* r4 */ + /* function code */ + 1, /* r5 */ + qp->ipz_qp_handle.handle, /* r6 */ + 0, 0, 0, 0, 0, 0); + if (ret == H_HARDWARE) + ehca_gen_err("HCA not operational. ret=%lli", ret); + + ret = ehca_plpar_hcall_norets(H_FREE_RESOURCE, + adapter_handle.handle, /* r4 */ + qp->ipz_qp_handle.handle, /* r5 */ + 0, 0, 0, 0, 0); + + if (ret == H_RESOURCE) + ehca_gen_err("Resource still in use. 
ret=%lli", ret); + + return ret; +} + +u64 hipz_h_define_aqp0(const struct ipz_adapter_handle adapter_handle, + const struct ipz_qp_handle qp_handle, + struct h_galpa gal, + u32 port) +{ + return ehca_plpar_hcall_norets(H_DEFINE_AQP0, + adapter_handle.handle, /* r4 */ + qp_handle.handle, /* r5 */ + port, /* r6 */ + 0, 0, 0, 0); +} + +u64 hipz_h_define_aqp1(const struct ipz_adapter_handle adapter_handle, + const struct ipz_qp_handle qp_handle, + struct h_galpa gal, + u32 port, u32 * pma_qp_nr, + u32 * bma_qp_nr) +{ + u64 ret; + unsigned long outs[PLPAR_HCALL9_BUFSIZE]; + + ret = ehca_plpar_hcall9(H_DEFINE_AQP1, outs, + adapter_handle.handle, /* r4 */ + qp_handle.handle, /* r5 */ + port, /* r6 */ + 0, 0, 0, 0, 0, 0); + *pma_qp_nr = (u32)outs[0]; + *bma_qp_nr = (u32)outs[1]; + + if (ret == H_ALIAS_EXIST) + ehca_gen_err("AQP1 already exists. ret=%lli", ret); + + return ret; +} + +u64 hipz_h_attach_mcqp(const struct ipz_adapter_handle adapter_handle, + const struct ipz_qp_handle qp_handle, + struct h_galpa gal, + u16 mcg_dlid, + u64 subnet_prefix, u64 interface_id) +{ + u64 ret; + + ret = ehca_plpar_hcall_norets(H_ATTACH_MCQP, + adapter_handle.handle, /* r4 */ + qp_handle.handle, /* r5 */ + mcg_dlid, /* r6 */ + interface_id, /* r7 */ + subnet_prefix, /* r8 */ + 0, 0); + + if (ret == H_NOT_ENOUGH_RESOURCES) + ehca_gen_err("Not enough resources. ret=%lli", ret); + + return ret; +} + +u64 hipz_h_detach_mcqp(const struct ipz_adapter_handle adapter_handle, + const struct ipz_qp_handle qp_handle, + struct h_galpa gal, + u16 mcg_dlid, + u64 subnet_prefix, u64 interface_id) +{ + return ehca_plpar_hcall_norets(H_DETACH_MCQP, + adapter_handle.handle, /* r4 */ + qp_handle.handle, /* r5 */ + mcg_dlid, /* r6 */ + interface_id, /* r7 */ + subnet_prefix, /* r8 */ + 0, 0); +} + +u64 hipz_h_destroy_cq(const struct ipz_adapter_handle adapter_handle, + struct ehca_cq *cq, + u8 force_flag) +{ + u64 ret; + + ret = hcp_galpas_dtor(&cq->galpas); + if (ret) { + ehca_gen_err("Could not destruct cp->galpas"); + return H_RESOURCE; + } + + ret = ehca_plpar_hcall_norets(H_FREE_RESOURCE, + adapter_handle.handle, /* r4 */ + cq->ipz_cq_handle.handle, /* r5 */ + force_flag != 0 ? 1L : 0L, /* r6 */ + 0, 0, 0, 0); + + if (ret == H_RESOURCE) + ehca_gen_err("H_FREE_RESOURCE failed ret=%lli ", ret); + + return ret; +} + +u64 hipz_h_destroy_eq(const struct ipz_adapter_handle adapter_handle, + struct ehca_eq *eq) +{ + u64 ret; + + ret = hcp_galpas_dtor(&eq->galpas); + if (ret) { + ehca_gen_err("Could not destruct eq->galpas"); + return H_RESOURCE; + } + + ret = ehca_plpar_hcall_norets(H_FREE_RESOURCE, + adapter_handle.handle, /* r4 */ + eq->ipz_eq_handle.handle, /* r5 */ + 0, 0, 0, 0, 0); + + if (ret == H_RESOURCE) + ehca_gen_err("Resource in use. 
ret=%lli ", ret); + + return ret; +} + +u64 hipz_h_alloc_resource_mr(const struct ipz_adapter_handle adapter_handle, + const struct ehca_mr *mr, + const u64 vaddr, + const u64 length, + const u32 access_ctrl, + const struct ipz_pd pd, + struct ehca_mr_hipzout_parms *outparms) +{ + u64 ret; + unsigned long outs[PLPAR_HCALL9_BUFSIZE]; + + ret = ehca_plpar_hcall9(H_ALLOC_RESOURCE, outs, + adapter_handle.handle, /* r4 */ + 5, /* r5 */ + vaddr, /* r6 */ + length, /* r7 */ + (((u64)access_ctrl) << 32ULL), /* r8 */ + pd.value, /* r9 */ + 0, 0, 0); + outparms->handle.handle = outs[0]; + outparms->lkey = (u32)outs[2]; + outparms->rkey = (u32)outs[3]; + + return ret; +} + +u64 hipz_h_register_rpage_mr(const struct ipz_adapter_handle adapter_handle, + const struct ehca_mr *mr, + const u8 pagesize, + const u8 queue_type, + const u64 logical_address_of_page, + const u64 count) +{ + u64 ret; + + if (unlikely(ehca_debug_level >= 3)) { + if (count > 1) { + u64 *kpage; + int i; + kpage = __va(logical_address_of_page); + for (i = 0; i < count; i++) + ehca_gen_dbg("kpage[%d]=%p", + i, (void *)kpage[i]); + } else + ehca_gen_dbg("kpage=%p", + (void *)logical_address_of_page); + } + + if ((count > 1) && (logical_address_of_page & (EHCA_PAGESIZE-1))) { + ehca_gen_err("logical_address_of_page not on a 4k boundary " + "adapter_handle=%llx mr=%p mr_handle=%llx " + "pagesize=%x queue_type=%x " + "logical_address_of_page=%llx count=%llx", + adapter_handle.handle, mr, + mr->ipz_mr_handle.handle, pagesize, queue_type, + logical_address_of_page, count); + ret = H_PARAMETER; + } else + ret = hipz_h_register_rpage(adapter_handle, pagesize, + queue_type, + mr->ipz_mr_handle.handle, + logical_address_of_page, count); + return ret; +} + +u64 hipz_h_query_mr(const struct ipz_adapter_handle adapter_handle, + const struct ehca_mr *mr, + struct ehca_mr_hipzout_parms *outparms) +{ + u64 ret; + unsigned long outs[PLPAR_HCALL9_BUFSIZE]; + + ret = ehca_plpar_hcall9(H_QUERY_MR, outs, + adapter_handle.handle, /* r4 */ + mr->ipz_mr_handle.handle, /* r5 */ + 0, 0, 0, 0, 0, 0, 0); + outparms->len = outs[0]; + outparms->vaddr = outs[1]; + outparms->acl = outs[4] >> 32; + outparms->lkey = (u32)(outs[5] >> 32); + outparms->rkey = (u32)(outs[5] & (0xffffffff)); + + return ret; +} + +u64 hipz_h_free_resource_mr(const struct ipz_adapter_handle adapter_handle, + const struct ehca_mr *mr) +{ + return ehca_plpar_hcall_norets(H_FREE_RESOURCE, + adapter_handle.handle, /* r4 */ + mr->ipz_mr_handle.handle, /* r5 */ + 0, 0, 0, 0, 0); +} + +u64 hipz_h_reregister_pmr(const struct ipz_adapter_handle adapter_handle, + const struct ehca_mr *mr, + const u64 vaddr_in, + const u64 length, + const u32 access_ctrl, + const struct ipz_pd pd, + const u64 mr_addr_cb, + struct ehca_mr_hipzout_parms *outparms) +{ + u64 ret; + unsigned long outs[PLPAR_HCALL9_BUFSIZE]; + + ret = ehca_plpar_hcall9(H_REREGISTER_PMR, outs, + adapter_handle.handle, /* r4 */ + mr->ipz_mr_handle.handle, /* r5 */ + vaddr_in, /* r6 */ + length, /* r7 */ + /* r8 */ + ((((u64)access_ctrl) << 32ULL) | pd.value), + mr_addr_cb, /* r9 */ + 0, 0, 0); + outparms->vaddr = outs[1]; + outparms->lkey = (u32)outs[2]; + outparms->rkey = (u32)outs[3]; + + return ret; +} + +u64 hipz_h_register_smr(const struct ipz_adapter_handle adapter_handle, + const struct ehca_mr *mr, + const struct ehca_mr *orig_mr, + const u64 vaddr_in, + const u32 access_ctrl, + const struct ipz_pd pd, + struct ehca_mr_hipzout_parms *outparms) +{ + u64 ret; + unsigned long outs[PLPAR_HCALL9_BUFSIZE]; + + ret = 
ehca_plpar_hcall9(H_REGISTER_SMR, outs, + adapter_handle.handle, /* r4 */ + orig_mr->ipz_mr_handle.handle, /* r5 */ + vaddr_in, /* r6 */ + (((u64)access_ctrl) << 32ULL), /* r7 */ + pd.value, /* r8 */ + 0, 0, 0, 0); + outparms->handle.handle = outs[0]; + outparms->lkey = (u32)outs[2]; + outparms->rkey = (u32)outs[3]; + + return ret; +} + +u64 hipz_h_alloc_resource_mw(const struct ipz_adapter_handle adapter_handle, + const struct ehca_mw *mw, + const struct ipz_pd pd, + struct ehca_mw_hipzout_parms *outparms) +{ + u64 ret; + unsigned long outs[PLPAR_HCALL9_BUFSIZE]; + + ret = ehca_plpar_hcall9(H_ALLOC_RESOURCE, outs, + adapter_handle.handle, /* r4 */ + 6, /* r5 */ + pd.value, /* r6 */ + 0, 0, 0, 0, 0, 0); + outparms->handle.handle = outs[0]; + outparms->rkey = (u32)outs[3]; + + return ret; +} + +u64 hipz_h_query_mw(const struct ipz_adapter_handle adapter_handle, + const struct ehca_mw *mw, + struct ehca_mw_hipzout_parms *outparms) +{ + u64 ret; + unsigned long outs[PLPAR_HCALL9_BUFSIZE]; + + ret = ehca_plpar_hcall9(H_QUERY_MW, outs, + adapter_handle.handle, /* r4 */ + mw->ipz_mw_handle.handle, /* r5 */ + 0, 0, 0, 0, 0, 0, 0); + outparms->rkey = (u32)outs[3]; + + return ret; +} + +u64 hipz_h_free_resource_mw(const struct ipz_adapter_handle adapter_handle, + const struct ehca_mw *mw) +{ + return ehca_plpar_hcall_norets(H_FREE_RESOURCE, + adapter_handle.handle, /* r4 */ + mw->ipz_mw_handle.handle, /* r5 */ + 0, 0, 0, 0, 0); +} + +u64 hipz_h_error_data(const struct ipz_adapter_handle adapter_handle, + const u64 ressource_handle, + void *rblock, + unsigned long *byte_count) +{ + u64 r_cb = __pa(rblock); + + if (r_cb & (EHCA_PAGESIZE-1)) { + ehca_gen_err("rblock not page aligned."); + return H_PARAMETER; + } + + return ehca_plpar_hcall_norets(H_ERROR_DATA, + adapter_handle.handle, + ressource_handle, + r_cb, + 0, 0, 0, 0); +} + +u64 hipz_h_eoi(int irq) +{ + unsigned long xirr; + + iosync(); + xirr = (0xffULL << 24) | irq; + + return plpar_hcall_norets(H_EOI, xirr); +} diff --git a/kernel/drivers/infiniband/hw/ehca/hcp_if.h b/kernel/drivers/infiniband/hw/ehca/hcp_if.h new file mode 100644 index 000000000..a46e514c3 --- /dev/null +++ b/kernel/drivers/infiniband/hw/ehca/hcp_if.h @@ -0,0 +1,265 @@ +/* + * IBM eServer eHCA Infiniband device driver for Linux on POWER + * + * Firmware Infiniband Interface code for POWER + * + * Authors: Christoph Raisch + * Hoang-Nam Nguyen + * Gerd Bayer + * Waleri Fomin + * + * Copyright (c) 2005 IBM Corporation + * + * All rights reserved. + * + * This source code is distributed under a dual license of GPL v2.0 and OpenIB + * BSD. + * + * OpenIB BSD License + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials + * provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
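Both hcall wrappers in hcp_if.c above share the same shape: try the hcall up to five times, sleep for the firmware-suggested interval whenever a long-busy code comes back, optionally serialize everything behind hcall_lock as a firmware workaround, and finally give up with H_BUSY. The userspace-flavoured sketch below shows only that retry skeleton; fake_hcall() and the MY_H_* codes are stand-ins for plpar_hcall_norets() and the real hypervisor return values.

#include <stdio.h>

#define MY_H_SUCCESS        0
#define MY_H_LONG_BUSY_10MS 9901    /* illustrative stand-in for an H_IS_LONG_BUSY() code */
#define MY_H_BUSY           1

static int busy_left = 2;           /* pretend firmware is busy twice, then succeeds */

static long fake_hcall(void)
{
        return busy_left-- > 0 ? MY_H_LONG_BUSY_10MS : MY_H_SUCCESS;
}

/* Same control flow as ehca_plpar_hcall_norets()/ehca_plpar_hcall9():
 * retry a bounded number of times on "long busy", back off in between,
 * and report busy if the call never completes.
 */
static long retrying_hcall(void)
{
        int i;
        long ret;

        for (i = 0; i < 5; i++) {
                ret = fake_hcall();
                if (ret == MY_H_LONG_BUSY_10MS) {
                        /* the kernel code sleeps get_longbusy_msecs(ret) here */
                        continue;
                }
                return ret;
        }
        return MY_H_BUSY;
}

int main(void)
{
        printf("hcall returned %ld\n", retrying_hcall());
        return 0;
}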
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR + * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER + * IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef __HCP_IF_H__ +#define __HCP_IF_H__ + +#include "ehca_classes.h" +#include "ehca_tools.h" +#include "hipz_hw.h" + +/* + * hipz_h_alloc_resource_eq allocates EQ resources in HW and FW, initialize + * resources, create the empty EQPT (ring). + */ +u64 hipz_h_alloc_resource_eq(const struct ipz_adapter_handle adapter_handle, + struct ehca_pfeq *pfeq, + const u32 neq_control, + const u32 number_of_entries, + struct ipz_eq_handle *eq_handle, + u32 * act_nr_of_entries, + u32 * act_pages, + u32 * eq_ist); + +u64 hipz_h_reset_event(const struct ipz_adapter_handle adapter_handle, + struct ipz_eq_handle eq_handle, + const u64 event_mask); +/* + * hipz_h_allocate_resource_cq allocates CQ resources in HW and FW, initialize + * resources, create the empty CQPT (ring). + */ +u64 hipz_h_alloc_resource_cq(const struct ipz_adapter_handle adapter_handle, + struct ehca_cq *cq, + struct ehca_alloc_cq_parms *param); + + +/* + * hipz_h_alloc_resource_qp allocates QP resources in HW and FW, + * initialize resources, create empty QPPTs (2 rings). + */ +u64 hipz_h_alloc_resource_qp(const struct ipz_adapter_handle adapter_handle, + struct ehca_alloc_qp_parms *parms, int is_user); + +u64 hipz_h_query_port(const struct ipz_adapter_handle adapter_handle, + const u8 port_id, + struct hipz_query_port *query_port_response_block); + +u64 hipz_h_modify_port(const struct ipz_adapter_handle adapter_handle, + const u8 port_id, const u32 port_cap, + const u8 init_type, const int modify_mask); + +u64 hipz_h_query_hca(const struct ipz_adapter_handle adapter_handle, + struct hipz_query_hca *query_hca_rblock); + +/* + * hipz_h_register_rpage internal function in hcp_if.h for all + * hcp_H_REGISTER_RPAGE calls. 
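As the comment above notes, hipz_h_register_rpage() is the common backend for every H_REGISTER_RPAGES variant; in hcp_if.c it packs the queue type into the low byte of r5 and the page-size code into the next byte. A trivial sketch of that packing follows; the concrete queue_type and pagesize codes are defined by the firmware interface, so the values used here are arbitrary.

#include <assert.h>
#include <stdint.h>

/* r5 layout used by hipz_h_register_rpage() in hcp_if.c above */
static uint64_t pack_rpage_r5(uint8_t queue_type, uint8_t pagesize)
{
        return (uint64_t)queue_type | ((uint64_t)pagesize << 8);
}

int main(void)
{
        assert(pack_rpage_r5(1, 2) == 0x0201);
        return 0;
}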
+ */ +u64 hipz_h_register_rpage(const struct ipz_adapter_handle adapter_handle, + const u8 pagesize, + const u8 queue_type, + const u64 resource_handle, + const u64 logical_address_of_page, + u64 count); + +u64 hipz_h_register_rpage_eq(const struct ipz_adapter_handle adapter_handle, + const struct ipz_eq_handle eq_handle, + struct ehca_pfeq *pfeq, + const u8 pagesize, + const u8 queue_type, + const u64 logical_address_of_page, + const u64 count); + +u64 hipz_h_query_int_state(const struct ipz_adapter_handle + hcp_adapter_handle, + u32 ist); + +u64 hipz_h_register_rpage_cq(const struct ipz_adapter_handle adapter_handle, + const struct ipz_cq_handle cq_handle, + struct ehca_pfcq *pfcq, + const u8 pagesize, + const u8 queue_type, + const u64 logical_address_of_page, + const u64 count, + const struct h_galpa gal); + +u64 hipz_h_register_rpage_qp(const struct ipz_adapter_handle adapter_handle, + const struct ipz_qp_handle qp_handle, + struct ehca_pfqp *pfqp, + const u8 pagesize, + const u8 queue_type, + const u64 logical_address_of_page, + const u64 count, + const struct h_galpa galpa); + +u64 hipz_h_disable_and_get_wqe(const struct ipz_adapter_handle adapter_handle, + const struct ipz_qp_handle qp_handle, + struct ehca_pfqp *pfqp, + void **log_addr_next_sq_wqe_tb_processed, + void **log_addr_next_rq_wqe_tb_processed, + int dis_and_get_function_code); +enum hcall_sigt { + HCALL_SIGT_NO_CQE = 0, + HCALL_SIGT_BY_WQE = 1, + HCALL_SIGT_EVERY = 2 +}; + +u64 hipz_h_modify_qp(const struct ipz_adapter_handle adapter_handle, + const struct ipz_qp_handle qp_handle, + struct ehca_pfqp *pfqp, + const u64 update_mask, + struct hcp_modify_qp_control_block *mqpcb, + struct h_galpa gal); + +u64 hipz_h_query_qp(const struct ipz_adapter_handle adapter_handle, + const struct ipz_qp_handle qp_handle, + struct ehca_pfqp *pfqp, + struct hcp_modify_qp_control_block *qqpcb, + struct h_galpa gal); + +u64 hipz_h_destroy_qp(const struct ipz_adapter_handle adapter_handle, + struct ehca_qp *qp); + +u64 hipz_h_define_aqp0(const struct ipz_adapter_handle adapter_handle, + const struct ipz_qp_handle qp_handle, + struct h_galpa gal, + u32 port); + +u64 hipz_h_define_aqp1(const struct ipz_adapter_handle adapter_handle, + const struct ipz_qp_handle qp_handle, + struct h_galpa gal, + u32 port, u32 * pma_qp_nr, + u32 * bma_qp_nr); + +u64 hipz_h_attach_mcqp(const struct ipz_adapter_handle adapter_handle, + const struct ipz_qp_handle qp_handle, + struct h_galpa gal, + u16 mcg_dlid, + u64 subnet_prefix, u64 interface_id); + +u64 hipz_h_detach_mcqp(const struct ipz_adapter_handle adapter_handle, + const struct ipz_qp_handle qp_handle, + struct h_galpa gal, + u16 mcg_dlid, + u64 subnet_prefix, u64 interface_id); + +u64 hipz_h_destroy_cq(const struct ipz_adapter_handle adapter_handle, + struct ehca_cq *cq, + u8 force_flag); + +u64 hipz_h_destroy_eq(const struct ipz_adapter_handle adapter_handle, + struct ehca_eq *eq); + +/* + * hipz_h_alloc_resource_mr allocates MR resources in HW and FW, initialize + * resources. 
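The MR hcalls above use two conventions for the access-control and protection-domain arguments: H_ALLOC_RESOURCE and H_REGISTER_SMR pass access_ctrl (shifted into the upper 32 bits) and pd.value in separate registers, while H_REREGISTER_PMR folds both into a single register. A minimal sketch of the combined form:

#include <assert.h>
#include <stdint.h>

/* r8 of H_REREGISTER_PMR as built in hcp_if.c above:
 * access control flags in the high 32 bits, protection domain in the low 32.
 */
static uint64_t pack_acl_pd(uint32_t access_ctrl, uint32_t pd_value)
{
        return ((uint64_t)access_ctrl << 32) | pd_value;
}

int main(void)
{
        uint64_t r8 = pack_acl_pd(0x7, 0x1234);

        assert((uint32_t)(r8 >> 32) == 0x7);     /* access_ctrl */
        assert((uint32_t)r8 == 0x1234);          /* pd.value    */
        return 0;
}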
+ */ +u64 hipz_h_alloc_resource_mr(const struct ipz_adapter_handle adapter_handle, + const struct ehca_mr *mr, + const u64 vaddr, + const u64 length, + const u32 access_ctrl, + const struct ipz_pd pd, + struct ehca_mr_hipzout_parms *outparms); + +/* hipz_h_register_rpage_mr registers MR resource pages in HW and FW */ +u64 hipz_h_register_rpage_mr(const struct ipz_adapter_handle adapter_handle, + const struct ehca_mr *mr, + const u8 pagesize, + const u8 queue_type, + const u64 logical_address_of_page, + const u64 count); + +/* hipz_h_query_mr queries MR in HW and FW */ +u64 hipz_h_query_mr(const struct ipz_adapter_handle adapter_handle, + const struct ehca_mr *mr, + struct ehca_mr_hipzout_parms *outparms); + +/* hipz_h_free_resource_mr frees MR resources in HW and FW */ +u64 hipz_h_free_resource_mr(const struct ipz_adapter_handle adapter_handle, + const struct ehca_mr *mr); + +/* hipz_h_reregister_pmr reregisters MR in HW and FW */ +u64 hipz_h_reregister_pmr(const struct ipz_adapter_handle adapter_handle, + const struct ehca_mr *mr, + const u64 vaddr_in, + const u64 length, + const u32 access_ctrl, + const struct ipz_pd pd, + const u64 mr_addr_cb, + struct ehca_mr_hipzout_parms *outparms); + +/* hipz_h_register_smr register shared MR in HW and FW */ +u64 hipz_h_register_smr(const struct ipz_adapter_handle adapter_handle, + const struct ehca_mr *mr, + const struct ehca_mr *orig_mr, + const u64 vaddr_in, + const u32 access_ctrl, + const struct ipz_pd pd, + struct ehca_mr_hipzout_parms *outparms); + +/* + * hipz_h_alloc_resource_mw allocates MW resources in HW and FW, initialize + * resources. + */ +u64 hipz_h_alloc_resource_mw(const struct ipz_adapter_handle adapter_handle, + const struct ehca_mw *mw, + const struct ipz_pd pd, + struct ehca_mw_hipzout_parms *outparms); + +/* hipz_h_query_mw queries MW in HW and FW */ +u64 hipz_h_query_mw(const struct ipz_adapter_handle adapter_handle, + const struct ehca_mw *mw, + struct ehca_mw_hipzout_parms *outparms); + +/* hipz_h_free_resource_mw frees MW resources in HW and FW */ +u64 hipz_h_free_resource_mw(const struct ipz_adapter_handle adapter_handle, + const struct ehca_mw *mw); + +u64 hipz_h_error_data(const struct ipz_adapter_handle adapter_handle, + const u64 ressource_handle, + void *rblock, + unsigned long *byte_count); +u64 hipz_h_eoi(int irq); + +#endif /* __HCP_IF_H__ */ diff --git a/kernel/drivers/infiniband/hw/ehca/hcp_phyp.c b/kernel/drivers/infiniband/hw/ehca/hcp_phyp.c new file mode 100644 index 000000000..077376ff3 --- /dev/null +++ b/kernel/drivers/infiniband/hw/ehca/hcp_phyp.c @@ -0,0 +1,82 @@ +/* + * IBM eServer eHCA Infiniband device driver for Linux on POWER + * + * load store abstraction for ehca register access with tracing + * + * Authors: Christoph Raisch + * Hoang-Nam Nguyen + * + * Copyright (c) 2005 IBM Corporation + * + * All rights reserved. + * + * This source code is distributed under a dual license of GPL v2.0 and OpenIB + * BSD. + * + * OpenIB BSD License + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials + * provided with the distribution. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR + * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER + * IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#include "ehca_classes.h" +#include "hipz_hw.h" + +u64 hcall_map_page(u64 physaddr) +{ + return (u64)ioremap(physaddr, EHCA_PAGESIZE); +} + +int hcall_unmap_page(u64 mapaddr) +{ + iounmap((volatile void __iomem *) mapaddr); + return 0; +} + +int hcp_galpas_ctor(struct h_galpas *galpas, int is_user, + u64 paddr_kernel, u64 paddr_user) +{ + if (!is_user) { + galpas->kernel.fw_handle = hcall_map_page(paddr_kernel); + if (!galpas->kernel.fw_handle) + return -ENOMEM; + } else + galpas->kernel.fw_handle = 0; + + galpas->user.fw_handle = paddr_user; + + return 0; +} + +int hcp_galpas_dtor(struct h_galpas *galpas) +{ + if (galpas->kernel.fw_handle) { + int ret = hcall_unmap_page(galpas->kernel.fw_handle); + if (ret) + return ret; + } + + galpas->user.fw_handle = galpas->kernel.fw_handle = 0; + + return 0; +} diff --git a/kernel/drivers/infiniband/hw/ehca/hcp_phyp.h b/kernel/drivers/infiniband/hw/ehca/hcp_phyp.h new file mode 100644 index 000000000..d1b029910 --- /dev/null +++ b/kernel/drivers/infiniband/hw/ehca/hcp_phyp.h @@ -0,0 +1,90 @@ +/* + * IBM eServer eHCA Infiniband device driver for Linux on POWER + * + * Firmware calls + * + * Authors: Christoph Raisch + * Hoang-Nam Nguyen + * Waleri Fomin + * Gerd Bayer + * + * Copyright (c) 2005 IBM Corporation + * + * All rights reserved. + * + * This source code is distributed under a dual license of GPL v2.0 and OpenIB + * BSD. + * + * OpenIB BSD License + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials + * provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR + * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER + * IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
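hcp_galpas_ctor() above only ioremap()s the kernel-side register page when is_user is zero; for user-space QPs it just records paddr_user and leaves kernel.fw_handle at 0, which is the condition hcp_galpas_dtor() later checks before unmapping. A minimal user-space sketch of that pairing follows; the model_* names and the static buffer standing in for ioremap()/iounmap() are illustrative assumptions, not driver code.

#include <stdint.h>
#include <stdio.h>

struct model_galpa  { uint64_t fw_handle; };
struct model_galpas { struct model_galpa user, kernel; };

static uint8_t fake_register_page[4096];    /* plays the role of the eHCA register page */

static uint64_t model_map_page(uint64_t physaddr)
{
	(void)physaddr;                     /* the driver would ioremap(physaddr, EHCA_PAGESIZE) */
	return (uint64_t)(uintptr_t)fake_register_page;
}

static void model_unmap_page(uint64_t mapaddr)
{
	(void)mapaddr;                      /* the driver would iounmap() here */
}

/* same shape as hcp_galpas_ctor(): map only the kernel-owned page */
static void model_galpas_ctor(struct model_galpas *g, int is_user,
			      uint64_t paddr_kernel, uint64_t paddr_user)
{
	g->kernel.fw_handle = is_user ? 0 : model_map_page(paddr_kernel);
	g->user.fw_handle = paddr_user;
}

/* same shape as hcp_galpas_dtor(): only an existing kernel mapping is undone */
static void model_galpas_dtor(struct model_galpas *g)
{
	if (g->kernel.fw_handle)
		model_unmap_page(g->kernel.fw_handle);
	g->user.fw_handle = g->kernel.fw_handle = 0;
}

int main(void)
{
	struct model_galpas g;

	model_galpas_ctor(&g, 0, 0x1000, 0x2000);   /* kernel-owned QP: the page gets mapped */
	printf("kernel handle: %#llx\n", (unsigned long long)g.kernel.fw_handle);
	model_galpas_dtor(&g);
	return 0;
}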
+ */ + +#ifndef __HCP_PHYP_H__ +#define __HCP_PHYP_H__ + + +/* + * eHCA page (mapped into memory) + * resource to access eHCA register pages in CPU address space +*/ +struct h_galpa { + u64 fw_handle; + /* for pSeries this is a 64bit memory address where + I/O memory is mapped into CPU address space (kv) */ +}; + +/* + * resource to access eHCA address space registers, all types + */ +struct h_galpas { + u32 pid; /*PID of userspace galpa checking */ + struct h_galpa user; /* user space accessible resource, + set to 0 if unused */ + struct h_galpa kernel; /* kernel space accessible resource, + set to 0 if unused */ +}; + +static inline u64 hipz_galpa_load(struct h_galpa galpa, u32 offset) +{ + u64 addr = galpa.fw_handle + offset; + return *(volatile u64 __force *)addr; +} + +static inline void hipz_galpa_store(struct h_galpa galpa, u32 offset, u64 value) +{ + u64 addr = galpa.fw_handle + offset; + *(volatile u64 __force *)addr = value; +} + +int hcp_galpas_ctor(struct h_galpas *galpas, int is_user, + u64 paddr_kernel, u64 paddr_user); + +int hcp_galpas_dtor(struct h_galpas *galpas); + +u64 hcall_map_page(u64 physaddr); + +int hcall_unmap_page(u64 mapaddr); + +#endif diff --git a/kernel/drivers/infiniband/hw/ehca/hipz_fns.h b/kernel/drivers/infiniband/hw/ehca/hipz_fns.h new file mode 100644 index 000000000..9dac93d02 --- /dev/null +++ b/kernel/drivers/infiniband/hw/ehca/hipz_fns.h @@ -0,0 +1,68 @@ +/* + * IBM eServer eHCA Infiniband device driver for Linux on POWER + * + * HW abstraction register functions + * + * Authors: Christoph Raisch + * Reinhard Ernst + * + * Copyright (c) 2005 IBM Corporation + * + * All rights reserved. + * + * This source code is distributed under a dual license of GPL v2.0 and OpenIB + * BSD. + * + * OpenIB BSD License + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials + * provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR + * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER + * IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#ifndef __HIPZ_FNS_H__ +#define __HIPZ_FNS_H__ + +#include "ehca_classes.h" +#include "hipz_hw.h" + +#include "hipz_fns_core.h" + +#define hipz_galpa_store_eq(gal, offset, value) \ + hipz_galpa_store(gal, EQTEMM_OFFSET(offset), value) + +#define hipz_galpa_load_eq(gal, offset) \ + hipz_galpa_load(gal, EQTEMM_OFFSET(offset)) + +#define hipz_galpa_store_qped(gal, offset, value) \ + hipz_galpa_store(gal, QPEDMM_OFFSET(offset), value) + +#define hipz_galpa_load_qped(gal, offset) \ + hipz_galpa_load(gal, QPEDMM_OFFSET(offset)) + +#define hipz_galpa_store_mrmw(gal, offset, value) \ + hipz_galpa_store(gal, MRMWMM_OFFSET(offset), value) + +#define hipz_galpa_load_mrmw(gal, offset) \ + hipz_galpa_load(gal, MRMWMM_OFFSET(offset)) + +#endif diff --git a/kernel/drivers/infiniband/hw/ehca/hipz_fns_core.h b/kernel/drivers/infiniband/hw/ehca/hipz_fns_core.h new file mode 100644 index 000000000..868735fd3 --- /dev/null +++ b/kernel/drivers/infiniband/hw/ehca/hipz_fns_core.h @@ -0,0 +1,100 @@ +/* + * IBM eServer eHCA Infiniband device driver for Linux on POWER + * + * HW abstraction register functions + * + * Authors: Christoph Raisch + * Heiko J Schick + * Hoang-Nam Nguyen + * Reinhard Ernst + * + * Copyright (c) 2005 IBM Corporation + * + * All rights reserved. + * + * This source code is distributed under a dual license of GPL v2.0 and OpenIB + * BSD. + * + * OpenIB BSD License + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials + * provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR + * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER + * IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#ifndef __HIPZ_FNS_CORE_H__ +#define __HIPZ_FNS_CORE_H__ + +#include "hcp_phyp.h" +#include "hipz_hw.h" + +#define hipz_galpa_store_cq(gal, offset, value) \ + hipz_galpa_store(gal, CQTEMM_OFFSET(offset), value) + +#define hipz_galpa_load_cq(gal, offset) \ + hipz_galpa_load(gal, CQTEMM_OFFSET(offset)) + +#define hipz_galpa_store_qp(gal, offset, value) \ + hipz_galpa_store(gal, QPTEMM_OFFSET(offset), value) +#define hipz_galpa_load_qp(gal, offset) \ + hipz_galpa_load(gal, QPTEMM_OFFSET(offset)) + +static inline void hipz_update_sqa(struct ehca_qp *qp, u16 nr_wqes) +{ + /* ringing doorbell :-) */ + hipz_galpa_store_qp(qp->galpas.kernel, qpx_sqa, + EHCA_BMASK_SET(QPX_SQADDER, nr_wqes)); +} + +static inline void hipz_update_rqa(struct ehca_qp *qp, u16 nr_wqes) +{ + /* ringing doorbell :-) */ + hipz_galpa_store_qp(qp->galpas.kernel, qpx_rqa, + EHCA_BMASK_SET(QPX_RQADDER, nr_wqes)); +} + +static inline void hipz_update_feca(struct ehca_cq *cq, u32 nr_cqes) +{ + hipz_galpa_store_cq(cq->galpas.kernel, cqx_feca, + EHCA_BMASK_SET(CQX_FECADDER, nr_cqes)); +} + +static inline void hipz_set_cqx_n0(struct ehca_cq *cq, u32 value) +{ + u64 cqx_n0_reg; + + hipz_galpa_store_cq(cq->galpas.kernel, cqx_n0, + EHCA_BMASK_SET(CQX_N0_GENERATE_SOLICITED_COMP_EVENT, + value)); + cqx_n0_reg = hipz_galpa_load_cq(cq->galpas.kernel, cqx_n0); +} + +static inline void hipz_set_cqx_n1(struct ehca_cq *cq, u32 value) +{ + u64 cqx_n1_reg; + + hipz_galpa_store_cq(cq->galpas.kernel, cqx_n1, + EHCA_BMASK_SET(CQX_N1_GENERATE_COMP_EVENT, value)); + cqx_n1_reg = hipz_galpa_load_cq(cq->galpas.kernel, cqx_n1); +} + +#endif /* __HIPZ_FNC_CORE_H__ */ diff --git a/kernel/drivers/infiniband/hw/ehca/hipz_hw.h b/kernel/drivers/infiniband/hw/ehca/hipz_hw.h new file mode 100644 index 000000000..bf996c7ac --- /dev/null +++ b/kernel/drivers/infiniband/hw/ehca/hipz_hw.h @@ -0,0 +1,414 @@ +/* + * IBM eServer eHCA Infiniband device driver for Linux on POWER + * + * eHCA register definitions + * + * Authors: Waleri Fomin + * Christoph Raisch + * Reinhard Ernst + * + * Copyright (c) 2005 IBM Corporation + * + * All rights reserved. + * + * This source code is distributed under a dual license of GPL v2.0 and OpenIB + * BSD. + * + * OpenIB BSD License + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials + * provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR + * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER + * IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
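hipz_update_sqa() and the other helpers in hipz_fns_core.h above are the software half of the doorbell: compose a 64-bit word with EHCA_BMASK_SET() and store it at a fixed offset inside the QP's mapped register page via hipz_galpa_store(). The sketch below models that path in plain C; the 0x20 offset mirrors the qpx_sqa slot in the hipz_qptemm map defined further down, and the low-16-bit mask is one reading of EHCA_BMASK_IBM(48, 63) under IBM bit numbering (bit 0 = MSB), so both the mask macro and the buffer are assumptions rather than the driver's own macros.

#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define MODEL_QPX_SQA  0x20u        /* offset of qpx_sqa in the QP register map */
#define MODEL_SQADDER  0xffffULL    /* assumed span of EHCA_BMASK_IBM(48, 63): low 16 bits */

struct model_galpa { uint8_t *fw_handle; };    /* cf. struct h_galpa */

/* cf. hipz_galpa_store(): a 64-bit store at fw_handle + offset */
static void model_galpa_store(struct model_galpa g, uint32_t offset, uint64_t value)
{
	memcpy(g.fw_handle + offset, &value, sizeof(value));
}

/* cf. hipz_update_sqa(): announce nr_wqes new send WQEs to the adapter */
static void model_update_sqa(struct model_galpa g, uint16_t nr_wqes)
{
	model_galpa_store(g, MODEL_QPX_SQA, (uint64_t)nr_wqes & MODEL_SQADDER);
}

int main(void)
{
	static uint8_t register_page[0x1000];    /* stands in for the ioremap()ed galpa page */
	struct model_galpa g = { register_page };
	uint64_t readback;

	model_update_sqa(g, 3);                  /* "ring the doorbell" for 3 WQEs */
	memcpy(&readback, register_page + MODEL_QPX_SQA, sizeof(readback));
	printf("qpx_sqa doorbell word: %llu\n", (unsigned long long)readback);
	return 0;
}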
+ */ + +#ifndef __HIPZ_HW_H__ +#define __HIPZ_HW_H__ + +#include "ehca_tools.h" + +#define EHCA_MAX_MTU 4 + +/* QP Table Entry Memory Map */ +struct hipz_qptemm { + u64 qpx_hcr; + u64 qpx_c; + u64 qpx_herr; + u64 qpx_aer; +/* 0x20*/ + u64 qpx_sqa; + u64 qpx_sqc; + u64 qpx_rqa; + u64 qpx_rqc; +/* 0x40*/ + u64 qpx_st; + u64 qpx_pmstate; + u64 qpx_pmfa; + u64 qpx_pkey; +/* 0x60*/ + u64 qpx_pkeya; + u64 qpx_pkeyb; + u64 qpx_pkeyc; + u64 qpx_pkeyd; +/* 0x80*/ + u64 qpx_qkey; + u64 qpx_dqp; + u64 qpx_dlidp; + u64 qpx_portp; +/* 0xa0*/ + u64 qpx_slidp; + u64 qpx_slidpp; + u64 qpx_dlida; + u64 qpx_porta; +/* 0xc0*/ + u64 qpx_slida; + u64 qpx_slidpa; + u64 qpx_slvl; + u64 qpx_ipd; +/* 0xe0*/ + u64 qpx_mtu; + u64 qpx_lato; + u64 qpx_rlimit; + u64 qpx_rnrlimit; +/* 0x100*/ + u64 qpx_t; + u64 qpx_sqhp; + u64 qpx_sqptp; + u64 qpx_nspsn; +/* 0x120*/ + u64 qpx_nspsnhwm; + u64 reserved1; + u64 qpx_sdsi; + u64 qpx_sdsbc; +/* 0x140*/ + u64 qpx_sqwsize; + u64 qpx_sqwts; + u64 qpx_lsn; + u64 qpx_nssn; +/* 0x160 */ + u64 qpx_mor; + u64 qpx_cor; + u64 qpx_sqsize; + u64 qpx_erc; +/* 0x180*/ + u64 qpx_rnrrc; + u64 qpx_ernrwt; + u64 qpx_rnrresp; + u64 qpx_lmsna; +/* 0x1a0 */ + u64 qpx_sqhpc; + u64 qpx_sqcptp; + u64 qpx_sigt; + u64 qpx_wqecnt; +/* 0x1c0*/ + u64 qpx_rqhp; + u64 qpx_rqptp; + u64 qpx_rqsize; + u64 qpx_nrr; +/* 0x1e0*/ + u64 qpx_rdmac; + u64 qpx_nrpsn; + u64 qpx_lapsn; + u64 qpx_lcr; +/* 0x200*/ + u64 qpx_rwc; + u64 qpx_rwva; + u64 qpx_rdsi; + u64 qpx_rdsbc; +/* 0x220*/ + u64 qpx_rqwsize; + u64 qpx_crmsn; + u64 qpx_rdd; + u64 qpx_larpsn; +/* 0x240*/ + u64 qpx_pd; + u64 qpx_scqn; + u64 qpx_rcqn; + u64 qpx_aeqn; +/* 0x260*/ + u64 qpx_aaelog; + u64 qpx_ram; + u64 qpx_rdmaqe0; + u64 qpx_rdmaqe1; +/* 0x280*/ + u64 qpx_rdmaqe2; + u64 qpx_rdmaqe3; + u64 qpx_nrpsnhwm; +/* 0x298*/ + u64 reserved[(0x400 - 0x298) / 8]; +/* 0x400 extended data */ + u64 reserved_ext[(0x500 - 0x400) / 8]; +/* 0x500 */ + u64 reserved2[(0x1000 - 0x500) / 8]; +/* 0x1000 */ +}; + +#define QPX_SQADDER EHCA_BMASK_IBM(48, 63) +#define QPX_RQADDER EHCA_BMASK_IBM(48, 63) +#define QPX_AAELOG_RESET_SRQ_LIMIT EHCA_BMASK_IBM(3, 3) + +#define QPTEMM_OFFSET(x) offsetof(struct hipz_qptemm, x) + +/* MRMWPT Entry Memory Map */ +struct hipz_mrmwmm { + /* 0x00 */ + u64 mrx_hcr; + + u64 mrx_c; + u64 mrx_herr; + u64 mrx_aer; + /* 0x20 */ + u64 mrx_pp; + u64 reserved1; + u64 reserved2; + u64 reserved3; + /* 0x40 */ + u64 reserved4[(0x200 - 0x40) / 8]; + /* 0x200 */ + u64 mrx_ctl[64]; + +}; + +#define MRMWMM_OFFSET(x) offsetof(struct hipz_mrmwmm, x) + +struct hipz_qpedmm { + /* 0x00 */ + u64 reserved0[(0x400) / 8]; + /* 0x400 */ + u64 qpedx_phh; + u64 qpedx_ppsgp; + /* 0x410 */ + u64 qpedx_ppsgu; + u64 qpedx_ppdgp; + /* 0x420 */ + u64 qpedx_ppdgu; + u64 qpedx_aph; + /* 0x430 */ + u64 qpedx_apsgp; + u64 qpedx_apsgu; + /* 0x440 */ + u64 qpedx_apdgp; + u64 qpedx_apdgu; + /* 0x450 */ + u64 qpedx_apav; + u64 qpedx_apsav; + /* 0x460 */ + u64 qpedx_hcr; + u64 reserved1[4]; + /* 0x488 */ + u64 qpedx_rrl0; + /* 0x490 */ + u64 qpedx_rrrkey0; + u64 qpedx_rrva0; + /* 0x4a0 */ + u64 reserved2; + u64 qpedx_rrl1; + /* 0x4b0 */ + u64 qpedx_rrrkey1; + u64 qpedx_rrva1; + /* 0x4c0 */ + u64 reserved3; + u64 qpedx_rrl2; + /* 0x4d0 */ + u64 qpedx_rrrkey2; + u64 qpedx_rrva2; + /* 0x4e0 */ + u64 reserved4; + u64 qpedx_rrl3; + /* 0x4f0 */ + u64 qpedx_rrrkey3; + u64 qpedx_rrva3; +}; + +#define QPEDMM_OFFSET(x) offsetof(struct hipz_qpedmm, x) + +/* CQ Table Entry Memory Map */ +struct hipz_cqtemm { + u64 cqx_hcr; + u64 cqx_c; + u64 cqx_herr; + u64 cqx_aer; +/* 0x20 */ + u64 cqx_ptp; 
+ u64 cqx_tp; + u64 cqx_fec; + u64 cqx_feca; +/* 0x40 */ + u64 cqx_ep; + u64 cqx_eq; +/* 0x50 */ + u64 reserved1; + u64 cqx_n0; +/* 0x60 */ + u64 cqx_n1; + u64 reserved2[(0x1000 - 0x60) / 8]; +/* 0x1000 */ +}; + +#define CQX_FEC_CQE_CNT EHCA_BMASK_IBM(32, 63) +#define CQX_FECADDER EHCA_BMASK_IBM(32, 63) +#define CQX_N0_GENERATE_SOLICITED_COMP_EVENT EHCA_BMASK_IBM(0, 0) +#define CQX_N1_GENERATE_COMP_EVENT EHCA_BMASK_IBM(0, 0) + +#define CQTEMM_OFFSET(x) offsetof(struct hipz_cqtemm, x) + +/* EQ Table Entry Memory Map */ +struct hipz_eqtemm { + u64 eqx_hcr; + u64 eqx_c; + + u64 eqx_herr; + u64 eqx_aer; +/* 0x20 */ + u64 eqx_ptp; + u64 eqx_tp; + u64 eqx_ssba; + u64 eqx_psba; + +/* 0x40 */ + u64 eqx_cec; + u64 eqx_meql; + u64 eqx_xisbi; + u64 eqx_xisc; +/* 0x60 */ + u64 eqx_it; + +}; + +#define EQTEMM_OFFSET(x) offsetof(struct hipz_eqtemm, x) + +/* access control defines for MR/MW */ +#define HIPZ_ACCESSCTRL_L_WRITE 0x00800000 +#define HIPZ_ACCESSCTRL_R_WRITE 0x00400000 +#define HIPZ_ACCESSCTRL_R_READ 0x00200000 +#define HIPZ_ACCESSCTRL_R_ATOMIC 0x00100000 +#define HIPZ_ACCESSCTRL_MW_BIND 0x00080000 + +/* query hca response block */ +struct hipz_query_hca { + u32 cur_reliable_dg; + u32 cur_qp; + u32 cur_cq; + u32 cur_eq; + u32 cur_mr; + u32 cur_mw; + u32 cur_ee_context; + u32 cur_mcast_grp; + u32 cur_qp_attached_mcast_grp; + u32 reserved1; + u32 cur_ipv6_qp; + u32 cur_eth_qp; + u32 cur_hp_mr; + u32 reserved2[3]; + u32 max_rd_domain; + u32 max_qp; + u32 max_cq; + u32 max_eq; + u32 max_mr; + u32 max_hp_mr; + u32 max_mw; + u32 max_mrwpte; + u32 max_special_mrwpte; + u32 max_rd_ee_context; + u32 max_mcast_grp; + u32 max_total_mcast_qp_attach; + u32 max_mcast_qp_attach; + u32 max_raw_ipv6_qp; + u32 max_raw_ethy_qp; + u32 internal_clock_frequency; + u32 max_pd; + u32 max_ah; + u32 max_cqe; + u32 max_wqes_wq; + u32 max_partitions; + u32 max_rr_ee_context; + u32 max_rr_qp; + u32 max_rr_hca; + u32 max_act_wqs_ee_context; + u32 max_act_wqs_qp; + u32 max_sge; + u32 max_sge_rd; + u32 memory_page_size_supported; + u64 max_mr_size; + u32 local_ca_ack_delay; + u32 num_ports; + u32 vendor_id; + u32 vendor_part_id; + u32 hw_ver; + u64 node_guid; + u64 hca_cap_indicators; + u32 data_counter_register_size; + u32 max_shared_rq; + u32 max_isns_eq; + u32 max_neq; +} __attribute__ ((packed)); + +#define HCA_CAP_AH_PORT_NR_CHECK EHCA_BMASK_IBM( 0, 0) +#define HCA_CAP_ATOMIC EHCA_BMASK_IBM( 1, 1) +#define HCA_CAP_AUTO_PATH_MIG EHCA_BMASK_IBM( 2, 2) +#define HCA_CAP_BAD_P_KEY_CTR EHCA_BMASK_IBM( 3, 3) +#define HCA_CAP_SQD_RTS_PORT_CHANGE EHCA_BMASK_IBM( 4, 4) +#define HCA_CAP_CUR_QP_STATE_MOD EHCA_BMASK_IBM( 5, 5) +#define HCA_CAP_INIT_TYPE EHCA_BMASK_IBM( 6, 6) +#define HCA_CAP_PORT_ACTIVE_EVENT EHCA_BMASK_IBM( 7, 7) +#define HCA_CAP_Q_KEY_VIOL_CTR EHCA_BMASK_IBM( 8, 8) +#define HCA_CAP_WQE_RESIZE EHCA_BMASK_IBM( 9, 9) +#define HCA_CAP_RAW_PACKET_MCAST EHCA_BMASK_IBM(10, 10) +#define HCA_CAP_SHUTDOWN_PORT EHCA_BMASK_IBM(11, 11) +#define HCA_CAP_RC_LL_QP EHCA_BMASK_IBM(12, 12) +#define HCA_CAP_SRQ EHCA_BMASK_IBM(13, 13) +#define HCA_CAP_UD_LL_QP EHCA_BMASK_IBM(16, 16) +#define HCA_CAP_RESIZE_MR EHCA_BMASK_IBM(17, 17) +#define HCA_CAP_MINI_QP EHCA_BMASK_IBM(18, 18) +#define HCA_CAP_H_ALLOC_RES_SYNC EHCA_BMASK_IBM(19, 19) + +/* query port response block */ +struct hipz_query_port { + u32 state; + u32 bad_pkey_cntr; + u32 lmc; + u32 lid; + u32 subnet_timeout; + u32 qkey_viol_cntr; + u32 sm_sl; + u32 sm_lid; + u32 capability_mask; + u32 init_type_reply; + u32 pkey_tbl_len; + u32 gid_tbl_len; + u64 gid_prefix; + u32 port_nr; 
+ u16 pkey_entries[16]; + u8 reserved1[32]; + u32 trent_size; + u32 trbuf_size; + u64 max_msg_sz; + u32 max_mtu; + u32 vl_cap; + u32 phys_pstate; + u32 phys_state; + u32 phys_speed; + u32 phys_width; + u8 reserved2[1884]; + u64 guid_entries[255]; +} __attribute__ ((packed)); + +#endif diff --git a/kernel/drivers/infiniband/hw/ehca/ipz_pt_fn.c b/kernel/drivers/infiniband/hw/ehca/ipz_pt_fn.c new file mode 100644 index 000000000..8d594517c --- /dev/null +++ b/kernel/drivers/infiniband/hw/ehca/ipz_pt_fn.c @@ -0,0 +1,295 @@ +/* + * IBM eServer eHCA Infiniband device driver for Linux on POWER + * + * internal queue handling + * + * Authors: Waleri Fomin + * Reinhard Ernst + * Christoph Raisch + * + * Copyright (c) 2005 IBM Corporation + * + * This source code is distributed under a dual license of GPL v2.0 and OpenIB + * BSD. + * + * OpenIB BSD License + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials + * provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR + * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER + * IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#include + +#include "ehca_tools.h" +#include "ipz_pt_fn.h" +#include "ehca_classes.h" + +#define PAGES_PER_KPAGE (PAGE_SIZE >> EHCA_PAGESHIFT) + +struct kmem_cache *small_qp_cache; + +void *ipz_qpageit_get_inc(struct ipz_queue *queue) +{ + void *ret = ipz_qeit_get(queue); + queue->current_q_offset += queue->pagesize; + if (queue->current_q_offset > queue->queue_length) { + queue->current_q_offset -= queue->pagesize; + ret = NULL; + } + if (((u64)ret) % queue->pagesize) { + ehca_gen_err("ERROR!! not at PAGE-Boundary"); + return NULL; + } + return ret; +} + +void *ipz_qeit_eq_get_inc(struct ipz_queue *queue) +{ + void *ret = ipz_qeit_get(queue); + u64 last_entry_in_q = queue->queue_length - queue->qe_size; + + queue->current_q_offset += queue->qe_size; + if (queue->current_q_offset > last_entry_in_q) { + queue->current_q_offset = 0; + queue->toggle_state = (~queue->toggle_state) & 1; + } + + return ret; +} + +int ipz_queue_abs_to_offset(struct ipz_queue *queue, u64 addr, u64 *q_offset) +{ + int i; + for (i = 0; i < queue->queue_length / queue->pagesize; i++) { + u64 page = __pa(queue->queue_pages[i]); + if (addr >= page && addr < page + queue->pagesize) { + *q_offset = addr - page + i * queue->pagesize; + return 0; + } + } + return -EINVAL; +} + +#if PAGE_SHIFT < EHCA_PAGESHIFT +#error Kernel pages must be at least as large than eHCA pages (4K) ! 
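ipz_queue_abs_to_offset() above walks the queue's page array and converts an absolute address into a logical queue offset: the offset inside the matching page plus i * pagesize. The same arithmetic, with malloc()ed pages standing in for the __pa()-translated queue pages, looks like this (the model_* names are illustrative only):

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

#define MODEL_PAGESIZE 4096u    /* EHCA_PAGESIZE in the driver */
#define MODEL_NR_PAGES 4u

/* cf. ipz_queue_abs_to_offset(): map an absolute address back to a queue offset */
static int model_abs_to_offset(uint8_t **pages, uintptr_t addr, uint64_t *q_offset)
{
	unsigned int i;

	for (i = 0; i < MODEL_NR_PAGES; i++) {
		uintptr_t page = (uintptr_t)pages[i];

		if (addr >= page && addr < page + MODEL_PAGESIZE) {
			*q_offset = (addr - page) + (uint64_t)i * MODEL_PAGESIZE;
			return 0;
		}
	}
	return -1;    /* -EINVAL in the driver */
}

int main(void)
{
	uint8_t *pages[MODEL_NR_PAGES];
	uint64_t q_offset;
	unsigned int i;

	for (i = 0; i < MODEL_NR_PAGES; i++) {
		pages[i] = malloc(MODEL_PAGESIZE);
		if (!pages[i])
			return 1;
	}

	/* an address 0x30 bytes into page 2 maps to queue offset 2*4096 + 0x30 */
	if (!model_abs_to_offset(pages, (uintptr_t)pages[2] + 0x30, &q_offset))
		printf("queue offset = %#llx\n", (unsigned long long)q_offset);

	for (i = 0; i < MODEL_NR_PAGES; i++)
		free(pages[i]);
	return 0;
}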
+#endif + +/* + * allocate pages for queue: + * outer loop allocates whole kernel pages (page aligned) and + * inner loop divides a kernel page into smaller hca queue pages + */ +static int alloc_queue_pages(struct ipz_queue *queue, const u32 nr_of_pages) +{ + int k, f = 0; + u8 *kpage; + + while (f < nr_of_pages) { + kpage = (u8 *)get_zeroed_page(GFP_KERNEL); + if (!kpage) + goto out; + + for (k = 0; k < PAGES_PER_KPAGE && f < nr_of_pages; k++) { + queue->queue_pages[f] = (struct ipz_page *)kpage; + kpage += EHCA_PAGESIZE; + f++; + } + } + return 1; + +out: + for (f = 0; f < nr_of_pages && queue->queue_pages[f]; + f += PAGES_PER_KPAGE) + free_page((unsigned long)(queue->queue_pages)[f]); + return 0; +} + +static int alloc_small_queue_page(struct ipz_queue *queue, struct ehca_pd *pd) +{ + int order = ilog2(queue->pagesize) - 9; + struct ipz_small_queue_page *page; + unsigned long bit; + + mutex_lock(&pd->lock); + + if (!list_empty(&pd->free[order])) + page = list_entry(pd->free[order].next, + struct ipz_small_queue_page, list); + else { + page = kmem_cache_zalloc(small_qp_cache, GFP_KERNEL); + if (!page) + goto out; + + page->page = get_zeroed_page(GFP_KERNEL); + if (!page->page) { + kmem_cache_free(small_qp_cache, page); + goto out; + } + + list_add(&page->list, &pd->free[order]); + } + + bit = find_first_zero_bit(page->bitmap, IPZ_SPAGE_PER_KPAGE >> order); + __set_bit(bit, page->bitmap); + page->fill++; + + if (page->fill == IPZ_SPAGE_PER_KPAGE >> order) + list_move(&page->list, &pd->full[order]); + + mutex_unlock(&pd->lock); + + queue->queue_pages[0] = (void *)(page->page | (bit << (order + 9))); + queue->small_page = page; + queue->offset = bit << (order + 9); + return 1; + +out: + ehca_err(pd->ib_pd.device, "failed to allocate small queue page"); + mutex_unlock(&pd->lock); + return 0; +} + +static void free_small_queue_page(struct ipz_queue *queue, struct ehca_pd *pd) +{ + int order = ilog2(queue->pagesize) - 9; + struct ipz_small_queue_page *page = queue->small_page; + unsigned long bit; + int free_page = 0; + + bit = ((unsigned long)queue->queue_pages[0] & ~PAGE_MASK) + >> (order + 9); + + mutex_lock(&pd->lock); + + __clear_bit(bit, page->bitmap); + page->fill--; + + if (page->fill == 0) { + list_del(&page->list); + free_page = 1; + } + + if (page->fill == (IPZ_SPAGE_PER_KPAGE >> order) - 1) + /* the page was full until we freed the chunk */ + list_move_tail(&page->list, &pd->free[order]); + + mutex_unlock(&pd->lock); + + if (free_page) { + free_page(page->page); + kmem_cache_free(small_qp_cache, page); + } +} + +int ipz_queue_ctor(struct ehca_pd *pd, struct ipz_queue *queue, + const u32 nr_of_pages, const u32 pagesize, + const u32 qe_size, const u32 nr_of_sg, + int is_small) +{ + if (pagesize > PAGE_SIZE) { + ehca_gen_err("FATAL ERROR: pagesize=%x " + "is greater than kernel page size", pagesize); + return 0; + } + + /* init queue fields */ + queue->queue_length = nr_of_pages * pagesize; + queue->pagesize = pagesize; + queue->qe_size = qe_size; + queue->act_nr_of_sg = nr_of_sg; + queue->current_q_offset = 0; + queue->toggle_state = 1; + queue->small_page = NULL; + + /* allocate queue page pointers */ + queue->queue_pages = kzalloc(nr_of_pages * sizeof(void *), + GFP_KERNEL | __GFP_NOWARN); + if (!queue->queue_pages) { + queue->queue_pages = vzalloc(nr_of_pages * sizeof(void *)); + if (!queue->queue_pages) { + ehca_gen_err("Couldn't allocate queue page list"); + return 0; + } + } + + /* allocate actual queue pages */ + if (is_small) { + if (!alloc_small_queue_page(queue, pd)) + 
goto ipz_queue_ctor_exit0; + } else + if (!alloc_queue_pages(queue, nr_of_pages)) + goto ipz_queue_ctor_exit0; + + return 1; + +ipz_queue_ctor_exit0: + ehca_gen_err("Couldn't alloc pages queue=%p " + "nr_of_pages=%x", queue, nr_of_pages); + if (is_vmalloc_addr(queue->queue_pages)) + vfree(queue->queue_pages); + else + kfree(queue->queue_pages); + + return 0; +} + +int ipz_queue_dtor(struct ehca_pd *pd, struct ipz_queue *queue) +{ + int i, nr_pages; + + if (!queue || !queue->queue_pages) { + ehca_gen_dbg("queue or queue_pages is NULL"); + return 0; + } + + if (queue->small_page) + free_small_queue_page(queue, pd); + else { + nr_pages = queue->queue_length / queue->pagesize; + for (i = 0; i < nr_pages; i += PAGES_PER_KPAGE) + free_page((unsigned long)queue->queue_pages[i]); + } + + if (is_vmalloc_addr(queue->queue_pages)) + vfree(queue->queue_pages); + else + kfree(queue->queue_pages); + + return 1; +} + +int ehca_init_small_qp_cache(void) +{ + small_qp_cache = kmem_cache_create("ehca_cache_small_qp", + sizeof(struct ipz_small_queue_page), + 0, SLAB_HWCACHE_ALIGN, NULL); + if (!small_qp_cache) + return -ENOMEM; + + return 0; +} + +void ehca_cleanup_small_qp_cache(void) +{ + kmem_cache_destroy(small_qp_cache); +} diff --git a/kernel/drivers/infiniband/hw/ehca/ipz_pt_fn.h b/kernel/drivers/infiniband/hw/ehca/ipz_pt_fn.h new file mode 100644 index 000000000..a801274ea --- /dev/null +++ b/kernel/drivers/infiniband/hw/ehca/ipz_pt_fn.h @@ -0,0 +1,289 @@ +/* + * IBM eServer eHCA Infiniband device driver for Linux on POWER + * + * internal queue handling + * + * Authors: Waleri Fomin + * Reinhard Ernst + * Christoph Raisch + * + * Copyright (c) 2005 IBM Corporation + * + * All rights reserved. + * + * This source code is distributed under a dual license of GPL v2.0 and OpenIB + * BSD. + * + * OpenIB BSD License + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials + * provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR + * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER + * IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
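For small queues, alloc_small_queue_page() above carves one kernel page into pagesize-byte chunks (pagesize is a power of two of at least 512, hence order = ilog2(pagesize) - 9), tracks the chunks with a bitmap, and turns the chosen bit back into a byte offset with bit << (order + 9). That arithmetic can be checked in isolation; the sketch below assumes a 4096-byte PAGE_SIZE, matching the IPZ_SPAGE_PER_KPAGE = PAGE_SIZE / 512 definition in ipz_pt_fn.h.

#include <assert.h>
#include <stdio.h>

#define MODEL_PAGE_SIZE 4096u    /* assumed kernel PAGE_SIZE */

/* integer log2 of a power of two, cf. ilog2() */
static unsigned int model_ilog2(unsigned int x)
{
	unsigned int r = 0;

	while (x >>= 1)
		r++;
	return r;
}

int main(void)
{
	unsigned int pagesize, order, chunks, bit, offset;

	for (pagesize = 512; pagesize <= MODEL_PAGE_SIZE; pagesize <<= 1) {
		order  = model_ilog2(pagesize) - 9;          /* as in alloc_small_queue_page() */
		chunks = (MODEL_PAGE_SIZE / 512) >> order;   /* IPZ_SPAGE_PER_KPAGE >> order */

		for (bit = 0; bit < chunks; bit++) {
			offset = bit << (order + 9);         /* queue->offset in the driver */
			assert(offset == bit * pagesize);    /* each bit addresses one chunk */
		}
		printf("pagesize %4u: order %u, %u chunks per kernel page\n",
		       pagesize, order, chunks);
	}
	return 0;
}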
+ */ + +#ifndef __IPZ_PT_FN_H__ +#define __IPZ_PT_FN_H__ + +#define EHCA_PAGESHIFT 12 +#define EHCA_PAGESIZE 4096UL +#define EHCA_PAGEMASK (~(EHCA_PAGESIZE-1)) +#define EHCA_PT_ENTRIES 512UL + +#include "ehca_tools.h" +#include "ehca_qes.h" + +struct ehca_pd; +struct ipz_small_queue_page; + +extern struct kmem_cache *small_qp_cache; + +/* struct generic ehca page */ +struct ipz_page { + u8 entries[EHCA_PAGESIZE]; +}; + +#define IPZ_SPAGE_PER_KPAGE (PAGE_SIZE / 512) + +struct ipz_small_queue_page { + unsigned long page; + unsigned long bitmap[IPZ_SPAGE_PER_KPAGE / BITS_PER_LONG]; + int fill; + void *mapped_addr; + u32 mmap_count; + struct list_head list; +}; + +/* struct generic queue in linux kernel virtual memory (kv) */ +struct ipz_queue { + u64 current_q_offset; /* current queue entry */ + + struct ipz_page **queue_pages; /* array of pages belonging to queue */ + u32 qe_size; /* queue entry size */ + u32 act_nr_of_sg; + u32 queue_length; /* queue length allocated in bytes */ + u32 pagesize; + u32 toggle_state; /* toggle flag - per page */ + u32 offset; /* save offset within page for small_qp */ + struct ipz_small_queue_page *small_page; +}; + +/* + * return current Queue Entry for a certain q_offset + * returns address (kv) of Queue Entry + */ +static inline void *ipz_qeit_calc(struct ipz_queue *queue, u64 q_offset) +{ + struct ipz_page *current_page; + if (q_offset >= queue->queue_length) + return NULL; + current_page = (queue->queue_pages)[q_offset >> EHCA_PAGESHIFT]; + return ¤t_page->entries[q_offset & (EHCA_PAGESIZE - 1)]; +} + +/* + * return current Queue Entry + * returns address (kv) of Queue Entry + */ +static inline void *ipz_qeit_get(struct ipz_queue *queue) +{ + return ipz_qeit_calc(queue, queue->current_q_offset); +} + +/* + * return current Queue Page , increment Queue Page iterator from + * page to page in struct ipz_queue, last increment will return 0! and + * NOT wrap + * returns address (kv) of Queue Page + * warning don't use in parallel with ipz_QE_get_inc() + */ +void *ipz_qpageit_get_inc(struct ipz_queue *queue); + +/* + * return current Queue Entry, increment Queue Entry iterator by one + * step in struct ipz_queue, will wrap in ringbuffer + * returns address (kv) of Queue Entry BEFORE increment + * warning don't use in parallel with ipz_qpageit_get_inc() + */ +static inline void *ipz_qeit_get_inc(struct ipz_queue *queue) +{ + void *ret = ipz_qeit_get(queue); + queue->current_q_offset += queue->qe_size; + if (queue->current_q_offset >= queue->queue_length) { + queue->current_q_offset = 0; + /* toggle the valid flag */ + queue->toggle_state = (~queue->toggle_state) & 1; + } + + return ret; +} + +/* + * return a bool indicating whether current Queue Entry is valid + */ +static inline int ipz_qeit_is_valid(struct ipz_queue *queue) +{ + struct ehca_cqe *cqe = ipz_qeit_get(queue); + return ((cqe->cqe_flags >> 7) == (queue->toggle_state & 1)); +} + +/* + * return current Queue Entry, increment Queue Entry iterator by one + * step in struct ipz_queue, will wrap in ringbuffer + * returns address (kv) of Queue Entry BEFORE increment + * returns 0 and does not increment, if wrong valid state + * warning don't use in parallel with ipz_qpageit_get_inc() + */ +static inline void *ipz_qeit_get_inc_valid(struct ipz_queue *queue) +{ + return ipz_qeit_is_valid(queue) ? 
ipz_qeit_get_inc(queue) : NULL; +} + +/* + * returns and resets Queue Entry iterator + * returns address (kv) of first Queue Entry + */ +static inline void *ipz_qeit_reset(struct ipz_queue *queue) +{ + queue->current_q_offset = 0; + return ipz_qeit_get(queue); +} + +/* + * return the q_offset corresponding to an absolute address + */ +int ipz_queue_abs_to_offset(struct ipz_queue *queue, u64 addr, u64 *q_offset); + +/* + * return the next queue offset. don't modify the queue. + */ +static inline u64 ipz_queue_advance_offset(struct ipz_queue *queue, u64 offset) +{ + offset += queue->qe_size; + if (offset >= queue->queue_length) offset = 0; + return offset; +} + +/* struct generic page table */ +struct ipz_pt { + u64 entries[EHCA_PT_ENTRIES]; +}; + +/* struct page table for a queue, only to be used in pf */ +struct ipz_qpt { + /* queue page tables (kv), use u64 because we know the element length */ + u64 *qpts; + u32 n_qpts; + u32 n_ptes; /* number of page table entries */ + u64 *current_pte_addr; +}; + +/* + * constructor for a ipz_queue_t, placement new for ipz_queue_t, + * new for all dependent datastructors + * all QP Tables are the same + * flow: + * allocate+pin queue + * see ipz_qpt_ctor() + * returns true if ok, false if out of memory + */ +int ipz_queue_ctor(struct ehca_pd *pd, struct ipz_queue *queue, + const u32 nr_of_pages, const u32 pagesize, + const u32 qe_size, const u32 nr_of_sg, + int is_small); + +/* + * destructor for a ipz_queue_t + * -# free queue + * see ipz_queue_ctor() + * returns true if ok, false if queue was NULL-ptr of free failed + */ +int ipz_queue_dtor(struct ehca_pd *pd, struct ipz_queue *queue); + +/* + * constructor for a ipz_qpt_t, + * placement new for struct ipz_queue, new for all dependent datastructors + * all QP Tables are the same, + * flow: + * -# allocate+pin queue + * -# initialise ptcb + * -# allocate+pin PTs + * -# link PTs to a ring, according to HCA Arch, set bit62 id needed + * -# the ring must have room for exactly nr_of_PTEs + * see ipz_qpt_ctor() + */ +void ipz_qpt_ctor(struct ipz_qpt *qpt, + const u32 nr_of_qes, + const u32 pagesize, + const u32 qe_size, + const u8 lowbyte, const u8 toggle, + u32 * act_nr_of_QEs, u32 * act_nr_of_pages); + +/* + * return current Queue Entry, increment Queue Entry iterator by one + * step in struct ipz_queue, will wrap in ringbuffer + * returns address (kv) of Queue Entry BEFORE increment + * warning don't use in parallel with ipz_qpageit_get_inc() + * warning unpredictable results may occur if steps>act_nr_of_queue_entries + * fix EQ page problems + */ +void *ipz_qeit_eq_get_inc(struct ipz_queue *queue); + +/* + * return current Event Queue Entry, increment Queue Entry iterator + * by one step in struct ipz_queue if valid, will wrap in ringbuffer + * returns address (kv) of Queue Entry BEFORE increment + * returns 0 and does not increment, if wrong valid state + * warning don't use in parallel with ipz_queue_QPageit_get_inc() + * warning unpredictable results may occur if steps>act_nr_of_queue_entries + */ +static inline void *ipz_eqit_eq_get_inc_valid(struct ipz_queue *queue) +{ + void *ret = ipz_qeit_get(queue); + u32 qe = *(u8 *)ret; + if ((qe >> 7) != (queue->toggle_state & 1)) + return NULL; + ipz_qeit_eq_get_inc(queue); /* this is a good one */ + return ret; +} + +static inline void *ipz_eqit_eq_peek_valid(struct ipz_queue *queue) +{ + void *ret = ipz_qeit_get(queue); + u32 qe = *(u8 *)ret; + if ((qe >> 7) != (queue->toggle_state & 1)) + return NULL; + return ret; +} + +/* returns address (GX) of first 
queue entry */ +static inline u64 ipz_qpt_get_firstpage(struct ipz_qpt *qpt) +{ + return be64_to_cpu(qpt->qpts[0]); +} + +/* returns address (kv) of first page of queue page table */ +static inline void *ipz_qpt_get_qpt(struct ipz_qpt *qpt) +{ + return qpt->qpts; +} + +#endif /* __IPZ_PT_FN_H__ */ diff --git a/kernel/drivers/infiniband/hw/ipath/Kconfig b/kernel/drivers/infiniband/hw/ipath/Kconfig new file mode 100644 index 000000000..1d9bb115c --- /dev/null +++ b/kernel/drivers/infiniband/hw/ipath/Kconfig @@ -0,0 +1,11 @@ +config INFINIBAND_IPATH + tristate "QLogic HTX HCA support" + depends on 64BIT && NET && HT_IRQ + ---help--- + This is a driver for the obsolete QLogic Hyper-Transport + IB host channel adapter (model QHT7140), + including InfiniBand verbs support. This driver allows these + devices to be used with both kernel upper level protocols such + as IP-over-InfiniBand as well as with userspace applications + (in conjunction with InfiniBand userspace access). + For QLogic PCIe QLE based cards, use the QIB driver instead. diff --git a/kernel/drivers/infiniband/hw/ipath/Makefile b/kernel/drivers/infiniband/hw/ipath/Makefile new file mode 100644 index 000000000..4496f2820 --- /dev/null +++ b/kernel/drivers/infiniband/hw/ipath/Makefile @@ -0,0 +1,37 @@ +ccflags-y := -DIPATH_IDSTR='"QLogic kernel.org driver"' \ + -DIPATH_KERN_TYPE=0 + +obj-$(CONFIG_INFINIBAND_IPATH) += ib_ipath.o + +ib_ipath-y := \ + ipath_cq.o \ + ipath_diag.o \ + ipath_dma.o \ + ipath_driver.o \ + ipath_eeprom.o \ + ipath_file_ops.o \ + ipath_fs.o \ + ipath_init_chip.o \ + ipath_intr.o \ + ipath_keys.o \ + ipath_mad.o \ + ipath_mmap.o \ + ipath_mr.o \ + ipath_qp.o \ + ipath_rc.o \ + ipath_ruc.o \ + ipath_sdma.o \ + ipath_srq.o \ + ipath_stats.o \ + ipath_sysfs.o \ + ipath_uc.o \ + ipath_ud.o \ + ipath_user_pages.o \ + ipath_user_sdma.o \ + ipath_verbs_mcast.o \ + ipath_verbs.o + +ib_ipath-$(CONFIG_HT_IRQ) += ipath_iba6110.o + +ib_ipath-$(CONFIG_X86_64) += ipath_wc_x86_64.o +ib_ipath-$(CONFIG_PPC64) += ipath_wc_ppc64.o diff --git a/kernel/drivers/infiniband/hw/ipath/ipath_common.h b/kernel/drivers/infiniband/hw/ipath/ipath_common.h new file mode 100644 index 000000000..28cfe97cf --- /dev/null +++ b/kernel/drivers/infiniband/hw/ipath/ipath_common.h @@ -0,0 +1,851 @@ +/* + * Copyright (c) 2006, 2007, 2008 QLogic Corporation. All rights reserved. + * Copyright (c) 2003, 2004, 2005, 2006 PathScale, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef _IPATH_COMMON_H +#define _IPATH_COMMON_H + +/* + * This file contains defines, structures, etc. that are used + * to communicate between kernel and user code. + */ + + +/* This is the IEEE-assigned OUI for QLogic Inc. InfiniPath */ +#define IPATH_SRC_OUI_1 0x00 +#define IPATH_SRC_OUI_2 0x11 +#define IPATH_SRC_OUI_3 0x75 + +/* version of protocol header (known to chip also). In the long run, + * we should be able to generate and accept a range of version numbers; + * for now we only accept one, and it's compiled in. + */ +#define IPS_PROTO_VERSION 2 + +/* + * These are compile time constants that you may want to enable or disable + * if you are trying to debug problems with code or performance. + * IPATH_VERBOSE_TRACING define as 1 if you want additional tracing in + * fastpath code + * IPATH_TRACE_REGWRITES define as 1 if you want register writes to be + * traced in faspath code + * _IPATH_TRACING define as 0 if you want to remove all tracing in a + * compilation unit + * _IPATH_DEBUGGING define as 0 if you want to remove debug prints + */ + +/* + * The value in the BTH QP field that InfiniPath uses to differentiate + * an infinipath protocol IB packet vs standard IB transport + */ +#define IPATH_KD_QP 0x656b79 + +/* + * valid states passed to ipath_set_linkstate() user call + */ +#define IPATH_IB_LINKDOWN 0 +#define IPATH_IB_LINKARM 1 +#define IPATH_IB_LINKACTIVE 2 +#define IPATH_IB_LINKDOWN_ONLY 3 +#define IPATH_IB_LINKDOWN_SLEEP 4 +#define IPATH_IB_LINKDOWN_DISABLE 5 +#define IPATH_IB_LINK_LOOPBACK 6 /* enable local loopback */ +#define IPATH_IB_LINK_EXTERNAL 7 /* normal, disable local loopback */ +#define IPATH_IB_LINK_NO_HRTBT 8 /* disable Heartbeat, e.g. for loopback */ +#define IPATH_IB_LINK_HRTBT 9 /* enable heartbeat, normal, non-loopback */ + +/* + * These 3 values (SDR and DDR may be ORed for auto-speed + * negotiation) are used for the 3rd argument to path_f_set_ib_cfg + * with cmd IPATH_IB_CFG_SPD_ENB, by direct calls or via sysfs. They + * are also the the possible values for ipath_link_speed_enabled and active + * The values were chosen to match values used within the IB spec. + */ +#define IPATH_IB_SDR 1 +#define IPATH_IB_DDR 2 + +/* + * stats maintained by the driver. For now, at least, this is global + * to all minor devices. + */ +struct infinipath_stats { + /* number of interrupts taken */ + __u64 sps_ints; + /* number of interrupts for errors */ + __u64 sps_errints; + /* number of errors from chip (not incl. packet errors or CRC) */ + __u64 sps_errs; + /* number of packet errors from chip other than CRC */ + __u64 sps_pkterrs; + /* number of packets with CRC errors (ICRC and VCRC) */ + __u64 sps_crcerrs; + /* number of hardware errors reported (parity, etc.) */ + __u64 sps_hwerrs; + /* number of times IB link changed state unexpectedly */ + __u64 sps_iblink; + __u64 sps_unused; /* was fastrcvint, no longer implemented */ + /* number of kernel (port0) packets received */ + __u64 sps_port0pkts; + /* number of "ethernet" packets sent by driver */ + __u64 sps_ether_spkts; + /* number of "ethernet" packets received by driver */ + __u64 sps_ether_rpkts; + /* number of SMA packets sent by driver. Obsolete. */ + __u64 sps_sma_spkts; + /* number of SMA packets received by driver. Obsolete. 
*/ + __u64 sps_sma_rpkts; + /* number of times all ports rcvhdrq was full and packet dropped */ + __u64 sps_hdrqfull; + /* number of times all ports egrtid was full and packet dropped */ + __u64 sps_etidfull; + /* + * number of times we tried to send from driver, but no pio buffers + * avail + */ + __u64 sps_nopiobufs; + /* number of ports currently open */ + __u64 sps_ports; + /* list of pkeys (other than default) accepted (0 means not set) */ + __u16 sps_pkeys[4]; + __u16 sps_unused16[4]; /* available; maintaining compatible layout */ + /* number of user ports per chip (not IB ports) */ + __u32 sps_nports; + /* not our interrupt, or already handled */ + __u32 sps_nullintr; + /* max number of packets handled per receive call */ + __u32 sps_maxpkts_call; + /* avg number of packets handled per receive call */ + __u32 sps_avgpkts_call; + /* total number of pages locked */ + __u64 sps_pagelocks; + /* total number of pages unlocked */ + __u64 sps_pageunlocks; + /* + * Number of packets dropped in kernel other than errors (ether + * packets if ipath not configured, etc.) + */ + __u64 sps_krdrops; + __u64 sps_txeparity; /* PIO buffer parity error, recovered */ + /* pad for future growth */ + __u64 __sps_pad[45]; +}; + +/* + * These are the status bits readable (in ascii form, 64bit value) + * from the "status" sysfs file. + */ +#define IPATH_STATUS_INITTED 0x1 /* basic initialization done */ +#define IPATH_STATUS_DISABLED 0x2 /* hardware disabled */ +/* Device has been disabled via admin request */ +#define IPATH_STATUS_ADMIN_DISABLED 0x4 +/* Chip has been found and initted */ +#define IPATH_STATUS_CHIP_PRESENT 0x20 +/* IB link is at ACTIVE, usable for data traffic */ +#define IPATH_STATUS_IB_READY 0x40 +/* link is configured, LID, MTU, etc. have been set */ +#define IPATH_STATUS_IB_CONF 0x80 +/* no link established, probably no cable */ +#define IPATH_STATUS_IB_NOCABLE 0x100 +/* A Fatal hardware error has occurred. */ +#define IPATH_STATUS_HWERROR 0x200 + +/* + * The list of usermode accessible registers. Also see Reg_* later in file. + */ +typedef enum _ipath_ureg { + /* (RO) DMA RcvHdr to be used next. */ + ur_rcvhdrtail = 0, + /* (RW) RcvHdr entry to be processed next by host. */ + ur_rcvhdrhead = 1, + /* (RO) Index of next Eager index to use. */ + ur_rcvegrindextail = 2, + /* (RW) Eager TID to be processed next */ + ur_rcvegrindexhead = 3, + /* For internal use only; max register number. */ + _IPATH_UregMax +} ipath_ureg; + +/* bit values for spi_runtime_flags */ +#define IPATH_RUNTIME_HT 0x1 +#define IPATH_RUNTIME_PCIE 0x2 +#define IPATH_RUNTIME_FORCE_WC_ORDER 0x4 +#define IPATH_RUNTIME_RCVHDR_COPY 0x8 +#define IPATH_RUNTIME_MASTER 0x10 +#define IPATH_RUNTIME_NODMA_RTAIL 0x80 +#define IPATH_RUNTIME_SDMA 0x200 +#define IPATH_RUNTIME_FORCE_PIOAVAIL 0x400 +#define IPATH_RUNTIME_PIO_REGSWAPPED 0x800 + +/* + * This structure is returned by ipath_userinit() immediately after + * open to get implementation-specific info, and info specific to this + * instance. + * + * This struct must have explict pad fields where type sizes + * may result in different alignments between 32 and 64 bit + * programs, since the 64 bit * bit kernel requires the user code + * to have matching offsets + */ +struct ipath_base_info { + /* version of hardware, for feature checking. */ + __u32 spi_hw_version; + /* version of software, for feature checking. 
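Per the comment above, the driver exposes these IPATH_STATUS_* flags as a 64-bit value through a sysfs "status" file. A small decoder using a locally re-declared subset of the bits (so it compiles on its own) might look like the following; the sample status word is made up for illustration.

#include <stdint.h>
#include <stdio.h>

/* subset of the IPATH_STATUS_* bits defined in ipath_common.h */
#define MODEL_STATUS_INITTED      0x1ULL
#define MODEL_STATUS_CHIP_PRESENT 0x20ULL
#define MODEL_STATUS_IB_READY     0x40ULL
#define MODEL_STATUS_IB_CONF      0x80ULL
#define MODEL_STATUS_HWERROR      0x200ULL

static void model_decode_status(uint64_t status)
{
	printf("initted:       %s\n", (status & MODEL_STATUS_INITTED)      ? "yes" : "no");
	printf("chip present:  %s\n", (status & MODEL_STATUS_CHIP_PRESENT) ? "yes" : "no");
	printf("IB configured: %s\n", (status & MODEL_STATUS_IB_CONF)      ? "yes" : "no");
	printf("IB ready:      %s\n", (status & MODEL_STATUS_IB_READY)     ? "yes" : "no");
	printf("hw error:      %s\n", (status & MODEL_STATUS_HWERROR)      ? "yes" : "no");
}

int main(void)
{
	/* example word: initted, chip present, link configured and active */
	model_decode_status(MODEL_STATUS_INITTED | MODEL_STATUS_CHIP_PRESENT |
			    MODEL_STATUS_IB_CONF | MODEL_STATUS_IB_READY);
	return 0;
}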
*/ + __u32 spi_sw_version; + /* InfiniPath port assigned, goes into sent packets */ + __u16 spi_port; + __u16 spi_subport; + /* + * IB MTU, packets IB data must be less than this. + * The MTU is in bytes, and will be a multiple of 4 bytes. + */ + __u32 spi_mtu; + /* + * Size of a PIO buffer. Any given packet's total size must be less + * than this (in words). Included is the starting control word, so + * if 513 is returned, then total pkt size is 512 words or less. + */ + __u32 spi_piosize; + /* size of the TID cache in infinipath, in entries */ + __u32 spi_tidcnt; + /* size of the TID Eager list in infinipath, in entries */ + __u32 spi_tidegrcnt; + /* size of a single receive header queue entry in words. */ + __u32 spi_rcvhdrent_size; + /* + * Count of receive header queue entries allocated. + * This may be less than the spu_rcvhdrcnt passed in!. + */ + __u32 spi_rcvhdr_cnt; + + /* per-chip and other runtime features bitmap (IPATH_RUNTIME_*) */ + __u32 spi_runtime_flags; + + /* address where receive buffer queue is mapped into */ + __u64 spi_rcvhdr_base; + + /* user program. */ + + /* base address of eager TID receive buffers. */ + __u64 spi_rcv_egrbufs; + + /* Allocated by initialization code, not by protocol. */ + + /* + * Size of each TID buffer in host memory, starting at + * spi_rcv_egrbufs. The buffers are virtually contiguous. + */ + __u32 spi_rcv_egrbufsize; + /* + * The special QP (queue pair) value that identifies an infinipath + * protocol packet from standard IB packets. More, probably much + * more, to be added. + */ + __u32 spi_qpair; + + /* + * User register base for init code, not to be used directly by + * protocol or applications. + */ + __u64 __spi_uregbase; + /* + * Maximum buffer size in bytes that can be used in a single TID + * entry (assuming the buffer is aligned to this boundary). This is + * the minimum of what the hardware and software support Guaranteed + * to be a power of 2. + */ + __u32 spi_tid_maxsize; + /* + * alignment of each pio send buffer (byte count + * to add to spi_piobufbase to get to second buffer) + */ + __u32 spi_pioalign; + /* + * The index of the first pio buffer available to this process; + * needed to do lookup in spi_pioavailaddr; not added to + * spi_piobufbase. + */ + __u32 spi_pioindex; + /* number of buffers mapped for this process */ + __u32 spi_piocnt; + + /* + * Base address of writeonly pio buffers for this process. + * Each buffer has spi_piosize words, and is aligned on spi_pioalign + * boundaries. spi_piocnt buffers are mapped from this address + */ + __u64 spi_piobufbase; + + /* + * Base address of readonly memory copy of the pioavail registers. + * There are 2 bits for each buffer. + */ + __u64 spi_pioavailaddr; + + /* + * Address where driver updates a copy of the interface and driver + * status (IPATH_STATUS_*) as a 64 bit value. It's followed by a + * string indicating hardware error, if there was one. + */ + __u64 spi_status; + + /* number of chip ports available to user processes */ + __u32 spi_nports; + /* unit number of chip we are using */ + __u32 spi_unit; + /* num bufs in each contiguous set */ + __u32 spi_rcv_egrperchunk; + /* size in bytes of each contiguous set */ + __u32 spi_rcv_egrchunksize; + /* total size of mmap to cover full rcvegrbuffers */ + __u32 spi_rcv_egrbuftotlen; + __u32 spi_filler_for_align; + /* address of readonly memory copy of the rcvhdrq tail register. 
*/ + __u64 spi_rcvhdr_tailaddr; + + /* shared memory pages for subports if port is shared */ + __u64 spi_subport_uregbase; + __u64 spi_subport_rcvegrbuf; + __u64 spi_subport_rcvhdr_base; + + /* shared memory page for hardware port if it is shared */ + __u64 spi_port_uregbase; + __u64 spi_port_rcvegrbuf; + __u64 spi_port_rcvhdr_base; + __u64 spi_port_rcvhdr_tailaddr; + +} __attribute__ ((aligned(8))); + + +/* + * This version number is given to the driver by the user code during + * initialization in the spu_userversion field of ipath_user_info, so + * the driver can check for compatibility with user code. + * + * The major version changes when data structures + * change in an incompatible way. The driver must be the same or higher + * for initialization to succeed. In some cases, a higher version + * driver will not interoperate with older software, and initialization + * will return an error. + */ +#define IPATH_USER_SWMAJOR 1 + +/* + * Minor version differences are always compatible + * a within a major version, however if user software is larger + * than driver software, some new features and/or structure fields + * may not be implemented; the user code must deal with this if it + * cares, or it must abort after initialization reports the difference. + */ +#define IPATH_USER_SWMINOR 6 + +#define IPATH_USER_SWVERSION ((IPATH_USER_SWMAJOR<<16) | IPATH_USER_SWMINOR) + +#define IPATH_KERN_TYPE 0 + +/* + * Similarly, this is the kernel version going back to the user. It's + * slightly different, in that we want to tell if the driver was built as + * part of a QLogic release, or from the driver from openfabrics.org, + * kernel.org, or a standard distribution, for support reasons. + * The high bit is 0 for non-QLogic and 1 for QLogic-built/supplied. + * + * It's returned by the driver to the user code during initialization in the + * spi_sw_version field of ipath_base_info, so the user code can in turn + * check for compatibility with the kernel. +*/ +#define IPATH_KERN_SWVERSION ((IPATH_KERN_TYPE<<31) | IPATH_USER_SWVERSION) + +/* + * This structure is passed to ipath_userinit() to tell the driver where + * user code buffers are, sizes, etc. The offsets and sizes of the + * fields must remain unchanged, for binary compatibility. It can + * be extended, if userversion is changed so user code can tell, if needed + */ +struct ipath_user_info { + /* + * version of user software, to detect compatibility issues. + * Should be set to IPATH_USER_SWVERSION. + */ + __u32 spu_userversion; + + /* desired number of receive header queue entries */ + __u32 spu_rcvhdrcnt; + + /* size of struct base_info to write to */ + __u32 spu_base_info_size; + + /* + * number of words in KD protocol header + * This tells InfiniPath how many words to copy to rcvhdrq. If 0, + * kernel uses a default. Once set, attempts to set any other value + * are an error (EAGAIN) until driver is reloaded. + */ + __u32 spu_rcvhdrsize; + + /* + * If two or more processes wish to share a port, each process + * must set the spu_subport_cnt and spu_subport_id to the same + * values. The only restriction on the spu_subport_id is that + * it be unique for a given node. + */ + __u16 spu_subport_cnt; + __u16 spu_subport_id; + + __u32 spu_unused; /* kept for compatible layout */ + + /* + * address of struct base_info to write to + */ + __u64 spu_base_info; + +} __attribute__ ((aligned(8))); + +/* User commands. 
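IPATH_USER_SWVERSION above packs the major version into the upper 16 bits and the minor into the lower 16, and IPATH_KERN_SWVERSION additionally sets bit 31 when IPATH_KERN_TYPE is 1 (QLogic-built). The layout is easy to make explicit with the constants given in this header; the model_* macros below simply restate them.

#include <stdint.h>
#include <stdio.h>

/* constants as defined in ipath_common.h */
#define MODEL_USER_SWMAJOR 1u
#define MODEL_USER_SWMINOR 6u
#define MODEL_KERN_TYPE    0u

#define MODEL_USER_SWVERSION ((MODEL_USER_SWMAJOR << 16) | MODEL_USER_SWMINOR)
#define MODEL_KERN_SWVERSION ((MODEL_KERN_TYPE << 31) | MODEL_USER_SWVERSION)

int main(void)
{
	uint32_t v = MODEL_KERN_SWVERSION;

	printf("packed version word: 0x%08x\n", v);
	printf("  QLogic-built: %u\n", (v >> 31) & 1u);
	printf("  major:        %u\n", (v >> 16) & 0x7fffu);
	printf("  minor:        %u\n", v & 0xffffu);
	return 0;
}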
*/ + +#define IPATH_CMD_MIN 16 + +#define __IPATH_CMD_USER_INIT 16 /* old set up userspace (for old user code) */ +#define IPATH_CMD_PORT_INFO 17 /* find out what resources we got */ +#define IPATH_CMD_RECV_CTRL 18 /* control receipt of packets */ +#define IPATH_CMD_TID_UPDATE 19 /* update expected TID entries */ +#define IPATH_CMD_TID_FREE 20 /* free expected TID entries */ +#define IPATH_CMD_SET_PART_KEY 21 /* add partition key */ +#define __IPATH_CMD_SLAVE_INFO 22 /* return info on slave processes (for old user code) */ +#define IPATH_CMD_ASSIGN_PORT 23 /* allocate HCA and port */ +#define IPATH_CMD_USER_INIT 24 /* set up userspace */ +#define IPATH_CMD_UNUSED_1 25 +#define IPATH_CMD_UNUSED_2 26 +#define IPATH_CMD_PIOAVAILUPD 27 /* force an update of PIOAvail reg */ +#define IPATH_CMD_POLL_TYPE 28 /* set the kind of polling we want */ +#define IPATH_CMD_ARMLAUNCH_CTRL 29 /* armlaunch detection control */ +/* 30 is unused */ +#define IPATH_CMD_SDMA_INFLIGHT 31 /* sdma inflight counter request */ +#define IPATH_CMD_SDMA_COMPLETE 32 /* sdma completion counter request */ + +/* + * Poll types + */ +#define IPATH_POLL_TYPE_URGENT 0x01 +#define IPATH_POLL_TYPE_OVERFLOW 0x02 + +struct ipath_port_info { + __u32 num_active; /* number of active units */ + __u32 unit; /* unit (chip) assigned to caller */ + __u16 port; /* port on unit assigned to caller */ + __u16 subport; /* subport on unit assigned to caller */ + __u16 num_ports; /* number of ports available on unit */ + __u16 num_subports; /* number of subports opened on port */ +}; + +struct ipath_tid_info { + __u32 tidcnt; + /* make structure same size in 32 and 64 bit */ + __u32 tid__unused; + /* virtual address of first page in transfer */ + __u64 tidvaddr; + /* pointer (same size 32/64 bit) to __u16 tid array */ + __u64 tidlist; + + /* + * pointer (same size 32/64 bit) to bitmap of TIDs used + * for this call; checked for being large enough at open + */ + __u64 tidmap; +}; + +struct ipath_cmd { + __u32 type; /* command type */ + union { + struct ipath_tid_info tid_info; + struct ipath_user_info user_info; + + /* + * address in userspace where we should put the sdma + * inflight counter + */ + __u64 sdma_inflight; + /* + * address in userspace where we should put the sdma + * completion counter + */ + __u64 sdma_complete; + /* address in userspace of struct ipath_port_info to + write result to */ + __u64 port_info; + /* enable/disable receipt of packets */ + __u32 recv_ctrl; + /* enable/disable armlaunch errors (non-zero to enable) */ + __u32 armlaunch_ctrl; + /* partition key to set */ + __u16 part_key; + /* user address of __u32 bitmask of active slaves */ + __u64 slave_mask_addr; + /* type of polling we want */ + __u16 poll_type; + } cmd; +}; + +struct ipath_iovec { + /* Pointer to data, but same size 32 and 64 bit */ + __u64 iov_base; + + /* + * Length of data; don't need 64 bits, but want + * ipath_sendpkt to remain same size as before 32 bit changes, so... + */ + __u64 iov_len; +}; + +/* + * Describes a single packet for send. Each packet can have one or more + * buffers, but the total length (exclusive of IB headers) must be less + * than the MTU, and if using the PIO method, entire packet length, + * including IB headers, must be less than the ipath_piosize value (words). + * Use of this necessitates including sys/uio.h + */ +struct __ipath_sendpkt { + __u32 sps_flags; /* flags for packet (TBD) */ + __u32 sps_cnt; /* number of entries to use in sps_iov */ + /* array of iov's describing packet. 
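A hedged sketch of driving the command set defined here. It assumes, as the file-ops code later in this patch implies, that a command is issued by write()ing a struct ipath_cmd to the opened device node; the /dev/ipath path and the trimmed struct mirrors are placeholders, not part of the patch.

#include <fcntl.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

#define IPATH_CMD_ASSIGN_PORT 23
#define IPATH_CMD_USER_INIT   24
#define IPATH_USER_SWVERSION  ((1u << 16) | 6)

struct ipath_user_info {                 /* trimmed mirror of the struct above */
	uint32_t spu_userversion;
	uint32_t spu_rcvhdrcnt;
	uint32_t spu_base_info_size;
	uint32_t spu_rcvhdrsize;
	uint16_t spu_subport_cnt;
	uint16_t spu_subport_id;
	uint32_t spu_unused;
	uint64_t spu_base_info;
} __attribute__((aligned(8)));

struct ipath_cmd {                       /* trimmed mirror: type + union */
	uint32_t type;
	union {
		struct ipath_user_info user_info;
	} cmd;
};

static int issue(int fd, struct ipath_cmd *c)
{
	return write(fd, c, sizeof(*c)) == (ssize_t)sizeof(*c) ? 0 : -1;
}

int main(void)
{
	static char base_info[2048];         /* driver fills ipath_base_info here */
	struct ipath_cmd c;
	int fd = open("/dev/ipath", O_RDWR); /* placeholder device path */

	if (fd < 0) { perror("open"); return 1; }

	memset(&c, 0, sizeof(c));
	c.type = IPATH_CMD_ASSIGN_PORT;      /* allocate HCA and port first */
	c.cmd.user_info.spu_userversion = IPATH_USER_SWVERSION;
	c.cmd.user_info.spu_base_info = (uint64_t)(uintptr_t)base_info;
	c.cmd.user_info.spu_base_info_size = sizeof(base_info);
	if (issue(fd, &c))
		perror("ASSIGN_PORT");

	c.type = IPATH_CMD_USER_INIT;        /* then set up the user context */
	if (issue(fd, &c))
		perror("USER_INIT");

	close(fd);
	return 0;
}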
TEMPORARY */ + struct ipath_iovec sps_iov[4]; +}; + +/* + * diagnostics can send a packet by "writing" one of the following + * two structs to diag data special file + * The first is the legacy version for backward compatibility + */ +struct ipath_diag_pkt { + __u32 unit; + __u64 data; + __u32 len; +}; + +/* The second diag_pkt struct is the expanded version that allows + * more control over the packet, specifically, by allowing a custom + * pbc (+ static rate) qword, so that special modes and deliberate + * changes to CRCs can be used. The elements were also re-ordered + * for better alignment and to avoid padding issues. + */ +struct ipath_diag_xpkt { + __u64 data; + __u64 pbc_wd; + __u32 unit; + __u32 len; +}; + +/* + * Data layout in I2C flash (for GUID, etc.) + * All fields are little-endian binary unless otherwise stated + */ +#define IPATH_FLASH_VERSION 2 +struct ipath_flash { + /* flash layout version (IPATH_FLASH_VERSION) */ + __u8 if_fversion; + /* checksum protecting if_length bytes */ + __u8 if_csum; + /* + * valid length (in use, protected by if_csum), including + * if_fversion and if_csum themselves) + */ + __u8 if_length; + /* the GUID, in network order */ + __u8 if_guid[8]; + /* number of GUIDs to use, starting from if_guid */ + __u8 if_numguid; + /* the (last 10 characters of) board serial number, in ASCII */ + char if_serial[12]; + /* board mfg date (YYYYMMDD ASCII) */ + char if_mfgdate[8]; + /* last board rework/test date (YYYYMMDD ASCII) */ + char if_testdate[8]; + /* logging of error counts, TBD */ + __u8 if_errcntp[4]; + /* powered on hours, updated at driver unload */ + __u8 if_powerhour[2]; + /* ASCII free-form comment field */ + char if_comment[32]; + /* Backwards compatible prefix for longer QLogic Serial Numbers */ + char if_sprefix[4]; + /* 82 bytes used, min flash size is 128 bytes */ + __u8 if_future[46]; +}; + +/* + * These are the counters implemented in the chip, and are listed in order. + * The InterCaps naming is taken straight from the chip spec. 
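A hedged user-space sketch of sending one raw packet through the expanded diag struct above; the /dev/ipath_diagpkt path is an assumption based on the device name created later in this patch, and the driver tells the legacy and expanded formats apart purely by the size of the write.

#include <fcntl.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

struct ipath_diag_xpkt {       /* mirror of the header struct */
	uint64_t data;         /* user address of dword-aligned payload */
	uint64_t pbc_wd;       /* 0 = let the driver build the PBC word */
	uint32_t unit;
	uint32_t len;          /* bytes, must be a multiple of 4 */
};

int main(void)
{
	static uint32_t payload[16];                   /* 64-byte dummy packet */
	struct ipath_diag_xpkt dp;
	int fd = open("/dev/ipath_diagpkt", O_WRONLY); /* assumed path */

	if (fd < 0) { perror("open"); return 1; }

	memset(&dp, 0, sizeof(dp));
	dp.unit = 0;
	dp.data = (uint64_t)(uintptr_t)payload;
	dp.len = sizeof(payload);
	dp.pbc_wd = 0;                                 /* default PBC/static rate */

	if (write(fd, &dp, sizeof(dp)) != (ssize_t)sizeof(dp)) /* size selects xpkt */
		perror("diagpkt write");
	close(fd);
	return 0;
}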
+ */ +struct infinipath_counters { + __u64 LBIntCnt; + __u64 LBFlowStallCnt; + __u64 TxSDmaDescCnt; /* was Reserved1 */ + __u64 TxUnsupVLErrCnt; + __u64 TxDataPktCnt; + __u64 TxFlowPktCnt; + __u64 TxDwordCnt; + __u64 TxLenErrCnt; + __u64 TxMaxMinLenErrCnt; + __u64 TxUnderrunCnt; + __u64 TxFlowStallCnt; + __u64 TxDroppedPktCnt; + __u64 RxDroppedPktCnt; + __u64 RxDataPktCnt; + __u64 RxFlowPktCnt; + __u64 RxDwordCnt; + __u64 RxLenErrCnt; + __u64 RxMaxMinLenErrCnt; + __u64 RxICRCErrCnt; + __u64 RxVCRCErrCnt; + __u64 RxFlowCtrlErrCnt; + __u64 RxBadFormatCnt; + __u64 RxLinkProblemCnt; + __u64 RxEBPCnt; + __u64 RxLPCRCErrCnt; + __u64 RxBufOvflCnt; + __u64 RxTIDFullErrCnt; + __u64 RxTIDValidErrCnt; + __u64 RxPKeyMismatchCnt; + __u64 RxP0HdrEgrOvflCnt; + __u64 RxP1HdrEgrOvflCnt; + __u64 RxP2HdrEgrOvflCnt; + __u64 RxP3HdrEgrOvflCnt; + __u64 RxP4HdrEgrOvflCnt; + __u64 RxP5HdrEgrOvflCnt; + __u64 RxP6HdrEgrOvflCnt; + __u64 RxP7HdrEgrOvflCnt; + __u64 RxP8HdrEgrOvflCnt; + __u64 RxP9HdrEgrOvflCnt; /* was Reserved6 */ + __u64 RxP10HdrEgrOvflCnt; /* was Reserved7 */ + __u64 RxP11HdrEgrOvflCnt; /* new for IBA7220 */ + __u64 RxP12HdrEgrOvflCnt; /* new for IBA7220 */ + __u64 RxP13HdrEgrOvflCnt; /* new for IBA7220 */ + __u64 RxP14HdrEgrOvflCnt; /* new for IBA7220 */ + __u64 RxP15HdrEgrOvflCnt; /* new for IBA7220 */ + __u64 RxP16HdrEgrOvflCnt; /* new for IBA7220 */ + __u64 IBStatusChangeCnt; + __u64 IBLinkErrRecoveryCnt; + __u64 IBLinkDownedCnt; + __u64 IBSymbolErrCnt; + /* The following are new for IBA7220 */ + __u64 RxVL15DroppedPktCnt; + __u64 RxOtherLocalPhyErrCnt; + __u64 PcieRetryBufDiagQwordCnt; + __u64 ExcessBufferOvflCnt; + __u64 LocalLinkIntegrityErrCnt; + __u64 RxVlErrCnt; + __u64 RxDlidFltrCnt; +}; + +/* + * The next set of defines are for packet headers, and chip register + * and memory bits that are visible to and/or used by user-mode software + * The other bits that are used only by the driver or diags are in + * ipath_registers.h + */ + +/* RcvHdrFlags bits */ +#define INFINIPATH_RHF_LENGTH_MASK 0x7FF +#define INFINIPATH_RHF_LENGTH_SHIFT 0 +#define INFINIPATH_RHF_RCVTYPE_MASK 0x7 +#define INFINIPATH_RHF_RCVTYPE_SHIFT 11 +#define INFINIPATH_RHF_EGRINDEX_MASK 0xFFF +#define INFINIPATH_RHF_EGRINDEX_SHIFT 16 +#define INFINIPATH_RHF_SEQ_MASK 0xF +#define INFINIPATH_RHF_SEQ_SHIFT 0 +#define INFINIPATH_RHF_HDRQ_OFFSET_MASK 0x7FF +#define INFINIPATH_RHF_HDRQ_OFFSET_SHIFT 4 +#define INFINIPATH_RHF_H_ICRCERR 0x80000000 +#define INFINIPATH_RHF_H_VCRCERR 0x40000000 +#define INFINIPATH_RHF_H_PARITYERR 0x20000000 +#define INFINIPATH_RHF_H_LENERR 0x10000000 +#define INFINIPATH_RHF_H_MTUERR 0x08000000 +#define INFINIPATH_RHF_H_IHDRERR 0x04000000 +#define INFINIPATH_RHF_H_TIDERR 0x02000000 +#define INFINIPATH_RHF_H_MKERR 0x01000000 +#define INFINIPATH_RHF_H_IBERR 0x00800000 +#define INFINIPATH_RHF_H_ERR_MASK 0xFF800000 +#define INFINIPATH_RHF_L_USE_EGR 0x80000000 +#define INFINIPATH_RHF_L_SWA 0x00008000 +#define INFINIPATH_RHF_L_SWB 0x00004000 + +/* infinipath header fields */ +#define INFINIPATH_I_VERS_MASK 0xF +#define INFINIPATH_I_VERS_SHIFT 28 +#define INFINIPATH_I_PORT_MASK 0xF +#define INFINIPATH_I_PORT_SHIFT 24 +#define INFINIPATH_I_TID_MASK 0x7FF +#define INFINIPATH_I_TID_SHIFT 13 +#define INFINIPATH_I_OFFSET_MASK 0x1FFF +#define INFINIPATH_I_OFFSET_SHIFT 0 + +/* K_PktFlags bits */ +#define INFINIPATH_KPF_INTR 0x1 +#define INFINIPATH_KPF_SUBPORT_MASK 0x3 +#define INFINIPATH_KPF_SUBPORT_SHIFT 1 + +#define INFINIPATH_MAX_SUBPORT 4 + +/* SendPIO per-buffer control */ +#define INFINIPATH_SP_TEST 0x40 +#define 
INFINIPATH_SP_TESTEBP 0x20 +#define INFINIPATH_SP_TRIGGER_SHIFT 15 + +/* SendPIOAvail bits */ +#define INFINIPATH_SENDPIOAVAIL_BUSY_SHIFT 1 +#define INFINIPATH_SENDPIOAVAIL_CHECK_SHIFT 0 + +/* infinipath header format */ +struct ipath_header { + /* + * Version - 4 bits, Port - 4 bits, TID - 10 bits and Offset - + * 14 bits before ECO change ~28 Dec 03. After that, Vers 4, + * Port 4, TID 11, offset 13. + */ + __le32 ver_port_tid_offset; + __le16 chksum; + __le16 pkt_flags; +}; + +/* infinipath user message header format. + * This structure contains the first 4 fields common to all protocols + * that employ infinipath. + */ +struct ipath_message_header { + __be16 lrh[4]; + __be32 bth[3]; + /* fields below this point are in host byte order */ + struct ipath_header iph; + __u8 sub_opcode; +}; + +/* infinipath ethernet header format */ +struct ether_header { + __be16 lrh[4]; + __be32 bth[3]; + struct ipath_header iph; + __u8 sub_opcode; + __u8 cmd; + __be16 lid; + __u16 mac[3]; + __u8 frag_num; + __u8 seq_num; + __le32 len; + /* MUST be of word size due to PIO write requirements */ + __le32 csum; + __le16 csum_offset; + __le16 flags; + __u16 first_2_bytes; + __u8 unused[2]; /* currently unused */ +}; + + +/* IB - LRH header consts */ +#define IPATH_LRH_GRH 0x0003 /* 1. word of IB LRH - next header: GRH */ +#define IPATH_LRH_BTH 0x0002 /* 1. word of IB LRH - next header: BTH */ + +/* misc. */ +#define SIZE_OF_CRC 1 + +#define IPATH_DEFAULT_P_KEY 0xFFFF +#define IPATH_PERMISSIVE_LID 0xFFFF +#define IPATH_AETH_CREDIT_SHIFT 24 +#define IPATH_AETH_CREDIT_MASK 0x1F +#define IPATH_AETH_CREDIT_INVAL 0x1F +#define IPATH_PSN_MASK 0xFFFFFF +#define IPATH_MSN_MASK 0xFFFFFF +#define IPATH_QPN_MASK 0xFFFFFF +#define IPATH_MULTICAST_LID_BASE 0xC000 +#define IPATH_EAGER_TID_ID INFINIPATH_I_TID_MASK +#define IPATH_MULTICAST_QPN 0xFFFFFF + +/* Receive Header Queue: receive type (from infinipath) */ +#define RCVHQ_RCV_TYPE_EXPECTED 0 +#define RCVHQ_RCV_TYPE_EAGER 1 +#define RCVHQ_RCV_TYPE_NON_KD 2 +#define RCVHQ_RCV_TYPE_ERROR 3 + + +/* sub OpCodes - ith4x */ +#define IPATH_ITH4X_OPCODE_ENCAP 0x81 +#define IPATH_ITH4X_OPCODE_LID_ARP 0x82 + +#define IPATH_HEADER_QUEUE_WORDS 9 + +/* functions for extracting fields from rcvhdrq entries for the driver. 
+ */ +static inline __u32 ipath_hdrget_err_flags(const __le32 * rbuf) +{ + return __le32_to_cpu(rbuf[1]) & INFINIPATH_RHF_H_ERR_MASK; +} + +static inline __u32 ipath_hdrget_rcv_type(const __le32 * rbuf) +{ + return (__le32_to_cpu(rbuf[0]) >> INFINIPATH_RHF_RCVTYPE_SHIFT) + & INFINIPATH_RHF_RCVTYPE_MASK; +} + +static inline __u32 ipath_hdrget_length_in_bytes(const __le32 * rbuf) +{ + return ((__le32_to_cpu(rbuf[0]) >> INFINIPATH_RHF_LENGTH_SHIFT) + & INFINIPATH_RHF_LENGTH_MASK) << 2; +} + +static inline __u32 ipath_hdrget_index(const __le32 * rbuf) +{ + return (__le32_to_cpu(rbuf[0]) >> INFINIPATH_RHF_EGRINDEX_SHIFT) + & INFINIPATH_RHF_EGRINDEX_MASK; +} + +static inline __u32 ipath_hdrget_seq(const __le32 *rbuf) +{ + return (__le32_to_cpu(rbuf[1]) >> INFINIPATH_RHF_SEQ_SHIFT) + & INFINIPATH_RHF_SEQ_MASK; +} + +static inline __u32 ipath_hdrget_offset(const __le32 *rbuf) +{ + return (__le32_to_cpu(rbuf[1]) >> INFINIPATH_RHF_HDRQ_OFFSET_SHIFT) + & INFINIPATH_RHF_HDRQ_OFFSET_MASK; +} + +static inline __u32 ipath_hdrget_use_egr_buf(const __le32 *rbuf) +{ + return __le32_to_cpu(rbuf[0]) & INFINIPATH_RHF_L_USE_EGR; +} + +static inline __u32 ipath_hdrget_ipath_ver(__le32 hdrword) +{ + return (__le32_to_cpu(hdrword) >> INFINIPATH_I_VERS_SHIFT) + & INFINIPATH_I_VERS_MASK; +} + +#endif /* _IPATH_COMMON_H */ diff --git a/kernel/drivers/infiniband/hw/ipath/ipath_cq.c b/kernel/drivers/infiniband/hw/ipath/ipath_cq.c new file mode 100644 index 000000000..0416c6c0e --- /dev/null +++ b/kernel/drivers/infiniband/hw/ipath/ipath_cq.c @@ -0,0 +1,478 @@ +/* + * Copyright (c) 2006, 2007 QLogic Corporation. All rights reserved. + * Copyright (c) 2005, 2006 PathScale, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include +#include +#include + +#include "ipath_verbs.h" + +/** + * ipath_cq_enter - add a new entry to the completion queue + * @cq: completion queue + * @entry: work completion entry to add + * @sig: true if @entry is a solicitated entry + * + * This may be called with qp->s_lock held. 
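A hedged, self-contained sketch of decoding an rcvhdrq flags pair with the masks defined above; it assumes a little-endian host, where the driver's ipath_hdrget_*() inlines would use __le32_to_cpu().

#include <stdint.h>
#include <stdio.h>

#define RHF_LENGTH_MASK    0x7FF
#define RHF_LENGTH_SHIFT   0
#define RHF_RCVTYPE_MASK   0x7
#define RHF_RCVTYPE_SHIFT  11
#define RHF_EGRINDEX_MASK  0xFFF
#define RHF_EGRINDEX_SHIFT 16
#define RHF_H_ERR_MASK     0xFF800000u

int main(void)
{
	/* rbuf[0]: length/type/eager index, rbuf[1]: seq/offset/error bits */
	uint32_t rbuf[2] = { (3u << RHF_EGRINDEX_SHIFT) |
			     (1u << RHF_RCVTYPE_SHIFT) | 24u, 0 };

	uint32_t bytes = ((rbuf[0] >> RHF_LENGTH_SHIFT) & RHF_LENGTH_MASK) << 2;
	uint32_t type  = (rbuf[0] >> RHF_RCVTYPE_SHIFT) & RHF_RCVTYPE_MASK;
	uint32_t egr   = (rbuf[0] >> RHF_EGRINDEX_SHIFT) & RHF_EGRINDEX_MASK;
	uint32_t errs  = rbuf[1] & RHF_H_ERR_MASK;

	/* 24 words -> 96 bytes, type 1 (eager), eager index 3, no errors */
	printf("len=%u bytes type=%u egr=%u errs=0x%x\n",
	       (unsigned)bytes, (unsigned)type, (unsigned)egr, (unsigned)errs);
	return 0;
}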
+ */ +void ipath_cq_enter(struct ipath_cq *cq, struct ib_wc *entry, int solicited) +{ + struct ipath_cq_wc *wc; + unsigned long flags; + u32 head; + u32 next; + + spin_lock_irqsave(&cq->lock, flags); + + /* + * Note that the head pointer might be writable by user processes. + * Take care to verify it is a sane value. + */ + wc = cq->queue; + head = wc->head; + if (head >= (unsigned) cq->ibcq.cqe) { + head = cq->ibcq.cqe; + next = 0; + } else + next = head + 1; + if (unlikely(next == wc->tail)) { + spin_unlock_irqrestore(&cq->lock, flags); + if (cq->ibcq.event_handler) { + struct ib_event ev; + + ev.device = cq->ibcq.device; + ev.element.cq = &cq->ibcq; + ev.event = IB_EVENT_CQ_ERR; + cq->ibcq.event_handler(&ev, cq->ibcq.cq_context); + } + return; + } + if (cq->ip) { + wc->uqueue[head].wr_id = entry->wr_id; + wc->uqueue[head].status = entry->status; + wc->uqueue[head].opcode = entry->opcode; + wc->uqueue[head].vendor_err = entry->vendor_err; + wc->uqueue[head].byte_len = entry->byte_len; + wc->uqueue[head].ex.imm_data = (__u32 __force) entry->ex.imm_data; + wc->uqueue[head].qp_num = entry->qp->qp_num; + wc->uqueue[head].src_qp = entry->src_qp; + wc->uqueue[head].wc_flags = entry->wc_flags; + wc->uqueue[head].pkey_index = entry->pkey_index; + wc->uqueue[head].slid = entry->slid; + wc->uqueue[head].sl = entry->sl; + wc->uqueue[head].dlid_path_bits = entry->dlid_path_bits; + wc->uqueue[head].port_num = entry->port_num; + /* Make sure entry is written before the head index. */ + smp_wmb(); + } else + wc->kqueue[head] = *entry; + wc->head = next; + + if (cq->notify == IB_CQ_NEXT_COMP || + (cq->notify == IB_CQ_SOLICITED && solicited)) { + cq->notify = IB_CQ_NONE; + cq->triggered++; + /* + * This will cause send_complete() to be called in + * another thread. + */ + tasklet_hi_schedule(&cq->comptask); + } + + spin_unlock_irqrestore(&cq->lock, flags); + + if (entry->status != IB_WC_SUCCESS) + to_idev(cq->ibcq.device)->n_wqe_errs++; +} + +/** + * ipath_poll_cq - poll for work completion entries + * @ibcq: the completion queue to poll + * @num_entries: the maximum number of entries to return + * @entry: pointer to array where work completions are placed + * + * Returns the number of completion entries polled. + * + * This may be called from interrupt context. Also called by ib_poll_cq() + * in the generic verbs code. + */ +int ipath_poll_cq(struct ib_cq *ibcq, int num_entries, struct ib_wc *entry) +{ + struct ipath_cq *cq = to_icq(ibcq); + struct ipath_cq_wc *wc; + unsigned long flags; + int npolled; + u32 tail; + + /* The kernel can only poll a kernel completion queue */ + if (cq->ip) { + npolled = -EINVAL; + goto bail; + } + + spin_lock_irqsave(&cq->lock, flags); + + wc = cq->queue; + tail = wc->tail; + if (tail > (u32) cq->ibcq.cqe) + tail = (u32) cq->ibcq.cqe; + for (npolled = 0; npolled < num_entries; ++npolled, ++entry) { + if (tail == wc->head) + break; + /* The kernel doesn't need a RMB since it has the lock. */ + *entry = wc->kqueue[tail]; + if (tail >= cq->ibcq.cqe) + tail = 0; + else + tail++; + } + wc->tail = tail; + + spin_unlock_irqrestore(&cq->lock, flags); + +bail: + return npolled; +} + +static void send_complete(unsigned long data) +{ + struct ipath_cq *cq = (struct ipath_cq *)data; + + /* + * The completion handler will most likely rearm the notification + * and poll for all pending entries. 
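A hedged, self-contained sketch of the ring arithmetic ipath_cq_enter() uses above: the queue has cqe + 1 slots (indices 0..cqe), and overflow is detected when the advanced head would collide with tail, so one slot always stays empty.

#include <stdio.h>

static int ring_full(unsigned head, unsigned tail, unsigned cqe)
{
	unsigned next = (head >= cqe) ? 0 : head + 1;   /* wrap at index cqe */
	return next == tail;
}

int main(void)
{
	unsigned cqe = 4;                /* 5 slots, at most 4 usable entries */

	printf("%d\n", ring_full(0, 1, cqe));  /* 1: next == tail, would overflow */
	printf("%d\n", ring_full(4, 0, cqe));  /* 1: head wraps to 0 == tail      */
	printf("%d\n", ring_full(2, 0, cqe));  /* 0: room available               */
	return 0;
}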
If a new completion entry + * is added while we are in this routine, tasklet_hi_schedule() + * won't call us again until we return so we check triggered to + * see if we need to call the handler again. + */ + for (;;) { + u8 triggered = cq->triggered; + + cq->ibcq.comp_handler(&cq->ibcq, cq->ibcq.cq_context); + + if (cq->triggered == triggered) + return; + } +} + +/** + * ipath_create_cq - create a completion queue + * @ibdev: the device this completion queue is attached to + * @entries: the minimum size of the completion queue + * @context: unused by the InfiniPath driver + * @udata: unused by the InfiniPath driver + * + * Returns a pointer to the completion queue or negative errno values + * for failure. + * + * Called by ib_create_cq() in the generic verbs code. + */ +struct ib_cq *ipath_create_cq(struct ib_device *ibdev, int entries, int comp_vector, + struct ib_ucontext *context, + struct ib_udata *udata) +{ + struct ipath_ibdev *dev = to_idev(ibdev); + struct ipath_cq *cq; + struct ipath_cq_wc *wc; + struct ib_cq *ret; + u32 sz; + + if (entries < 1 || entries > ib_ipath_max_cqes) { + ret = ERR_PTR(-EINVAL); + goto done; + } + + /* Allocate the completion queue structure. */ + cq = kmalloc(sizeof(*cq), GFP_KERNEL); + if (!cq) { + ret = ERR_PTR(-ENOMEM); + goto done; + } + + /* + * Allocate the completion queue entries and head/tail pointers. + * This is allocated separately so that it can be resized and + * also mapped into user space. + * We need to use vmalloc() in order to support mmap and large + * numbers of entries. + */ + sz = sizeof(*wc); + if (udata && udata->outlen >= sizeof(__u64)) + sz += sizeof(struct ib_uverbs_wc) * (entries + 1); + else + sz += sizeof(struct ib_wc) * (entries + 1); + wc = vmalloc_user(sz); + if (!wc) { + ret = ERR_PTR(-ENOMEM); + goto bail_cq; + } + + /* + * Return the address of the WC as the offset to mmap. + * See ipath_mmap() for details. + */ + if (udata && udata->outlen >= sizeof(__u64)) { + int err; + + cq->ip = ipath_create_mmap_info(dev, sz, context, wc); + if (!cq->ip) { + ret = ERR_PTR(-ENOMEM); + goto bail_wc; + } + + err = ib_copy_to_udata(udata, &cq->ip->offset, + sizeof(cq->ip->offset)); + if (err) { + ret = ERR_PTR(err); + goto bail_ip; + } + } else + cq->ip = NULL; + + spin_lock(&dev->n_cqs_lock); + if (dev->n_cqs_allocated == ib_ipath_max_cqs) { + spin_unlock(&dev->n_cqs_lock); + ret = ERR_PTR(-ENOMEM); + goto bail_ip; + } + + dev->n_cqs_allocated++; + spin_unlock(&dev->n_cqs_lock); + + if (cq->ip) { + spin_lock_irq(&dev->pending_lock); + list_add(&cq->ip->pending_mmaps, &dev->pending_mmaps); + spin_unlock_irq(&dev->pending_lock); + } + + /* + * ib_create_cq() will initialize cq->ibcq except for cq->ibcq.cqe. + * The number of entries should be >= the number requested or return + * an error. + */ + cq->ibcq.cqe = entries; + cq->notify = IB_CQ_NONE; + cq->triggered = 0; + spin_lock_init(&cq->lock); + tasklet_init(&cq->comptask, send_complete, (unsigned long)cq); + wc->head = 0; + wc->tail = 0; + cq->queue = wc; + + ret = &cq->ibcq; + + goto done; + +bail_ip: + kfree(cq->ip); +bail_wc: + vfree(wc); +bail_cq: + kfree(cq); +done: + return ret; +} + +/** + * ipath_destroy_cq - destroy a completion queue + * @ibcq: the completion queue to destroy. + * + * Returns 0 for success. + * + * Called by ib_destroy_cq() in the generic verbs code. 
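A hedged sketch of the "triggered" re-check idiom in send_complete() above, with plain ints standing in for the CQ and tasklet machinery: a completion that arrives while the handler runs bumps the counter, and the loop notices the change and calls the handler again instead of relying on another schedule.

#include <stdio.h>

static unsigned triggered;          /* incremented by the producer side */
static int pending;                 /* stand-in for unpolled completions */

static void comp_handler(void)
{
	pending = 0;                    /* "poll all pending entries" */
	if (triggered == 1) {           /* simulate a racing new completion */
		pending = 1;
		triggered++;
	}
}

int main(void)
{
	triggered = 1;                  /* first completion scheduled us */
	for (;;) {
		unsigned seen = triggered;

		comp_handler();
		if (triggered == seen)  /* nothing arrived meanwhile: done */
			break;
	}
	printf("pending=%d triggered=%u\n", pending, triggered);  /* 0 2 */
	return 0;
}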
+ */ +int ipath_destroy_cq(struct ib_cq *ibcq) +{ + struct ipath_ibdev *dev = to_idev(ibcq->device); + struct ipath_cq *cq = to_icq(ibcq); + + tasklet_kill(&cq->comptask); + spin_lock(&dev->n_cqs_lock); + dev->n_cqs_allocated--; + spin_unlock(&dev->n_cqs_lock); + if (cq->ip) + kref_put(&cq->ip->ref, ipath_release_mmap_info); + else + vfree(cq->queue); + kfree(cq); + + return 0; +} + +/** + * ipath_req_notify_cq - change the notification type for a completion queue + * @ibcq: the completion queue + * @notify_flags: the type of notification to request + * + * Returns 0 for success. + * + * This may be called from interrupt context. Also called by + * ib_req_notify_cq() in the generic verbs code. + */ +int ipath_req_notify_cq(struct ib_cq *ibcq, enum ib_cq_notify_flags notify_flags) +{ + struct ipath_cq *cq = to_icq(ibcq); + unsigned long flags; + int ret = 0; + + spin_lock_irqsave(&cq->lock, flags); + /* + * Don't change IB_CQ_NEXT_COMP to IB_CQ_SOLICITED but allow + * any other transitions (see C11-31 and C11-32 in ch. 11.4.2.2). + */ + if (cq->notify != IB_CQ_NEXT_COMP) + cq->notify = notify_flags & IB_CQ_SOLICITED_MASK; + + if ((notify_flags & IB_CQ_REPORT_MISSED_EVENTS) && + cq->queue->head != cq->queue->tail) + ret = 1; + + spin_unlock_irqrestore(&cq->lock, flags); + + return ret; +} + +/** + * ipath_resize_cq - change the size of the CQ + * @ibcq: the completion queue + * + * Returns 0 for success. + */ +int ipath_resize_cq(struct ib_cq *ibcq, int cqe, struct ib_udata *udata) +{ + struct ipath_cq *cq = to_icq(ibcq); + struct ipath_cq_wc *old_wc; + struct ipath_cq_wc *wc; + u32 head, tail, n; + int ret; + u32 sz; + + if (cqe < 1 || cqe > ib_ipath_max_cqes) { + ret = -EINVAL; + goto bail; + } + + /* + * Need to use vmalloc() if we want to support large #s of entries. + */ + sz = sizeof(*wc); + if (udata && udata->outlen >= sizeof(__u64)) + sz += sizeof(struct ib_uverbs_wc) * (cqe + 1); + else + sz += sizeof(struct ib_wc) * (cqe + 1); + wc = vmalloc_user(sz); + if (!wc) { + ret = -ENOMEM; + goto bail; + } + + /* Check that we can write the offset to mmap. */ + if (udata && udata->outlen >= sizeof(__u64)) { + __u64 offset = 0; + + ret = ib_copy_to_udata(udata, &offset, sizeof(offset)); + if (ret) + goto bail_free; + } + + spin_lock_irq(&cq->lock); + /* + * Make sure head and tail are sane since they + * might be user writable. + */ + old_wc = cq->queue; + head = old_wc->head; + if (head > (u32) cq->ibcq.cqe) + head = (u32) cq->ibcq.cqe; + tail = old_wc->tail; + if (tail > (u32) cq->ibcq.cqe) + tail = (u32) cq->ibcq.cqe; + if (head < tail) + n = cq->ibcq.cqe + 1 + head - tail; + else + n = head - tail; + if (unlikely((u32)cqe < n)) { + ret = -EINVAL; + goto bail_unlock; + } + for (n = 0; tail != head; n++) { + if (cq->ip) + wc->uqueue[n] = old_wc->uqueue[tail]; + else + wc->kqueue[n] = old_wc->kqueue[tail]; + if (tail == (u32) cq->ibcq.cqe) + tail = 0; + else + tail++; + } + cq->ibcq.cqe = cqe; + wc->head = n; + wc->tail = 0; + cq->queue = wc; + spin_unlock_irq(&cq->lock); + + vfree(old_wc); + + if (cq->ip) { + struct ipath_ibdev *dev = to_idev(ibcq->device); + struct ipath_mmap_info *ip = cq->ip; + + ipath_update_mmap_info(dev, ip, sz, wc); + + /* + * Return the offset to mmap. + * See ipath_mmap() for details. 
+ */ + if (udata && udata->outlen >= sizeof(__u64)) { + ret = ib_copy_to_udata(udata, &ip->offset, + sizeof(ip->offset)); + if (ret) + goto bail; + } + + spin_lock_irq(&dev->pending_lock); + if (list_empty(&ip->pending_mmaps)) + list_add(&ip->pending_mmaps, &dev->pending_mmaps); + spin_unlock_irq(&dev->pending_lock); + } + + ret = 0; + goto bail; + +bail_unlock: + spin_unlock_irq(&cq->lock); +bail_free: + vfree(wc); +bail: + return ret; +} diff --git a/kernel/drivers/infiniband/hw/ipath/ipath_debug.h b/kernel/drivers/infiniband/hw/ipath/ipath_debug.h new file mode 100644 index 000000000..65926cd35 --- /dev/null +++ b/kernel/drivers/infiniband/hw/ipath/ipath_debug.h @@ -0,0 +1,99 @@ +/* + * Copyright (c) 2006, 2007 QLogic Corporation. All rights reserved. + * Copyright (c) 2003, 2004, 2005, 2006 PathScale, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef _IPATH_DEBUG_H +#define _IPATH_DEBUG_H + +#ifndef _IPATH_DEBUGGING /* debugging enabled or not */ +#define _IPATH_DEBUGGING 1 +#endif + +#if _IPATH_DEBUGGING + +/* + * Mask values for debugging. The scheme allows us to compile out any + * of the debug tracing stuff, and if compiled in, to enable or disable + * dynamically. 
This can be set at modprobe time also: + * modprobe infinipath.ko infinipath_debug=7 + */ + +#define __IPATH_INFO 0x1 /* generic low verbosity stuff */ +#define __IPATH_DBG 0x2 /* generic debug */ +#define __IPATH_TRSAMPLE 0x8 /* generate trace buffer sample entries */ +/* leave some low verbosity spots open */ +#define __IPATH_VERBDBG 0x40 /* very verbose debug */ +#define __IPATH_PKTDBG 0x80 /* print packet data */ +/* print process startup (init)/exit messages */ +#define __IPATH_PROCDBG 0x100 +/* print mmap/fault stuff, not using VDBG any more */ +#define __IPATH_MMDBG 0x200 +#define __IPATH_ERRPKTDBG 0x400 +#define __IPATH_USER_SEND 0x1000 /* use user mode send */ +#define __IPATH_KERNEL_SEND 0x2000 /* use kernel mode send */ +#define __IPATH_EPKTDBG 0x4000 /* print ethernet packet data */ +#define __IPATH_IPATHDBG 0x10000 /* Ethernet (IPATH) gen debug */ +#define __IPATH_IPATHWARN 0x20000 /* Ethernet (IPATH) warnings */ +#define __IPATH_IPATHERR 0x40000 /* Ethernet (IPATH) errors */ +#define __IPATH_IPATHPD 0x80000 /* Ethernet (IPATH) packet dump */ +#define __IPATH_IPATHTABLE 0x100000 /* Ethernet (IPATH) table dump */ +#define __IPATH_LINKVERBDBG 0x200000 /* very verbose linkchange debug */ + +#else /* _IPATH_DEBUGGING */ + +/* + * define all of these even with debugging off, for the few places that do + * if(infinipath_debug & _IPATH_xyzzy), but in a way that will make the + * compiler eliminate the code + */ + +#define __IPATH_INFO 0x0 /* generic low verbosity stuff */ +#define __IPATH_DBG 0x0 /* generic debug */ +#define __IPATH_TRSAMPLE 0x0 /* generate trace buffer sample entries */ +#define __IPATH_VERBDBG 0x0 /* very verbose debug */ +#define __IPATH_PKTDBG 0x0 /* print packet data */ +#define __IPATH_PROCDBG 0x0 /* process startup (init)/exit messages */ +/* print mmap/fault stuff, not using VDBG any more */ +#define __IPATH_MMDBG 0x0 +#define __IPATH_EPKTDBG 0x0 /* print ethernet packet data */ +#define __IPATH_IPATHDBG 0x0 /* Ethernet (IPATH) table dump on */ +#define __IPATH_IPATHWARN 0x0 /* Ethernet (IPATH) warnings on */ +#define __IPATH_IPATHERR 0x0 /* Ethernet (IPATH) errors on */ +#define __IPATH_IPATHPD 0x0 /* Ethernet (IPATH) packet dump on */ +#define __IPATH_IPATHTABLE 0x0 /* Ethernet (IPATH) packet dump on */ +#define __IPATH_LINKVERBDBG 0x0 /* very verbose linkchange debug */ + +#endif /* _IPATH_DEBUGGING */ + +#define __IPATH_VERBOSEDBG __IPATH_VERBDBG + +#endif /* _IPATH_DEBUG_H */ diff --git a/kernel/drivers/infiniband/hw/ipath/ipath_diag.c b/kernel/drivers/infiniband/hw/ipath/ipath_diag.c new file mode 100644 index 000000000..45802e973 --- /dev/null +++ b/kernel/drivers/infiniband/hw/ipath/ipath_diag.c @@ -0,0 +1,551 @@ +/* + * Copyright (c) 2006, 2007, 2008 QLogic Corporation. All rights reserved. + * Copyright (c) 2003, 2004, 2005, 2006 PathScale, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. 
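A hedged sketch of how call sites consume this mask scheme: prints are guarded by a test against the runtime debug word, and in the !_IPATH_DEBUGGING build every mask is 0, so the compiler can drop the branch entirely. The dbg() wrapper below is illustrative, not the driver's ipath_cdbg() macro.

#include <stdio.h>

#define __IPATH_PROCDBG 0x100      /* as defined above when debugging is on */
#define __IPATH_MMDBG   0x200

static unsigned infinipath_debug = __IPATH_PROCDBG;  /* e.g. set at modprobe */

#define dbg(mask, fmt, ...)                                          \
	do {                                                         \
		if (infinipath_debug & (mask))                       \
			printf("ipath: " fmt, ##__VA_ARGS__);        \
	} while (0)

int main(void)
{
	dbg(__IPATH_PROCDBG, "process %d opened unit %d\n", 1234, 0); /* printed  */
	dbg(__IPATH_MMDBG, "mmap at %#lx\n", 0xdead000UL);            /* filtered */
	return 0;
}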
+ * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +/* + * This file contains support for diagnostic functions. It is accessed by + * opening the ipath_diag device, normally minor number 129. Diagnostic use + * of the InfiniPath chip may render the chip or board unusable until the + * driver is unloaded, or in some cases, until the system is rebooted. + * + * Accesses to the chip through this interface are not similar to going + * through the /sys/bus/pci resource mmap interface. + */ + +#include +#include +#include +#include +#include +#include + +#include "ipath_kernel.h" +#include "ipath_common.h" + +int ipath_diag_inuse; +static int diag_set_link; + +static int ipath_diag_open(struct inode *in, struct file *fp); +static int ipath_diag_release(struct inode *in, struct file *fp); +static ssize_t ipath_diag_read(struct file *fp, char __user *data, + size_t count, loff_t *off); +static ssize_t ipath_diag_write(struct file *fp, const char __user *data, + size_t count, loff_t *off); + +static const struct file_operations diag_file_ops = { + .owner = THIS_MODULE, + .write = ipath_diag_write, + .read = ipath_diag_read, + .open = ipath_diag_open, + .release = ipath_diag_release, + .llseek = default_llseek, +}; + +static ssize_t ipath_diagpkt_write(struct file *fp, + const char __user *data, + size_t count, loff_t *off); + +static const struct file_operations diagpkt_file_ops = { + .owner = THIS_MODULE, + .write = ipath_diagpkt_write, + .llseek = noop_llseek, +}; + +static atomic_t diagpkt_count = ATOMIC_INIT(0); +static struct cdev *diagpkt_cdev; +static struct device *diagpkt_dev; + +int ipath_diag_add(struct ipath_devdata *dd) +{ + char name[16]; + int ret = 0; + + if (atomic_inc_return(&diagpkt_count) == 1) { + ret = ipath_cdev_init(IPATH_DIAGPKT_MINOR, + "ipath_diagpkt", &diagpkt_file_ops, + &diagpkt_cdev, &diagpkt_dev); + + if (ret) { + ipath_dev_err(dd, "Couldn't create ipath_diagpkt " + "device: %d", ret); + goto done; + } + } + + snprintf(name, sizeof(name), "ipath_diag%d", dd->ipath_unit); + + ret = ipath_cdev_init(IPATH_DIAG_MINOR_BASE + dd->ipath_unit, name, + &diag_file_ops, &dd->diag_cdev, + &dd->diag_dev); + if (ret) + ipath_dev_err(dd, "Couldn't create %s device: %d", + name, ret); + +done: + return ret; +} + +void ipath_diag_remove(struct ipath_devdata *dd) +{ + if (atomic_dec_and_test(&diagpkt_count)) + ipath_cdev_cleanup(&diagpkt_cdev, &diagpkt_dev); + + ipath_cdev_cleanup(&dd->diag_cdev, &dd->diag_dev); +} + +/** + * ipath_read_umem64 - read a 64-bit quantity from the chip into user space + * @dd: the infinipath device + * @uaddr: the location to store the data in user memory + * @caddr: the source chip address (full pointer, not offset) + * @count: number of bytes to copy (multiple of 32 bits) + * + * This function also localizes all chip memory accesses. 
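A hedged sketch of the sharing pattern in ipath_diag_add()/ipath_diag_remove() above, with plain ints standing in for the atomics: the first unit to register creates the single shared diagpkt node (counter goes 0 -> 1) and the last one to go tears it down (counter drops back to 0).

#include <stdio.h>

static int diagpkt_count;     /* stands in for atomic_t diagpkt_count */

static void unit_add(int unit)
{
	if (++diagpkt_count == 1)          /* atomic_inc_return(...) == 1 */
		printf("unit %d created shared diagpkt device\n", unit);
}

static void unit_remove(int unit)
{
	if (--diagpkt_count == 0)          /* atomic_dec_and_test(...) */
		printf("unit %d destroyed shared diagpkt device\n", unit);
}

int main(void)
{
	unit_add(0);      /* creates the shared node */
	unit_add(1);      /* reuses it               */
	unit_remove(0);   /* still one user left     */
	unit_remove(1);   /* last user: tear down    */
	return 0;
}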
+ * The copy should be written such that we read full cacheline packets + * from the chip. This is usually used for a single qword + * + * NOTE: This assumes the chip address is 64-bit aligned. + */ +static int ipath_read_umem64(struct ipath_devdata *dd, void __user *uaddr, + const void __iomem *caddr, size_t count) +{ + const u64 __iomem *reg_addr = caddr; + const u64 __iomem *reg_end = reg_addr + (count / sizeof(u64)); + int ret; + + /* not very efficient, but it works for now */ + if (reg_addr < dd->ipath_kregbase || reg_end > dd->ipath_kregend) { + ret = -EINVAL; + goto bail; + } + while (reg_addr < reg_end) { + u64 data = readq(reg_addr); + if (copy_to_user(uaddr, &data, sizeof(u64))) { + ret = -EFAULT; + goto bail; + } + reg_addr++; + uaddr += sizeof(u64); + } + ret = 0; +bail: + return ret; +} + +/** + * ipath_write_umem64 - write a 64-bit quantity to the chip from user space + * @dd: the infinipath device + * @caddr: the destination chip address (full pointer, not offset) + * @uaddr: the source of the data in user memory + * @count: the number of bytes to copy (multiple of 32 bits) + * + * This is usually used for a single qword + * NOTE: This assumes the chip address is 64-bit aligned. + */ + +static int ipath_write_umem64(struct ipath_devdata *dd, void __iomem *caddr, + const void __user *uaddr, size_t count) +{ + u64 __iomem *reg_addr = caddr; + const u64 __iomem *reg_end = reg_addr + (count / sizeof(u64)); + int ret; + + /* not very efficient, but it works for now */ + if (reg_addr < dd->ipath_kregbase || reg_end > dd->ipath_kregend) { + ret = -EINVAL; + goto bail; + } + while (reg_addr < reg_end) { + u64 data; + if (copy_from_user(&data, uaddr, sizeof(data))) { + ret = -EFAULT; + goto bail; + } + writeq(data, reg_addr); + + reg_addr++; + uaddr += sizeof(u64); + } + ret = 0; +bail: + return ret; +} + +/** + * ipath_read_umem32 - read a 32-bit quantity from the chip into user space + * @dd: the infinipath device + * @uaddr: the location to store the data in user memory + * @caddr: the source chip address (full pointer, not offset) + * @count: number of bytes to copy + * + * read 32 bit values, not 64 bit; for memories that only + * support 32 bit reads; usually a single dword. + */ +static int ipath_read_umem32(struct ipath_devdata *dd, void __user *uaddr, + const void __iomem *caddr, size_t count) +{ + const u32 __iomem *reg_addr = caddr; + const u32 __iomem *reg_end = reg_addr + (count / sizeof(u32)); + int ret; + + if (reg_addr < (u32 __iomem *) dd->ipath_kregbase || + reg_end > (u32 __iomem *) dd->ipath_kregend) { + ret = -EINVAL; + goto bail; + } + /* not very efficient, but it works for now */ + while (reg_addr < reg_end) { + u32 data = readl(reg_addr); + if (copy_to_user(uaddr, &data, sizeof(data))) { + ret = -EFAULT; + goto bail; + } + + reg_addr++; + uaddr += sizeof(u32); + + } + ret = 0; +bail: + return ret; +} + +/** + * ipath_write_umem32 - write a 32-bit quantity to the chip from user space + * @dd: the infinipath device + * @caddr: the destination chip address (full pointer, not offset) + * @uaddr: the source of the data in user memory + * @count: number of bytes to copy + * + * write 32 bit values, not 64 bit; for memories that only + * support 32 bit write; usually a single dword. 
+ */ + +static int ipath_write_umem32(struct ipath_devdata *dd, void __iomem *caddr, + const void __user *uaddr, size_t count) +{ + u32 __iomem *reg_addr = caddr; + const u32 __iomem *reg_end = reg_addr + (count / sizeof(u32)); + int ret; + + if (reg_addr < (u32 __iomem *) dd->ipath_kregbase || + reg_end > (u32 __iomem *) dd->ipath_kregend) { + ret = -EINVAL; + goto bail; + } + while (reg_addr < reg_end) { + u32 data; + if (copy_from_user(&data, uaddr, sizeof(data))) { + ret = -EFAULT; + goto bail; + } + writel(data, reg_addr); + + reg_addr++; + uaddr += sizeof(u32); + } + ret = 0; +bail: + return ret; +} + +static int ipath_diag_open(struct inode *in, struct file *fp) +{ + int unit = iminor(in) - IPATH_DIAG_MINOR_BASE; + struct ipath_devdata *dd; + int ret; + + mutex_lock(&ipath_mutex); + + if (ipath_diag_inuse) { + ret = -EBUSY; + goto bail; + } + + dd = ipath_lookup(unit); + + if (dd == NULL || !(dd->ipath_flags & IPATH_PRESENT) || + !dd->ipath_kregbase) { + ret = -ENODEV; + goto bail; + } + + fp->private_data = dd; + ipath_diag_inuse = -2; + diag_set_link = 0; + ret = 0; + + /* Only expose a way to reset the device if we + make it into diag mode. */ + ipath_expose_reset(&dd->pcidev->dev); + +bail: + mutex_unlock(&ipath_mutex); + + return ret; +} + +/** + * ipath_diagpkt_write - write an IB packet + * @fp: the diag data device file pointer + * @data: ipath_diag_pkt structure saying where to get the packet + * @count: size of data to write + * @off: unused by this code + */ +static ssize_t ipath_diagpkt_write(struct file *fp, + const char __user *data, + size_t count, loff_t *off) +{ + u32 __iomem *piobuf; + u32 plen, pbufn, maxlen_reserve; + struct ipath_diag_pkt odp; + struct ipath_diag_xpkt dp; + u32 *tmpbuf = NULL; + struct ipath_devdata *dd; + ssize_t ret = 0; + u64 val; + u32 l_state, lt_state; /* LinkState, LinkTrainingState */ + + + if (count == sizeof(dp)) { + if (copy_from_user(&dp, data, sizeof(dp))) { + ret = -EFAULT; + goto bail; + } + } else if (count == sizeof(odp)) { + if (copy_from_user(&odp, data, sizeof(odp))) { + ret = -EFAULT; + goto bail; + } + dp.len = odp.len; + dp.unit = odp.unit; + dp.data = odp.data; + dp.pbc_wd = 0; + } else { + ret = -EINVAL; + goto bail; + } + + /* send count must be an exact number of dwords */ + if (dp.len & 3) { + ret = -EINVAL; + goto bail; + } + + plen = dp.len >> 2; + + dd = ipath_lookup(dp.unit); + if (!dd || !(dd->ipath_flags & IPATH_PRESENT) || + !dd->ipath_kregbase) { + ipath_cdbg(VERBOSE, "illegal unit %u for diag data send\n", + dp.unit); + ret = -ENODEV; + goto bail; + } + + if (ipath_diag_inuse && !diag_set_link && + !(dd->ipath_flags & IPATH_LINKACTIVE)) { + diag_set_link = 1; + ipath_cdbg(VERBOSE, "Trying to set to set link active for " + "diag pkt\n"); + ipath_set_linkstate(dd, IPATH_IB_LINKARM); + ipath_set_linkstate(dd, IPATH_IB_LINKACTIVE); + } + + if (!(dd->ipath_flags & IPATH_INITTED)) { + /* no hardware, freeze, etc. */ + ipath_cdbg(VERBOSE, "unit %u not usable\n", dd->ipath_unit); + ret = -ENODEV; + goto bail; + } + /* + * Want to skip check for l_state if using custom PBC, + * because we might be trying to force an SM packet out. + * first-cut, skip _all_ state checking in that case. 
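A hedged, self-contained sketch of the size-based compatibility dispatch at the top of ipath_diagpkt_write() above; the struct layouts are simplified placeholders chosen so the two sizes differ, and only the "convert legacy to expanded, default pbc_wd to 0" behaviour mirrors the driver.

#include <stdint.h>
#include <stdio.h>
#include <string.h>

struct old_req { uint32_t unit; uint32_t len; uint64_t data; };  /* legacy, smaller */
struct new_req { uint64_t data; uint64_t pbc_wd; uint32_t unit; uint32_t len; };

static int parse_req(const void *buf, size_t count, struct new_req *dp)
{
	if (count == sizeof(*dp)) {
		memcpy(dp, buf, sizeof(*dp));
	} else if (count == sizeof(struct old_req)) {
		struct old_req odp;

		memcpy(&odp, buf, sizeof(odp));
		dp->data = odp.data;          /* promote the legacy fields */
		dp->unit = odp.unit;
		dp->len = odp.len;
		dp->pbc_wd = 0;               /* legacy callers get the default PBC */
	} else {
		return -1;                     /* unrecognized size */
	}
	return 0;
}

int main(void)
{
	struct old_req o = { 0, 64, 0x1000 };
	struct new_req n;

	if (!parse_req(&o, sizeof(o), &n))
		printf("unit %u len %u pbc_wd %llu\n", n.unit, n.len,
		       (unsigned long long)n.pbc_wd);
	return 0;
}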
+ */ + val = ipath_ib_state(dd, dd->ipath_lastibcstat); + lt_state = ipath_ib_linktrstate(dd, dd->ipath_lastibcstat); + l_state = ipath_ib_linkstate(dd, dd->ipath_lastibcstat); + if (!dp.pbc_wd && (lt_state != INFINIPATH_IBCS_LT_STATE_LINKUP || + (val != dd->ib_init && val != dd->ib_arm && + val != dd->ib_active))) { + ipath_cdbg(VERBOSE, "unit %u not ready (state %llx)\n", + dd->ipath_unit, (unsigned long long) val); + ret = -EINVAL; + goto bail; + } + + /* + * need total length before first word written, plus 2 Dwords. One Dword + * is for padding so we get the full user data when not aligned on + * a word boundary. The other Dword is to make sure we have room for the + * ICRC which gets tacked on later. + */ + maxlen_reserve = 2 * sizeof(u32); + if (dp.len > dd->ipath_ibmaxlen - maxlen_reserve) { + ipath_dbg("Pkt len 0x%x > ibmaxlen %x\n", + dp.len, dd->ipath_ibmaxlen); + ret = -EINVAL; + goto bail; + } + + plen = sizeof(u32) + dp.len; + + tmpbuf = vmalloc(plen); + if (!tmpbuf) { + dev_info(&dd->pcidev->dev, "Unable to allocate tmp buffer, " + "failing\n"); + ret = -ENOMEM; + goto bail; + } + + if (copy_from_user(tmpbuf, + (const void __user *) (unsigned long) dp.data, + dp.len)) { + ret = -EFAULT; + goto bail; + } + + plen >>= 2; /* in dwords */ + + piobuf = ipath_getpiobuf(dd, plen, &pbufn); + if (!piobuf) { + ipath_cdbg(VERBOSE, "No PIO buffers avail unit for %u\n", + dd->ipath_unit); + ret = -EBUSY; + goto bail; + } + /* disarm it just to be extra sure */ + ipath_disarm_piobufs(dd, pbufn, 1); + + if (ipath_debug & __IPATH_PKTDBG) + ipath_cdbg(VERBOSE, "unit %u 0x%x+1w pio%d\n", + dd->ipath_unit, plen - 1, pbufn); + + if (dp.pbc_wd == 0) + dp.pbc_wd = plen; + writeq(dp.pbc_wd, piobuf); + /* + * Copy all by the trigger word, then flush, so it's written + * to chip before trigger word, then write trigger word, then + * flush again, so packet is sent. 
+ */ + if (dd->ipath_flags & IPATH_PIO_FLUSH_WC) { + ipath_flush_wc(); + __iowrite32_copy(piobuf + 2, tmpbuf, plen - 1); + ipath_flush_wc(); + __raw_writel(tmpbuf[plen - 1], piobuf + plen + 1); + } else + __iowrite32_copy(piobuf + 2, tmpbuf, plen); + + ipath_flush_wc(); + + ret = sizeof(dp); + +bail: + vfree(tmpbuf); + return ret; +} + +static int ipath_diag_release(struct inode *in, struct file *fp) +{ + mutex_lock(&ipath_mutex); + ipath_diag_inuse = 0; + fp->private_data = NULL; + mutex_unlock(&ipath_mutex); + return 0; +} + +static ssize_t ipath_diag_read(struct file *fp, char __user *data, + size_t count, loff_t *off) +{ + struct ipath_devdata *dd = fp->private_data; + void __iomem *kreg_base; + ssize_t ret; + + kreg_base = dd->ipath_kregbase; + + if (count == 0) + ret = 0; + else if ((count % 4) || (*off % 4)) + /* address or length is not 32-bit aligned, hence invalid */ + ret = -EINVAL; + else if (ipath_diag_inuse < 1 && (*off || count != 8)) + ret = -EINVAL; /* prevent cat /dev/ipath_diag* */ + else if ((count % 8) || (*off % 8)) + /* address or length not 64-bit aligned; do 32-bit reads */ + ret = ipath_read_umem32(dd, data, kreg_base + *off, count); + else + ret = ipath_read_umem64(dd, data, kreg_base + *off, count); + + if (ret >= 0) { + *off += count; + ret = count; + if (ipath_diag_inuse == -2) + ipath_diag_inuse++; + } + + return ret; +} + +static ssize_t ipath_diag_write(struct file *fp, const char __user *data, + size_t count, loff_t *off) +{ + struct ipath_devdata *dd = fp->private_data; + void __iomem *kreg_base; + ssize_t ret; + + kreg_base = dd->ipath_kregbase; + + if (count == 0) + ret = 0; + else if ((count % 4) || (*off % 4)) + /* address or length is not 32-bit aligned, hence invalid */ + ret = -EINVAL; + else if ((ipath_diag_inuse == -1 && (*off || count != 8)) || + ipath_diag_inuse == -2) /* read qw off 0, write qw off 0 */ + ret = -EINVAL; /* before any other write allowed */ + else if ((count % 8) || (*off % 8)) + /* address or length not 64-bit aligned; do 32-bit writes */ + ret = ipath_write_umem32(dd, kreg_base + *off, data, count); + else + ret = ipath_write_umem64(dd, kreg_base + *off, data, count); + + if (ret >= 0) { + *off += count; + ret = count; + if (ipath_diag_inuse == -1) + ipath_diag_inuse = 1; /* all read/write OK now */ + } + + return ret; +} diff --git a/kernel/drivers/infiniband/hw/ipath/ipath_dma.c b/kernel/drivers/infiniband/hw/ipath/ipath_dma.c new file mode 100644 index 000000000..123a8c053 --- /dev/null +++ b/kernel/drivers/infiniband/hw/ipath/ipath_dma.c @@ -0,0 +1,179 @@ +/* + * Copyright (c) 2006 QLogic, Corporation. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. 
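A hedged user-space sketch of the access rules enforced by ipath_diag_read()/ipath_diag_write() above; the /dev/ipath_diag0 path is an assumption based on the "ipath_diag%d" device name, and the sketch stops short of the register write a real diag tool would perform.

#include <fcntl.h>
#include <stdint.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	uint64_t qword;
	int fd = open("/dev/ipath_diag0", O_RDWR);  /* assumed device path */

	if (fd < 0) { perror("open"); return 1; }

	/* Mandatory first access: exactly 8 bytes at register offset 0;
	 * anything else is rejected while ipath_diag_inuse is still -2/-1. */
	if (pread(fd, &qword, sizeof(qword), 0) != (ssize_t)sizeof(qword))
		perror("initial qword read");
	else
		printf("offset 0: 0x%llx\n", (unsigned long long)qword);

	/* Only after a matching initial 8-byte write at offset 0 does the
	 * driver mark the session fully open (ipath_diag_inuse == 1) and
	 * accept arbitrary 4- or 8-byte aligned reads and writes; accesses
	 * that are 4- but not 8-byte aligned then take the 32-bit path. */

	close(fd);
	return 0;
}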
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include +#include +#include + +#include "ipath_verbs.h" + +#define BAD_DMA_ADDRESS ((u64) 0) + +/* + * The following functions implement driver specific replacements + * for the ib_dma_*() functions. + * + * These functions return kernel virtual addresses instead of + * device bus addresses since the driver uses the CPU to copy + * data instead of using hardware DMA. + */ + +static int ipath_mapping_error(struct ib_device *dev, u64 dma_addr) +{ + return dma_addr == BAD_DMA_ADDRESS; +} + +static u64 ipath_dma_map_single(struct ib_device *dev, + void *cpu_addr, size_t size, + enum dma_data_direction direction) +{ + BUG_ON(!valid_dma_direction(direction)); + return (u64) cpu_addr; +} + +static void ipath_dma_unmap_single(struct ib_device *dev, + u64 addr, size_t size, + enum dma_data_direction direction) +{ + BUG_ON(!valid_dma_direction(direction)); +} + +static u64 ipath_dma_map_page(struct ib_device *dev, + struct page *page, + unsigned long offset, + size_t size, + enum dma_data_direction direction) +{ + u64 addr; + + BUG_ON(!valid_dma_direction(direction)); + + if (offset + size > PAGE_SIZE) { + addr = BAD_DMA_ADDRESS; + goto done; + } + + addr = (u64) page_address(page); + if (addr) + addr += offset; + /* TODO: handle highmem pages */ + +done: + return addr; +} + +static void ipath_dma_unmap_page(struct ib_device *dev, + u64 addr, size_t size, + enum dma_data_direction direction) +{ + BUG_ON(!valid_dma_direction(direction)); +} + +static int ipath_map_sg(struct ib_device *dev, struct scatterlist *sgl, + int nents, enum dma_data_direction direction) +{ + struct scatterlist *sg; + u64 addr; + int i; + int ret = nents; + + BUG_ON(!valid_dma_direction(direction)); + + for_each_sg(sgl, sg, nents, i) { + addr = (u64) page_address(sg_page(sg)); + /* TODO: handle highmem pages */ + if (!addr) { + ret = 0; + break; + } + sg->dma_address = addr + sg->offset; +#ifdef CONFIG_NEED_SG_DMA_LENGTH + sg->dma_length = sg->length; +#endif + } + return ret; +} + +static void ipath_unmap_sg(struct ib_device *dev, + struct scatterlist *sg, int nents, + enum dma_data_direction direction) +{ + BUG_ON(!valid_dma_direction(direction)); +} + +static void ipath_sync_single_for_cpu(struct ib_device *dev, + u64 addr, + size_t size, + enum dma_data_direction dir) +{ +} + +static void ipath_sync_single_for_device(struct ib_device *dev, + u64 addr, + size_t size, + enum dma_data_direction dir) +{ +} + +static void *ipath_dma_alloc_coherent(struct ib_device *dev, size_t size, + u64 *dma_handle, gfp_t flag) +{ + struct page *p; + void *addr = NULL; + + p = alloc_pages(flag, get_order(size)); + if (p) + addr = page_address(p); + if (dma_handle) + *dma_handle = (u64) addr; + return addr; +} + +static void ipath_dma_free_coherent(struct ib_device *dev, size_t size, + void *cpu_addr, u64 dma_handle) +{ + free_pages((unsigned long) cpu_addr, get_order(size)); +} + +struct ib_dma_mapping_ops ipath_dma_mapping_ops = { + .mapping_error = ipath_mapping_error, + .map_single = ipath_dma_map_single, + .unmap_single = 
ipath_dma_unmap_single, + .map_page = ipath_dma_map_page, + .unmap_page = ipath_dma_unmap_page, + .map_sg = ipath_map_sg, + .unmap_sg = ipath_unmap_sg, + .sync_single_for_cpu = ipath_sync_single_for_cpu, + .sync_single_for_device = ipath_sync_single_for_device, + .alloc_coherent = ipath_dma_alloc_coherent, + .free_coherent = ipath_dma_free_coherent +}; diff --git a/kernel/drivers/infiniband/hw/ipath/ipath_driver.c b/kernel/drivers/infiniband/hw/ipath/ipath_driver.c new file mode 100644 index 000000000..bd0caedaf --- /dev/null +++ b/kernel/drivers/infiniband/hw/ipath/ipath_driver.c @@ -0,0 +1,2779 @@ +/* + * Copyright (c) 2006, 2007, 2008 QLogic Corporation. All rights reserved. + * Copyright (c) 2003, 2004, 2005, 2006 PathScale, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "ipath_kernel.h" +#include "ipath_verbs.h" + +static void ipath_update_pio_bufs(struct ipath_devdata *); + +const char *ipath_get_unit_name(int unit) +{ + static char iname[16]; + snprintf(iname, sizeof iname, "infinipath%u", unit); + return iname; +} + +#define DRIVER_LOAD_MSG "QLogic " IPATH_DRV_NAME " loaded: " +#define PFX IPATH_DRV_NAME ": " + +/* + * The size has to be longer than this string, so we can append + * board/chip information to it in the init code. 
+ */ +const char ib_ipath_version[] = IPATH_IDSTR "\n"; + +static struct idr unit_table; +DEFINE_SPINLOCK(ipath_devs_lock); +LIST_HEAD(ipath_dev_list); + +wait_queue_head_t ipath_state_wait; + +unsigned ipath_debug = __IPATH_INFO; + +module_param_named(debug, ipath_debug, uint, S_IWUSR | S_IRUGO); +MODULE_PARM_DESC(debug, "mask for debug prints"); +EXPORT_SYMBOL_GPL(ipath_debug); + +unsigned ipath_mtu4096 = 1; /* max 4KB IB mtu by default, if supported */ +module_param_named(mtu4096, ipath_mtu4096, uint, S_IRUGO); +MODULE_PARM_DESC(mtu4096, "enable MTU of 4096 bytes, if supported"); + +static unsigned ipath_hol_timeout_ms = 13000; +module_param_named(hol_timeout_ms, ipath_hol_timeout_ms, uint, S_IRUGO); +MODULE_PARM_DESC(hol_timeout_ms, + "duration of user app suspension after link failure"); + +unsigned ipath_linkrecovery = 1; +module_param_named(linkrecovery, ipath_linkrecovery, uint, S_IWUSR | S_IRUGO); +MODULE_PARM_DESC(linkrecovery, "enable workaround for link recovery issue"); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("QLogic "); +MODULE_DESCRIPTION("QLogic InfiniPath driver"); + +/* + * Table to translate the LINKTRAININGSTATE portion of + * IBCStatus to a human-readable form. + */ +const char *ipath_ibcstatus_str[] = { + "Disabled", + "LinkUp", + "PollActive", + "PollQuiet", + "SleepDelay", + "SleepQuiet", + "LState6", /* unused */ + "LState7", /* unused */ + "CfgDebounce", + "CfgRcvfCfg", + "CfgWaitRmt", + "CfgIdle", + "RecovRetrain", + "CfgTxRevLane", /* unused before IBA7220 */ + "RecovWaitRmt", + "RecovIdle", + /* below were added for IBA7220 */ + "CfgEnhanced", + "CfgTest", + "CfgWaitRmtTest", + "CfgWaitCfgEnhanced", + "SendTS_T", + "SendTstIdles", + "RcvTS_T", + "SendTst_TS1s", + "LTState18", "LTState19", "LTState1A", "LTState1B", + "LTState1C", "LTState1D", "LTState1E", "LTState1F" +}; + +static void ipath_remove_one(struct pci_dev *); +static int ipath_init_one(struct pci_dev *, const struct pci_device_id *); + +/* Only needed for registration, nothing else needs this info */ +#define PCI_VENDOR_ID_PATHSCALE 0x1fc1 +#define PCI_DEVICE_ID_INFINIPATH_HT 0xd + +/* Number of seconds before our card status check... 
*/ +#define STATUS_TIMEOUT 60 + +static const struct pci_device_id ipath_pci_tbl[] = { + { PCI_DEVICE(PCI_VENDOR_ID_PATHSCALE, PCI_DEVICE_ID_INFINIPATH_HT) }, + { 0, } +}; + +MODULE_DEVICE_TABLE(pci, ipath_pci_tbl); + +static struct pci_driver ipath_driver = { + .name = IPATH_DRV_NAME, + .probe = ipath_init_one, + .remove = ipath_remove_one, + .id_table = ipath_pci_tbl, + .driver = { + .groups = ipath_driver_attr_groups, + }, +}; + +static inline void read_bars(struct ipath_devdata *dd, struct pci_dev *dev, + u32 *bar0, u32 *bar1) +{ + int ret; + + ret = pci_read_config_dword(dev, PCI_BASE_ADDRESS_0, bar0); + if (ret) + ipath_dev_err(dd, "failed to read bar0 before enable: " + "error %d\n", -ret); + + ret = pci_read_config_dword(dev, PCI_BASE_ADDRESS_1, bar1); + if (ret) + ipath_dev_err(dd, "failed to read bar1 before enable: " + "error %d\n", -ret); + + ipath_dbg("Read bar0 %x bar1 %x\n", *bar0, *bar1); +} + +static void ipath_free_devdata(struct pci_dev *pdev, + struct ipath_devdata *dd) +{ + unsigned long flags; + + pci_set_drvdata(pdev, NULL); + + if (dd->ipath_unit != -1) { + spin_lock_irqsave(&ipath_devs_lock, flags); + idr_remove(&unit_table, dd->ipath_unit); + list_del(&dd->ipath_list); + spin_unlock_irqrestore(&ipath_devs_lock, flags); + } + vfree(dd); +} + +static struct ipath_devdata *ipath_alloc_devdata(struct pci_dev *pdev) +{ + unsigned long flags; + struct ipath_devdata *dd; + int ret; + + dd = vzalloc(sizeof(*dd)); + if (!dd) { + dd = ERR_PTR(-ENOMEM); + goto bail; + } + dd->ipath_unit = -1; + + idr_preload(GFP_KERNEL); + spin_lock_irqsave(&ipath_devs_lock, flags); + + ret = idr_alloc(&unit_table, dd, 0, 0, GFP_NOWAIT); + if (ret < 0) { + printk(KERN_ERR IPATH_DRV_NAME + ": Could not allocate unit ID: error %d\n", -ret); + ipath_free_devdata(pdev, dd); + dd = ERR_PTR(ret); + goto bail_unlock; + } + dd->ipath_unit = ret; + + dd->pcidev = pdev; + pci_set_drvdata(pdev, dd); + + list_add(&dd->ipath_list, &ipath_dev_list); + +bail_unlock: + spin_unlock_irqrestore(&ipath_devs_lock, flags); + idr_preload_end(); +bail: + return dd; +} + +static inline struct ipath_devdata *__ipath_lookup(int unit) +{ + return idr_find(&unit_table, unit); +} + +struct ipath_devdata *ipath_lookup(int unit) +{ + struct ipath_devdata *dd; + unsigned long flags; + + spin_lock_irqsave(&ipath_devs_lock, flags); + dd = __ipath_lookup(unit); + spin_unlock_irqrestore(&ipath_devs_lock, flags); + + return dd; +} + +int ipath_count_units(int *npresentp, int *nupp, int *maxportsp) +{ + int nunits, npresent, nup; + struct ipath_devdata *dd; + unsigned long flags; + int maxports; + + nunits = npresent = nup = maxports = 0; + + spin_lock_irqsave(&ipath_devs_lock, flags); + + list_for_each_entry(dd, &ipath_dev_list, ipath_list) { + nunits++; + if ((dd->ipath_flags & IPATH_PRESENT) && dd->ipath_kregbase) + npresent++; + if (dd->ipath_lid && + !(dd->ipath_flags & (IPATH_DISABLED | IPATH_LINKDOWN + | IPATH_LINKUNK))) + nup++; + if (dd->ipath_cfgports > maxports) + maxports = dd->ipath_cfgports; + } + + spin_unlock_irqrestore(&ipath_devs_lock, flags); + + if (npresentp) + *npresentp = npresent; + if (nupp) + *nupp = nup; + if (maxportsp) + *maxportsp = maxports; + + return nunits; +} + +/* + * These next two routines are placeholders in case we don't have per-arch + * code for controlling write combining. If explicit control of write + * combining is not available, performance will probably be awful. 
+ */ + +int __attribute__((weak)) ipath_enable_wc(struct ipath_devdata *dd) +{ + return -EOPNOTSUPP; +} + +void __attribute__((weak)) ipath_disable_wc(struct ipath_devdata *dd) +{ +} + +/* + * Perform a PIO buffer bandwidth write test, to verify proper system + * configuration. Even when all the setup calls work, occasionally + * BIOS or other issues can prevent write combining from working, or + * can cause other bandwidth problems to the chip. + * + * This test simply writes the same buffer over and over again, and + * measures close to the peak bandwidth to the chip (not testing + * data bandwidth to the wire). On chips that use an address-based + * trigger to send packets to the wire, this is easy. On chips that + * use a count to trigger, we want to make sure that the packet doesn't + * go out on the wire, or trigger flow control checks. + */ +static void ipath_verify_pioperf(struct ipath_devdata *dd) +{ + u32 pbnum, cnt, lcnt; + u32 __iomem *piobuf; + u32 *addr; + u64 msecs, emsecs; + + piobuf = ipath_getpiobuf(dd, 0, &pbnum); + if (!piobuf) { + dev_info(&dd->pcidev->dev, + "No PIObufs for checking perf, skipping\n"); + return; + } + + /* + * Enough to give us a reasonable test, less than piobuf size, and + * likely multiple of store buffer length. + */ + cnt = 1024; + + addr = vmalloc(cnt); + if (!addr) { + dev_info(&dd->pcidev->dev, + "Couldn't get memory for checking PIO perf," + " skipping\n"); + goto done; + } + + preempt_disable(); /* we want reasonably accurate elapsed time */ + msecs = 1 + jiffies_to_msecs(jiffies); + for (lcnt = 0; lcnt < 10000U; lcnt++) { + /* wait until we cross msec boundary */ + if (jiffies_to_msecs(jiffies) >= msecs) + break; + udelay(1); + } + + ipath_disable_armlaunch(dd); + + /* + * length 0, no dwords actually sent, and mark as VL15 + * on chips where that may matter (due to IB flowcontrol) + */ + if ((dd->ipath_flags & IPATH_HAS_PBC_CNT)) + writeq(1UL << 63, piobuf); + else + writeq(0, piobuf); + ipath_flush_wc(); + + /* + * this is only roughly accurate, since even with preempt we + * still take interrupts that could take a while. Running for + * >= 5 msec seems to get us "close enough" to accurate values + */ + msecs = jiffies_to_msecs(jiffies); + for (emsecs = lcnt = 0; emsecs <= 5UL; lcnt++) { + __iowrite32_copy(piobuf + 64, addr, cnt >> 2); + emsecs = jiffies_to_msecs(jiffies) - msecs; + } + + /* 1 GiB/sec, slightly over IB SDR line rate */ + if (lcnt < (emsecs * 1024U)) + ipath_dev_err(dd, + "Performance problem: bandwidth to PIO buffers is " + "only %u MiB/sec\n", + lcnt / (u32) emsecs); + else + ipath_dbg("PIO buffer bandwidth %u MiB/sec is OK\n", + lcnt / (u32) emsecs); + + preempt_enable(); + + vfree(addr); + +done: + /* disarm piobuf, so it's available again */ + ipath_disarm_piobufs(dd, pbnum, 1); + ipath_enable_armlaunch(dd); +} + +static void cleanup_device(struct ipath_devdata *dd); + +static int ipath_init_one(struct pci_dev *pdev, const struct pci_device_id *ent) +{ + int ret, len, j; + struct ipath_devdata *dd; + unsigned long long addr; + u32 bar0 = 0, bar1 = 0; + + dd = ipath_alloc_devdata(pdev); + if (IS_ERR(dd)) { + ret = PTR_ERR(dd); + printk(KERN_ERR IPATH_DRV_NAME + ": Could not allocate devdata: error %d\n", -ret); + goto bail; + } + + ipath_cdbg(VERBOSE, "initializing unit #%u\n", dd->ipath_unit); + + ret = pci_enable_device(pdev); + if (ret) { + /* This can happen iff: + * + * We did a chip reset, and then failed to reprogram the + * BAR, or the chip reset due to an internal error. 
We then + * unloaded the driver and reloaded it. + * + * Both reset cases set the BAR back to initial state. For + * the latter case, the AER sticky error bit at offset 0x718 + * should be set, but the Linux kernel doesn't yet know + * about that, it appears. If the original BAR was retained + * in the kernel data structures, this may be OK. + */ + ipath_dev_err(dd, "enable unit %d failed: error %d\n", + dd->ipath_unit, -ret); + goto bail_devdata; + } + addr = pci_resource_start(pdev, 0); + len = pci_resource_len(pdev, 0); + ipath_cdbg(VERBOSE, "regbase (0) %llx len %d irq %d, vend %x/%x " + "driver_data %lx\n", addr, len, pdev->irq, ent->vendor, + ent->device, ent->driver_data); + + read_bars(dd, pdev, &bar0, &bar1); + + if (!bar1 && !(bar0 & ~0xf)) { + if (addr) { + dev_info(&pdev->dev, "BAR is 0 (probable RESET), " + "rewriting as %llx\n", addr); + ret = pci_write_config_dword( + pdev, PCI_BASE_ADDRESS_0, addr); + if (ret) { + ipath_dev_err(dd, "rewrite of BAR0 " + "failed: err %d\n", -ret); + goto bail_disable; + } + ret = pci_write_config_dword( + pdev, PCI_BASE_ADDRESS_1, addr >> 32); + if (ret) { + ipath_dev_err(dd, "rewrite of BAR1 " + "failed: err %d\n", -ret); + goto bail_disable; + } + } else { + ipath_dev_err(dd, "BAR is 0 (probable RESET), " + "not usable until reboot\n"); + ret = -ENODEV; + goto bail_disable; + } + } + + ret = pci_request_regions(pdev, IPATH_DRV_NAME); + if (ret) { + dev_info(&pdev->dev, "pci_request_regions unit %u fails: " + "err %d\n", dd->ipath_unit, -ret); + goto bail_disable; + } + + ret = pci_set_dma_mask(pdev, DMA_BIT_MASK(64)); + if (ret) { + /* + * if the 64 bit setup fails, try 32 bit. Some systems + * do not setup 64 bit maps on systems with 2GB or less + * memory installed. + */ + ret = pci_set_dma_mask(pdev, DMA_BIT_MASK(32)); + if (ret) { + dev_info(&pdev->dev, + "Unable to set DMA mask for unit %u: %d\n", + dd->ipath_unit, ret); + goto bail_regions; + } + else { + ipath_dbg("No 64bit DMA mask, used 32 bit mask\n"); + ret = pci_set_consistent_dma_mask(pdev, DMA_BIT_MASK(32)); + if (ret) + dev_info(&pdev->dev, + "Unable to set DMA consistent mask " + "for unit %u: %d\n", + dd->ipath_unit, ret); + + } + } + else { + ret = pci_set_consistent_dma_mask(pdev, DMA_BIT_MASK(64)); + if (ret) + dev_info(&pdev->dev, + "Unable to set DMA consistent mask " + "for unit %u: %d\n", + dd->ipath_unit, ret); + } + + pci_set_master(pdev); + + /* + * Save BARs to rewrite after device reset. Save all 64 bits of + * BAR, just in case. + */ + dd->ipath_pcibar0 = addr; + dd->ipath_pcibar1 = addr >> 32; + dd->ipath_deviceid = ent->device; /* save for later use */ + dd->ipath_vendorid = ent->vendor; + + /* setup the chip-specific functions, as early as possible. 
*/ + switch (ent->device) { + case PCI_DEVICE_ID_INFINIPATH_HT: + ipath_init_iba6110_funcs(dd); + break; + + default: + ipath_dev_err(dd, "Found unknown QLogic deviceid 0x%x, " + "failing\n", ent->device); + return -ENODEV; + } + + for (j = 0; j < 6; j++) { + if (!pdev->resource[j].start) + continue; + ipath_cdbg(VERBOSE, "BAR %d %pR, len %llx\n", + j, &pdev->resource[j], + (unsigned long long)pci_resource_len(pdev, j)); + } + + if (!addr) { + ipath_dev_err(dd, "No valid address in BAR 0!\n"); + ret = -ENODEV; + goto bail_regions; + } + + dd->ipath_pcirev = pdev->revision; + +#if defined(__powerpc__) + /* There isn't a generic way to specify writethrough mappings */ + dd->ipath_kregbase = __ioremap(addr, len, + (_PAGE_NO_CACHE|_PAGE_WRITETHRU)); +#else + dd->ipath_kregbase = ioremap_nocache(addr, len); +#endif + + if (!dd->ipath_kregbase) { + ipath_dbg("Unable to map io addr %llx to kvirt, failing\n", + addr); + ret = -ENOMEM; + goto bail_iounmap; + } + dd->ipath_kregend = (u64 __iomem *) + ((void __iomem *)dd->ipath_kregbase + len); + dd->ipath_physaddr = addr; /* used for io_remap, etc. */ + /* for user mmap */ + ipath_cdbg(VERBOSE, "mapped io addr %llx to kregbase %p\n", + addr, dd->ipath_kregbase); + + if (dd->ipath_f_bus(dd, pdev)) + ipath_dev_err(dd, "Failed to setup config space; " + "continuing anyway\n"); + + /* + * set up our interrupt handler; IRQF_SHARED probably not needed, + * since MSI interrupts shouldn't be shared but won't hurt for now. + * check 0 irq after we return from chip-specific bus setup, since + * that can affect this due to setup + */ + if (!dd->ipath_irq) + ipath_dev_err(dd, "irq is 0, BIOS error? Interrupts won't " + "work\n"); + else { + ret = request_irq(dd->ipath_irq, ipath_intr, IRQF_SHARED, + IPATH_DRV_NAME, dd); + if (ret) { + ipath_dev_err(dd, "Couldn't setup irq handler, " + "irq=%d: %d\n", dd->ipath_irq, ret); + goto bail_iounmap; + } + } + + ret = ipath_init_chip(dd, 0); /* do the chip-specific init */ + if (ret) + goto bail_irqsetup; + + ret = ipath_enable_wc(dd); + + if (ret) { + ipath_dev_err(dd, "Write combining not enabled " + "(err %d): performance may be poor\n", + -ret); + ret = 0; + } + + ipath_verify_pioperf(dd); + + ipath_device_create_group(&pdev->dev, dd); + ipathfs_add_device(dd); + ipath_user_add(dd); + ipath_diag_add(dd); + ipath_register_ib_device(dd); + + goto bail; + +bail_irqsetup: + cleanup_device(dd); + + if (dd->ipath_irq) + dd->ipath_f_free_irq(dd); + + if (dd->ipath_f_cleanup) + dd->ipath_f_cleanup(dd); + +bail_iounmap: + iounmap((volatile void __iomem *) dd->ipath_kregbase); + +bail_regions: + pci_release_regions(pdev); + +bail_disable: + pci_disable_device(pdev); + +bail_devdata: + ipath_free_devdata(pdev, dd); + +bail: + return ret; +} + +static void cleanup_device(struct ipath_devdata *dd) +{ + int port; + struct ipath_portdata **tmp; + unsigned long flags; + + if (*dd->ipath_statusp & IPATH_STATUS_CHIP_PRESENT) { + /* can't do anything more with chip; needs re-init */ + *dd->ipath_statusp &= ~IPATH_STATUS_CHIP_PRESENT; + if (dd->ipath_kregbase) { + /* + * if we haven't already cleaned up before these are + * to ensure any register reads/writes "fail" until + * re-init + */ + dd->ipath_kregbase = NULL; + dd->ipath_uregbase = 0; + dd->ipath_sregbase = 0; + dd->ipath_cregbase = 0; + dd->ipath_kregsize = 0; + } + ipath_disable_wc(dd); + } + + if (dd->ipath_spectriggerhit) + dev_info(&dd->pcidev->dev, "%lu special trigger hits\n", + dd->ipath_spectriggerhit); + + if (dd->ipath_pioavailregs_dma) { + 
dma_free_coherent(&dd->pcidev->dev, PAGE_SIZE, + (void *) dd->ipath_pioavailregs_dma, + dd->ipath_pioavailregs_phys); + dd->ipath_pioavailregs_dma = NULL; + } + if (dd->ipath_dummy_hdrq) { + dma_free_coherent(&dd->pcidev->dev, + dd->ipath_pd[0]->port_rcvhdrq_size, + dd->ipath_dummy_hdrq, dd->ipath_dummy_hdrq_phys); + dd->ipath_dummy_hdrq = NULL; + } + + if (dd->ipath_pageshadow) { + struct page **tmpp = dd->ipath_pageshadow; + dma_addr_t *tmpd = dd->ipath_physshadow; + int i, cnt = 0; + + ipath_cdbg(VERBOSE, "Unlocking any expTID pages still " + "locked\n"); + for (port = 0; port < dd->ipath_cfgports; port++) { + int port_tidbase = port * dd->ipath_rcvtidcnt; + int maxtid = port_tidbase + dd->ipath_rcvtidcnt; + for (i = port_tidbase; i < maxtid; i++) { + if (!tmpp[i]) + continue; + pci_unmap_page(dd->pcidev, tmpd[i], + PAGE_SIZE, PCI_DMA_FROMDEVICE); + ipath_release_user_pages(&tmpp[i], 1); + tmpp[i] = NULL; + cnt++; + } + } + if (cnt) { + ipath_stats.sps_pageunlocks += cnt; + ipath_cdbg(VERBOSE, "There were still %u expTID " + "entries locked\n", cnt); + } + if (ipath_stats.sps_pagelocks || + ipath_stats.sps_pageunlocks) + ipath_cdbg(VERBOSE, "%llu pages locked, %llu " + "unlocked via ipath_m{un}lock\n", + (unsigned long long) + ipath_stats.sps_pagelocks, + (unsigned long long) + ipath_stats.sps_pageunlocks); + + ipath_cdbg(VERBOSE, "Free shadow page tid array at %p\n", + dd->ipath_pageshadow); + tmpp = dd->ipath_pageshadow; + dd->ipath_pageshadow = NULL; + vfree(tmpp); + + dd->ipath_egrtidbase = NULL; + } + + /* + * free any resources still in use (usually just kernel ports) + * at unload; we do for portcnt, because that's what we allocate. + * We acquire lock to be really paranoid that ipath_pd isn't being + * accessed from some interrupt-related code (that should not happen, + * but best to be sure). + */ + spin_lock_irqsave(&dd->ipath_uctxt_lock, flags); + tmp = dd->ipath_pd; + dd->ipath_pd = NULL; + spin_unlock_irqrestore(&dd->ipath_uctxt_lock, flags); + for (port = 0; port < dd->ipath_portcnt; port++) { + struct ipath_portdata *pd = tmp[port]; + tmp[port] = NULL; /* debugging paranoia */ + ipath_free_pddata(dd, pd); + } + kfree(tmp); +} + +static void ipath_remove_one(struct pci_dev *pdev) +{ + struct ipath_devdata *dd = pci_get_drvdata(pdev); + + ipath_cdbg(VERBOSE, "removing, pdev=%p, dd=%p\n", pdev, dd); + + /* + * disable the IB link early, to be sure no new packets arrive, which + * complicates the shutdown process + */ + ipath_shutdown_device(dd); + + flush_workqueue(ib_wq); + + if (dd->verbs_dev) + ipath_unregister_ib_device(dd->verbs_dev); + + ipath_diag_remove(dd); + ipath_user_remove(dd); + ipathfs_remove_device(dd); + ipath_device_remove_group(&pdev->dev, dd); + + ipath_cdbg(VERBOSE, "Releasing pci memory regions, dd %p, " + "unit %u\n", dd, (u32) dd->ipath_unit); + + cleanup_device(dd); + + /* + * turn off rcv, send, and interrupts for all ports, all drivers + * should also hard reset the chip here? + * free up port 0 (kernel) rcvhdr, egr bufs, and eventually tid bufs + * for all versions of the driver, if they were allocated + */ + if (dd->ipath_irq) { + ipath_cdbg(VERBOSE, "unit %u free irq %d\n", + dd->ipath_unit, dd->ipath_irq); + dd->ipath_f_free_irq(dd); + } else + ipath_dbg("irq is 0, not doing free_irq " + "for unit %u\n", dd->ipath_unit); + /* + * we check for NULL here, because it's outside + * the kregbase check, and we need to call it + * after the free_irq. Thus it's possible that + * the function pointers were never initialized. 
+ */ + if (dd->ipath_f_cleanup) + /* clean up chip-specific stuff */ + dd->ipath_f_cleanup(dd); + + ipath_cdbg(VERBOSE, "Unmapping kregbase %p\n", dd->ipath_kregbase); + iounmap((volatile void __iomem *) dd->ipath_kregbase); + pci_release_regions(pdev); + ipath_cdbg(VERBOSE, "calling pci_disable_device\n"); + pci_disable_device(pdev); + + ipath_free_devdata(pdev, dd); +} + +/* general driver use */ +DEFINE_MUTEX(ipath_mutex); + +static DEFINE_SPINLOCK(ipath_pioavail_lock); + +/** + * ipath_disarm_piobufs - cancel a range of PIO buffers + * @dd: the infinipath device + * @first: the first PIO buffer to cancel + * @cnt: the number of PIO buffers to cancel + * + * cancel a range of PIO buffers, used when they might be armed, but + * not triggered. Used at init to ensure buffer state, and also user + * process close, in case it died while writing to a PIO buffer + * Also after errors. + */ +void ipath_disarm_piobufs(struct ipath_devdata *dd, unsigned first, + unsigned cnt) +{ + unsigned i, last = first + cnt; + unsigned long flags; + + ipath_cdbg(PKT, "disarm %u PIObufs first=%u\n", cnt, first); + for (i = first; i < last; i++) { + spin_lock_irqsave(&dd->ipath_sendctrl_lock, flags); + /* + * The disarm-related bits are write-only, so it + * is ok to OR them in with our copy of sendctrl + * while we hold the lock. + */ + ipath_write_kreg(dd, dd->ipath_kregs->kr_sendctrl, + dd->ipath_sendctrl | INFINIPATH_S_DISARM | + (i << INFINIPATH_S_DISARMPIOBUF_SHIFT)); + /* can't disarm bufs back-to-back per iba7220 spec */ + ipath_read_kreg64(dd, dd->ipath_kregs->kr_scratch); + spin_unlock_irqrestore(&dd->ipath_sendctrl_lock, flags); + } + /* on some older chips, update may not happen after cancel */ + ipath_force_pio_avail_update(dd); +} + +/** + * ipath_wait_linkstate - wait for an IB link state change to occur + * @dd: the infinipath device + * @state: the state to wait for + * @msecs: the number of milliseconds to wait + * + * wait up to msecs milliseconds for IB link state change to occur for + * now, take the easy polling route. Currently used only by + * ipath_set_linkstate. Returns 0 if state reached, otherwise + * -ETIMEDOUT state can have multiple states set, for any of several + * transitions. + */ +int ipath_wait_linkstate(struct ipath_devdata *dd, u32 state, int msecs) +{ + dd->ipath_state_wanted = state; + wait_event_interruptible_timeout(ipath_state_wait, + (dd->ipath_flags & state), + msecs_to_jiffies(msecs)); + dd->ipath_state_wanted = 0; + + if (!(dd->ipath_flags & state)) { + u64 val; + ipath_cdbg(VERBOSE, "Didn't reach linkstate %s within %u" + " ms\n", + /* test INIT ahead of DOWN, both can be set */ + (state & IPATH_LINKINIT) ? "INIT" : + ((state & IPATH_LINKDOWN) ? "DOWN" : + ((state & IPATH_LINKARMED) ? "ARM" : "ACTIVE")), + msecs); + val = ipath_read_kreg64(dd, dd->ipath_kregs->kr_ibcstatus); + ipath_cdbg(VERBOSE, "ibcc=%llx ibcstatus=%llx (%s)\n", + (unsigned long long) ipath_read_kreg64( + dd, dd->ipath_kregs->kr_ibcctrl), + (unsigned long long) val, + ipath_ibcstatus_str[val & dd->ibcs_lts_mask]); + } + return (dd->ipath_flags & state) ? 
0 : -ETIMEDOUT; +} + +static void decode_sdma_errs(struct ipath_devdata *dd, ipath_err_t err, + char *buf, size_t blen) +{ + static const struct { + ipath_err_t err; + const char *msg; + } errs[] = { + { INFINIPATH_E_SDMAGENMISMATCH, "SDmaGenMismatch" }, + { INFINIPATH_E_SDMAOUTOFBOUND, "SDmaOutOfBound" }, + { INFINIPATH_E_SDMATAILOUTOFBOUND, "SDmaTailOutOfBound" }, + { INFINIPATH_E_SDMABASE, "SDmaBase" }, + { INFINIPATH_E_SDMA1STDESC, "SDma1stDesc" }, + { INFINIPATH_E_SDMARPYTAG, "SDmaRpyTag" }, + { INFINIPATH_E_SDMADWEN, "SDmaDwEn" }, + { INFINIPATH_E_SDMAMISSINGDW, "SDmaMissingDw" }, + { INFINIPATH_E_SDMAUNEXPDATA, "SDmaUnexpData" }, + { INFINIPATH_E_SDMADESCADDRMISALIGN, "SDmaDescAddrMisalign" }, + { INFINIPATH_E_SENDBUFMISUSE, "SendBufMisuse" }, + { INFINIPATH_E_SDMADISABLED, "SDmaDisabled" }, + }; + int i; + int expected; + size_t bidx = 0; + + for (i = 0; i < ARRAY_SIZE(errs); i++) { + expected = (errs[i].err != INFINIPATH_E_SDMADISABLED) ? 0 : + test_bit(IPATH_SDMA_ABORTING, &dd->ipath_sdma_status); + if ((err & errs[i].err) && !expected) + bidx += snprintf(buf + bidx, blen - bidx, + "%s ", errs[i].msg); + } +} + +/* + * Decode the error status into strings, deciding whether to always + * print * it or not depending on "normal packet errors" vs everything + * else. Return 1 if "real" errors, otherwise 0 if only packet + * errors, so caller can decide what to print with the string. + */ +int ipath_decode_err(struct ipath_devdata *dd, char *buf, size_t blen, + ipath_err_t err) +{ + int iserr = 1; + *buf = '\0'; + if (err & INFINIPATH_E_PKTERRS) { + if (!(err & ~INFINIPATH_E_PKTERRS)) + iserr = 0; // if only packet errors. + if (ipath_debug & __IPATH_ERRPKTDBG) { + if (err & INFINIPATH_E_REBP) + strlcat(buf, "EBP ", blen); + if (err & INFINIPATH_E_RVCRC) + strlcat(buf, "VCRC ", blen); + if (err & INFINIPATH_E_RICRC) { + strlcat(buf, "CRC ", blen); + // clear for check below, so only once + err &= INFINIPATH_E_RICRC; + } + if (err & INFINIPATH_E_RSHORTPKTLEN) + strlcat(buf, "rshortpktlen ", blen); + if (err & INFINIPATH_E_SDROPPEDDATAPKT) + strlcat(buf, "sdroppeddatapkt ", blen); + if (err & INFINIPATH_E_SPKTLEN) + strlcat(buf, "spktlen ", blen); + } + if ((err & INFINIPATH_E_RICRC) && + !(err&(INFINIPATH_E_RVCRC|INFINIPATH_E_REBP))) + strlcat(buf, "CRC ", blen); + if (!iserr) + goto done; + } + if (err & INFINIPATH_E_RHDRLEN) + strlcat(buf, "rhdrlen ", blen); + if (err & INFINIPATH_E_RBADTID) + strlcat(buf, "rbadtid ", blen); + if (err & INFINIPATH_E_RBADVERSION) + strlcat(buf, "rbadversion ", blen); + if (err & INFINIPATH_E_RHDR) + strlcat(buf, "rhdr ", blen); + if (err & INFINIPATH_E_SENDSPECIALTRIGGER) + strlcat(buf, "sendspecialtrigger ", blen); + if (err & INFINIPATH_E_RLONGPKTLEN) + strlcat(buf, "rlongpktlen ", blen); + if (err & INFINIPATH_E_RMAXPKTLEN) + strlcat(buf, "rmaxpktlen ", blen); + if (err & INFINIPATH_E_RMINPKTLEN) + strlcat(buf, "rminpktlen ", blen); + if (err & INFINIPATH_E_SMINPKTLEN) + strlcat(buf, "sminpktlen ", blen); + if (err & INFINIPATH_E_RFORMATERR) + strlcat(buf, "rformaterr ", blen); + if (err & INFINIPATH_E_RUNSUPVL) + strlcat(buf, "runsupvl ", blen); + if (err & INFINIPATH_E_RUNEXPCHAR) + strlcat(buf, "runexpchar ", blen); + if (err & INFINIPATH_E_RIBFLOW) + strlcat(buf, "ribflow ", blen); + if (err & INFINIPATH_E_SUNDERRUN) + strlcat(buf, "sunderrun ", blen); + if (err & INFINIPATH_E_SPIOARMLAUNCH) + strlcat(buf, "spioarmlaunch ", blen); + if (err & INFINIPATH_E_SUNEXPERRPKTNUM) + strlcat(buf, "sunexperrpktnum ", blen); + if (err & 
INFINIPATH_E_SDROPPEDSMPPKT) + strlcat(buf, "sdroppedsmppkt ", blen); + if (err & INFINIPATH_E_SMAXPKTLEN) + strlcat(buf, "smaxpktlen ", blen); + if (err & INFINIPATH_E_SUNSUPVL) + strlcat(buf, "sunsupVL ", blen); + if (err & INFINIPATH_E_INVALIDADDR) + strlcat(buf, "invalidaddr ", blen); + if (err & INFINIPATH_E_RRCVEGRFULL) + strlcat(buf, "rcvegrfull ", blen); + if (err & INFINIPATH_E_RRCVHDRFULL) + strlcat(buf, "rcvhdrfull ", blen); + if (err & INFINIPATH_E_IBSTATUSCHANGED) + strlcat(buf, "ibcstatuschg ", blen); + if (err & INFINIPATH_E_RIBLOSTLINK) + strlcat(buf, "riblostlink ", blen); + if (err & INFINIPATH_E_HARDWARE) + strlcat(buf, "hardware ", blen); + if (err & INFINIPATH_E_RESET) + strlcat(buf, "reset ", blen); + if (err & INFINIPATH_E_SDMAERRS) + decode_sdma_errs(dd, err, buf, blen); + if (err & INFINIPATH_E_INVALIDEEPCMD) + strlcat(buf, "invalideepromcmd ", blen); +done: + return iserr; +} + +/** + * get_rhf_errstring - decode RHF errors + * @err: the err number + * @msg: the output buffer + * @len: the length of the output buffer + * + * only used one place now, may want more later + */ +static void get_rhf_errstring(u32 err, char *msg, size_t len) +{ + /* if no errors, and so don't need to check what's first */ + *msg = '\0'; + + if (err & INFINIPATH_RHF_H_ICRCERR) + strlcat(msg, "icrcerr ", len); + if (err & INFINIPATH_RHF_H_VCRCERR) + strlcat(msg, "vcrcerr ", len); + if (err & INFINIPATH_RHF_H_PARITYERR) + strlcat(msg, "parityerr ", len); + if (err & INFINIPATH_RHF_H_LENERR) + strlcat(msg, "lenerr ", len); + if (err & INFINIPATH_RHF_H_MTUERR) + strlcat(msg, "mtuerr ", len); + if (err & INFINIPATH_RHF_H_IHDRERR) + /* infinipath hdr checksum error */ + strlcat(msg, "ipathhdrerr ", len); + if (err & INFINIPATH_RHF_H_TIDERR) + strlcat(msg, "tiderr ", len); + if (err & INFINIPATH_RHF_H_MKERR) + /* bad port, offset, etc. */ + strlcat(msg, "invalid ipathhdr ", len); + if (err & INFINIPATH_RHF_H_IBERR) + strlcat(msg, "iberr ", len); + if (err & INFINIPATH_RHF_L_SWA) + strlcat(msg, "swA ", len); + if (err & INFINIPATH_RHF_L_SWB) + strlcat(msg, "swB ", len); +} + +/** + * ipath_get_egrbuf - get an eager buffer + * @dd: the infinipath device + * @bufnum: the eager buffer to get + * + * must only be called if ipath_pd[port] is known to be allocated + */ +static inline void *ipath_get_egrbuf(struct ipath_devdata *dd, u32 bufnum) +{ + return dd->ipath_port0_skbinfo ? + (void *) dd->ipath_port0_skbinfo[bufnum].skb->data : NULL; +} + +/** + * ipath_alloc_skb - allocate an skb and buffer with possible constraints + * @dd: the infinipath device + * @gfp_mask: the sk_buff SFP mask + */ +struct sk_buff *ipath_alloc_skb(struct ipath_devdata *dd, + gfp_t gfp_mask) +{ + struct sk_buff *skb; + u32 len; + + /* + * Only fully supported way to handle this is to allocate lots + * extra, align as needed, and then do skb_reserve(). That wastes + * a lot of memory... I'll have to hack this into infinipath_copy + * also. + */ + + /* + * We need 2 extra bytes for ipath_ether data sent in the + * key header. In order to keep everything dword aligned, + * we'll reserve 4 bytes. + */ + len = dd->ipath_ibmaxlen + 4; + + if (dd->ipath_flags & IPATH_4BYTE_TID) { + /* We need a 2KB multiple alignment, and there is no way + * to do it except to allocate extra and then skb_reserve + * enough to bring it up to the right alignment. 
+ */ + len += 2047; + } + + skb = __dev_alloc_skb(len, gfp_mask); + if (!skb) { + ipath_dev_err(dd, "Failed to allocate skbuff, length %u\n", + len); + goto bail; + } + + skb_reserve(skb, 4); + + if (dd->ipath_flags & IPATH_4BYTE_TID) { + u32 una = (unsigned long)skb->data & 2047; + if (una) + skb_reserve(skb, 2048 - una); + } + +bail: + return skb; +} + +static void ipath_rcv_hdrerr(struct ipath_devdata *dd, + u32 eflags, + u32 l, + u32 etail, + __le32 *rhf_addr, + struct ipath_message_header *hdr) +{ + char emsg[128]; + + get_rhf_errstring(eflags, emsg, sizeof emsg); + ipath_cdbg(PKT, "RHFerrs %x hdrqtail=%x typ=%u " + "tlen=%x opcode=%x egridx=%x: %s\n", + eflags, l, + ipath_hdrget_rcv_type(rhf_addr), + ipath_hdrget_length_in_bytes(rhf_addr), + be32_to_cpu(hdr->bth[0]) >> 24, + etail, emsg); + + /* Count local link integrity errors. */ + if (eflags & (INFINIPATH_RHF_H_ICRCERR | INFINIPATH_RHF_H_VCRCERR)) { + u8 n = (dd->ipath_ibcctrl >> + INFINIPATH_IBCC_PHYERRTHRESHOLD_SHIFT) & + INFINIPATH_IBCC_PHYERRTHRESHOLD_MASK; + + if (++dd->ipath_lli_counter > n) { + dd->ipath_lli_counter = 0; + dd->ipath_lli_errors++; + } + } +} + +/* + * ipath_kreceive - receive a packet + * @pd: the infinipath port + * + * called from interrupt handler for errors or receive interrupt + */ +void ipath_kreceive(struct ipath_portdata *pd) +{ + struct ipath_devdata *dd = pd->port_dd; + __le32 *rhf_addr; + void *ebuf; + const u32 rsize = dd->ipath_rcvhdrentsize; /* words */ + const u32 maxcnt = dd->ipath_rcvhdrcnt * rsize; /* words */ + u32 etail = -1, l, hdrqtail; + struct ipath_message_header *hdr; + u32 eflags, i, etype, tlen, pkttot = 0, updegr = 0, reloop = 0; + static u64 totcalls; /* stats, may eventually remove */ + int last; + + l = pd->port_head; + rhf_addr = (__le32 *) pd->port_rcvhdrq + l + dd->ipath_rhf_offset; + if (dd->ipath_flags & IPATH_NODMA_RTAIL) { + u32 seq = ipath_hdrget_seq(rhf_addr); + + if (seq != pd->port_seq_cnt) + goto bail; + hdrqtail = 0; + } else { + hdrqtail = ipath_get_rcvhdrtail(pd); + if (l == hdrqtail) + goto bail; + smp_rmb(); + } + +reloop: + for (last = 0, i = 1; !last; i += !last) { + hdr = dd->ipath_f_get_msgheader(dd, rhf_addr); + eflags = ipath_hdrget_err_flags(rhf_addr); + etype = ipath_hdrget_rcv_type(rhf_addr); + /* total length */ + tlen = ipath_hdrget_length_in_bytes(rhf_addr); + ebuf = NULL; + if ((dd->ipath_flags & IPATH_NODMA_RTAIL) ? + ipath_hdrget_use_egr_buf(rhf_addr) : + (etype != RCVHQ_RCV_TYPE_EXPECTED)) { + /* + * It turns out that the chip uses an eager buffer + * for all non-expected packets, whether it "needs" + * one or not. So always get the index, but don't + * set ebuf (so we try to copy data) unless the + * length requires it. + */ + etail = ipath_hdrget_index(rhf_addr); + updegr = 1; + if (tlen > sizeof(*hdr) || + etype == RCVHQ_RCV_TYPE_NON_KD) + ebuf = ipath_get_egrbuf(dd, etail); + } + + /* + * both tiderr and ipathhdrerr are set for all plain IB + * packets; only ipathhdrerr should be set. 
+ */ + + if (etype != RCVHQ_RCV_TYPE_NON_KD && + etype != RCVHQ_RCV_TYPE_ERROR && + ipath_hdrget_ipath_ver(hdr->iph.ver_port_tid_offset) != + IPS_PROTO_VERSION) + ipath_cdbg(PKT, "Bad InfiniPath protocol version " + "%x\n", etype); + + if (unlikely(eflags)) + ipath_rcv_hdrerr(dd, eflags, l, etail, rhf_addr, hdr); + else if (etype == RCVHQ_RCV_TYPE_NON_KD) { + ipath_ib_rcv(dd->verbs_dev, (u32 *)hdr, ebuf, tlen); + if (dd->ipath_lli_counter) + dd->ipath_lli_counter--; + } else if (etype == RCVHQ_RCV_TYPE_EAGER) { + u8 opcode = be32_to_cpu(hdr->bth[0]) >> 24; + u32 qp = be32_to_cpu(hdr->bth[1]) & 0xffffff; + ipath_cdbg(PKT, "typ %x, opcode %x (eager, " + "qp=%x), len %x; ignored\n", + etype, opcode, qp, tlen); + } + else if (etype == RCVHQ_RCV_TYPE_EXPECTED) + ipath_dbg("Bug: Expected TID, opcode %x; ignored\n", + be32_to_cpu(hdr->bth[0]) >> 24); + else { + /* + * error packet, type of error unknown. + * Probably type 3, but we don't know, so don't + * even try to print the opcode, etc. + * Usually caused by a "bad packet", that has no + * BTH, when the LRH says it should. + */ + ipath_cdbg(ERRPKT, "Error Pkt, but no eflags! egrbuf" + " %x, len %x hdrq+%x rhf: %Lx\n", + etail, tlen, l, (unsigned long long) + le64_to_cpu(*(__le64 *) rhf_addr)); + if (ipath_debug & __IPATH_ERRPKTDBG) { + u32 j, *d, dw = rsize-2; + if (rsize > (tlen>>2)) + dw = tlen>>2; + d = (u32 *)hdr; + printk(KERN_DEBUG "EPkt rcvhdr(%x dw):\n", + dw); + for (j = 0; j < dw; j++) + printk(KERN_DEBUG "%8x%s", d[j], + (j%8) == 7 ? "\n" : " "); + printk(KERN_DEBUG ".\n"); + } + } + l += rsize; + if (l >= maxcnt) + l = 0; + rhf_addr = (__le32 *) pd->port_rcvhdrq + + l + dd->ipath_rhf_offset; + if (dd->ipath_flags & IPATH_NODMA_RTAIL) { + u32 seq = ipath_hdrget_seq(rhf_addr); + + if (++pd->port_seq_cnt > 13) + pd->port_seq_cnt = 1; + if (seq != pd->port_seq_cnt) + last = 1; + } else if (l == hdrqtail) + last = 1; + /* + * update head regs on last packet, and every 16 packets. + * Reduce bus traffic, while still trying to prevent + * rcvhdrq overflows, for when the queue is nearly full + */ + if (last || !(i & 0xf)) { + u64 lval = l; + + /* request IBA6120 and 7220 interrupt only on last */ + if (last) + lval |= dd->ipath_rhdrhead_intr_off; + ipath_write_ureg(dd, ur_rcvhdrhead, lval, + pd->port_port); + if (updegr) { + ipath_write_ureg(dd, ur_rcvegrindexhead, + etail, pd->port_port); + updegr = 0; + } + } + } + + if (!dd->ipath_rhdrhead_intr_off && !reloop && + !(dd->ipath_flags & IPATH_NODMA_RTAIL)) { + /* IBA6110 workaround; we can have a race clearing chip + * interrupt with another interrupt about to be delivered, + * and can clear it before it is delivered on the GPIO + * workaround. By doing the extra check here for the + * in-memory tail register updating while we were doing + * earlier packets, we "almost" guarantee we have covered + * that case. 
+ */
+ u32 hqtail = ipath_get_rcvhdrtail(pd);
+ if (hqtail != hdrqtail) {
+ hdrqtail = hqtail;
+ reloop = 1; /* loop 1 extra time at most */
+ goto reloop;
+ }
+ }
+
+ pkttot += i;
+
+ pd->port_head = l;
+
+ if (pkttot > ipath_stats.sps_maxpkts_call)
+ ipath_stats.sps_maxpkts_call = pkttot;
+ ipath_stats.sps_port0pkts += pkttot;
+ ipath_stats.sps_avgpkts_call =
+ ipath_stats.sps_port0pkts / ++totcalls;
+
+bail:;
+}
+
+/**
+ * ipath_update_pio_bufs - update shadow copy of the PIO availability map
+ * @dd: the infinipath device
+ *
+ * called whenever our local copy indicates we have run out of send buffers
+ * NOTE: This can be called from interrupt context by some code
+ * and from non-interrupt context by ipath_getpiobuf().
+ */
+
+static void ipath_update_pio_bufs(struct ipath_devdata *dd)
+{
+ unsigned long flags;
+ int i;
+ const unsigned piobregs = (unsigned)dd->ipath_pioavregs;
+
+ /* If the generation (check) bits have changed, then we update the
+ * busy bit for the corresponding PIO buffer. This algorithm will
+ * modify positions to the value they already have in some cases
+ * (i.e., no change), but it's faster than changing only the bits
+ * that have changed.
+ *
+ * We would like to do this atomically, to avoid spinlocks in the
+ * critical send path, but that's not really possible, given the
+ * type of changes, and that this routine could be called on
+ * multiple cpu's simultaneously, so we lock in this routine only,
+ * to avoid conflicting updates; all we change is the shadow, and
+ * it's a single 64 bit memory location, so by definition the update
+ * is atomic in terms of what other cpu's can see in testing the
+ * bits. The spin_lock overhead isn't too bad, since it only
+ * happens when all buffers are in use, so only cpu overhead, not
+ * latency or bandwidth is affected.
+ */ + if (!dd->ipath_pioavailregs_dma) { + ipath_dbg("Update shadow pioavail, but regs_dma NULL!\n"); + return; + } + if (ipath_debug & __IPATH_VERBDBG) { + /* only if packet debug and verbose */ + volatile __le64 *dma = dd->ipath_pioavailregs_dma; + unsigned long *shadow = dd->ipath_pioavailshadow; + + ipath_cdbg(PKT, "Refill avail, dma0=%llx shad0=%lx, " + "d1=%llx s1=%lx, d2=%llx s2=%lx, d3=%llx " + "s3=%lx\n", + (unsigned long long) le64_to_cpu(dma[0]), + shadow[0], + (unsigned long long) le64_to_cpu(dma[1]), + shadow[1], + (unsigned long long) le64_to_cpu(dma[2]), + shadow[2], + (unsigned long long) le64_to_cpu(dma[3]), + shadow[3]); + if (piobregs > 4) + ipath_cdbg( + PKT, "2nd group, dma4=%llx shad4=%lx, " + "d5=%llx s5=%lx, d6=%llx s6=%lx, " + "d7=%llx s7=%lx\n", + (unsigned long long) le64_to_cpu(dma[4]), + shadow[4], + (unsigned long long) le64_to_cpu(dma[5]), + shadow[5], + (unsigned long long) le64_to_cpu(dma[6]), + shadow[6], + (unsigned long long) le64_to_cpu(dma[7]), + shadow[7]); + } + spin_lock_irqsave(&ipath_pioavail_lock, flags); + for (i = 0; i < piobregs; i++) { + u64 pchbusy, pchg, piov, pnew; + /* + * Chip Errata: bug 6641; even and odd qwords>3 are swapped + */ + if (i > 3 && (dd->ipath_flags & IPATH_SWAP_PIOBUFS)) + piov = le64_to_cpu(dd->ipath_pioavailregs_dma[i ^ 1]); + else + piov = le64_to_cpu(dd->ipath_pioavailregs_dma[i]); + pchg = dd->ipath_pioavailkernel[i] & + ~(dd->ipath_pioavailshadow[i] ^ piov); + pchbusy = pchg << INFINIPATH_SENDPIOAVAIL_BUSY_SHIFT; + if (pchg && (pchbusy & dd->ipath_pioavailshadow[i])) { + pnew = dd->ipath_pioavailshadow[i] & ~pchbusy; + pnew |= piov & pchbusy; + dd->ipath_pioavailshadow[i] = pnew; + } + } + spin_unlock_irqrestore(&ipath_pioavail_lock, flags); +} + +/* + * used to force update of pioavailshadow if we can't get a pio buffer. + * Needed primarily due to exitting freeze mode after recovering + * from errors. Done lazily, because it's safer (known to not + * be writing pio buffers). + */ +static void ipath_reset_availshadow(struct ipath_devdata *dd) +{ + int i, im; + unsigned long flags; + + spin_lock_irqsave(&ipath_pioavail_lock, flags); + for (i = 0; i < dd->ipath_pioavregs; i++) { + u64 val, oldval; + /* deal with 6110 chip bug on high register #s */ + im = (i > 3 && (dd->ipath_flags & IPATH_SWAP_PIOBUFS)) ? + i ^ 1 : i; + val = le64_to_cpu(dd->ipath_pioavailregs_dma[im]); + /* + * busy out the buffers not in the kernel avail list, + * without changing the generation bits. 
+ */ + oldval = dd->ipath_pioavailshadow[i]; + dd->ipath_pioavailshadow[i] = val | + ((~dd->ipath_pioavailkernel[i] << + INFINIPATH_SENDPIOAVAIL_BUSY_SHIFT) & + 0xaaaaaaaaaaaaaaaaULL); /* All BUSY bits in qword */ + if (oldval != dd->ipath_pioavailshadow[i]) + ipath_dbg("shadow[%d] was %Lx, now %lx\n", + i, (unsigned long long) oldval, + dd->ipath_pioavailshadow[i]); + } + spin_unlock_irqrestore(&ipath_pioavail_lock, flags); +} + +/** + * ipath_setrcvhdrsize - set the receive header size + * @dd: the infinipath device + * @rhdrsize: the receive header size + * + * called from user init code, and also layered driver init + */ +int ipath_setrcvhdrsize(struct ipath_devdata *dd, unsigned rhdrsize) +{ + int ret = 0; + + if (dd->ipath_flags & IPATH_RCVHDRSZ_SET) { + if (dd->ipath_rcvhdrsize != rhdrsize) { + dev_info(&dd->pcidev->dev, + "Error: can't set protocol header " + "size %u, already %u\n", + rhdrsize, dd->ipath_rcvhdrsize); + ret = -EAGAIN; + } else + ipath_cdbg(VERBOSE, "Reuse same protocol header " + "size %u\n", dd->ipath_rcvhdrsize); + } else if (rhdrsize > (dd->ipath_rcvhdrentsize - + (sizeof(u64) / sizeof(u32)))) { + ipath_dbg("Error: can't set protocol header size %u " + "(> max %u)\n", rhdrsize, + dd->ipath_rcvhdrentsize - + (u32) (sizeof(u64) / sizeof(u32))); + ret = -EOVERFLOW; + } else { + dd->ipath_flags |= IPATH_RCVHDRSZ_SET; + dd->ipath_rcvhdrsize = rhdrsize; + ipath_write_kreg(dd, dd->ipath_kregs->kr_rcvhdrsize, + dd->ipath_rcvhdrsize); + ipath_cdbg(VERBOSE, "Set protocol header size to %u\n", + dd->ipath_rcvhdrsize); + } + return ret; +} + +/* + * debugging code and stats updates if no pio buffers available. + */ +static noinline void no_pio_bufs(struct ipath_devdata *dd) +{ + unsigned long *shadow = dd->ipath_pioavailshadow; + __le64 *dma = (__le64 *)dd->ipath_pioavailregs_dma; + + dd->ipath_upd_pio_shadow = 1; + + /* + * not atomic, but if we lose a stat count in a while, that's OK + */ + ipath_stats.sps_nopiobufs++; + if (!(++dd->ipath_consec_nopiobuf % 100000)) { + ipath_force_pio_avail_update(dd); /* at start */ + ipath_dbg("%u tries no piobufavail ts%lx; dmacopy: " + "%llx %llx %llx %llx\n" + "ipath shadow: %lx %lx %lx %lx\n", + dd->ipath_consec_nopiobuf, + (unsigned long)get_cycles(), + (unsigned long long) le64_to_cpu(dma[0]), + (unsigned long long) le64_to_cpu(dma[1]), + (unsigned long long) le64_to_cpu(dma[2]), + (unsigned long long) le64_to_cpu(dma[3]), + shadow[0], shadow[1], shadow[2], shadow[3]); + /* + * 4 buffers per byte, 4 registers above, cover rest + * below + */ + if ((dd->ipath_piobcnt2k + dd->ipath_piobcnt4k) > + (sizeof(shadow[0]) * 4 * 4)) + ipath_dbg("2nd group: dmacopy: " + "%llx %llx %llx %llx\n" + "ipath shadow: %lx %lx %lx %lx\n", + (unsigned long long)le64_to_cpu(dma[4]), + (unsigned long long)le64_to_cpu(dma[5]), + (unsigned long long)le64_to_cpu(dma[6]), + (unsigned long long)le64_to_cpu(dma[7]), + shadow[4], shadow[5], shadow[6], shadow[7]); + + /* at end, so update likely happened */ + ipath_reset_availshadow(dd); + } +} + +/* + * common code for normal driver pio buffer allocation, and reserved + * allocation. + * + * do appropriate marking as busy, etc. + * returns buffer number if one found (>=0), negative number is error. 
+ */ +static u32 __iomem *ipath_getpiobuf_range(struct ipath_devdata *dd, + u32 *pbufnum, u32 first, u32 last, u32 firsti) +{ + int i, j, updated = 0; + unsigned piobcnt; + unsigned long flags; + unsigned long *shadow = dd->ipath_pioavailshadow; + u32 __iomem *buf; + + piobcnt = last - first; + if (dd->ipath_upd_pio_shadow) { + /* + * Minor optimization. If we had no buffers on last call, + * start out by doing the update; continue and do scan even + * if no buffers were updated, to be paranoid + */ + ipath_update_pio_bufs(dd); + updated++; + i = first; + } else + i = firsti; +rescan: + /* + * while test_and_set_bit() is atomic, we do that and then the + * change_bit(), and the pair is not. See if this is the cause + * of the remaining armlaunch errors. + */ + spin_lock_irqsave(&ipath_pioavail_lock, flags); + for (j = 0; j < piobcnt; j++, i++) { + if (i >= last) + i = first; + if (__test_and_set_bit((2 * i) + 1, shadow)) + continue; + /* flip generation bit */ + __change_bit(2 * i, shadow); + break; + } + spin_unlock_irqrestore(&ipath_pioavail_lock, flags); + + if (j == piobcnt) { + if (!updated) { + /* + * first time through; shadow exhausted, but may be + * buffers available, try an update and then rescan. + */ + ipath_update_pio_bufs(dd); + updated++; + i = first; + goto rescan; + } else if (updated == 1 && piobcnt <= + ((dd->ipath_sendctrl + >> INFINIPATH_S_UPDTHRESH_SHIFT) & + INFINIPATH_S_UPDTHRESH_MASK)) { + /* + * for chips supporting and using the update + * threshold we need to force an update of the + * in-memory copy if the count is less than the + * thershold, then check one more time. + */ + ipath_force_pio_avail_update(dd); + ipath_update_pio_bufs(dd); + updated++; + i = first; + goto rescan; + } + + no_pio_bufs(dd); + buf = NULL; + } else { + if (i < dd->ipath_piobcnt2k) + buf = (u32 __iomem *) (dd->ipath_pio2kbase + + i * dd->ipath_palign); + else + buf = (u32 __iomem *) + (dd->ipath_pio4kbase + + (i - dd->ipath_piobcnt2k) * dd->ipath_4kalign); + if (pbufnum) + *pbufnum = i; + } + + return buf; +} + +/** + * ipath_getpiobuf - find an available pio buffer + * @dd: the infinipath device + * @plen: the size of the PIO buffer needed in 32-bit words + * @pbufnum: the buffer number is placed here + */ +u32 __iomem *ipath_getpiobuf(struct ipath_devdata *dd, u32 plen, u32 *pbufnum) +{ + u32 __iomem *buf; + u32 pnum, nbufs; + u32 first, lasti; + + if (plen + 1 >= IPATH_SMALLBUF_DWORDS) { + first = dd->ipath_piobcnt2k; + lasti = dd->ipath_lastpioindexl; + } else { + first = 0; + lasti = dd->ipath_lastpioindex; + } + nbufs = dd->ipath_piobcnt2k + dd->ipath_piobcnt4k; + buf = ipath_getpiobuf_range(dd, &pnum, first, nbufs, lasti); + + if (buf) { + /* + * Set next starting place. It's just an optimization, + * it doesn't matter who wins on this, so no locking + */ + if (plen + 1 >= IPATH_SMALLBUF_DWORDS) + dd->ipath_lastpioindexl = pnum + 1; + else + dd->ipath_lastpioindex = pnum + 1; + if (dd->ipath_upd_pio_shadow) + dd->ipath_upd_pio_shadow = 0; + if (dd->ipath_consec_nopiobuf) + dd->ipath_consec_nopiobuf = 0; + ipath_cdbg(VERBOSE, "Return piobuf%u %uk @ %p\n", + pnum, (pnum < dd->ipath_piobcnt2k) ? 
2 : 4, buf); + if (pbufnum) + *pbufnum = pnum; + + } + return buf; +} + +/** + * ipath_chg_pioavailkernel - change which send buffers are available for kernel + * @dd: the infinipath device + * @start: the starting send buffer number + * @len: the number of send buffers + * @avail: true if the buffers are available for kernel use, false otherwise + */ +void ipath_chg_pioavailkernel(struct ipath_devdata *dd, unsigned start, + unsigned len, int avail) +{ + unsigned long flags; + unsigned end, cnt = 0; + + /* There are two bits per send buffer (busy and generation) */ + start *= 2; + end = start + len * 2; + + spin_lock_irqsave(&ipath_pioavail_lock, flags); + /* Set or clear the busy bit in the shadow. */ + while (start < end) { + if (avail) { + unsigned long dma; + int i, im; + /* + * the BUSY bit will never be set, because we disarm + * the user buffers before we hand them back to the + * kernel. We do have to make sure the generation + * bit is set correctly in shadow, since it could + * have changed many times while allocated to user. + * We can't use the bitmap functions on the full + * dma array because it is always little-endian, so + * we have to flip to host-order first. + * BITS_PER_LONG is slightly wrong, since it's + * always 64 bits per register in chip... + * We only work on 64 bit kernels, so that's OK. + */ + /* deal with 6110 chip bug on high register #s */ + i = start / BITS_PER_LONG; + im = (i > 3 && (dd->ipath_flags & IPATH_SWAP_PIOBUFS)) ? + i ^ 1 : i; + __clear_bit(INFINIPATH_SENDPIOAVAIL_BUSY_SHIFT + + start, dd->ipath_pioavailshadow); + dma = (unsigned long) le64_to_cpu( + dd->ipath_pioavailregs_dma[im]); + if (test_bit((INFINIPATH_SENDPIOAVAIL_CHECK_SHIFT + + start) % BITS_PER_LONG, &dma)) + __set_bit(INFINIPATH_SENDPIOAVAIL_CHECK_SHIFT + + start, dd->ipath_pioavailshadow); + else + __clear_bit(INFINIPATH_SENDPIOAVAIL_CHECK_SHIFT + + start, dd->ipath_pioavailshadow); + __set_bit(start, dd->ipath_pioavailkernel); + } else { + __set_bit(start + INFINIPATH_SENDPIOAVAIL_BUSY_SHIFT, + dd->ipath_pioavailshadow); + __clear_bit(start, dd->ipath_pioavailkernel); + } + start += 2; + } + + if (dd->ipath_pioupd_thresh) { + end = 2 * (dd->ipath_piobcnt2k + dd->ipath_piobcnt4k); + cnt = bitmap_weight(dd->ipath_pioavailkernel, end); + } + spin_unlock_irqrestore(&ipath_pioavail_lock, flags); + + /* + * When moving buffers from kernel to user, if number assigned to + * the user is less than the pio update threshold, and threshold + * is supported (cnt was computed > 0), drop the update threshold + * so we update at least once per allocated number of buffers. + * In any case, if the kernel buffers are less than the threshold, + * drop the threshold. We don't bother increasing it, having once + * decreased it, since it would typically just cycle back and forth. + * If we don't decrease below buffers in use, we can wait a long + * time for an update, until some other context uses PIO buffers. 
+ */ + if (!avail && len < cnt) + cnt = len; + if (cnt < dd->ipath_pioupd_thresh) { + dd->ipath_pioupd_thresh = cnt; + ipath_dbg("Decreased pio update threshold to %u\n", + dd->ipath_pioupd_thresh); + spin_lock_irqsave(&dd->ipath_sendctrl_lock, flags); + dd->ipath_sendctrl &= ~(INFINIPATH_S_UPDTHRESH_MASK + << INFINIPATH_S_UPDTHRESH_SHIFT); + dd->ipath_sendctrl |= dd->ipath_pioupd_thresh + << INFINIPATH_S_UPDTHRESH_SHIFT; + ipath_write_kreg(dd, dd->ipath_kregs->kr_sendctrl, + dd->ipath_sendctrl); + spin_unlock_irqrestore(&dd->ipath_sendctrl_lock, flags); + } +} + +/** + * ipath_create_rcvhdrq - create a receive header queue + * @dd: the infinipath device + * @pd: the port data + * + * this must be contiguous memory (from an i/o perspective), and must be + * DMA'able (which means for some systems, it will go through an IOMMU, + * or be forced into a low address range). + */ +int ipath_create_rcvhdrq(struct ipath_devdata *dd, + struct ipath_portdata *pd) +{ + int ret = 0; + + if (!pd->port_rcvhdrq) { + dma_addr_t phys_hdrqtail; + gfp_t gfp_flags = GFP_USER | __GFP_COMP; + int amt = ALIGN(dd->ipath_rcvhdrcnt * dd->ipath_rcvhdrentsize * + sizeof(u32), PAGE_SIZE); + + pd->port_rcvhdrq = dma_alloc_coherent( + &dd->pcidev->dev, amt, &pd->port_rcvhdrq_phys, + gfp_flags); + + if (!pd->port_rcvhdrq) { + ipath_dev_err(dd, "attempt to allocate %d bytes " + "for port %u rcvhdrq failed\n", + amt, pd->port_port); + ret = -ENOMEM; + goto bail; + } + + if (!(dd->ipath_flags & IPATH_NODMA_RTAIL)) { + pd->port_rcvhdrtail_kvaddr = dma_alloc_coherent( + &dd->pcidev->dev, PAGE_SIZE, &phys_hdrqtail, + GFP_KERNEL); + if (!pd->port_rcvhdrtail_kvaddr) { + ipath_dev_err(dd, "attempt to allocate 1 page " + "for port %u rcvhdrqtailaddr " + "failed\n", pd->port_port); + ret = -ENOMEM; + dma_free_coherent(&dd->pcidev->dev, amt, + pd->port_rcvhdrq, + pd->port_rcvhdrq_phys); + pd->port_rcvhdrq = NULL; + goto bail; + } + pd->port_rcvhdrqtailaddr_phys = phys_hdrqtail; + ipath_cdbg(VERBOSE, "port %d hdrtailaddr, %llx " + "physical\n", pd->port_port, + (unsigned long long) phys_hdrqtail); + } + + pd->port_rcvhdrq_size = amt; + + ipath_cdbg(VERBOSE, "%d pages at %p (phys %lx) size=%lu " + "for port %u rcvhdr Q\n", + amt >> PAGE_SHIFT, pd->port_rcvhdrq, + (unsigned long) pd->port_rcvhdrq_phys, + (unsigned long) pd->port_rcvhdrq_size, + pd->port_port); + } + else + ipath_cdbg(VERBOSE, "reuse port %d rcvhdrq @%p %llx phys; " + "hdrtailaddr@%p %llx physical\n", + pd->port_port, pd->port_rcvhdrq, + (unsigned long long) pd->port_rcvhdrq_phys, + pd->port_rcvhdrtail_kvaddr, (unsigned long long) + pd->port_rcvhdrqtailaddr_phys); + + /* clear for security and sanity on each use */ + memset(pd->port_rcvhdrq, 0, pd->port_rcvhdrq_size); + if (pd->port_rcvhdrtail_kvaddr) + memset(pd->port_rcvhdrtail_kvaddr, 0, PAGE_SIZE); + + /* + * tell chip each time we init it, even if we are re-using previous + * memory (we zero the register at process close) + */ + ipath_write_kreg_port(dd, dd->ipath_kregs->kr_rcvhdrtailaddr, + pd->port_port, pd->port_rcvhdrqtailaddr_phys); + ipath_write_kreg_port(dd, dd->ipath_kregs->kr_rcvhdraddr, + pd->port_port, pd->port_rcvhdrq_phys); + +bail: + return ret; +} + + +/* + * Flush all sends that might be in the ready to send state, as well as any + * that are in the process of being sent. Used whenever we need to be + * sure the send side is idle. Cleans up all buffer state by canceling + * all pio buffers, and issuing an abort, which cleans up anything in the + * launch fifo. 
The cancel is superfluous on some chip versions, but + * it's safer to always do it. + * PIOAvail bits are updated by the chip as if normal send had happened. + */ +void ipath_cancel_sends(struct ipath_devdata *dd, int restore_sendctrl) +{ + unsigned long flags; + + if (dd->ipath_flags & IPATH_IB_AUTONEG_INPROG) { + ipath_cdbg(VERBOSE, "Ignore while in autonegotiation\n"); + goto bail; + } + /* + * If we have SDMA, and it's not disabled, we have to kick off the + * abort state machine, provided we aren't already aborting. + * If we are in the process of aborting SDMA (!DISABLED, but ABORTING), + * we skip the rest of this routine. It is already "in progress" + */ + if (dd->ipath_flags & IPATH_HAS_SEND_DMA) { + int skip_cancel; + unsigned long *statp = &dd->ipath_sdma_status; + + spin_lock_irqsave(&dd->ipath_sdma_lock, flags); + skip_cancel = + test_and_set_bit(IPATH_SDMA_ABORTING, statp) + && !test_bit(IPATH_SDMA_DISABLED, statp); + spin_unlock_irqrestore(&dd->ipath_sdma_lock, flags); + if (skip_cancel) + goto bail; + } + + ipath_dbg("Cancelling all in-progress send buffers\n"); + + /* skip armlaunch errs for a while */ + dd->ipath_lastcancel = jiffies + HZ / 2; + + /* + * The abort bit is auto-clearing. We also don't want pioavail + * update happening during this, and we don't want any other + * sends going out, so turn those off for the duration. We read + * the scratch register to be sure that cancels and the abort + * have taken effect in the chip. Otherwise two parts are same + * as ipath_force_pio_avail_update() + */ + spin_lock_irqsave(&dd->ipath_sendctrl_lock, flags); + dd->ipath_sendctrl &= ~(INFINIPATH_S_PIOBUFAVAILUPD + | INFINIPATH_S_PIOENABLE); + ipath_write_kreg(dd, dd->ipath_kregs->kr_sendctrl, + dd->ipath_sendctrl | INFINIPATH_S_ABORT); + ipath_read_kreg64(dd, dd->ipath_kregs->kr_scratch); + spin_unlock_irqrestore(&dd->ipath_sendctrl_lock, flags); + + /* disarm all send buffers */ + ipath_disarm_piobufs(dd, 0, + dd->ipath_piobcnt2k + dd->ipath_piobcnt4k); + + if (dd->ipath_flags & IPATH_HAS_SEND_DMA) + set_bit(IPATH_SDMA_DISARMED, &dd->ipath_sdma_status); + + if (restore_sendctrl) { + /* else done by caller later if needed */ + spin_lock_irqsave(&dd->ipath_sendctrl_lock, flags); + dd->ipath_sendctrl |= INFINIPATH_S_PIOBUFAVAILUPD | + INFINIPATH_S_PIOENABLE; + ipath_write_kreg(dd, dd->ipath_kregs->kr_sendctrl, + dd->ipath_sendctrl); + /* and again, be sure all have hit the chip */ + ipath_read_kreg64(dd, dd->ipath_kregs->kr_scratch); + spin_unlock_irqrestore(&dd->ipath_sendctrl_lock, flags); + } + + if ((dd->ipath_flags & IPATH_HAS_SEND_DMA) && + !test_bit(IPATH_SDMA_DISABLED, &dd->ipath_sdma_status) && + test_bit(IPATH_SDMA_RUNNING, &dd->ipath_sdma_status)) { + spin_lock_irqsave(&dd->ipath_sdma_lock, flags); + /* only wait so long for intr */ + dd->ipath_sdma_abort_intr_timeout = jiffies + HZ; + dd->ipath_sdma_reset_wait = 200; + if (!test_bit(IPATH_SDMA_SHUTDOWN, &dd->ipath_sdma_status)) + tasklet_hi_schedule(&dd->ipath_sdma_abort_task); + spin_unlock_irqrestore(&dd->ipath_sdma_lock, flags); + } +bail:; +} + +/* + * Force an update of in-memory copy of the pioavail registers, when + * needed for any of a variety of reasons. We read the scratch register + * to make it highly likely that the update will have happened by the + * time we return. If already off (as in cancel_sends above), this + * routine is a nop, on the assumption that the caller will "do the + * right thing". 
+ */ +void ipath_force_pio_avail_update(struct ipath_devdata *dd) +{ + unsigned long flags; + + spin_lock_irqsave(&dd->ipath_sendctrl_lock, flags); + if (dd->ipath_sendctrl & INFINIPATH_S_PIOBUFAVAILUPD) { + ipath_write_kreg(dd, dd->ipath_kregs->kr_sendctrl, + dd->ipath_sendctrl & ~INFINIPATH_S_PIOBUFAVAILUPD); + ipath_read_kreg64(dd, dd->ipath_kregs->kr_scratch); + ipath_write_kreg(dd, dd->ipath_kregs->kr_sendctrl, + dd->ipath_sendctrl); + ipath_read_kreg64(dd, dd->ipath_kregs->kr_scratch); + } + spin_unlock_irqrestore(&dd->ipath_sendctrl_lock, flags); +} + +static void ipath_set_ib_lstate(struct ipath_devdata *dd, int linkcmd, + int linitcmd) +{ + u64 mod_wd; + static const char *what[4] = { + [0] = "NOP", + [INFINIPATH_IBCC_LINKCMD_DOWN] = "DOWN", + [INFINIPATH_IBCC_LINKCMD_ARMED] = "ARMED", + [INFINIPATH_IBCC_LINKCMD_ACTIVE] = "ACTIVE" + }; + + if (linitcmd == INFINIPATH_IBCC_LINKINITCMD_DISABLE) { + /* + * If we are told to disable, note that so link-recovery + * code does not attempt to bring us back up. + */ + preempt_disable(); + dd->ipath_flags |= IPATH_IB_LINK_DISABLED; + preempt_enable(); + } else if (linitcmd) { + /* + * Any other linkinitcmd will lead to LINKDOWN and then + * to INIT (if all is well), so clear flag to let + * link-recovery code attempt to bring us back up. + */ + preempt_disable(); + dd->ipath_flags &= ~IPATH_IB_LINK_DISABLED; + preempt_enable(); + } + + mod_wd = (linkcmd << dd->ibcc_lc_shift) | + (linitcmd << INFINIPATH_IBCC_LINKINITCMD_SHIFT); + ipath_cdbg(VERBOSE, + "Moving unit %u to %s (initcmd=0x%x), current ltstate is %s\n", + dd->ipath_unit, what[linkcmd], linitcmd, + ipath_ibcstatus_str[ipath_ib_linktrstate(dd, + ipath_read_kreg64(dd, dd->ipath_kregs->kr_ibcstatus))]); + + ipath_write_kreg(dd, dd->ipath_kregs->kr_ibcctrl, + dd->ipath_ibcctrl | mod_wd); + /* read from chip so write is flushed */ + (void) ipath_read_kreg64(dd, dd->ipath_kregs->kr_ibcstatus); +} + +int ipath_set_linkstate(struct ipath_devdata *dd, u8 newstate) +{ + u32 lstate; + int ret; + + switch (newstate) { + case IPATH_IB_LINKDOWN_ONLY: + ipath_set_ib_lstate(dd, INFINIPATH_IBCC_LINKCMD_DOWN, 0); + /* don't wait */ + ret = 0; + goto bail; + + case IPATH_IB_LINKDOWN: + ipath_set_ib_lstate(dd, INFINIPATH_IBCC_LINKCMD_DOWN, + INFINIPATH_IBCC_LINKINITCMD_POLL); + /* don't wait */ + ret = 0; + goto bail; + + case IPATH_IB_LINKDOWN_SLEEP: + ipath_set_ib_lstate(dd, INFINIPATH_IBCC_LINKCMD_DOWN, + INFINIPATH_IBCC_LINKINITCMD_SLEEP); + /* don't wait */ + ret = 0; + goto bail; + + case IPATH_IB_LINKDOWN_DISABLE: + ipath_set_ib_lstate(dd, INFINIPATH_IBCC_LINKCMD_DOWN, + INFINIPATH_IBCC_LINKINITCMD_DISABLE); + /* don't wait */ + ret = 0; + goto bail; + + case IPATH_IB_LINKARM: + if (dd->ipath_flags & IPATH_LINKARMED) { + ret = 0; + goto bail; + } + if (!(dd->ipath_flags & + (IPATH_LINKINIT | IPATH_LINKACTIVE))) { + ret = -EINVAL; + goto bail; + } + ipath_set_ib_lstate(dd, INFINIPATH_IBCC_LINKCMD_ARMED, 0); + + /* + * Since the port can transition to ACTIVE by receiving + * a non VL 15 packet, wait for either state. 
+ */ + lstate = IPATH_LINKARMED | IPATH_LINKACTIVE; + break; + + case IPATH_IB_LINKACTIVE: + if (dd->ipath_flags & IPATH_LINKACTIVE) { + ret = 0; + goto bail; + } + if (!(dd->ipath_flags & IPATH_LINKARMED)) { + ret = -EINVAL; + goto bail; + } + ipath_set_ib_lstate(dd, INFINIPATH_IBCC_LINKCMD_ACTIVE, 0); + lstate = IPATH_LINKACTIVE; + break; + + case IPATH_IB_LINK_LOOPBACK: + dev_info(&dd->pcidev->dev, "Enabling IB local loopback\n"); + dd->ipath_ibcctrl |= INFINIPATH_IBCC_LOOPBACK; + ipath_write_kreg(dd, dd->ipath_kregs->kr_ibcctrl, + dd->ipath_ibcctrl); + + /* turn heartbeat off, as it causes loopback to fail */ + dd->ipath_f_set_ib_cfg(dd, IPATH_IB_CFG_HRTBT, + IPATH_IB_HRTBT_OFF); + /* don't wait */ + ret = 0; + goto bail; + + case IPATH_IB_LINK_EXTERNAL: + dev_info(&dd->pcidev->dev, + "Disabling IB local loopback (normal)\n"); + dd->ipath_f_set_ib_cfg(dd, IPATH_IB_CFG_HRTBT, + IPATH_IB_HRTBT_ON); + dd->ipath_ibcctrl &= ~INFINIPATH_IBCC_LOOPBACK; + ipath_write_kreg(dd, dd->ipath_kregs->kr_ibcctrl, + dd->ipath_ibcctrl); + /* don't wait */ + ret = 0; + goto bail; + + /* + * Heartbeat can be explicitly enabled by the user via + * "hrtbt_enable" "file", and if disabled, trying to enable here + * will have no effect. Implicit changes (heartbeat off when + * loopback on, and vice versa) are included to ease testing. + */ + case IPATH_IB_LINK_HRTBT: + ret = dd->ipath_f_set_ib_cfg(dd, IPATH_IB_CFG_HRTBT, + IPATH_IB_HRTBT_ON); + goto bail; + + case IPATH_IB_LINK_NO_HRTBT: + ret = dd->ipath_f_set_ib_cfg(dd, IPATH_IB_CFG_HRTBT, + IPATH_IB_HRTBT_OFF); + goto bail; + + default: + ipath_dbg("Invalid linkstate 0x%x requested\n", newstate); + ret = -EINVAL; + goto bail; + } + ret = ipath_wait_linkstate(dd, lstate, 2000); + +bail: + return ret; +} + +/** + * ipath_set_mtu - set the MTU + * @dd: the infinipath device + * @arg: the new MTU + * + * we can handle "any" incoming size, the issue here is whether we + * need to restrict our outgoing size. For now, we don't do any + * sanity checking on this, and we don't deal with what happens to + * programs that are already running when the size changes. + * NOTE: changing the MTU will usually cause the IBC to go back to + * link INIT state... + */ +int ipath_set_mtu(struct ipath_devdata *dd, u16 arg) +{ + u32 piosize; + int changed = 0; + int ret; + + /* + * mtu is IB data payload max. It's the largest power of 2 less + * than piosize (or even larger, since it only really controls the + * largest we can receive; we can send the max of the mtu and + * piosize). We check that it's one of the valid IB sizes. 
+ */
+ if (arg != 256 && arg != 512 && arg != 1024 && arg != 2048 &&
+ (arg != 4096 || !ipath_mtu4096)) {
+ ipath_dbg("Trying to set invalid mtu %u, failing\n", arg);
+ ret = -EINVAL;
+ goto bail;
+ }
+ if (dd->ipath_ibmtu == arg) {
+ ret = 0; /* same as current */
+ goto bail;
+ }
+
+ piosize = dd->ipath_ibmaxlen;
+ dd->ipath_ibmtu = arg;
+
+ if (arg >= (piosize - IPATH_PIO_MAXIBHDR)) {
+ /* Only if it's not the initial value (or reset to it) */
+ if (piosize != dd->ipath_init_ibmaxlen) {
+ if (arg > piosize && arg <= dd->ipath_init_ibmaxlen)
+ piosize = dd->ipath_init_ibmaxlen;
+ dd->ipath_ibmaxlen = piosize;
+ changed = 1;
+ }
+ } else if ((arg + IPATH_PIO_MAXIBHDR) != dd->ipath_ibmaxlen) {
+ piosize = arg + IPATH_PIO_MAXIBHDR;
+ ipath_cdbg(VERBOSE, "ibmaxlen was 0x%x, setting to 0x%x "
+ "(mtu 0x%x)\n", dd->ipath_ibmaxlen, piosize,
+ arg);
+ dd->ipath_ibmaxlen = piosize;
+ changed = 1;
+ }
+
+ if (changed) {
+ u64 ibc = dd->ipath_ibcctrl, ibdw;
+ /*
+ * update our housekeeping variables, and set IBC max
+ * size, same as init code; max IBC is max we allow in
+ * buffer, less the qword pbc, plus 1 for ICRC, in dwords
+ */
+ dd->ipath_ibmaxlen = piosize - 2 * sizeof(u32);
+ ibdw = (dd->ipath_ibmaxlen >> 2) + 1;
+ ibc &= ~(INFINIPATH_IBCC_MAXPKTLEN_MASK <<
+ dd->ibcc_mpl_shift);
+ ibc |= ibdw << dd->ibcc_mpl_shift;
+ dd->ipath_ibcctrl = ibc;
+ ipath_write_kreg(dd, dd->ipath_kregs->kr_ibcctrl,
+ dd->ipath_ibcctrl);
+ dd->ipath_f_tidtemplate(dd);
+ }
+
+ ret = 0;
+
+bail:
+ return ret;
+}
+
+int ipath_set_lid(struct ipath_devdata *dd, u32 lid, u8 lmc)
+{
+ dd->ipath_lid = lid;
+ dd->ipath_lmc = lmc;
+
+ dd->ipath_f_set_ib_cfg(dd, IPATH_IB_CFG_LIDLMC, lid |
+ (~((1U << lmc) - 1)) << 16);
+
+ dev_info(&dd->pcidev->dev, "We got a lid: 0x%x\n", lid);
+
+ return 0;
+}
+
+
+/**
+ * ipath_write_kreg_port - write a device's per-port 64-bit kernel register
+ * @dd: the infinipath device
+ * @regno: the register number to write
+ * @port: the port containing the register
+ * @value: the value to write
+ *
+ * Registers that vary with the chip implementation constants (port)
+ * use this routine.
+ */
+void ipath_write_kreg_port(const struct ipath_devdata *dd, ipath_kreg regno,
+ unsigned port, u64 value)
+{
+ u16 where;
+
+ if (port < dd->ipath_portcnt &&
+ (regno == dd->ipath_kregs->kr_rcvhdraddr ||
+ regno == dd->ipath_kregs->kr_rcvhdrtailaddr))
+ where = regno + port;
+ else
+ where = -1;
+
+ ipath_write_kreg(dd, where, value);
+}
+
+/*
+ * Following deal with the "obviously simple" task of overriding the state
+ * of the LEDS, which normally indicate link physical and logical status.
+ * The complications arise in dealing with different hardware mappings
+ * and the board-dependent routine being called from interrupts.
+ * and then there's the requirement to _flash_ them.
+ */
+#define LED_OVER_FREQ_SHIFT 8
+#define LED_OVER_FREQ_MASK (0xFF<<LED_OVER_FREQ_SHIFT)
+#define LED_OVER_BOTH_OFF (8)
+
+static void ipath_run_led_override(unsigned long opaque)
+{
+ struct ipath_devdata *dd = (struct ipath_devdata *)opaque;
+ int timeoff;
+ int pidx;
+ u64 lstate, ltstate, val;
+
+ if (!(dd->ipath_flags & IPATH_INITTED))
+ return;
+
+ pidx = dd->ipath_led_override_phase++ & 1;
+ dd->ipath_led_override = dd->ipath_led_override_vals[pidx];
+ timeoff = dd->ipath_led_override_timeoff;
+
+ /*
+ * below potentially restores the LED values per current status,
+ * should also possibly setup the traffic-blink register,
+ * but leave that to per-chip functions.
+ */ + val = ipath_read_kreg64(dd, dd->ipath_kregs->kr_ibcstatus); + ltstate = ipath_ib_linktrstate(dd, val); + lstate = ipath_ib_linkstate(dd, val); + + dd->ipath_f_setextled(dd, lstate, ltstate); + mod_timer(&dd->ipath_led_override_timer, jiffies + timeoff); +} + +void ipath_set_led_override(struct ipath_devdata *dd, unsigned int val) +{ + int timeoff, freq; + + if (!(dd->ipath_flags & IPATH_INITTED)) + return; + + /* First check if we are blinking. If not, use 1HZ polling */ + timeoff = HZ; + freq = (val & LED_OVER_FREQ_MASK) >> LED_OVER_FREQ_SHIFT; + + if (freq) { + /* For blink, set each phase from one nybble of val */ + dd->ipath_led_override_vals[0] = val & 0xF; + dd->ipath_led_override_vals[1] = (val >> 4) & 0xF; + timeoff = (HZ << 4)/freq; + } else { + /* Non-blink set both phases the same. */ + dd->ipath_led_override_vals[0] = val & 0xF; + dd->ipath_led_override_vals[1] = val & 0xF; + } + dd->ipath_led_override_timeoff = timeoff; + + /* + * If the timer has not already been started, do so. Use a "quick" + * timeout so the function will be called soon, to look at our request. + */ + if (atomic_inc_return(&dd->ipath_led_override_timer_active) == 1) { + /* Need to start timer */ + init_timer(&dd->ipath_led_override_timer); + dd->ipath_led_override_timer.function = + ipath_run_led_override; + dd->ipath_led_override_timer.data = (unsigned long) dd; + dd->ipath_led_override_timer.expires = jiffies + 1; + add_timer(&dd->ipath_led_override_timer); + } else + atomic_dec(&dd->ipath_led_override_timer_active); +} + +/** + * ipath_shutdown_device - shut down a device + * @dd: the infinipath device + * + * This is called to make the device quiet when we are about to + * unload the driver, and also when the device is administratively + * disabled. It does not free any data structures. + * Everything it does has to be setup again by ipath_init_chip(dd,1) + */ +void ipath_shutdown_device(struct ipath_devdata *dd) +{ + unsigned long flags; + + ipath_dbg("Shutting down the device\n"); + + ipath_hol_up(dd); /* make sure user processes aren't suspended */ + + dd->ipath_flags |= IPATH_LINKUNK; + dd->ipath_flags &= ~(IPATH_INITTED | IPATH_LINKDOWN | + IPATH_LINKINIT | IPATH_LINKARMED | + IPATH_LINKACTIVE); + *dd->ipath_statusp &= ~(IPATH_STATUS_IB_CONF | + IPATH_STATUS_IB_READY); + + /* mask interrupts, but not errors */ + ipath_write_kreg(dd, dd->ipath_kregs->kr_intmask, 0ULL); + + dd->ipath_rcvctrl = 0; + ipath_write_kreg(dd, dd->ipath_kregs->kr_rcvctrl, + dd->ipath_rcvctrl); + + if (dd->ipath_flags & IPATH_HAS_SEND_DMA) + teardown_sdma(dd); + + /* + * gracefully stop all sends allowing any in progress to trickle out + * first. + */ + spin_lock_irqsave(&dd->ipath_sendctrl_lock, flags); + dd->ipath_sendctrl = 0; + ipath_write_kreg(dd, dd->ipath_kregs->kr_sendctrl, dd->ipath_sendctrl); + /* flush it */ + ipath_read_kreg64(dd, dd->ipath_kregs->kr_scratch); + spin_unlock_irqrestore(&dd->ipath_sendctrl_lock, flags); + + /* + * enough for anything that's going to trickle out to have actually + * done so. + */ + udelay(5); + + dd->ipath_f_setextled(dd, 0, 0); /* make sure LEDs are off */ + + ipath_set_ib_lstate(dd, 0, INFINIPATH_IBCC_LINKINITCMD_DISABLE); + ipath_cancel_sends(dd, 0); + + /* + * we are shutting down, so tell components that care. We don't do + * this on just a link state change, much like ethernet, a cable + * unplug, etc. 
doesn't change driver state + */ + signal_ib_event(dd, IB_EVENT_PORT_ERR); + + /* disable IBC */ + dd->ipath_control &= ~INFINIPATH_C_LINKENABLE; + ipath_write_kreg(dd, dd->ipath_kregs->kr_control, + dd->ipath_control | INFINIPATH_C_FREEZEMODE); + + /* + * clear SerdesEnable and turn the leds off; do this here because + * we are unloading, so don't count on interrupts to move along + * Turn the LEDs off explicitly for the same reason. + */ + dd->ipath_f_quiet_serdes(dd); + + /* stop all the timers that might still be running */ + del_timer_sync(&dd->ipath_hol_timer); + if (dd->ipath_stats_timer_active) { + del_timer_sync(&dd->ipath_stats_timer); + dd->ipath_stats_timer_active = 0; + } + if (dd->ipath_intrchk_timer.data) { + del_timer_sync(&dd->ipath_intrchk_timer); + dd->ipath_intrchk_timer.data = 0; + } + if (atomic_read(&dd->ipath_led_override_timer_active)) { + del_timer_sync(&dd->ipath_led_override_timer); + atomic_set(&dd->ipath_led_override_timer_active, 0); + } + + /* + * clear all interrupts and errors, so that the next time the driver + * is loaded or device is enabled, we know that whatever is set + * happened while we were unloaded + */ + ipath_write_kreg(dd, dd->ipath_kregs->kr_hwerrclear, + ~0ULL & ~INFINIPATH_HWE_MEMBISTFAILED); + ipath_write_kreg(dd, dd->ipath_kregs->kr_errorclear, -1LL); + ipath_write_kreg(dd, dd->ipath_kregs->kr_intclear, -1LL); + + ipath_cdbg(VERBOSE, "Flush time and errors to EEPROM\n"); + ipath_update_eeprom_log(dd); +} + +/** + * ipath_free_pddata - free a port's allocated data + * @dd: the infinipath device + * @pd: the portdata structure + * + * free up any allocated data for a port + * This should not touch anything that would affect a simultaneous + * re-allocation of port data, because it is called after ipath_mutex + * is released (and can be called from reinit as well). + * It should never change any chip state, or global driver state. + * (The only exception to global state is freeing the port0 port0_skbs.) 
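/*
 * Illustrative sketch, not part of this patch: ipath_set_led_override()
 * above decodes a user-supplied override word into two 4-bit LED phases and
 * a blink frequency. This standalone decoder assumes the usual layout of an
 * 8-bit frequency field at LED_OVER_FREQ_SHIFT (8); the HZ value here is a
 * placeholder tick rate for the demo.
 */
#include <stdio.h>

#define HZ 250 /* placeholder tick rate */

#define LED_OVER_FREQ_SHIFT 8
#define LED_OVER_FREQ_MASK (0xFFu << LED_OVER_FREQ_SHIFT) /* assumed layout */

struct led_override {
	unsigned phase_vals[2]; /* 4-bit LED patterns, alternated by the timer */
	unsigned timeoff;       /* ticks between phase flips */
};

static void decode_led_override(unsigned val, struct led_override *o)
{
	unsigned freq = (val & LED_OVER_FREQ_MASK) >> LED_OVER_FREQ_SHIFT;

	if (freq) {
		/* blinking: each phase comes from one nybble of val */
		o->phase_vals[0] = val & 0xF;
		o->phase_vals[1] = (val >> 4) & 0xF;
		o->timeoff = (HZ << 4) / freq;
	} else {
		/* non-blink: same pattern in both phases, re-applied at 1 Hz */
		o->phase_vals[0] = o->phase_vals[1] = val & 0xF;
		o->timeoff = HZ;
	}
}

int main(void)
{
	struct led_override o;

	decode_led_override(0x100F, &o); /* freq field 0x10, phases 0xF / 0x0 */
	printf("phases %x/%x, period %u ticks\n",
	       o.phase_vals[0], o.phase_vals[1], o.timeoff);
	return 0;
}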
+ */ +void ipath_free_pddata(struct ipath_devdata *dd, struct ipath_portdata *pd) +{ + if (!pd) + return; + + if (pd->port_rcvhdrq) { + ipath_cdbg(VERBOSE, "free closed port %d rcvhdrq @ %p " + "(size=%lu)\n", pd->port_port, pd->port_rcvhdrq, + (unsigned long) pd->port_rcvhdrq_size); + dma_free_coherent(&dd->pcidev->dev, pd->port_rcvhdrq_size, + pd->port_rcvhdrq, pd->port_rcvhdrq_phys); + pd->port_rcvhdrq = NULL; + if (pd->port_rcvhdrtail_kvaddr) { + dma_free_coherent(&dd->pcidev->dev, PAGE_SIZE, + pd->port_rcvhdrtail_kvaddr, + pd->port_rcvhdrqtailaddr_phys); + pd->port_rcvhdrtail_kvaddr = NULL; + } + } + if (pd->port_port && pd->port_rcvegrbuf) { + unsigned e; + + for (e = 0; e < pd->port_rcvegrbuf_chunks; e++) { + void *base = pd->port_rcvegrbuf[e]; + size_t size = pd->port_rcvegrbuf_size; + + ipath_cdbg(VERBOSE, "egrbuf free(%p, %lu), " + "chunk %u/%u\n", base, + (unsigned long) size, + e, pd->port_rcvegrbuf_chunks); + dma_free_coherent(&dd->pcidev->dev, size, + base, pd->port_rcvegrbuf_phys[e]); + } + kfree(pd->port_rcvegrbuf); + pd->port_rcvegrbuf = NULL; + kfree(pd->port_rcvegrbuf_phys); + pd->port_rcvegrbuf_phys = NULL; + pd->port_rcvegrbuf_chunks = 0; + } else if (pd->port_port == 0 && dd->ipath_port0_skbinfo) { + unsigned e; + struct ipath_skbinfo *skbinfo = dd->ipath_port0_skbinfo; + + dd->ipath_port0_skbinfo = NULL; + ipath_cdbg(VERBOSE, "free closed port %d " + "ipath_port0_skbinfo @ %p\n", pd->port_port, + skbinfo); + for (e = 0; e < dd->ipath_p0_rcvegrcnt; e++) + if (skbinfo[e].skb) { + pci_unmap_single(dd->pcidev, skbinfo[e].phys, + dd->ipath_ibmaxlen, + PCI_DMA_FROMDEVICE); + dev_kfree_skb(skbinfo[e].skb); + } + vfree(skbinfo); + } + kfree(pd->port_tid_pg_list); + vfree(pd->subport_uregbase); + vfree(pd->subport_rcvegrbuf); + vfree(pd->subport_rcvhdr_base); + kfree(pd); +} + +static int __init infinipath_init(void) +{ + int ret; + + if (ipath_debug & __IPATH_DBG) + printk(KERN_INFO DRIVER_LOAD_MSG "%s", ib_ipath_version); + + /* + * These must be called before the driver is registered with + * the PCI subsystem. + */ + idr_init(&unit_table); + + ret = pci_register_driver(&ipath_driver); + if (ret < 0) { + printk(KERN_ERR IPATH_DRV_NAME + ": Unable to register driver: error %d\n", -ret); + goto bail_unit; + } + + ret = ipath_init_ipathfs(); + if (ret < 0) { + printk(KERN_ERR IPATH_DRV_NAME ": Unable to create " + "ipathfs: error %d\n", -ret); + goto bail_pci; + } + + goto bail; + +bail_pci: + pci_unregister_driver(&ipath_driver); + +bail_unit: + idr_destroy(&unit_table); + +bail: + return ret; +} + +static void __exit infinipath_cleanup(void) +{ + ipath_exit_ipathfs(); + + ipath_cdbg(VERBOSE, "Unregistering pci driver\n"); + pci_unregister_driver(&ipath_driver); + + idr_destroy(&unit_table); +} + +/** + * ipath_reset_device - reset the chip if possible + * @unit: the device to reset + * + * Whether or not reset is successful, we attempt to re-initialize the chip + * (that is, much like a driver unload/reload). We clear the INITTED flag + * so that the various entry points will fail until we reinitialize. 
For + * now, we only allow this if no user ports are open that use chip resources + */ +int ipath_reset_device(int unit) +{ + int ret, i; + struct ipath_devdata *dd = ipath_lookup(unit); + unsigned long flags; + + if (!dd) { + ret = -ENODEV; + goto bail; + } + + if (atomic_read(&dd->ipath_led_override_timer_active)) { + /* Need to stop LED timer, _then_ shut off LEDs */ + del_timer_sync(&dd->ipath_led_override_timer); + atomic_set(&dd->ipath_led_override_timer_active, 0); + } + + /* Shut off LEDs after we are sure timer is not running */ + dd->ipath_led_override = LED_OVER_BOTH_OFF; + dd->ipath_f_setextled(dd, 0, 0); + + dev_info(&dd->pcidev->dev, "Reset on unit %u requested\n", unit); + + if (!dd->ipath_kregbase || !(dd->ipath_flags & IPATH_PRESENT)) { + dev_info(&dd->pcidev->dev, "Invalid unit number %u or " + "not initialized or not present\n", unit); + ret = -ENXIO; + goto bail; + } + + spin_lock_irqsave(&dd->ipath_uctxt_lock, flags); + if (dd->ipath_pd) + for (i = 1; i < dd->ipath_cfgports; i++) { + if (!dd->ipath_pd[i] || !dd->ipath_pd[i]->port_cnt) + continue; + spin_unlock_irqrestore(&dd->ipath_uctxt_lock, flags); + ipath_dbg("unit %u port %d is in use " + "(PID %u cmd %s), can't reset\n", + unit, i, + pid_nr(dd->ipath_pd[i]->port_pid), + dd->ipath_pd[i]->port_comm); + ret = -EBUSY; + goto bail; + } + spin_unlock_irqrestore(&dd->ipath_uctxt_lock, flags); + + if (dd->ipath_flags & IPATH_HAS_SEND_DMA) + teardown_sdma(dd); + + dd->ipath_flags &= ~IPATH_INITTED; + ipath_write_kreg(dd, dd->ipath_kregs->kr_intmask, 0ULL); + ret = dd->ipath_f_reset(dd); + if (ret == 1) { + ipath_dbg("Reinitializing unit %u after reset attempt\n", + unit); + ret = ipath_init_chip(dd, 1); + } else + ret = -EAGAIN; + if (ret) + ipath_dev_err(dd, "Reinitialize unit %u after " + "reset failed with %d\n", unit, ret); + else + dev_info(&dd->pcidev->dev, "Reinitialized unit %u after " + "resetting\n", unit); + +bail: + return ret; +} + +/* + * send a signal to all the processes that have the driver open + * through the normal interfaces (i.e., everything other than diags + * interface). Returns number of signalled processes. 
+ */ +static int ipath_signal_procs(struct ipath_devdata *dd, int sig) +{ + int i, sub, any = 0; + struct pid *pid; + unsigned long flags; + + if (!dd->ipath_pd) + return 0; + + spin_lock_irqsave(&dd->ipath_uctxt_lock, flags); + for (i = 1; i < dd->ipath_cfgports; i++) { + if (!dd->ipath_pd[i] || !dd->ipath_pd[i]->port_cnt) + continue; + pid = dd->ipath_pd[i]->port_pid; + if (!pid) + continue; + + dev_info(&dd->pcidev->dev, "context %d in use " + "(PID %u), sending signal %d\n", + i, pid_nr(pid), sig); + kill_pid(pid, sig, 1); + any++; + for (sub = 0; sub < INFINIPATH_MAX_SUBPORT; sub++) { + pid = dd->ipath_pd[i]->port_subpid[sub]; + if (!pid) + continue; + dev_info(&dd->pcidev->dev, "sub-context " + "%d:%d in use (PID %u), sending " + "signal %d\n", i, sub, pid_nr(pid), sig); + kill_pid(pid, sig, 1); + any++; + } + } + spin_unlock_irqrestore(&dd->ipath_uctxt_lock, flags); + return any; +} + +static void ipath_hol_signal_down(struct ipath_devdata *dd) +{ + if (ipath_signal_procs(dd, SIGSTOP)) + ipath_dbg("Stopped some processes\n"); + ipath_cancel_sends(dd, 1); +} + + +static void ipath_hol_signal_up(struct ipath_devdata *dd) +{ + if (ipath_signal_procs(dd, SIGCONT)) + ipath_dbg("Continued some processes\n"); +} + +/* + * link is down, stop any users processes, and flush pending sends + * to prevent HoL blocking, then start the HoL timer that + * periodically continues, then stop procs, so they can detect + * link down if they want, and do something about it. + * Timer may already be running, so use mod_timer, not add_timer. + */ +void ipath_hol_down(struct ipath_devdata *dd) +{ + dd->ipath_hol_state = IPATH_HOL_DOWN; + ipath_hol_signal_down(dd); + dd->ipath_hol_next = IPATH_HOL_DOWNCONT; + dd->ipath_hol_timer.expires = jiffies + + msecs_to_jiffies(ipath_hol_timeout_ms); + mod_timer(&dd->ipath_hol_timer, dd->ipath_hol_timer.expires); +} + +/* + * link is up, continue any user processes, and ensure timer + * is a nop, if running. Let timer keep running, if set; it + * will nop when it sees the link is up + */ +void ipath_hol_up(struct ipath_devdata *dd) +{ + ipath_hol_signal_up(dd); + dd->ipath_hol_state = IPATH_HOL_UP; +} + +/* + * toggle the running/not running state of user proceses + * to prevent HoL blocking on chip resources, but still allow + * user processes to do link down special case handling. 
+ * Should only be called via the timer + */ +void ipath_hol_event(unsigned long opaque) +{ + struct ipath_devdata *dd = (struct ipath_devdata *)opaque; + + if (dd->ipath_hol_next == IPATH_HOL_DOWNSTOP + && dd->ipath_hol_state != IPATH_HOL_UP) { + dd->ipath_hol_next = IPATH_HOL_DOWNCONT; + ipath_dbg("Stopping processes\n"); + ipath_hol_signal_down(dd); + } else { /* may do "extra" if also in ipath_hol_up() */ + dd->ipath_hol_next = IPATH_HOL_DOWNSTOP; + ipath_dbg("Continuing processes\n"); + ipath_hol_signal_up(dd); + } + if (dd->ipath_hol_state == IPATH_HOL_UP) + ipath_dbg("link's up, don't resched timer\n"); + else { + dd->ipath_hol_timer.expires = jiffies + + msecs_to_jiffies(ipath_hol_timeout_ms); + mod_timer(&dd->ipath_hol_timer, + dd->ipath_hol_timer.expires); + } +} + +int ipath_set_rx_pol_inv(struct ipath_devdata *dd, u8 new_pol_inv) +{ + u64 val; + + if (new_pol_inv > INFINIPATH_XGXS_RX_POL_MASK) + return -1; + if (dd->ipath_rx_pol_inv != new_pol_inv) { + dd->ipath_rx_pol_inv = new_pol_inv; + val = ipath_read_kreg64(dd, dd->ipath_kregs->kr_xgxsconfig); + val &= ~(INFINIPATH_XGXS_RX_POL_MASK << + INFINIPATH_XGXS_RX_POL_SHIFT); + val |= ((u64)dd->ipath_rx_pol_inv) << + INFINIPATH_XGXS_RX_POL_SHIFT; + ipath_write_kreg(dd, dd->ipath_kregs->kr_xgxsconfig, val); + } + return 0; +} + +/* + * Disable and enable the armlaunch error. Used for PIO bandwidth testing on + * the 7220, which is count-based, rather than trigger-based. Safe for the + * driver check, since it's at init. Not completely safe when used for + * user-mode checking, since some error checking can be lost, but not + * particularly risky, and only has problematic side-effects in the face of + * very buggy user code. There is no reference counting, but that's also + * fine, given the intended use. + */ +void ipath_enable_armlaunch(struct ipath_devdata *dd) +{ + dd->ipath_lasterror &= ~INFINIPATH_E_SPIOARMLAUNCH; + ipath_write_kreg(dd, dd->ipath_kregs->kr_errorclear, + INFINIPATH_E_SPIOARMLAUNCH); + dd->ipath_errormask |= INFINIPATH_E_SPIOARMLAUNCH; + ipath_write_kreg(dd, dd->ipath_kregs->kr_errormask, + dd->ipath_errormask); +} + +void ipath_disable_armlaunch(struct ipath_devdata *dd) +{ + /* so don't re-enable if already set */ + dd->ipath_maskederrs &= ~INFINIPATH_E_SPIOARMLAUNCH; + dd->ipath_errormask &= ~INFINIPATH_E_SPIOARMLAUNCH; + ipath_write_kreg(dd, dd->ipath_kregs->kr_errormask, + dd->ipath_errormask); +} + +module_init(infinipath_init); +module_exit(infinipath_cleanup); diff --git a/kernel/drivers/infiniband/hw/ipath/ipath_eeprom.c b/kernel/drivers/infiniband/hw/ipath/ipath_eeprom.c new file mode 100644 index 000000000..fc7181985 --- /dev/null +++ b/kernel/drivers/infiniband/hw/ipath/ipath_eeprom.c @@ -0,0 +1,1183 @@ +/* + * Copyright (c) 2006, 2007, 2008 QLogic Corporation. All rights reserved. + * Copyright (c) 2003, 2004, 2005, 2006 PathScale, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. 
+ * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include +#include +#include + +#include "ipath_kernel.h" + +/* + * InfiniPath I2C driver for a serial eeprom. This is not a generic + * I2C interface. For a start, the device we're using (Atmel AT24C11) + * doesn't work like a regular I2C device. It looks like one + * electrically, but not logically. Normal I2C devices have a single + * 7-bit or 10-bit I2C address that they respond to. Valid 7-bit + * addresses range from 0x03 to 0x77. Addresses 0x00 to 0x02 and 0x78 + * to 0x7F are special reserved addresses (e.g. 0x00 is the "general + * call" address.) The Atmel device, on the other hand, responds to ALL + * 7-bit addresses. It's designed to be the only device on a given I2C + * bus. A 7-bit address corresponds to the memory address within the + * Atmel device itself. + * + * Also, the timing requirements mean more than simple software + * bitbanging, with readbacks from chip to ensure timing (simple udelay + * is not enough). + * + * This all means that accessing the device is specialized enough + * that using the standard kernel I2C bitbanging interface would be + * impossible. For example, the core I2C eeprom driver expects to find + * a device at one or more of a limited set of addresses only. It doesn't + * allow writing to an eeprom. It also doesn't provide any means of + * accessing eeprom contents from within the kernel, only via sysfs. + */ + +/* Added functionality for IBA7220-based cards */ +#define IPATH_EEPROM_DEV_V1 0xA0 +#define IPATH_EEPROM_DEV_V2 0xA2 +#define IPATH_TEMP_DEV 0x98 +#define IPATH_BAD_DEV (IPATH_EEPROM_DEV_V2+2) +#define IPATH_NO_DEV (0xFF) + +/* + * The number of I2C chains is proliferating. Table below brings + * some order to the madness. The basic principle is that the + * table is scanned from the top, and a "probe" is made to the + * device probe_dev. If that succeeds, the chain is considered + * to be of that type, and dd->i2c_chain_type is set to the index+1 + * of the entry. + * The +1 is so static initialization can mean "unknown, do probe." 
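/*
 * Illustrative sketch, not part of this patch: the comment above describes
 * scanning a chain-descriptor table from the top, probing each candidate
 * device address, and memoizing (index + 1) so a zero-initialized field
 * means "unknown, do probe". The real table and ipath_i2c_type() follow
 * below; here the probe callback and its behaviour are stand-ins.
 */
#include <stdio.h>

#define NO_DEV 0xFF

struct chain_desc {
	unsigned char probe_dev, eeprom_dev, temp_dev;
};

static const struct chain_desc chains[] = {
	{ 0xA4,   NO_DEV, NO_DEV }, /* pre-IBA7220 boards: probe-only address */
	{ 0xA0,   0xA0,   0x98   }, /* V1 EEPROM + temp sensor */
	{ 0xA2,   0xA2,   0x98   }, /* V2 */
	{ NO_DEV }                  /* terminator */
};

/* probe() returns 0 on success, like i2c_probe(); *memo caches index + 1. */
static const struct chain_desc *chain_type(int *memo,
					   int (*probe)(unsigned char dev))
{
	int idx = *memo - 1;

	if (idx >= 0 && idx < (int)(sizeof(chains) / sizeof(chains[0])) - 1)
		return &chains[idx];    /* already determined on a prior call */

	for (idx = 0; chains[idx].probe_dev != NO_DEV; idx++)
		if (!probe(chains[idx].probe_dev))
			break;

	if (chains[idx].probe_dev == NO_DEV)
		return NULL;
	*memo = idx + 1;
	return &chains[idx];
}

static int fake_probe(unsigned char dev) { return dev == 0xA2 ? 0 : 1; }

int main(void)
{
	int memo = 0; /* "unknown, do probe" */
	const struct chain_desc *icd = chain_type(&memo, fake_probe);

	printf("eeprom dev 0x%02X (memo %d)\n", icd ? icd->eeprom_dev : 0, memo);
	return 0;
}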
+ */ +static struct i2c_chain_desc { + u8 probe_dev; /* If seen at probe, chain is this type */ + u8 eeprom_dev; /* Dev addr (if any) for EEPROM */ + u8 temp_dev; /* Dev Addr (if any) for Temp-sense */ +} i2c_chains[] = { + { IPATH_BAD_DEV, IPATH_NO_DEV, IPATH_NO_DEV }, /* pre-iba7220 bds */ + { IPATH_EEPROM_DEV_V1, IPATH_EEPROM_DEV_V1, IPATH_TEMP_DEV}, /* V1 */ + { IPATH_EEPROM_DEV_V2, IPATH_EEPROM_DEV_V2, IPATH_TEMP_DEV}, /* V2 */ + { IPATH_NO_DEV } +}; + +enum i2c_type { + i2c_line_scl = 0, + i2c_line_sda +}; + +enum i2c_state { + i2c_line_low = 0, + i2c_line_high +}; + +#define READ_CMD 1 +#define WRITE_CMD 0 + +/** + * i2c_gpio_set - set a GPIO line + * @dd: the infinipath device + * @line: the line to set + * @new_line_state: the state to set + * + * Returns 0 if the line was set to the new state successfully, non-zero + * on error. + */ +static int i2c_gpio_set(struct ipath_devdata *dd, + enum i2c_type line, + enum i2c_state new_line_state) +{ + u64 out_mask, dir_mask, *gpioval; + unsigned long flags = 0; + + gpioval = &dd->ipath_gpio_out; + + if (line == i2c_line_scl) { + dir_mask = dd->ipath_gpio_scl; + out_mask = (1UL << dd->ipath_gpio_scl_num); + } else { + dir_mask = dd->ipath_gpio_sda; + out_mask = (1UL << dd->ipath_gpio_sda_num); + } + + spin_lock_irqsave(&dd->ipath_gpio_lock, flags); + if (new_line_state == i2c_line_high) { + /* tri-state the output rather than force high */ + dd->ipath_extctrl &= ~dir_mask; + } else { + /* config line to be an output */ + dd->ipath_extctrl |= dir_mask; + } + ipath_write_kreg(dd, dd->ipath_kregs->kr_extctrl, dd->ipath_extctrl); + + /* set output as well (no real verify) */ + if (new_line_state == i2c_line_high) + *gpioval |= out_mask; + else + *gpioval &= ~out_mask; + + ipath_write_kreg(dd, dd->ipath_kregs->kr_gpio_out, *gpioval); + spin_unlock_irqrestore(&dd->ipath_gpio_lock, flags); + + return 0; +} + +/** + * i2c_gpio_get - get a GPIO line state + * @dd: the infinipath device + * @line: the line to get + * @curr_statep: where to put the line state + * + * Returns 0 if the line was set to the new state successfully, non-zero + * on error. curr_state is not set on error. + */ +static int i2c_gpio_get(struct ipath_devdata *dd, + enum i2c_type line, + enum i2c_state *curr_statep) +{ + u64 read_val, mask; + int ret; + unsigned long flags = 0; + + /* check args */ + if (curr_statep == NULL) { + ret = 1; + goto bail; + } + + /* config line to be an input */ + if (line == i2c_line_scl) + mask = dd->ipath_gpio_scl; + else + mask = dd->ipath_gpio_sda; + + spin_lock_irqsave(&dd->ipath_gpio_lock, flags); + dd->ipath_extctrl &= ~mask; + ipath_write_kreg(dd, dd->ipath_kregs->kr_extctrl, dd->ipath_extctrl); + /* + * Below is very unlikely to reflect true input state if Output + * Enable actually changed. + */ + read_val = ipath_read_kreg64(dd, dd->ipath_kregs->kr_extstatus); + spin_unlock_irqrestore(&dd->ipath_gpio_lock, flags); + + if (read_val & mask) + *curr_statep = i2c_line_high; + else + *curr_statep = i2c_line_low; + + ret = 0; + +bail: + return ret; +} + +/** + * i2c_wait_for_writes - wait for a write + * @dd: the infinipath device + * + * We use this instead of udelay directly, so we can make sure + * that previous register writes have been flushed all the way + * to the chip. 
Since we are delaying anyway, the cost doesn't + * hurt, and makes the bit twiddling more regular + */ +static void i2c_wait_for_writes(struct ipath_devdata *dd) +{ + (void)ipath_read_kreg32(dd, dd->ipath_kregs->kr_scratch); + rmb(); +} + +static void scl_out(struct ipath_devdata *dd, u8 bit) +{ + udelay(1); + i2c_gpio_set(dd, i2c_line_scl, bit ? i2c_line_high : i2c_line_low); + + i2c_wait_for_writes(dd); +} + +static void sda_out(struct ipath_devdata *dd, u8 bit) +{ + i2c_gpio_set(dd, i2c_line_sda, bit ? i2c_line_high : i2c_line_low); + + i2c_wait_for_writes(dd); +} + +static u8 sda_in(struct ipath_devdata *dd, int wait) +{ + enum i2c_state bit; + + if (i2c_gpio_get(dd, i2c_line_sda, &bit)) + ipath_dbg("get bit failed!\n"); + + if (wait) + i2c_wait_for_writes(dd); + + return bit == i2c_line_high ? 1U : 0; +} + +/** + * i2c_ackrcv - see if ack following write is true + * @dd: the infinipath device + */ +static int i2c_ackrcv(struct ipath_devdata *dd) +{ + u8 ack_received; + + /* AT ENTRY SCL = LOW */ + /* change direction, ignore data */ + ack_received = sda_in(dd, 1); + scl_out(dd, i2c_line_high); + ack_received = sda_in(dd, 1) == 0; + scl_out(dd, i2c_line_low); + return ack_received; +} + +/** + * rd_byte - read a byte, leaving ACK, STOP, etc up to caller + * @dd: the infinipath device + * + * Returns byte shifted out of device + */ +static int rd_byte(struct ipath_devdata *dd) +{ + int bit_cntr, data; + + data = 0; + + for (bit_cntr = 7; bit_cntr >= 0; --bit_cntr) { + data <<= 1; + scl_out(dd, i2c_line_high); + data |= sda_in(dd, 0); + scl_out(dd, i2c_line_low); + } + return data; +} + +/** + * wr_byte - write a byte, one bit at a time + * @dd: the infinipath device + * @data: the byte to write + * + * Returns 0 if we got the following ack, otherwise 1 + */ +static int wr_byte(struct ipath_devdata *dd, u8 data) +{ + int bit_cntr; + u8 bit; + + for (bit_cntr = 7; bit_cntr >= 0; bit_cntr--) { + bit = (data >> bit_cntr) & 1; + sda_out(dd, bit); + scl_out(dd, i2c_line_high); + scl_out(dd, i2c_line_low); + } + return (!i2c_ackrcv(dd)) ? 
1 : 0; +} + +static void send_ack(struct ipath_devdata *dd) +{ + sda_out(dd, i2c_line_low); + scl_out(dd, i2c_line_high); + scl_out(dd, i2c_line_low); + sda_out(dd, i2c_line_high); +} + +/** + * i2c_startcmd - transmit the start condition, followed by address/cmd + * @dd: the infinipath device + * @offset_dir: direction byte + * + * (both clock/data high, clock high, data low while clock is high) + */ +static int i2c_startcmd(struct ipath_devdata *dd, u8 offset_dir) +{ + int res; + + /* issue start sequence */ + sda_out(dd, i2c_line_high); + scl_out(dd, i2c_line_high); + sda_out(dd, i2c_line_low); + scl_out(dd, i2c_line_low); + + /* issue length and direction byte */ + res = wr_byte(dd, offset_dir); + + if (res) + ipath_cdbg(VERBOSE, "No ack to complete start\n"); + + return res; +} + +/** + * stop_cmd - transmit the stop condition + * @dd: the infinipath device + * + * (both clock/data low, clock high, data high while clock is high) + */ +static void stop_cmd(struct ipath_devdata *dd) +{ + scl_out(dd, i2c_line_low); + sda_out(dd, i2c_line_low); + scl_out(dd, i2c_line_high); + sda_out(dd, i2c_line_high); + udelay(2); +} + +/** + * eeprom_reset - reset I2C communication + * @dd: the infinipath device + */ + +static int eeprom_reset(struct ipath_devdata *dd) +{ + int clock_cycles_left = 9; + u64 *gpioval = &dd->ipath_gpio_out; + int ret; + unsigned long flags; + + spin_lock_irqsave(&dd->ipath_gpio_lock, flags); + /* Make sure shadows are consistent */ + dd->ipath_extctrl = ipath_read_kreg64(dd, dd->ipath_kregs->kr_extctrl); + *gpioval = ipath_read_kreg64(dd, dd->ipath_kregs->kr_gpio_out); + spin_unlock_irqrestore(&dd->ipath_gpio_lock, flags); + + ipath_cdbg(VERBOSE, "Resetting i2c eeprom; initial gpioout reg " + "is %llx\n", (unsigned long long) *gpioval); + + /* + * This is to get the i2c into a known state, by first going low, + * then tristate sda (and then tristate scl as first thing + * in loop) + */ + scl_out(dd, i2c_line_low); + sda_out(dd, i2c_line_high); + + /* Clock up to 9 cycles looking for SDA hi, then issue START and STOP */ + while (clock_cycles_left--) { + scl_out(dd, i2c_line_high); + + /* SDA seen high, issue START by dropping it while SCL high */ + if (sda_in(dd, 0)) { + sda_out(dd, i2c_line_low); + scl_out(dd, i2c_line_low); + /* ATMEL spec says must be followed by STOP. */ + scl_out(dd, i2c_line_high); + sda_out(dd, i2c_line_high); + ret = 0; + goto bail; + } + + scl_out(dd, i2c_line_low); + } + + ret = 1; + +bail: + return ret; +} + +/* + * Probe for I2C device at specified address. Returns 0 for "success" + * to match rest of this file. + * Leave bus in "reasonable" state for further commands. + */ +static int i2c_probe(struct ipath_devdata *dd, int devaddr) +{ + int ret = 0; + + ret = eeprom_reset(dd); + if (ret) { + ipath_dev_err(dd, "Failed reset probing device 0x%02X\n", + devaddr); + return ret; + } + /* + * Reset no longer leaves bus in start condition, so normal + * i2c_startcmd() will do. + */ + ret = i2c_startcmd(dd, devaddr | READ_CMD); + if (ret) + ipath_cdbg(VERBOSE, "Failed startcmd for device 0x%02X\n", + devaddr); + else { + /* + * Device did respond. Complete a single-byte read, because some + * devices apparently cannot handle STOP immediately after they + * ACK the start-cmd. + */ + int data; + data = rd_byte(dd); + stop_cmd(dd); + ipath_cdbg(VERBOSE, "Response from device 0x%02X\n", devaddr); + } + return ret; +} + +/* + * Returns the "i2c type". This is a pointer to a struct that describes + * the I2C chain on this board. 
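/*
 * Illustrative sketch, not part of this patch: wr_byte()/i2c_ackrcv() above
 * bit-bang a byte out MSB first, pulsing SCL per bit, then release SDA
 * (driving "high" means tri-stating and letting the pull-up win) and sample
 * it while SCL is high to read the slave's ACK. A standalone version over
 * stand-in GPIO callbacks:
 */
#include <stdbool.h>
#include <stdio.h>

struct bb_ops {
	void (*set_scl)(bool hi);
	void (*set_sda)(bool hi);
	bool (*get_sda)(void);
};

static int bb_write_byte(const struct bb_ops *ops, unsigned char data)
{
	int bit;
	bool ack;

	for (bit = 7; bit >= 0; bit--) {
		ops->set_sda((data >> bit) & 1);
		ops->set_scl(true);
		ops->set_scl(false);
	}
	/* release SDA and clock one more bit for the ACK */
	ops->set_sda(true);
	ops->set_scl(true);
	ack = !ops->get_sda();   /* slave pulls SDA low to ACK */
	ops->set_scl(false);
	return ack ? 0 : -1;
}

/* dummy "bus" whose slave always ACKs, just so the sketch runs */
static void nop_scl(bool hi) { (void)hi; }
static void nop_sda(bool hi) { (void)hi; }
static bool ack_sda(void) { return false; }

int main(void)
{
	struct bb_ops ops = { nop_scl, nop_sda, ack_sda };

	printf("write 0x5A: %s\n",
	       bb_write_byte(&ops, 0x5A) == 0 ? "acked" : "no ack");
	return 0;
}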
To minimize impact on struct ipath_devdata, + * the (small integer) index into the table is actually memoized, rather + * then the pointer. + * Memoization is because the type is determined on the first call per chip. + * An alternative would be to move type determination to early + * init code. + */ +static struct i2c_chain_desc *ipath_i2c_type(struct ipath_devdata *dd) +{ + int idx; + + /* Get memoized index, from previous successful probes */ + idx = dd->ipath_i2c_chain_type - 1; + if (idx >= 0 && idx < (ARRAY_SIZE(i2c_chains) - 1)) + goto done; + + idx = 0; + while (i2c_chains[idx].probe_dev != IPATH_NO_DEV) { + /* if probe succeeds, this is type */ + if (!i2c_probe(dd, i2c_chains[idx].probe_dev)) + break; + ++idx; + } + + /* + * Old EEPROM (first entry) may require a reset after probe, + * rather than being able to "start" after "stop" + */ + if (idx == 0) + eeprom_reset(dd); + + if (i2c_chains[idx].probe_dev == IPATH_NO_DEV) + idx = -1; + else + dd->ipath_i2c_chain_type = idx + 1; +done: + return (idx >= 0) ? i2c_chains + idx : NULL; +} + +static int ipath_eeprom_internal_read(struct ipath_devdata *dd, + u8 eeprom_offset, void *buffer, int len) +{ + int ret; + struct i2c_chain_desc *icd; + u8 *bp = buffer; + + ret = 1; + icd = ipath_i2c_type(dd); + if (!icd) + goto bail; + + if (icd->eeprom_dev == IPATH_NO_DEV) { + /* legacy not-really-I2C */ + ipath_cdbg(VERBOSE, "Start command only address\n"); + eeprom_offset = (eeprom_offset << 1) | READ_CMD; + ret = i2c_startcmd(dd, eeprom_offset); + } else { + /* Actual I2C */ + ipath_cdbg(VERBOSE, "Start command uses devaddr\n"); + if (i2c_startcmd(dd, icd->eeprom_dev | WRITE_CMD)) { + ipath_dbg("Failed EEPROM startcmd\n"); + stop_cmd(dd); + ret = 1; + goto bail; + } + ret = wr_byte(dd, eeprom_offset); + stop_cmd(dd); + if (ret) { + ipath_dev_err(dd, "Failed to write EEPROM address\n"); + ret = 1; + goto bail; + } + ret = i2c_startcmd(dd, icd->eeprom_dev | READ_CMD); + } + if (ret) { + ipath_dbg("Failed startcmd for dev %02X\n", icd->eeprom_dev); + stop_cmd(dd); + ret = 1; + goto bail; + } + + /* + * eeprom keeps clocking data out as long as we ack, automatically + * incrementing the address. 
+ */ + while (len-- > 0) { + /* get and store data */ + *bp++ = rd_byte(dd); + /* send ack if not the last byte */ + if (len) + send_ack(dd); + } + + stop_cmd(dd); + + ret = 0; + +bail: + return ret; +} + +static int ipath_eeprom_internal_write(struct ipath_devdata *dd, u8 eeprom_offset, + const void *buffer, int len) +{ + int sub_len; + const u8 *bp = buffer; + int max_wait_time, i; + int ret; + struct i2c_chain_desc *icd; + + ret = 1; + icd = ipath_i2c_type(dd); + if (!icd) + goto bail; + + while (len > 0) { + if (icd->eeprom_dev == IPATH_NO_DEV) { + if (i2c_startcmd(dd, + (eeprom_offset << 1) | WRITE_CMD)) { + ipath_dbg("Failed to start cmd offset %u\n", + eeprom_offset); + goto failed_write; + } + } else { + /* Real I2C */ + if (i2c_startcmd(dd, icd->eeprom_dev | WRITE_CMD)) { + ipath_dbg("Failed EEPROM startcmd\n"); + goto failed_write; + } + ret = wr_byte(dd, eeprom_offset); + if (ret) { + ipath_dev_err(dd, "Failed to write EEPROM " + "address\n"); + goto failed_write; + } + } + + sub_len = min(len, 4); + eeprom_offset += sub_len; + len -= sub_len; + + for (i = 0; i < sub_len; i++) { + if (wr_byte(dd, *bp++)) { + ipath_dbg("no ack after byte %u/%u (%u " + "total remain)\n", i, sub_len, + len + sub_len - i); + goto failed_write; + } + } + + stop_cmd(dd); + + /* + * wait for write complete by waiting for a successful + * read (the chip replies with a zero after the write + * cmd completes, and before it writes to the eeprom. + * The startcmd for the read will fail the ack until + * the writes have completed. We do this inline to avoid + * the debug prints that are in the real read routine + * if the startcmd fails. + * We also use the proper device address, so it doesn't matter + * whether we have real eeprom_dev. legacy likes any address. + */ + max_wait_time = 100; + while (i2c_startcmd(dd, icd->eeprom_dev | READ_CMD)) { + stop_cmd(dd); + if (!--max_wait_time) { + ipath_dbg("Did not get successful read to " + "complete write\n"); + goto failed_write; + } + } + /* now read (and ignore) the resulting byte */ + rd_byte(dd); + stop_cmd(dd); + } + + ret = 0; + goto bail; + +failed_write: + stop_cmd(dd); + ret = 1; + +bail: + return ret; +} + +/** + * ipath_eeprom_read - receives bytes from the eeprom via I2C + * @dd: the infinipath device + * @eeprom_offset: address to read from + * @buffer: where to store result + * @len: number of bytes to receive + */ +int ipath_eeprom_read(struct ipath_devdata *dd, u8 eeprom_offset, + void *buff, int len) +{ + int ret; + + ret = mutex_lock_interruptible(&dd->ipath_eep_lock); + if (!ret) { + ret = ipath_eeprom_internal_read(dd, eeprom_offset, buff, len); + mutex_unlock(&dd->ipath_eep_lock); + } + + return ret; +} + +/** + * ipath_eeprom_write - writes data to the eeprom via I2C + * @dd: the infinipath device + * @eeprom_offset: where to place data + * @buffer: data to write + * @len: number of bytes to write + */ +int ipath_eeprom_write(struct ipath_devdata *dd, u8 eeprom_offset, + const void *buff, int len) +{ + int ret; + + ret = mutex_lock_interruptible(&dd->ipath_eep_lock); + if (!ret) { + ret = ipath_eeprom_internal_write(dd, eeprom_offset, buff, len); + mutex_unlock(&dd->ipath_eep_lock); + } + + return ret; +} + +static u8 flash_csum(struct ipath_flash *ifp, int adjust) +{ + u8 *ip = (u8 *) ifp; + u8 csum = 0, len; + + /* + * Limit length checksummed to max length of actual data. + * Checksum of erased eeprom will still be bad, but we avoid + * reading past the end of the buffer we were passed. 
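/*
 * Illustrative sketch, not part of this patch: the write path above waits
 * for an EEPROM page write to finish by re-issuing a start command until
 * the device ACKs again, bounded at a maximum number of polls. The same
 * loop in isolation, with the start command abstracted as a callback:
 */
#include <stdio.h>

/* try_start() returns non-zero while the device is still busy (no ACK). */
static int wait_write_complete(int (*try_start)(void), int max_tries)
{
	while (try_start()) {
		if (!--max_tries)
			return -1;   /* device never came back */
	}
	return 0;
}

static int busy_polls = 3;
static int fake_start(void) { return busy_polls-- > 0; } /* busy for 3 polls */

int main(void)
{
	printf("%s\n", wait_write_complete(fake_start, 100) ?
	       "timed out" : "write complete");
	return 0;
}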
+ */ + len = ifp->if_length; + if (len > sizeof(struct ipath_flash)) + len = sizeof(struct ipath_flash); + while (len--) + csum += *ip++; + csum -= ifp->if_csum; + csum = ~csum; + if (adjust) + ifp->if_csum = csum; + + return csum; +} + +/** + * ipath_get_guid - get the GUID from the i2c device + * @dd: the infinipath device + * + * We have the capability to use the ipath_nguid field, and get + * the guid from the first chip's flash, to use for all of them. + */ +void ipath_get_eeprom_info(struct ipath_devdata *dd) +{ + void *buf; + struct ipath_flash *ifp; + __be64 guid; + int len, eep_stat; + u8 csum, *bguid; + int t = dd->ipath_unit; + struct ipath_devdata *dd0 = ipath_lookup(0); + + if (t && dd0->ipath_nguid > 1 && t <= dd0->ipath_nguid) { + u8 oguid; + dd->ipath_guid = dd0->ipath_guid; + bguid = (u8 *) & dd->ipath_guid; + + oguid = bguid[7]; + bguid[7] += t; + if (oguid > bguid[7]) { + if (bguid[6] == 0xff) { + if (bguid[5] == 0xff) { + ipath_dev_err( + dd, + "Can't set %s GUID from " + "base, wraps to OUI!\n", + ipath_get_unit_name(t)); + dd->ipath_guid = 0; + goto bail; + } + bguid[5]++; + } + bguid[6]++; + } + dd->ipath_nguid = 1; + + ipath_dbg("nguid %u, so adding %u to device 0 guid, " + "for %llx\n", + dd0->ipath_nguid, t, + (unsigned long long) be64_to_cpu(dd->ipath_guid)); + goto bail; + } + + /* + * read full flash, not just currently used part, since it may have + * been written with a newer definition + * */ + len = sizeof(struct ipath_flash); + buf = vmalloc(len); + if (!buf) { + ipath_dev_err(dd, "Couldn't allocate memory to read %u " + "bytes from eeprom for GUID\n", len); + goto bail; + } + + mutex_lock(&dd->ipath_eep_lock); + eep_stat = ipath_eeprom_internal_read(dd, 0, buf, len); + mutex_unlock(&dd->ipath_eep_lock); + + if (eep_stat) { + ipath_dev_err(dd, "Failed reading GUID from eeprom\n"); + goto done; + } + ifp = (struct ipath_flash *)buf; + + csum = flash_csum(ifp, 0); + if (csum != ifp->if_csum) { + dev_info(&dd->pcidev->dev, "Bad I2C flash checksum: " + "0x%x, not 0x%x\n", csum, ifp->if_csum); + goto done; + } + if (*(__be64 *) ifp->if_guid == cpu_to_be64(0) || + *(__be64 *) ifp->if_guid == ~cpu_to_be64(0)) { + ipath_dev_err(dd, "Invalid GUID %llx from flash; " + "ignoring\n", + *(unsigned long long *) ifp->if_guid); + /* don't allow GUID if all 0 or all 1's */ + goto done; + } + + /* complain, but allow it */ + if (*(u64 *) ifp->if_guid == 0x100007511000000ULL) + dev_info(&dd->pcidev->dev, "Warning, GUID %llx is " + "default, probably not correct!\n", + *(unsigned long long *) ifp->if_guid); + + bguid = ifp->if_guid; + if (!bguid[0] && !bguid[1] && !bguid[2]) { + /* original incorrect GUID format in flash; fix in + * core copy, by shifting up 2 octets; don't need to + * change top octet, since both it and shifted are + * 0.. */ + bguid[1] = bguid[3]; + bguid[2] = bguid[4]; + bguid[3] = bguid[4] = 0; + guid = *(__be64 *) ifp->if_guid; + ipath_cdbg(VERBOSE, "Old GUID format in flash, top 3 zero, " + "shifting 2 octets\n"); + } else + guid = *(__be64 *) ifp->if_guid; + dd->ipath_guid = guid; + dd->ipath_nguid = ifp->if_numguid; + /* + * Things are slightly complicated by the desire to transparently + * support both the Pathscale 10-digit serial number and the QLogic + * 13-character version. + */ + if ((ifp->if_fversion > 1) && ifp->if_sprefix[0] + && ((u8 *)ifp->if_sprefix)[0] != 0xFF) { + /* This board has a Serial-prefix, which is stored + * elsewhere for backward-compatibility. 
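/*
 * Illustrative sketch, not part of this patch: flash_csum() above sums
 * every byte of the flash image (capped at the struct size), backs out the
 * stored checksum byte so the result is independent of it, and complements.
 * A standalone version over a toy stand-in for struct ipath_flash:
 */
#include <stdint.h>
#include <stdio.h>

struct toy_flash {
	uint8_t if_length;
	uint8_t if_csum;
	uint8_t payload[6];
};

static uint8_t toy_flash_csum(const struct toy_flash *ifp)
{
	const uint8_t *p = (const uint8_t *)ifp;
	uint8_t csum = 0;
	size_t len = ifp->if_length;

	if (len > sizeof(*ifp))
		len = sizeof(*ifp);   /* never read past the buffer */
	while (len--)
		csum += *p++;
	csum -= ifp->if_csum;         /* don't checksum the checksum itself */
	return ~csum;
}

int main(void)
{
	struct toy_flash f = { sizeof(f), 0, { 1, 2, 3, 4, 5, 6 } };

	f.if_csum = toy_flash_csum(&f);  /* "adjust" pass stores the checksum */
	printf("stored 0x%02X, recheck 0x%02X\n", f.if_csum, toy_flash_csum(&f));
	return 0;
}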
+ */ + char *snp = dd->ipath_serial; + memcpy(snp, ifp->if_sprefix, sizeof ifp->if_sprefix); + snp[sizeof ifp->if_sprefix] = '\0'; + len = strlen(snp); + snp += len; + len = (sizeof dd->ipath_serial) - len; + if (len > sizeof ifp->if_serial) { + len = sizeof ifp->if_serial; + } + memcpy(snp, ifp->if_serial, len); + } else + memcpy(dd->ipath_serial, ifp->if_serial, + sizeof ifp->if_serial); + if (!strstr(ifp->if_comment, "Tested successfully")) + ipath_dev_err(dd, "Board SN %s did not pass functional " + "test: %s\n", dd->ipath_serial, + ifp->if_comment); + + ipath_cdbg(VERBOSE, "Initted GUID to %llx from eeprom\n", + (unsigned long long) be64_to_cpu(dd->ipath_guid)); + + memcpy(&dd->ipath_eep_st_errs, &ifp->if_errcntp, IPATH_EEP_LOG_CNT); + /* + * Power-on (actually "active") hours are kept as little-endian value + * in EEPROM, but as seconds in a (possibly as small as 24-bit) + * atomic_t while running. + */ + atomic_set(&dd->ipath_active_time, 0); + dd->ipath_eep_hrs = ifp->if_powerhour[0] | (ifp->if_powerhour[1] << 8); + +done: + vfree(buf); + +bail:; +} + +/** + * ipath_update_eeprom_log - copy active-time and error counters to eeprom + * @dd: the infinipath device + * + * Although the time is kept as seconds in the ipath_devdata struct, it is + * rounded to hours for re-write, as we have only 16 bits in EEPROM. + * First-cut code reads whole (expected) struct ipath_flash, modifies, + * re-writes. Future direction: read/write only what we need, assuming + * that the EEPROM had to have been "good enough" for driver init, and + * if not, we aren't making it worse. + * + */ + +int ipath_update_eeprom_log(struct ipath_devdata *dd) +{ + void *buf; + struct ipath_flash *ifp; + int len, hi_water; + uint32_t new_time, new_hrs; + u8 csum; + int ret, idx; + unsigned long flags; + + /* first, check if we actually need to do anything. */ + ret = 0; + for (idx = 0; idx < IPATH_EEP_LOG_CNT; ++idx) { + if (dd->ipath_eep_st_new_errs[idx]) { + ret = 1; + break; + } + } + new_time = atomic_read(&dd->ipath_active_time); + + if (ret == 0 && new_time < 3600) + return 0; + + /* + * The quick-check above determined that there is something worthy + * of logging, so get current contents and do a more detailed idea. + * read full flash, not just currently used part, since it may have + * been written with a newer definition + */ + len = sizeof(struct ipath_flash); + buf = vmalloc(len); + ret = 1; + if (!buf) { + ipath_dev_err(dd, "Couldn't allocate memory to read %u " + "bytes from eeprom for logging\n", len); + goto bail; + } + + /* Grab semaphore and read current EEPROM. If we get an + * error, let go, but if not, keep it until we finish write. 
+ */ + ret = mutex_lock_interruptible(&dd->ipath_eep_lock); + if (ret) { + ipath_dev_err(dd, "Unable to acquire EEPROM for logging\n"); + goto free_bail; + } + ret = ipath_eeprom_internal_read(dd, 0, buf, len); + if (ret) { + mutex_unlock(&dd->ipath_eep_lock); + ipath_dev_err(dd, "Unable read EEPROM for logging\n"); + goto free_bail; + } + ifp = (struct ipath_flash *)buf; + + csum = flash_csum(ifp, 0); + if (csum != ifp->if_csum) { + mutex_unlock(&dd->ipath_eep_lock); + ipath_dev_err(dd, "EEPROM cks err (0x%02X, S/B 0x%02X)\n", + csum, ifp->if_csum); + ret = 1; + goto free_bail; + } + hi_water = 0; + spin_lock_irqsave(&dd->ipath_eep_st_lock, flags); + for (idx = 0; idx < IPATH_EEP_LOG_CNT; ++idx) { + int new_val = dd->ipath_eep_st_new_errs[idx]; + if (new_val) { + /* + * If we have seen any errors, add to EEPROM values + * We need to saturate at 0xFF (255) and we also + * would need to adjust the checksum if we were + * trying to minimize EEPROM traffic + * Note that we add to actual current count in EEPROM, + * in case it was altered while we were running. + */ + new_val += ifp->if_errcntp[idx]; + if (new_val > 0xFF) + new_val = 0xFF; + if (ifp->if_errcntp[idx] != new_val) { + ifp->if_errcntp[idx] = new_val; + hi_water = offsetof(struct ipath_flash, + if_errcntp) + idx; + } + /* + * update our shadow (used to minimize EEPROM + * traffic), to match what we are about to write. + */ + dd->ipath_eep_st_errs[idx] = new_val; + dd->ipath_eep_st_new_errs[idx] = 0; + } + } + /* + * now update active-time. We would like to round to the nearest hour + * but unless atomic_t are sure to be proper signed ints we cannot, + * because we need to account for what we "transfer" to EEPROM and + * if we log an hour at 31 minutes, then we would need to set + * active_time to -29 to accurately count the _next_ hour. + */ + if (new_time >= 3600) { + new_hrs = new_time / 3600; + atomic_sub((new_hrs * 3600), &dd->ipath_active_time); + new_hrs += dd->ipath_eep_hrs; + if (new_hrs > 0xFFFF) + new_hrs = 0xFFFF; + dd->ipath_eep_hrs = new_hrs; + if ((new_hrs & 0xFF) != ifp->if_powerhour[0]) { + ifp->if_powerhour[0] = new_hrs & 0xFF; + hi_water = offsetof(struct ipath_flash, if_powerhour); + } + if ((new_hrs >> 8) != ifp->if_powerhour[1]) { + ifp->if_powerhour[1] = new_hrs >> 8; + hi_water = offsetof(struct ipath_flash, if_powerhour) + + 1; + } + } + /* + * There is a tiny possibility that we could somehow fail to write + * the EEPROM after updating our shadows, but problems from holding + * the spinlock too long are a much bigger issue. + */ + spin_unlock_irqrestore(&dd->ipath_eep_st_lock, flags); + if (hi_water) { + /* we made some change to the data, uopdate cksum and write */ + csum = flash_csum(ifp, 1); + ret = ipath_eeprom_internal_write(dd, 0, buf, hi_water + 1); + } + mutex_unlock(&dd->ipath_eep_lock); + if (ret) + ipath_dev_err(dd, "Failed updating EEPROM\n"); + +free_bail: + vfree(buf); +bail: + return ret; + +} + +/** + * ipath_inc_eeprom_err - increment one of the four error counters + * that are logged to EEPROM. + * @dd: the infinipath device + * @eidx: 0..3, the counter to increment + * @incr: how much to add + * + * Each counter is 8-bits, and saturates at 255 (0xFF). They + * are copied to the EEPROM (aka flash) whenever ipath_update_eeprom_log() + * is called, but it can only be called in a context that allows sleep. + * This function can be called even at interrupt level. 
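/*
 * Illustrative sketch, not part of this patch: the logging loop above adds
 * the in-memory "new error" counts onto the counters already in flash,
 * saturates each 8-bit counter at 0xFF, and tracks the highest offset that
 * actually changed (hi_water) so only that much of the image is rewritten.
 * The merge in isolation, over a toy counter array:
 */
#include <stdio.h>

#define LOG_CNT 4

/* Returns the index of the last counter that changed, or -1 if none did. */
static int merge_err_counts(unsigned char stored[LOG_CNT],
			    unsigned new_errs[LOG_CNT])
{
	int idx, hi_water = -1;

	for (idx = 0; idx < LOG_CNT; idx++) {
		unsigned v = stored[idx] + new_errs[idx];

		if (v > 0xFF)
			v = 0xFF;             /* 8-bit counter saturates */
		if (stored[idx] != v) {
			stored[idx] = (unsigned char)v;
			hi_water = idx;       /* rewrite up to here */
		}
		new_errs[idx] = 0;            /* consumed */
	}
	return hi_water;
}

int main(void)
{
	unsigned char stored[LOG_CNT] = { 10, 0xFE, 0, 0 };
	unsigned new_errs[LOG_CNT] = { 1, 5, 0, 0 };
	int hw = merge_err_counts(stored, new_errs);

	printf("counters %u %u %u %u, hi_water %d\n",
	       stored[0], stored[1], stored[2], stored[3], hw);
	return 0;
}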
+ */ + +void ipath_inc_eeprom_err(struct ipath_devdata *dd, u32 eidx, u32 incr) +{ + uint new_val; + unsigned long flags; + + spin_lock_irqsave(&dd->ipath_eep_st_lock, flags); + new_val = dd->ipath_eep_st_new_errs[eidx] + incr; + if (new_val > 255) + new_val = 255; + dd->ipath_eep_st_new_errs[eidx] = new_val; + spin_unlock_irqrestore(&dd->ipath_eep_st_lock, flags); + return; +} + +static int ipath_tempsense_internal_read(struct ipath_devdata *dd, u8 regnum) +{ + int ret; + struct i2c_chain_desc *icd; + + ret = -ENOENT; + + icd = ipath_i2c_type(dd); + if (!icd) + goto bail; + + if (icd->temp_dev == IPATH_NO_DEV) { + /* tempsense only exists on new, real-I2C boards */ + ret = -ENXIO; + goto bail; + } + + if (i2c_startcmd(dd, icd->temp_dev | WRITE_CMD)) { + ipath_dbg("Failed tempsense startcmd\n"); + stop_cmd(dd); + ret = -ENXIO; + goto bail; + } + ret = wr_byte(dd, regnum); + stop_cmd(dd); + if (ret) { + ipath_dev_err(dd, "Failed tempsense WR command %02X\n", + regnum); + ret = -ENXIO; + goto bail; + } + if (i2c_startcmd(dd, icd->temp_dev | READ_CMD)) { + ipath_dbg("Failed tempsense RD startcmd\n"); + stop_cmd(dd); + ret = -ENXIO; + goto bail; + } + /* + * We can only clock out one byte per command, sensibly + */ + ret = rd_byte(dd); + stop_cmd(dd); + +bail: + return ret; +} + +#define VALID_TS_RD_REG_MASK 0xBF + +/** + * ipath_tempsense_read - read register of temp sensor via I2C + * @dd: the infinipath device + * @regnum: register to read from + * + * returns reg contents (0..255) or < 0 for error + */ +int ipath_tempsense_read(struct ipath_devdata *dd, u8 regnum) +{ + int ret; + + if (regnum > 7) + return -EINVAL; + + /* return a bogus value for (the one) register we do not have */ + if (!((1 << regnum) & VALID_TS_RD_REG_MASK)) + return 0; + + ret = mutex_lock_interruptible(&dd->ipath_eep_lock); + if (!ret) { + ret = ipath_tempsense_internal_read(dd, regnum); + mutex_unlock(&dd->ipath_eep_lock); + } + + /* + * There are three possibilities here: + * ret is actual value (0..255) + * ret is -ENXIO or -EINVAL from code in this file + * ret is -EINTR from mutex_lock_interruptible. 
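/*
 * Illustrative sketch, not part of this patch: the temp-sensor accessors
 * above gate register numbers with a bitmask where bit N set means register
 * N may be accessed (e.g. the read mask 0xBF allows registers 0-7 except
 * register 6). The check in isolation:
 */
#include <stdbool.h>
#include <stdio.h>

static bool reg_allowed(unsigned regnum, unsigned valid_mask, unsigned max_reg)
{
	return regnum <= max_reg && ((1u << regnum) & valid_mask);
}

int main(void)
{
	unsigned mask = 0xBF;   /* read mask used above */

	printf("reg 5: %d, reg 6: %d, reg 9: %d\n",
	       reg_allowed(5, mask, 7), reg_allowed(6, mask, 7),
	       reg_allowed(9, mask, 7));
	return 0;
}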
+ */ + return ret; +} + +static int ipath_tempsense_internal_write(struct ipath_devdata *dd, + u8 regnum, u8 data) +{ + int ret = -ENOENT; + struct i2c_chain_desc *icd; + + icd = ipath_i2c_type(dd); + if (!icd) + goto bail; + + if (icd->temp_dev == IPATH_NO_DEV) { + /* tempsense only exists on new, real-I2C boards */ + ret = -ENXIO; + goto bail; + } + if (i2c_startcmd(dd, icd->temp_dev | WRITE_CMD)) { + ipath_dbg("Failed tempsense startcmd\n"); + stop_cmd(dd); + ret = -ENXIO; + goto bail; + } + ret = wr_byte(dd, regnum); + if (ret) { + stop_cmd(dd); + ipath_dev_err(dd, "Failed to write tempsense command %02X\n", + regnum); + ret = -ENXIO; + goto bail; + } + ret = wr_byte(dd, data); + stop_cmd(dd); + ret = i2c_startcmd(dd, icd->temp_dev | READ_CMD); + if (ret) { + ipath_dev_err(dd, "Failed tempsense data wrt to %02X\n", + regnum); + ret = -ENXIO; + } + +bail: + return ret; +} + +#define VALID_TS_WR_REG_MASK ((1 << 9) | (1 << 0xB) | (1 << 0xD)) + +/** + * ipath_tempsense_write - write register of temp sensor via I2C + * @dd: the infinipath device + * @regnum: register to write + * @data: data to write + * + * returns 0 for success or < 0 for error + */ +int ipath_tempsense_write(struct ipath_devdata *dd, u8 regnum, u8 data) +{ + int ret; + + if (regnum > 15 || !((1 << regnum) & VALID_TS_WR_REG_MASK)) + return -EINVAL; + + ret = mutex_lock_interruptible(&dd->ipath_eep_lock); + if (!ret) { + ret = ipath_tempsense_internal_write(dd, regnum, data); + mutex_unlock(&dd->ipath_eep_lock); + } + + /* + * There are three possibilities here: + * ret is 0 for success + * ret is -ENXIO or -EINVAL from code in this file + * ret is -EINTR from mutex_lock_interruptible. + */ + return ret; +} diff --git a/kernel/drivers/infiniband/hw/ipath/ipath_file_ops.c b/kernel/drivers/infiniband/hw/ipath/ipath_file_ops.c new file mode 100644 index 000000000..450d15965 --- /dev/null +++ b/kernel/drivers/infiniband/hw/ipath/ipath_file_ops.c @@ -0,0 +1,2620 @@ +/* + * Copyright (c) 2006, 2007, 2008 QLogic Corporation. All rights reserved. + * Copyright (c) 2003, 2004, 2005, 2006 PathScale, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "ipath_kernel.h" +#include "ipath_common.h" +#include "ipath_user_sdma.h" + +static int ipath_open(struct inode *, struct file *); +static int ipath_close(struct inode *, struct file *); +static ssize_t ipath_write(struct file *, const char __user *, size_t, + loff_t *); +static ssize_t ipath_write_iter(struct kiocb *, struct iov_iter *from); +static unsigned int ipath_poll(struct file *, struct poll_table_struct *); +static int ipath_mmap(struct file *, struct vm_area_struct *); + +/* + * This is really, really weird shit - write() and writev() here + * have completely unrelated semantics. Sucky userland ABI, + * film at 11. + */ +static const struct file_operations ipath_file_ops = { + .owner = THIS_MODULE, + .write = ipath_write, + .write_iter = ipath_write_iter, + .open = ipath_open, + .release = ipath_close, + .poll = ipath_poll, + .mmap = ipath_mmap, + .llseek = noop_llseek, +}; + +/* + * Convert kernel virtual addresses to physical addresses so they don't + * potentially conflict with the chip addresses used as mmap offsets. + * It doesn't really matter what mmap offset we use as long as we can + * interpret it correctly. + */ +static u64 cvt_kvaddr(void *p) +{ + struct page *page; + u64 paddr = 0; + + page = vmalloc_to_page(p); + if (page) + paddr = page_to_pfn(page) << PAGE_SHIFT; + + return paddr; +} + +static int ipath_get_base_info(struct file *fp, + void __user *ubase, size_t ubase_size) +{ + struct ipath_portdata *pd = port_fp(fp); + int ret = 0; + struct ipath_base_info *kinfo = NULL; + struct ipath_devdata *dd = pd->port_dd; + unsigned subport_cnt; + int shared, master; + size_t sz; + + subport_cnt = pd->port_subport_cnt; + if (!subport_cnt) { + shared = 0; + master = 0; + subport_cnt = 1; + } else { + shared = 1; + master = !subport_fp(fp); + } + + sz = sizeof(*kinfo); + /* If port sharing is not requested, allow the old size structure */ + if (!shared) + sz -= 7 * sizeof(u64); + if (ubase_size < sz) { + ipath_cdbg(PROC, + "Base size %zu, need %zu (version mismatch?)\n", + ubase_size, sz); + ret = -EINVAL; + goto bail; + } + + kinfo = kzalloc(sizeof(*kinfo), GFP_KERNEL); + if (kinfo == NULL) { + ret = -ENOMEM; + goto bail; + } + + ret = dd->ipath_f_get_base_info(pd, kinfo); + if (ret < 0) + goto bail; + + kinfo->spi_rcvhdr_cnt = dd->ipath_rcvhdrcnt; + kinfo->spi_rcvhdrent_size = dd->ipath_rcvhdrentsize; + kinfo->spi_tidegrcnt = dd->ipath_rcvegrcnt; + kinfo->spi_rcv_egrbufsize = dd->ipath_rcvegrbufsize; + /* + * have to mmap whole thing + */ + kinfo->spi_rcv_egrbuftotlen = + pd->port_rcvegrbuf_chunks * pd->port_rcvegrbuf_size; + kinfo->spi_rcv_egrperchunk = pd->port_rcvegrbufs_perchunk; + kinfo->spi_rcv_egrchunksize = kinfo->spi_rcv_egrbuftotlen / + pd->port_rcvegrbuf_chunks; + kinfo->spi_tidcnt = dd->ipath_rcvtidcnt / subport_cnt; + if (master) + kinfo->spi_tidcnt += dd->ipath_rcvtidcnt % subport_cnt; + /* + * for this use, may be ipath_cfgports summed over all chips that + * are are configured and present + */ + kinfo->spi_nports = dd->ipath_cfgports; + /* unit (chip/board) our port is on */ + kinfo->spi_unit = dd->ipath_unit; + /* for now, only a single page */ + kinfo->spi_tid_maxsize = PAGE_SIZE; + + /* + * Doing this per port, and based on the skip value, etc. This has + * to be the actual buffer size, since the protocol code treats it + * as an array. 
+ * + * These have to be set to user addresses in the user code via mmap. + * These values are used on return to user code for the mmap target + * addresses only. For 32 bit, same 44 bit address problem, so use + * the physical address, not virtual. Before 2.6.11, using the + * page_address() macro worked, but in 2.6.11, even that returns the + * full 64 bit address (upper bits all 1's). So far, using the + * physical addresses (or chip offsets, for chip mapping) works, but + * no doubt some future kernel release will change that, and we'll be + * on to yet another method of dealing with this. + */ + kinfo->spi_rcvhdr_base = (u64) pd->port_rcvhdrq_phys; + kinfo->spi_rcvhdr_tailaddr = (u64) pd->port_rcvhdrqtailaddr_phys; + kinfo->spi_rcv_egrbufs = (u64) pd->port_rcvegr_phys; + kinfo->spi_pioavailaddr = (u64) dd->ipath_pioavailregs_phys; + kinfo->spi_status = (u64) kinfo->spi_pioavailaddr + + (void *) dd->ipath_statusp - + (void *) dd->ipath_pioavailregs_dma; + if (!shared) { + kinfo->spi_piocnt = pd->port_piocnt; + kinfo->spi_piobufbase = (u64) pd->port_piobufs; + kinfo->__spi_uregbase = (u64) dd->ipath_uregbase + + dd->ipath_ureg_align * pd->port_port; + } else if (master) { + kinfo->spi_piocnt = (pd->port_piocnt / subport_cnt) + + (pd->port_piocnt % subport_cnt); + /* Master's PIO buffers are after all the slave's */ + kinfo->spi_piobufbase = (u64) pd->port_piobufs + + dd->ipath_palign * + (pd->port_piocnt - kinfo->spi_piocnt); + } else { + unsigned slave = subport_fp(fp) - 1; + + kinfo->spi_piocnt = pd->port_piocnt / subport_cnt; + kinfo->spi_piobufbase = (u64) pd->port_piobufs + + dd->ipath_palign * kinfo->spi_piocnt * slave; + } + + if (shared) { + kinfo->spi_port_uregbase = (u64) dd->ipath_uregbase + + dd->ipath_ureg_align * pd->port_port; + kinfo->spi_port_rcvegrbuf = kinfo->spi_rcv_egrbufs; + kinfo->spi_port_rcvhdr_base = kinfo->spi_rcvhdr_base; + kinfo->spi_port_rcvhdr_tailaddr = kinfo->spi_rcvhdr_tailaddr; + + kinfo->__spi_uregbase = cvt_kvaddr(pd->subport_uregbase + + PAGE_SIZE * subport_fp(fp)); + + kinfo->spi_rcvhdr_base = cvt_kvaddr(pd->subport_rcvhdr_base + + pd->port_rcvhdrq_size * subport_fp(fp)); + kinfo->spi_rcvhdr_tailaddr = 0; + kinfo->spi_rcv_egrbufs = cvt_kvaddr(pd->subport_rcvegrbuf + + pd->port_rcvegrbuf_chunks * pd->port_rcvegrbuf_size * + subport_fp(fp)); + + kinfo->spi_subport_uregbase = + cvt_kvaddr(pd->subport_uregbase); + kinfo->spi_subport_rcvegrbuf = + cvt_kvaddr(pd->subport_rcvegrbuf); + kinfo->spi_subport_rcvhdr_base = + cvt_kvaddr(pd->subport_rcvhdr_base); + ipath_cdbg(PROC, "port %u flags %x %llx %llx %llx\n", + kinfo->spi_port, kinfo->spi_runtime_flags, + (unsigned long long) kinfo->spi_subport_uregbase, + (unsigned long long) kinfo->spi_subport_rcvegrbuf, + (unsigned long long) kinfo->spi_subport_rcvhdr_base); + } + + /* + * All user buffers are 2KB buffers. If we ever support + * giving 4KB buffers to user processes, this will need some + * work. + */ + kinfo->spi_pioindex = (kinfo->spi_piobufbase - + (dd->ipath_piobufbase & 0xffffffff)) / dd->ipath_palign; + kinfo->spi_pioalign = dd->ipath_palign; + + kinfo->spi_qpair = IPATH_KD_QP; + /* + * user mode PIO buffers are always 2KB, even when 4KB can + * be received, and sent via the kernel; this is ibmaxlen + * for 2K MTU. 
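/*
 * Illustrative sketch, not part of this patch: for a shared port, the code
 * above gives each slave subport piocnt / nsub PIO buffers, gives the
 * master that amount plus the remainder, and places the master's buffers
 * after all of the slaves'. The arithmetic in isolation (the buffer count,
 * alignment, and base address below are made up for the demo):
 */
#include <stdio.h>

static void split_pio(unsigned piocnt, unsigned nsub, unsigned align,
		      unsigned long base)
{
	unsigned slave_cnt = piocnt / nsub;
	unsigned master_cnt = slave_cnt + piocnt % nsub;
	unsigned s;

	for (s = 1; s < nsub; s++)
		printf("slave %u: %u bufs @ 0x%lx\n", s, slave_cnt,
		       base + (unsigned long)align * slave_cnt * (s - 1));
	/* master's buffers start after the last slave's */
	printf("master: %u bufs @ 0x%lx\n", master_cnt,
	       base + (unsigned long)align * (piocnt - master_cnt));
}

int main(void)
{
	split_pio(64, 4, 2048, 0x100000UL);
	return 0;
}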
+ */ + kinfo->spi_piosize = dd->ipath_piosize2k - 2 * sizeof(u32); + kinfo->spi_mtu = dd->ipath_ibmaxlen; /* maxlen, not ibmtu */ + kinfo->spi_port = pd->port_port; + kinfo->spi_subport = subport_fp(fp); + kinfo->spi_sw_version = IPATH_KERN_SWVERSION; + kinfo->spi_hw_version = dd->ipath_revision; + + if (master) { + kinfo->spi_runtime_flags |= IPATH_RUNTIME_MASTER; + } + + sz = (ubase_size < sizeof(*kinfo)) ? ubase_size : sizeof(*kinfo); + if (copy_to_user(ubase, kinfo, sz)) + ret = -EFAULT; + +bail: + kfree(kinfo); + return ret; +} + +/** + * ipath_tid_update - update a port TID + * @pd: the port + * @fp: the ipath device file + * @ti: the TID information + * + * The new implementation as of Oct 2004 is that the driver assigns + * the tid and returns it to the caller. To make it easier to + * catch bugs, and to reduce search time, we keep a cursor for + * each port, walking the shadow tid array to find one that's not + * in use. + * + * For now, if we can't allocate the full list, we fail, although + * in the long run, we'll allocate as many as we can, and the + * caller will deal with that by trying the remaining pages later. + * That means that when we fail, we have to mark the tids as not in + * use again, in our shadow copy. + * + * It's up to the caller to free the tids when they are done. + * We'll unlock the pages as they free them. + * + * Also, right now we are locking one page at a time, but since + * the intended use of this routine is for a single group of + * virtually contiguous pages, that should change to improve + * performance. + */ +static int ipath_tid_update(struct ipath_portdata *pd, struct file *fp, + const struct ipath_tid_info *ti) +{ + int ret = 0, ntids; + u32 tid, porttid, cnt, i, tidcnt, tidoff; + u16 *tidlist; + struct ipath_devdata *dd = pd->port_dd; + u64 physaddr; + unsigned long vaddr; + u64 __iomem *tidbase; + unsigned long tidmap[8]; + struct page **pagep = NULL; + unsigned subport = subport_fp(fp); + + if (!dd->ipath_pageshadow) { + ret = -ENOMEM; + goto done; + } + + cnt = ti->tidcnt; + if (!cnt) { + ipath_dbg("After copyin, tidcnt 0, tidlist %llx\n", + (unsigned long long) ti->tidlist); + /* + * Should we treat as success? 
likely a bug + */ + ret = -EFAULT; + goto done; + } + porttid = pd->port_port * dd->ipath_rcvtidcnt; + if (!pd->port_subport_cnt) { + tidcnt = dd->ipath_rcvtidcnt; + tid = pd->port_tidcursor; + tidoff = 0; + } else if (!subport) { + tidcnt = (dd->ipath_rcvtidcnt / pd->port_subport_cnt) + + (dd->ipath_rcvtidcnt % pd->port_subport_cnt); + tidoff = dd->ipath_rcvtidcnt - tidcnt; + porttid += tidoff; + tid = tidcursor_fp(fp); + } else { + tidcnt = dd->ipath_rcvtidcnt / pd->port_subport_cnt; + tidoff = tidcnt * (subport - 1); + porttid += tidoff; + tid = tidcursor_fp(fp); + } + if (cnt > tidcnt) { + /* make sure it all fits in port_tid_pg_list */ + dev_info(&dd->pcidev->dev, "Process tried to allocate %u " + "TIDs, only trying max (%u)\n", cnt, tidcnt); + cnt = tidcnt; + } + pagep = &((struct page **) pd->port_tid_pg_list)[tidoff]; + tidlist = &((u16 *) &pagep[dd->ipath_rcvtidcnt])[tidoff]; + + memset(tidmap, 0, sizeof(tidmap)); + /* before decrement; chip actual # */ + ntids = tidcnt; + tidbase = (u64 __iomem *) (((char __iomem *) dd->ipath_kregbase) + + dd->ipath_rcvtidbase + + porttid * sizeof(*tidbase)); + + ipath_cdbg(VERBOSE, "Port%u %u tids, cursor %u, tidbase %p\n", + pd->port_port, cnt, tid, tidbase); + + /* virtual address of first page in transfer */ + vaddr = ti->tidvaddr; + if (!access_ok(VERIFY_WRITE, (void __user *) vaddr, + cnt * PAGE_SIZE)) { + ipath_dbg("Fail vaddr %p, %u pages, !access_ok\n", + (void *)vaddr, cnt); + ret = -EFAULT; + goto done; + } + ret = ipath_get_user_pages(vaddr, cnt, pagep); + if (ret) { + if (ret == -EBUSY) { + ipath_dbg("Failed to lock addr %p, %u pages " + "(already locked)\n", + (void *) vaddr, cnt); + /* + * for now, continue, and see what happens but with + * the new implementation, this should never happen, + * unless perhaps the user has mpin'ed the pages + * themselves (something we need to test) + */ + ret = 0; + } else { + dev_info(&dd->pcidev->dev, + "Failed to lock addr %p, %u pages: " + "errno %d\n", (void *) vaddr, cnt, -ret); + goto done; + } + } + for (i = 0; i < cnt; i++, vaddr += PAGE_SIZE) { + for (; ntids--; tid++) { + if (tid == tidcnt) + tid = 0; + if (!dd->ipath_pageshadow[porttid + tid]) + break; + } + if (ntids < 0) { + /* + * oops, wrapped all the way through their TIDs, + * and didn't have enough free; see comments at + * start of routine + */ + ipath_dbg("Not enough free TIDs for %u pages " + "(index %d), failing\n", cnt, i); + i--; /* last tidlist[i] not filled in */ + ret = -ENOMEM; + break; + } + tidlist[i] = tid + tidoff; + ipath_cdbg(VERBOSE, "Updating idx %u to TID %u, " + "vaddr %lx\n", i, tid + tidoff, vaddr); + /* we "know" system pages and TID pages are same size */ + dd->ipath_pageshadow[porttid + tid] = pagep[i]; + dd->ipath_physshadow[porttid + tid] = ipath_map_page( + dd->pcidev, pagep[i], 0, PAGE_SIZE, + PCI_DMA_FROMDEVICE); + /* + * don't need atomic or it's overhead + */ + __set_bit(tid, tidmap); + physaddr = dd->ipath_physshadow[porttid + tid]; + ipath_stats.sps_pagelocks++; + ipath_cdbg(VERBOSE, + "TID %u, vaddr %lx, physaddr %llx pgp %p\n", + tid, vaddr, (unsigned long long) physaddr, + pagep[i]); + dd->ipath_f_put_tid(dd, &tidbase[tid], RCVHQ_RCV_TYPE_EXPECTED, + physaddr); + /* + * don't check this tid in ipath_portshadow, since we + * just filled it in; start with the next one. + */ + tid++; + } + + if (ret) { + u32 limit; + cleanup: + /* jump here if copy out of updated info failed... 
*/ + ipath_dbg("After failure (ret=%d), undo %d of %d entries\n", + -ret, i, cnt); + /* same code that's in ipath_free_tid() */ + limit = sizeof(tidmap) * BITS_PER_BYTE; + if (limit > tidcnt) + /* just in case size changes in future */ + limit = tidcnt; + tid = find_first_bit((const unsigned long *)tidmap, limit); + for (; tid < limit; tid++) { + if (!test_bit(tid, tidmap)) + continue; + if (dd->ipath_pageshadow[porttid + tid]) { + ipath_cdbg(VERBOSE, "Freeing TID %u\n", + tid); + dd->ipath_f_put_tid(dd, &tidbase[tid], + RCVHQ_RCV_TYPE_EXPECTED, + dd->ipath_tidinvalid); + pci_unmap_page(dd->pcidev, + dd->ipath_physshadow[porttid + tid], + PAGE_SIZE, PCI_DMA_FROMDEVICE); + dd->ipath_pageshadow[porttid + tid] = NULL; + ipath_stats.sps_pageunlocks++; + } + } + ipath_release_user_pages(pagep, cnt); + } else { + /* + * Copy the updated array, with ipath_tid's filled in, back + * to user. Since we did the copy in already, this "should + * never fail" If it does, we have to clean up... + */ + if (copy_to_user((void __user *) + (unsigned long) ti->tidlist, + tidlist, cnt * sizeof(*tidlist))) { + ret = -EFAULT; + goto cleanup; + } + if (copy_to_user((void __user *) (unsigned long) ti->tidmap, + tidmap, sizeof tidmap)) { + ret = -EFAULT; + goto cleanup; + } + if (tid == tidcnt) + tid = 0; + if (!pd->port_subport_cnt) + pd->port_tidcursor = tid; + else + tidcursor_fp(fp) = tid; + } + +done: + if (ret) + ipath_dbg("Failed to map %u TID pages, failing with %d\n", + ti->tidcnt, -ret); + return ret; +} + +/** + * ipath_tid_free - free a port TID + * @pd: the port + * @subport: the subport + * @ti: the TID info + * + * right now we are unlocking one page at a time, but since + * the intended use of this routine is for a single group of + * virtually contiguous pages, that should change to improve + * performance. We check that the TID is in range for this port + * but otherwise don't check validity; if user has an error and + * frees the wrong tid, it's only their own data that can thereby + * be corrupted. We do check that the TID was in use, for sanity + * We always use our idea of the saved address, not the address that + * they pass in to us. 
+ */ + +static int ipath_tid_free(struct ipath_portdata *pd, unsigned subport, + const struct ipath_tid_info *ti) +{ + int ret = 0; + u32 tid, porttid, cnt, limit, tidcnt; + struct ipath_devdata *dd = pd->port_dd; + u64 __iomem *tidbase; + unsigned long tidmap[8]; + + if (!dd->ipath_pageshadow) { + ret = -ENOMEM; + goto done; + } + + if (copy_from_user(tidmap, (void __user *)(unsigned long)ti->tidmap, + sizeof tidmap)) { + ret = -EFAULT; + goto done; + } + + porttid = pd->port_port * dd->ipath_rcvtidcnt; + if (!pd->port_subport_cnt) + tidcnt = dd->ipath_rcvtidcnt; + else if (!subport) { + tidcnt = (dd->ipath_rcvtidcnt / pd->port_subport_cnt) + + (dd->ipath_rcvtidcnt % pd->port_subport_cnt); + porttid += dd->ipath_rcvtidcnt - tidcnt; + } else { + tidcnt = dd->ipath_rcvtidcnt / pd->port_subport_cnt; + porttid += tidcnt * (subport - 1); + } + tidbase = (u64 __iomem *) ((char __iomem *)(dd->ipath_kregbase) + + dd->ipath_rcvtidbase + + porttid * sizeof(*tidbase)); + + limit = sizeof(tidmap) * BITS_PER_BYTE; + if (limit > tidcnt) + /* just in case size changes in future */ + limit = tidcnt; + tid = find_first_bit(tidmap, limit); + ipath_cdbg(VERBOSE, "Port%u free %u tids; first bit (max=%d) " + "set is %d, porttid %u\n", pd->port_port, ti->tidcnt, + limit, tid, porttid); + for (cnt = 0; tid < limit; tid++) { + /* + * small optimization; if we detect a run of 3 or so without + * any set, use find_first_bit again. That's mainly to + * accelerate the case where we wrapped, so we have some at + * the beginning, and some at the end, and a big gap + * in the middle. + */ + if (!test_bit(tid, tidmap)) + continue; + cnt++; + if (dd->ipath_pageshadow[porttid + tid]) { + struct page *p; + p = dd->ipath_pageshadow[porttid + tid]; + dd->ipath_pageshadow[porttid + tid] = NULL; + ipath_cdbg(VERBOSE, "PID %u freeing TID %u\n", + pid_nr(pd->port_pid), tid); + dd->ipath_f_put_tid(dd, &tidbase[tid], + RCVHQ_RCV_TYPE_EXPECTED, + dd->ipath_tidinvalid); + pci_unmap_page(dd->pcidev, + dd->ipath_physshadow[porttid + tid], + PAGE_SIZE, PCI_DMA_FROMDEVICE); + ipath_release_user_pages(&p, 1); + ipath_stats.sps_pageunlocks++; + } else + ipath_dbg("Unused tid %u, ignoring\n", tid); + } + if (cnt != ti->tidcnt) + ipath_dbg("passed in tidcnt %d, only %d bits set in map\n", + ti->tidcnt, cnt); +done: + if (ret) + ipath_dbg("Failed to unmap %u TID pages, failing with %d\n", + ti->tidcnt, -ret); + return ret; +} + +/** + * ipath_set_part_key - set a partition key + * @pd: the port + * @key: the key + * + * We can have up to 4 active at a time (other than the default, which is + * always allowed). This is somewhat tricky, since multiple ports may set + * the same key, so we reference count them, and clean up at exit. All 4 + * partition keys are packed into a single infinipath register. It's an + * error for a process to set the same pkey multiple times. We provide no + * mechanism to de-allocate a pkey at this time, we may eventually need to + * do that. I've used the atomic operations, and no locking, and only make + * a single pass through what's available. This should be more than + * adequate for some time. I'll think about spinlocks or the like if and as + * it's necessary. 
+ */ +static int ipath_set_part_key(struct ipath_portdata *pd, u16 key) +{ + struct ipath_devdata *dd = pd->port_dd; + int i, any = 0, pidx = -1; + u16 lkey = key & 0x7FFF; + int ret; + + if (lkey == (IPATH_DEFAULT_P_KEY & 0x7FFF)) { + /* nothing to do; this key always valid */ + ret = 0; + goto bail; + } + + ipath_cdbg(VERBOSE, "p%u try to set pkey %hx, current keys " + "%hx:%x %hx:%x %hx:%x %hx:%x\n", + pd->port_port, key, dd->ipath_pkeys[0], + atomic_read(&dd->ipath_pkeyrefs[0]), dd->ipath_pkeys[1], + atomic_read(&dd->ipath_pkeyrefs[1]), dd->ipath_pkeys[2], + atomic_read(&dd->ipath_pkeyrefs[2]), dd->ipath_pkeys[3], + atomic_read(&dd->ipath_pkeyrefs[3])); + + if (!lkey) { + ipath_cdbg(PROC, "p%u tries to set key 0, not allowed\n", + pd->port_port); + ret = -EINVAL; + goto bail; + } + + /* + * Set the full membership bit, because it has to be + * set in the register or the packet, and it seems + * cleaner to set in the register than to force all + * callers to set it. (see bug 4331) + */ + key |= 0x8000; + + for (i = 0; i < ARRAY_SIZE(pd->port_pkeys); i++) { + if (!pd->port_pkeys[i] && pidx == -1) + pidx = i; + if (pd->port_pkeys[i] == key) { + ipath_cdbg(VERBOSE, "p%u tries to set same pkey " + "(%x) more than once\n", + pd->port_port, key); + ret = -EEXIST; + goto bail; + } + } + if (pidx == -1) { + ipath_dbg("All pkeys for port %u already in use, " + "can't set %x\n", pd->port_port, key); + ret = -EBUSY; + goto bail; + } + for (any = i = 0; i < ARRAY_SIZE(dd->ipath_pkeys); i++) { + if (!dd->ipath_pkeys[i]) { + any++; + continue; + } + if (dd->ipath_pkeys[i] == key) { + atomic_t *pkrefs = &dd->ipath_pkeyrefs[i]; + + if (atomic_inc_return(pkrefs) > 1) { + pd->port_pkeys[pidx] = key; + ipath_cdbg(VERBOSE, "p%u set key %x " + "matches #%d, count now %d\n", + pd->port_port, key, i, + atomic_read(pkrefs)); + ret = 0; + goto bail; + } else { + /* + * lost race, decrement count, catch below + */ + atomic_dec(pkrefs); + ipath_cdbg(VERBOSE, "Lost race, count was " + "0, after dec, it's %d\n", + atomic_read(pkrefs)); + any++; + } + } + if ((dd->ipath_pkeys[i] & 0x7FFF) == lkey) { + /* + * It makes no sense to have both the limited and + * full membership PKEY set at the same time since + * the unlimited one will disable the limited one. + */ + ret = -EEXIST; + goto bail; + } + } + if (!any) { + ipath_dbg("port %u, all pkeys already in use, " + "can't set %x\n", pd->port_port, key); + ret = -EBUSY; + goto bail; + } + for (any = i = 0; i < ARRAY_SIZE(dd->ipath_pkeys); i++) { + if (!dd->ipath_pkeys[i] && + atomic_inc_return(&dd->ipath_pkeyrefs[i]) == 1) { + u64 pkey; + + /* for ipathstats, etc. */ + ipath_stats.sps_pkeys[i] = lkey; + pd->port_pkeys[pidx] = dd->ipath_pkeys[i] = key; + pkey = + (u64) dd->ipath_pkeys[0] | + ((u64) dd->ipath_pkeys[1] << 16) | + ((u64) dd->ipath_pkeys[2] << 32) | + ((u64) dd->ipath_pkeys[3] << 48); + ipath_cdbg(PROC, "p%u set key %x in #%d, " + "portidx %d, new pkey reg %llx\n", + pd->port_port, key, i, pidx, + (unsigned long long) pkey); + ipath_write_kreg( + dd, dd->ipath_kregs->kr_partitionkey, pkey); + + ret = 0; + goto bail; + } + } + ipath_dbg("port %u, all pkeys already in use 2nd pass, " + "can't set %x\n", pd->port_port, key); + ret = -EBUSY; + +bail: + return ret; +} + +/** + * ipath_manage_rcvq - manage a port's receive queue + * @pd: the port + * @subport: the subport + * @start_stop: action to carry out + * + * start_stop == 0 disables receive on the port, for use in queue + * overflow conditions. 
start_stop==1 re-enables, to be used to + * re-init the software copy of the head register + */ +static int ipath_manage_rcvq(struct ipath_portdata *pd, unsigned subport, + int start_stop) +{ + struct ipath_devdata *dd = pd->port_dd; + + ipath_cdbg(PROC, "%sabling rcv for unit %u port %u:%u\n", + start_stop ? "en" : "dis", dd->ipath_unit, + pd->port_port, subport); + if (subport) + goto bail; + /* atomically clear receive enable port. */ + if (start_stop) { + /* + * On enable, force in-memory copy of the tail register to + * 0, so that protocol code doesn't have to worry about + * whether or not the chip has yet updated the in-memory + * copy or not on return from the system call. The chip + * always resets it's tail register back to 0 on a + * transition from disabled to enabled. This could cause a + * problem if software was broken, and did the enable w/o + * the disable, but eventually the in-memory copy will be + * updated and correct itself, even in the face of software + * bugs. + */ + if (pd->port_rcvhdrtail_kvaddr) + ipath_clear_rcvhdrtail(pd); + set_bit(dd->ipath_r_portenable_shift + pd->port_port, + &dd->ipath_rcvctrl); + } else + clear_bit(dd->ipath_r_portenable_shift + pd->port_port, + &dd->ipath_rcvctrl); + ipath_write_kreg(dd, dd->ipath_kregs->kr_rcvctrl, + dd->ipath_rcvctrl); + /* now be sure chip saw it before we return */ + ipath_read_kreg64(dd, dd->ipath_kregs->kr_scratch); + if (start_stop) { + /* + * And try to be sure that tail reg update has happened too. + * This should in theory interlock with the RXE changes to + * the tail register. Don't assign it to the tail register + * in memory copy, since we could overwrite an update by the + * chip if we did. + */ + ipath_read_ureg32(dd, ur_rcvhdrtail, pd->port_port); + } + /* always; new head should be equal to new tail; see above */ +bail: + return 0; +} + +static void ipath_clean_part_key(struct ipath_portdata *pd, + struct ipath_devdata *dd) +{ + int i, j, pchanged = 0; + u64 oldpkey; + + /* for debugging only */ + oldpkey = (u64) dd->ipath_pkeys[0] | + ((u64) dd->ipath_pkeys[1] << 16) | + ((u64) dd->ipath_pkeys[2] << 32) | + ((u64) dd->ipath_pkeys[3] << 48); + + for (i = 0; i < ARRAY_SIZE(pd->port_pkeys); i++) { + if (!pd->port_pkeys[i]) + continue; + ipath_cdbg(VERBOSE, "look for key[%d] %hx in pkeys\n", i, + pd->port_pkeys[i]); + for (j = 0; j < ARRAY_SIZE(dd->ipath_pkeys); j++) { + /* check for match independent of the global bit */ + if ((dd->ipath_pkeys[j] & 0x7fff) != + (pd->port_pkeys[i] & 0x7fff)) + continue; + if (atomic_dec_and_test(&dd->ipath_pkeyrefs[j])) { + ipath_cdbg(VERBOSE, "p%u clear key " + "%x matches #%d\n", + pd->port_port, + pd->port_pkeys[i], j); + ipath_stats.sps_pkeys[j] = + dd->ipath_pkeys[j] = 0; + pchanged++; + } + else ipath_cdbg( + VERBOSE, "p%u key %x matches #%d, " + "but ref still %d\n", pd->port_port, + pd->port_pkeys[i], j, + atomic_read(&dd->ipath_pkeyrefs[j])); + break; + } + pd->port_pkeys[i] = 0; + } + if (pchanged) { + u64 pkey = (u64) dd->ipath_pkeys[0] | + ((u64) dd->ipath_pkeys[1] << 16) | + ((u64) dd->ipath_pkeys[2] << 32) | + ((u64) dd->ipath_pkeys[3] << 48); + ipath_cdbg(VERBOSE, "p%u old pkey reg %llx, " + "new pkey reg %llx\n", pd->port_port, + (unsigned long long) oldpkey, + (unsigned long long) pkey); + ipath_write_kreg(dd, dd->ipath_kregs->kr_partitionkey, + pkey); + } +} + +/* + * Initialize the port data with the receive buffer sizes + * so this can be done while the master port is locked. 
+ * Otherwise, there is a race with a slave opening the port + * and seeing these fields uninitialized. + */ +static void init_user_egr_sizes(struct ipath_portdata *pd) +{ + struct ipath_devdata *dd = pd->port_dd; + unsigned egrperchunk, egrcnt, size; + + /* + * to avoid wasting a lot of memory, we allocate 32KB chunks of + * physically contiguous memory, advance through it until used up + * and then allocate more. Of course, we need memory to store those + * extra pointers, now. Started out with 256KB, but under heavy + * memory pressure (creating large files and then copying them over + * NFS while doing lots of MPI jobs), we hit some allocation + * failures, even though we can sleep... (2.6.10) Still get + * failures at 64K. 32K is the lowest we can go without wasting + * additional memory. + */ + size = 0x8000; + egrperchunk = size / dd->ipath_rcvegrbufsize; + egrcnt = dd->ipath_rcvegrcnt; + pd->port_rcvegrbuf_chunks = (egrcnt + egrperchunk - 1) / egrperchunk; + pd->port_rcvegrbufs_perchunk = egrperchunk; + pd->port_rcvegrbuf_size = size; +} + +/** + * ipath_create_user_egr - allocate eager TID buffers + * @pd: the port to allocate TID buffers for + * + * This routine is now quite different for user and kernel, because + * the kernel uses skb's, for the accelerated network performance + * This is the user port version + * + * Allocate the eager TID buffers and program them into infinipath + * They are no longer completely contiguous, we do multiple allocation + * calls. + */ +static int ipath_create_user_egr(struct ipath_portdata *pd) +{ + struct ipath_devdata *dd = pd->port_dd; + unsigned e, egrcnt, egrperchunk, chunk, egrsize, egroff; + size_t size; + int ret; + gfp_t gfp_flags; + + /* + * GFP_USER, but without GFP_FS, so buffer cache can be + * coalesced (we hope); otherwise, even at order 4, + * heavy filesystem activity makes these fail, and we can + * use compound pages. 
+ */ + gfp_flags = __GFP_WAIT | __GFP_IO | __GFP_COMP; + + egrcnt = dd->ipath_rcvegrcnt; + /* TID number offset for this port */ + egroff = (pd->port_port - 1) * egrcnt + dd->ipath_p0_rcvegrcnt; + egrsize = dd->ipath_rcvegrbufsize; + ipath_cdbg(VERBOSE, "Allocating %d egr buffers, at egrtid " + "offset %x, egrsize %u\n", egrcnt, egroff, egrsize); + + chunk = pd->port_rcvegrbuf_chunks; + egrperchunk = pd->port_rcvegrbufs_perchunk; + size = pd->port_rcvegrbuf_size; + pd->port_rcvegrbuf = kmalloc(chunk * sizeof(pd->port_rcvegrbuf[0]), + GFP_KERNEL); + if (!pd->port_rcvegrbuf) { + ret = -ENOMEM; + goto bail; + } + pd->port_rcvegrbuf_phys = + kmalloc(chunk * sizeof(pd->port_rcvegrbuf_phys[0]), + GFP_KERNEL); + if (!pd->port_rcvegrbuf_phys) { + ret = -ENOMEM; + goto bail_rcvegrbuf; + } + for (e = 0; e < pd->port_rcvegrbuf_chunks; e++) { + + pd->port_rcvegrbuf[e] = dma_alloc_coherent( + &dd->pcidev->dev, size, &pd->port_rcvegrbuf_phys[e], + gfp_flags); + + if (!pd->port_rcvegrbuf[e]) { + ret = -ENOMEM; + goto bail_rcvegrbuf_phys; + } + } + + pd->port_rcvegr_phys = pd->port_rcvegrbuf_phys[0]; + + for (e = chunk = 0; chunk < pd->port_rcvegrbuf_chunks; chunk++) { + dma_addr_t pa = pd->port_rcvegrbuf_phys[chunk]; + unsigned i; + + for (i = 0; e < egrcnt && i < egrperchunk; e++, i++) { + dd->ipath_f_put_tid(dd, e + egroff + + (u64 __iomem *) + ((char __iomem *) + dd->ipath_kregbase + + dd->ipath_rcvegrbase), + RCVHQ_RCV_TYPE_EAGER, pa); + pa += egrsize; + } + cond_resched(); /* don't hog the cpu */ + } + + ret = 0; + goto bail; + +bail_rcvegrbuf_phys: + for (e = 0; e < pd->port_rcvegrbuf_chunks && + pd->port_rcvegrbuf[e]; e++) { + dma_free_coherent(&dd->pcidev->dev, size, + pd->port_rcvegrbuf[e], + pd->port_rcvegrbuf_phys[e]); + + } + kfree(pd->port_rcvegrbuf_phys); + pd->port_rcvegrbuf_phys = NULL; +bail_rcvegrbuf: + kfree(pd->port_rcvegrbuf); + pd->port_rcvegrbuf = NULL; +bail: + return ret; +} + + +/* common code for the mappings on dma_alloc_coherent mem */ +static int ipath_mmap_mem(struct vm_area_struct *vma, + struct ipath_portdata *pd, unsigned len, int write_ok, + void *kvaddr, char *what) +{ + struct ipath_devdata *dd = pd->port_dd; + unsigned long pfn; + int ret; + + if ((vma->vm_end - vma->vm_start) > len) { + dev_info(&dd->pcidev->dev, + "FAIL on %s: len %lx > %x\n", what, + vma->vm_end - vma->vm_start, len); + ret = -EFAULT; + goto bail; + } + + if (!write_ok) { + if (vma->vm_flags & VM_WRITE) { + dev_info(&dd->pcidev->dev, + "%s must be mapped readonly\n", what); + ret = -EPERM; + goto bail; + } + + /* don't allow them to later change with mprotect */ + vma->vm_flags &= ~VM_MAYWRITE; + } + + pfn = virt_to_phys(kvaddr) >> PAGE_SHIFT; + ret = remap_pfn_range(vma, vma->vm_start, pfn, + len, vma->vm_page_prot); + if (ret) + dev_info(&dd->pcidev->dev, "%s port%u mmap of %lx, %x " + "bytes r%c failed: %d\n", what, pd->port_port, + pfn, len, write_ok?'w':'o', ret); + else + ipath_cdbg(VERBOSE, "%s port%u mmaped %lx, %x bytes " + "r%c\n", what, pd->port_port, pfn, len, + write_ok?'w':'o'); +bail: + return ret; +} + +static int mmap_ureg(struct vm_area_struct *vma, struct ipath_devdata *dd, + u64 ureg) +{ + unsigned long phys; + int ret; + + /* + * This is real hardware, so use io_remap. This is the mechanism + * for the user process to update the head registers for their port + * in the chip. 
+ */ + if ((vma->vm_end - vma->vm_start) > PAGE_SIZE) { + dev_info(&dd->pcidev->dev, "FAIL mmap userreg: reqlen " + "%lx > PAGE\n", vma->vm_end - vma->vm_start); + ret = -EFAULT; + } else { + phys = dd->ipath_physaddr + ureg; + vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot); + + vma->vm_flags |= VM_DONTCOPY | VM_DONTEXPAND; + ret = io_remap_pfn_range(vma, vma->vm_start, + phys >> PAGE_SHIFT, + vma->vm_end - vma->vm_start, + vma->vm_page_prot); + } + return ret; +} + +static int mmap_piobufs(struct vm_area_struct *vma, + struct ipath_devdata *dd, + struct ipath_portdata *pd, + unsigned piobufs, unsigned piocnt) +{ + unsigned long phys; + int ret; + + /* + * When we map the PIO buffers in the chip, we want to map them as + * writeonly, no read possible. This prevents access to previous + * process data, and catches users who might try to read the i/o + * space due to a bug. + */ + if ((vma->vm_end - vma->vm_start) > (piocnt * dd->ipath_palign)) { + dev_info(&dd->pcidev->dev, "FAIL mmap piobufs: " + "reqlen %lx > PAGE\n", + vma->vm_end - vma->vm_start); + ret = -EINVAL; + goto bail; + } + + phys = dd->ipath_physaddr + piobufs; + +#if defined(__powerpc__) + /* There isn't a generic way to specify writethrough mappings */ + pgprot_val(vma->vm_page_prot) |= _PAGE_NO_CACHE; + pgprot_val(vma->vm_page_prot) |= _PAGE_WRITETHRU; + pgprot_val(vma->vm_page_prot) &= ~_PAGE_GUARDED; +#endif + + /* + * don't allow them to later change to readable with mprotect (for when + * not initially mapped readable, as is normally the case) + */ + vma->vm_flags &= ~VM_MAYREAD; + vma->vm_flags |= VM_DONTCOPY | VM_DONTEXPAND; + + ret = io_remap_pfn_range(vma, vma->vm_start, phys >> PAGE_SHIFT, + vma->vm_end - vma->vm_start, + vma->vm_page_prot); +bail: + return ret; +} + +static int mmap_rcvegrbufs(struct vm_area_struct *vma, + struct ipath_portdata *pd) +{ + struct ipath_devdata *dd = pd->port_dd; + unsigned long start, size; + size_t total_size, i; + unsigned long pfn; + int ret; + + size = pd->port_rcvegrbuf_size; + total_size = pd->port_rcvegrbuf_chunks * size; + if ((vma->vm_end - vma->vm_start) > total_size) { + dev_info(&dd->pcidev->dev, "FAIL on egr bufs: " + "reqlen %lx > actual %lx\n", + vma->vm_end - vma->vm_start, + (unsigned long) total_size); + ret = -EINVAL; + goto bail; + } + + if (vma->vm_flags & VM_WRITE) { + dev_info(&dd->pcidev->dev, "Can't map eager buffers as " + "writable (flags=%lx)\n", vma->vm_flags); + ret = -EPERM; + goto bail; + } + /* don't allow them to later change to writeable with mprotect */ + vma->vm_flags &= ~VM_MAYWRITE; + + start = vma->vm_start; + + for (i = 0; i < pd->port_rcvegrbuf_chunks; i++, start += size) { + pfn = virt_to_phys(pd->port_rcvegrbuf[i]) >> PAGE_SHIFT; + ret = remap_pfn_range(vma, start, pfn, size, + vma->vm_page_prot); + if (ret < 0) + goto bail; + } + ret = 0; + +bail: + return ret; +} + +/* + * ipath_file_vma_fault - handle a VMA page fault. 
+ */ +static int ipath_file_vma_fault(struct vm_area_struct *vma, + struct vm_fault *vmf) +{ + struct page *page; + + page = vmalloc_to_page((void *)(vmf->pgoff << PAGE_SHIFT)); + if (!page) + return VM_FAULT_SIGBUS; + get_page(page); + vmf->page = page; + + return 0; +} + +static const struct vm_operations_struct ipath_file_vm_ops = { + .fault = ipath_file_vma_fault, +}; + +static int mmap_kvaddr(struct vm_area_struct *vma, u64 pgaddr, + struct ipath_portdata *pd, unsigned subport) +{ + unsigned long len; + struct ipath_devdata *dd; + void *addr; + size_t size; + int ret = 0; + + /* If the port is not shared, all addresses should be physical */ + if (!pd->port_subport_cnt) + goto bail; + + dd = pd->port_dd; + size = pd->port_rcvegrbuf_chunks * pd->port_rcvegrbuf_size; + + /* + * Each process has all the subport uregbase, rcvhdrq, and + * rcvegrbufs mmapped - as an array for all the processes, + * and also separately for this process. + */ + if (pgaddr == cvt_kvaddr(pd->subport_uregbase)) { + addr = pd->subport_uregbase; + size = PAGE_SIZE * pd->port_subport_cnt; + } else if (pgaddr == cvt_kvaddr(pd->subport_rcvhdr_base)) { + addr = pd->subport_rcvhdr_base; + size = pd->port_rcvhdrq_size * pd->port_subport_cnt; + } else if (pgaddr == cvt_kvaddr(pd->subport_rcvegrbuf)) { + addr = pd->subport_rcvegrbuf; + size *= pd->port_subport_cnt; + } else if (pgaddr == cvt_kvaddr(pd->subport_uregbase + + PAGE_SIZE * subport)) { + addr = pd->subport_uregbase + PAGE_SIZE * subport; + size = PAGE_SIZE; + } else if (pgaddr == cvt_kvaddr(pd->subport_rcvhdr_base + + pd->port_rcvhdrq_size * subport)) { + addr = pd->subport_rcvhdr_base + + pd->port_rcvhdrq_size * subport; + size = pd->port_rcvhdrq_size; + } else if (pgaddr == cvt_kvaddr(pd->subport_rcvegrbuf + + size * subport)) { + addr = pd->subport_rcvegrbuf + size * subport; + /* rcvegrbufs are read-only on the slave */ + if (vma->vm_flags & VM_WRITE) { + dev_info(&dd->pcidev->dev, + "Can't map eager buffers as " + "writable (flags=%lx)\n", vma->vm_flags); + ret = -EPERM; + goto bail; + } + /* + * Don't allow permission to later change to writeable + * with mprotect. + */ + vma->vm_flags &= ~VM_MAYWRITE; + } else { + goto bail; + } + len = vma->vm_end - vma->vm_start; + if (len > size) { + ipath_cdbg(MM, "FAIL: reqlen %lx > %zx\n", len, size); + ret = -EINVAL; + goto bail; + } + + vma->vm_pgoff = (unsigned long) addr >> PAGE_SHIFT; + vma->vm_ops = &ipath_file_vm_ops; + vma->vm_flags |= VM_DONTEXPAND | VM_DONTDUMP; + ret = 1; + +bail: + return ret; +} + +/** + * ipath_mmap - mmap various structures into user space + * @fp: the file pointer + * @vma: the VM area + * + * We use this to have a shared buffer between the kernel and the user code + * for the rcvhdr queue, egr buffers, and the per-port user regs and pio + * buffers in the chip. We have the open and close entries so we can bump + * the ref count and keep the driver from being unloaded while still mapped. + */ +static int ipath_mmap(struct file *fp, struct vm_area_struct *vma) +{ + struct ipath_portdata *pd; + struct ipath_devdata *dd; + u64 pgaddr, ureg; + unsigned piobufs, piocnt; + int ret; + + pd = port_fp(fp); + if (!pd) { + ret = -EINVAL; + goto bail; + } + dd = pd->port_dd; + + /* + * This is the ipath_do_user_init() code, mapping the shared buffers + * into the user process. The address referred to by vm_pgoff is the + * file offset passed via mmap(). For shared ports, this is the + * kernel vmalloc() address of the pages to share with the master. 
+ * For non-shared or master ports, this is a physical address. + * We only do one mmap for each space mapped. + */ + pgaddr = vma->vm_pgoff << PAGE_SHIFT; + + /* + * Check for 0 in case one of the allocations failed, but user + * called mmap anyway. + */ + if (!pgaddr) { + ret = -EINVAL; + goto bail; + } + + ipath_cdbg(MM, "pgaddr %llx vm_start=%lx len %lx port %u:%u:%u\n", + (unsigned long long) pgaddr, vma->vm_start, + vma->vm_end - vma->vm_start, dd->ipath_unit, + pd->port_port, subport_fp(fp)); + + /* + * Physical addresses must fit in 40 bits for our hardware. + * Check for kernel virtual addresses first, anything else must + * match a HW or memory address. + */ + ret = mmap_kvaddr(vma, pgaddr, pd, subport_fp(fp)); + if (ret) { + if (ret > 0) + ret = 0; + goto bail; + } + + ureg = dd->ipath_uregbase + dd->ipath_ureg_align * pd->port_port; + if (!pd->port_subport_cnt) { + /* port is not shared */ + piocnt = pd->port_piocnt; + piobufs = pd->port_piobufs; + } else if (!subport_fp(fp)) { + /* caller is the master */ + piocnt = (pd->port_piocnt / pd->port_subport_cnt) + + (pd->port_piocnt % pd->port_subport_cnt); + piobufs = pd->port_piobufs + + dd->ipath_palign * (pd->port_piocnt - piocnt); + } else { + unsigned slave = subport_fp(fp) - 1; + + /* caller is a slave */ + piocnt = pd->port_piocnt / pd->port_subport_cnt; + piobufs = pd->port_piobufs + dd->ipath_palign * piocnt * slave; + } + + if (pgaddr == ureg) + ret = mmap_ureg(vma, dd, ureg); + else if (pgaddr == piobufs) + ret = mmap_piobufs(vma, dd, pd, piobufs, piocnt); + else if (pgaddr == dd->ipath_pioavailregs_phys) + /* in-memory copy of pioavail registers */ + ret = ipath_mmap_mem(vma, pd, PAGE_SIZE, 0, + (void *) dd->ipath_pioavailregs_dma, + "pioavail registers"); + else if (pgaddr == pd->port_rcvegr_phys) + ret = mmap_rcvegrbufs(vma, pd); + else if (pgaddr == (u64) pd->port_rcvhdrq_phys) + /* + * The rcvhdrq itself; readonly except on HT (so have + * to allow writable mapping), multiple pages, contiguous + * from an i/o perspective. + */ + ret = ipath_mmap_mem(vma, pd, pd->port_rcvhdrq_size, 1, + pd->port_rcvhdrq, + "rcvhdrq"); + else if (pgaddr == (u64) pd->port_rcvhdrqtailaddr_phys) + /* in-memory copy of rcvhdrq tail register */ + ret = ipath_mmap_mem(vma, pd, PAGE_SIZE, 0, + pd->port_rcvhdrtail_kvaddr, + "rcvhdrq tail"); + else + ret = -EINVAL; + + vma->vm_private_data = NULL; + + if (ret < 0) + dev_info(&dd->pcidev->dev, + "Failure %d on off %llx len %lx\n", + -ret, (unsigned long long)pgaddr, + vma->vm_end - vma->vm_start); +bail: + return ret; +} + +static unsigned ipath_poll_hdrqfull(struct ipath_portdata *pd) +{ + unsigned pollflag = 0; + + if ((pd->poll_type & IPATH_POLL_TYPE_OVERFLOW) && + pd->port_hdrqfull != pd->port_hdrqfull_poll) { + pollflag |= POLLIN | POLLRDNORM; + pd->port_hdrqfull_poll = pd->port_hdrqfull; + } + + return pollflag; +} + +static unsigned int ipath_poll_urgent(struct ipath_portdata *pd, + struct file *fp, + struct poll_table_struct *pt) +{ + unsigned pollflag = 0; + struct ipath_devdata *dd; + + dd = pd->port_dd; + + /* variable access in ipath_poll_hdrqfull() needs this */ + rmb(); + pollflag = ipath_poll_hdrqfull(pd); + + if (pd->port_urgent != pd->port_urgent_poll) { + pollflag |= POLLIN | POLLRDNORM; + pd->port_urgent_poll = pd->port_urgent; + } + + if (!pollflag) { + /* this saves a spin_lock/unlock in interrupt handler... */ + set_bit(IPATH_PORT_WAITING_URG, &pd->port_flag); + /* flush waiting flag so don't miss an event... 
*/ + wmb(); + poll_wait(fp, &pd->port_wait, pt); + } + + return pollflag; +} + +static unsigned int ipath_poll_next(struct ipath_portdata *pd, + struct file *fp, + struct poll_table_struct *pt) +{ + u32 head; + u32 tail; + unsigned pollflag = 0; + struct ipath_devdata *dd; + + dd = pd->port_dd; + + /* variable access in ipath_poll_hdrqfull() needs this */ + rmb(); + pollflag = ipath_poll_hdrqfull(pd); + + head = ipath_read_ureg32(dd, ur_rcvhdrhead, pd->port_port); + if (pd->port_rcvhdrtail_kvaddr) + tail = ipath_get_rcvhdrtail(pd); + else + tail = ipath_read_ureg32(dd, ur_rcvhdrtail, pd->port_port); + + if (head != tail) + pollflag |= POLLIN | POLLRDNORM; + else { + /* this saves a spin_lock/unlock in interrupt handler */ + set_bit(IPATH_PORT_WAITING_RCV, &pd->port_flag); + /* flush waiting flag so we don't miss an event */ + wmb(); + + set_bit(pd->port_port + dd->ipath_r_intravail_shift, + &dd->ipath_rcvctrl); + + ipath_write_kreg(dd, dd->ipath_kregs->kr_rcvctrl, + dd->ipath_rcvctrl); + + if (dd->ipath_rhdrhead_intr_off) /* arm rcv interrupt */ + ipath_write_ureg(dd, ur_rcvhdrhead, + dd->ipath_rhdrhead_intr_off | head, + pd->port_port); + + poll_wait(fp, &pd->port_wait, pt); + } + + return pollflag; +} + +static unsigned int ipath_poll(struct file *fp, + struct poll_table_struct *pt) +{ + struct ipath_portdata *pd; + unsigned pollflag; + + pd = port_fp(fp); + if (!pd) + pollflag = 0; + else if (pd->poll_type & IPATH_POLL_TYPE_URGENT) + pollflag = ipath_poll_urgent(pd, fp, pt); + else + pollflag = ipath_poll_next(pd, fp, pt); + + return pollflag; +} + +static int ipath_supports_subports(int user_swmajor, int user_swminor) +{ + /* no subport implementation prior to software version 1.3 */ + return (user_swmajor > 1) || (user_swminor >= 3); +} + +static int ipath_compatible_subports(int user_swmajor, int user_swminor) +{ + /* this code is written long-hand for clarity */ + if (IPATH_USER_SWMAJOR != user_swmajor) { + /* no promise of compatibility if major mismatch */ + return 0; + } + if (IPATH_USER_SWMAJOR == 1) { + switch (IPATH_USER_SWMINOR) { + case 0: + case 1: + case 2: + /* no subport implementation so cannot be compatible */ + return 0; + case 3: + /* 3 is only compatible with itself */ + return user_swminor == 3; + default: + /* >= 4 are compatible (or are expected to be) */ + return user_swminor >= 4; + } + } + /* make no promises yet for future major versions */ + return 0; +} + +static int init_subports(struct ipath_devdata *dd, + struct ipath_portdata *pd, + const struct ipath_user_info *uinfo) +{ + int ret = 0; + unsigned num_subports; + size_t size; + + /* + * If the user is requesting zero subports, + * skip the subport allocation. + */ + if (uinfo->spu_subport_cnt <= 0) + goto bail; + + /* Self-consistency check for ipath_compatible_subports() */ + if (ipath_supports_subports(IPATH_USER_SWMAJOR, IPATH_USER_SWMINOR) && + !ipath_compatible_subports(IPATH_USER_SWMAJOR, + IPATH_USER_SWMINOR)) { + dev_info(&dd->pcidev->dev, + "Inconsistent ipath_compatible_subports()\n"); + goto bail; + } + + /* Check for subport compatibility */ + if (!ipath_compatible_subports(uinfo->spu_userversion >> 16, + uinfo->spu_userversion & 0xffff)) { + dev_info(&dd->pcidev->dev, + "Mismatched user version (%d.%d) and driver " + "version (%d.%d) while port sharing. 
Ensure " + "that driver and library are from the same " + "release.\n", + (int) (uinfo->spu_userversion >> 16), + (int) (uinfo->spu_userversion & 0xffff), + IPATH_USER_SWMAJOR, + IPATH_USER_SWMINOR); + goto bail; + } + if (uinfo->spu_subport_cnt > INFINIPATH_MAX_SUBPORT) { + ret = -EINVAL; + goto bail; + } + + num_subports = uinfo->spu_subport_cnt; + pd->subport_uregbase = vzalloc(PAGE_SIZE * num_subports); + if (!pd->subport_uregbase) { + ret = -ENOMEM; + goto bail; + } + /* Note: pd->port_rcvhdrq_size isn't initialized yet. */ + size = ALIGN(dd->ipath_rcvhdrcnt * dd->ipath_rcvhdrentsize * + sizeof(u32), PAGE_SIZE) * num_subports; + pd->subport_rcvhdr_base = vzalloc(size); + if (!pd->subport_rcvhdr_base) { + ret = -ENOMEM; + goto bail_ureg; + } + + pd->subport_rcvegrbuf = vzalloc(pd->port_rcvegrbuf_chunks * + pd->port_rcvegrbuf_size * + num_subports); + if (!pd->subport_rcvegrbuf) { + ret = -ENOMEM; + goto bail_rhdr; + } + + pd->port_subport_cnt = uinfo->spu_subport_cnt; + pd->port_subport_id = uinfo->spu_subport_id; + pd->active_slaves = 1; + set_bit(IPATH_PORT_MASTER_UNINIT, &pd->port_flag); + goto bail; + +bail_rhdr: + vfree(pd->subport_rcvhdr_base); +bail_ureg: + vfree(pd->subport_uregbase); + pd->subport_uregbase = NULL; +bail: + return ret; +} + +static int try_alloc_port(struct ipath_devdata *dd, int port, + struct file *fp, + const struct ipath_user_info *uinfo) +{ + struct ipath_portdata *pd; + int ret; + + if (!(pd = dd->ipath_pd[port])) { + void *ptmp; + + pd = kzalloc(sizeof(struct ipath_portdata), GFP_KERNEL); + + /* + * Allocate memory for use in ipath_tid_update() just once + * at open, not per call. Reduces cost of expected send + * setup. + */ + ptmp = kmalloc(dd->ipath_rcvtidcnt * sizeof(u16) + + dd->ipath_rcvtidcnt * sizeof(struct page **), + GFP_KERNEL); + if (!pd || !ptmp) { + ipath_dev_err(dd, "Unable to allocate portdata " + "memory, failing open\n"); + ret = -ENOMEM; + kfree(pd); + kfree(ptmp); + goto bail; + } + dd->ipath_pd[port] = pd; + dd->ipath_pd[port]->port_port = port; + dd->ipath_pd[port]->port_dd = dd; + dd->ipath_pd[port]->port_tid_pg_list = ptmp; + init_waitqueue_head(&dd->ipath_pd[port]->port_wait); + } + if (!pd->port_cnt) { + pd->userversion = uinfo->spu_userversion; + init_user_egr_sizes(pd); + if ((ret = init_subports(dd, pd, uinfo)) != 0) + goto bail; + ipath_cdbg(PROC, "%s[%u] opened unit:port %u:%u\n", + current->comm, current->pid, dd->ipath_unit, + port); + pd->port_cnt = 1; + port_fp(fp) = pd; + pd->port_pid = get_pid(task_pid(current)); + strlcpy(pd->port_comm, current->comm, sizeof(pd->port_comm)); + ipath_stats.sps_ports++; + ret = 0; + } else + ret = -EBUSY; + +bail: + return ret; +} + +static inline int usable(struct ipath_devdata *dd) +{ + return dd && + (dd->ipath_flags & IPATH_PRESENT) && + dd->ipath_kregbase && + dd->ipath_lid && + !(dd->ipath_flags & (IPATH_LINKDOWN | IPATH_DISABLED + | IPATH_LINKUNK)); +} + +static int find_free_port(int unit, struct file *fp, + const struct ipath_user_info *uinfo) +{ + struct ipath_devdata *dd = ipath_lookup(unit); + int ret, i; + + if (!dd) { + ret = -ENODEV; + goto bail; + } + + if (!usable(dd)) { + ret = -ENETDOWN; + goto bail; + } + + for (i = 1; i < dd->ipath_cfgports; i++) { + ret = try_alloc_port(dd, i, fp, uinfo); + if (ret != -EBUSY) + goto bail; + } + ret = -EBUSY; + +bail: + return ret; +} + +static int find_best_unit(struct file *fp, + const struct ipath_user_info *uinfo) +{ + int ret = 0, i, prefunit = -1, devmax; + int maxofallports, npresent, nup; + int ndev; + + devmax = 
ipath_count_units(&npresent, &nup, &maxofallports); + + /* + * This code is present to allow a knowledgeable person to + * specify the layout of processes to processors before opening + * this driver, and then we'll assign the process to the "closest" + * InfiniPath chip to that processor (we assume reasonable connectivity, + * for now). This code assumes that if affinity has been set + * before this point, that at most one cpu is set; for now this + * is reasonable. I check for both cpumask_empty() and cpumask_full(), + * in case some kernel variant sets none of the bits when no + * affinity is set. 2.6.11 and 12 kernels have all present + * cpus set. Some day we'll have to fix it up further to handle + * a cpu subset. This algorithm fails for two HT chips connected + * in tunnel fashion. Eventually this needs real topology + * information. There may be some issues with dual core numbering + * as well. This needs more work prior to release. + */ + if (!cpumask_empty(tsk_cpus_allowed(current)) && + !cpumask_full(tsk_cpus_allowed(current))) { + int ncpus = num_online_cpus(), curcpu = -1, nset = 0; + get_online_cpus(); + for_each_online_cpu(i) + if (cpumask_test_cpu(i, tsk_cpus_allowed(current))) { + ipath_cdbg(PROC, "%s[%u] affinity set for " + "cpu %d/%d\n", current->comm, + current->pid, i, ncpus); + curcpu = i; + nset++; + } + put_online_cpus(); + if (curcpu != -1 && nset != ncpus) { + if (npresent) { + prefunit = curcpu / (ncpus / npresent); + ipath_cdbg(PROC,"%s[%u] %d chips, %d cpus, " + "%d cpus/chip, select unit %d\n", + current->comm, current->pid, + npresent, ncpus, ncpus / npresent, + prefunit); + } + } + } + + /* + * user ports start at 1, kernel port is 0 + * For now, we do round-robin access across all chips + */ + + if (prefunit != -1) + devmax = prefunit + 1; +recheck: + for (i = 1; i < maxofallports; i++) { + for (ndev = prefunit != -1 ? prefunit : 0; ndev < devmax; + ndev++) { + struct ipath_devdata *dd = ipath_lookup(ndev); + + if (!usable(dd)) + continue; /* can't use this unit */ + if (i >= dd->ipath_cfgports) + /* + * Maxed out on users of this unit. Try + * next. 
+ */ + continue; + ret = try_alloc_port(dd, i, fp, uinfo); + if (!ret) + goto done; + } + } + + if (npresent) { + if (nup == 0) { + ret = -ENETDOWN; + ipath_dbg("No ports available (none initialized " + "and ready)\n"); + } else { + if (prefunit > 0) { + /* if started above 0, retry from 0 */ + ipath_cdbg(PROC, + "%s[%u] no ports on prefunit " + "%d, clear and re-check\n", + current->comm, current->pid, + prefunit); + devmax = ipath_count_units(NULL, NULL, + NULL); + prefunit = -1; + goto recheck; + } + ret = -EBUSY; + ipath_dbg("No ports available\n"); + } + } else { + ret = -ENXIO; + ipath_dbg("No boards found\n"); + } + +done: + return ret; +} + +static int find_shared_port(struct file *fp, + const struct ipath_user_info *uinfo) +{ + int devmax, ndev, i; + int ret = 0; + + devmax = ipath_count_units(NULL, NULL, NULL); + + for (ndev = 0; ndev < devmax; ndev++) { + struct ipath_devdata *dd = ipath_lookup(ndev); + + if (!usable(dd)) + continue; + for (i = 1; i < dd->ipath_cfgports; i++) { + struct ipath_portdata *pd = dd->ipath_pd[i]; + + /* Skip ports which are not yet open */ + if (!pd || !pd->port_cnt) + continue; + /* Skip port if it doesn't match the requested one */ + if (pd->port_subport_id != uinfo->spu_subport_id) + continue; + /* Verify the sharing process matches the master */ + if (pd->port_subport_cnt != uinfo->spu_subport_cnt || + pd->userversion != uinfo->spu_userversion || + pd->port_cnt >= pd->port_subport_cnt) { + ret = -EINVAL; + goto done; + } + port_fp(fp) = pd; + subport_fp(fp) = pd->port_cnt++; + pd->port_subpid[subport_fp(fp)] = + get_pid(task_pid(current)); + tidcursor_fp(fp) = 0; + pd->active_slaves |= 1 << subport_fp(fp); + ipath_cdbg(PROC, + "%s[%u] %u sharing %s[%u] unit:port %u:%u\n", + current->comm, current->pid, + subport_fp(fp), + pd->port_comm, pid_nr(pd->port_pid), + dd->ipath_unit, pd->port_port); + ret = 1; + goto done; + } + } + +done: + return ret; +} + +static int ipath_open(struct inode *in, struct file *fp) +{ + /* The real work is performed later in ipath_assign_port() */ + fp->private_data = kzalloc(sizeof(struct ipath_filedata), GFP_KERNEL); + return fp->private_data ? 
0 : -ENOMEM; +} + +/* Get port early, so can set affinity prior to memory allocation */ +static int ipath_assign_port(struct file *fp, + const struct ipath_user_info *uinfo) +{ + int ret; + int i_minor; + unsigned swmajor, swminor; + + /* Check to be sure we haven't already initialized this file */ + if (port_fp(fp)) { + ret = -EINVAL; + goto done; + } + + /* for now, if major version is different, bail */ + swmajor = uinfo->spu_userversion >> 16; + if (swmajor != IPATH_USER_SWMAJOR) { + ipath_dbg("User major version %d not same as driver " + "major %d\n", uinfo->spu_userversion >> 16, + IPATH_USER_SWMAJOR); + ret = -ENODEV; + goto done; + } + + swminor = uinfo->spu_userversion & 0xffff; + if (swminor != IPATH_USER_SWMINOR) + ipath_dbg("User minor version %d not same as driver " + "minor %d\n", swminor, IPATH_USER_SWMINOR); + + mutex_lock(&ipath_mutex); + + if (ipath_compatible_subports(swmajor, swminor) && + uinfo->spu_subport_cnt && + (ret = find_shared_port(fp, uinfo))) { + if (ret > 0) + ret = 0; + goto done_chk_sdma; + } + + i_minor = iminor(file_inode(fp)) - IPATH_USER_MINOR_BASE; + ipath_cdbg(VERBOSE, "open on dev %lx (minor %d)\n", + (long)file_inode(fp)->i_rdev, i_minor); + + if (i_minor) + ret = find_free_port(i_minor - 1, fp, uinfo); + else + ret = find_best_unit(fp, uinfo); + +done_chk_sdma: + if (!ret) { + struct ipath_filedata *fd = fp->private_data; + const struct ipath_portdata *pd = fd->pd; + const struct ipath_devdata *dd = pd->port_dd; + + fd->pq = ipath_user_sdma_queue_create(&dd->pcidev->dev, + dd->ipath_unit, + pd->port_port, + fd->subport); + + if (!fd->pq) + ret = -ENOMEM; + } + + mutex_unlock(&ipath_mutex); + +done: + return ret; +} + + +static int ipath_do_user_init(struct file *fp, + const struct ipath_user_info *uinfo) +{ + int ret; + struct ipath_portdata *pd = port_fp(fp); + struct ipath_devdata *dd; + u32 head32; + + /* Subports don't need to initialize anything since master did it. */ + if (subport_fp(fp)) { + ret = wait_event_interruptible(pd->port_wait, + !test_bit(IPATH_PORT_MASTER_UNINIT, &pd->port_flag)); + goto done; + } + + dd = pd->port_dd; + + if (uinfo->spu_rcvhdrsize) { + ret = ipath_setrcvhdrsize(dd, uinfo->spu_rcvhdrsize); + if (ret) + goto done; + } + + /* for now we do nothing with rcvhdrcnt: uinfo->spu_rcvhdrcnt */ + + /* some ports may get extra buffers, calculate that here */ + if (pd->port_port <= dd->ipath_ports_extrabuf) + pd->port_piocnt = dd->ipath_pbufsport + 1; + else + pd->port_piocnt = dd->ipath_pbufsport; + + /* for right now, kernel piobufs are at end, so port 1 is at 0 */ + if (pd->port_port <= dd->ipath_ports_extrabuf) + pd->port_pio_base = (dd->ipath_pbufsport + 1) + * (pd->port_port - 1); + else + pd->port_pio_base = dd->ipath_ports_extrabuf + + dd->ipath_pbufsport * (pd->port_port - 1); + pd->port_piobufs = dd->ipath_piobufbase + + pd->port_pio_base * dd->ipath_palign; + ipath_cdbg(VERBOSE, "piobuf base for port %u is 0x%x, piocnt %u," + " first pio %u\n", pd->port_port, pd->port_piobufs, + pd->port_piocnt, pd->port_pio_base); + ipath_chg_pioavailkernel(dd, pd->port_pio_base, pd->port_piocnt, 0); + + /* + * Now allocate the rcvhdr Q and eager TIDs; skip the TID + * array for time being. 
If pd->port_port > chip-supported, + * we need to do extra stuff here to handle by handling overflow + * through port 0, someday + */ + ret = ipath_create_rcvhdrq(dd, pd); + if (!ret) + ret = ipath_create_user_egr(pd); + if (ret) + goto done; + + /* + * set the eager head register for this port to the current values + * of the tail pointers, since we don't know if they were + * updated on last use of the port. + */ + head32 = ipath_read_ureg32(dd, ur_rcvegrindextail, pd->port_port); + ipath_write_ureg(dd, ur_rcvegrindexhead, head32, pd->port_port); + pd->port_lastrcvhdrqtail = -1; + ipath_cdbg(VERBOSE, "Wrote port%d egrhead %x from tail regs\n", + pd->port_port, head32); + pd->port_tidcursor = 0; /* start at beginning after open */ + + /* initialize poll variables... */ + pd->port_urgent = 0; + pd->port_urgent_poll = 0; + pd->port_hdrqfull_poll = pd->port_hdrqfull; + + /* + * Now enable the port for receive. + * For chips that are set to DMA the tail register to memory + * when they change (and when the update bit transitions from + * 0 to 1. So for those chips, we turn it off and then back on. + * This will (very briefly) affect any other open ports, but the + * duration is very short, and therefore isn't an issue. We + * explicitly set the in-memory tail copy to 0 beforehand, so we + * don't have to wait to be sure the DMA update has happened + * (chip resets head/tail to 0 on transition to enable). + */ + set_bit(dd->ipath_r_portenable_shift + pd->port_port, + &dd->ipath_rcvctrl); + if (!(dd->ipath_flags & IPATH_NODMA_RTAIL)) { + if (pd->port_rcvhdrtail_kvaddr) + ipath_clear_rcvhdrtail(pd); + ipath_write_kreg(dd, dd->ipath_kregs->kr_rcvctrl, + dd->ipath_rcvctrl & + ~(1ULL << dd->ipath_r_tailupd_shift)); + } + ipath_write_kreg(dd, dd->ipath_kregs->kr_rcvctrl, + dd->ipath_rcvctrl); + /* Notify any waiting slaves */ + if (pd->port_subport_cnt) { + clear_bit(IPATH_PORT_MASTER_UNINIT, &pd->port_flag); + wake_up(&pd->port_wait); + } +done: + return ret; +} + +/** + * unlock_exptid - unlock any expected TID entries port still had in use + * @pd: port + * + * We don't actually update the chip here, because we do a bulk update + * below, using ipath_f_clear_tids. 
+ */ +static void unlock_expected_tids(struct ipath_portdata *pd) +{ + struct ipath_devdata *dd = pd->port_dd; + int port_tidbase = pd->port_port * dd->ipath_rcvtidcnt; + int i, cnt = 0, maxtid = port_tidbase + dd->ipath_rcvtidcnt; + + ipath_cdbg(VERBOSE, "Port %u unlocking any locked expTID pages\n", + pd->port_port); + for (i = port_tidbase; i < maxtid; i++) { + struct page *ps = dd->ipath_pageshadow[i]; + + if (!ps) + continue; + + dd->ipath_pageshadow[i] = NULL; + pci_unmap_page(dd->pcidev, dd->ipath_physshadow[i], + PAGE_SIZE, PCI_DMA_FROMDEVICE); + ipath_release_user_pages_on_close(&ps, 1); + cnt++; + ipath_stats.sps_pageunlocks++; + } + if (cnt) + ipath_cdbg(VERBOSE, "Port %u locked %u expTID entries\n", + pd->port_port, cnt); + + if (ipath_stats.sps_pagelocks || ipath_stats.sps_pageunlocks) + ipath_cdbg(VERBOSE, "%llu pages locked, %llu unlocked\n", + (unsigned long long) ipath_stats.sps_pagelocks, + (unsigned long long) + ipath_stats.sps_pageunlocks); +} + +static int ipath_close(struct inode *in, struct file *fp) +{ + int ret = 0; + struct ipath_filedata *fd; + struct ipath_portdata *pd; + struct ipath_devdata *dd; + unsigned long flags; + unsigned port; + struct pid *pid; + + ipath_cdbg(VERBOSE, "close on dev %lx, private data %p\n", + (long)in->i_rdev, fp->private_data); + + mutex_lock(&ipath_mutex); + + fd = fp->private_data; + fp->private_data = NULL; + pd = fd->pd; + if (!pd) { + mutex_unlock(&ipath_mutex); + goto bail; + } + + dd = pd->port_dd; + + /* drain user sdma queue */ + ipath_user_sdma_queue_drain(dd, fd->pq); + ipath_user_sdma_queue_destroy(fd->pq); + + if (--pd->port_cnt) { + /* + * XXX If the master closes the port before the slave(s), + * revoke the mmap for the eager receive queue so + * the slave(s) don't wait for receive data forever. + */ + pd->active_slaves &= ~(1 << fd->subport); + put_pid(pd->port_subpid[fd->subport]); + pd->port_subpid[fd->subport] = NULL; + mutex_unlock(&ipath_mutex); + goto bail; + } + /* early; no interrupt users after this */ + spin_lock_irqsave(&dd->ipath_uctxt_lock, flags); + port = pd->port_port; + dd->ipath_pd[port] = NULL; + pid = pd->port_pid; + pd->port_pid = NULL; + spin_unlock_irqrestore(&dd->ipath_uctxt_lock, flags); + + if (pd->port_rcvwait_to || pd->port_piowait_to + || pd->port_rcvnowait || pd->port_pionowait) { + ipath_cdbg(VERBOSE, "port%u, %u rcv, %u pio wait timeo; " + "%u rcv %u, pio already\n", + pd->port_port, pd->port_rcvwait_to, + pd->port_piowait_to, pd->port_rcvnowait, + pd->port_pionowait); + pd->port_rcvwait_to = pd->port_piowait_to = + pd->port_rcvnowait = pd->port_pionowait = 0; + } + if (pd->port_flag) { + ipath_cdbg(PROC, "port %u port_flag set: 0x%lx\n", + pd->port_port, pd->port_flag); + pd->port_flag = 0; + } + + if (dd->ipath_kregbase) { + /* atomically clear receive enable port and intr avail. */ + clear_bit(dd->ipath_r_portenable_shift + port, + &dd->ipath_rcvctrl); + clear_bit(pd->port_port + dd->ipath_r_intravail_shift, + &dd->ipath_rcvctrl); + ipath_write_kreg( dd, dd->ipath_kregs->kr_rcvctrl, + dd->ipath_rcvctrl); + /* and read back from chip to be sure that nothing + * else is in flight when we do the rest */ + (void)ipath_read_kreg64(dd, dd->ipath_kregs->kr_scratch); + + /* clean up the pkeys for this port user */ + ipath_clean_part_key(pd, dd); + /* + * be paranoid, and never write 0's to these, just use an + * unused part of the port 0 tail page. 
Of course, + * rcvhdraddr points to a large chunk of memory, so this + * could still trash things, but at least it won't trash + * page 0, and by disabling the port, it should stop "soon", + * even if a packet or two is in already in flight after we + * disabled the port. + */ + ipath_write_kreg_port(dd, + dd->ipath_kregs->kr_rcvhdrtailaddr, port, + dd->ipath_dummy_hdrq_phys); + ipath_write_kreg_port(dd, dd->ipath_kregs->kr_rcvhdraddr, + pd->port_port, dd->ipath_dummy_hdrq_phys); + + ipath_disarm_piobufs(dd, pd->port_pio_base, pd->port_piocnt); + ipath_chg_pioavailkernel(dd, pd->port_pio_base, + pd->port_piocnt, 1); + + dd->ipath_f_clear_tids(dd, pd->port_port); + + if (dd->ipath_pageshadow) + unlock_expected_tids(pd); + ipath_stats.sps_ports--; + ipath_cdbg(PROC, "%s[%u] closed port %u:%u\n", + pd->port_comm, pid_nr(pid), + dd->ipath_unit, port); + } + + put_pid(pid); + mutex_unlock(&ipath_mutex); + ipath_free_pddata(dd, pd); /* after releasing the mutex */ + +bail: + kfree(fd); + return ret; +} + +static int ipath_port_info(struct ipath_portdata *pd, u16 subport, + struct ipath_port_info __user *uinfo) +{ + struct ipath_port_info info; + int nup; + int ret; + size_t sz; + + (void) ipath_count_units(NULL, &nup, NULL); + info.num_active = nup; + info.unit = pd->port_dd->ipath_unit; + info.port = pd->port_port; + info.subport = subport; + /* Don't return new fields if old library opened the port. */ + if (ipath_supports_subports(pd->userversion >> 16, + pd->userversion & 0xffff)) { + /* Number of user ports available for this device. */ + info.num_ports = pd->port_dd->ipath_cfgports - 1; + info.num_subports = pd->port_subport_cnt; + sz = sizeof(info); + } else + sz = sizeof(info) - 2 * sizeof(u16); + + if (copy_to_user(uinfo, &info, sz)) { + ret = -EFAULT; + goto bail; + } + ret = 0; + +bail: + return ret; +} + +static int ipath_get_slave_info(struct ipath_portdata *pd, + void __user *slave_mask_addr) +{ + int ret = 0; + + if (copy_to_user(slave_mask_addr, &pd->active_slaves, sizeof(u32))) + ret = -EFAULT; + return ret; +} + +static int ipath_sdma_get_inflight(struct ipath_user_sdma_queue *pq, + u32 __user *inflightp) +{ + const u32 val = ipath_user_sdma_inflight_counter(pq); + + if (put_user(val, inflightp)) + return -EFAULT; + + return 0; +} + +static int ipath_sdma_get_complete(struct ipath_devdata *dd, + struct ipath_user_sdma_queue *pq, + u32 __user *completep) +{ + u32 val; + int err; + + err = ipath_user_sdma_make_progress(dd, pq); + if (err < 0) + return err; + + val = ipath_user_sdma_complete_counter(pq); + if (put_user(val, completep)) + return -EFAULT; + + return 0; +} + +static ssize_t ipath_write(struct file *fp, const char __user *data, + size_t count, loff_t *off) +{ + const struct ipath_cmd __user *ucmd; + struct ipath_portdata *pd; + const void __user *src; + size_t consumed, copy; + struct ipath_cmd cmd; + ssize_t ret = 0; + void *dest; + + if (count < sizeof(cmd.type)) { + ret = -EINVAL; + goto bail; + } + + ucmd = (const struct ipath_cmd __user *) data; + + if (copy_from_user(&cmd.type, &ucmd->type, sizeof(cmd.type))) { + ret = -EFAULT; + goto bail; + } + + consumed = sizeof(cmd.type); + + switch (cmd.type) { + case IPATH_CMD_ASSIGN_PORT: + case __IPATH_CMD_USER_INIT: + case IPATH_CMD_USER_INIT: + copy = sizeof(cmd.cmd.user_info); + dest = &cmd.cmd.user_info; + src = &ucmd->cmd.user_info; + break; + case IPATH_CMD_RECV_CTRL: + copy = sizeof(cmd.cmd.recv_ctrl); + dest = &cmd.cmd.recv_ctrl; + src = &ucmd->cmd.recv_ctrl; + break; + case IPATH_CMD_PORT_INFO: + copy = 
sizeof(cmd.cmd.port_info); + dest = &cmd.cmd.port_info; + src = &ucmd->cmd.port_info; + break; + case IPATH_CMD_TID_UPDATE: + case IPATH_CMD_TID_FREE: + copy = sizeof(cmd.cmd.tid_info); + dest = &cmd.cmd.tid_info; + src = &ucmd->cmd.tid_info; + break; + case IPATH_CMD_SET_PART_KEY: + copy = sizeof(cmd.cmd.part_key); + dest = &cmd.cmd.part_key; + src = &ucmd->cmd.part_key; + break; + case __IPATH_CMD_SLAVE_INFO: + copy = sizeof(cmd.cmd.slave_mask_addr); + dest = &cmd.cmd.slave_mask_addr; + src = &ucmd->cmd.slave_mask_addr; + break; + case IPATH_CMD_PIOAVAILUPD: // force an update of PIOAvail reg + copy = 0; + src = NULL; + dest = NULL; + break; + case IPATH_CMD_POLL_TYPE: + copy = sizeof(cmd.cmd.poll_type); + dest = &cmd.cmd.poll_type; + src = &ucmd->cmd.poll_type; + break; + case IPATH_CMD_ARMLAUNCH_CTRL: + copy = sizeof(cmd.cmd.armlaunch_ctrl); + dest = &cmd.cmd.armlaunch_ctrl; + src = &ucmd->cmd.armlaunch_ctrl; + break; + case IPATH_CMD_SDMA_INFLIGHT: + copy = sizeof(cmd.cmd.sdma_inflight); + dest = &cmd.cmd.sdma_inflight; + src = &ucmd->cmd.sdma_inflight; + break; + case IPATH_CMD_SDMA_COMPLETE: + copy = sizeof(cmd.cmd.sdma_complete); + dest = &cmd.cmd.sdma_complete; + src = &ucmd->cmd.sdma_complete; + break; + default: + ret = -EINVAL; + goto bail; + } + + if (copy) { + if ((count - consumed) < copy) { + ret = -EINVAL; + goto bail; + } + + if (copy_from_user(dest, src, copy)) { + ret = -EFAULT; + goto bail; + } + + consumed += copy; + } + + pd = port_fp(fp); + if (!pd && cmd.type != __IPATH_CMD_USER_INIT && + cmd.type != IPATH_CMD_ASSIGN_PORT) { + ret = -EINVAL; + goto bail; + } + + switch (cmd.type) { + case IPATH_CMD_ASSIGN_PORT: + ret = ipath_assign_port(fp, &cmd.cmd.user_info); + if (ret) + goto bail; + break; + case __IPATH_CMD_USER_INIT: + /* backwards compatibility, get port first */ + ret = ipath_assign_port(fp, &cmd.cmd.user_info); + if (ret) + goto bail; + /* and fall through to current version. 
*/ + case IPATH_CMD_USER_INIT: + ret = ipath_do_user_init(fp, &cmd.cmd.user_info); + if (ret) + goto bail; + ret = ipath_get_base_info( + fp, (void __user *) (unsigned long) + cmd.cmd.user_info.spu_base_info, + cmd.cmd.user_info.spu_base_info_size); + break; + case IPATH_CMD_RECV_CTRL: + ret = ipath_manage_rcvq(pd, subport_fp(fp), cmd.cmd.recv_ctrl); + break; + case IPATH_CMD_PORT_INFO: + ret = ipath_port_info(pd, subport_fp(fp), + (struct ipath_port_info __user *) + (unsigned long) cmd.cmd.port_info); + break; + case IPATH_CMD_TID_UPDATE: + ret = ipath_tid_update(pd, fp, &cmd.cmd.tid_info); + break; + case IPATH_CMD_TID_FREE: + ret = ipath_tid_free(pd, subport_fp(fp), &cmd.cmd.tid_info); + break; + case IPATH_CMD_SET_PART_KEY: + ret = ipath_set_part_key(pd, cmd.cmd.part_key); + break; + case __IPATH_CMD_SLAVE_INFO: + ret = ipath_get_slave_info(pd, + (void __user *) (unsigned long) + cmd.cmd.slave_mask_addr); + break; + case IPATH_CMD_PIOAVAILUPD: + ipath_force_pio_avail_update(pd->port_dd); + break; + case IPATH_CMD_POLL_TYPE: + pd->poll_type = cmd.cmd.poll_type; + break; + case IPATH_CMD_ARMLAUNCH_CTRL: + if (cmd.cmd.armlaunch_ctrl) + ipath_enable_armlaunch(pd->port_dd); + else + ipath_disable_armlaunch(pd->port_dd); + break; + case IPATH_CMD_SDMA_INFLIGHT: + ret = ipath_sdma_get_inflight(user_sdma_queue_fp(fp), + (u32 __user *) (unsigned long) + cmd.cmd.sdma_inflight); + break; + case IPATH_CMD_SDMA_COMPLETE: + ret = ipath_sdma_get_complete(pd->port_dd, + user_sdma_queue_fp(fp), + (u32 __user *) (unsigned long) + cmd.cmd.sdma_complete); + break; + } + + if (ret >= 0) + ret = consumed; + +bail: + return ret; +} + +static ssize_t ipath_write_iter(struct kiocb *iocb, struct iov_iter *from) +{ + struct file *filp = iocb->ki_filp; + struct ipath_filedata *fp = filp->private_data; + struct ipath_portdata *pd = port_fp(filp); + struct ipath_user_sdma_queue *pq = fp->pq; + + if (!iter_is_iovec(from) || !from->nr_segs) + return -EINVAL; + + return ipath_user_sdma_writev(pd->port_dd, pq, from->iov, from->nr_segs); +} + +static struct class *ipath_class; + +static int init_cdev(int minor, char *name, const struct file_operations *fops, + struct cdev **cdevp, struct device **devp) +{ + const dev_t dev = MKDEV(IPATH_MAJOR, minor); + struct cdev *cdev = NULL; + struct device *device = NULL; + int ret; + + cdev = cdev_alloc(); + if (!cdev) { + printk(KERN_ERR IPATH_DRV_NAME + ": Could not allocate cdev for minor %d, %s\n", + minor, name); + ret = -ENOMEM; + goto done; + } + + cdev->owner = THIS_MODULE; + cdev->ops = fops; + kobject_set_name(&cdev->kobj, name); + + ret = cdev_add(cdev, dev, 1); + if (ret < 0) { + printk(KERN_ERR IPATH_DRV_NAME + ": Could not add cdev for minor %d, %s (err %d)\n", + minor, name, -ret); + goto err_cdev; + } + + device = device_create(ipath_class, NULL, dev, NULL, name); + + if (IS_ERR(device)) { + ret = PTR_ERR(device); + printk(KERN_ERR IPATH_DRV_NAME ": Could not create " + "device for minor %d, %s (err %d)\n", + minor, name, -ret); + goto err_cdev; + } + + goto done; + +err_cdev: + cdev_del(cdev); + cdev = NULL; + +done: + if (ret >= 0) { + *cdevp = cdev; + *devp = device; + } else { + *cdevp = NULL; + *devp = NULL; + } + + return ret; +} + +int ipath_cdev_init(int minor, char *name, const struct file_operations *fops, + struct cdev **cdevp, struct device **devp) +{ + return init_cdev(minor, name, fops, cdevp, devp); +} + +static void cleanup_cdev(struct cdev **cdevp, + struct device **devp) +{ + struct device *dev = *devp; + + if (dev) { + device_unregister(dev); + 
*devp = NULL; + } + + if (*cdevp) { + cdev_del(*cdevp); + *cdevp = NULL; + } +} + +void ipath_cdev_cleanup(struct cdev **cdevp, + struct device **devp) +{ + cleanup_cdev(cdevp, devp); +} + +static struct cdev *wildcard_cdev; +static struct device *wildcard_dev; + +static const dev_t dev = MKDEV(IPATH_MAJOR, 0); + +static int user_init(void) +{ + int ret; + + ret = register_chrdev_region(dev, IPATH_NMINORS, IPATH_DRV_NAME); + if (ret < 0) { + printk(KERN_ERR IPATH_DRV_NAME ": Could not register " + "chrdev region (err %d)\n", -ret); + goto done; + } + + ipath_class = class_create(THIS_MODULE, IPATH_DRV_NAME); + + if (IS_ERR(ipath_class)) { + ret = PTR_ERR(ipath_class); + printk(KERN_ERR IPATH_DRV_NAME ": Could not create " + "device class (err %d)\n", -ret); + goto bail; + } + + goto done; +bail: + unregister_chrdev_region(dev, IPATH_NMINORS); +done: + return ret; +} + +static void user_cleanup(void) +{ + if (ipath_class) { + class_destroy(ipath_class); + ipath_class = NULL; + } + + unregister_chrdev_region(dev, IPATH_NMINORS); +} + +static atomic_t user_count = ATOMIC_INIT(0); +static atomic_t user_setup = ATOMIC_INIT(0); + +int ipath_user_add(struct ipath_devdata *dd) +{ + char name[10]; + int ret; + + if (atomic_inc_return(&user_count) == 1) { + ret = user_init(); + if (ret < 0) { + ipath_dev_err(dd, "Unable to set up user support: " + "error %d\n", -ret); + goto bail; + } + ret = init_cdev(0, "ipath", &ipath_file_ops, &wildcard_cdev, + &wildcard_dev); + if (ret < 0) { + ipath_dev_err(dd, "Could not create wildcard " + "minor: error %d\n", -ret); + goto bail_user; + } + + atomic_set(&user_setup, 1); + } + + snprintf(name, sizeof(name), "ipath%d", dd->ipath_unit); + + ret = init_cdev(dd->ipath_unit + 1, name, &ipath_file_ops, + &dd->user_cdev, &dd->user_dev); + if (ret < 0) + ipath_dev_err(dd, "Could not create user minor %d, %s\n", + dd->ipath_unit + 1, name); + + goto bail; + +bail_user: + user_cleanup(); +bail: + return ret; +} + +void ipath_user_remove(struct ipath_devdata *dd) +{ + cleanup_cdev(&dd->user_cdev, &dd->user_dev); + + if (atomic_dec_return(&user_count) == 0) { + if (atomic_read(&user_setup) == 0) + goto bail; + + cleanup_cdev(&wildcard_cdev, &wildcard_dev); + user_cleanup(); + + atomic_set(&user_setup, 0); + } +bail: + return; +} diff --git a/kernel/drivers/infiniband/hw/ipath/ipath_fs.c b/kernel/drivers/infiniband/hw/ipath/ipath_fs.c new file mode 100644 index 000000000..1ca8e32a9 --- /dev/null +++ b/kernel/drivers/infiniband/hw/ipath/ipath_fs.c @@ -0,0 +1,422 @@ +/* + * Copyright (c) 2006, 2007 QLogic Corporation. All rights reserved. + * Copyright (c) 2006 PathScale, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. 
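[Editor's illustration, not part of the patch.] ipath_user_add()/ipath_user_remove() above use a pair of atomic counters so that only the first device to register performs the global setup (chrdev region, class, wildcard minor) and only the last one to leave tears it down, with a second flag recording whether the global setup actually succeeded. A compact userspace sketch of that first-in/last-out pattern using C11 atomics; the init/cleanup bodies are placeholders.

#include <stdatomic.h>
#include <stdio.h>

static atomic_int user_count;   /* how many devices are registered */
static atomic_int user_setup;   /* did the global setup actually succeed? */

static int  global_init(void)    { puts("global init");    return 0; }
static void global_cleanup(void) { puts("global cleanup"); }

static int demo_add(void)
{
    /* atomic_fetch_add returns the old value, so +1 gives the new count */
    if (atomic_fetch_add(&user_count, 1) + 1 == 1) {
        if (global_init() < 0)
            return -1;
        atomic_store(&user_setup, 1);
    }
    puts("per-device init");
    return 0;
}

static void demo_remove(void)
{
    puts("per-device cleanup");
    if (atomic_fetch_sub(&user_count, 1) - 1 == 0) {
        /* only undo the global state if it was ever set up */
        if (atomic_load(&user_setup)) {
            global_cleanup();
            atomic_store(&user_setup, 0);
        }
    }
}

int main(void)
{
    demo_add();
    demo_add();         /* second caller skips the global init */
    demo_remove();
    demo_remove();      /* last caller runs the global cleanup */
    return 0;
}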
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include +#include +#include +#include +#include +#include +#include + +#include "ipath_kernel.h" + +#define IPATHFS_MAGIC 0x726a77 + +static struct super_block *ipath_super; + +static int ipathfs_mknod(struct inode *dir, struct dentry *dentry, + umode_t mode, const struct file_operations *fops, + void *data) +{ + int error; + struct inode *inode = new_inode(dir->i_sb); + + if (!inode) { + error = -EPERM; + goto bail; + } + + inode->i_ino = get_next_ino(); + inode->i_mode = mode; + inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; + inode->i_private = data; + if (S_ISDIR(mode)) { + inode->i_op = &simple_dir_inode_operations; + inc_nlink(inode); + inc_nlink(dir); + } + + inode->i_fop = fops; + + d_instantiate(dentry, inode); + error = 0; + +bail: + return error; +} + +static int create_file(const char *name, umode_t mode, + struct dentry *parent, struct dentry **dentry, + const struct file_operations *fops, void *data) +{ + int error; + + mutex_lock(&d_inode(parent)->i_mutex); + *dentry = lookup_one_len(name, parent, strlen(name)); + if (!IS_ERR(*dentry)) + error = ipathfs_mknod(d_inode(parent), *dentry, + mode, fops, data); + else + error = PTR_ERR(*dentry); + mutex_unlock(&d_inode(parent)->i_mutex); + + return error; +} + +static ssize_t atomic_stats_read(struct file *file, char __user *buf, + size_t count, loff_t *ppos) +{ + return simple_read_from_buffer(buf, count, ppos, &ipath_stats, + sizeof ipath_stats); +} + +static const struct file_operations atomic_stats_ops = { + .read = atomic_stats_read, + .llseek = default_llseek, +}; + +static ssize_t atomic_counters_read(struct file *file, char __user *buf, + size_t count, loff_t *ppos) +{ + struct infinipath_counters counters; + struct ipath_devdata *dd; + + dd = file_inode(file)->i_private; + dd->ipath_f_read_counters(dd, &counters); + + return simple_read_from_buffer(buf, count, ppos, &counters, + sizeof counters); +} + +static const struct file_operations atomic_counters_ops = { + .read = atomic_counters_read, + .llseek = default_llseek, +}; + +static ssize_t flash_read(struct file *file, char __user *buf, + size_t count, loff_t *ppos) +{ + struct ipath_devdata *dd; + ssize_t ret; + loff_t pos; + char *tmp; + + pos = *ppos; + + if ( pos < 0) { + ret = -EINVAL; + goto bail; + } + + if (pos >= sizeof(struct ipath_flash)) { + ret = 0; + goto bail; + } + + if (count > sizeof(struct ipath_flash) - pos) + count = sizeof(struct ipath_flash) - pos; + + tmp = kmalloc(count, GFP_KERNEL); + if (!tmp) { + ret = -ENOMEM; + goto bail; + } + + dd = file_inode(file)->i_private; + if (ipath_eeprom_read(dd, pos, tmp, count)) { + ipath_dev_err(dd, "failed to read from flash\n"); + ret = -ENXIO; + goto bail_tmp; + } + + if (copy_to_user(buf, tmp, count)) { + ret = -EFAULT; + goto bail_tmp; + } + + *ppos = pos + count; + ret = count; + +bail_tmp: + kfree(tmp); + +bail: + return ret; +} + +static ssize_t flash_write(struct file *file, const char __user *buf, + size_t count, loff_t *ppos) +{ + struct ipath_devdata *dd; + ssize_t ret; + loff_t pos; 
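[Editor's illustration, not part of the patch.] flash_read() in ipath_fs.c bounds the requested (offset, length) against sizeof(struct ipath_flash), bounces the data through a temporary buffer, and advances *ppos by the amount actually returned. The same clamping logic in plain C; a fake 512-byte array replaces the EEPROM-backed flash image and the size is illustrative.

#include <stdio.h>
#include <string.h>

#define FLASH_SIZE 512
static unsigned char flash_image[FLASH_SIZE];

/* Read up to 'count' bytes at *ppos from the fake flash, mirroring the
 * clamping in flash_read(): negative offsets are rejected, reads past
 * the end return 0, and reads near the end are truncated. */
static long demo_flash_read(unsigned char *buf, size_t count, long *ppos)
{
    long pos = *ppos;

    if (pos < 0)
        return -1;                  /* -EINVAL in the driver */
    if ((size_t)pos >= FLASH_SIZE)
        return 0;                   /* EOF */
    if (count > FLASH_SIZE - (size_t)pos)
        count = FLASH_SIZE - (size_t)pos;

    memcpy(buf, flash_image + pos, count);  /* driver uses a kmalloc'd bounce buffer */
    *ppos = pos + (long)count;
    return (long)count;
}

int main(void)
{
    unsigned char buf[64];
    long pos = FLASH_SIZE - 10;     /* only 10 bytes left */
    long n = demo_flash_read(buf, sizeof(buf), &pos);

    printf("read %ld bytes, new pos %ld\n", n, pos);
    return 0;
}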
+ char *tmp; + + pos = *ppos; + + if (pos != 0) { + ret = -EINVAL; + goto bail; + } + + if (count != sizeof(struct ipath_flash)) { + ret = -EINVAL; + goto bail; + } + + tmp = kmalloc(count, GFP_KERNEL); + if (!tmp) { + ret = -ENOMEM; + goto bail; + } + + if (copy_from_user(tmp, buf, count)) { + ret = -EFAULT; + goto bail_tmp; + } + + dd = file_inode(file)->i_private; + if (ipath_eeprom_write(dd, pos, tmp, count)) { + ret = -ENXIO; + ipath_dev_err(dd, "failed to write to flash\n"); + goto bail_tmp; + } + + *ppos = pos + count; + ret = count; + +bail_tmp: + kfree(tmp); + +bail: + return ret; +} + +static const struct file_operations flash_ops = { + .read = flash_read, + .write = flash_write, + .llseek = default_llseek, +}; + +static int create_device_files(struct super_block *sb, + struct ipath_devdata *dd) +{ + struct dentry *dir, *tmp; + char unit[10]; + int ret; + + snprintf(unit, sizeof unit, "%02d", dd->ipath_unit); + ret = create_file(unit, S_IFDIR|S_IRUGO|S_IXUGO, sb->s_root, &dir, + &simple_dir_operations, dd); + if (ret) { + printk(KERN_ERR "create_file(%s) failed: %d\n", unit, ret); + goto bail; + } + + ret = create_file("atomic_counters", S_IFREG|S_IRUGO, dir, &tmp, + &atomic_counters_ops, dd); + if (ret) { + printk(KERN_ERR "create_file(%s/atomic_counters) " + "failed: %d\n", unit, ret); + goto bail; + } + + ret = create_file("flash", S_IFREG|S_IWUSR|S_IRUGO, dir, &tmp, + &flash_ops, dd); + if (ret) { + printk(KERN_ERR "create_file(%s/flash) " + "failed: %d\n", unit, ret); + goto bail; + } + +bail: + return ret; +} + +static int remove_file(struct dentry *parent, char *name) +{ + struct dentry *tmp; + int ret; + + tmp = lookup_one_len(name, parent, strlen(name)); + + if (IS_ERR(tmp)) { + ret = PTR_ERR(tmp); + goto bail; + } + + spin_lock(&tmp->d_lock); + if (!d_unhashed(tmp) && d_really_is_positive(tmp)) { + dget_dlock(tmp); + __d_drop(tmp); + spin_unlock(&tmp->d_lock); + simple_unlink(d_inode(parent), tmp); + } else + spin_unlock(&tmp->d_lock); + + ret = 0; +bail: + /* + * We don't expect clients to care about the return value, but + * it's there if they need it. 
+ */ + return ret; +} + +static int remove_device_files(struct super_block *sb, + struct ipath_devdata *dd) +{ + struct dentry *dir, *root; + char unit[10]; + int ret; + + root = dget(sb->s_root); + mutex_lock(&d_inode(root)->i_mutex); + snprintf(unit, sizeof unit, "%02d", dd->ipath_unit); + dir = lookup_one_len(unit, root, strlen(unit)); + + if (IS_ERR(dir)) { + ret = PTR_ERR(dir); + printk(KERN_ERR "Lookup of %s failed\n", unit); + goto bail; + } + + remove_file(dir, "flash"); + remove_file(dir, "atomic_counters"); + d_delete(dir); + ret = simple_rmdir(d_inode(root), dir); + +bail: + mutex_unlock(&d_inode(root)->i_mutex); + dput(root); + return ret; +} + +static int ipathfs_fill_super(struct super_block *sb, void *data, + int silent) +{ + struct ipath_devdata *dd, *tmp; + unsigned long flags; + int ret; + + static struct tree_descr files[] = { + [2] = {"atomic_stats", &atomic_stats_ops, S_IRUGO}, + {""}, + }; + + ret = simple_fill_super(sb, IPATHFS_MAGIC, files); + if (ret) { + printk(KERN_ERR "simple_fill_super failed: %d\n", ret); + goto bail; + } + + spin_lock_irqsave(&ipath_devs_lock, flags); + + list_for_each_entry_safe(dd, tmp, &ipath_dev_list, ipath_list) { + spin_unlock_irqrestore(&ipath_devs_lock, flags); + ret = create_device_files(sb, dd); + if (ret) + goto bail; + spin_lock_irqsave(&ipath_devs_lock, flags); + } + + spin_unlock_irqrestore(&ipath_devs_lock, flags); + +bail: + return ret; +} + +static struct dentry *ipathfs_mount(struct file_system_type *fs_type, + int flags, const char *dev_name, void *data) +{ + struct dentry *ret; + ret = mount_single(fs_type, flags, data, ipathfs_fill_super); + if (!IS_ERR(ret)) + ipath_super = ret->d_sb; + return ret; +} + +static void ipathfs_kill_super(struct super_block *s) +{ + kill_litter_super(s); + ipath_super = NULL; +} + +int ipathfs_add_device(struct ipath_devdata *dd) +{ + int ret; + + if (ipath_super == NULL) { + ret = 0; + goto bail; + } + + ret = create_device_files(ipath_super, dd); + +bail: + return ret; +} + +int ipathfs_remove_device(struct ipath_devdata *dd) +{ + int ret; + + if (ipath_super == NULL) { + ret = 0; + goto bail; + } + + ret = remove_device_files(ipath_super, dd); + +bail: + return ret; +} + +static struct file_system_type ipathfs_fs_type = { + .owner = THIS_MODULE, + .name = "ipathfs", + .mount = ipathfs_mount, + .kill_sb = ipathfs_kill_super, +}; +MODULE_ALIAS_FS("ipathfs"); + +int __init ipath_init_ipathfs(void) +{ + return register_filesystem(&ipathfs_fs_type); +} + +void __exit ipath_exit_ipathfs(void) +{ + unregister_filesystem(&ipathfs_fs_type); +} diff --git a/kernel/drivers/infiniband/hw/ipath/ipath_iba6110.c b/kernel/drivers/infiniband/hw/ipath/ipath_iba6110.c new file mode 100644 index 000000000..7cc305488 --- /dev/null +++ b/kernel/drivers/infiniband/hw/ipath/ipath_iba6110.c @@ -0,0 +1,1940 @@ +/* + * Copyright (c) 2006, 2007 QLogic Corporation. All rights reserved. + * Copyright (c) 2003, 2004, 2005, 2006 PathScale, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. 
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +/* + * This file contains all of the code that is specific to the InfiniPath + * HT chip. + */ + +#include +#include +#include +#include +#include + +#include "ipath_kernel.h" +#include "ipath_registers.h" + +static void ipath_setup_ht_setextled(struct ipath_devdata *, u64, u64); + + +/* + * This lists the InfiniPath registers, in the actual chip layout. + * This structure should never be directly accessed. + * + * The names are in InterCap form because they're taken straight from + * the chip specification. Since they're only used in this file, they + * don't pollute the rest of the source. 
+*/ + +struct _infinipath_do_not_use_kernel_regs { + unsigned long long Revision; + unsigned long long Control; + unsigned long long PageAlign; + unsigned long long PortCnt; + unsigned long long DebugPortSelect; + unsigned long long DebugPort; + unsigned long long SendRegBase; + unsigned long long UserRegBase; + unsigned long long CounterRegBase; + unsigned long long Scratch; + unsigned long long ReservedMisc1; + unsigned long long InterruptConfig; + unsigned long long IntBlocked; + unsigned long long IntMask; + unsigned long long IntStatus; + unsigned long long IntClear; + unsigned long long ErrorMask; + unsigned long long ErrorStatus; + unsigned long long ErrorClear; + unsigned long long HwErrMask; + unsigned long long HwErrStatus; + unsigned long long HwErrClear; + unsigned long long HwDiagCtrl; + unsigned long long MDIO; + unsigned long long IBCStatus; + unsigned long long IBCCtrl; + unsigned long long ExtStatus; + unsigned long long ExtCtrl; + unsigned long long GPIOOut; + unsigned long long GPIOMask; + unsigned long long GPIOStatus; + unsigned long long GPIOClear; + unsigned long long RcvCtrl; + unsigned long long RcvBTHQP; + unsigned long long RcvHdrSize; + unsigned long long RcvHdrCnt; + unsigned long long RcvHdrEntSize; + unsigned long long RcvTIDBase; + unsigned long long RcvTIDCnt; + unsigned long long RcvEgrBase; + unsigned long long RcvEgrCnt; + unsigned long long RcvBufBase; + unsigned long long RcvBufSize; + unsigned long long RxIntMemBase; + unsigned long long RxIntMemSize; + unsigned long long RcvPartitionKey; + unsigned long long ReservedRcv[10]; + unsigned long long SendCtrl; + unsigned long long SendPIOBufBase; + unsigned long long SendPIOSize; + unsigned long long SendPIOBufCnt; + unsigned long long SendPIOAvailAddr; + unsigned long long TxIntMemBase; + unsigned long long TxIntMemSize; + unsigned long long ReservedSend[9]; + unsigned long long SendBufferError; + unsigned long long SendBufferErrorCONT1; + unsigned long long SendBufferErrorCONT2; + unsigned long long SendBufferErrorCONT3; + unsigned long long ReservedSBE[4]; + unsigned long long RcvHdrAddr0; + unsigned long long RcvHdrAddr1; + unsigned long long RcvHdrAddr2; + unsigned long long RcvHdrAddr3; + unsigned long long RcvHdrAddr4; + unsigned long long RcvHdrAddr5; + unsigned long long RcvHdrAddr6; + unsigned long long RcvHdrAddr7; + unsigned long long RcvHdrAddr8; + unsigned long long ReservedRHA[7]; + unsigned long long RcvHdrTailAddr0; + unsigned long long RcvHdrTailAddr1; + unsigned long long RcvHdrTailAddr2; + unsigned long long RcvHdrTailAddr3; + unsigned long long RcvHdrTailAddr4; + unsigned long long RcvHdrTailAddr5; + unsigned long long RcvHdrTailAddr6; + unsigned long long RcvHdrTailAddr7; + unsigned long long RcvHdrTailAddr8; + unsigned long long ReservedRHTA[7]; + unsigned long long Sync; /* Software only */ + unsigned long long Dump; /* Software only */ + unsigned long long SimVer; /* Software only */ + unsigned long long ReservedSW[5]; + unsigned long long SerdesConfig0; + unsigned long long SerdesConfig1; + unsigned long long SerdesStatus; + unsigned long long XGXSConfig; + unsigned long long ReservedSW2[4]; +}; + +struct _infinipath_do_not_use_counters { + __u64 LBIntCnt; + __u64 LBFlowStallCnt; + __u64 Reserved1; + __u64 TxUnsupVLErrCnt; + __u64 TxDataPktCnt; + __u64 TxFlowPktCnt; + __u64 TxDwordCnt; + __u64 TxLenErrCnt; + __u64 TxMaxMinLenErrCnt; + __u64 TxUnderrunCnt; + __u64 TxFlowStallCnt; + __u64 TxDroppedPktCnt; + __u64 RxDroppedPktCnt; + __u64 RxDataPktCnt; + __u64 RxFlowPktCnt; + __u64 
RxDwordCnt; + __u64 RxLenErrCnt; + __u64 RxMaxMinLenErrCnt; + __u64 RxICRCErrCnt; + __u64 RxVCRCErrCnt; + __u64 RxFlowCtrlErrCnt; + __u64 RxBadFormatCnt; + __u64 RxLinkProblemCnt; + __u64 RxEBPCnt; + __u64 RxLPCRCErrCnt; + __u64 RxBufOvflCnt; + __u64 RxTIDFullErrCnt; + __u64 RxTIDValidErrCnt; + __u64 RxPKeyMismatchCnt; + __u64 RxP0HdrEgrOvflCnt; + __u64 RxP1HdrEgrOvflCnt; + __u64 RxP2HdrEgrOvflCnt; + __u64 RxP3HdrEgrOvflCnt; + __u64 RxP4HdrEgrOvflCnt; + __u64 RxP5HdrEgrOvflCnt; + __u64 RxP6HdrEgrOvflCnt; + __u64 RxP7HdrEgrOvflCnt; + __u64 RxP8HdrEgrOvflCnt; + __u64 Reserved6; + __u64 Reserved7; + __u64 IBStatusChangeCnt; + __u64 IBLinkErrRecoveryCnt; + __u64 IBLinkDownedCnt; + __u64 IBSymbolErrCnt; +}; + +#define IPATH_KREG_OFFSET(field) (offsetof( \ + struct _infinipath_do_not_use_kernel_regs, field) / sizeof(u64)) +#define IPATH_CREG_OFFSET(field) (offsetof( \ + struct _infinipath_do_not_use_counters, field) / sizeof(u64)) + +static const struct ipath_kregs ipath_ht_kregs = { + .kr_control = IPATH_KREG_OFFSET(Control), + .kr_counterregbase = IPATH_KREG_OFFSET(CounterRegBase), + .kr_debugport = IPATH_KREG_OFFSET(DebugPort), + .kr_debugportselect = IPATH_KREG_OFFSET(DebugPortSelect), + .kr_errorclear = IPATH_KREG_OFFSET(ErrorClear), + .kr_errormask = IPATH_KREG_OFFSET(ErrorMask), + .kr_errorstatus = IPATH_KREG_OFFSET(ErrorStatus), + .kr_extctrl = IPATH_KREG_OFFSET(ExtCtrl), + .kr_extstatus = IPATH_KREG_OFFSET(ExtStatus), + .kr_gpio_clear = IPATH_KREG_OFFSET(GPIOClear), + .kr_gpio_mask = IPATH_KREG_OFFSET(GPIOMask), + .kr_gpio_out = IPATH_KREG_OFFSET(GPIOOut), + .kr_gpio_status = IPATH_KREG_OFFSET(GPIOStatus), + .kr_hwdiagctrl = IPATH_KREG_OFFSET(HwDiagCtrl), + .kr_hwerrclear = IPATH_KREG_OFFSET(HwErrClear), + .kr_hwerrmask = IPATH_KREG_OFFSET(HwErrMask), + .kr_hwerrstatus = IPATH_KREG_OFFSET(HwErrStatus), + .kr_ibcctrl = IPATH_KREG_OFFSET(IBCCtrl), + .kr_ibcstatus = IPATH_KREG_OFFSET(IBCStatus), + .kr_intblocked = IPATH_KREG_OFFSET(IntBlocked), + .kr_intclear = IPATH_KREG_OFFSET(IntClear), + .kr_interruptconfig = IPATH_KREG_OFFSET(InterruptConfig), + .kr_intmask = IPATH_KREG_OFFSET(IntMask), + .kr_intstatus = IPATH_KREG_OFFSET(IntStatus), + .kr_mdio = IPATH_KREG_OFFSET(MDIO), + .kr_pagealign = IPATH_KREG_OFFSET(PageAlign), + .kr_partitionkey = IPATH_KREG_OFFSET(RcvPartitionKey), + .kr_portcnt = IPATH_KREG_OFFSET(PortCnt), + .kr_rcvbthqp = IPATH_KREG_OFFSET(RcvBTHQP), + .kr_rcvbufbase = IPATH_KREG_OFFSET(RcvBufBase), + .kr_rcvbufsize = IPATH_KREG_OFFSET(RcvBufSize), + .kr_rcvctrl = IPATH_KREG_OFFSET(RcvCtrl), + .kr_rcvegrbase = IPATH_KREG_OFFSET(RcvEgrBase), + .kr_rcvegrcnt = IPATH_KREG_OFFSET(RcvEgrCnt), + .kr_rcvhdrcnt = IPATH_KREG_OFFSET(RcvHdrCnt), + .kr_rcvhdrentsize = IPATH_KREG_OFFSET(RcvHdrEntSize), + .kr_rcvhdrsize = IPATH_KREG_OFFSET(RcvHdrSize), + .kr_rcvintmembase = IPATH_KREG_OFFSET(RxIntMemBase), + .kr_rcvintmemsize = IPATH_KREG_OFFSET(RxIntMemSize), + .kr_rcvtidbase = IPATH_KREG_OFFSET(RcvTIDBase), + .kr_rcvtidcnt = IPATH_KREG_OFFSET(RcvTIDCnt), + .kr_revision = IPATH_KREG_OFFSET(Revision), + .kr_scratch = IPATH_KREG_OFFSET(Scratch), + .kr_sendbuffererror = IPATH_KREG_OFFSET(SendBufferError), + .kr_sendctrl = IPATH_KREG_OFFSET(SendCtrl), + .kr_sendpioavailaddr = IPATH_KREG_OFFSET(SendPIOAvailAddr), + .kr_sendpiobufbase = IPATH_KREG_OFFSET(SendPIOBufBase), + .kr_sendpiobufcnt = IPATH_KREG_OFFSET(SendPIOBufCnt), + .kr_sendpiosize = IPATH_KREG_OFFSET(SendPIOSize), + .kr_sendregbase = IPATH_KREG_OFFSET(SendRegBase), + .kr_txintmembase = IPATH_KREG_OFFSET(TxIntMemBase), + 
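[Editor's illustration, not part of the patch.] IPATH_KREG_OFFSET()/IPATH_CREG_OFFSET() above turn the chip-layout structs into register indices by dividing each field's byte offset by sizeof(u64); the ipath_ht_kregs/ipath_ht_cregs tables are then just lists of those indices. A standalone demonstration of the same offsetof() trick; the struct here is a toy layout, not the real chip map.

#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

/* toy register layout: one 64-bit register per slot, in chip order */
struct demo_regs {
    uint64_t Revision;
    uint64_t Control;
    uint64_t PageAlign;
    uint64_t Scratch;
};

/* register index = byte offset within the layout / register width */
#define DEMO_KREG_OFFSET(field) \
    (offsetof(struct demo_regs, field) / sizeof(uint64_t))

int main(void)
{
    /* a fake register file; the index selects the 64-bit slot */
    uint64_t regs[4] = { 0x10, 0x20, 0x30, 0x40 };

    printf("Control -> index %zu, value 0x%llx\n",
           DEMO_KREG_OFFSET(Control),
           (unsigned long long)regs[DEMO_KREG_OFFSET(Control)]);
    printf("Scratch -> index %zu, value 0x%llx\n",
           DEMO_KREG_OFFSET(Scratch),
           (unsigned long long)regs[DEMO_KREG_OFFSET(Scratch)]);
    return 0;
}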
.kr_txintmemsize = IPATH_KREG_OFFSET(TxIntMemSize), + .kr_userregbase = IPATH_KREG_OFFSET(UserRegBase), + .kr_serdesconfig0 = IPATH_KREG_OFFSET(SerdesConfig0), + .kr_serdesconfig1 = IPATH_KREG_OFFSET(SerdesConfig1), + .kr_serdesstatus = IPATH_KREG_OFFSET(SerdesStatus), + .kr_xgxsconfig = IPATH_KREG_OFFSET(XGXSConfig), + /* + * These should not be used directly via ipath_write_kreg64(), + * use them with ipath_write_kreg64_port(), + */ + .kr_rcvhdraddr = IPATH_KREG_OFFSET(RcvHdrAddr0), + .kr_rcvhdrtailaddr = IPATH_KREG_OFFSET(RcvHdrTailAddr0) +}; + +static const struct ipath_cregs ipath_ht_cregs = { + .cr_badformatcnt = IPATH_CREG_OFFSET(RxBadFormatCnt), + .cr_erricrccnt = IPATH_CREG_OFFSET(RxICRCErrCnt), + .cr_errlinkcnt = IPATH_CREG_OFFSET(RxLinkProblemCnt), + .cr_errlpcrccnt = IPATH_CREG_OFFSET(RxLPCRCErrCnt), + .cr_errpkey = IPATH_CREG_OFFSET(RxPKeyMismatchCnt), + .cr_errrcvflowctrlcnt = IPATH_CREG_OFFSET(RxFlowCtrlErrCnt), + .cr_err_rlencnt = IPATH_CREG_OFFSET(RxLenErrCnt), + .cr_errslencnt = IPATH_CREG_OFFSET(TxLenErrCnt), + .cr_errtidfull = IPATH_CREG_OFFSET(RxTIDFullErrCnt), + .cr_errtidvalid = IPATH_CREG_OFFSET(RxTIDValidErrCnt), + .cr_errvcrccnt = IPATH_CREG_OFFSET(RxVCRCErrCnt), + .cr_ibstatuschange = IPATH_CREG_OFFSET(IBStatusChangeCnt), + /* calc from Reg_CounterRegBase + offset */ + .cr_intcnt = IPATH_CREG_OFFSET(LBIntCnt), + .cr_invalidrlencnt = IPATH_CREG_OFFSET(RxMaxMinLenErrCnt), + .cr_invalidslencnt = IPATH_CREG_OFFSET(TxMaxMinLenErrCnt), + .cr_lbflowstallcnt = IPATH_CREG_OFFSET(LBFlowStallCnt), + .cr_pktrcvcnt = IPATH_CREG_OFFSET(RxDataPktCnt), + .cr_pktrcvflowctrlcnt = IPATH_CREG_OFFSET(RxFlowPktCnt), + .cr_pktsendcnt = IPATH_CREG_OFFSET(TxDataPktCnt), + .cr_pktsendflowcnt = IPATH_CREG_OFFSET(TxFlowPktCnt), + .cr_portovflcnt = IPATH_CREG_OFFSET(RxP0HdrEgrOvflCnt), + .cr_rcvebpcnt = IPATH_CREG_OFFSET(RxEBPCnt), + .cr_rcvovflcnt = IPATH_CREG_OFFSET(RxBufOvflCnt), + .cr_senddropped = IPATH_CREG_OFFSET(TxDroppedPktCnt), + .cr_sendstallcnt = IPATH_CREG_OFFSET(TxFlowStallCnt), + .cr_sendunderruncnt = IPATH_CREG_OFFSET(TxUnderrunCnt), + .cr_wordrcvcnt = IPATH_CREG_OFFSET(RxDwordCnt), + .cr_wordsendcnt = IPATH_CREG_OFFSET(TxDwordCnt), + .cr_unsupvlcnt = IPATH_CREG_OFFSET(TxUnsupVLErrCnt), + .cr_rxdroppktcnt = IPATH_CREG_OFFSET(RxDroppedPktCnt), + .cr_iblinkerrrecovcnt = IPATH_CREG_OFFSET(IBLinkErrRecoveryCnt), + .cr_iblinkdowncnt = IPATH_CREG_OFFSET(IBLinkDownedCnt), + .cr_ibsymbolerrcnt = IPATH_CREG_OFFSET(IBSymbolErrCnt) +}; + +/* kr_intstatus, kr_intclear, kr_intmask bits */ +#define INFINIPATH_I_RCVURG_MASK ((1U<<9)-1) +#define INFINIPATH_I_RCVURG_SHIFT 0 +#define INFINIPATH_I_RCVAVAIL_MASK ((1U<<9)-1) +#define INFINIPATH_I_RCVAVAIL_SHIFT 12 + +/* kr_hwerrclear, kr_hwerrmask, kr_hwerrstatus, bits */ +#define INFINIPATH_HWE_HTCMEMPARITYERR_SHIFT 0 +#define INFINIPATH_HWE_HTCMEMPARITYERR_MASK 0x3FFFFFULL +#define INFINIPATH_HWE_HTCLNKABYTE0CRCERR 0x0000000000800000ULL +#define INFINIPATH_HWE_HTCLNKABYTE1CRCERR 0x0000000001000000ULL +#define INFINIPATH_HWE_HTCLNKBBYTE0CRCERR 0x0000000002000000ULL +#define INFINIPATH_HWE_HTCLNKBBYTE1CRCERR 0x0000000004000000ULL +#define INFINIPATH_HWE_HTCMISCERR4 0x0000000008000000ULL +#define INFINIPATH_HWE_HTCMISCERR5 0x0000000010000000ULL +#define INFINIPATH_HWE_HTCMISCERR6 0x0000000020000000ULL +#define INFINIPATH_HWE_HTCMISCERR7 0x0000000040000000ULL +#define INFINIPATH_HWE_HTCBUSTREQPARITYERR 0x0000000080000000ULL +#define INFINIPATH_HWE_HTCBUSTRESPPARITYERR 0x0000000100000000ULL +#define INFINIPATH_HWE_HTCBUSIREQPARITYERR 
0x0000000200000000ULL +#define INFINIPATH_HWE_COREPLL_FBSLIP 0x0080000000000000ULL +#define INFINIPATH_HWE_COREPLL_RFSLIP 0x0100000000000000ULL +#define INFINIPATH_HWE_HTBPLL_FBSLIP 0x0200000000000000ULL +#define INFINIPATH_HWE_HTBPLL_RFSLIP 0x0400000000000000ULL +#define INFINIPATH_HWE_HTAPLL_FBSLIP 0x0800000000000000ULL +#define INFINIPATH_HWE_HTAPLL_RFSLIP 0x1000000000000000ULL +#define INFINIPATH_HWE_SERDESPLLFAILED 0x2000000000000000ULL + +#define IBA6110_IBCS_LINKTRAININGSTATE_MASK 0xf +#define IBA6110_IBCS_LINKSTATE_SHIFT 4 + +/* kr_extstatus bits */ +#define INFINIPATH_EXTS_FREQSEL 0x2 +#define INFINIPATH_EXTS_SERDESSEL 0x4 +#define INFINIPATH_EXTS_MEMBIST_ENDTEST 0x0000000000004000 +#define INFINIPATH_EXTS_MEMBIST_CORRECT 0x0000000000008000 + + +/* TID entries (memory), HT-only */ +#define INFINIPATH_RT_ADDR_MASK 0xFFFFFFFFFFULL /* 40 bits valid */ +#define INFINIPATH_RT_VALID 0x8000000000000000ULL +#define INFINIPATH_RT_ADDR_SHIFT 0 +#define INFINIPATH_RT_BUFSIZE_MASK 0x3FFFULL +#define INFINIPATH_RT_BUFSIZE_SHIFT 48 + +#define INFINIPATH_R_INTRAVAIL_SHIFT 16 +#define INFINIPATH_R_TAILUPD_SHIFT 31 + +/* kr_xgxsconfig bits */ +#define INFINIPATH_XGXS_RESET 0x7ULL + +/* + * masks and bits that are different in different chips, or present only + * in one + */ +static const ipath_err_t infinipath_hwe_htcmemparityerr_mask = + INFINIPATH_HWE_HTCMEMPARITYERR_MASK; +static const ipath_err_t infinipath_hwe_htcmemparityerr_shift = + INFINIPATH_HWE_HTCMEMPARITYERR_SHIFT; + +static const ipath_err_t infinipath_hwe_htclnkabyte0crcerr = + INFINIPATH_HWE_HTCLNKABYTE0CRCERR; +static const ipath_err_t infinipath_hwe_htclnkabyte1crcerr = + INFINIPATH_HWE_HTCLNKABYTE1CRCERR; +static const ipath_err_t infinipath_hwe_htclnkbbyte0crcerr = + INFINIPATH_HWE_HTCLNKBBYTE0CRCERR; +static const ipath_err_t infinipath_hwe_htclnkbbyte1crcerr = + INFINIPATH_HWE_HTCLNKBBYTE1CRCERR; + +#define _IPATH_GPIO_SDA_NUM 1 +#define _IPATH_GPIO_SCL_NUM 0 + +#define IPATH_GPIO_SDA \ + (1ULL << (_IPATH_GPIO_SDA_NUM+INFINIPATH_EXTC_GPIOOE_SHIFT)) +#define IPATH_GPIO_SCL \ + (1ULL << (_IPATH_GPIO_SCL_NUM+INFINIPATH_EXTC_GPIOOE_SHIFT)) + +/* keep the code below somewhat more readable; not used elsewhere */ +#define _IPATH_HTLINK0_CRCBITS (infinipath_hwe_htclnkabyte0crcerr | \ + infinipath_hwe_htclnkabyte1crcerr) +#define _IPATH_HTLINK1_CRCBITS (infinipath_hwe_htclnkbbyte0crcerr | \ + infinipath_hwe_htclnkbbyte1crcerr) +#define _IPATH_HTLANE0_CRCBITS (infinipath_hwe_htclnkabyte0crcerr | \ + infinipath_hwe_htclnkbbyte0crcerr) +#define _IPATH_HTLANE1_CRCBITS (infinipath_hwe_htclnkabyte1crcerr | \ + infinipath_hwe_htclnkbbyte1crcerr) + +static void hwerr_crcbits(struct ipath_devdata *dd, ipath_err_t hwerrs, + char *msg, size_t msgl) +{ + char bitsmsg[64]; + ipath_err_t crcbits = hwerrs & + (_IPATH_HTLINK0_CRCBITS | _IPATH_HTLINK1_CRCBITS); + /* don't check if 8bit HT */ + if (dd->ipath_flags & IPATH_8BIT_IN_HT0) + crcbits &= ~infinipath_hwe_htclnkabyte1crcerr; + /* don't check if 8bit HT */ + if (dd->ipath_flags & IPATH_8BIT_IN_HT1) + crcbits &= ~infinipath_hwe_htclnkbbyte1crcerr; + /* + * we'll want to ignore link errors on link that is + * not in use, if any. For now, complain about both + */ + if (crcbits) { + u16 ctrl0, ctrl1; + snprintf(bitsmsg, sizeof bitsmsg, + "[HT%s lane %s CRC (%llx); powercycle to completely clear]", + !(crcbits & _IPATH_HTLINK1_CRCBITS) ? + "0 (A)" : (!(crcbits & _IPATH_HTLINK0_CRCBITS) + ? "1 (B)" : "0+1 (A+B)"), + !(crcbits & _IPATH_HTLANE1_CRCBITS) ? "0" + : (!(crcbits & _IPATH_HTLANE0_CRCBITS) ? 
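[Editor's illustration, not part of the patch.] The defines above describe multi-bit fields in the 64-bit hardware error word (a mask plus a shift per field) and group the four HT CRC bits into per-link and per-lane masks so hwerr_crcbits() can report which link and which byte lane saw the error. A small sketch of that decode; the bit positions below are invented, only the mask/shift and grouping technique is the point.

#include <stdint.h>
#include <stdio.h>

/* illustrative layout: an 8-bit parity field plus four CRC bits */
#define DEMO_PARITY_MASK   0xFFULL
#define DEMO_PARITY_SHIFT  0

#define DEMO_LNKA_BYTE0    (1ULL << 8)
#define DEMO_LNKA_BYTE1    (1ULL << 9)
#define DEMO_LNKB_BYTE0    (1ULL << 10)
#define DEMO_LNKB_BYTE1    (1ULL << 11)

/* group by link and by byte lane, as _IPATH_HTLINK*_CRCBITS does */
#define DEMO_LINK0_CRC     (DEMO_LNKA_BYTE0 | DEMO_LNKA_BYTE1)
#define DEMO_LINK1_CRC     (DEMO_LNKB_BYTE0 | DEMO_LNKB_BYTE1)
#define DEMO_LANE0_CRC     (DEMO_LNKA_BYTE0 | DEMO_LNKB_BYTE0)
#define DEMO_LANE1_CRC     (DEMO_LNKA_BYTE1 | DEMO_LNKB_BYTE1)

static void demo_decode(uint64_t hwerrs)
{
    uint64_t parity = (hwerrs >> DEMO_PARITY_SHIFT) & DEMO_PARITY_MASK;
    uint64_t crc = hwerrs & (DEMO_LINK0_CRC | DEMO_LINK1_CRC);

    printf("parity field 0x%llx\n", (unsigned long long)parity);
    if (crc)
        printf("CRC on link %s, lane %s\n",
               !(crc & DEMO_LINK1_CRC) ? "0 (A)" :
               !(crc & DEMO_LINK0_CRC) ? "1 (B)" : "0+1 (A+B)",
               !(crc & DEMO_LANE1_CRC) ? "0" :
               !(crc & DEMO_LANE0_CRC) ? "1" : "0+1");
}

int main(void)
{
    demo_decode(0x505);  /* parity 0x05, byte lane 0 CRC on both links */
    return 0;
}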
"1" : + "0+1"), (unsigned long long) crcbits); + strlcat(msg, bitsmsg, msgl); + + /* + * print extra info for debugging. slave/primary + * config word 4, 8 (link control 0, 1) + */ + + if (pci_read_config_word(dd->pcidev, + dd->ipath_ht_slave_off + 0x4, + &ctrl0)) + dev_info(&dd->pcidev->dev, "Couldn't read " + "linkctrl0 of slave/primary " + "config block\n"); + else if (!(ctrl0 & 1 << 6)) + /* not if EOC bit set */ + ipath_dbg("HT linkctrl0 0x%x%s%s\n", ctrl0, + ((ctrl0 >> 8) & 7) ? " CRC" : "", + ((ctrl0 >> 4) & 1) ? "linkfail" : + ""); + if (pci_read_config_word(dd->pcidev, + dd->ipath_ht_slave_off + 0x8, + &ctrl1)) + dev_info(&dd->pcidev->dev, "Couldn't read " + "linkctrl1 of slave/primary " + "config block\n"); + else if (!(ctrl1 & 1 << 6)) + /* not if EOC bit set */ + ipath_dbg("HT linkctrl1 0x%x%s%s\n", ctrl1, + ((ctrl1 >> 8) & 7) ? " CRC" : "", + ((ctrl1 >> 4) & 1) ? "linkfail" : + ""); + + /* disable until driver reloaded */ + dd->ipath_hwerrmask &= ~crcbits; + ipath_write_kreg(dd, dd->ipath_kregs->kr_hwerrmask, + dd->ipath_hwerrmask); + ipath_dbg("HT crc errs: %s\n", msg); + } else + ipath_dbg("ignoring HT crc errors 0x%llx, " + "not in use\n", (unsigned long long) + (hwerrs & (_IPATH_HTLINK0_CRCBITS | + _IPATH_HTLINK1_CRCBITS))); +} + +/* 6110 specific hardware errors... */ +static const struct ipath_hwerror_msgs ipath_6110_hwerror_msgs[] = { + INFINIPATH_HWE_MSG(HTCBUSIREQPARITYERR, "HTC Ireq Parity"), + INFINIPATH_HWE_MSG(HTCBUSTREQPARITYERR, "HTC Treq Parity"), + INFINIPATH_HWE_MSG(HTCBUSTRESPPARITYERR, "HTC Tresp Parity"), + INFINIPATH_HWE_MSG(HTCMISCERR5, "HT core Misc5"), + INFINIPATH_HWE_MSG(HTCMISCERR6, "HT core Misc6"), + INFINIPATH_HWE_MSG(HTCMISCERR7, "HT core Misc7"), + INFINIPATH_HWE_MSG(RXDSYNCMEMPARITYERR, "Rx Dsync"), + INFINIPATH_HWE_MSG(SERDESPLLFAILED, "SerDes PLL"), +}; + +#define TXE_PIO_PARITY ((INFINIPATH_HWE_TXEMEMPARITYERR_PIOBUF | \ + INFINIPATH_HWE_TXEMEMPARITYERR_PIOPBC) \ + << INFINIPATH_HWE_TXEMEMPARITYERR_SHIFT) +#define RXE_EAGER_PARITY (INFINIPATH_HWE_RXEMEMPARITYERR_EAGERTID \ + << INFINIPATH_HWE_RXEMEMPARITYERR_SHIFT) + +static void ipath_ht_txe_recover(struct ipath_devdata *dd) +{ + ++ipath_stats.sps_txeparity; + dev_info(&dd->pcidev->dev, + "Recovering from TXE PIO parity error\n"); +} + + +/** + * ipath_ht_handle_hwerrors - display hardware errors. + * @dd: the infinipath device + * @msg: the output buffer + * @msgl: the size of the output buffer + * + * Use same msg buffer as regular errors to avoid excessive stack + * use. Most hardware errors are catastrophic, but for right now, + * we'll print them and continue. We reuse the same message buffer as + * ipath_handle_errors() to avoid excessive stack usage. + */ +static void ipath_ht_handle_hwerrors(struct ipath_devdata *dd, char *msg, + size_t msgl) +{ + ipath_err_t hwerrs; + u32 bits, ctrl; + int isfatal = 0; + char bitsmsg[64]; + int log_idx; + + hwerrs = ipath_read_kreg64(dd, dd->ipath_kregs->kr_hwerrstatus); + + if (!hwerrs) { + ipath_cdbg(VERBOSE, "Called but no hardware errors set\n"); + /* + * better than printing cofusing messages + * This seems to be related to clearing the crc error, or + * the pll error during init. + */ + goto bail; + } else if (hwerrs == -1LL) { + ipath_dev_err(dd, "Read of hardware error status failed " + "(all bits set); ignoring\n"); + goto bail; + } + ipath_stats.sps_hwerrs++; + + /* Always clear the error status register, except MEMBISTFAIL, + * regardless of whether we continue or stop using the chip. 
+ * We want that set so we know it failed, even across driver reload. + * We'll still ignore it in the hwerrmask. We do this partly for + * diagnostics, but also for support */ + ipath_write_kreg(dd, dd->ipath_kregs->kr_hwerrclear, + hwerrs&~INFINIPATH_HWE_MEMBISTFAILED); + + hwerrs &= dd->ipath_hwerrmask; + + /* We log some errors to EEPROM, check if we have any of those. */ + for (log_idx = 0; log_idx < IPATH_EEP_LOG_CNT; ++log_idx) + if (hwerrs & dd->ipath_eep_st_masks[log_idx].hwerrs_to_log) + ipath_inc_eeprom_err(dd, log_idx, 1); + + /* + * make sure we get this much out, unless told to be quiet, + * it's a parity error we may recover from, + * or it's occurred within the last 5 seconds + */ + if ((hwerrs & ~(dd->ipath_lasthwerror | TXE_PIO_PARITY | + RXE_EAGER_PARITY)) || + (ipath_debug & __IPATH_VERBDBG)) + dev_info(&dd->pcidev->dev, "Hardware error: hwerr=0x%llx " + "(cleared)\n", (unsigned long long) hwerrs); + dd->ipath_lasthwerror |= hwerrs; + + if (hwerrs & ~dd->ipath_hwe_bitsextant) + ipath_dev_err(dd, "hwerror interrupt with unknown errors " + "%llx set\n", (unsigned long long) + (hwerrs & ~dd->ipath_hwe_bitsextant)); + + ctrl = ipath_read_kreg32(dd, dd->ipath_kregs->kr_control); + if ((ctrl & INFINIPATH_C_FREEZEMODE) && !ipath_diag_inuse) { + /* + * parity errors in send memory are recoverable, + * just cancel the send (if indicated in * sendbuffererror), + * count the occurrence, unfreeze (if no other handled + * hardware error bits are set), and continue. They can + * occur if a processor speculative read is done to the PIO + * buffer while we are sending a packet, for example. + */ + if (hwerrs & TXE_PIO_PARITY) { + ipath_ht_txe_recover(dd); + hwerrs &= ~TXE_PIO_PARITY; + } + + if (!hwerrs) { + ipath_dbg("Clearing freezemode on ignored or " + "recovered hardware error\n"); + ipath_clear_freeze(dd); + } + } + + *msg = '\0'; + + /* + * may someday want to decode into which bits are which + * functional area for parity errors, etc. 
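[Editor's illustration, not part of the patch.] ipath_ht_handle_hwerrors() above ORs each interrupt's bits into ipath_lasthwerror and only prints a dev_info for bits it has not complained about before, keeping the recoverable parity bits quiet. The latch-and-report logic on its own, with hypothetical names and bit values.

#include <stdint.h>
#include <stdio.h>

#define DEMO_QUIET_BITS  (1ULL << 3)    /* recoverable bits we never print */

/* Print only bits not already recorded in *last, then latch them. */
static void demo_report_hwerrs(uint64_t *last, uint64_t hwerrs)
{
    uint64_t fresh = hwerrs & ~(*last | DEMO_QUIET_BITS);

    if (fresh)
        printf("hardware error: new bits 0x%llx\n",
               (unsigned long long)fresh);
    *last |= hwerrs;
}

int main(void)
{
    uint64_t last = 0;

    demo_report_hwerrs(&last, 0x9);     /* prints bit 0 only (bit 3 is quiet) */
    demo_report_hwerrs(&last, 0x9);     /* same bits again: silent */
    demo_report_hwerrs(&last, 0x3);     /* bit 1 is new: printed */
    return 0;
}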
+ */ + if (hwerrs & (infinipath_hwe_htcmemparityerr_mask + << INFINIPATH_HWE_HTCMEMPARITYERR_SHIFT)) { + bits = (u32) ((hwerrs >> + INFINIPATH_HWE_HTCMEMPARITYERR_SHIFT) & + INFINIPATH_HWE_HTCMEMPARITYERR_MASK); + snprintf(bitsmsg, sizeof bitsmsg, "[HTC Parity Errs %x] ", + bits); + strlcat(msg, bitsmsg, msgl); + } + + ipath_format_hwerrors(hwerrs, + ipath_6110_hwerror_msgs, + ARRAY_SIZE(ipath_6110_hwerror_msgs), + msg, msgl); + + if (hwerrs & (_IPATH_HTLINK0_CRCBITS | _IPATH_HTLINK1_CRCBITS)) + hwerr_crcbits(dd, hwerrs, msg, msgl); + + if (hwerrs & INFINIPATH_HWE_MEMBISTFAILED) { + strlcat(msg, "[Memory BIST test failed, InfiniPath hardware unusable]", + msgl); + /* ignore from now on, so disable until driver reloaded */ + dd->ipath_hwerrmask &= ~INFINIPATH_HWE_MEMBISTFAILED; + ipath_write_kreg(dd, dd->ipath_kregs->kr_hwerrmask, + dd->ipath_hwerrmask); + } +#define _IPATH_PLL_FAIL (INFINIPATH_HWE_COREPLL_FBSLIP | \ + INFINIPATH_HWE_COREPLL_RFSLIP | \ + INFINIPATH_HWE_HTBPLL_FBSLIP | \ + INFINIPATH_HWE_HTBPLL_RFSLIP | \ + INFINIPATH_HWE_HTAPLL_FBSLIP | \ + INFINIPATH_HWE_HTAPLL_RFSLIP) + + if (hwerrs & _IPATH_PLL_FAIL) { + snprintf(bitsmsg, sizeof bitsmsg, + "[PLL failed (%llx), InfiniPath hardware unusable]", + (unsigned long long) (hwerrs & _IPATH_PLL_FAIL)); + strlcat(msg, bitsmsg, msgl); + /* ignore from now on, so disable until driver reloaded */ + dd->ipath_hwerrmask &= ~(hwerrs & _IPATH_PLL_FAIL); + ipath_write_kreg(dd, dd->ipath_kregs->kr_hwerrmask, + dd->ipath_hwerrmask); + } + + if (hwerrs & INFINIPATH_HWE_SERDESPLLFAILED) { + /* + * If it occurs, it is left masked since the eternal + * interface is unused + */ + dd->ipath_hwerrmask &= ~INFINIPATH_HWE_SERDESPLLFAILED; + ipath_write_kreg(dd, dd->ipath_kregs->kr_hwerrmask, + dd->ipath_hwerrmask); + } + + if (hwerrs) { + /* + * if any set that we aren't ignoring; only + * make the complaint once, in case it's stuck + * or recurring, and we get here multiple + * times. + * force link down, so switch knows, and + * LEDs are turned off + */ + if (dd->ipath_flags & IPATH_INITTED) { + ipath_set_linkstate(dd, IPATH_IB_LINKDOWN); + ipath_setup_ht_setextled(dd, + INFINIPATH_IBCS_L_STATE_DOWN, + INFINIPATH_IBCS_LT_STATE_DISABLED); + ipath_dev_err(dd, "Fatal Hardware Error (freeze " + "mode), no longer usable, SN %.16s\n", + dd->ipath_serial); + isfatal = 1; + } + *dd->ipath_statusp &= ~IPATH_STATUS_IB_READY; + /* mark as having had error */ + *dd->ipath_statusp |= IPATH_STATUS_HWERROR; + /* + * mark as not usable, at a minimum until driver + * is reloaded, probably until reboot, since no + * other reset is possible. + */ + dd->ipath_flags &= ~IPATH_INITTED; + } + else + *msg = 0; /* recovered from all of them */ + if (*msg) + ipath_dev_err(dd, "%s hardware error\n", msg); + if (isfatal && !ipath_diag_inuse && dd->ipath_freezemsg) + /* + * for status file; if no trailing brace is copied, + * we'll know it was truncated. + */ + snprintf(dd->ipath_freezemsg, + dd->ipath_freezelen, "{%s}", msg); + +bail:; +} + +/** + * ipath_ht_boardname - fill in the board name + * @dd: the infinipath device + * @name: the output buffer + * @namelen: the size of the output buffer + * + * fill in the board name, based on the board revision register + */ +static int ipath_ht_boardname(struct ipath_devdata *dd, char *name, + size_t namelen) +{ + char *n = NULL; + u8 boardrev = dd->ipath_boardrev; + int ret = 0; + + switch (boardrev) { + case 5: + /* + * original production board; two production levels, with + * different serial number ranges. 
See ipath_ht_early_init() for + * case where we enable IPATH_GPIO_INTR for later serial # range. + * Original 112* serial number is no longer supported. + */ + n = "InfiniPath_QHT7040"; + break; + case 7: + /* small form factor production board */ + n = "InfiniPath_QHT7140"; + break; + default: /* don't know, just print the number */ + ipath_dev_err(dd, "Don't yet know about board " + "with ID %u\n", boardrev); + snprintf(name, namelen, "Unknown_InfiniPath_QHT7xxx_%u", + boardrev); + break; + } + if (n) + snprintf(name, namelen, "%s", n); + + if (ret) { + ipath_dev_err(dd, "Unsupported InfiniPath board %s!\n", name); + goto bail; + } + if (dd->ipath_majrev != 3 || (dd->ipath_minrev < 2 || + dd->ipath_minrev > 4)) { + /* + * This version of the driver only supports Rev 3.2 - 3.4 + */ + ipath_dev_err(dd, + "Unsupported InfiniPath hardware revision %u.%u!\n", + dd->ipath_majrev, dd->ipath_minrev); + ret = 1; + goto bail; + } + /* + * pkt/word counters are 32 bit, and therefore wrap fast enough + * that we snapshot them from a timer, and maintain 64 bit shadow + * copies + */ + dd->ipath_flags |= IPATH_32BITCOUNTERS; + dd->ipath_flags |= IPATH_GPIO_INTR; + if (dd->ipath_lbus_speed != 800) + ipath_dev_err(dd, + "Incorrectly configured for HT @ %uMHz\n", + dd->ipath_lbus_speed); + + /* + * set here, not in ipath_init_*_funcs because we have to do + * it after we can read chip registers. + */ + dd->ipath_ureg_align = + ipath_read_kreg32(dd, dd->ipath_kregs->kr_pagealign); + +bail: + return ret; +} + +static void ipath_check_htlink(struct ipath_devdata *dd) +{ + u8 linkerr, link_off, i; + + for (i = 0; i < 2; i++) { + link_off = dd->ipath_ht_slave_off + i * 4 + 0xd; + if (pci_read_config_byte(dd->pcidev, link_off, &linkerr)) + dev_info(&dd->pcidev->dev, "Couldn't read " + "linkerror%d of HT slave/primary block\n", + i); + else if (linkerr & 0xf0) { + ipath_cdbg(VERBOSE, "HT linkerr%d bits 0x%x set, " + "clearing\n", linkerr >> 4, i); + /* + * writing the linkerr bits that are set should + * clear them + */ + if (pci_write_config_byte(dd->pcidev, link_off, + linkerr)) + ipath_dbg("Failed write to clear HT " + "linkerror%d\n", i); + if (pci_read_config_byte(dd->pcidev, link_off, + &linkerr)) + dev_info(&dd->pcidev->dev, + "Couldn't reread linkerror%d of " + "HT slave/primary block\n", i); + else if (linkerr & 0xf0) + dev_info(&dd->pcidev->dev, + "HT linkerror%d bits 0x%x " + "couldn't be cleared\n", + i, linkerr >> 4); + } + } +} + +static int ipath_setup_ht_reset(struct ipath_devdata *dd) +{ + ipath_dbg("No reset possible for this InfiniPath hardware\n"); + return 0; +} + +#define HT_INTR_DISC_CONFIG 0x80 /* HT interrupt and discovery cap */ +#define HT_INTR_REG_INDEX 2 /* intconfig requires indirect accesses */ + +/* + * Bits 13-15 of command==0 is slave/primary block. Clear any HT CRC + * errors. We only bother to do this at load time, because it's OK if + * it happened before we were loaded (first time after boot/reset), + * but any time after that, it's fatal anyway. Also need to not check + * for upper byte errors if we are in 8 bit mode, so figure out + * our width. For now, at least, also complain if it's 8 bit. 
+ */ +static void slave_or_pri_blk(struct ipath_devdata *dd, struct pci_dev *pdev, + int pos, u8 cap_type) +{ + u8 linkwidth = 0, linkerr, link_a_b_off, link_off; + u16 linkctrl = 0; + int i; + + dd->ipath_ht_slave_off = pos; + /* command word, master_host bit */ + /* master host || slave */ + if ((cap_type >> 2) & 1) + link_a_b_off = 4; + else + link_a_b_off = 0; + ipath_cdbg(VERBOSE, "HT%u (Link %c) connected to processor\n", + link_a_b_off ? 1 : 0, + link_a_b_off ? 'B' : 'A'); + + link_a_b_off += pos; + + /* + * check both link control registers; clear both HT CRC sets if + * necessary. + */ + for (i = 0; i < 2; i++) { + link_off = pos + i * 4 + 0x4; + if (pci_read_config_word(pdev, link_off, &linkctrl)) + ipath_dev_err(dd, "Couldn't read HT link control%d " + "register\n", i); + else if (linkctrl & (0xf << 8)) { + ipath_cdbg(VERBOSE, "Clear linkctrl%d CRC Error " + "bits %x\n", i, linkctrl & (0xf << 8)); + /* + * now write them back to clear the error. + */ + pci_write_config_word(pdev, link_off, + linkctrl & (0xf << 8)); + } + } + + /* + * As with HT CRC bits, same for protocol errors that might occur + * during boot. + */ + for (i = 0; i < 2; i++) { + link_off = pos + i * 4 + 0xd; + if (pci_read_config_byte(pdev, link_off, &linkerr)) + dev_info(&pdev->dev, "Couldn't read linkerror%d " + "of HT slave/primary block\n", i); + else if (linkerr & 0xf0) { + ipath_cdbg(VERBOSE, "HT linkerr%d bits 0x%x set, " + "clearing\n", linkerr >> 4, i); + /* + * writing the linkerr bits that are set will clear + * them + */ + if (pci_write_config_byte + (pdev, link_off, linkerr)) + ipath_dbg("Failed write to clear HT " + "linkerror%d\n", i); + if (pci_read_config_byte(pdev, link_off, &linkerr)) + dev_info(&pdev->dev, "Couldn't reread " + "linkerror%d of HT slave/primary " + "block\n", i); + else if (linkerr & 0xf0) + dev_info(&pdev->dev, "HT linkerror%d bits " + "0x%x couldn't be cleared\n", + i, linkerr >> 4); + } + } + + /* + * this is just for our link to the host, not devices connected + * through tunnel. + */ + + if (pci_read_config_byte(pdev, link_a_b_off + 7, &linkwidth)) + ipath_dev_err(dd, "Couldn't read HT link width " + "config register\n"); + else { + u32 width; + switch (linkwidth & 7) { + case 5: + width = 4; + break; + case 4: + width = 2; + break; + case 3: + width = 32; + break; + case 1: + width = 16; + break; + case 0: + default: /* if wrong, assume 8 bit */ + width = 8; + break; + } + + dd->ipath_lbus_width = width; + + if (linkwidth != 0x11) { + ipath_dev_err(dd, "Not configured for 16 bit HT " + "(%x)\n", linkwidth); + if (!(linkwidth & 0xf)) { + ipath_dbg("Will ignore HT lane1 errors\n"); + dd->ipath_flags |= IPATH_8BIT_IN_HT0; + } + } + } + + /* + * this is just for our link to the host, not devices connected + * through tunnel. + */ + if (pci_read_config_byte(pdev, link_a_b_off + 0xd, &linkwidth)) + ipath_dev_err(dd, "Couldn't read HT link frequency " + "config register\n"); + else { + u32 speed; + switch (linkwidth & 0xf) { + case 6: + speed = 1000; + break; + case 5: + speed = 800; + break; + case 4: + speed = 600; + break; + case 3: + speed = 500; + break; + case 2: + speed = 400; + break; + case 1: + speed = 300; + break; + default: + /* + * assume reserved and vendor-specific are 200... 
+ */ + case 0: + speed = 200; + break; + } + dd->ipath_lbus_speed = speed; + } + + snprintf(dd->ipath_lbus_info, sizeof(dd->ipath_lbus_info), + "HyperTransport,%uMHz,x%u\n", + dd->ipath_lbus_speed, + dd->ipath_lbus_width); +} + +static int ipath_ht_intconfig(struct ipath_devdata *dd) +{ + int ret; + + if (dd->ipath_intconfig) { + ipath_write_kreg(dd, dd->ipath_kregs->kr_interruptconfig, + dd->ipath_intconfig); /* interrupt address */ + ret = 0; + } else { + ipath_dev_err(dd, "No interrupts enabled, couldn't setup " + "interrupt address\n"); + ret = -EINVAL; + } + + return ret; +} + +static void ipath_ht_irq_update(struct pci_dev *dev, int irq, + struct ht_irq_msg *msg) +{ + struct ipath_devdata *dd = pci_get_drvdata(dev); + u64 prev_intconfig = dd->ipath_intconfig; + + dd->ipath_intconfig = msg->address_lo; + dd->ipath_intconfig |= ((u64) msg->address_hi) << 32; + + /* + * If the previous value of dd->ipath_intconfig is zero, we're + * getting configured for the first time, and must not program the + * intconfig register here (it will be programmed later, when the + * hardware is ready). Otherwise, we should. + */ + if (prev_intconfig) + ipath_ht_intconfig(dd); +} + +/** + * ipath_setup_ht_config - setup the interruptconfig register + * @dd: the infinipath device + * @pdev: the PCI device + * + * setup the interruptconfig register from the HT config info. + * Also clear CRC errors in HT linkcontrol, if necessary. + * This is done only for the real hardware. It is done before + * chip address space is initted, so can't touch infinipath registers + */ +static int ipath_setup_ht_config(struct ipath_devdata *dd, + struct pci_dev *pdev) +{ + int pos, ret; + + ret = __ht_create_irq(pdev, 0, ipath_ht_irq_update); + if (ret < 0) { + ipath_dev_err(dd, "Couldn't create interrupt handler: " + "err %d\n", ret); + goto bail; + } + dd->ipath_irq = ret; + ret = 0; + + /* + * Handle clearing CRC errors in linkctrl register if necessary. We + * do this early, before we ever enable errors or hardware errors, + * mostly to avoid causing the chip to enter freeze mode. + */ + pos = pci_find_capability(pdev, PCI_CAP_ID_HT); + if (!pos) { + ipath_dev_err(dd, "Couldn't find HyperTransport " + "capability; no interrupts\n"); + ret = -ENODEV; + goto bail; + } + do { + u8 cap_type; + + /* + * The HT capability type byte is 3 bytes after the + * capability byte. + */ + if (pci_read_config_byte(pdev, pos + 3, &cap_type)) { + dev_info(&pdev->dev, "Couldn't read config " + "command @ %d\n", pos); + continue; + } + if (!(cap_type & 0xE0)) + slave_or_pri_blk(dd, pdev, pos, cap_type); + } while ((pos = pci_find_next_capability(pdev, pos, + PCI_CAP_ID_HT))); + + dd->ipath_flags |= IPATH_SWAP_PIOBUFS; + +bail: + return ret; +} + +/** + * ipath_setup_ht_cleanup - clean up any per-chip chip-specific stuff + * @dd: the infinipath device + * + * Called during driver unload. + * This is currently a nop for the HT chip, not for all chips + */ +static void ipath_setup_ht_cleanup(struct ipath_devdata *dd) +{ +} + +/** + * ipath_setup_ht_setextled - set the state of the two external LEDs + * @dd: the infinipath device + * @lst: the L state + * @ltst: the LT state + * + * Set the state of the two external LEDs, to indicate physical and + * logical state of IB link. For this chip (at least with recommended + * board pinouts), LED1 is Green (physical state), and LED2 is Yellow + * (logical state) + * + * Note: We try to match the Mellanox HCA LED behavior as best + * we can. 
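[Editor's illustration, not part of the patch.] slave_or_pri_blk() above reads the HT link width and frequency registers from config space and maps their small encodings to usable numbers (bit width and MHz) with a switch, falling back to conservative defaults for reserved codes. The two decoders in isolation; the encodings simply mirror the switch statements in the hunk and are shown for illustration, not as a reference for the HT specification.

#include <stdio.h>

/* 3-bit link width encoding, as decoded in slave_or_pri_blk() */
static unsigned demo_ht_width_bits(unsigned code)
{
    switch (code & 7) {
    case 5: return 4;
    case 4: return 2;
    case 3: return 32;
    case 1: return 16;
    case 0:
    default: return 8;      /* unknown: assume 8 bit */
    }
}

/* 4-bit link frequency encoding -> MHz */
static unsigned demo_ht_freq_mhz(unsigned code)
{
    switch (code & 0xf) {
    case 6: return 1000;
    case 5: return 800;
    case 4: return 600;
    case 3: return 500;
    case 2: return 400;
    case 1: return 300;
    case 0:
    default: return 200;    /* reserved/vendor-specific: assume 200 */
    }
}

int main(void)
{
    unsigned widthreg = 0x11;   /* 16 bit in both directions */

    printf("width %u bits, freq %u MHz\n",
           demo_ht_width_bits(widthreg & 7), demo_ht_freq_mhz(5));
    return 0;
}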
Green indicates physical link state is OK (something is + * plugged in, and we can train). + * Amber indicates the link is logically up (ACTIVE). + * Mellanox further blinks the amber LED to indicate data packet + * activity, but we have no hardware support for that, so it would + * require waking up every 10-20 msecs and checking the counters + * on the chip, and then turning the LED off if appropriate. That's + * visible overhead, so not something we will do. + * + */ +static void ipath_setup_ht_setextled(struct ipath_devdata *dd, + u64 lst, u64 ltst) +{ + u64 extctl; + unsigned long flags = 0; + + /* the diags use the LED to indicate diag info, so we leave + * the external LED alone when the diags are running */ + if (ipath_diag_inuse) + return; + + /* Allow override of LED display for, e.g. Locating system in rack */ + if (dd->ipath_led_override) { + ltst = (dd->ipath_led_override & IPATH_LED_PHYS) + ? INFINIPATH_IBCS_LT_STATE_LINKUP + : INFINIPATH_IBCS_LT_STATE_DISABLED; + lst = (dd->ipath_led_override & IPATH_LED_LOG) + ? INFINIPATH_IBCS_L_STATE_ACTIVE + : INFINIPATH_IBCS_L_STATE_DOWN; + } + + spin_lock_irqsave(&dd->ipath_gpio_lock, flags); + /* + * start by setting both LED control bits to off, then turn + * on the appropriate bit(s). + */ + if (dd->ipath_boardrev == 8) { /* LS/X-1 uses different pins */ + /* + * major difference is that INFINIPATH_EXTC_LEDGBLERR_OFF + * is inverted, because it is normally used to indicate + * a hardware fault at reset, if there were errors + */ + extctl = (dd->ipath_extctrl & ~INFINIPATH_EXTC_LEDGBLOK_ON) + | INFINIPATH_EXTC_LEDGBLERR_OFF; + if (ltst == INFINIPATH_IBCS_LT_STATE_LINKUP) + extctl &= ~INFINIPATH_EXTC_LEDGBLERR_OFF; + if (lst == INFINIPATH_IBCS_L_STATE_ACTIVE) + extctl |= INFINIPATH_EXTC_LEDGBLOK_ON; + } + else { + extctl = dd->ipath_extctrl & + ~(INFINIPATH_EXTC_LED1PRIPORT_ON | + INFINIPATH_EXTC_LED2PRIPORT_ON); + if (ltst == INFINIPATH_IBCS_LT_STATE_LINKUP) + extctl |= INFINIPATH_EXTC_LED1PRIPORT_ON; + if (lst == INFINIPATH_IBCS_L_STATE_ACTIVE) + extctl |= INFINIPATH_EXTC_LED2PRIPORT_ON; + } + dd->ipath_extctrl = extctl; + ipath_write_kreg(dd, dd->ipath_kregs->kr_extctrl, extctl); + spin_unlock_irqrestore(&dd->ipath_gpio_lock, flags); +} + +static void ipath_init_ht_variables(struct ipath_devdata *dd) +{ + /* + * setup the register offsets, since they are different for each + * chip + */ + dd->ipath_kregs = &ipath_ht_kregs; + dd->ipath_cregs = &ipath_ht_cregs; + + dd->ipath_gpio_sda_num = _IPATH_GPIO_SDA_NUM; + dd->ipath_gpio_scl_num = _IPATH_GPIO_SCL_NUM; + dd->ipath_gpio_sda = IPATH_GPIO_SDA; + dd->ipath_gpio_scl = IPATH_GPIO_SCL; + + /* + * Fill in data for field-values that change in newer chips. + * We dynamically specify only the mask for LINKTRAININGSTATE + * and only the shift for LINKSTATE, as they are the only ones + * that change. Also precalculate the 3 link states of interest + * and the combined mask. 
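[Editor's illustration, not part of the patch.] ipath_setup_ht_setextled() above is a read-modify-write of the external-control register: clear both LED bits, then set the physical-link LED when the training state reaches LINKUP and the logical-link LED when the link state is ACTIVE, with an optional override for locating a board. A cut-down version of that update; the bit positions and state values are placeholders.

#include <stdint.h>
#include <stdio.h>

#define DEMO_LED1_PHYS  (1ULL << 0)     /* green: physical link trained */
#define DEMO_LED2_LOG   (1ULL << 1)     /* amber: link logically ACTIVE */

enum { DEMO_LT_LINKUP = 1, DEMO_L_ACTIVE = 2 };

/* Compute the new extctrl value from the current one and the IB state. */
static uint64_t demo_set_leds(uint64_t extctrl, int ltstate, int lstate)
{
    extctrl &= ~(DEMO_LED1_PHYS | DEMO_LED2_LOG);   /* both off first */
    if (ltstate == DEMO_LT_LINKUP)
        extctrl |= DEMO_LED1_PHYS;
    if (lstate == DEMO_L_ACTIVE)
        extctrl |= DEMO_LED2_LOG;
    return extctrl;
}

int main(void)
{
    uint64_t reg = 0xf0;    /* unrelated bits must be preserved */

    reg = demo_set_leds(reg, DEMO_LT_LINKUP, 0);
    printf("trained, not active: 0x%llx\n", (unsigned long long)reg);
    reg = demo_set_leds(reg, DEMO_LT_LINKUP, DEMO_L_ACTIVE);
    printf("active:              0x%llx\n", (unsigned long long)reg);
    return 0;
}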
+ */ + dd->ibcs_ls_shift = IBA6110_IBCS_LINKSTATE_SHIFT; + dd->ibcs_lts_mask = IBA6110_IBCS_LINKTRAININGSTATE_MASK; + dd->ibcs_mask = (INFINIPATH_IBCS_LINKSTATE_MASK << + dd->ibcs_ls_shift) | dd->ibcs_lts_mask; + dd->ib_init = (INFINIPATH_IBCS_LT_STATE_LINKUP << + INFINIPATH_IBCS_LINKTRAININGSTATE_SHIFT) | + (INFINIPATH_IBCS_L_STATE_INIT << dd->ibcs_ls_shift); + dd->ib_arm = (INFINIPATH_IBCS_LT_STATE_LINKUP << + INFINIPATH_IBCS_LINKTRAININGSTATE_SHIFT) | + (INFINIPATH_IBCS_L_STATE_ARM << dd->ibcs_ls_shift); + dd->ib_active = (INFINIPATH_IBCS_LT_STATE_LINKUP << + INFINIPATH_IBCS_LINKTRAININGSTATE_SHIFT) | + (INFINIPATH_IBCS_L_STATE_ACTIVE << dd->ibcs_ls_shift); + + /* + * Fill in data for ibcc field-values that change in newer chips. + * We dynamically specify only the mask for LINKINITCMD + * and only the shift for LINKCMD and MAXPKTLEN, as they are + * the only ones that change. + */ + dd->ibcc_lic_mask = INFINIPATH_IBCC_LINKINITCMD_MASK; + dd->ibcc_lc_shift = INFINIPATH_IBCC_LINKCMD_SHIFT; + dd->ibcc_mpl_shift = INFINIPATH_IBCC_MAXPKTLEN_SHIFT; + + /* Fill in shifts for RcvCtrl. */ + dd->ipath_r_portenable_shift = INFINIPATH_R_PORTENABLE_SHIFT; + dd->ipath_r_intravail_shift = INFINIPATH_R_INTRAVAIL_SHIFT; + dd->ipath_r_tailupd_shift = INFINIPATH_R_TAILUPD_SHIFT; + dd->ipath_r_portcfg_shift = 0; /* Not on IBA6110 */ + + dd->ipath_i_bitsextant = + (INFINIPATH_I_RCVURG_MASK << INFINIPATH_I_RCVURG_SHIFT) | + (INFINIPATH_I_RCVAVAIL_MASK << + INFINIPATH_I_RCVAVAIL_SHIFT) | + INFINIPATH_I_ERROR | INFINIPATH_I_SPIOSENT | + INFINIPATH_I_SPIOBUFAVAIL | INFINIPATH_I_GPIO; + + dd->ipath_e_bitsextant = + INFINIPATH_E_RFORMATERR | INFINIPATH_E_RVCRC | + INFINIPATH_E_RICRC | INFINIPATH_E_RMINPKTLEN | + INFINIPATH_E_RMAXPKTLEN | INFINIPATH_E_RLONGPKTLEN | + INFINIPATH_E_RSHORTPKTLEN | INFINIPATH_E_RUNEXPCHAR | + INFINIPATH_E_RUNSUPVL | INFINIPATH_E_REBP | + INFINIPATH_E_RIBFLOW | INFINIPATH_E_RBADVERSION | + INFINIPATH_E_RRCVEGRFULL | INFINIPATH_E_RRCVHDRFULL | + INFINIPATH_E_RBADTID | INFINIPATH_E_RHDRLEN | + INFINIPATH_E_RHDR | INFINIPATH_E_RIBLOSTLINK | + INFINIPATH_E_SMINPKTLEN | INFINIPATH_E_SMAXPKTLEN | + INFINIPATH_E_SUNDERRUN | INFINIPATH_E_SPKTLEN | + INFINIPATH_E_SDROPPEDSMPPKT | INFINIPATH_E_SDROPPEDDATAPKT | + INFINIPATH_E_SPIOARMLAUNCH | INFINIPATH_E_SUNEXPERRPKTNUM | + INFINIPATH_E_SUNSUPVL | INFINIPATH_E_IBSTATUSCHANGED | + INFINIPATH_E_INVALIDADDR | INFINIPATH_E_RESET | + INFINIPATH_E_HARDWARE; + + dd->ipath_hwe_bitsextant = + (INFINIPATH_HWE_HTCMEMPARITYERR_MASK << + INFINIPATH_HWE_HTCMEMPARITYERR_SHIFT) | + (INFINIPATH_HWE_TXEMEMPARITYERR_MASK << + INFINIPATH_HWE_TXEMEMPARITYERR_SHIFT) | + (INFINIPATH_HWE_RXEMEMPARITYERR_MASK << + INFINIPATH_HWE_RXEMEMPARITYERR_SHIFT) | + INFINIPATH_HWE_HTCLNKABYTE0CRCERR | + INFINIPATH_HWE_HTCLNKABYTE1CRCERR | + INFINIPATH_HWE_HTCLNKBBYTE0CRCERR | + INFINIPATH_HWE_HTCLNKBBYTE1CRCERR | + INFINIPATH_HWE_HTCMISCERR4 | + INFINIPATH_HWE_HTCMISCERR5 | INFINIPATH_HWE_HTCMISCERR6 | + INFINIPATH_HWE_HTCMISCERR7 | + INFINIPATH_HWE_HTCBUSTREQPARITYERR | + INFINIPATH_HWE_HTCBUSTRESPPARITYERR | + INFINIPATH_HWE_HTCBUSIREQPARITYERR | + INFINIPATH_HWE_RXDSYNCMEMPARITYERR | + INFINIPATH_HWE_MEMBISTFAILED | + INFINIPATH_HWE_COREPLL_FBSLIP | + INFINIPATH_HWE_COREPLL_RFSLIP | + INFINIPATH_HWE_HTBPLL_FBSLIP | + INFINIPATH_HWE_HTBPLL_RFSLIP | + INFINIPATH_HWE_HTAPLL_FBSLIP | + INFINIPATH_HWE_HTAPLL_RFSLIP | + INFINIPATH_HWE_SERDESPLLFAILED | + INFINIPATH_HWE_IBCBUSTOSPCPARITYERR | + INFINIPATH_HWE_IBCBUSFRSPCPARITYERR; + + dd->ipath_i_rcvavail_mask = 
INFINIPATH_I_RCVAVAIL_MASK; + dd->ipath_i_rcvurg_mask = INFINIPATH_I_RCVURG_MASK; + dd->ipath_i_rcvavail_shift = INFINIPATH_I_RCVAVAIL_SHIFT; + dd->ipath_i_rcvurg_shift = INFINIPATH_I_RCVURG_SHIFT; + + /* + * EEPROM error log 0 is TXE Parity errors. 1 is RXE Parity. + * 2 is Some Misc, 3 is reserved for future. + */ + dd->ipath_eep_st_masks[0].hwerrs_to_log = + INFINIPATH_HWE_TXEMEMPARITYERR_MASK << + INFINIPATH_HWE_TXEMEMPARITYERR_SHIFT; + + dd->ipath_eep_st_masks[1].hwerrs_to_log = + INFINIPATH_HWE_RXEMEMPARITYERR_MASK << + INFINIPATH_HWE_RXEMEMPARITYERR_SHIFT; + + dd->ipath_eep_st_masks[2].errs_to_log = INFINIPATH_E_RESET; + + dd->delay_mult = 2; /* SDR, 4X, can't change */ + + dd->ipath_link_width_supported = IB_WIDTH_1X | IB_WIDTH_4X; + dd->ipath_link_speed_supported = IPATH_IB_SDR; + dd->ipath_link_width_enabled = IB_WIDTH_4X; + dd->ipath_link_speed_enabled = dd->ipath_link_speed_supported; + /* these can't change for this chip, so set once */ + dd->ipath_link_width_active = dd->ipath_link_width_enabled; + dd->ipath_link_speed_active = dd->ipath_link_speed_enabled; +} + +/** + * ipath_ht_init_hwerrors - enable hardware errors + * @dd: the infinipath device + * + * now that we have finished initializing everything that might reasonably + * cause a hardware error, and cleared those errors bits as they occur, + * we can enable hardware errors in the mask (potentially enabling + * freeze mode), and enable hardware errors as errors (along with + * everything else) in errormask + */ +static void ipath_ht_init_hwerrors(struct ipath_devdata *dd) +{ + ipath_err_t val; + u64 extsval; + + extsval = ipath_read_kreg64(dd, dd->ipath_kregs->kr_extstatus); + + if (!(extsval & INFINIPATH_EXTS_MEMBIST_ENDTEST)) + ipath_dev_err(dd, "MemBIST did not complete!\n"); + if (extsval & INFINIPATH_EXTS_MEMBIST_CORRECT) + ipath_dbg("MemBIST corrected\n"); + + ipath_check_htlink(dd); + + /* barring bugs, all hwerrors become interrupts, which can */ + val = -1LL; + /* don't look at crc lane1 if 8 bit */ + if (dd->ipath_flags & IPATH_8BIT_IN_HT0) + val &= ~infinipath_hwe_htclnkabyte1crcerr; + /* don't look at crc lane1 if 8 bit */ + if (dd->ipath_flags & IPATH_8BIT_IN_HT1) + val &= ~infinipath_hwe_htclnkbbyte1crcerr; + + /* + * disable RXDSYNCMEMPARITY because external serdes is unused, + * and therefore the logic will never be used or initialized, + * and uninitialized state will normally result in this error + * being asserted. Similarly for the external serdess pll + * lock signal. + */ + val &= ~(INFINIPATH_HWE_SERDESPLLFAILED | + INFINIPATH_HWE_RXDSYNCMEMPARITYERR); + + /* + * Disable MISCERR4 because of an inversion in the HT core + * logic checking for errors that cause this bit to be set. + * The errata can also cause the protocol error bit to be set + * in the HT config space linkerror register(s). 
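[Editor's illustration, not part of the patch.] ipath_eep_st_masks[] above assigns each EEPROM log slot a mask of error bits (slot 0 TXE parity, slot 1 RXE parity, slot 2 reset); the error handler loops over the slots and bumps a counter whenever the current error word intersects a slot's mask. The counting loop reduced to plain C with invented masks.

#include <stdint.h>
#include <stdio.h>

#define DEMO_EEP_LOG_CNT 3

/* which error bits each log slot is responsible for (illustrative) */
static const uint64_t demo_log_masks[DEMO_EEP_LOG_CNT] = {
    0x00ffULL,      /* slot 0: "TXE parity" style errors */
    0xff00ULL,      /* slot 1: "RXE parity" style errors */
    1ULL << 16,     /* slot 2: reset */
};

static unsigned demo_log_counts[DEMO_EEP_LOG_CNT];

static void demo_log_hwerrs(uint64_t hwerrs)
{
    int i;

    for (i = 0; i < DEMO_EEP_LOG_CNT; i++)
        if (hwerrs & demo_log_masks[i])
            demo_log_counts[i]++;   /* the driver defers the EEPROM write */
}

int main(void)
{
    demo_log_hwerrs(0x0101);        /* hits slot 0 and slot 1 */
    demo_log_hwerrs(0x0001);        /* hits slot 0 only */
    printf("slot counts: %u %u %u\n",
           demo_log_counts[0], demo_log_counts[1], demo_log_counts[2]);
    return 0;
}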
+ */ + val &= ~INFINIPATH_HWE_HTCMISCERR4; + + /* + * PLL ignored because unused MDIO interface has a logic problem + */ + if (dd->ipath_boardrev == 4 || dd->ipath_boardrev == 9) + val &= ~INFINIPATH_HWE_SERDESPLLFAILED; + dd->ipath_hwerrmask = val; +} + + + + +/** + * ipath_ht_bringup_serdes - bring up the serdes + * @dd: the infinipath device + */ +static int ipath_ht_bringup_serdes(struct ipath_devdata *dd) +{ + u64 val, config1; + int ret = 0, change = 0; + + ipath_dbg("Trying to bringup serdes\n"); + + if (ipath_read_kreg64(dd, dd->ipath_kregs->kr_hwerrstatus) & + INFINIPATH_HWE_SERDESPLLFAILED) + { + ipath_dbg("At start, serdes PLL failed bit set in " + "hwerrstatus, clearing and continuing\n"); + ipath_write_kreg(dd, dd->ipath_kregs->kr_hwerrclear, + INFINIPATH_HWE_SERDESPLLFAILED); + } + + val = ipath_read_kreg64(dd, dd->ipath_kregs->kr_serdesconfig0); + config1 = ipath_read_kreg64(dd, dd->ipath_kregs->kr_serdesconfig1); + + ipath_cdbg(VERBOSE, "Initial serdes status is config0=%llx " + "config1=%llx, sstatus=%llx xgxs %llx\n", + (unsigned long long) val, (unsigned long long) config1, + (unsigned long long) + ipath_read_kreg64(dd, dd->ipath_kregs->kr_serdesstatus), + (unsigned long long) + ipath_read_kreg64(dd, dd->ipath_kregs->kr_xgxsconfig)); + + /* force reset on */ + val |= INFINIPATH_SERDC0_RESET_PLL + /* | INFINIPATH_SERDC0_RESET_MASK */ + ; + ipath_write_kreg(dd, dd->ipath_kregs->kr_serdesconfig0, val); + udelay(15); /* need pll reset set at least for a bit */ + + if (val & INFINIPATH_SERDC0_RESET_PLL) { + u64 val2 = val &= ~INFINIPATH_SERDC0_RESET_PLL; + /* set lane resets, and tx idle, during pll reset */ + val2 |= INFINIPATH_SERDC0_RESET_MASK | + INFINIPATH_SERDC0_TXIDLE; + ipath_cdbg(VERBOSE, "Clearing serdes PLL reset (writing " + "%llx)\n", (unsigned long long) val2); + ipath_write_kreg(dd, dd->ipath_kregs->kr_serdesconfig0, + val2); + /* + * be sure chip saw it + */ + val = ipath_read_kreg64(dd, dd->ipath_kregs->kr_scratch); + /* + * need pll reset clear at least 11 usec before lane + * resets cleared; give it a few more + */ + udelay(15); + val = val2; /* for check below */ + } + + if (val & (INFINIPATH_SERDC0_RESET_PLL | + INFINIPATH_SERDC0_RESET_MASK | + INFINIPATH_SERDC0_TXIDLE)) { + val &= ~(INFINIPATH_SERDC0_RESET_PLL | + INFINIPATH_SERDC0_RESET_MASK | + INFINIPATH_SERDC0_TXIDLE); + /* clear them */ + ipath_write_kreg(dd, dd->ipath_kregs->kr_serdesconfig0, + val); + } + + val = ipath_read_kreg64(dd, dd->ipath_kregs->kr_xgxsconfig); + if (val & INFINIPATH_XGXS_RESET) { + /* normally true after boot */ + val &= ~INFINIPATH_XGXS_RESET; + change = 1; + } + if (((val >> INFINIPATH_XGXS_RX_POL_SHIFT) & + INFINIPATH_XGXS_RX_POL_MASK) != dd->ipath_rx_pol_inv ) { + /* need to compensate for Tx inversion in partner */ + val &= ~(INFINIPATH_XGXS_RX_POL_MASK << + INFINIPATH_XGXS_RX_POL_SHIFT); + val |= dd->ipath_rx_pol_inv << + INFINIPATH_XGXS_RX_POL_SHIFT; + change = 1; + } + if (change) + ipath_write_kreg(dd, dd->ipath_kregs->kr_xgxsconfig, val); + + val = ipath_read_kreg64(dd, dd->ipath_kregs->kr_serdesconfig0); + + /* clear current and de-emphasis bits */ + config1 &= ~0x0ffffffff00ULL; + /* set current to 20ma */ + config1 |= 0x00000000000ULL; + /* set de-emphasis to -5.68dB */ + config1 |= 0x0cccc000000ULL; + ipath_write_kreg(dd, dd->ipath_kregs->kr_serdesconfig1, config1); + + ipath_cdbg(VERBOSE, "After setup: serdes status is config0=%llx " + "config1=%llx, sstatus=%llx xgxs %llx\n", + (unsigned long long) val, (unsigned long long) config1, + (unsigned long long) + 
ipath_read_kreg64(dd, dd->ipath_kregs->kr_serdesstatus), + (unsigned long long) + ipath_read_kreg64(dd, dd->ipath_kregs->kr_xgxsconfig)); + + return ret; /* for now, say we always succeeded */ +} + +/** + * ipath_ht_quiet_serdes - set serdes to txidle + * @dd: the infinipath device + * driver is being unloaded + */ +static void ipath_ht_quiet_serdes(struct ipath_devdata *dd) +{ + u64 val = ipath_read_kreg64(dd, dd->ipath_kregs->kr_serdesconfig0); + + val |= INFINIPATH_SERDC0_TXIDLE; + ipath_dbg("Setting TxIdleEn on serdes (config0 = %llx)\n", + (unsigned long long) val); + ipath_write_kreg(dd, dd->ipath_kregs->kr_serdesconfig0, val); +} + +/** + * ipath_pe_put_tid - write a TID in chip + * @dd: the infinipath device + * @tidptr: pointer to the expected TID (in chip) to update + * @tidtype: RCVHQ_RCV_TYPE_EAGER (1) for eager, RCVHQ_RCV_TYPE_EXPECTED (0) for expected + * @pa: physical address of in memory buffer; ipath_tidinvalid if freeing + * + * This exists as a separate routine to allow for special locking etc. + * It's used for both the full cleanup on exit, as well as the normal + * setup and teardown. + */ +static void ipath_ht_put_tid(struct ipath_devdata *dd, + u64 __iomem *tidptr, u32 type, + unsigned long pa) +{ + if (!dd->ipath_kregbase) + return; + + if (pa != dd->ipath_tidinvalid) { + if (unlikely((pa & ~INFINIPATH_RT_ADDR_MASK))) { + dev_info(&dd->pcidev->dev, + "physaddr %lx has more than " + "40 bits, using only 40!!!\n", pa); + pa &= INFINIPATH_RT_ADDR_MASK; + } + if (type == RCVHQ_RCV_TYPE_EAGER) + pa |= dd->ipath_tidtemplate; + else { + /* in words (fixed, full page). */ + u64 lenvalid = PAGE_SIZE >> 2; + lenvalid <<= INFINIPATH_RT_BUFSIZE_SHIFT; + pa |= lenvalid | INFINIPATH_RT_VALID; + } + } + + writeq(pa, tidptr); +} + + +/** + * ipath_ht_clear_tid - clear all TID entries for a port, expected and eager + * @dd: the infinipath device + * @port: the port + * + * Used from ipath_close(), and at chip initialization. 
+ */ +static void ipath_ht_clear_tids(struct ipath_devdata *dd, unsigned port) +{ + u64 __iomem *tidbase; + int i; + + if (!dd->ipath_kregbase) + return; + + ipath_cdbg(VERBOSE, "Invalidate TIDs for port %u\n", port); + + /* + * need to invalidate all of the expected TID entries for this + * port, so we don't have valid entries that might somehow get + * used (early in next use of this port, or through some bug) + */ + tidbase = (u64 __iomem *) ((char __iomem *)(dd->ipath_kregbase) + + dd->ipath_rcvtidbase + + port * dd->ipath_rcvtidcnt * + sizeof(*tidbase)); + for (i = 0; i < dd->ipath_rcvtidcnt; i++) + ipath_ht_put_tid(dd, &tidbase[i], RCVHQ_RCV_TYPE_EXPECTED, + dd->ipath_tidinvalid); + + tidbase = (u64 __iomem *) ((char __iomem *)(dd->ipath_kregbase) + + dd->ipath_rcvegrbase + + port * dd->ipath_rcvegrcnt * + sizeof(*tidbase)); + + for (i = 0; i < dd->ipath_rcvegrcnt; i++) + ipath_ht_put_tid(dd, &tidbase[i], RCVHQ_RCV_TYPE_EAGER, + dd->ipath_tidinvalid); +} + +/** + * ipath_ht_tidtemplate - setup constants for TID updates + * @dd: the infinipath device + * + * We setup stuff that we use a lot, to avoid calculating each time + */ +static void ipath_ht_tidtemplate(struct ipath_devdata *dd) +{ + dd->ipath_tidtemplate = dd->ipath_ibmaxlen >> 2; + dd->ipath_tidtemplate <<= INFINIPATH_RT_BUFSIZE_SHIFT; + dd->ipath_tidtemplate |= INFINIPATH_RT_VALID; + + /* + * work around chip errata bug 7358, by marking invalid tids + * as having max length + */ + dd->ipath_tidinvalid = (-1LL & INFINIPATH_RT_BUFSIZE_MASK) << + INFINIPATH_RT_BUFSIZE_SHIFT; +} + +static int ipath_ht_early_init(struct ipath_devdata *dd) +{ + u32 __iomem *piobuf; + u32 pioincr, val32; + int i; + + /* + * one cache line; long IB headers will spill over into received + * buffer + */ + dd->ipath_rcvhdrentsize = 16; + dd->ipath_rcvhdrsize = IPATH_DFLT_RCVHDRSIZE; + + /* + * For HT, we allocate a somewhat overly large eager buffer, + * such that we can guarantee that we can receive the largest + * packet that we can send out. To truly support a 4KB MTU, + * we need to bump this to a large value. To date, other than + * testing, we have never encountered an HCA that can really + * send 4KB MTU packets, so we do not handle that (we'll get + * errors interrupts if we ever see one). + */ + dd->ipath_rcvegrbufsize = dd->ipath_piosize2k; + + /* + * the min() check here is currently a nop, but it may not + * always be, depending on just how we do ipath_rcvegrbufsize + */ + dd->ipath_ibmaxlen = min(dd->ipath_piosize2k, + dd->ipath_rcvegrbufsize); + dd->ipath_init_ibmaxlen = dd->ipath_ibmaxlen; + ipath_ht_tidtemplate(dd); + + /* + * zero all the TID entries at startup. We do this for sanity, + * in case of a previous driver crash of some kind, and also + * because the chip powers up with these memories in an unknown + * state. Use portcnt, not cfgports, since this is for the + * full chip, not for current (possibly different) configuration + * value. + * Chip Errata bug 6447 + */ + for (val32 = 0; val32 < dd->ipath_portcnt; val32++) + ipath_ht_clear_tids(dd, val32); + + /* + * write the pbc of each buffer, to be sure it's initialized, then + * cancel all the buffers, and also abort any packets that might + * have been in flight for some reason (the latter is for driver + * unload/reload, but isn't a bad idea at first init). PIO send + * isn't enabled at this point, so there is no danger of sending + * these out on the wire. 
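Aside: the TID updates done by ipath_ht_put_tid() and ipath_ht_tidtemplate() above boil down to OR-ing a 40-bit physical address with a buffer-length field (in 32-bit words) and a valid bit. The stand-alone sketch below illustrates that packing; the DEMO_* shift, mask, and bit positions are invented for illustration and are not the chip's real INFINIPATH_RT_* layout.

/* Illustrative only: pack a TID entry the way the expected-TID path above
 * does -- physical address, length in dwords, valid bit. Field positions
 * here are assumed placeholders, not the real register layout. */
#include <stdint.h>
#include <stdio.h>

#define DEMO_RT_ADDR_MASK     ((1ULL << 40) - 1)  /* 40-bit DMA address   */
#define DEMO_RT_BUFSIZE_SHIFT 48                  /* assumed field offset */
#define DEMO_RT_VALID         (1ULL << 63)        /* assumed valid bit    */

static uint64_t demo_pack_tid(uint64_t pa, uint32_t len_bytes)
{
        uint64_t words = len_bytes >> 2;        /* length in 32-bit words */

        return (pa & DEMO_RT_ADDR_MASK) |
               (words << DEMO_RT_BUFSIZE_SHIFT) |
               DEMO_RT_VALID;
}

int main(void)
{
        printf("TID entry: %#llx\n",
               (unsigned long long)demo_pack_tid(0x12345000ULL, 4096));
        return 0;
}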
+ * Chip Errata bug 6610 + */ + piobuf = (u32 __iomem *) (((char __iomem *)(dd->ipath_kregbase)) + + dd->ipath_piobufbase); + pioincr = dd->ipath_palign / sizeof(*piobuf); + for (i = 0; i < dd->ipath_piobcnt2k; i++) { + /* + * reasonable word count, just to init pbc + */ + writel(16, piobuf); + piobuf += pioincr; + } + + ipath_get_eeprom_info(dd); + if (dd->ipath_boardrev == 5) { + /* + * Later production QHT7040 has same changes as QHT7140, so + * can use GPIO interrupts. They have serial #'s starting + * with 128, rather than 112. + */ + if (dd->ipath_serial[0] == '1' && + dd->ipath_serial[1] == '2' && + dd->ipath_serial[2] == '8') + dd->ipath_flags |= IPATH_GPIO_INTR; + else { + ipath_dev_err(dd, "Unsupported InfiniPath board " + "(serial number %.16s)!\n", + dd->ipath_serial); + return 1; + } + } + + if (dd->ipath_minrev >= 4) { + /* Rev4+ reports extra errors via internal GPIO pins */ + dd->ipath_flags |= IPATH_GPIO_ERRINTRS; + dd->ipath_gpio_mask |= IPATH_GPIO_ERRINTR_MASK; + ipath_write_kreg(dd, dd->ipath_kregs->kr_gpio_mask, + dd->ipath_gpio_mask); + } + + return 0; +} + + +/** + * ipath_init_ht_get_base_info - set chip-specific flags for user code + * @dd: the infinipath device + * @kbase: ipath_base_info pointer + * + * We set the PCIE flag because the lower bandwidth on PCIe vs + * HyperTransport can affect some user packet algorithms. + */ +static int ipath_ht_get_base_info(struct ipath_portdata *pd, void *kbase) +{ + struct ipath_base_info *kinfo = kbase; + + kinfo->spi_runtime_flags |= IPATH_RUNTIME_HT | + IPATH_RUNTIME_PIO_REGSWAPPED; + + if (pd->port_dd->ipath_minrev < 4) + kinfo->spi_runtime_flags |= IPATH_RUNTIME_RCVHDR_COPY; + + return 0; +} + +static void ipath_ht_free_irq(struct ipath_devdata *dd) +{ + free_irq(dd->ipath_irq, dd); + ht_destroy_irq(dd->ipath_irq); + dd->ipath_irq = 0; + dd->ipath_intconfig = 0; +} + +static struct ipath_message_header * +ipath_ht_get_msgheader(struct ipath_devdata *dd, __le32 *rhf_addr) +{ + return (struct ipath_message_header *) + &rhf_addr[sizeof(u64) / sizeof(u32)]; +} + +static void ipath_ht_config_ports(struct ipath_devdata *dd, ushort cfgports) +{ + dd->ipath_portcnt = + ipath_read_kreg32(dd, dd->ipath_kregs->kr_portcnt); + dd->ipath_p0_rcvegrcnt = + ipath_read_kreg32(dd, dd->ipath_kregs->kr_rcvegrcnt); +} + +static void ipath_ht_read_counters(struct ipath_devdata *dd, + struct infinipath_counters *cntrs) +{ + cntrs->LBIntCnt = + ipath_snap_cntr(dd, IPATH_CREG_OFFSET(LBIntCnt)); + cntrs->LBFlowStallCnt = + ipath_snap_cntr(dd, IPATH_CREG_OFFSET(LBFlowStallCnt)); + cntrs->TxSDmaDescCnt = 0; + cntrs->TxUnsupVLErrCnt = + ipath_snap_cntr(dd, IPATH_CREG_OFFSET(TxUnsupVLErrCnt)); + cntrs->TxDataPktCnt = + ipath_snap_cntr(dd, IPATH_CREG_OFFSET(TxDataPktCnt)); + cntrs->TxFlowPktCnt = + ipath_snap_cntr(dd, IPATH_CREG_OFFSET(TxFlowPktCnt)); + cntrs->TxDwordCnt = + ipath_snap_cntr(dd, IPATH_CREG_OFFSET(TxDwordCnt)); + cntrs->TxLenErrCnt = + ipath_snap_cntr(dd, IPATH_CREG_OFFSET(TxLenErrCnt)); + cntrs->TxMaxMinLenErrCnt = + ipath_snap_cntr(dd, IPATH_CREG_OFFSET(TxMaxMinLenErrCnt)); + cntrs->TxUnderrunCnt = + ipath_snap_cntr(dd, IPATH_CREG_OFFSET(TxUnderrunCnt)); + cntrs->TxFlowStallCnt = + ipath_snap_cntr(dd, IPATH_CREG_OFFSET(TxFlowStallCnt)); + cntrs->TxDroppedPktCnt = + ipath_snap_cntr(dd, IPATH_CREG_OFFSET(TxDroppedPktCnt)); + cntrs->RxDroppedPktCnt = + ipath_snap_cntr(dd, IPATH_CREG_OFFSET(RxDroppedPktCnt)); + cntrs->RxDataPktCnt = + ipath_snap_cntr(dd, IPATH_CREG_OFFSET(RxDataPktCnt)); + cntrs->RxFlowPktCnt = + ipath_snap_cntr(dd, 
IPATH_CREG_OFFSET(RxFlowPktCnt)); + cntrs->RxDwordCnt = + ipath_snap_cntr(dd, IPATH_CREG_OFFSET(RxDwordCnt)); + cntrs->RxLenErrCnt = + ipath_snap_cntr(dd, IPATH_CREG_OFFSET(RxLenErrCnt)); + cntrs->RxMaxMinLenErrCnt = + ipath_snap_cntr(dd, IPATH_CREG_OFFSET(RxMaxMinLenErrCnt)); + cntrs->RxICRCErrCnt = + ipath_snap_cntr(dd, IPATH_CREG_OFFSET(RxICRCErrCnt)); + cntrs->RxVCRCErrCnt = + ipath_snap_cntr(dd, IPATH_CREG_OFFSET(RxVCRCErrCnt)); + cntrs->RxFlowCtrlErrCnt = + ipath_snap_cntr(dd, IPATH_CREG_OFFSET(RxFlowCtrlErrCnt)); + cntrs->RxBadFormatCnt = + ipath_snap_cntr(dd, IPATH_CREG_OFFSET(RxBadFormatCnt)); + cntrs->RxLinkProblemCnt = + ipath_snap_cntr(dd, IPATH_CREG_OFFSET(RxLinkProblemCnt)); + cntrs->RxEBPCnt = + ipath_snap_cntr(dd, IPATH_CREG_OFFSET(RxEBPCnt)); + cntrs->RxLPCRCErrCnt = + ipath_snap_cntr(dd, IPATH_CREG_OFFSET(RxLPCRCErrCnt)); + cntrs->RxBufOvflCnt = + ipath_snap_cntr(dd, IPATH_CREG_OFFSET(RxBufOvflCnt)); + cntrs->RxTIDFullErrCnt = + ipath_snap_cntr(dd, IPATH_CREG_OFFSET(RxTIDFullErrCnt)); + cntrs->RxTIDValidErrCnt = + ipath_snap_cntr(dd, IPATH_CREG_OFFSET(RxTIDValidErrCnt)); + cntrs->RxPKeyMismatchCnt = + ipath_snap_cntr(dd, IPATH_CREG_OFFSET(RxPKeyMismatchCnt)); + cntrs->RxP0HdrEgrOvflCnt = + ipath_snap_cntr(dd, IPATH_CREG_OFFSET(RxP0HdrEgrOvflCnt)); + cntrs->RxP1HdrEgrOvflCnt = + ipath_snap_cntr(dd, IPATH_CREG_OFFSET(RxP1HdrEgrOvflCnt)); + cntrs->RxP2HdrEgrOvflCnt = + ipath_snap_cntr(dd, IPATH_CREG_OFFSET(RxP2HdrEgrOvflCnt)); + cntrs->RxP3HdrEgrOvflCnt = + ipath_snap_cntr(dd, IPATH_CREG_OFFSET(RxP3HdrEgrOvflCnt)); + cntrs->RxP4HdrEgrOvflCnt = + ipath_snap_cntr(dd, IPATH_CREG_OFFSET(RxP4HdrEgrOvflCnt)); + cntrs->RxP5HdrEgrOvflCnt = + ipath_snap_cntr(dd, IPATH_CREG_OFFSET(RxP5HdrEgrOvflCnt)); + cntrs->RxP6HdrEgrOvflCnt = + ipath_snap_cntr(dd, IPATH_CREG_OFFSET(RxP6HdrEgrOvflCnt)); + cntrs->RxP7HdrEgrOvflCnt = + ipath_snap_cntr(dd, IPATH_CREG_OFFSET(RxP7HdrEgrOvflCnt)); + cntrs->RxP8HdrEgrOvflCnt = + ipath_snap_cntr(dd, IPATH_CREG_OFFSET(RxP8HdrEgrOvflCnt)); + cntrs->RxP9HdrEgrOvflCnt = 0; + cntrs->RxP10HdrEgrOvflCnt = 0; + cntrs->RxP11HdrEgrOvflCnt = 0; + cntrs->RxP12HdrEgrOvflCnt = 0; + cntrs->RxP13HdrEgrOvflCnt = 0; + cntrs->RxP14HdrEgrOvflCnt = 0; + cntrs->RxP15HdrEgrOvflCnt = 0; + cntrs->RxP16HdrEgrOvflCnt = 0; + cntrs->IBStatusChangeCnt = + ipath_snap_cntr(dd, IPATH_CREG_OFFSET(IBStatusChangeCnt)); + cntrs->IBLinkErrRecoveryCnt = + ipath_snap_cntr(dd, IPATH_CREG_OFFSET(IBLinkErrRecoveryCnt)); + cntrs->IBLinkDownedCnt = + ipath_snap_cntr(dd, IPATH_CREG_OFFSET(IBLinkDownedCnt)); + cntrs->IBSymbolErrCnt = + ipath_snap_cntr(dd, IPATH_CREG_OFFSET(IBSymbolErrCnt)); + cntrs->RxVL15DroppedPktCnt = 0; + cntrs->RxOtherLocalPhyErrCnt = 0; + cntrs->PcieRetryBufDiagQwordCnt = 0; + cntrs->ExcessBufferOvflCnt = dd->ipath_overrun_thresh_errs; + cntrs->LocalLinkIntegrityErrCnt = + (dd->ipath_flags & IPATH_GPIO_ERRINTRS) ? + dd->ipath_lli_errs : dd->ipath_lli_errors; + cntrs->RxVlErrCnt = 0; + cntrs->RxDlidFltrCnt = 0; +} + + +/* no interrupt fallback for these chips */ +static int ipath_ht_nointr_fallback(struct ipath_devdata *dd) +{ + return 0; +} + + +/* + * reset the XGXS (between serdes and IBC). Slightly less intrusive + * than resetting the IBC or external link state, and useful in some + * cases to cause some retraining. To do this right, we reset IBC + * as well. 
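The long counter snapshot above is driven by IPATH_CREG_OFFSET(), which maps a counter name to a chip register index. A common way to build such a macro is offsetof() over a mirror struct; the sketch below is a stand-alone illustration under that assumption (the demo_counters struct and DEMO_CREG_OFFSET macro are invented), not the driver's actual definition.

/* Stand-alone sketch: derive a register index from a struct field name,
 * assuming counter registers are consecutive u64s laid out like the
 * software counters struct. */
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

struct demo_counters {
        uint64_t LBIntCnt;
        uint64_t TxDataPktCnt;
        uint64_t RxDataPktCnt;
};

#define DEMO_CREG_OFFSET(field) \
        (offsetof(struct demo_counters, field) / sizeof(uint64_t))

int main(void)
{
        printf("RxDataPktCnt is counter register %zu\n",
               DEMO_CREG_OFFSET(RxDataPktCnt));
        return 0;
}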
+ */ +static void ipath_ht_xgxs_reset(struct ipath_devdata *dd) +{ + u64 val, prev_val; + + prev_val = ipath_read_kreg64(dd, dd->ipath_kregs->kr_xgxsconfig); + val = prev_val | INFINIPATH_XGXS_RESET; + prev_val &= ~INFINIPATH_XGXS_RESET; /* be sure */ + ipath_write_kreg(dd, dd->ipath_kregs->kr_control, + dd->ipath_control & ~INFINIPATH_C_LINKENABLE); + ipath_write_kreg(dd, dd->ipath_kregs->kr_xgxsconfig, val); + ipath_read_kreg32(dd, dd->ipath_kregs->kr_scratch); + ipath_write_kreg(dd, dd->ipath_kregs->kr_xgxsconfig, prev_val); + ipath_write_kreg(dd, dd->ipath_kregs->kr_control, + dd->ipath_control); +} + + +static int ipath_ht_get_ib_cfg(struct ipath_devdata *dd, int which) +{ + int ret; + + switch (which) { + case IPATH_IB_CFG_LWID: + ret = dd->ipath_link_width_active; + break; + case IPATH_IB_CFG_SPD: + ret = dd->ipath_link_speed_active; + break; + case IPATH_IB_CFG_LWID_ENB: + ret = dd->ipath_link_width_enabled; + break; + case IPATH_IB_CFG_SPD_ENB: + ret = dd->ipath_link_speed_enabled; + break; + default: + ret = -ENOTSUPP; + break; + } + return ret; +} + + +/* we assume range checking is already done, if needed */ +static int ipath_ht_set_ib_cfg(struct ipath_devdata *dd, int which, u32 val) +{ + int ret = 0; + + if (which == IPATH_IB_CFG_LWID_ENB) + dd->ipath_link_width_enabled = val; + else if (which == IPATH_IB_CFG_SPD_ENB) + dd->ipath_link_speed_enabled = val; + else + ret = -ENOTSUPP; + return ret; +} + + +static void ipath_ht_config_jint(struct ipath_devdata *dd, u16 a, u16 b) +{ +} + + +static int ipath_ht_ib_updown(struct ipath_devdata *dd, int ibup, u64 ibcs) +{ + ipath_setup_ht_setextled(dd, ipath_ib_linkstate(dd, ibcs), + ipath_ib_linktrstate(dd, ibcs)); + return 0; +} + + +/** + * ipath_init_iba6110_funcs - set up the chip-specific function pointers + * @dd: the infinipath device + * + * This is global, and is called directly at init to set up the + * chip-specific function pointers for later use. 
+ */ +void ipath_init_iba6110_funcs(struct ipath_devdata *dd) +{ + dd->ipath_f_intrsetup = ipath_ht_intconfig; + dd->ipath_f_bus = ipath_setup_ht_config; + dd->ipath_f_reset = ipath_setup_ht_reset; + dd->ipath_f_get_boardname = ipath_ht_boardname; + dd->ipath_f_init_hwerrors = ipath_ht_init_hwerrors; + dd->ipath_f_early_init = ipath_ht_early_init; + dd->ipath_f_handle_hwerrors = ipath_ht_handle_hwerrors; + dd->ipath_f_quiet_serdes = ipath_ht_quiet_serdes; + dd->ipath_f_bringup_serdes = ipath_ht_bringup_serdes; + dd->ipath_f_clear_tids = ipath_ht_clear_tids; + dd->ipath_f_put_tid = ipath_ht_put_tid; + dd->ipath_f_cleanup = ipath_setup_ht_cleanup; + dd->ipath_f_setextled = ipath_setup_ht_setextled; + dd->ipath_f_get_base_info = ipath_ht_get_base_info; + dd->ipath_f_free_irq = ipath_ht_free_irq; + dd->ipath_f_tidtemplate = ipath_ht_tidtemplate; + dd->ipath_f_intr_fallback = ipath_ht_nointr_fallback; + dd->ipath_f_get_msgheader = ipath_ht_get_msgheader; + dd->ipath_f_config_ports = ipath_ht_config_ports; + dd->ipath_f_read_counters = ipath_ht_read_counters; + dd->ipath_f_xgxs_reset = ipath_ht_xgxs_reset; + dd->ipath_f_get_ib_cfg = ipath_ht_get_ib_cfg; + dd->ipath_f_set_ib_cfg = ipath_ht_set_ib_cfg; + dd->ipath_f_config_jint = ipath_ht_config_jint; + dd->ipath_f_ib_updown = ipath_ht_ib_updown; + + /* + * initialize chip-specific variables + */ + ipath_init_ht_variables(dd); +} diff --git a/kernel/drivers/infiniband/hw/ipath/ipath_init_chip.c b/kernel/drivers/infiniband/hw/ipath/ipath_init_chip.c new file mode 100644 index 000000000..be2a60e14 --- /dev/null +++ b/kernel/drivers/infiniband/hw/ipath/ipath_init_chip.c @@ -0,0 +1,1066 @@ +/* + * Copyright (c) 2006, 2007, 2008 QLogic Corporation. All rights reserved. + * Copyright (c) 2003, 2004, 2005, 2006 PathScale, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include +#include +#include +#include +#include +#include + +#include "ipath_kernel.h" +#include "ipath_common.h" + +/* + * min buffers we want to have per port, after driver + */ +#define IPATH_MIN_USER_PORT_BUFCNT 7 + +/* + * Number of ports we are configured to use (to allow for more pio + * buffers per port, etc.) Zero means use chip value. 
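ipath_init_iba6110_funcs() above is the HT-chip instance of a per-device ops table: generic driver code calls through the dd->ipath_f_* pointers and never needs to know which ASIC it is driving. A minimal sketch of the pattern follows; every name in it is invented for illustration.

/* Minimal sketch of the per-chip function-pointer table pattern used above. */
#include <stdio.h>

struct demo_devdata;

struct demo_chip_ops {
        int  (*bringup_serdes)(struct demo_devdata *dd);
        void (*quiet_serdes)(struct demo_devdata *dd);
};

struct demo_devdata {
        struct demo_chip_ops ops;
};

static int demo_ht_bringup_serdes(struct demo_devdata *dd)
{
        (void)dd;
        printf("HT serdes bringup\n");
        return 0;
}

static void demo_ht_quiet_serdes(struct demo_devdata *dd)
{
        (void)dd;
        printf("HT serdes quiesce\n");
}

static void demo_init_ht_funcs(struct demo_devdata *dd)
{
        dd->ops.bringup_serdes = demo_ht_bringup_serdes;
        dd->ops.quiet_serdes = demo_ht_quiet_serdes;
}

int main(void)
{
        struct demo_devdata dd;

        demo_init_ht_funcs(&dd);
        dd.ops.bringup_serdes(&dd);     /* generic code dispatches here */
        dd.ops.quiet_serdes(&dd);
        return 0;
}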
+ */ +static ushort ipath_cfgports; + +module_param_named(cfgports, ipath_cfgports, ushort, S_IRUGO); +MODULE_PARM_DESC(cfgports, "Set max number of ports to use"); + +/* + * Number of buffers reserved for driver (verbs and layered drivers.) + * Initialized based on number of PIO buffers if not set via module interface. + * The problem with this is that it's global, but we'll use different + * numbers for different chip types. + */ +static ushort ipath_kpiobufs; + +static int ipath_set_kpiobufs(const char *val, struct kernel_param *kp); + +module_param_call(kpiobufs, ipath_set_kpiobufs, param_get_ushort, + &ipath_kpiobufs, S_IWUSR | S_IRUGO); +MODULE_PARM_DESC(kpiobufs, "Set number of PIO buffers for driver"); + +/** + * create_port0_egr - allocate the eager TID buffers + * @dd: the infinipath device + * + * This code is now quite different for user and kernel, because + * the kernel uses skb's, for the accelerated network performance. + * This is the kernel (port0) version. + * + * Allocate the eager TID buffers and program them into infinipath. + * We use the network layer alloc_skb() allocator to allocate the + * memory, and either use the buffers as is for things like verbs + * packets, or pass the buffers up to the ipath layered driver and + * thence the network layer, replacing them as we do so (see + * ipath_rcv_layer()). + */ +static int create_port0_egr(struct ipath_devdata *dd) +{ + unsigned e, egrcnt; + struct ipath_skbinfo *skbinfo; + int ret; + + egrcnt = dd->ipath_p0_rcvegrcnt; + + skbinfo = vmalloc(sizeof(*dd->ipath_port0_skbinfo) * egrcnt); + if (skbinfo == NULL) { + ipath_dev_err(dd, "allocation error for eager TID " + "skb array\n"); + ret = -ENOMEM; + goto bail; + } + for (e = 0; e < egrcnt; e++) { + /* + * This is a bit tricky in that we allocate extra + * space for 2 bytes of the 14 byte ethernet header. + * These two bytes are passed in the ipath header so + * the rest of the data is word aligned. We allocate + * 4 bytes so that the data buffer stays word aligned. + * See ipath_kreceive() for more details. + */ + skbinfo[e].skb = ipath_alloc_skb(dd, GFP_KERNEL); + if (!skbinfo[e].skb) { + ipath_dev_err(dd, "SKB allocation error for " + "eager TID %u\n", e); + while (e != 0) + dev_kfree_skb(skbinfo[--e].skb); + vfree(skbinfo); + ret = -ENOMEM; + goto bail; + } + } + /* + * After loop above, so we can test non-NULL to see if ready + * to use at receive, etc. + */ + dd->ipath_port0_skbinfo = skbinfo; + + for (e = 0; e < egrcnt; e++) { + dd->ipath_port0_skbinfo[e].phys = + ipath_map_single(dd->pcidev, + dd->ipath_port0_skbinfo[e].skb->data, + dd->ipath_ibmaxlen, PCI_DMA_FROMDEVICE); + dd->ipath_f_put_tid(dd, e + (u64 __iomem *) + ((char __iomem *) dd->ipath_kregbase + + dd->ipath_rcvegrbase), + RCVHQ_RCV_TYPE_EAGER, + dd->ipath_port0_skbinfo[e].phys); + } + + ret = 0; + +bail: + return ret; +} + +static int bringup_link(struct ipath_devdata *dd) +{ + u64 val, ibc; + int ret = 0; + + /* hold IBC in reset */ + dd->ipath_control &= ~INFINIPATH_C_LINKENABLE; + ipath_write_kreg(dd, dd->ipath_kregs->kr_control, + dd->ipath_control); + + /* + * set initial max size pkt IBC will send, including ICRC; it's the + * PIO buffer size in dwords, less 1; also see ipath_set_mtu() + */ + val = (dd->ipath_ibmaxlen >> 2) + 1; + ibc = val << dd->ibcc_mpl_shift; + + /* flowcontrolwatermark is in units of KBytes */ + ibc |= 0x5ULL << INFINIPATH_IBCC_FLOWCTRLWATERMARK_SHIFT; + /* + * How often flowctrl sent. 
More or less in usecs; balance against + * watermark value, so that in theory senders always get a flow + * control update in time to not let the IB link go idle. + */ + ibc |= 0x3ULL << INFINIPATH_IBCC_FLOWCTRLPERIOD_SHIFT; + /* max error tolerance */ + ibc |= 0xfULL << INFINIPATH_IBCC_PHYERRTHRESHOLD_SHIFT; + /* use "real" buffer space for */ + ibc |= 4ULL << INFINIPATH_IBCC_CREDITSCALE_SHIFT; + /* IB credit flow control. */ + ibc |= 0xfULL << INFINIPATH_IBCC_OVERRUNTHRESHOLD_SHIFT; + /* initially come up waiting for TS1, without sending anything. */ + dd->ipath_ibcctrl = ibc; + /* + * Want to start out with both LINKCMD and LINKINITCMD in NOP + * (0 and 0). Don't put linkinitcmd in ipath_ibcctrl, want that + * to stay a NOP. Flag that we are disabled, for the (unlikely) + * case that some recovery path is trying to bring the link up + * before we are ready. + */ + ibc |= INFINIPATH_IBCC_LINKINITCMD_DISABLE << + INFINIPATH_IBCC_LINKINITCMD_SHIFT; + dd->ipath_flags |= IPATH_IB_LINK_DISABLED; + ipath_cdbg(VERBOSE, "Writing 0x%llx to ibcctrl\n", + (unsigned long long) ibc); + ipath_write_kreg(dd, dd->ipath_kregs->kr_ibcctrl, ibc); + + // be sure chip saw it + val = ipath_read_kreg64(dd, dd->ipath_kregs->kr_scratch); + + ret = dd->ipath_f_bringup_serdes(dd); + + if (ret) + dev_info(&dd->pcidev->dev, "Could not initialize SerDes, " + "not usable\n"); + else { + /* enable IBC */ + dd->ipath_control |= INFINIPATH_C_LINKENABLE; + ipath_write_kreg(dd, dd->ipath_kregs->kr_control, + dd->ipath_control); + } + + return ret; +} + +static struct ipath_portdata *create_portdata0(struct ipath_devdata *dd) +{ + struct ipath_portdata *pd = NULL; + + pd = kzalloc(sizeof(*pd), GFP_KERNEL); + if (pd) { + pd->port_dd = dd; + pd->port_cnt = 1; + /* The port 0 pkey table is used by the layer interface. */ + pd->port_pkeys[0] = IPATH_DEFAULT_P_KEY; + pd->port_seq_cnt = 1; + } + return pd; +} + +static int init_chip_first(struct ipath_devdata *dd) +{ + struct ipath_portdata *pd; + int ret = 0; + u64 val; + + spin_lock_init(&dd->ipath_kernel_tid_lock); + spin_lock_init(&dd->ipath_user_tid_lock); + spin_lock_init(&dd->ipath_sendctrl_lock); + spin_lock_init(&dd->ipath_uctxt_lock); + spin_lock_init(&dd->ipath_sdma_lock); + spin_lock_init(&dd->ipath_gpio_lock); + spin_lock_init(&dd->ipath_eep_st_lock); + spin_lock_init(&dd->ipath_sdepb_lock); + mutex_init(&dd->ipath_eep_lock); + + /* + * skip cfgports stuff because we are not allocating memory, + * and we don't want problems if the portcnt changed due to + * cfgports. We do still check and report a difference, if + * not same (should be impossible). + */ + dd->ipath_f_config_ports(dd, ipath_cfgports); + if (!ipath_cfgports) + dd->ipath_cfgports = dd->ipath_portcnt; + else if (ipath_cfgports <= dd->ipath_portcnt) { + dd->ipath_cfgports = ipath_cfgports; + ipath_dbg("Configured to use %u ports out of %u in chip\n", + dd->ipath_cfgports, ipath_read_kreg32(dd, + dd->ipath_kregs->kr_portcnt)); + } else { + dd->ipath_cfgports = dd->ipath_portcnt; + ipath_dbg("Tried to configured to use %u ports; chip " + "only supports %u\n", ipath_cfgports, + ipath_read_kreg32(dd, + dd->ipath_kregs->kr_portcnt)); + } + /* + * Allocate full portcnt array, rather than just cfgports, because + * cleanup iterates across all possible ports. 
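bringup_link() above assembles the IBC control word by shifting each parameter (flow-control watermark and period, PHY error threshold, credit scale, overrun threshold, link init command) into its field and OR-ing the results together. A stand-alone sketch of that shift-and-OR packing follows; the DEMO_* shifts and the example values are invented, see the driver's INFINIPATH_IBCC_* macros for the real layout.

/* Illustration of the shift-and-OR register packing used for ibcctrl above.
 * Field positions and values are invented. */
#include <stdint.h>
#include <stdio.h>

#define DEMO_WATERMARK_SHIFT  0
#define DEMO_PERIOD_SHIFT     8
#define DEMO_PHYERR_SHIFT     16
#define DEMO_MAXPKTLEN_SHIFT  32

int main(void)
{
        uint64_t ibc = 0;

        ibc |= 0x5ULL << DEMO_WATERMARK_SHIFT;   /* flow ctrl watermark, KB */
        ibc |= 0x3ULL << DEMO_PERIOD_SHIFT;      /* flow ctrl update period */
        ibc |= 0xfULL << DEMO_PHYERR_SHIFT;      /* PHY error threshold     */
        ibc |= 0x800ULL << DEMO_MAXPKTLEN_SHIFT; /* max packet len, dwords  */

        printf("ibcctrl = %#llx\n", (unsigned long long)ibc);
        return 0;
}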
+ */ + dd->ipath_pd = kzalloc(sizeof(*dd->ipath_pd) * dd->ipath_portcnt, + GFP_KERNEL); + + if (!dd->ipath_pd) { + ipath_dev_err(dd, "Unable to allocate portdata array, " + "failing\n"); + ret = -ENOMEM; + goto done; + } + + pd = create_portdata0(dd); + if (!pd) { + ipath_dev_err(dd, "Unable to allocate portdata for port " + "0, failing\n"); + ret = -ENOMEM; + goto done; + } + dd->ipath_pd[0] = pd; + + dd->ipath_rcvtidcnt = + ipath_read_kreg32(dd, dd->ipath_kregs->kr_rcvtidcnt); + dd->ipath_rcvtidbase = + ipath_read_kreg32(dd, dd->ipath_kregs->kr_rcvtidbase); + dd->ipath_rcvegrcnt = + ipath_read_kreg32(dd, dd->ipath_kregs->kr_rcvegrcnt); + dd->ipath_rcvegrbase = + ipath_read_kreg32(dd, dd->ipath_kregs->kr_rcvegrbase); + dd->ipath_palign = + ipath_read_kreg32(dd, dd->ipath_kregs->kr_pagealign); + dd->ipath_piobufbase = + ipath_read_kreg64(dd, dd->ipath_kregs->kr_sendpiobufbase); + val = ipath_read_kreg64(dd, dd->ipath_kregs->kr_sendpiosize); + dd->ipath_piosize2k = val & ~0U; + dd->ipath_piosize4k = val >> 32; + if (dd->ipath_piosize4k == 0 && ipath_mtu4096) + ipath_mtu4096 = 0; /* 4KB not supported by this chip */ + dd->ipath_ibmtu = ipath_mtu4096 ? 4096 : 2048; + val = ipath_read_kreg64(dd, dd->ipath_kregs->kr_sendpiobufcnt); + dd->ipath_piobcnt2k = val & ~0U; + dd->ipath_piobcnt4k = val >> 32; + dd->ipath_pio2kbase = + (u32 __iomem *) (((char __iomem *) dd->ipath_kregbase) + + (dd->ipath_piobufbase & 0xffffffff)); + if (dd->ipath_piobcnt4k) { + dd->ipath_pio4kbase = (u32 __iomem *) + (((char __iomem *) dd->ipath_kregbase) + + (dd->ipath_piobufbase >> 32)); + /* + * 4K buffers take 2 pages; we use roundup just to be + * paranoid; we calculate it once here, rather than on + * ever buf allocate + */ + dd->ipath_4kalign = ALIGN(dd->ipath_piosize4k, + dd->ipath_palign); + ipath_dbg("%u 2k(%x) piobufs @ %p, %u 4k(%x) @ %p " + "(%x aligned)\n", + dd->ipath_piobcnt2k, dd->ipath_piosize2k, + dd->ipath_pio2kbase, dd->ipath_piobcnt4k, + dd->ipath_piosize4k, dd->ipath_pio4kbase, + dd->ipath_4kalign); + } + else ipath_dbg("%u 2k piobufs @ %p\n", + dd->ipath_piobcnt2k, dd->ipath_pio2kbase); + +done: + return ret; +} + +/** + * init_chip_reset - re-initialize after a reset, or enable + * @dd: the infinipath device + * + * sanity check at least some of the values after reset, and + * ensure no receive or transmit (explicitly, in case reset + * failed + */ +static int init_chip_reset(struct ipath_devdata *dd) +{ + u32 rtmp; + int i; + unsigned long flags; + + /* + * ensure chip does no sends or receives, tail updates, or + * pioavail updates while we re-initialize + */ + dd->ipath_rcvctrl &= ~(1ULL << dd->ipath_r_tailupd_shift); + for (i = 0; i < dd->ipath_portcnt; i++) { + clear_bit(dd->ipath_r_portenable_shift + i, + &dd->ipath_rcvctrl); + clear_bit(dd->ipath_r_intravail_shift + i, + &dd->ipath_rcvctrl); + } + ipath_write_kreg(dd, dd->ipath_kregs->kr_rcvctrl, + dd->ipath_rcvctrl); + + spin_lock_irqsave(&dd->ipath_sendctrl_lock, flags); + dd->ipath_sendctrl = 0U; /* no sdma, etc */ + ipath_write_kreg(dd, dd->ipath_kregs->kr_sendctrl, dd->ipath_sendctrl); + ipath_read_kreg64(dd, dd->ipath_kregs->kr_scratch); + spin_unlock_irqrestore(&dd->ipath_sendctrl_lock, flags); + + ipath_write_kreg(dd, dd->ipath_kregs->kr_control, 0ULL); + + rtmp = ipath_read_kreg32(dd, dd->ipath_kregs->kr_rcvtidcnt); + if (rtmp != dd->ipath_rcvtidcnt) + dev_info(&dd->pcidev->dev, "tidcnt was %u before " + "reset, now %u, using original\n", + dd->ipath_rcvtidcnt, rtmp); + rtmp = ipath_read_kreg32(dd, dd->ipath_kregs->kr_rcvtidbase); 
+ if (rtmp != dd->ipath_rcvtidbase) + dev_info(&dd->pcidev->dev, "tidbase was %u before " + "reset, now %u, using original\n", + dd->ipath_rcvtidbase, rtmp); + rtmp = ipath_read_kreg32(dd, dd->ipath_kregs->kr_rcvegrcnt); + if (rtmp != dd->ipath_rcvegrcnt) + dev_info(&dd->pcidev->dev, "egrcnt was %u before " + "reset, now %u, using original\n", + dd->ipath_rcvegrcnt, rtmp); + rtmp = ipath_read_kreg32(dd, dd->ipath_kregs->kr_rcvegrbase); + if (rtmp != dd->ipath_rcvegrbase) + dev_info(&dd->pcidev->dev, "egrbase was %u before " + "reset, now %u, using original\n", + dd->ipath_rcvegrbase, rtmp); + + return 0; +} + +static int init_pioavailregs(struct ipath_devdata *dd) +{ + int ret; + + dd->ipath_pioavailregs_dma = dma_alloc_coherent( + &dd->pcidev->dev, PAGE_SIZE, &dd->ipath_pioavailregs_phys, + GFP_KERNEL); + if (!dd->ipath_pioavailregs_dma) { + ipath_dev_err(dd, "failed to allocate PIOavail reg area " + "in memory\n"); + ret = -ENOMEM; + goto done; + } + + /* + * we really want L2 cache aligned, but for current CPUs of + * interest, they are the same. + */ + dd->ipath_statusp = (u64 *) + ((char *)dd->ipath_pioavailregs_dma + + ((2 * L1_CACHE_BYTES + + dd->ipath_pioavregs * sizeof(u64)) & ~L1_CACHE_BYTES)); + /* copy the current value now that it's really allocated */ + *dd->ipath_statusp = dd->_ipath_status; + /* + * setup buffer to hold freeze msg, accessible to apps, + * following statusp + */ + dd->ipath_freezemsg = (char *)&dd->ipath_statusp[1]; + /* and its length */ + dd->ipath_freezelen = L1_CACHE_BYTES - sizeof(dd->ipath_statusp[0]); + + ret = 0; + +done: + return ret; +} + +/** + * init_shadow_tids - allocate the shadow TID array + * @dd: the infinipath device + * + * allocate the shadow TID array, so we can ipath_munlock previous + * entries. It may make more sense to move the pageshadow to the + * port data structure, so we only allocate memory for ports actually + * in use, since we at 8k per port, now. + */ +static void init_shadow_tids(struct ipath_devdata *dd) +{ + struct page **pages; + dma_addr_t *addrs; + + pages = vzalloc(dd->ipath_cfgports * dd->ipath_rcvtidcnt * + sizeof(struct page *)); + if (!pages) { + ipath_dev_err(dd, "failed to allocate shadow page * " + "array, no expected sends!\n"); + dd->ipath_pageshadow = NULL; + return; + } + + addrs = vmalloc(dd->ipath_cfgports * dd->ipath_rcvtidcnt * + sizeof(dma_addr_t)); + if (!addrs) { + ipath_dev_err(dd, "failed to allocate shadow dma handle " + "array, no expected sends!\n"); + vfree(pages); + dd->ipath_pageshadow = NULL; + return; + } + + dd->ipath_pageshadow = pages; + dd->ipath_physshadow = addrs; +} + +static void enable_chip(struct ipath_devdata *dd, int reinit) +{ + u32 val; + u64 rcvmask; + unsigned long flags; + int i; + + if (!reinit) + init_waitqueue_head(&ipath_state_wait); + + ipath_write_kreg(dd, dd->ipath_kregs->kr_rcvctrl, + dd->ipath_rcvctrl); + + spin_lock_irqsave(&dd->ipath_sendctrl_lock, flags); + /* Enable PIO send, and update of PIOavail regs to memory. */ + dd->ipath_sendctrl = INFINIPATH_S_PIOENABLE | + INFINIPATH_S_PIOBUFAVAILUPD; + + /* + * Set the PIO avail update threshold to host memory + * on chips that support it. + */ + if (dd->ipath_pioupd_thresh) + dd->ipath_sendctrl |= dd->ipath_pioupd_thresh + << INFINIPATH_S_UPDTHRESH_SHIFT; + ipath_write_kreg(dd, dd->ipath_kregs->kr_sendctrl, dd->ipath_sendctrl); + ipath_read_kreg64(dd, dd->ipath_kregs->kr_scratch); + spin_unlock_irqrestore(&dd->ipath_sendctrl_lock, flags); + + /* + * Enable kernel ports' receive and receive interrupt. 
+ * Other ports done as user opens and inits them. + */ + rcvmask = 1ULL; + dd->ipath_rcvctrl |= (rcvmask << dd->ipath_r_portenable_shift) | + (rcvmask << dd->ipath_r_intravail_shift); + if (!(dd->ipath_flags & IPATH_NODMA_RTAIL)) + dd->ipath_rcvctrl |= (1ULL << dd->ipath_r_tailupd_shift); + + ipath_write_kreg(dd, dd->ipath_kregs->kr_rcvctrl, + dd->ipath_rcvctrl); + + /* + * now ready for use. this should be cleared whenever we + * detect a reset, or initiate one. + */ + dd->ipath_flags |= IPATH_INITTED; + + /* + * Init our shadow copies of head from tail values, + * and write head values to match. + */ + val = ipath_read_ureg32(dd, ur_rcvegrindextail, 0); + ipath_write_ureg(dd, ur_rcvegrindexhead, val, 0); + + /* Initialize so we interrupt on next packet received */ + ipath_write_ureg(dd, ur_rcvhdrhead, + dd->ipath_rhdrhead_intr_off | + dd->ipath_pd[0]->port_head, 0); + + /* + * by now pioavail updates to memory should have occurred, so + * copy them into our working/shadow registers; this is in + * case something went wrong with abort, but mostly to get the + * initial values of the generation bit correct. + */ + for (i = 0; i < dd->ipath_pioavregs; i++) { + __le64 pioavail; + + /* + * Chip Errata bug 6641; even and odd qwords>3 are swapped. + */ + if (i > 3 && (dd->ipath_flags & IPATH_SWAP_PIOBUFS)) + pioavail = dd->ipath_pioavailregs_dma[i ^ 1]; + else + pioavail = dd->ipath_pioavailregs_dma[i]; + /* + * don't need to worry about ipath_pioavailkernel here + * because we will call ipath_chg_pioavailkernel() later + * in initialization, to busy out buffers as needed + */ + dd->ipath_pioavailshadow[i] = le64_to_cpu(pioavail); + } + /* can get counters, stats, etc. */ + dd->ipath_flags |= IPATH_PRESENT; +} + +static int init_housekeeping(struct ipath_devdata *dd, int reinit) +{ + char boardn[40]; + int ret = 0; + + /* + * have to clear shadow copies of registers at init that are + * not otherwise set here, or all kinds of bizarre things + * happen with driver on chip reset + */ + dd->ipath_rcvhdrsize = 0; + + /* + * Don't clear ipath_flags as 8bit mode was set before + * entering this func. However, we do set the linkstate to + * unknown, so we can watch for a transition. + * PRESENT is set because we want register reads to work, + * and the kernel infrastructure saw it in config space; + * We clear it if we have failures. 
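One detail worth calling out in enable_chip() above: for the registers affected by the qword-swap erratum, the shadow copy is taken from index i ^ 1. XOR-ing the low bit is a compact way to exchange each even/odd pair of indices, as this small stand-alone sketch shows (the loop bound is arbitrary).

/* XOR-with-1 swaps the members of each even/odd index pair: 0<->1, 2<->3... */
#include <stdio.h>

int main(void)
{
        int i;

        for (i = 0; i < 6; i++)
                printf("shadow[%d] <- dma[%d]\n", i, i ^ 1);
        return 0;
}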
+ */ + dd->ipath_flags |= IPATH_LINKUNK | IPATH_PRESENT; + dd->ipath_flags &= ~(IPATH_LINKACTIVE | IPATH_LINKARMED | + IPATH_LINKDOWN | IPATH_LINKINIT); + + ipath_cdbg(VERBOSE, "Try to read spc chip revision\n"); + dd->ipath_revision = + ipath_read_kreg64(dd, dd->ipath_kregs->kr_revision); + + /* + * set up fundamental info we need to use the chip; we assume + * if the revision reg and these regs are OK, we don't need to + * special case the rest + */ + dd->ipath_sregbase = + ipath_read_kreg32(dd, dd->ipath_kregs->kr_sendregbase); + dd->ipath_cregbase = + ipath_read_kreg32(dd, dd->ipath_kregs->kr_counterregbase); + dd->ipath_uregbase = + ipath_read_kreg32(dd, dd->ipath_kregs->kr_userregbase); + ipath_cdbg(VERBOSE, "ipath_kregbase %p, sendbase %x usrbase %x, " + "cntrbase %x\n", dd->ipath_kregbase, dd->ipath_sregbase, + dd->ipath_uregbase, dd->ipath_cregbase); + if ((dd->ipath_revision & 0xffffffff) == 0xffffffff + || (dd->ipath_sregbase & 0xffffffff) == 0xffffffff + || (dd->ipath_cregbase & 0xffffffff) == 0xffffffff + || (dd->ipath_uregbase & 0xffffffff) == 0xffffffff) { + ipath_dev_err(dd, "Register read failures from chip, " + "giving up initialization\n"); + dd->ipath_flags &= ~IPATH_PRESENT; + ret = -ENODEV; + goto done; + } + + + /* clear diagctrl register, in case diags were running and crashed */ + ipath_write_kreg (dd, dd->ipath_kregs->kr_hwdiagctrl, 0); + + /* clear the initial reset flag, in case first driver load */ + ipath_write_kreg(dd, dd->ipath_kregs->kr_errorclear, + INFINIPATH_E_RESET); + + ipath_cdbg(VERBOSE, "Revision %llx (PCI %x)\n", + (unsigned long long) dd->ipath_revision, + dd->ipath_pcirev); + + if (((dd->ipath_revision >> INFINIPATH_R_SOFTWARE_SHIFT) & + INFINIPATH_R_SOFTWARE_MASK) != IPATH_CHIP_SWVERSION) { + ipath_dev_err(dd, "Driver only handles version %d, " + "chip swversion is %d (%llx), failng\n", + IPATH_CHIP_SWVERSION, + (int)(dd->ipath_revision >> + INFINIPATH_R_SOFTWARE_SHIFT) & + INFINIPATH_R_SOFTWARE_MASK, + (unsigned long long) dd->ipath_revision); + ret = -ENOSYS; + goto done; + } + dd->ipath_majrev = (u8) ((dd->ipath_revision >> + INFINIPATH_R_CHIPREVMAJOR_SHIFT) & + INFINIPATH_R_CHIPREVMAJOR_MASK); + dd->ipath_minrev = (u8) ((dd->ipath_revision >> + INFINIPATH_R_CHIPREVMINOR_SHIFT) & + INFINIPATH_R_CHIPREVMINOR_MASK); + dd->ipath_boardrev = (u8) ((dd->ipath_revision >> + INFINIPATH_R_BOARDID_SHIFT) & + INFINIPATH_R_BOARDID_MASK); + + ret = dd->ipath_f_get_boardname(dd, boardn, sizeof boardn); + + snprintf(dd->ipath_boardversion, sizeof(dd->ipath_boardversion), + "ChipABI %u.%u, %s, InfiniPath%u %u.%u, PCI %u, " + "SW Compat %u\n", + IPATH_CHIP_VERS_MAJ, IPATH_CHIP_VERS_MIN, boardn, + (unsigned)(dd->ipath_revision >> INFINIPATH_R_ARCH_SHIFT) & + INFINIPATH_R_ARCH_MASK, + dd->ipath_majrev, dd->ipath_minrev, dd->ipath_pcirev, + (unsigned)(dd->ipath_revision >> + INFINIPATH_R_SOFTWARE_SHIFT) & + INFINIPATH_R_SOFTWARE_MASK); + + ipath_dbg("%s", dd->ipath_boardversion); + + if (ret) + goto done; + + if (reinit) + ret = init_chip_reset(dd); + else + ret = init_chip_first(dd); + +done: + return ret; +} + +static void verify_interrupt(unsigned long opaque) +{ + struct ipath_devdata *dd = (struct ipath_devdata *) opaque; + + if (!dd) + return; /* being torn down */ + + /* + * If we don't have any interrupts, let the user know and + * don't bother checking again. 
+ */ + if (dd->ipath_int_counter == 0) { + if (!dd->ipath_f_intr_fallback(dd)) + dev_err(&dd->pcidev->dev, "No interrupts detected, " + "not usable.\n"); + else /* re-arm the timer to see if fallback works */ + mod_timer(&dd->ipath_intrchk_timer, jiffies + HZ/2); + } else + ipath_cdbg(VERBOSE, "%u interrupts at timer check\n", + dd->ipath_int_counter); +} + +/** + * ipath_init_chip - do the actual initialization sequence on the chip + * @dd: the infinipath device + * @reinit: reinitializing, so don't allocate new memory + * + * Do the actual initialization sequence on the chip. This is done + * both from the init routine called from the PCI infrastructure, and + * when we reset the chip, or detect that it was reset internally, + * or it's administratively re-enabled. + * + * Memory allocation here and in called routines is only done in + * the first case (reinit == 0). We have to be careful, because even + * without memory allocation, we need to re-write all the chip registers + * TIDs, etc. after the reset or enable has completed. + */ +int ipath_init_chip(struct ipath_devdata *dd, int reinit) +{ + int ret = 0; + u32 kpiobufs, defkbufs; + u32 piobufs, uports; + u64 val; + struct ipath_portdata *pd; + gfp_t gfp_flags = GFP_USER | __GFP_COMP; + + ret = init_housekeeping(dd, reinit); + if (ret) + goto done; + + /* + * We could bump this to allow for full rcvegrcnt + rcvtidcnt, + * but then it no longer nicely fits power of two, and since + * we now use routines that backend onto __get_free_pages, the + * rest would be wasted. + */ + dd->ipath_rcvhdrcnt = max(dd->ipath_p0_rcvegrcnt, dd->ipath_rcvegrcnt); + ipath_write_kreg(dd, dd->ipath_kregs->kr_rcvhdrcnt, + dd->ipath_rcvhdrcnt); + + /* + * Set up the shadow copies of the piobufavail registers, + * which we compare against the chip registers for now, and + * the in memory DMA'ed copies of the registers. This has to + * be done early, before we calculate lastport, etc. + */ + piobufs = dd->ipath_piobcnt2k + dd->ipath_piobcnt4k; + /* + * calc number of pioavail registers, and save it; we have 2 + * bits per buffer. + */ + dd->ipath_pioavregs = ALIGN(piobufs, sizeof(u64) * BITS_PER_BYTE / 2) + / (sizeof(u64) * BITS_PER_BYTE / 2); + uports = dd->ipath_cfgports ? dd->ipath_cfgports - 1 : 0; + if (piobufs > 144) + defkbufs = 32 + dd->ipath_pioreserved; + else + defkbufs = 16 + dd->ipath_pioreserved; + + if (ipath_kpiobufs && (ipath_kpiobufs + + (uports * IPATH_MIN_USER_PORT_BUFCNT)) > piobufs) { + int i = (int) piobufs - + (int) (uports * IPATH_MIN_USER_PORT_BUFCNT); + if (i < 1) + i = 1; + dev_info(&dd->pcidev->dev, "Allocating %d PIO bufs of " + "%d for kernel leaves too few for %d user ports " + "(%d each); using %u\n", ipath_kpiobufs, + piobufs, uports, IPATH_MIN_USER_PORT_BUFCNT, i); + /* + * shouldn't change ipath_kpiobufs, because could be + * different for different devices... + */ + kpiobufs = i; + } else if (ipath_kpiobufs) + kpiobufs = ipath_kpiobufs; + else + kpiobufs = defkbufs; + dd->ipath_lastport_piobuf = piobufs - kpiobufs; + dd->ipath_pbufsport = + uports ? 
dd->ipath_lastport_piobuf / uports : 0; + /* if not an even divisor, some user ports get extra buffers */ + dd->ipath_ports_extrabuf = dd->ipath_lastport_piobuf - + (dd->ipath_pbufsport * uports); + if (dd->ipath_ports_extrabuf) + ipath_dbg("%u pbufs/port leaves some unused, add 1 buffer to " + "ports <= %u\n", dd->ipath_pbufsport, + dd->ipath_ports_extrabuf); + dd->ipath_lastpioindex = 0; + dd->ipath_lastpioindexl = dd->ipath_piobcnt2k; + /* ipath_pioavailshadow initialized earlier */ + ipath_cdbg(VERBOSE, "%d PIO bufs for kernel out of %d total %u " + "each for %u user ports\n", kpiobufs, + piobufs, dd->ipath_pbufsport, uports); + ret = dd->ipath_f_early_init(dd); + if (ret) { + ipath_dev_err(dd, "Early initialization failure\n"); + goto done; + } + + /* + * Early_init sets rcvhdrentsize and rcvhdrsize, so this must be + * done after early_init. + */ + dd->ipath_hdrqlast = + dd->ipath_rcvhdrentsize * (dd->ipath_rcvhdrcnt - 1); + ipath_write_kreg(dd, dd->ipath_kregs->kr_rcvhdrentsize, + dd->ipath_rcvhdrentsize); + ipath_write_kreg(dd, dd->ipath_kregs->kr_rcvhdrsize, + dd->ipath_rcvhdrsize); + + if (!reinit) { + ret = init_pioavailregs(dd); + init_shadow_tids(dd); + if (ret) + goto done; + } + + ipath_write_kreg(dd, dd->ipath_kregs->kr_sendpioavailaddr, + dd->ipath_pioavailregs_phys); + + /* + * this is to detect s/w errors, which the h/w works around by + * ignoring the low 6 bits of address, if it wasn't aligned. + */ + val = ipath_read_kreg64(dd, dd->ipath_kregs->kr_sendpioavailaddr); + if (val != dd->ipath_pioavailregs_phys) { + ipath_dev_err(dd, "Catastrophic software error, " + "SendPIOAvailAddr written as %lx, " + "read back as %llx\n", + (unsigned long) dd->ipath_pioavailregs_phys, + (unsigned long long) val); + ret = -EINVAL; + goto done; + } + + ipath_write_kreg(dd, dd->ipath_kregs->kr_rcvbthqp, IPATH_KD_QP); + + /* + * make sure we are not in freeze, and PIO send enabled, so + * writes to pbc happen + */ + ipath_write_kreg(dd, dd->ipath_kregs->kr_hwerrmask, 0ULL); + ipath_write_kreg(dd, dd->ipath_kregs->kr_hwerrclear, + ~0ULL&~INFINIPATH_HWE_MEMBISTFAILED); + ipath_write_kreg(dd, dd->ipath_kregs->kr_control, 0ULL); + + /* + * before error clears, since we expect serdes pll errors during + * this, the first time after reset + */ + if (bringup_link(dd)) { + dev_info(&dd->pcidev->dev, "Failed to bringup IB link\n"); + ret = -ENETDOWN; + goto done; + } + + /* + * clear any "expected" hwerrs from reset and/or initialization + * clear any that aren't enabled (at least this once), and then + * set the enable mask + */ + dd->ipath_f_init_hwerrors(dd); + ipath_write_kreg(dd, dd->ipath_kregs->kr_hwerrclear, + ~0ULL&~INFINIPATH_HWE_MEMBISTFAILED); + ipath_write_kreg(dd, dd->ipath_kregs->kr_hwerrmask, + dd->ipath_hwerrmask); + + /* clear all */ + ipath_write_kreg(dd, dd->ipath_kregs->kr_errorclear, -1LL); + /* enable errors that are masked, at least this first time. */ + ipath_write_kreg(dd, dd->ipath_kregs->kr_errormask, + ~dd->ipath_maskederrs); + dd->ipath_maskederrs = 0; /* don't re-enable ignored in timer */ + dd->ipath_errormask = + ipath_read_kreg64(dd, dd->ipath_kregs->kr_errormask); + /* clear any interrupts up to this point (ints still not enabled) */ + ipath_write_kreg(dd, dd->ipath_kregs->kr_intclear, -1LL); + + dd->ipath_f_tidtemplate(dd); + + /* + * Set up the port 0 (kernel) rcvhdr q and egr TIDs. If doing + * re-init, the simplest way to handle this is to free + * existing, and re-allocate. + * Need to re-create rest of port 0 portdata as well. 
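The buffer accounting just computed (ipath_lastport_piobuf, ipath_pbufsport, ipath_ports_extrabuf) is plain integer division with the remainder handed out one buffer per port to the lowest-numbered user ports. A stand-alone sketch of the same arithmetic, using made-up example numbers rather than any real chip's counts:

/* Sketch of the kernel/user PIO buffer split; numbers are illustrative. */
#include <stdio.h>

int main(void)
{
        unsigned piobufs  = 128; /* total 2k+4k PIO send buffers (example) */
        unsigned kpiobufs = 32;  /* reserved for the kernel (example)      */
        unsigned uports   = 7;   /* user ports = cfgports - 1 (example)    */

        unsigned userbufs = piobufs - kpiobufs;
        unsigned per_port = uports ? userbufs / uports : 0;
        unsigned extra    = userbufs - per_port * uports;

        printf("%u user bufs: %u per port, first %u ports get one extra\n",
               userbufs, per_port, extra);
        return 0;
}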
+ */ + pd = dd->ipath_pd[0]; + if (reinit) { + struct ipath_portdata *npd; + + /* + * Alloc and init new ipath_portdata for port0, + * Then free old pd. Could lead to fragmentation, but also + * makes later support for hot-swap easier. + */ + npd = create_portdata0(dd); + if (npd) { + ipath_free_pddata(dd, pd); + dd->ipath_pd[0] = npd; + pd = npd; + } else { + ipath_dev_err(dd, "Unable to allocate portdata" + " for port 0, failing\n"); + ret = -ENOMEM; + goto done; + } + } + ret = ipath_create_rcvhdrq(dd, pd); + if (!ret) + ret = create_port0_egr(dd); + if (ret) { + ipath_dev_err(dd, "failed to allocate kernel port's " + "rcvhdrq and/or egr bufs\n"); + goto done; + } + else + enable_chip(dd, reinit); + + /* after enable_chip, so pioavailshadow setup */ + ipath_chg_pioavailkernel(dd, 0, piobufs, 1); + + /* + * Cancel any possible active sends from early driver load. + * Follows early_init because some chips have to initialize + * PIO buffers in early_init to avoid false parity errors. + * After enable and ipath_chg_pioavailkernel so we can safely + * enable pioavail updates and PIOENABLE; packets are now + * ready to go out. + */ + ipath_cancel_sends(dd, 1); + + if (!reinit) { + /* + * Used when we close a port, for DMA already in flight + * at close. + */ + dd->ipath_dummy_hdrq = dma_alloc_coherent( + &dd->pcidev->dev, dd->ipath_pd[0]->port_rcvhdrq_size, + &dd->ipath_dummy_hdrq_phys, + gfp_flags); + if (!dd->ipath_dummy_hdrq) { + dev_info(&dd->pcidev->dev, + "Couldn't allocate 0x%lx bytes for dummy hdrq\n", + dd->ipath_pd[0]->port_rcvhdrq_size); + /* fallback to just 0'ing */ + dd->ipath_dummy_hdrq_phys = 0UL; + } + } + + /* + * cause retrigger of pending interrupts ignored during init, + * even if we had errors + */ + ipath_write_kreg(dd, dd->ipath_kregs->kr_intclear, 0ULL); + + if (!dd->ipath_stats_timer_active) { + /* + * first init, or after an admin disable/enable + * set up stats retrieval timer, even if we had errors + * in last portion of setup + */ + init_timer(&dd->ipath_stats_timer); + dd->ipath_stats_timer.function = ipath_get_faststats; + dd->ipath_stats_timer.data = (unsigned long) dd; + /* every 5 seconds; */ + dd->ipath_stats_timer.expires = jiffies + 5 * HZ; + /* takes ~16 seconds to overflow at full IB 4x bandwdith */ + add_timer(&dd->ipath_stats_timer); + dd->ipath_stats_timer_active = 1; + } + + /* Set up SendDMA if chip supports it */ + if (dd->ipath_flags & IPATH_HAS_SEND_DMA) + ret = setup_sdma(dd); + + /* Set up HoL state */ + init_timer(&dd->ipath_hol_timer); + dd->ipath_hol_timer.function = ipath_hol_event; + dd->ipath_hol_timer.data = (unsigned long)dd; + dd->ipath_hol_state = IPATH_HOL_UP; + +done: + if (!ret) { + *dd->ipath_statusp |= IPATH_STATUS_CHIP_PRESENT; + if (!dd->ipath_f_intrsetup(dd)) { + /* now we can enable all interrupts from the chip */ + ipath_write_kreg(dd, dd->ipath_kregs->kr_intmask, + -1LL); + /* force re-interrupt of any pending interrupts. 
*/ + ipath_write_kreg(dd, dd->ipath_kregs->kr_intclear, + 0ULL); + /* chip is usable; mark it as initialized */ + *dd->ipath_statusp |= IPATH_STATUS_INITTED; + + /* + * setup to verify we get an interrupt, and fallback + * to an alternate if necessary and possible + */ + if (!reinit) { + init_timer(&dd->ipath_intrchk_timer); + dd->ipath_intrchk_timer.function = + verify_interrupt; + dd->ipath_intrchk_timer.data = + (unsigned long) dd; + } + dd->ipath_intrchk_timer.expires = jiffies + HZ/2; + add_timer(&dd->ipath_intrchk_timer); + } else + ipath_dev_err(dd, "No interrupts enabled, couldn't " + "setup interrupt address\n"); + + if (dd->ipath_cfgports > ipath_stats.sps_nports) + /* + * sps_nports is a global, so, we set it to + * the highest number of ports of any of the + * chips we find; we never decrement it, at + * least for now. Since this might have changed + * over disable/enable or prior to reset, always + * do the check and potentially adjust. + */ + ipath_stats.sps_nports = dd->ipath_cfgports; + } else + ipath_dbg("Failed (%d) to initialize chip\n", ret); + + /* if ret is non-zero, we probably should do some cleanup + here... */ + return ret; +} + +static int ipath_set_kpiobufs(const char *str, struct kernel_param *kp) +{ + struct ipath_devdata *dd; + unsigned long flags; + unsigned short val; + int ret; + + ret = ipath_parse_ushort(str, &val); + + spin_lock_irqsave(&ipath_devs_lock, flags); + + if (ret < 0) + goto bail; + + if (val == 0) { + ret = -EINVAL; + goto bail; + } + + list_for_each_entry(dd, &ipath_dev_list, ipath_list) { + if (dd->ipath_kregbase) + continue; + if (val > (dd->ipath_piobcnt2k + dd->ipath_piobcnt4k - + (dd->ipath_cfgports * + IPATH_MIN_USER_PORT_BUFCNT))) + { + ipath_dev_err( + dd, + "Allocating %d PIO bufs for kernel leaves " + "too few for %d user ports (%d each)\n", + val, dd->ipath_cfgports - 1, + IPATH_MIN_USER_PORT_BUFCNT); + ret = -EINVAL; + goto bail; + } + dd->ipath_lastport_piobuf = + dd->ipath_piobcnt2k + dd->ipath_piobcnt4k - val; + } + + ipath_kpiobufs = val; + ret = 0; +bail: + spin_unlock_irqrestore(&ipath_devs_lock, flags); + + return ret; +} diff --git a/kernel/drivers/infiniband/hw/ipath/ipath_intr.c b/kernel/drivers/infiniband/hw/ipath/ipath_intr.c new file mode 100644 index 000000000..01ba79279 --- /dev/null +++ b/kernel/drivers/infiniband/hw/ipath/ipath_intr.c @@ -0,0 +1,1273 @@ +/* + * Copyright (c) 2006, 2007, 2008 QLogic Corporation. All rights reserved. + * Copyright (c) 2003, 2004, 2005, 2006 PathScale, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include +#include +#include + +#include "ipath_kernel.h" +#include "ipath_verbs.h" +#include "ipath_common.h" + + +/* + * Called when we might have an error that is specific to a particular + * PIO buffer, and may need to cancel that buffer, so it can be re-used. + */ +void ipath_disarm_senderrbufs(struct ipath_devdata *dd) +{ + u32 piobcnt; + unsigned long sbuf[4]; + /* + * it's possible that sendbuffererror could have bits set; might + * have already done this as a result of hardware error handling + */ + piobcnt = dd->ipath_piobcnt2k + dd->ipath_piobcnt4k; + /* read these before writing errorclear */ + sbuf[0] = ipath_read_kreg64( + dd, dd->ipath_kregs->kr_sendbuffererror); + sbuf[1] = ipath_read_kreg64( + dd, dd->ipath_kregs->kr_sendbuffererror + 1); + if (piobcnt > 128) + sbuf[2] = ipath_read_kreg64( + dd, dd->ipath_kregs->kr_sendbuffererror + 2); + if (piobcnt > 192) + sbuf[3] = ipath_read_kreg64( + dd, dd->ipath_kregs->kr_sendbuffererror + 3); + else + sbuf[3] = 0; + + if (sbuf[0] || sbuf[1] || (piobcnt > 128 && (sbuf[2] || sbuf[3]))) { + int i; + if (ipath_debug & (__IPATH_PKTDBG|__IPATH_DBG) && + time_after(dd->ipath_lastcancel, jiffies)) { + __IPATH_DBG_WHICH(__IPATH_PKTDBG|__IPATH_DBG, + "SendbufErrs %lx %lx", sbuf[0], + sbuf[1]); + if (ipath_debug & __IPATH_PKTDBG && piobcnt > 128) + printk(" %lx %lx ", sbuf[2], sbuf[3]); + printk("\n"); + } + + for (i = 0; i < piobcnt; i++) + if (test_bit(i, sbuf)) + ipath_disarm_piobufs(dd, i, 1); + /* ignore armlaunch errs for a bit */ + dd->ipath_lastcancel = jiffies+3; + } +} + + +/* These are all rcv-related errors which we want to count for stats */ +#define E_SUM_PKTERRS \ + (INFINIPATH_E_RHDRLEN | INFINIPATH_E_RBADTID | \ + INFINIPATH_E_RBADVERSION | INFINIPATH_E_RHDR | \ + INFINIPATH_E_RLONGPKTLEN | INFINIPATH_E_RSHORTPKTLEN | \ + INFINIPATH_E_RMAXPKTLEN | INFINIPATH_E_RMINPKTLEN | \ + INFINIPATH_E_RFORMATERR | INFINIPATH_E_RUNSUPVL | \ + INFINIPATH_E_RUNEXPCHAR | INFINIPATH_E_REBP) + +/* These are all send-related errors which we want to count for stats */ +#define E_SUM_ERRS \ + (INFINIPATH_E_SPIOARMLAUNCH | INFINIPATH_E_SUNEXPERRPKTNUM | \ + INFINIPATH_E_SDROPPEDDATAPKT | INFINIPATH_E_SDROPPEDSMPPKT | \ + INFINIPATH_E_SMAXPKTLEN | INFINIPATH_E_SUNSUPVL | \ + INFINIPATH_E_SMINPKTLEN | INFINIPATH_E_SPKTLEN | \ + INFINIPATH_E_INVALIDADDR) + +/* + * this is similar to E_SUM_ERRS, but can't ignore armlaunch, don't ignore + * errors not related to freeze and cancelling buffers. Can't ignore + * armlaunch because could get more while still cleaning up, and need + * to cancel those as they happen. + */ +#define E_SPKT_ERRS_IGNORE \ + (INFINIPATH_E_SDROPPEDDATAPKT | INFINIPATH_E_SDROPPEDSMPPKT | \ + INFINIPATH_E_SMAXPKTLEN | INFINIPATH_E_SMINPKTLEN | \ + INFINIPATH_E_SPKTLEN) + +/* + * these are errors that can occur when the link changes state while + * a packet is being sent or received. This doesn't cover things + * like EBP or VCRC that can be the result of a sending having the + * link change state, so we receive a "known bad" packet. 
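ipath_disarm_senderrbufs() above treats the sendbuffererror registers as one long bitmap, one bit per PIO send buffer, and walks it with test_bit() to find buffers that need disarming. A portable sketch of that scan follows; it is simplified (no atomics, straightforward little-endian-style bit numbering assumed) and the example bit pattern is made up.

/* Scan a multi-word bitmap for set bits, one bit per buffer (sketch only). */
#include <stdint.h>
#include <stdio.h>

#define DEMO_BITS_PER_WORD 64

static int demo_test_bit(unsigned nr, const uint64_t *map)
{
        return (map[nr / DEMO_BITS_PER_WORD] >> (nr % DEMO_BITS_PER_WORD)) & 1;
}

int main(void)
{
        uint64_t sbuf[4] = { 0x9, 0, 0x1, 0 };  /* example error bits */
        unsigned piobcnt = 4 * DEMO_BITS_PER_WORD;
        unsigned i;

        for (i = 0; i < piobcnt; i++)
                if (demo_test_bit(i, sbuf))
                        printf("disarm PIO buffer %u\n", i);
        return 0;
}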
+ */ +#define E_SUM_LINK_PKTERRS \ + (INFINIPATH_E_SDROPPEDDATAPKT | INFINIPATH_E_SDROPPEDSMPPKT | \ + INFINIPATH_E_SMINPKTLEN | INFINIPATH_E_SPKTLEN | \ + INFINIPATH_E_RSHORTPKTLEN | INFINIPATH_E_RMINPKTLEN | \ + INFINIPATH_E_RUNEXPCHAR) + +static u64 handle_e_sum_errs(struct ipath_devdata *dd, ipath_err_t errs) +{ + u64 ignore_this_time = 0; + + ipath_disarm_senderrbufs(dd); + if ((errs & E_SUM_LINK_PKTERRS) && + !(dd->ipath_flags & IPATH_LINKACTIVE)) { + /* + * This can happen when SMA is trying to bring the link + * up, but the IB link changes state at the "wrong" time. + * The IB logic then complains that the packet isn't + * valid. We don't want to confuse people, so we just + * don't print them, except at debug + */ + ipath_dbg("Ignoring packet errors %llx, because link not " + "ACTIVE\n", (unsigned long long) errs); + ignore_this_time = errs & E_SUM_LINK_PKTERRS; + } + + return ignore_this_time; +} + +/* generic hw error messages... */ +#define INFINIPATH_HWE_TXEMEMPARITYERR_MSG(a) \ + { \ + .mask = ( INFINIPATH_HWE_TXEMEMPARITYERR_##a << \ + INFINIPATH_HWE_TXEMEMPARITYERR_SHIFT ), \ + .msg = "TXE " #a " Memory Parity" \ + } +#define INFINIPATH_HWE_RXEMEMPARITYERR_MSG(a) \ + { \ + .mask = ( INFINIPATH_HWE_RXEMEMPARITYERR_##a << \ + INFINIPATH_HWE_RXEMEMPARITYERR_SHIFT ), \ + .msg = "RXE " #a " Memory Parity" \ + } + +static const struct ipath_hwerror_msgs ipath_generic_hwerror_msgs[] = { + INFINIPATH_HWE_MSG(IBCBUSFRSPCPARITYERR, "IPATH2IB Parity"), + INFINIPATH_HWE_MSG(IBCBUSTOSPCPARITYERR, "IB2IPATH Parity"), + + INFINIPATH_HWE_TXEMEMPARITYERR_MSG(PIOBUF), + INFINIPATH_HWE_TXEMEMPARITYERR_MSG(PIOPBC), + INFINIPATH_HWE_TXEMEMPARITYERR_MSG(PIOLAUNCHFIFO), + + INFINIPATH_HWE_RXEMEMPARITYERR_MSG(RCVBUF), + INFINIPATH_HWE_RXEMEMPARITYERR_MSG(LOOKUPQ), + INFINIPATH_HWE_RXEMEMPARITYERR_MSG(EAGERTID), + INFINIPATH_HWE_RXEMEMPARITYERR_MSG(EXPTID), + INFINIPATH_HWE_RXEMEMPARITYERR_MSG(FLAGBUF), + INFINIPATH_HWE_RXEMEMPARITYERR_MSG(DATAINFO), + INFINIPATH_HWE_RXEMEMPARITYERR_MSG(HDRINFO), +}; + +/** + * ipath_format_hwmsg - format a single hwerror message + * @msg message buffer + * @msgl length of message buffer + * @hwmsg message to add to message buffer + */ +static void ipath_format_hwmsg(char *msg, size_t msgl, const char *hwmsg) +{ + strlcat(msg, "[", msgl); + strlcat(msg, hwmsg, msgl); + strlcat(msg, "]", msgl); +} + +/** + * ipath_format_hwerrors - format hardware error messages for display + * @hwerrs hardware errors bit vector + * @hwerrmsgs hardware error descriptions + * @nhwerrmsgs number of hwerrmsgs + * @msg message buffer + * @msgl message buffer length + */ +void ipath_format_hwerrors(u64 hwerrs, + const struct ipath_hwerror_msgs *hwerrmsgs, + size_t nhwerrmsgs, + char *msg, size_t msgl) +{ + int i; + const int glen = + ARRAY_SIZE(ipath_generic_hwerror_msgs); + + for (i=0; iib_init) + ret = "Init"; + else if (state == dd->ib_arm) + ret = "Arm"; + else if (state == dd->ib_active) + ret = "Active"; + else + ret = "Down"; + return ret; +} + +void signal_ib_event(struct ipath_devdata *dd, enum ib_event_type ev) +{ + struct ib_event event; + + event.device = &dd->verbs_dev->ibdev; + event.element.port_num = 1; + event.event = ev; + ib_dispatch_event(&event); +} + +static void handle_e_ibstatuschanged(struct ipath_devdata *dd, + ipath_err_t errs) +{ + u32 ltstate, lstate, ibstate, lastlstate; + u32 init = dd->ib_init; + u32 arm = dd->ib_arm; + u32 active = dd->ib_active; + const u64 ibcs = ipath_read_kreg64(dd, dd->ipath_kregs->kr_ibcstatus); + + lstate = ipath_ib_linkstate(dd, 
ibcs); /* linkstate */ + ibstate = ipath_ib_state(dd, ibcs); + /* linkstate at last interrupt */ + lastlstate = ipath_ib_linkstate(dd, dd->ipath_lastibcstat); + ltstate = ipath_ib_linktrstate(dd, ibcs); /* linktrainingtate */ + + /* + * Since going into a recovery state causes the link state to go + * down and since recovery is transitory, it is better if we "miss" + * ever seeing the link training state go into recovery (i.e., + * ignore this transition for link state special handling purposes) + * without even updating ipath_lastibcstat. + */ + if ((ltstate == INFINIPATH_IBCS_LT_STATE_RECOVERRETRAIN) || + (ltstate == INFINIPATH_IBCS_LT_STATE_RECOVERWAITRMT) || + (ltstate == INFINIPATH_IBCS_LT_STATE_RECOVERIDLE)) + goto done; + + /* + * if linkstate transitions into INIT from any of the various down + * states, or if it transitions from any of the up (INIT or better) + * states into any of the down states (except link recovery), then + * call the chip-specific code to take appropriate actions. + */ + if (lstate >= INFINIPATH_IBCS_L_STATE_INIT && + lastlstate == INFINIPATH_IBCS_L_STATE_DOWN) { + /* transitioned to UP */ + if (dd->ipath_f_ib_updown(dd, 1, ibcs)) { + /* link came up, so we must no longer be disabled */ + dd->ipath_flags &= ~IPATH_IB_LINK_DISABLED; + ipath_cdbg(LINKVERB, "LinkUp handled, skipped\n"); + goto skip_ibchange; /* chip-code handled */ + } + } else if ((lastlstate >= INFINIPATH_IBCS_L_STATE_INIT || + (dd->ipath_flags & IPATH_IB_FORCE_NOTIFY)) && + ltstate <= INFINIPATH_IBCS_LT_STATE_CFGWAITRMT && + ltstate != INFINIPATH_IBCS_LT_STATE_LINKUP) { + int handled; + handled = dd->ipath_f_ib_updown(dd, 0, ibcs); + dd->ipath_flags &= ~IPATH_IB_FORCE_NOTIFY; + if (handled) { + ipath_cdbg(LINKVERB, "LinkDown handled, skipped\n"); + goto skip_ibchange; /* chip-code handled */ + } + } + + /* + * Significant enough to always print and get into logs, if it was + * unexpected. If it was a requested state change, we'll have + * already cleared the flags, so we won't print this warning + */ + if ((ibstate != arm && ibstate != active) && + (dd->ipath_flags & (IPATH_LINKARMED | IPATH_LINKACTIVE))) { + dev_info(&dd->pcidev->dev, "Link state changed from %s " + "to %s\n", (dd->ipath_flags & IPATH_LINKARMED) ? + "ARM" : "ACTIVE", ib_linkstate(dd, ibcs)); + } + + if (ltstate == INFINIPATH_IBCS_LT_STATE_POLLACTIVE || + ltstate == INFINIPATH_IBCS_LT_STATE_POLLQUIET) { + u32 lastlts; + lastlts = ipath_ib_linktrstate(dd, dd->ipath_lastibcstat); + /* + * Ignore cycling back and forth from Polling.Active to + * Polling.Quiet while waiting for the other end of the link + * to come up, except to try and decide if we are connected + * to a live IB device or not. We will cycle back and + * forth between them if no cable is plugged in, the other + * device is powered off or disabled, etc. 
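+ * Roughly: if both the current and the previous sample are in a
+ * Polling.* training state (and autoneg is not in progress), a
+ * per-device poll counter is bumped; once it reaches 40 consecutive
+ * samples the driver assumes there is no cable or peer and sets
+ * IPATH_NOCABLE / IPATH_STATUS_IB_NOCABLE, as the code below does.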
+ */ + if (lastlts == INFINIPATH_IBCS_LT_STATE_POLLACTIVE || + lastlts == INFINIPATH_IBCS_LT_STATE_POLLQUIET) { + if (!(dd->ipath_flags & IPATH_IB_AUTONEG_INPROG) && + (++dd->ipath_ibpollcnt == 40)) { + dd->ipath_flags |= IPATH_NOCABLE; + *dd->ipath_statusp |= + IPATH_STATUS_IB_NOCABLE; + ipath_cdbg(LINKVERB, "Set NOCABLE\n"); + } + ipath_cdbg(LINKVERB, "POLL change to %s (%x)\n", + ipath_ibcstatus_str[ltstate], ibstate); + goto skip_ibchange; + } + } + + dd->ipath_ibpollcnt = 0; /* not poll*, now */ + ipath_stats.sps_iblink++; + + if (ibstate != init && dd->ipath_lastlinkrecov && ipath_linkrecovery) { + u64 linkrecov; + linkrecov = ipath_snap_cntr(dd, + dd->ipath_cregs->cr_iblinkerrrecovcnt); + if (linkrecov != dd->ipath_lastlinkrecov) { + ipath_dbg("IB linkrecov up %Lx (%s %s) recov %Lu\n", + (unsigned long long) ibcs, + ib_linkstate(dd, ibcs), + ipath_ibcstatus_str[ltstate], + (unsigned long long) linkrecov); + /* and no more until active again */ + dd->ipath_lastlinkrecov = 0; + ipath_set_linkstate(dd, IPATH_IB_LINKDOWN); + goto skip_ibchange; + } + } + + if (ibstate == init || ibstate == arm || ibstate == active) { + *dd->ipath_statusp &= ~IPATH_STATUS_IB_NOCABLE; + if (ibstate == init || ibstate == arm) { + *dd->ipath_statusp &= ~IPATH_STATUS_IB_READY; + if (dd->ipath_flags & IPATH_LINKACTIVE) + signal_ib_event(dd, IB_EVENT_PORT_ERR); + } + if (ibstate == arm) { + dd->ipath_flags |= IPATH_LINKARMED; + dd->ipath_flags &= ~(IPATH_LINKUNK | + IPATH_LINKINIT | IPATH_LINKDOWN | + IPATH_LINKACTIVE | IPATH_NOCABLE); + ipath_hol_down(dd); + } else if (ibstate == init) { + /* + * set INIT and DOWN. Down is checked by + * most of the other code, but INIT is + * useful to know in a few places. + */ + dd->ipath_flags |= IPATH_LINKINIT | + IPATH_LINKDOWN; + dd->ipath_flags &= ~(IPATH_LINKUNK | + IPATH_LINKARMED | IPATH_LINKACTIVE | + IPATH_NOCABLE); + ipath_hol_down(dd); + } else { /* active */ + dd->ipath_lastlinkrecov = ipath_snap_cntr(dd, + dd->ipath_cregs->cr_iblinkerrrecovcnt); + *dd->ipath_statusp |= + IPATH_STATUS_IB_READY | IPATH_STATUS_IB_CONF; + dd->ipath_flags |= IPATH_LINKACTIVE; + dd->ipath_flags &= ~(IPATH_LINKUNK | IPATH_LINKINIT + | IPATH_LINKDOWN | IPATH_LINKARMED | + IPATH_NOCABLE); + if (dd->ipath_flags & IPATH_HAS_SEND_DMA) + ipath_restart_sdma(dd); + signal_ib_event(dd, IB_EVENT_PORT_ACTIVE); + /* LED active not handled in chip _f_updown */ + dd->ipath_f_setextled(dd, lstate, ltstate); + ipath_hol_up(dd); + } + + /* + * print after we've already done the work, so as not to + * delay the state changes and notifications, for debugging + */ + if (lstate == lastlstate) + ipath_cdbg(LINKVERB, "Unchanged from last: %s " + "(%x)\n", ib_linkstate(dd, ibcs), ibstate); + else + ipath_cdbg(VERBOSE, "Unit %u: link up to %s %s (%x)\n", + dd->ipath_unit, ib_linkstate(dd, ibcs), + ipath_ibcstatus_str[ltstate], ibstate); + } else { /* down */ + if (dd->ipath_flags & IPATH_LINKACTIVE) + signal_ib_event(dd, IB_EVENT_PORT_ERR); + dd->ipath_flags |= IPATH_LINKDOWN; + dd->ipath_flags &= ~(IPATH_LINKUNK | IPATH_LINKINIT + | IPATH_LINKACTIVE | + IPATH_LINKARMED); + *dd->ipath_statusp &= ~IPATH_STATUS_IB_READY; + dd->ipath_lli_counter = 0; + + if (lastlstate != INFINIPATH_IBCS_L_STATE_DOWN) + ipath_cdbg(VERBOSE, "Unit %u link state down " + "(state 0x%x), from %s\n", + dd->ipath_unit, lstate, + ib_linkstate(dd, dd->ipath_lastibcstat)); + else + ipath_cdbg(LINKVERB, "Unit %u link state changed " + "to %s (0x%x) from down (%x)\n", + dd->ipath_unit, + ipath_ibcstatus_str[ltstate], + ibstate, 
lastlstate); + } + +skip_ibchange: + dd->ipath_lastibcstat = ibcs; +done: + return; +} + +static void handle_supp_msgs(struct ipath_devdata *dd, + unsigned supp_msgs, char *msg, u32 msgsz) +{ + /* + * Print the message unless it's ibc status change only, which + * happens so often we never want to count it. + */ + if (dd->ipath_lasterror & ~INFINIPATH_E_IBSTATUSCHANGED) { + int iserr; + ipath_err_t mask; + iserr = ipath_decode_err(dd, msg, msgsz, + dd->ipath_lasterror & + ~INFINIPATH_E_IBSTATUSCHANGED); + + mask = INFINIPATH_E_RRCVEGRFULL | INFINIPATH_E_RRCVHDRFULL | + INFINIPATH_E_PKTERRS | INFINIPATH_E_SDMADISABLED; + + /* if we're in debug, then don't mask SDMADISABLED msgs */ + if (ipath_debug & __IPATH_DBG) + mask &= ~INFINIPATH_E_SDMADISABLED; + + if (dd->ipath_lasterror & ~mask) + ipath_dev_err(dd, "Suppressed %u messages for " + "fast-repeating errors (%s) (%llx)\n", + supp_msgs, msg, + (unsigned long long) + dd->ipath_lasterror); + else { + /* + * rcvegrfull and rcvhdrqfull are "normal", for some + * types of processes (mostly benchmarks) that send + * huge numbers of messages, while not processing + * them. So only complain about these at debug + * level. + */ + if (iserr) + ipath_dbg("Suppressed %u messages for %s\n", + supp_msgs, msg); + else + ipath_cdbg(ERRPKT, + "Suppressed %u messages for %s\n", + supp_msgs, msg); + } + } +} + +static unsigned handle_frequent_errors(struct ipath_devdata *dd, + ipath_err_t errs, char *msg, + u32 msgsz, int *noprint) +{ + unsigned long nc; + static unsigned long nextmsg_time; + static unsigned nmsgs, supp_msgs; + + /* + * Throttle back "fast" messages to no more than 10 per 5 seconds. + * This isn't perfect, but it's a reasonable heuristic. If we get + * more than 10, give a 6x longer delay. + */ + nc = jiffies; + if (nmsgs > 10) { + if (time_before(nc, nextmsg_time)) { + *noprint = 1; + if (!supp_msgs++) + nextmsg_time = nc + HZ * 3; + } + else if (supp_msgs) { + handle_supp_msgs(dd, supp_msgs, msg, msgsz); + supp_msgs = 0; + nmsgs = 0; + } + } + else if (!nmsgs++ || time_after(nc, nextmsg_time)) + nextmsg_time = nc + HZ / 2; + + return supp_msgs; +} + +static void handle_sdma_errors(struct ipath_devdata *dd, ipath_err_t errs) +{ + unsigned long flags; + int expected; + + if (ipath_debug & __IPATH_DBG) { + char msg[128]; + ipath_decode_err(dd, msg, sizeof msg, errs & + INFINIPATH_E_SDMAERRS); + ipath_dbg("errors %lx (%s)\n", (unsigned long)errs, msg); + } + if (ipath_debug & __IPATH_VERBDBG) { + unsigned long tl, hd, status, lengen; + tl = ipath_read_kreg64(dd, dd->ipath_kregs->kr_senddmatail); + hd = ipath_read_kreg64(dd, dd->ipath_kregs->kr_senddmahead); + status = ipath_read_kreg64(dd + , dd->ipath_kregs->kr_senddmastatus); + lengen = ipath_read_kreg64(dd, + dd->ipath_kregs->kr_senddmalengen); + ipath_cdbg(VERBOSE, "sdma tl 0x%lx hd 0x%lx status 0x%lx " + "lengen 0x%lx\n", tl, hd, status, lengen); + } + + spin_lock_irqsave(&dd->ipath_sdma_lock, flags); + __set_bit(IPATH_SDMA_DISABLED, &dd->ipath_sdma_status); + expected = test_bit(IPATH_SDMA_ABORTING, &dd->ipath_sdma_status); + spin_unlock_irqrestore(&dd->ipath_sdma_lock, flags); + if (!expected) + ipath_cancel_sends(dd, 1); +} + +static void handle_sdma_intr(struct ipath_devdata *dd, u64 istat) +{ + unsigned long flags; + int expected; + + if ((istat & INFINIPATH_I_SDMAINT) && + !test_bit(IPATH_SDMA_SHUTDOWN, &dd->ipath_sdma_status)) + ipath_sdma_intr(dd); + + if (istat & INFINIPATH_I_SDMADISABLED) { + expected = test_bit(IPATH_SDMA_ABORTING, + &dd->ipath_sdma_status); + ipath_dbg("%s 
SDmaDisabled intr\n", + expected ? "expected" : "unexpected"); + spin_lock_irqsave(&dd->ipath_sdma_lock, flags); + __set_bit(IPATH_SDMA_DISABLED, &dd->ipath_sdma_status); + spin_unlock_irqrestore(&dd->ipath_sdma_lock, flags); + if (!expected) + ipath_cancel_sends(dd, 1); + if (!test_bit(IPATH_SDMA_SHUTDOWN, &dd->ipath_sdma_status)) + tasklet_hi_schedule(&dd->ipath_sdma_abort_task); + } +} + +static int handle_hdrq_full(struct ipath_devdata *dd) +{ + int chkerrpkts = 0; + u32 hd, tl; + u32 i; + + ipath_stats.sps_hdrqfull++; + for (i = 0; i < dd->ipath_cfgports; i++) { + struct ipath_portdata *pd = dd->ipath_pd[i]; + + if (i == 0) { + /* + * For kernel receive queues, we just want to know + * if there are packets in the queue that we can + * process. + */ + if (pd->port_head != ipath_get_hdrqtail(pd)) + chkerrpkts |= 1 << i; + continue; + } + + /* Skip if user context is not open */ + if (!pd || !pd->port_cnt) + continue; + + /* Don't report the same point multiple times. */ + if (dd->ipath_flags & IPATH_NODMA_RTAIL) + tl = ipath_read_ureg32(dd, ur_rcvhdrtail, i); + else + tl = ipath_get_rcvhdrtail(pd); + if (tl == pd->port_lastrcvhdrqtail) + continue; + + hd = ipath_read_ureg32(dd, ur_rcvhdrhead, i); + if (hd == (tl + 1) || (!hd && tl == dd->ipath_hdrqlast)) { + pd->port_lastrcvhdrqtail = tl; + pd->port_hdrqfull++; + /* flush hdrqfull so that poll() sees it */ + wmb(); + wake_up_interruptible(&pd->port_wait); + } + } + + return chkerrpkts; +} + +static int handle_errors(struct ipath_devdata *dd, ipath_err_t errs) +{ + char msg[128]; + u64 ignore_this_time = 0; + u64 iserr = 0; + int chkerrpkts = 0, noprint = 0; + unsigned supp_msgs; + int log_idx; + + /* + * don't report errors that are masked, either at init + * (not set in ipath_errormask), or temporarily (set in + * ipath_maskederrs) + */ + errs &= dd->ipath_errormask & ~dd->ipath_maskederrs; + + supp_msgs = handle_frequent_errors(dd, errs, msg, (u32)sizeof msg, + &noprint); + + /* do these first, they are most important */ + if (errs & INFINIPATH_E_HARDWARE) { + /* reuse same msg buf */ + dd->ipath_f_handle_hwerrors(dd, msg, sizeof msg); + } else { + u64 mask; + for (log_idx = 0; log_idx < IPATH_EEP_LOG_CNT; ++log_idx) { + mask = dd->ipath_eep_st_masks[log_idx].errs_to_log; + if (errs & mask) + ipath_inc_eeprom_err(dd, log_idx, 1); + } + } + + if (errs & INFINIPATH_E_SDMAERRS) + handle_sdma_errors(dd, errs); + + if (!noprint && (errs & ~dd->ipath_e_bitsextant)) + ipath_dev_err(dd, "error interrupt with unknown errors " + "%llx set\n", (unsigned long long) + (errs & ~dd->ipath_e_bitsextant)); + + if (errs & E_SUM_ERRS) + ignore_this_time = handle_e_sum_errs(dd, errs); + else if ((errs & E_SUM_LINK_PKTERRS) && + !(dd->ipath_flags & IPATH_LINKACTIVE)) { + /* + * This can happen when SMA is trying to bring the link + * up, but the IB link changes state at the "wrong" time. + * The IB logic then complains that the packet isn't + * valid. 
We don't want to confuse people, so we just + * don't print them, except at debug + */ + ipath_dbg("Ignoring packet errors %llx, because link not " + "ACTIVE\n", (unsigned long long) errs); + ignore_this_time = errs & E_SUM_LINK_PKTERRS; + } + + if (supp_msgs == 250000) { + int s_iserr; + /* + * It's not entirely reasonable assuming that the errors set + * in the last clear period are all responsible for the + * problem, but the alternative is to assume it's the only + * ones on this particular interrupt, which also isn't great + */ + dd->ipath_maskederrs |= dd->ipath_lasterror | errs; + + dd->ipath_errormask &= ~dd->ipath_maskederrs; + ipath_write_kreg(dd, dd->ipath_kregs->kr_errormask, + dd->ipath_errormask); + s_iserr = ipath_decode_err(dd, msg, sizeof msg, + dd->ipath_maskederrs); + + if (dd->ipath_maskederrs & + ~(INFINIPATH_E_RRCVEGRFULL | + INFINIPATH_E_RRCVHDRFULL | INFINIPATH_E_PKTERRS)) + ipath_dev_err(dd, "Temporarily disabling " + "error(s) %llx reporting; too frequent (%s)\n", + (unsigned long long) dd->ipath_maskederrs, + msg); + else { + /* + * rcvegrfull and rcvhdrqfull are "normal", + * for some types of processes (mostly benchmarks) + * that send huge numbers of messages, while not + * processing them. So only complain about + * these at debug level. + */ + if (s_iserr) + ipath_dbg("Temporarily disabling reporting " + "too frequent queue full errors (%s)\n", + msg); + else + ipath_cdbg(ERRPKT, + "Temporarily disabling reporting too" + " frequent packet errors (%s)\n", + msg); + } + + /* + * Re-enable the masked errors after around 3 minutes. in + * ipath_get_faststats(). If we have a series of fast + * repeating but different errors, the interval will keep + * stretching out, but that's OK, as that's pretty + * catastrophic. + */ + dd->ipath_unmasktime = jiffies + HZ * 180; + } + + ipath_write_kreg(dd, dd->ipath_kregs->kr_errorclear, errs); + if (ignore_this_time) + errs &= ~ignore_this_time; + if (errs & ~dd->ipath_lasterror) { + errs &= ~dd->ipath_lasterror; + /* never suppress duplicate hwerrors or ibstatuschange */ + dd->ipath_lasterror |= errs & + ~(INFINIPATH_E_HARDWARE | + INFINIPATH_E_IBSTATUSCHANGED); + } + + if (errs & INFINIPATH_E_SENDSPECIALTRIGGER) { + dd->ipath_spectriggerhit++; + ipath_dbg("%lu special trigger hits\n", + dd->ipath_spectriggerhit); + } + + /* likely due to cancel; so suppress message unless verbose */ + if ((errs & (INFINIPATH_E_SPKTLEN | INFINIPATH_E_SPIOARMLAUNCH)) && + time_after(dd->ipath_lastcancel, jiffies)) { + /* armlaunch takes precedence; it often causes both. */ + ipath_cdbg(VERBOSE, + "Suppressed %s error (%llx) after sendbuf cancel\n", + (errs & INFINIPATH_E_SPIOARMLAUNCH) ? + "armlaunch" : "sendpktlen", (unsigned long long)errs); + errs &= ~(INFINIPATH_E_SPIOARMLAUNCH | INFINIPATH_E_SPKTLEN); + } + + if (!errs) + return 0; + + if (!noprint) { + ipath_err_t mask; + /* + * The ones we mask off are handled specially below + * or above. Also mask SDMADISABLED by default as it + * is too chatty. 
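+ * For example, ipath_decode_err() below is passed "errs & ~mask", so
+ * an interrupt whose only set bit is INFINIPATH_E_IBSTATUSCHANGED
+ * decodes to an empty message and nothing is printed here; the
+ * masked-off bits are reported by their dedicated handlers instead.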
+ */ + mask = INFINIPATH_E_IBSTATUSCHANGED | + INFINIPATH_E_RRCVEGRFULL | INFINIPATH_E_RRCVHDRFULL | + INFINIPATH_E_HARDWARE | INFINIPATH_E_SDMADISABLED; + + /* if we're in debug, then don't mask SDMADISABLED msgs */ + if (ipath_debug & __IPATH_DBG) + mask &= ~INFINIPATH_E_SDMADISABLED; + + ipath_decode_err(dd, msg, sizeof msg, errs & ~mask); + } else + /* so we don't need if (!noprint) at strlcat's below */ + *msg = 0; + + if (errs & E_SUM_PKTERRS) { + ipath_stats.sps_pkterrs++; + chkerrpkts = 1; + } + if (errs & E_SUM_ERRS) + ipath_stats.sps_errs++; + + if (errs & (INFINIPATH_E_RICRC | INFINIPATH_E_RVCRC)) { + ipath_stats.sps_crcerrs++; + chkerrpkts = 1; + } + iserr = errs & ~(E_SUM_PKTERRS | INFINIPATH_E_PKTERRS); + + + /* + * We don't want to print these two as they happen, or we can make + * the situation even worse, because it takes so long to print + * messages to serial consoles. Kernel ports get printed from + * fast_stats, no more than every 5 seconds, user ports get printed + * on close + */ + if (errs & INFINIPATH_E_RRCVHDRFULL) + chkerrpkts |= handle_hdrq_full(dd); + if (errs & INFINIPATH_E_RRCVEGRFULL) { + struct ipath_portdata *pd = dd->ipath_pd[0]; + + /* + * since this is of less importance and not likely to + * happen without also getting hdrfull, only count + * occurrences; don't check each port (or even the kernel + * vs user) + */ + ipath_stats.sps_etidfull++; + if (pd->port_head != ipath_get_hdrqtail(pd)) + chkerrpkts |= 1; + } + + /* + * do this before IBSTATUSCHANGED, in case both bits set in a single + * interrupt; we want the STATUSCHANGE to "win", so we do our + * internal copy of state machine correctly + */ + if (errs & INFINIPATH_E_RIBLOSTLINK) { + /* + * force through block below + */ + errs |= INFINIPATH_E_IBSTATUSCHANGED; + ipath_stats.sps_iblink++; + dd->ipath_flags |= IPATH_LINKDOWN; + dd->ipath_flags &= ~(IPATH_LINKUNK | IPATH_LINKINIT + | IPATH_LINKARMED | IPATH_LINKACTIVE); + *dd->ipath_statusp &= ~IPATH_STATUS_IB_READY; + + ipath_dbg("Lost link, link now down (%s)\n", + ipath_ibcstatus_str[ipath_read_kreg64(dd, + dd->ipath_kregs->kr_ibcstatus) & 0xf]); + } + if (errs & INFINIPATH_E_IBSTATUSCHANGED) + handle_e_ibstatuschanged(dd, errs); + + if (errs & INFINIPATH_E_RESET) { + if (!noprint) + ipath_dev_err(dd, "Got reset, requires re-init " + "(unload and reload driver)\n"); + dd->ipath_flags &= ~IPATH_INITTED; /* needs re-init */ + /* mark as having had error */ + *dd->ipath_statusp |= IPATH_STATUS_HWERROR; + *dd->ipath_statusp &= ~IPATH_STATUS_IB_CONF; + } + + if (!noprint && *msg) { + if (iserr) + ipath_dev_err(dd, "%s error\n", msg); + } + if (dd->ipath_state_wanted & dd->ipath_flags) { + ipath_cdbg(VERBOSE, "driver wanted state %x, iflags now %x, " + "waking\n", dd->ipath_state_wanted, + dd->ipath_flags); + wake_up_interruptible(&ipath_state_wait); + } + + return chkerrpkts; +} + +/* + * try to cleanup as much as possible for anything that might have gone + * wrong while in freeze mode, such as pio buffers being written by user + * processes (causing armlaunch), send errors due to going into freeze mode, + * etc., and try to avoid causing extra interrupts while doing so. + * Forcibly update the in-memory pioavail register copies after cleanup + * because the chip won't do it while in freeze mode (the register values + * themselves are kept correct). 
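+ * Roughly, the recovery sequence below is: mask error and general
+ * interrupts, cancel any outstanding sends, rewrite kr_control to
+ * drop the freeze bit, force the in-memory pioavail update, then
+ * clear hwerror/error status and re-enable the interrupt masks.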
+ * Make sure that we don't lose any important interrupts by using the chip + * feature that says that writing 0 to a bit in *clear that is set in + * *status will cause an interrupt to be generated again (if allowed by + * the *mask value). + */ +void ipath_clear_freeze(struct ipath_devdata *dd) +{ + /* disable error interrupts, to avoid confusion */ + ipath_write_kreg(dd, dd->ipath_kregs->kr_errormask, 0ULL); + + /* also disable interrupts; errormask is sometimes overwriten */ + ipath_write_kreg(dd, dd->ipath_kregs->kr_intmask, 0ULL); + + ipath_cancel_sends(dd, 1); + + /* clear the freeze, and be sure chip saw it */ + ipath_write_kreg(dd, dd->ipath_kregs->kr_control, + dd->ipath_control); + ipath_read_kreg64(dd, dd->ipath_kregs->kr_scratch); + + /* force in-memory update now we are out of freeze */ + ipath_force_pio_avail_update(dd); + + /* + * force new interrupt if any hwerr, error or interrupt bits are + * still set, and clear "safe" send packet errors related to freeze + * and cancelling sends. Re-enable error interrupts before possible + * force of re-interrupt on pending interrupts. + */ + ipath_write_kreg(dd, dd->ipath_kregs->kr_hwerrclear, 0ULL); + ipath_write_kreg(dd, dd->ipath_kregs->kr_errorclear, + E_SPKT_ERRS_IGNORE); + ipath_write_kreg(dd, dd->ipath_kregs->kr_errormask, + dd->ipath_errormask); + ipath_write_kreg(dd, dd->ipath_kregs->kr_intmask, -1LL); + ipath_write_kreg(dd, dd->ipath_kregs->kr_intclear, 0ULL); +} + + +/* this is separate to allow for better optimization of ipath_intr() */ + +static noinline void ipath_bad_intr(struct ipath_devdata *dd, u32 *unexpectp) +{ + /* + * sometimes happen during driver init and unload, don't want + * to process any interrupts at that point + */ + + /* this is just a bandaid, not a fix, if something goes badly + * wrong */ + if (++*unexpectp > 100) { + if (++*unexpectp > 105) { + /* + * ok, we must be taking somebody else's interrupts, + * due to a messed up mptable and/or PIRQ table, so + * unregister the interrupt. We've seen this during + * linuxbios development work, and it may happen in + * the future again. + */ + if (dd->pcidev && dd->ipath_irq) { + ipath_dev_err(dd, "Now %u unexpected " + "interrupts, unregistering " + "interrupt handler\n", + *unexpectp); + ipath_dbg("free_irq of irq %d\n", + dd->ipath_irq); + dd->ipath_f_free_irq(dd); + } + } + if (ipath_read_ireg(dd, dd->ipath_kregs->kr_intmask)) { + ipath_dev_err(dd, "%u unexpected interrupts, " + "disabling interrupts completely\n", + *unexpectp); + /* + * disable all interrupts, something is very wrong + */ + ipath_write_kreg(dd, dd->ipath_kregs->kr_intmask, + 0ULL); + } + } else if (*unexpectp > 1) + ipath_dbg("Interrupt when not ready, should not happen, " + "ignoring\n"); +} + +static noinline void ipath_bad_regread(struct ipath_devdata *dd) +{ + static int allbits; + + /* separate routine, for better optimization of ipath_intr() */ + + /* + * We print the message and disable interrupts, in hope of + * having a better chance of debugging the problem. 
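+ * (A register read returning all 1s usually means the read itself
+ * failed, e.g. the device dropped off the bus, rather than that every
+ * interrupt source is asserted, so continuing to service interrupts
+ * would be pointless.)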
+ */ + ipath_dev_err(dd, + "Read of interrupt status failed (all bits set)\n"); + if (allbits++) { + /* disable all interrupts, something is very wrong */ + ipath_write_kreg(dd, dd->ipath_kregs->kr_intmask, 0ULL); + if (allbits == 2) { + ipath_dev_err(dd, "Still bad interrupt status, " + "unregistering interrupt\n"); + dd->ipath_f_free_irq(dd); + } else if (allbits > 2) { + if ((allbits % 10000) == 0) + printk("."); + } else + ipath_dev_err(dd, "Disabling interrupts, " + "multiple errors\n"); + } +} + +static void handle_layer_pioavail(struct ipath_devdata *dd) +{ + unsigned long flags; + int ret; + + ret = ipath_ib_piobufavail(dd->verbs_dev); + if (ret > 0) + goto set; + + return; +set: + spin_lock_irqsave(&dd->ipath_sendctrl_lock, flags); + dd->ipath_sendctrl |= INFINIPATH_S_PIOINTBUFAVAIL; + ipath_write_kreg(dd, dd->ipath_kregs->kr_sendctrl, + dd->ipath_sendctrl); + ipath_read_kreg64(dd, dd->ipath_kregs->kr_scratch); + spin_unlock_irqrestore(&dd->ipath_sendctrl_lock, flags); +} + +/* + * Handle receive interrupts for user ports; this means a user + * process was waiting for a packet to arrive, and didn't want + * to poll + */ +static void handle_urcv(struct ipath_devdata *dd, u64 istat) +{ + u64 portr; + int i; + int rcvdint = 0; + + /* + * test_and_clear_bit(IPATH_PORT_WAITING_RCV) and + * test_and_clear_bit(IPATH_PORT_WAITING_URG) below + * would both like timely updates of the bits so that + * we don't pass them by unnecessarily. the rmb() + * here ensures that we see them promptly -- the + * corresponding wmb()'s are in ipath_poll_urgent() + * and ipath_poll_next()... + */ + rmb(); + portr = ((istat >> dd->ipath_i_rcvavail_shift) & + dd->ipath_i_rcvavail_mask) | + ((istat >> dd->ipath_i_rcvurg_shift) & + dd->ipath_i_rcvurg_mask); + for (i = 1; i < dd->ipath_cfgports; i++) { + struct ipath_portdata *pd = dd->ipath_pd[i]; + + if (portr & (1 << i) && pd && pd->port_cnt) { + if (test_and_clear_bit(IPATH_PORT_WAITING_RCV, + &pd->port_flag)) { + clear_bit(i + dd->ipath_r_intravail_shift, + &dd->ipath_rcvctrl); + wake_up_interruptible(&pd->port_wait); + rcvdint = 1; + } else if (test_and_clear_bit(IPATH_PORT_WAITING_URG, + &pd->port_flag)) { + pd->port_urgent++; + wake_up_interruptible(&pd->port_wait); + } + } + } + if (rcvdint) { + /* only want to take one interrupt, so turn off the rcv + * interrupt for all the ports that we set the rcv_waiting + * (but never for kernel port) + */ + ipath_write_kreg(dd, dd->ipath_kregs->kr_rcvctrl, + dd->ipath_rcvctrl); + } +} + +irqreturn_t ipath_intr(int irq, void *data) +{ + struct ipath_devdata *dd = data; + u64 istat, chk0rcv = 0; + ipath_err_t estat = 0; + irqreturn_t ret; + static unsigned unexpected = 0; + u64 kportrbits; + + ipath_stats.sps_ints++; + + if (dd->ipath_int_counter != (u32) -1) + dd->ipath_int_counter++; + + if (!(dd->ipath_flags & IPATH_PRESENT)) { + /* + * This return value is not great, but we do not want the + * interrupt core code to remove our interrupt handler + * because we don't appear to be handling an interrupt + * during a chip reset. + */ + return IRQ_HANDLED; + } + + /* + * this needs to be flags&initted, not statusp, so we keep + * taking interrupts even after link goes down, etc. + * Also, we *must* clear the interrupt at some point, or we won't + * take it again, which can be real bad for errors, etc... 
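+ * The overall flow below is roughly: read kr_intstatus, bail on 0
+ * (not ours) or all-1s (bad read), run handle_errors() if the error
+ * bit is set, handle GPIO sources, ack via kr_intclear, then drain
+ * kernel-port receives, user receives, SDMA, and PIO-buffer-available
+ * work, in that order.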
+ */ + + if (!(dd->ipath_flags & IPATH_INITTED)) { + ipath_bad_intr(dd, &unexpected); + ret = IRQ_NONE; + goto bail; + } + + istat = ipath_read_ireg(dd, dd->ipath_kregs->kr_intstatus); + + if (unlikely(!istat)) { + ipath_stats.sps_nullintr++; + ret = IRQ_NONE; /* not our interrupt, or already handled */ + goto bail; + } + if (unlikely(istat == -1)) { + ipath_bad_regread(dd); + /* don't know if it was our interrupt or not */ + ret = IRQ_NONE; + goto bail; + } + + if (unexpected) + unexpected = 0; + + if (unlikely(istat & ~dd->ipath_i_bitsextant)) + ipath_dev_err(dd, + "interrupt with unknown interrupts %Lx set\n", + (unsigned long long) + istat & ~dd->ipath_i_bitsextant); + else if (istat & ~INFINIPATH_I_ERROR) /* errors do own printing */ + ipath_cdbg(VERBOSE, "intr stat=0x%Lx\n", + (unsigned long long) istat); + + if (istat & INFINIPATH_I_ERROR) { + ipath_stats.sps_errints++; + estat = ipath_read_kreg64(dd, + dd->ipath_kregs->kr_errorstatus); + if (!estat) + dev_info(&dd->pcidev->dev, "error interrupt (%Lx), " + "but no error bits set!\n", + (unsigned long long) istat); + else if (estat == -1LL) + /* + * should we try clearing all, or hope next read + * works? + */ + ipath_dev_err(dd, "Read of error status failed " + "(all bits set); ignoring\n"); + else + chk0rcv |= handle_errors(dd, estat); + } + + if (istat & INFINIPATH_I_GPIO) { + /* + * GPIO interrupts fall in two broad classes: + * GPIO_2 indicates (on some HT4xx boards) that a packet + * has arrived for Port 0. Checking for this + * is controlled by flag IPATH_GPIO_INTR. + * GPIO_3..5 on IBA6120 Rev2 and IBA6110 Rev4 chips indicate + * errors that we need to count. Checking for this + * is controlled by flag IPATH_GPIO_ERRINTRS. + */ + u32 gpiostatus; + u32 to_clear = 0; + + gpiostatus = ipath_read_kreg32( + dd, dd->ipath_kregs->kr_gpio_status); + /* First the error-counter case. */ + if ((gpiostatus & IPATH_GPIO_ERRINTR_MASK) && + (dd->ipath_flags & IPATH_GPIO_ERRINTRS)) { + /* want to clear the bits we see asserted. */ + to_clear |= (gpiostatus & IPATH_GPIO_ERRINTR_MASK); + + /* + * Count appropriately, clear bits out of our copy, + * as they have been "handled". + */ + if (gpiostatus & (1 << IPATH_GPIO_RXUVL_BIT)) { + ipath_dbg("FlowCtl on UnsupVL\n"); + dd->ipath_rxfc_unsupvl_errs++; + } + if (gpiostatus & (1 << IPATH_GPIO_OVRUN_BIT)) { + ipath_dbg("Overrun Threshold exceeded\n"); + dd->ipath_overrun_thresh_errs++; + } + if (gpiostatus & (1 << IPATH_GPIO_LLI_BIT)) { + ipath_dbg("Local Link Integrity error\n"); + dd->ipath_lli_errs++; + } + gpiostatus &= ~IPATH_GPIO_ERRINTR_MASK; + } + /* Now the Port0 Receive case */ + if ((gpiostatus & (1 << IPATH_GPIO_PORT0_BIT)) && + (dd->ipath_flags & IPATH_GPIO_INTR)) { + /* + * GPIO status bit 2 is set, and we expected it. + * clear it and indicate in p0bits. + * This probably only happens if a Port0 pkt + * arrives at _just_ the wrong time, and we + * handle that by seting chk0rcv; + */ + to_clear |= (1 << IPATH_GPIO_PORT0_BIT); + gpiostatus &= ~(1 << IPATH_GPIO_PORT0_BIT); + chk0rcv = 1; + } + if (gpiostatus) { + /* + * Some unexpected bits remain. If they could have + * caused the interrupt, complain and clear. + * To avoid repetition of this condition, also clear + * the mask. It is almost certainly due to error. 
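+ * E.g. any bit set in both gpiostatus and the ipath_gpio_mask shadow
+ * is acked through kr_gpio_clear and also removed from the shadow
+ * (and kr_gpio_mask), so a stuck line cannot keep re-interrupting.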
+ */ + const u32 mask = (u32) dd->ipath_gpio_mask; + + if (mask & gpiostatus) { + ipath_dbg("Unexpected GPIO IRQ bits %x\n", + gpiostatus & mask); + to_clear |= (gpiostatus & mask); + dd->ipath_gpio_mask &= ~(gpiostatus & mask); + ipath_write_kreg(dd, + dd->ipath_kregs->kr_gpio_mask, + dd->ipath_gpio_mask); + } + } + if (to_clear) { + ipath_write_kreg(dd, dd->ipath_kregs->kr_gpio_clear, + (u64) to_clear); + } + } + + /* + * Clear the interrupt bits we found set, unless they are receive + * related, in which case we already cleared them above, and don't + * want to clear them again, because we might lose an interrupt. + * Clear it early, so we "know" know the chip will have seen this by + * the time we process the queue, and will re-interrupt if necessary. + * The processor itself won't take the interrupt again until we return. + */ + ipath_write_kreg(dd, dd->ipath_kregs->kr_intclear, istat); + + /* + * Handle kernel receive queues before checking for pio buffers + * available since receives can overflow; piobuf waiters can afford + * a few extra cycles, since they were waiting anyway, and user's + * waiting for receive are at the bottom. + */ + kportrbits = (1ULL << dd->ipath_i_rcvavail_shift) | + (1ULL << dd->ipath_i_rcvurg_shift); + if (chk0rcv || (istat & kportrbits)) { + istat &= ~kportrbits; + ipath_kreceive(dd->ipath_pd[0]); + } + + if (istat & ((dd->ipath_i_rcvavail_mask << dd->ipath_i_rcvavail_shift) | + (dd->ipath_i_rcvurg_mask << dd->ipath_i_rcvurg_shift))) + handle_urcv(dd, istat); + + if (istat & (INFINIPATH_I_SDMAINT | INFINIPATH_I_SDMADISABLED)) + handle_sdma_intr(dd, istat); + + if (istat & INFINIPATH_I_SPIOBUFAVAIL) { + unsigned long flags; + + spin_lock_irqsave(&dd->ipath_sendctrl_lock, flags); + dd->ipath_sendctrl &= ~INFINIPATH_S_PIOINTBUFAVAIL; + ipath_write_kreg(dd, dd->ipath_kregs->kr_sendctrl, + dd->ipath_sendctrl); + ipath_read_kreg64(dd, dd->ipath_kregs->kr_scratch); + spin_unlock_irqrestore(&dd->ipath_sendctrl_lock, flags); + + /* always process; sdma verbs uses PIO for acks and VL15 */ + handle_layer_pioavail(dd); + } + + ret = IRQ_HANDLED; + +bail: + return ret; +} diff --git a/kernel/drivers/infiniband/hw/ipath/ipath_kernel.h b/kernel/drivers/infiniband/hw/ipath/ipath_kernel.h new file mode 100644 index 000000000..e08db7020 --- /dev/null +++ b/kernel/drivers/infiniband/hw/ipath/ipath_kernel.h @@ -0,0 +1,1375 @@ +#ifndef _IPATH_KERNEL_H +#define _IPATH_KERNEL_H +/* + * Copyright (c) 2006, 2007, 2008 QLogic Corporation. All rights reserved. + * Copyright (c) 2003, 2004, 2005, 2006 PathScale, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +/* + * This header file is the base header file for infinipath kernel code + * ipath_user.h serves a similar purpose for user code. + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "ipath_common.h" +#include "ipath_debug.h" +#include "ipath_registers.h" + +/* only s/w major version of InfiniPath we can handle */ +#define IPATH_CHIP_VERS_MAJ 2U + +/* don't care about this except printing */ +#define IPATH_CHIP_VERS_MIN 0U + +/* temporary, maybe always */ +extern struct infinipath_stats ipath_stats; + +#define IPATH_CHIP_SWVERSION IPATH_CHIP_VERS_MAJ +/* + * First-cut critierion for "device is active" is + * two thousand dwords combined Tx, Rx traffic per + * 5-second interval. SMA packets are 64 dwords, + * and occur "a few per second", presumably each way. + */ +#define IPATH_TRAFFIC_ACTIVE_THRESHOLD (2000) +/* + * Struct used to indicate which errors are logged in each of the + * error-counters that are logged to EEPROM. A counter is incremented + * _once_ (saturating at 255) for each event with any bits set in + * the error or hwerror register masks below. + */ +#define IPATH_EEP_LOG_CNT (4) +struct ipath_eep_log_mask { + u64 errs_to_log; + u64 hwerrs_to_log; +}; + +struct ipath_portdata { + void **port_rcvegrbuf; + dma_addr_t *port_rcvegrbuf_phys; + /* rcvhdrq base, needs mmap before useful */ + void *port_rcvhdrq; + /* kernel virtual address where hdrqtail is updated */ + void *port_rcvhdrtail_kvaddr; + /* + * temp buffer for expected send setup, allocated at open, instead + * of each setup call + */ + void *port_tid_pg_list; + /* when waiting for rcv or pioavail */ + wait_queue_head_t port_wait; + /* + * rcvegr bufs base, physical, must fit + * in 44 bits so 32 bit programs mmap64 44 bit works) + */ + dma_addr_t port_rcvegr_phys; + /* mmap of hdrq, must fit in 44 bits */ + dma_addr_t port_rcvhdrq_phys; + dma_addr_t port_rcvhdrqtailaddr_phys; + /* + * number of opens (including slave subports) on this instance + * (ignoring forks, dup, etc. for now) + */ + int port_cnt; + /* + * how much space to leave at start of eager TID entries for + * protocol use, on each TID + */ + /* instead of calculating it */ + unsigned port_port; + /* non-zero if port is being shared. */ + u16 port_subport_cnt; + /* non-zero if port is being shared. 
*/ + u16 port_subport_id; + /* number of pio bufs for this port (all procs, if shared) */ + u32 port_piocnt; + /* first pio buffer for this port */ + u32 port_pio_base; + /* chip offset of PIO buffers for this port */ + u32 port_piobufs; + /* how many alloc_pages() chunks in port_rcvegrbuf_pages */ + u32 port_rcvegrbuf_chunks; + /* how many egrbufs per chunk */ + u32 port_rcvegrbufs_perchunk; + /* order for port_rcvegrbuf_pages */ + size_t port_rcvegrbuf_size; + /* rcvhdrq size (for freeing) */ + size_t port_rcvhdrq_size; + /* next expected TID to check when looking for free */ + u32 port_tidcursor; + /* next expected TID to check */ + unsigned long port_flag; + /* what happened */ + unsigned long int_flag; + /* WAIT_RCV that timed out, no interrupt */ + u32 port_rcvwait_to; + /* WAIT_PIO that timed out, no interrupt */ + u32 port_piowait_to; + /* WAIT_RCV already happened, no wait */ + u32 port_rcvnowait; + /* WAIT_PIO already happened, no wait */ + u32 port_pionowait; + /* total number of rcvhdrqfull errors */ + u32 port_hdrqfull; + /* + * Used to suppress multiple instances of same + * port staying stuck at same point. + */ + u32 port_lastrcvhdrqtail; + /* saved total number of rcvhdrqfull errors for poll edge trigger */ + u32 port_hdrqfull_poll; + /* total number of polled urgent packets */ + u32 port_urgent; + /* saved total number of polled urgent packets for poll edge trigger */ + u32 port_urgent_poll; + /* pid of process using this port */ + struct pid *port_pid; + struct pid *port_subpid[INFINIPATH_MAX_SUBPORT]; + /* same size as task_struct .comm[] */ + char port_comm[16]; + /* pkeys set by this use of this port */ + u16 port_pkeys[4]; + /* so file ops can get at unit */ + struct ipath_devdata *port_dd; + /* A page of memory for rcvhdrhead, rcvegrhead, rcvegrtail * N */ + void *subport_uregbase; + /* An array of pages for the eager receive buffers * N */ + void *subport_rcvegrbuf; + /* An array of pages for the eager header queue entries * N */ + void *subport_rcvhdr_base; + /* The version of the library which opened this port */ + u32 userversion; + /* Bitmask of active slaves */ + u32 active_slaves; + /* Type of packets or conditions we want to poll for */ + u16 poll_type; + /* port rcvhdrq head offset */ + u32 port_head; + /* receive packet sequence counter */ + u32 port_seq_cnt; +}; + +struct sk_buff; +struct ipath_sge_state; +struct ipath_verbs_txreq; + +/* + * control information for layered drivers + */ +struct _ipath_layer { + void *l_arg; +}; + +struct ipath_skbinfo { + struct sk_buff *skb; + dma_addr_t phys; +}; + +struct ipath_sdma_txreq { + int flags; + int sg_count; + union { + struct scatterlist *sg; + void *map_addr; + }; + void (*callback)(void *, int); + void *callback_cookie; + int callback_status; + u16 start_idx; /* sdma private */ + u16 next_descq_idx; /* sdma private */ + struct list_head list; /* sdma private */ +}; + +struct ipath_sdma_desc { + __le64 qw[2]; +}; + +#define IPATH_SDMA_TXREQ_F_USELARGEBUF 0x1 +#define IPATH_SDMA_TXREQ_F_HEADTOHOST 0x2 +#define IPATH_SDMA_TXREQ_F_INTREQ 0x4 +#define IPATH_SDMA_TXREQ_F_FREEBUF 0x8 +#define IPATH_SDMA_TXREQ_F_FREEDESC 0x10 +#define IPATH_SDMA_TXREQ_F_VL15 0x20 + +#define IPATH_SDMA_TXREQ_S_OK 0 +#define IPATH_SDMA_TXREQ_S_SENDERROR 1 +#define IPATH_SDMA_TXREQ_S_ABORTED 2 +#define IPATH_SDMA_TXREQ_S_SHUTDOWN 3 + +#define IPATH_SDMA_STATUS_SCORE_BOARD_DRAIN_IN_PROG (1ull << 63) +#define IPATH_SDMA_STATUS_ABORT_IN_PROG (1ull << 62) +#define IPATH_SDMA_STATUS_INTERNAL_SDMA_ENABLE (1ull << 61) +#define 
IPATH_SDMA_STATUS_SCB_EMPTY (1ull << 30) + +/* max dwords in small buffer packet */ +#define IPATH_SMALLBUF_DWORDS (dd->ipath_piosize2k >> 2) + +/* + * Possible IB config parameters for ipath_f_get/set_ib_cfg() + */ +#define IPATH_IB_CFG_LIDLMC 0 /* Get/set LID (LS16b) and Mask (MS16b) */ +#define IPATH_IB_CFG_HRTBT 1 /* Get/set Heartbeat off/enable/auto */ +#define IPATH_IB_HRTBT_ON 3 /* Heartbeat enabled, sent every 100msec */ +#define IPATH_IB_HRTBT_OFF 0 /* Heartbeat off */ +#define IPATH_IB_CFG_LWID_ENB 2 /* Get/set allowed Link-width */ +#define IPATH_IB_CFG_LWID 3 /* Get currently active Link-width */ +#define IPATH_IB_CFG_SPD_ENB 4 /* Get/set allowed Link speeds */ +#define IPATH_IB_CFG_SPD 5 /* Get current Link spd */ +#define IPATH_IB_CFG_RXPOL_ENB 6 /* Get/set Auto-RX-polarity enable */ +#define IPATH_IB_CFG_LREV_ENB 7 /* Get/set Auto-Lane-reversal enable */ +#define IPATH_IB_CFG_LINKLATENCY 8 /* Get Auto-Lane-reversal enable */ + + +struct ipath_devdata { + struct list_head ipath_list; + + struct ipath_kregs const *ipath_kregs; + struct ipath_cregs const *ipath_cregs; + + /* mem-mapped pointer to base of chip regs */ + u64 __iomem *ipath_kregbase; + /* end of mem-mapped chip space; range checking */ + u64 __iomem *ipath_kregend; + /* physical address of chip for io_remap, etc. */ + unsigned long ipath_physaddr; + /* base of memory alloced for ipath_kregbase, for free */ + u64 *ipath_kregalloc; + /* ipath_cfgports pointers */ + struct ipath_portdata **ipath_pd; + /* sk_buffs used by port 0 eager receive queue */ + struct ipath_skbinfo *ipath_port0_skbinfo; + /* kvirt address of 1st 2k pio buffer */ + void __iomem *ipath_pio2kbase; + /* kvirt address of 1st 4k pio buffer */ + void __iomem *ipath_pio4kbase; + /* + * points to area where PIOavail registers will be DMA'ed. + * Has to be on a page of it's own, because the page will be + * mapped into user program space. This copy is *ONLY* ever + * written by DMA, not by the driver! 
Need a copy per device + * when we get to multiple devices + */ + volatile __le64 *ipath_pioavailregs_dma; + /* physical address where updates occur */ + dma_addr_t ipath_pioavailregs_phys; + struct _ipath_layer ipath_layer; + /* setup intr */ + int (*ipath_f_intrsetup)(struct ipath_devdata *); + /* fallback to alternate interrupt type if possible */ + int (*ipath_f_intr_fallback)(struct ipath_devdata *); + /* setup on-chip bus config */ + int (*ipath_f_bus)(struct ipath_devdata *, struct pci_dev *); + /* hard reset chip */ + int (*ipath_f_reset)(struct ipath_devdata *); + int (*ipath_f_get_boardname)(struct ipath_devdata *, char *, + size_t); + void (*ipath_f_init_hwerrors)(struct ipath_devdata *); + void (*ipath_f_handle_hwerrors)(struct ipath_devdata *, char *, + size_t); + void (*ipath_f_quiet_serdes)(struct ipath_devdata *); + int (*ipath_f_bringup_serdes)(struct ipath_devdata *); + int (*ipath_f_early_init)(struct ipath_devdata *); + void (*ipath_f_clear_tids)(struct ipath_devdata *, unsigned); + void (*ipath_f_put_tid)(struct ipath_devdata *, u64 __iomem*, + u32, unsigned long); + void (*ipath_f_tidtemplate)(struct ipath_devdata *); + void (*ipath_f_cleanup)(struct ipath_devdata *); + void (*ipath_f_setextled)(struct ipath_devdata *, u64, u64); + /* fill out chip-specific fields */ + int (*ipath_f_get_base_info)(struct ipath_portdata *, void *); + /* free irq */ + void (*ipath_f_free_irq)(struct ipath_devdata *); + struct ipath_message_header *(*ipath_f_get_msgheader) + (struct ipath_devdata *, __le32 *); + void (*ipath_f_config_ports)(struct ipath_devdata *, ushort); + int (*ipath_f_get_ib_cfg)(struct ipath_devdata *, int); + int (*ipath_f_set_ib_cfg)(struct ipath_devdata *, int, u32); + void (*ipath_f_config_jint)(struct ipath_devdata *, u16 , u16); + void (*ipath_f_read_counters)(struct ipath_devdata *, + struct infinipath_counters *); + void (*ipath_f_xgxs_reset)(struct ipath_devdata *); + /* per chip actions needed for IB Link up/down changes */ + int (*ipath_f_ib_updown)(struct ipath_devdata *, int, u64); + + unsigned ipath_lastegr_idx; + struct ipath_ibdev *verbs_dev; + struct timer_list verbs_timer; + /* total dwords sent (summed from counter) */ + u64 ipath_sword; + /* total dwords rcvd (summed from counter) */ + u64 ipath_rword; + /* total packets sent (summed from counter) */ + u64 ipath_spkts; + /* total packets rcvd (summed from counter) */ + u64 ipath_rpkts; + /* ipath_statusp initially points to this. */ + u64 _ipath_status; + /* GUID for this interface, in network order */ + __be64 ipath_guid; + /* + * aggregrate of error bits reported since last cleared, for + * limiting of error reporting + */ + ipath_err_t ipath_lasterror; + /* + * aggregrate of error bits reported since last cleared, for + * limiting of hwerror reporting + */ + ipath_err_t ipath_lasthwerror; + /* errors masked because they occur too fast */ + ipath_err_t ipath_maskederrs; + u64 ipath_lastlinkrecov; /* link recoveries at last ACTIVE */ + /* these 5 fields are used to establish deltas for IB Symbol + * errors and linkrecovery errors. They can be reported on + * some chips during link negotiation prior to INIT, and with + * DDR when faking DDR negotiations with non-IBTA switches. + * The chip counters are adjusted at driver unload if there is + * a non-zero delta. 
+ */ + u64 ibdeltainprog; + u64 ibsymdelta; + u64 ibsymsnap; + u64 iblnkerrdelta; + u64 iblnkerrsnap; + + /* time in jiffies at which to re-enable maskederrs */ + unsigned long ipath_unmasktime; + /* count of egrfull errors, combined for all ports */ + u64 ipath_last_tidfull; + /* for ipath_qcheck() */ + u64 ipath_lastport0rcv_cnt; + /* template for writing TIDs */ + u64 ipath_tidtemplate; + /* value to write to free TIDs */ + u64 ipath_tidinvalid; + /* IBA6120 rcv interrupt setup */ + u64 ipath_rhdrhead_intr_off; + + /* size of memory at ipath_kregbase */ + u32 ipath_kregsize; + /* number of registers used for pioavail */ + u32 ipath_pioavregs; + /* IPATH_POLL, etc. */ + u32 ipath_flags; + /* ipath_flags driver is waiting for */ + u32 ipath_state_wanted; + /* last buffer for user use, first buf for kernel use is this + * index. */ + u32 ipath_lastport_piobuf; + /* is a stats timer active */ + u32 ipath_stats_timer_active; + /* number of interrupts for this device -- saturates... */ + u32 ipath_int_counter; + /* dwords sent read from counter */ + u32 ipath_lastsword; + /* dwords received read from counter */ + u32 ipath_lastrword; + /* sent packets read from counter */ + u32 ipath_lastspkts; + /* received packets read from counter */ + u32 ipath_lastrpkts; + /* pio bufs allocated per port */ + u32 ipath_pbufsport; + /* if remainder on bufs/port, ports < extrabuf get 1 extra */ + u32 ipath_ports_extrabuf; + u32 ipath_pioupd_thresh; /* update threshold, some chips */ + /* + * number of ports configured as max; zero is set to number chip + * supports, less gives more pio bufs/port, etc. + */ + u32 ipath_cfgports; + /* count of port 0 hdrqfull errors */ + u32 ipath_p0_hdrqfull; + /* port 0 number of receive eager buffers */ + u32 ipath_p0_rcvegrcnt; + + /* + * index of last piobuffer we used. Speeds up searching, by + * starting at this point. Doesn't matter if multiple cpu's use and + * update, last updater is only write that matters. Whenever it + * wraps, we update shadow copies. 
Need a copy per device when we + * get to multiple devices + */ + u32 ipath_lastpioindex; + u32 ipath_lastpioindexl; + /* max length of freezemsg */ + u32 ipath_freezelen; + /* + * consecutive times we wanted a PIO buffer but were unable to + * get one + */ + u32 ipath_consec_nopiobuf; + /* + * hint that we should update ipath_pioavailshadow before + * looking for a PIO buffer + */ + u32 ipath_upd_pio_shadow; + /* so we can rewrite it after a chip reset */ + u32 ipath_pcibar0; + /* so we can rewrite it after a chip reset */ + u32 ipath_pcibar1; + u32 ipath_x1_fix_tries; + u32 ipath_autoneg_tries; + u32 serdes_first_init_done; + + struct ipath_relock { + atomic_t ipath_relock_timer_active; + struct timer_list ipath_relock_timer; + unsigned int ipath_relock_interval; /* in jiffies */ + } ipath_relock_singleton; + + /* interrupt number */ + int ipath_irq; + /* HT/PCI Vendor ID (here for NodeInfo) */ + u16 ipath_vendorid; + /* HT/PCI Device ID (here for NodeInfo) */ + u16 ipath_deviceid; + /* offset in HT config space of slave/primary interface block */ + u8 ipath_ht_slave_off; + /* for write combining settings */ + unsigned long ipath_wc_cookie; + unsigned long ipath_wc_base; + unsigned long ipath_wc_len; + /* ref count for each pkey */ + atomic_t ipath_pkeyrefs[4]; + /* shadow copy of struct page *'s for exp tid pages */ + struct page **ipath_pageshadow; + /* shadow copy of dma handles for exp tid pages */ + dma_addr_t *ipath_physshadow; + u64 __iomem *ipath_egrtidbase; + /* lock to workaround chip bug 9437 and others */ + spinlock_t ipath_kernel_tid_lock; + spinlock_t ipath_user_tid_lock; + spinlock_t ipath_sendctrl_lock; + /* around ipath_pd and (user ports) port_cnt use (intr vs free) */ + spinlock_t ipath_uctxt_lock; + + /* + * IPATH_STATUS_*, + * this address is mapped readonly into user processes so they can + * get status cheaply, whenever they want. + */ + u64 *ipath_statusp; + /* freeze msg if hw error put chip in freeze */ + char *ipath_freezemsg; + /* pci access data structure */ + struct pci_dev *pcidev; + struct cdev *user_cdev; + struct cdev *diag_cdev; + struct device *user_dev; + struct device *diag_dev; + /* timer used to prevent stats overflow, error throttling, etc. 
*/ + struct timer_list ipath_stats_timer; + /* timer to verify interrupts work, and fallback if possible */ + struct timer_list ipath_intrchk_timer; + void *ipath_dummy_hdrq; /* used after port close */ + dma_addr_t ipath_dummy_hdrq_phys; + + /* SendDMA related entries */ + spinlock_t ipath_sdma_lock; + unsigned long ipath_sdma_status; + unsigned long ipath_sdma_abort_jiffies; + unsigned long ipath_sdma_abort_intr_timeout; + unsigned long ipath_sdma_buf_jiffies; + struct ipath_sdma_desc *ipath_sdma_descq; + u64 ipath_sdma_descq_added; + u64 ipath_sdma_descq_removed; + int ipath_sdma_desc_nreserved; + u16 ipath_sdma_descq_cnt; + u16 ipath_sdma_descq_tail; + u16 ipath_sdma_descq_head; + u16 ipath_sdma_next_intr; + u16 ipath_sdma_reset_wait; + u8 ipath_sdma_generation; + struct tasklet_struct ipath_sdma_abort_task; + struct tasklet_struct ipath_sdma_notify_task; + struct list_head ipath_sdma_activelist; + struct list_head ipath_sdma_notifylist; + atomic_t ipath_sdma_vl15_count; + struct timer_list ipath_sdma_vl15_timer; + + dma_addr_t ipath_sdma_descq_phys; + volatile __le64 *ipath_sdma_head_dma; + dma_addr_t ipath_sdma_head_phys; + + unsigned long ipath_ureg_align; /* user register alignment */ + + struct delayed_work ipath_autoneg_work; + wait_queue_head_t ipath_autoneg_wait; + + /* HoL blocking / user app forward-progress state */ + unsigned ipath_hol_state; + unsigned ipath_hol_next; + struct timer_list ipath_hol_timer; + + /* + * Shadow copies of registers; size indicates read access size. + * Most of them are readonly, but some are write-only register, + * where we manipulate the bits in the shadow copy, and then write + * the shadow copy to infinipath. + * + * We deliberately make most of these 32 bits, since they have + * restricted range. For any that we read, we won't to generate 32 + * bit accesses, since Opteron will generate 2 separate 32 bit HT + * transactions for a 64 bit read, and we want to avoid unnecessary + * HT transactions. + */ + + /* This is the 64 bit group */ + + /* + * shadow of pioavail, check to be sure it's large enough at + * init time. + */ + unsigned long ipath_pioavailshadow[8]; + /* bitmap of send buffers available for the kernel to use with PIO. */ + unsigned long ipath_pioavailkernel[8]; + /* shadow of kr_gpio_out, for rmw ops */ + u64 ipath_gpio_out; + /* shadow the gpio mask register */ + u64 ipath_gpio_mask; + /* shadow the gpio output enable, etc... */ + u64 ipath_extctrl; + /* kr_revision shadow */ + u64 ipath_revision; + /* + * shadow of ibcctrl, for interrupt handling of link changes, + * etc. 
+ */ + u64 ipath_ibcctrl; + /* + * last ibcstatus, to suppress "duplicate" status change messages, + * mostly from 2 to 3 + */ + u64 ipath_lastibcstat; + /* hwerrmask shadow */ + ipath_err_t ipath_hwerrmask; + ipath_err_t ipath_errormask; /* errormask shadow */ + /* interrupt config reg shadow */ + u64 ipath_intconfig; + /* kr_sendpiobufbase value */ + u64 ipath_piobufbase; + /* kr_ibcddrctrl shadow */ + u64 ipath_ibcddrctrl; + + /* these are the "32 bit" regs */ + + /* + * number of GUIDs in the flash for this interface; may need some + * rethinking for setting on other ifaces + */ + u32 ipath_nguid; + /* + * the following two are 32-bit bitmasks, but {test,clear,set}_bit + * all expect bit fields to be "unsigned long" + */ + /* shadow kr_rcvctrl */ + unsigned long ipath_rcvctrl; + /* shadow kr_sendctrl */ + unsigned long ipath_sendctrl; + /* to not count armlaunch after cancel */ + unsigned long ipath_lastcancel; + /* count cases where special trigger was needed (double write) */ + unsigned long ipath_spectriggerhit; + + /* value we put in kr_rcvhdrcnt */ + u32 ipath_rcvhdrcnt; + /* value we put in kr_rcvhdrsize */ + u32 ipath_rcvhdrsize; + /* value we put in kr_rcvhdrentsize */ + u32 ipath_rcvhdrentsize; + /* offset of last entry in rcvhdrq */ + u32 ipath_hdrqlast; + /* kr_portcnt value */ + u32 ipath_portcnt; + /* kr_pagealign value */ + u32 ipath_palign; + /* number of "2KB" PIO buffers */ + u32 ipath_piobcnt2k; + /* size in bytes of "2KB" PIO buffers */ + u32 ipath_piosize2k; + /* number of "4KB" PIO buffers */ + u32 ipath_piobcnt4k; + /* size in bytes of "4KB" PIO buffers */ + u32 ipath_piosize4k; + u32 ipath_pioreserved; /* reserved special-inkernel; */ + /* kr_rcvegrbase value */ + u32 ipath_rcvegrbase; + /* kr_rcvegrcnt value */ + u32 ipath_rcvegrcnt; + /* kr_rcvtidbase value */ + u32 ipath_rcvtidbase; + /* kr_rcvtidcnt value */ + u32 ipath_rcvtidcnt; + /* kr_sendregbase */ + u32 ipath_sregbase; + /* kr_userregbase */ + u32 ipath_uregbase; + /* kr_counterregbase */ + u32 ipath_cregbase; + /* shadow the control register contents */ + u32 ipath_control; + /* PCI revision register (HTC rev on FPGA) */ + u32 ipath_pcirev; + + /* chip address space used by 4k pio buffers */ + u32 ipath_4kalign; + /* The MTU programmed for this unit */ + u32 ipath_ibmtu; + /* + * The max size IB packet, included IB headers that we can send. + * Starts same as ipath_piosize, but is affected when ibmtu is + * changed, or by size of eager buffers + */ + u32 ipath_ibmaxlen; + /* + * ibmaxlen at init time, limited by chip and by receive buffer + * size. Not changed after init. + */ + u32 ipath_init_ibmaxlen; + /* size of each rcvegrbuffer */ + u32 ipath_rcvegrbufsize; + /* localbus width (1, 2,4,8,16,32) from config space */ + u32 ipath_lbus_width; + /* localbus speed (HT: 200,400,800,1000; PCIe 2500) */ + u32 ipath_lbus_speed; + /* + * number of sequential ibcstatus change for polling active/quiet + * (i.e., link not coming up). 
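+ * (Bumped in handle_e_ibstatuschanged() while consecutive samples stay
+ * in Polling.Active/Quiet; at 40 it triggers the NOCABLE status, and it
+ * is reset to zero on any non-polling transition.)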
+ */ + u32 ipath_ibpollcnt; + /* low and high portions of MSI capability/vector */ + u32 ipath_msi_lo; + /* saved after PCIe init for restore after reset */ + u32 ipath_msi_hi; + /* MSI data (vector) saved for restore */ + u16 ipath_msi_data; + /* MLID programmed for this instance */ + u16 ipath_mlid; + /* LID programmed for this instance */ + u16 ipath_lid; + /* list of pkeys programmed; 0 if not set */ + u16 ipath_pkeys[4]; + /* + * ASCII serial number, from flash, large enough for original + * all digit strings, and longer QLogic serial number format + */ + u8 ipath_serial[16]; + /* human readable board version */ + u8 ipath_boardversion[96]; + u8 ipath_lbus_info[32]; /* human readable localbus info */ + /* chip major rev, from ipath_revision */ + u8 ipath_majrev; + /* chip minor rev, from ipath_revision */ + u8 ipath_minrev; + /* board rev, from ipath_revision */ + u8 ipath_boardrev; + /* saved for restore after reset */ + u8 ipath_pci_cacheline; + /* LID mask control */ + u8 ipath_lmc; + /* link width supported */ + u8 ipath_link_width_supported; + /* link speed supported */ + u8 ipath_link_speed_supported; + u8 ipath_link_width_enabled; + u8 ipath_link_speed_enabled; + u8 ipath_link_width_active; + u8 ipath_link_speed_active; + /* Rx Polarity inversion (compensate for ~tx on partner) */ + u8 ipath_rx_pol_inv; + + u8 ipath_r_portenable_shift; + u8 ipath_r_intravail_shift; + u8 ipath_r_tailupd_shift; + u8 ipath_r_portcfg_shift; + + /* unit # of this chip, if present */ + int ipath_unit; + + /* local link integrity counter */ + u32 ipath_lli_counter; + /* local link integrity errors */ + u32 ipath_lli_errors; + /* + * Above counts only cases where _successive_ LocalLinkIntegrity + * errors were seen in the receive headers of kern-packets. + * Below are the three (monotonically increasing) counters + * maintained via GPIO interrupts on iba6120-rev2. + */ + u32 ipath_rxfc_unsupvl_errs; + u32 ipath_overrun_thresh_errs; + u32 ipath_lli_errs; + + /* + * Not all devices managed by a driver instance are the same + * type, so these fields must be per-device. + */ + u64 ipath_i_bitsextant; + ipath_err_t ipath_e_bitsextant; + ipath_err_t ipath_hwe_bitsextant; + + /* + * Below should be computable from number of ports, + * since they are never modified. + */ + u64 ipath_i_rcvavail_mask; + u64 ipath_i_rcvurg_mask; + u16 ipath_i_rcvurg_shift; + u16 ipath_i_rcvavail_shift; + + /* + * Register bits for selecting i2c direction and values, used for + * I2C serial flash. + */ + u8 ipath_gpio_sda_num; + u8 ipath_gpio_scl_num; + u8 ipath_i2c_chain_type; + u64 ipath_gpio_sda; + u64 ipath_gpio_scl; + + /* lock for doing RMW of shadows/regs for ExtCtrl and GPIO */ + spinlock_t ipath_gpio_lock; + + /* + * IB link and linktraining states and masks that vary per chip in + * some way. Set at init, to avoid each IB status change interrupt + */ + u8 ibcs_ls_shift; + u8 ibcs_lts_mask; + u32 ibcs_mask; + u32 ib_init; + u32 ib_arm; + u32 ib_active; + + u16 ipath_rhf_offset; /* offset of RHF within receive header entry */ + + /* + * shift/mask for linkcmd, linkinitcmd, maxpktlen in ibccontol + * reg. 
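+ * Illustrative only: a field value v with mask M and shift S would be
+ * installed as (ibcctrl & ~((u64)M << S)) | ((u64)(v & M) << S); the
+ * chip-specific code presumably does the equivalent with these fields.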
Changes for IBA7220 + */ + u8 ibcc_lic_mask; /* LinkInitCmd */ + u8 ibcc_lc_shift; /* LinkCmd */ + u8 ibcc_mpl_shift; /* Maxpktlen */ + + u8 delay_mult; + + /* used to override LED behavior */ + u8 ipath_led_override; /* Substituted for normal value, if non-zero */ + u16 ipath_led_override_timeoff; /* delta to next timer event */ + u8 ipath_led_override_vals[2]; /* Alternates per blink-frame */ + u8 ipath_led_override_phase; /* Just counts, LSB picks from vals[] */ + atomic_t ipath_led_override_timer_active; + /* Used to flash LEDs in override mode */ + struct timer_list ipath_led_override_timer; + + /* Support (including locks) for EEPROM logging of errors and time */ + /* control access to actual counters, timer */ + spinlock_t ipath_eep_st_lock; + /* control high-level access to EEPROM */ + struct mutex ipath_eep_lock; + /* Below inc'd by ipath_snap_cntrs(), locked by ipath_eep_st_lock */ + uint64_t ipath_traffic_wds; + /* active time is kept in seconds, but logged in hours */ + atomic_t ipath_active_time; + /* Below are nominal shadow of EEPROM, new since last EEPROM update */ + uint8_t ipath_eep_st_errs[IPATH_EEP_LOG_CNT]; + uint8_t ipath_eep_st_new_errs[IPATH_EEP_LOG_CNT]; + uint16_t ipath_eep_hrs; + /* + * masks for which bits of errs, hwerrs that cause + * each of the counters to increment. + */ + struct ipath_eep_log_mask ipath_eep_st_masks[IPATH_EEP_LOG_CNT]; + + /* interrupt mitigation reload register info */ + u16 ipath_jint_idle_ticks; /* idle clock ticks */ + u16 ipath_jint_max_packets; /* max packets across all ports */ + + /* + * lock for access to SerDes, and flags to sequence preset + * versus steady-state. 7220-only at the moment. + */ + spinlock_t ipath_sdepb_lock; + u8 ipath_presets_needed; /* Set if presets to be restored next DOWN */ +}; + +/* ipath_hol_state values (stopping/starting user proc, send flushing) */ +#define IPATH_HOL_UP 0 +#define IPATH_HOL_DOWN 1 +/* ipath_hol_next toggle values, used when hol_state IPATH_HOL_DOWN */ +#define IPATH_HOL_DOWNSTOP 0 +#define IPATH_HOL_DOWNCONT 1 + +/* bit positions for sdma_status */ +#define IPATH_SDMA_ABORTING 0 +#define IPATH_SDMA_DISARMED 1 +#define IPATH_SDMA_DISABLED 2 +#define IPATH_SDMA_LAYERBUF 3 +#define IPATH_SDMA_RUNNING 30 +#define IPATH_SDMA_SHUTDOWN 31 + +/* bit combinations that correspond to abort states */ +#define IPATH_SDMA_ABORT_NONE 0 +#define IPATH_SDMA_ABORT_ABORTING (1UL << IPATH_SDMA_ABORTING) +#define IPATH_SDMA_ABORT_DISARMED ((1UL << IPATH_SDMA_ABORTING) | \ + (1UL << IPATH_SDMA_DISARMED)) +#define IPATH_SDMA_ABORT_DISABLED ((1UL << IPATH_SDMA_ABORTING) | \ + (1UL << IPATH_SDMA_DISABLED)) +#define IPATH_SDMA_ABORT_ABORTED ((1UL << IPATH_SDMA_ABORTING) | \ + (1UL << IPATH_SDMA_DISARMED) | (1UL << IPATH_SDMA_DISABLED)) +#define IPATH_SDMA_ABORT_MASK ((1UL<private_data)->pd +#define subport_fp(fp) \ + ((struct ipath_filedata *)(fp)->private_data)->subport +#define tidcursor_fp(fp) \ + ((struct ipath_filedata *)(fp)->private_data)->tidcursor +#define user_sdma_queue_fp(fp) \ + ((struct ipath_filedata *)(fp)->private_data)->pq + +/* + * values for ipath_flags + */ + /* chip can report link latency (IB 1.2) */ +#define IPATH_HAS_LINK_LATENCY 0x1 + /* The chip is up and initted */ +#define IPATH_INITTED 0x2 + /* set if any user code has set kr_rcvhdrsize */ +#define IPATH_RCVHDRSZ_SET 0x4 + /* The chip is present and valid for accesses */ +#define IPATH_PRESENT 0x8 + /* HT link0 is only 8 bits wide, ignore upper byte crc + * errors, etc. 
*/ +#define IPATH_8BIT_IN_HT0 0x10 + /* HT link1 is only 8 bits wide, ignore upper byte crc + * errors, etc. */ +#define IPATH_8BIT_IN_HT1 0x20 + /* The link is down */ +#define IPATH_LINKDOWN 0x40 + /* The link level is up (0x11) */ +#define IPATH_LINKINIT 0x80 + /* The link is in the armed (0x21) state */ +#define IPATH_LINKARMED 0x100 + /* The link is in the active (0x31) state */ +#define IPATH_LINKACTIVE 0x200 + /* link current state is unknown */ +#define IPATH_LINKUNK 0x400 + /* Write combining flush needed for PIO */ +#define IPATH_PIO_FLUSH_WC 0x1000 + /* DMA Receive tail pointer */ +#define IPATH_NODMA_RTAIL 0x2000 + /* no IB cable, or no device on IB cable */ +#define IPATH_NOCABLE 0x4000 + /* Supports port zero per packet receive interrupts via + * GPIO */ +#define IPATH_GPIO_INTR 0x8000 + /* uses the coded 4byte TID, not 8 byte */ +#define IPATH_4BYTE_TID 0x10000 + /* packet/word counters are 32 bit, else those 4 counters + * are 64bit */ +#define IPATH_32BITCOUNTERS 0x20000 + /* Interrupt register is 64 bits */ +#define IPATH_INTREG_64 0x40000 + /* can miss port0 rx interrupts */ +#define IPATH_DISABLED 0x80000 /* administratively disabled */ + /* Use GPIO interrupts for new counters */ +#define IPATH_GPIO_ERRINTRS 0x100000 +#define IPATH_SWAP_PIOBUFS 0x200000 + /* Supports Send DMA */ +#define IPATH_HAS_SEND_DMA 0x400000 + /* Supports Send Count (not just word count) in PBC */ +#define IPATH_HAS_PBC_CNT 0x800000 + /* Suppress heartbeat, even if turning off loopback */ +#define IPATH_NO_HRTBT 0x1000000 +#define IPATH_HAS_THRESH_UPDATE 0x4000000 +#define IPATH_HAS_MULT_IB_SPEED 0x8000000 +#define IPATH_IB_AUTONEG_INPROG 0x10000000 +#define IPATH_IB_AUTONEG_FAILED 0x20000000 + /* Linkdown-disable intentionally, Do not attempt to bring up */ +#define IPATH_IB_LINK_DISABLED 0x40000000 +#define IPATH_IB_FORCE_NOTIFY 0x80000000 /* force notify on next ib change */ + +/* Bits in GPIO for the added interrupts */ +#define IPATH_GPIO_PORT0_BIT 2 +#define IPATH_GPIO_RXUVL_BIT 3 +#define IPATH_GPIO_OVRUN_BIT 4 +#define IPATH_GPIO_LLI_BIT 5 +#define IPATH_GPIO_ERRINTR_MASK 0x38 + +/* portdata flag bit offsets */ + /* waiting for a packet to arrive */ +#define IPATH_PORT_WAITING_RCV 2 + /* master has not finished initializing */ +#define IPATH_PORT_MASTER_UNINIT 4 + /* waiting for an urgent packet to arrive */ +#define IPATH_PORT_WAITING_URG 5 + +/* free up any allocated data at closes */ +void ipath_free_data(struct ipath_portdata *dd); +u32 __iomem *ipath_getpiobuf(struct ipath_devdata *, u32, u32 *); +void ipath_chg_pioavailkernel(struct ipath_devdata *dd, unsigned start, + unsigned len, int avail); +void ipath_init_iba6110_funcs(struct ipath_devdata *); +void ipath_get_eeprom_info(struct ipath_devdata *); +int ipath_update_eeprom_log(struct ipath_devdata *dd); +void ipath_inc_eeprom_err(struct ipath_devdata *dd, u32 eidx, u32 incr); +u64 ipath_snap_cntr(struct ipath_devdata *, ipath_creg); +void ipath_disarm_senderrbufs(struct ipath_devdata *); +void ipath_force_pio_avail_update(struct ipath_devdata *); +void signal_ib_event(struct ipath_devdata *dd, enum ib_event_type ev); + +/* + * Set LED override, only the two LSBs have "public" meaning, but + * any non-zero value substitutes them for the Link and LinkTrain + * LED states. 
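+ * Writing zero through ipath_set_led_override() cancels the override and returns the LEDs to normal Link/LinkTrain indication.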
+ */ +#define IPATH_LED_PHYS 1 /* Physical (linktraining) GREEN LED */ +#define IPATH_LED_LOG 2 /* Logical (link) YELLOW LED */ +void ipath_set_led_override(struct ipath_devdata *dd, unsigned int val); + +/* send dma routines */ +int setup_sdma(struct ipath_devdata *); +void teardown_sdma(struct ipath_devdata *); +void ipath_restart_sdma(struct ipath_devdata *); +void ipath_sdma_intr(struct ipath_devdata *); +int ipath_sdma_verbs_send(struct ipath_devdata *, struct ipath_sge_state *, + u32, struct ipath_verbs_txreq *); +/* ipath_sdma_lock should be locked before calling this. */ +int ipath_sdma_make_progress(struct ipath_devdata *dd); + +/* must be called under ipath_sdma_lock */ +static inline u16 ipath_sdma_descq_freecnt(const struct ipath_devdata *dd) +{ + return dd->ipath_sdma_descq_cnt - + (dd->ipath_sdma_descq_added - dd->ipath_sdma_descq_removed) - + 1 - dd->ipath_sdma_desc_nreserved; +} + +static inline void ipath_sdma_desc_reserve(struct ipath_devdata *dd, u16 cnt) +{ + dd->ipath_sdma_desc_nreserved += cnt; +} + +static inline void ipath_sdma_desc_unreserve(struct ipath_devdata *dd, u16 cnt) +{ + dd->ipath_sdma_desc_nreserved -= cnt; +} + +/* + * number of words used for protocol header if not set by ipath_userinit(); + */ +#define IPATH_DFLT_RCVHDRSIZE 9 + +int ipath_get_user_pages(unsigned long, size_t, struct page **); +void ipath_release_user_pages(struct page **, size_t); +void ipath_release_user_pages_on_close(struct page **, size_t); +int ipath_eeprom_read(struct ipath_devdata *, u8, void *, int); +int ipath_eeprom_write(struct ipath_devdata *, u8, const void *, int); +int ipath_tempsense_read(struct ipath_devdata *, u8 regnum); +int ipath_tempsense_write(struct ipath_devdata *, u8 regnum, u8 data); + +/* these are used for the registers that vary with port */ +void ipath_write_kreg_port(const struct ipath_devdata *, ipath_kreg, + unsigned, u64); + +/* + * We could have a single register get/put routine, that takes a group type, + * but this is somewhat clearer and cleaner. It also gives us some error + * checking. 64 bit register reads should always work, but are inefficient + * on opteron (the northbridge always generates 2 separate HT 32 bit reads), + * so we use kreg32 wherever possible. User register and counter register + * reads are always 32 bit reads, so only one form of those routines. + */ + +/* + * At the moment, none of the s-registers are writable, so no + * ipath_write_sreg(). + */ + +/** + * ipath_read_ureg32 - read 32-bit virtualized per-port register + * @dd: device + * @regno: register number + * @port: port number + * + * Return the contents of a register that is virtualized to be per port. + * Returns -1 on errors (not distinguishable from valid contents at + * runtime; we may add a separate error variable at some point). + */ +static inline u32 ipath_read_ureg32(const struct ipath_devdata *dd, + ipath_ureg regno, int port) +{ + if (!dd->ipath_kregbase || !(dd->ipath_flags & IPATH_PRESENT)) + return 0; + + return readl(regno + (u64 __iomem *) + (dd->ipath_uregbase + + (char __iomem *)dd->ipath_kregbase + + dd->ipath_ureg_align * port)); +} + +/** + * ipath_write_ureg - write 32-bit virtualized per-port register + * @dd: device + * @regno: register number + * @value: value + * @port: port + * + * Write the contents of a register that is virtualized to be per port. 
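+ * The write is silently skipped when the chip register space is not mapped (dd->ipath_kregbase is NULL).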
+ */ +static inline void ipath_write_ureg(const struct ipath_devdata *dd, + ipath_ureg regno, u64 value, int port) +{ + u64 __iomem *ubase = (u64 __iomem *) + (dd->ipath_uregbase + (char __iomem *) dd->ipath_kregbase + + dd->ipath_ureg_align * port); + if (dd->ipath_kregbase) + writeq(value, &ubase[regno]); +} + +static inline u32 ipath_read_kreg32(const struct ipath_devdata *dd, + ipath_kreg regno) +{ + if (!dd->ipath_kregbase || !(dd->ipath_flags & IPATH_PRESENT)) + return -1; + return readl((u32 __iomem *) & dd->ipath_kregbase[regno]); +} + +static inline u64 ipath_read_kreg64(const struct ipath_devdata *dd, + ipath_kreg regno) +{ + if (!dd->ipath_kregbase || !(dd->ipath_flags & IPATH_PRESENT)) + return -1; + + return readq(&dd->ipath_kregbase[regno]); +} + +static inline void ipath_write_kreg(const struct ipath_devdata *dd, + ipath_kreg regno, u64 value) +{ + if (dd->ipath_kregbase) + writeq(value, &dd->ipath_kregbase[regno]); +} + +static inline u64 ipath_read_creg(const struct ipath_devdata *dd, + ipath_sreg regno) +{ + if (!dd->ipath_kregbase || !(dd->ipath_flags & IPATH_PRESENT)) + return 0; + + return readq(regno + (u64 __iomem *) + (dd->ipath_cregbase + + (char __iomem *)dd->ipath_kregbase)); +} + +static inline u32 ipath_read_creg32(const struct ipath_devdata *dd, + ipath_sreg regno) +{ + if (!dd->ipath_kregbase || !(dd->ipath_flags & IPATH_PRESENT)) + return 0; + return readl(regno + (u64 __iomem *) + (dd->ipath_cregbase + + (char __iomem *)dd->ipath_kregbase)); +} + +static inline void ipath_write_creg(const struct ipath_devdata *dd, + ipath_creg regno, u64 value) +{ + if (dd->ipath_kregbase) + writeq(value, regno + (u64 __iomem *) + (dd->ipath_cregbase + + (char __iomem *)dd->ipath_kregbase)); +} + +static inline void ipath_clear_rcvhdrtail(const struct ipath_portdata *pd) +{ + *((u64 *) pd->port_rcvhdrtail_kvaddr) = 0ULL; +} + +static inline u32 ipath_get_rcvhdrtail(const struct ipath_portdata *pd) +{ + return (u32) le64_to_cpu(*((volatile __le64 *) + pd->port_rcvhdrtail_kvaddr)); +} + +static inline u32 ipath_get_hdrqtail(const struct ipath_portdata *pd) +{ + const struct ipath_devdata *dd = pd->port_dd; + u32 hdrqtail; + + if (dd->ipath_flags & IPATH_NODMA_RTAIL) { + __le32 *rhf_addr; + u32 seq; + + rhf_addr = (__le32 *) pd->port_rcvhdrq + + pd->port_head + dd->ipath_rhf_offset; + seq = ipath_hdrget_seq(rhf_addr); + hdrqtail = pd->port_head; + if (seq == pd->port_seq_cnt) + hdrqtail++; + } else + hdrqtail = ipath_get_rcvhdrtail(pd); + + return hdrqtail; +} + +static inline u64 ipath_read_ireg(const struct ipath_devdata *dd, ipath_kreg r) +{ + return (dd->ipath_flags & IPATH_INTREG_64) ? + ipath_read_kreg64(dd, r) : ipath_read_kreg32(dd, r); +} + +/* + * from contents of IBCStatus (or a saved copy), return linkstate + * Report ACTIVE_DEFER as ACTIVE, because we treat them the same + * everywhere, anyway (and should be, for almost all purposes). 
+ */ +static inline u32 ipath_ib_linkstate(struct ipath_devdata *dd, u64 ibcs) +{ + u32 state = (u32)(ibcs >> dd->ibcs_ls_shift) & + INFINIPATH_IBCS_LINKSTATE_MASK; + if (state == INFINIPATH_IBCS_L_STATE_ACT_DEFER) + state = INFINIPATH_IBCS_L_STATE_ACTIVE; + return state; +} + +/* from contents of IBCStatus (or a saved copy), return linktrainingstate */ +static inline u32 ipath_ib_linktrstate(struct ipath_devdata *dd, u64 ibcs) +{ + return (u32)(ibcs >> INFINIPATH_IBCS_LINKTRAININGSTATE_SHIFT) & + dd->ibcs_lts_mask; +} + +/* + * from contents of IBCStatus (or a saved copy), return logical link state + * combination of link state and linktraining state (down, active, init, + * arm, etc. + */ +static inline u32 ipath_ib_state(struct ipath_devdata *dd, u64 ibcs) +{ + u32 ibs; + ibs = (u32)(ibcs >> INFINIPATH_IBCS_LINKTRAININGSTATE_SHIFT) & + dd->ibcs_lts_mask; + ibs |= (u32)(ibcs & + (INFINIPATH_IBCS_LINKSTATE_MASK << dd->ibcs_ls_shift)); + return ibs; +} + +/* + * sysfs interface. + */ + +struct device_driver; + +extern const char ib_ipath_version[]; + +extern const struct attribute_group *ipath_driver_attr_groups[]; + +int ipath_device_create_group(struct device *, struct ipath_devdata *); +void ipath_device_remove_group(struct device *, struct ipath_devdata *); +int ipath_expose_reset(struct device *); + +int ipath_init_ipathfs(void); +void ipath_exit_ipathfs(void); +int ipathfs_add_device(struct ipath_devdata *); +int ipathfs_remove_device(struct ipath_devdata *); + +/* + * dma_addr wrappers - all 0's invalid for hw + */ +dma_addr_t ipath_map_page(struct pci_dev *, struct page *, unsigned long, + size_t, int); +dma_addr_t ipath_map_single(struct pci_dev *, void *, size_t, int); +const char *ipath_get_unit_name(int unit); + +/* + * Flush write combining store buffers (if present) and perform a write + * barrier. + */ +#if defined(CONFIG_X86_64) +#define ipath_flush_wc() asm volatile("sfence" ::: "memory") +#else +#define ipath_flush_wc() wmb() +#endif + +extern unsigned ipath_debug; /* debugging bit mask */ +extern unsigned ipath_linkrecovery; +extern unsigned ipath_mtu4096; +extern struct mutex ipath_mutex; + +#define IPATH_DRV_NAME "ib_ipath" +#define IPATH_MAJOR 233 +#define IPATH_USER_MINOR_BASE 0 +#define IPATH_DIAGPKT_MINOR 127 +#define IPATH_DIAG_MINOR_BASE 129 +#define IPATH_NMINORS 255 + +#define ipath_dev_err(dd,fmt,...) \ + do { \ + const struct ipath_devdata *__dd = (dd); \ + if (__dd->pcidev) \ + dev_err(&__dd->pcidev->dev, "%s: " fmt, \ + ipath_get_unit_name(__dd->ipath_unit), \ + ##__VA_ARGS__); \ + else \ + printk(KERN_ERR IPATH_DRV_NAME ": %s: " fmt, \ + ipath_get_unit_name(__dd->ipath_unit), \ + ##__VA_ARGS__); \ + } while (0) + +#if _IPATH_DEBUGGING + +# define __IPATH_DBG_WHICH(which,fmt,...) \ + do { \ + if (unlikely(ipath_debug & (which))) \ + printk(KERN_DEBUG IPATH_DRV_NAME ": %s: " fmt, \ + __func__,##__VA_ARGS__); \ + } while(0) + +# define ipath_dbg(fmt,...) \ + __IPATH_DBG_WHICH(__IPATH_DBG,fmt,##__VA_ARGS__) +# define ipath_cdbg(which,fmt,...) \ + __IPATH_DBG_WHICH(__IPATH_##which##DBG,fmt,##__VA_ARGS__) + +#else /* ! _IPATH_DEBUGGING */ + +# define ipath_dbg(fmt,...) +# define ipath_cdbg(which,fmt,...) + +#endif /* _IPATH_DEBUGGING */ + +/* + * this is used for formatting hw error messages... + */ +struct ipath_hwerror_msgs { + u64 mask; + const char *msg; +}; + +#define INFINIPATH_HWE_MSG(a, b) { .mask = INFINIPATH_HWE_##a, .msg = b } + +/* in ipath_intr.c... 
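+ * ipath_format_hwerrors() matches hwerrs against the nhwerrmsgs mask/msg entries and writes the corresponding messages into msg, at most lmsg bytes.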
*/ +void ipath_format_hwerrors(u64 hwerrs, + const struct ipath_hwerror_msgs *hwerrmsgs, + size_t nhwerrmsgs, + char *msg, size_t lmsg); + +#endif /* _IPATH_KERNEL_H */ diff --git a/kernel/drivers/infiniband/hw/ipath/ipath_keys.c b/kernel/drivers/infiniband/hw/ipath/ipath_keys.c new file mode 100644 index 000000000..c0e933fec --- /dev/null +++ b/kernel/drivers/infiniband/hw/ipath/ipath_keys.c @@ -0,0 +1,270 @@ +/* + * Copyright (c) 2006, 2007 QLogic Corporation. All rights reserved. + * Copyright (c) 2005, 2006 PathScale, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include + +#include "ipath_verbs.h" +#include "ipath_kernel.h" + +/** + * ipath_alloc_lkey - allocate an lkey + * @rkt: lkey table in which to allocate the lkey + * @mr: memory region that this lkey protects + * + * Returns 1 if successful, otherwise returns 0. + */ + +int ipath_alloc_lkey(struct ipath_lkey_table *rkt, struct ipath_mregion *mr) +{ + unsigned long flags; + u32 r; + u32 n; + int ret; + + spin_lock_irqsave(&rkt->lock, flags); + + /* Find the next available LKEY */ + r = n = rkt->next; + for (;;) { + if (rkt->table[r] == NULL) + break; + r = (r + 1) & (rkt->max - 1); + if (r == n) { + spin_unlock_irqrestore(&rkt->lock, flags); + ipath_dbg("LKEY table full\n"); + ret = 0; + goto bail; + } + } + rkt->next = (r + 1) & (rkt->max - 1); + /* + * Make sure lkey is never zero which is reserved to indicate an + * unrestricted LKEY. 
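+ * The lkey packs the table index into its top bits and the bumped generation count from bit 8 up, so a reused table slot hands out a different lkey.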
+ */ + rkt->gen++; + mr->lkey = (r << (32 - ib_ipath_lkey_table_size)) | + ((((1 << (24 - ib_ipath_lkey_table_size)) - 1) & rkt->gen) + << 8); + if (mr->lkey == 0) { + mr->lkey |= 1 << 8; + rkt->gen++; + } + rkt->table[r] = mr; + spin_unlock_irqrestore(&rkt->lock, flags); + + ret = 1; + +bail: + return ret; +} + +/** + * ipath_free_lkey - free an lkey + * @rkt: table from which to free the lkey + * @lkey: lkey id to free + */ +void ipath_free_lkey(struct ipath_lkey_table *rkt, u32 lkey) +{ + unsigned long flags; + u32 r; + + if (lkey == 0) + return; + r = lkey >> (32 - ib_ipath_lkey_table_size); + spin_lock_irqsave(&rkt->lock, flags); + rkt->table[r] = NULL; + spin_unlock_irqrestore(&rkt->lock, flags); +} + +/** + * ipath_lkey_ok - check IB SGE for validity and initialize + * @rkt: table containing lkey to check SGE against + * @isge: outgoing internal SGE + * @sge: SGE to check + * @acc: access flags + * + * Return 1 if valid and successful, otherwise returns 0. + * + * Check the IB SGE for validity and initialize our internal version + * of it. + */ +int ipath_lkey_ok(struct ipath_qp *qp, struct ipath_sge *isge, + struct ib_sge *sge, int acc) +{ + struct ipath_lkey_table *rkt = &to_idev(qp->ibqp.device)->lk_table; + struct ipath_mregion *mr; + unsigned n, m; + size_t off; + int ret; + + /* + * We use LKEY == zero for kernel virtual addresses + * (see ipath_get_dma_mr and ipath_dma.c). + */ + if (sge->lkey == 0) { + /* always a kernel port, no locking needed */ + struct ipath_pd *pd = to_ipd(qp->ibqp.pd); + + if (pd->user) { + ret = 0; + goto bail; + } + isge->mr = NULL; + isge->vaddr = (void *) sge->addr; + isge->length = sge->length; + isge->sge_length = sge->length; + ret = 1; + goto bail; + } + mr = rkt->table[(sge->lkey >> (32 - ib_ipath_lkey_table_size))]; + if (unlikely(mr == NULL || mr->lkey != sge->lkey || + qp->ibqp.pd != mr->pd)) { + ret = 0; + goto bail; + } + + off = sge->addr - mr->user_base; + if (unlikely(sge->addr < mr->user_base || + off + sge->length > mr->length || + (mr->access_flags & acc) != acc)) { + ret = 0; + goto bail; + } + + off += mr->offset; + m = 0; + n = 0; + while (off >= mr->map[m]->segs[n].length) { + off -= mr->map[m]->segs[n].length; + n++; + if (n >= IPATH_SEGSZ) { + m++; + n = 0; + } + } + isge->mr = mr; + isge->vaddr = mr->map[m]->segs[n].vaddr + off; + isge->length = mr->map[m]->segs[n].length - off; + isge->sge_length = sge->length; + isge->m = m; + isge->n = n; + + ret = 1; + +bail: + return ret; +} + +/** + * ipath_rkey_ok - check the IB virtual address, length, and RKEY + * @dev: infiniband device + * @ss: SGE state + * @len: length of data + * @vaddr: virtual address to place data + * @rkey: rkey to check + * @acc: access flags + * + * Return 1 if successful, otherwise 0. + */ +int ipath_rkey_ok(struct ipath_qp *qp, struct ipath_sge_state *ss, + u32 len, u64 vaddr, u32 rkey, int acc) +{ + struct ipath_ibdev *dev = to_idev(qp->ibqp.device); + struct ipath_lkey_table *rkt = &dev->lk_table; + struct ipath_sge *sge = &ss->sge; + struct ipath_mregion *mr; + unsigned n, m; + size_t off; + int ret; + + /* + * We use RKEY == zero for kernel virtual addresses + * (see ipath_get_dma_mr and ipath_dma.c). 
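+ * In that case the SGE is built directly from vaddr and len with no MR lookup; user PDs are rejected since they must supply a real RKEY.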
+ */ + if (rkey == 0) { + /* always a kernel port, no locking needed */ + struct ipath_pd *pd = to_ipd(qp->ibqp.pd); + + if (pd->user) { + ret = 0; + goto bail; + } + sge->mr = NULL; + sge->vaddr = (void *) vaddr; + sge->length = len; + sge->sge_length = len; + ss->sg_list = NULL; + ss->num_sge = 1; + ret = 1; + goto bail; + } + + mr = rkt->table[(rkey >> (32 - ib_ipath_lkey_table_size))]; + if (unlikely(mr == NULL || mr->lkey != rkey || + qp->ibqp.pd != mr->pd)) { + ret = 0; + goto bail; + } + + off = vaddr - mr->iova; + if (unlikely(vaddr < mr->iova || off + len > mr->length || + (mr->access_flags & acc) == 0)) { + ret = 0; + goto bail; + } + + off += mr->offset; + m = 0; + n = 0; + while (off >= mr->map[m]->segs[n].length) { + off -= mr->map[m]->segs[n].length; + n++; + if (n >= IPATH_SEGSZ) { + m++; + n = 0; + } + } + sge->mr = mr; + sge->vaddr = mr->map[m]->segs[n].vaddr + off; + sge->length = mr->map[m]->segs[n].length - off; + sge->sge_length = len; + sge->m = m; + sge->n = n; + ss->sg_list = NULL; + ss->num_sge = 1; + + ret = 1; + +bail: + return ret; +} diff --git a/kernel/drivers/infiniband/hw/ipath/ipath_mad.c b/kernel/drivers/infiniband/hw/ipath/ipath_mad.c new file mode 100644 index 000000000..e890e5ba0 --- /dev/null +++ b/kernel/drivers/infiniband/hw/ipath/ipath_mad.c @@ -0,0 +1,1513 @@ +/* + * Copyright (c) 2006, 2007, 2008 QLogic Corporation. All rights reserved. + * Copyright (c) 2005, 2006 PathScale, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include +#include + +#include "ipath_kernel.h" +#include "ipath_verbs.h" +#include "ipath_common.h" + +#define IB_SMP_UNSUP_VERSION cpu_to_be16(0x0004) +#define IB_SMP_UNSUP_METHOD cpu_to_be16(0x0008) +#define IB_SMP_UNSUP_METH_ATTR cpu_to_be16(0x000C) +#define IB_SMP_INVALID_FIELD cpu_to_be16(0x001C) + +static int reply(struct ib_smp *smp) +{ + /* + * The verbs framework will handle the directed/LID route + * packet changes. 
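+ * reply() only rewrites the method to GetResp and, for directed route SMPs, sets the direction bit in the status field.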
+ */ + smp->method = IB_MGMT_METHOD_GET_RESP; + if (smp->mgmt_class == IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE) + smp->status |= IB_SMP_DIRECTION; + return IB_MAD_RESULT_SUCCESS | IB_MAD_RESULT_REPLY; +} + +static int recv_subn_get_nodedescription(struct ib_smp *smp, + struct ib_device *ibdev) +{ + if (smp->attr_mod) + smp->status |= IB_SMP_INVALID_FIELD; + + memcpy(smp->data, ibdev->node_desc, sizeof(smp->data)); + + return reply(smp); +} + +struct nodeinfo { + u8 base_version; + u8 class_version; + u8 node_type; + u8 num_ports; + __be64 sys_guid; + __be64 node_guid; + __be64 port_guid; + __be16 partition_cap; + __be16 device_id; + __be32 revision; + u8 local_port_num; + u8 vendor_id[3]; +} __attribute__ ((packed)); + +static int recv_subn_get_nodeinfo(struct ib_smp *smp, + struct ib_device *ibdev, u8 port) +{ + struct nodeinfo *nip = (struct nodeinfo *)&smp->data; + struct ipath_devdata *dd = to_idev(ibdev)->dd; + u32 vendor, majrev, minrev; + + /* GUID 0 is illegal */ + if (smp->attr_mod || (dd->ipath_guid == 0)) + smp->status |= IB_SMP_INVALID_FIELD; + + nip->base_version = 1; + nip->class_version = 1; + nip->node_type = 1; /* channel adapter */ + /* + * XXX The num_ports value will need a layer function to get + * the value if we ever have more than one IB port on a chip. + * We will also need to get the GUID for the port. + */ + nip->num_ports = ibdev->phys_port_cnt; + /* This is already in network order */ + nip->sys_guid = to_idev(ibdev)->sys_image_guid; + nip->node_guid = dd->ipath_guid; + nip->port_guid = dd->ipath_guid; + nip->partition_cap = cpu_to_be16(ipath_get_npkeys(dd)); + nip->device_id = cpu_to_be16(dd->ipath_deviceid); + majrev = dd->ipath_majrev; + minrev = dd->ipath_minrev; + nip->revision = cpu_to_be32((majrev << 16) | minrev); + nip->local_port_num = port; + vendor = dd->ipath_vendorid; + nip->vendor_id[0] = IPATH_SRC_OUI_1; + nip->vendor_id[1] = IPATH_SRC_OUI_2; + nip->vendor_id[2] = IPATH_SRC_OUI_3; + + return reply(smp); +} + +static int recv_subn_get_guidinfo(struct ib_smp *smp, + struct ib_device *ibdev) +{ + u32 startgx = 8 * be32_to_cpu(smp->attr_mod); + __be64 *p = (__be64 *) smp->data; + + /* 32 blocks of 8 64-bit GUIDs per block */ + + memset(smp->data, 0, sizeof(smp->data)); + + /* + * We only support one GUID for now. If this changes, the + * portinfo.guid_cap field needs to be updated too. + */ + if (startgx == 0) { + __be64 g = to_idev(ibdev)->dd->ipath_guid; + if (g == 0) + /* GUID 0 is illegal */ + smp->status |= IB_SMP_INVALID_FIELD; + else + /* The first is a copy of the read-only HW GUID. */ + *p = g; + } else + smp->status |= IB_SMP_INVALID_FIELD; + + return reply(smp); +} + +static void set_link_width_enabled(struct ipath_devdata *dd, u32 w) +{ + (void) dd->ipath_f_set_ib_cfg(dd, IPATH_IB_CFG_LWID_ENB, w); +} + +static void set_link_speed_enabled(struct ipath_devdata *dd, u32 s) +{ + (void) dd->ipath_f_set_ib_cfg(dd, IPATH_IB_CFG_SPD_ENB, s); +} + +static int get_overrunthreshold(struct ipath_devdata *dd) +{ + return (dd->ipath_ibcctrl >> + INFINIPATH_IBCC_OVERRUNTHRESHOLD_SHIFT) & + INFINIPATH_IBCC_OVERRUNTHRESHOLD_MASK; +} + +/** + * set_overrunthreshold - set the overrun threshold + * @dd: the infinipath device + * @n: the new threshold + * + * Note that this will only take effect when the link state changes. 
+ */ +static int set_overrunthreshold(struct ipath_devdata *dd, unsigned n) +{ + unsigned v; + + v = (dd->ipath_ibcctrl >> INFINIPATH_IBCC_OVERRUNTHRESHOLD_SHIFT) & + INFINIPATH_IBCC_OVERRUNTHRESHOLD_MASK; + if (v != n) { + dd->ipath_ibcctrl &= + ~(INFINIPATH_IBCC_OVERRUNTHRESHOLD_MASK << + INFINIPATH_IBCC_OVERRUNTHRESHOLD_SHIFT); + dd->ipath_ibcctrl |= + (u64) n << INFINIPATH_IBCC_OVERRUNTHRESHOLD_SHIFT; + ipath_write_kreg(dd, dd->ipath_kregs->kr_ibcctrl, + dd->ipath_ibcctrl); + } + return 0; +} + +static int get_phyerrthreshold(struct ipath_devdata *dd) +{ + return (dd->ipath_ibcctrl >> + INFINIPATH_IBCC_PHYERRTHRESHOLD_SHIFT) & + INFINIPATH_IBCC_PHYERRTHRESHOLD_MASK; +} + +/** + * set_phyerrthreshold - set the physical error threshold + * @dd: the infinipath device + * @n: the new threshold + * + * Note that this will only take effect when the link state changes. + */ +static int set_phyerrthreshold(struct ipath_devdata *dd, unsigned n) +{ + unsigned v; + + v = (dd->ipath_ibcctrl >> INFINIPATH_IBCC_PHYERRTHRESHOLD_SHIFT) & + INFINIPATH_IBCC_PHYERRTHRESHOLD_MASK; + if (v != n) { + dd->ipath_ibcctrl &= + ~(INFINIPATH_IBCC_PHYERRTHRESHOLD_MASK << + INFINIPATH_IBCC_PHYERRTHRESHOLD_SHIFT); + dd->ipath_ibcctrl |= + (u64) n << INFINIPATH_IBCC_PHYERRTHRESHOLD_SHIFT; + ipath_write_kreg(dd, dd->ipath_kregs->kr_ibcctrl, + dd->ipath_ibcctrl); + } + return 0; +} + +/** + * get_linkdowndefaultstate - get the default linkdown state + * @dd: the infinipath device + * + * Returns zero if the default is POLL, 1 if the default is SLEEP. + */ +static int get_linkdowndefaultstate(struct ipath_devdata *dd) +{ + return !!(dd->ipath_ibcctrl & INFINIPATH_IBCC_LINKDOWNDEFAULTSTATE); +} + +static int recv_subn_get_portinfo(struct ib_smp *smp, + struct ib_device *ibdev, u8 port) +{ + struct ipath_ibdev *dev; + struct ipath_devdata *dd; + struct ib_port_info *pip = (struct ib_port_info *)smp->data; + u16 lid; + u8 ibcstat; + u8 mtu; + int ret; + + if (be32_to_cpu(smp->attr_mod) > ibdev->phys_port_cnt) { + smp->status |= IB_SMP_INVALID_FIELD; + ret = reply(smp); + goto bail; + } + + dev = to_idev(ibdev); + dd = dev->dd; + + /* Clear all fields. Only set the non-zero fields. */ + memset(smp->data, 0, sizeof(smp->data)); + + /* Only return the mkey if the protection field allows it. */ + if (smp->method == IB_MGMT_METHOD_SET || dev->mkey == smp->mkey || + dev->mkeyprot == 0) + pip->mkey = dev->mkey; + pip->gid_prefix = dev->gid_prefix; + lid = dd->ipath_lid; + pip->lid = lid ? cpu_to_be16(lid) : IB_LID_PERMISSIVE; + pip->sm_lid = cpu_to_be16(dev->sm_lid); + pip->cap_mask = cpu_to_be32(dev->port_cap_flags); + /* pip->diag_code; */ + pip->mkey_lease_period = cpu_to_be16(dev->mkey_lease_period); + pip->local_port_num = port; + pip->link_width_enabled = dd->ipath_link_width_enabled; + pip->link_width_supported = dd->ipath_link_width_supported; + pip->link_width_active = dd->ipath_link_width_active; + pip->linkspeed_portstate = dd->ipath_link_speed_supported << 4; + ibcstat = dd->ipath_lastibcstat; + /* map LinkState to IB portinfo values. */ + pip->linkspeed_portstate |= ipath_ib_linkstate(dd, ibcstat) + 1; + + pip->portphysstate_linkdown = + (ipath_cvt_physportstate[ibcstat & dd->ibcs_lts_mask] << 4) | + (get_linkdowndefaultstate(dd) ? 
1 : 2); + pip->mkeyprot_resv_lmc = (dev->mkeyprot << 6) | dd->ipath_lmc; + pip->linkspeedactive_enabled = (dd->ipath_link_speed_active << 4) | + dd->ipath_link_speed_enabled; + switch (dd->ipath_ibmtu) { + case 4096: + mtu = IB_MTU_4096; + break; + case 2048: + mtu = IB_MTU_2048; + break; + case 1024: + mtu = IB_MTU_1024; + break; + case 512: + mtu = IB_MTU_512; + break; + case 256: + mtu = IB_MTU_256; + break; + default: /* oops, something is wrong */ + mtu = IB_MTU_2048; + break; + } + pip->neighbormtu_mastersmsl = (mtu << 4) | dev->sm_sl; + pip->vlcap_inittype = 0x10; /* VLCap = VL0, InitType = 0 */ + pip->vl_high_limit = dev->vl_high_limit; + /* pip->vl_arb_high_cap; // only one VL */ + /* pip->vl_arb_low_cap; // only one VL */ + /* InitTypeReply = 0 */ + /* our mtu cap depends on whether 4K MTU enabled or not */ + pip->inittypereply_mtucap = ipath_mtu4096 ? IB_MTU_4096 : IB_MTU_2048; + /* HCAs ignore VLStallCount and HOQLife */ + /* pip->vlstallcnt_hoqlife; */ + pip->operationalvl_pei_peo_fpi_fpo = 0x10; /* OVLs = 1 */ + pip->mkey_violations = cpu_to_be16(dev->mkey_violations); + /* P_KeyViolations are counted by hardware. */ + pip->pkey_violations = + cpu_to_be16((ipath_get_cr_errpkey(dd) - + dev->z_pkey_violations) & 0xFFFF); + pip->qkey_violations = cpu_to_be16(dev->qkey_violations); + /* Only the hardware GUID is supported for now */ + pip->guid_cap = 1; + pip->clientrereg_resv_subnetto = dev->subnet_timeout; + /* 32.768 usec. response time (guessing) */ + pip->resv_resptimevalue = 3; + pip->localphyerrors_overrunerrors = + (get_phyerrthreshold(dd) << 4) | + get_overrunthreshold(dd); + /* pip->max_credit_hint; */ + if (dev->port_cap_flags & IB_PORT_LINK_LATENCY_SUP) { + u32 v; + + v = dd->ipath_f_get_ib_cfg(dd, IPATH_IB_CFG_LINKLATENCY); + pip->link_roundtrip_latency[0] = v >> 16; + pip->link_roundtrip_latency[1] = v >> 8; + pip->link_roundtrip_latency[2] = v; + } + + ret = reply(smp); + +bail: + return ret; +} + +/** + * get_pkeys - return the PKEY table for port 0 + * @dd: the infinipath device + * @pkeys: the pkey table is placed here + */ +static int get_pkeys(struct ipath_devdata *dd, u16 * pkeys) +{ + /* always a kernel port, no locking needed */ + struct ipath_portdata *pd = dd->ipath_pd[0]; + + memcpy(pkeys, pd->port_pkeys, sizeof(pd->port_pkeys)); + + return 0; +} + +static int recv_subn_get_pkeytable(struct ib_smp *smp, + struct ib_device *ibdev) +{ + u32 startpx = 32 * (be32_to_cpu(smp->attr_mod) & 0xffff); + u16 *p = (u16 *) smp->data; + __be16 *q = (__be16 *) smp->data; + + /* 64 blocks of 32 16-bit P_Key entries */ + + memset(smp->data, 0, sizeof(smp->data)); + if (startpx == 0) { + struct ipath_ibdev *dev = to_idev(ibdev); + unsigned i, n = ipath_get_npkeys(dev->dd); + + get_pkeys(dev->dd, p); + + for (i = 0; i < n; i++) + q[i] = cpu_to_be16(p[i]); + } else + smp->status |= IB_SMP_INVALID_FIELD; + + return reply(smp); +} + +static int recv_subn_set_guidinfo(struct ib_smp *smp, + struct ib_device *ibdev) +{ + /* The only GUID we support is the first read-only entry. */ + return recv_subn_get_guidinfo(smp, ibdev); +} + +/** + * set_linkdowndefaultstate - set the default linkdown state + * @dd: the infinipath device + * @sleep: the new state + * + * Note that this will only take effect when the link state changes. 
+ */ +static int set_linkdowndefaultstate(struct ipath_devdata *dd, int sleep) +{ + if (sleep) + dd->ipath_ibcctrl |= INFINIPATH_IBCC_LINKDOWNDEFAULTSTATE; + else + dd->ipath_ibcctrl &= ~INFINIPATH_IBCC_LINKDOWNDEFAULTSTATE; + ipath_write_kreg(dd, dd->ipath_kregs->kr_ibcctrl, + dd->ipath_ibcctrl); + return 0; +} + +/** + * recv_subn_set_portinfo - set port information + * @smp: the incoming SM packet + * @ibdev: the infiniband device + * @port: the port on the device + * + * Set Portinfo (see ch. 14.2.5.6). + */ +static int recv_subn_set_portinfo(struct ib_smp *smp, + struct ib_device *ibdev, u8 port) +{ + struct ib_port_info *pip = (struct ib_port_info *)smp->data; + struct ib_event event; + struct ipath_ibdev *dev; + struct ipath_devdata *dd; + char clientrereg = 0; + u16 lid, smlid; + u8 lwe; + u8 lse; + u8 state; + u16 lstate; + u32 mtu; + int ret, ore; + + if (be32_to_cpu(smp->attr_mod) > ibdev->phys_port_cnt) + goto err; + + dev = to_idev(ibdev); + dd = dev->dd; + event.device = ibdev; + event.element.port_num = port; + + dev->mkey = pip->mkey; + dev->gid_prefix = pip->gid_prefix; + dev->mkey_lease_period = be16_to_cpu(pip->mkey_lease_period); + + lid = be16_to_cpu(pip->lid); + if (dd->ipath_lid != lid || + dd->ipath_lmc != (pip->mkeyprot_resv_lmc & 7)) { + /* Must be a valid unicast LID address. */ + if (lid == 0 || lid >= IPATH_MULTICAST_LID_BASE) + goto err; + ipath_set_lid(dd, lid, pip->mkeyprot_resv_lmc & 7); + event.event = IB_EVENT_LID_CHANGE; + ib_dispatch_event(&event); + } + + smlid = be16_to_cpu(pip->sm_lid); + if (smlid != dev->sm_lid) { + /* Must be a valid unicast LID address. */ + if (smlid == 0 || smlid >= IPATH_MULTICAST_LID_BASE) + goto err; + dev->sm_lid = smlid; + event.event = IB_EVENT_SM_CHANGE; + ib_dispatch_event(&event); + } + + /* Allow 1x or 4x to be set (see 14.2.6.6). */ + lwe = pip->link_width_enabled; + if (lwe) { + if (lwe == 0xFF) + lwe = dd->ipath_link_width_supported; + else if (lwe >= 16 || (lwe & ~dd->ipath_link_width_supported)) + goto err; + set_link_width_enabled(dd, lwe); + } + + /* Allow 2.5 or 5.0 Gbs. */ + lse = pip->linkspeedactive_enabled & 0xF; + if (lse) { + if (lse == 15) + lse = dd->ipath_link_speed_supported; + else if (lse >= 8 || (lse & ~dd->ipath_link_speed_supported)) + goto err; + set_link_speed_enabled(dd, lse); + } + + /* Set link down default state. */ + switch (pip->portphysstate_linkdown & 0xF) { + case 0: /* NOP */ + break; + case 1: /* SLEEP */ + if (set_linkdowndefaultstate(dd, 1)) + goto err; + break; + case 2: /* POLL */ + if (set_linkdowndefaultstate(dd, 0)) + goto err; + break; + default: + goto err; + } + + dev->mkeyprot = pip->mkeyprot_resv_lmc >> 6; + dev->vl_high_limit = pip->vl_high_limit; + + switch ((pip->neighbormtu_mastersmsl >> 4) & 0xF) { + case IB_MTU_256: + mtu = 256; + break; + case IB_MTU_512: + mtu = 512; + break; + case IB_MTU_1024: + mtu = 1024; + break; + case IB_MTU_2048: + mtu = 2048; + break; + case IB_MTU_4096: + if (!ipath_mtu4096) + goto err; + mtu = 4096; + break; + default: + /* XXX We have already partially updated our state! */ + goto err; + } + ipath_set_mtu(dd, mtu); + + dev->sm_sl = pip->neighbormtu_mastersmsl & 0xF; + + /* We only support VL0 */ + if (((pip->operationalvl_pei_peo_fpi_fpo >> 4) & 0xF) > 1) + goto err; + + if (pip->mkey_violations == 0) + dev->mkey_violations = 0; + + /* + * Hardware counter can't be reset so snapshot and subtract + * later. 
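+ * The snapshot is kept in dev->z_pkey_violations and subtracted when PortInfo reports the P_KeyViolations counter.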
+ */ + if (pip->pkey_violations == 0) + dev->z_pkey_violations = ipath_get_cr_errpkey(dd); + + if (pip->qkey_violations == 0) + dev->qkey_violations = 0; + + ore = pip->localphyerrors_overrunerrors; + if (set_phyerrthreshold(dd, (ore >> 4) & 0xF)) + goto err; + + if (set_overrunthreshold(dd, (ore & 0xF))) + goto err; + + dev->subnet_timeout = pip->clientrereg_resv_subnetto & 0x1F; + + if (pip->clientrereg_resv_subnetto & 0x80) { + clientrereg = 1; + event.event = IB_EVENT_CLIENT_REREGISTER; + ib_dispatch_event(&event); + } + + /* + * Do the port state change now that the other link parameters + * have been set. + * Changing the port physical state only makes sense if the link + * is down or is being set to down. + */ + state = pip->linkspeed_portstate & 0xF; + lstate = (pip->portphysstate_linkdown >> 4) & 0xF; + if (lstate && !(state == IB_PORT_DOWN || state == IB_PORT_NOP)) + goto err; + + /* + * Only state changes of DOWN, ARM, and ACTIVE are valid + * and must be in the correct state to take effect (see 7.2.6). + */ + switch (state) { + case IB_PORT_NOP: + if (lstate == 0) + break; + /* FALLTHROUGH */ + case IB_PORT_DOWN: + if (lstate == 0) + lstate = IPATH_IB_LINKDOWN_ONLY; + else if (lstate == 1) + lstate = IPATH_IB_LINKDOWN_SLEEP; + else if (lstate == 2) + lstate = IPATH_IB_LINKDOWN; + else if (lstate == 3) + lstate = IPATH_IB_LINKDOWN_DISABLE; + else + goto err; + ipath_set_linkstate(dd, lstate); + if (lstate == IPATH_IB_LINKDOWN_DISABLE) { + ret = IB_MAD_RESULT_SUCCESS | IB_MAD_RESULT_CONSUMED; + goto done; + } + ipath_wait_linkstate(dd, IPATH_LINKINIT | IPATH_LINKARMED | + IPATH_LINKACTIVE, 1000); + break; + case IB_PORT_ARMED: + ipath_set_linkstate(dd, IPATH_IB_LINKARM); + break; + case IB_PORT_ACTIVE: + ipath_set_linkstate(dd, IPATH_IB_LINKACTIVE); + break; + default: + /* XXX We have already partially updated our state! */ + goto err; + } + + ret = recv_subn_get_portinfo(smp, ibdev, port); + + if (clientrereg) + pip->clientrereg_resv_subnetto |= 0x80; + + goto done; + +err: + smp->status |= IB_SMP_INVALID_FIELD; + ret = recv_subn_get_portinfo(smp, ibdev, port); + +done: + return ret; +} + +/** + * rm_pkey - decrecment the reference count for the given PKEY + * @dd: the infinipath device + * @key: the PKEY index + * + * Return true if this was the last reference and the hardware table entry + * needs to be changed. + */ +static int rm_pkey(struct ipath_devdata *dd, u16 key) +{ + int i; + int ret; + + for (i = 0; i < ARRAY_SIZE(dd->ipath_pkeys); i++) { + if (dd->ipath_pkeys[i] != key) + continue; + if (atomic_dec_and_test(&dd->ipath_pkeyrefs[i])) { + dd->ipath_pkeys[i] = 0; + ret = 1; + goto bail; + } + break; + } + + ret = 0; + +bail: + return ret; +} + +/** + * add_pkey - add the given PKEY to the hardware table + * @dd: the infinipath device + * @key: the PKEY + * + * Return an error code if unable to add the entry, zero if no change, + * or 1 if the hardware PKEY register needs to be updated. + */ +static int add_pkey(struct ipath_devdata *dd, u16 key) +{ + int i; + u16 lkey = key & 0x7FFF; + int any = 0; + int ret; + + if (lkey == 0x7FFF) { + ret = 0; + goto bail; + } + + /* Look for an empty slot or a matching PKEY. */ + for (i = 0; i < ARRAY_SIZE(dd->ipath_pkeys); i++) { + if (!dd->ipath_pkeys[i]) { + any++; + continue; + } + /* If it matches exactly, try to increment the ref count */ + if (dd->ipath_pkeys[i] == key) { + if (atomic_inc_return(&dd->ipath_pkeyrefs[i]) > 1) { + ret = 0; + goto bail; + } + /* Lost the race. Look for an empty slot below. 
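+ * (the refcount was zero, so the entry is being freed; undo the increment and treat this slot as empty)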
*/ + atomic_dec(&dd->ipath_pkeyrefs[i]); + any++; + } + /* + * It makes no sense to have both the limited and unlimited + * PKEY set at the same time since the unlimited one will + * disable the limited one. + */ + if ((dd->ipath_pkeys[i] & 0x7FFF) == lkey) { + ret = -EEXIST; + goto bail; + } + } + if (!any) { + ret = -EBUSY; + goto bail; + } + for (i = 0; i < ARRAY_SIZE(dd->ipath_pkeys); i++) { + if (!dd->ipath_pkeys[i] && + atomic_inc_return(&dd->ipath_pkeyrefs[i]) == 1) { + /* for ipathstats, etc. */ + ipath_stats.sps_pkeys[i] = lkey; + dd->ipath_pkeys[i] = key; + ret = 1; + goto bail; + } + } + ret = -EBUSY; + +bail: + return ret; +} + +/** + * set_pkeys - set the PKEY table for port 0 + * @dd: the infinipath device + * @pkeys: the PKEY table + */ +static int set_pkeys(struct ipath_devdata *dd, u16 *pkeys, u8 port) +{ + struct ipath_portdata *pd; + int i; + int changed = 0; + + /* always a kernel port, no locking needed */ + pd = dd->ipath_pd[0]; + + for (i = 0; i < ARRAY_SIZE(pd->port_pkeys); i++) { + u16 key = pkeys[i]; + u16 okey = pd->port_pkeys[i]; + + if (key == okey) + continue; + /* + * The value of this PKEY table entry is changing. + * Remove the old entry in the hardware's array of PKEYs. + */ + if (okey & 0x7FFF) + changed |= rm_pkey(dd, okey); + if (key & 0x7FFF) { + int ret = add_pkey(dd, key); + + if (ret < 0) + key = 0; + else + changed |= ret; + } + pd->port_pkeys[i] = key; + } + if (changed) { + u64 pkey; + struct ib_event event; + + pkey = (u64) dd->ipath_pkeys[0] | + ((u64) dd->ipath_pkeys[1] << 16) | + ((u64) dd->ipath_pkeys[2] << 32) | + ((u64) dd->ipath_pkeys[3] << 48); + ipath_cdbg(VERBOSE, "p0 new pkey reg %llx\n", + (unsigned long long) pkey); + ipath_write_kreg(dd, dd->ipath_kregs->kr_partitionkey, + pkey); + + event.event = IB_EVENT_PKEY_CHANGE; + event.device = &dd->verbs_dev->ibdev; + event.element.port_num = port; + ib_dispatch_event(&event); + } + return 0; +} + +static int recv_subn_set_pkeytable(struct ib_smp *smp, + struct ib_device *ibdev, u8 port) +{ + u32 startpx = 32 * (be32_to_cpu(smp->attr_mod) & 0xffff); + __be16 *p = (__be16 *) smp->data; + u16 *q = (u16 *) smp->data; + struct ipath_ibdev *dev = to_idev(ibdev); + unsigned i, n = ipath_get_npkeys(dev->dd); + + for (i = 0; i < n; i++) + q[i] = be16_to_cpu(p[i]); + + if (startpx != 0 || set_pkeys(dev->dd, q, port) != 0) + smp->status |= IB_SMP_INVALID_FIELD; + + return recv_subn_get_pkeytable(smp, ibdev); +} + +static int recv_pma_get_classportinfo(struct ib_pma_mad *pmp) +{ + struct ib_class_port_info *p = + (struct ib_class_port_info *)pmp->data; + + memset(pmp->data, 0, sizeof(pmp->data)); + + if (pmp->mad_hdr.attr_mod != 0) + pmp->mad_hdr.status |= IB_SMP_INVALID_FIELD; + + /* Indicate AllPortSelect is valid (only one port anyway) */ + p->capability_mask = cpu_to_be16(1 << 8); + p->base_version = 1; + p->class_version = 1; + /* + * Expected response time is 4.096 usec. * 2^18 == 1.073741824 + * sec. + */ + p->resp_time_value = 18; + + return reply((struct ib_smp *) pmp); +} + +/* + * The PortSamplesControl.CounterMasks field is an array of 3 bit fields + * which specify the N'th counter's capabilities. See ch. 16.1.3.2. + * We support 5 counters which only count the mandatory quantities. 
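+ * For example, COUNTER_MASK(1, 0) evaluates to 1 << 27 and COUNTER_MASK(1, 4) to 1 << 15, so COUNTER_MASK0_9 flags exactly the five mandatory counters.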
+ */ +#define COUNTER_MASK(q, n) (q << ((9 - n) * 3)) +#define COUNTER_MASK0_9 cpu_to_be32(COUNTER_MASK(1, 0) | \ + COUNTER_MASK(1, 1) | \ + COUNTER_MASK(1, 2) | \ + COUNTER_MASK(1, 3) | \ + COUNTER_MASK(1, 4)) + +static int recv_pma_get_portsamplescontrol(struct ib_pma_mad *pmp, + struct ib_device *ibdev, u8 port) +{ + struct ib_pma_portsamplescontrol *p = + (struct ib_pma_portsamplescontrol *)pmp->data; + struct ipath_ibdev *dev = to_idev(ibdev); + struct ipath_cregs const *crp = dev->dd->ipath_cregs; + unsigned long flags; + u8 port_select = p->port_select; + + memset(pmp->data, 0, sizeof(pmp->data)); + + p->port_select = port_select; + if (pmp->mad_hdr.attr_mod != 0 || + (port_select != port && port_select != 0xFF)) + pmp->mad_hdr.status |= IB_SMP_INVALID_FIELD; + /* + * Ticks are 10x the link transfer period which for 2.5Gbs is 4 + * nsec. 0 == 4 nsec., 1 == 8 nsec., ..., 255 == 1020 nsec. Sample + * intervals are counted in ticks. Since we use Linux timers, that + * count in jiffies, we can't sample for less than 1000 ticks if HZ + * == 1000 (4000 ticks if HZ is 250). link_speed_active returns 2 for + * DDR, 1 for SDR, set the tick to 1 for DDR, 0 for SDR on chips that + * have hardware support for delaying packets. + */ + if (crp->cr_psstat) + p->tick = dev->dd->ipath_link_speed_active - 1; + else + p->tick = 250; /* 1 usec. */ + p->counter_width = 4; /* 32 bit counters */ + p->counter_mask0_9 = COUNTER_MASK0_9; + spin_lock_irqsave(&dev->pending_lock, flags); + if (crp->cr_psstat) + p->sample_status = ipath_read_creg32(dev->dd, crp->cr_psstat); + else + p->sample_status = dev->pma_sample_status; + p->sample_start = cpu_to_be32(dev->pma_sample_start); + p->sample_interval = cpu_to_be32(dev->pma_sample_interval); + p->tag = cpu_to_be16(dev->pma_tag); + p->counter_select[0] = dev->pma_counter_select[0]; + p->counter_select[1] = dev->pma_counter_select[1]; + p->counter_select[2] = dev->pma_counter_select[2]; + p->counter_select[3] = dev->pma_counter_select[3]; + p->counter_select[4] = dev->pma_counter_select[4]; + spin_unlock_irqrestore(&dev->pending_lock, flags); + + return reply((struct ib_smp *) pmp); +} + +static int recv_pma_set_portsamplescontrol(struct ib_pma_mad *pmp, + struct ib_device *ibdev, u8 port) +{ + struct ib_pma_portsamplescontrol *p = + (struct ib_pma_portsamplescontrol *)pmp->data; + struct ipath_ibdev *dev = to_idev(ibdev); + struct ipath_cregs const *crp = dev->dd->ipath_cregs; + unsigned long flags; + u8 status; + int ret; + + if (pmp->mad_hdr.attr_mod != 0 || + (p->port_select != port && p->port_select != 0xFF)) { + pmp->mad_hdr.status |= IB_SMP_INVALID_FIELD; + ret = reply((struct ib_smp *) pmp); + goto bail; + } + + spin_lock_irqsave(&dev->pending_lock, flags); + if (crp->cr_psstat) + status = ipath_read_creg32(dev->dd, crp->cr_psstat); + else + status = dev->pma_sample_status; + if (status == IB_PMA_SAMPLE_STATUS_DONE) { + dev->pma_sample_start = be32_to_cpu(p->sample_start); + dev->pma_sample_interval = be32_to_cpu(p->sample_interval); + dev->pma_tag = be16_to_cpu(p->tag); + dev->pma_counter_select[0] = p->counter_select[0]; + dev->pma_counter_select[1] = p->counter_select[1]; + dev->pma_counter_select[2] = p->counter_select[2]; + dev->pma_counter_select[3] = p->counter_select[3]; + dev->pma_counter_select[4] = p->counter_select[4]; + if (crp->cr_psstat) { + ipath_write_creg(dev->dd, crp->cr_psinterval, + dev->pma_sample_interval); + ipath_write_creg(dev->dd, crp->cr_psstart, + dev->pma_sample_start); + } else + dev->pma_sample_status = 
IB_PMA_SAMPLE_STATUS_STARTED; + } + spin_unlock_irqrestore(&dev->pending_lock, flags); + + ret = recv_pma_get_portsamplescontrol(pmp, ibdev, port); + +bail: + return ret; +} + +static u64 get_counter(struct ipath_ibdev *dev, + struct ipath_cregs const *crp, + __be16 sel) +{ + u64 ret; + + switch (sel) { + case IB_PMA_PORT_XMIT_DATA: + ret = (crp->cr_psxmitdatacount) ? + ipath_read_creg32(dev->dd, crp->cr_psxmitdatacount) : + dev->ipath_sword; + break; + case IB_PMA_PORT_RCV_DATA: + ret = (crp->cr_psrcvdatacount) ? + ipath_read_creg32(dev->dd, crp->cr_psrcvdatacount) : + dev->ipath_rword; + break; + case IB_PMA_PORT_XMIT_PKTS: + ret = (crp->cr_psxmitpktscount) ? + ipath_read_creg32(dev->dd, crp->cr_psxmitpktscount) : + dev->ipath_spkts; + break; + case IB_PMA_PORT_RCV_PKTS: + ret = (crp->cr_psrcvpktscount) ? + ipath_read_creg32(dev->dd, crp->cr_psrcvpktscount) : + dev->ipath_rpkts; + break; + case IB_PMA_PORT_XMIT_WAIT: + ret = (crp->cr_psxmitwaitcount) ? + ipath_read_creg32(dev->dd, crp->cr_psxmitwaitcount) : + dev->ipath_xmit_wait; + break; + default: + ret = 0; + } + + return ret; +} + +static int recv_pma_get_portsamplesresult(struct ib_pma_mad *pmp, + struct ib_device *ibdev) +{ + struct ib_pma_portsamplesresult *p = + (struct ib_pma_portsamplesresult *)pmp->data; + struct ipath_ibdev *dev = to_idev(ibdev); + struct ipath_cregs const *crp = dev->dd->ipath_cregs; + u8 status; + int i; + + memset(pmp->data, 0, sizeof(pmp->data)); + p->tag = cpu_to_be16(dev->pma_tag); + if (crp->cr_psstat) + status = ipath_read_creg32(dev->dd, crp->cr_psstat); + else + status = dev->pma_sample_status; + p->sample_status = cpu_to_be16(status); + for (i = 0; i < ARRAY_SIZE(dev->pma_counter_select); i++) + p->counter[i] = (status != IB_PMA_SAMPLE_STATUS_DONE) ? 0 : + cpu_to_be32( + get_counter(dev, crp, dev->pma_counter_select[i])); + + return reply((struct ib_smp *) pmp); +} + +static int recv_pma_get_portsamplesresult_ext(struct ib_pma_mad *pmp, + struct ib_device *ibdev) +{ + struct ib_pma_portsamplesresult_ext *p = + (struct ib_pma_portsamplesresult_ext *)pmp->data; + struct ipath_ibdev *dev = to_idev(ibdev); + struct ipath_cregs const *crp = dev->dd->ipath_cregs; + u8 status; + int i; + + memset(pmp->data, 0, sizeof(pmp->data)); + p->tag = cpu_to_be16(dev->pma_tag); + if (crp->cr_psstat) + status = ipath_read_creg32(dev->dd, crp->cr_psstat); + else + status = dev->pma_sample_status; + p->sample_status = cpu_to_be16(status); + /* 64 bits */ + p->extended_width = cpu_to_be32(0x80000000); + for (i = 0; i < ARRAY_SIZE(dev->pma_counter_select); i++) + p->counter[i] = (status != IB_PMA_SAMPLE_STATUS_DONE) ? 0 : + cpu_to_be64( + get_counter(dev, crp, dev->pma_counter_select[i])); + + return reply((struct ib_smp *) pmp); +} + +static int recv_pma_get_portcounters(struct ib_pma_mad *pmp, + struct ib_device *ibdev, u8 port) +{ + struct ib_pma_portcounters *p = (struct ib_pma_portcounters *) + pmp->data; + struct ipath_ibdev *dev = to_idev(ibdev); + struct ipath_verbs_counters cntrs; + u8 port_select = p->port_select; + + ipath_get_counters(dev->dd, &cntrs); + + /* Adjust counters for any resets done. 
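+ * The z_* baselines are refreshed by recv_pma_set_portcounters() whenever the SM clears the corresponding counter.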
*/ + cntrs.symbol_error_counter -= dev->z_symbol_error_counter; + cntrs.link_error_recovery_counter -= + dev->z_link_error_recovery_counter; + cntrs.link_downed_counter -= dev->z_link_downed_counter; + cntrs.port_rcv_errors += dev->rcv_errors; + cntrs.port_rcv_errors -= dev->z_port_rcv_errors; + cntrs.port_rcv_remphys_errors -= dev->z_port_rcv_remphys_errors; + cntrs.port_xmit_discards -= dev->z_port_xmit_discards; + cntrs.port_xmit_data -= dev->z_port_xmit_data; + cntrs.port_rcv_data -= dev->z_port_rcv_data; + cntrs.port_xmit_packets -= dev->z_port_xmit_packets; + cntrs.port_rcv_packets -= dev->z_port_rcv_packets; + cntrs.local_link_integrity_errors -= + dev->z_local_link_integrity_errors; + cntrs.excessive_buffer_overrun_errors -= + dev->z_excessive_buffer_overrun_errors; + cntrs.vl15_dropped -= dev->z_vl15_dropped; + cntrs.vl15_dropped += dev->n_vl15_dropped; + + memset(pmp->data, 0, sizeof(pmp->data)); + + p->port_select = port_select; + if (pmp->mad_hdr.attr_mod != 0 || + (port_select != port && port_select != 0xFF)) + pmp->mad_hdr.status |= IB_SMP_INVALID_FIELD; + + if (cntrs.symbol_error_counter > 0xFFFFUL) + p->symbol_error_counter = cpu_to_be16(0xFFFF); + else + p->symbol_error_counter = + cpu_to_be16((u16)cntrs.symbol_error_counter); + if (cntrs.link_error_recovery_counter > 0xFFUL) + p->link_error_recovery_counter = 0xFF; + else + p->link_error_recovery_counter = + (u8)cntrs.link_error_recovery_counter; + if (cntrs.link_downed_counter > 0xFFUL) + p->link_downed_counter = 0xFF; + else + p->link_downed_counter = (u8)cntrs.link_downed_counter; + if (cntrs.port_rcv_errors > 0xFFFFUL) + p->port_rcv_errors = cpu_to_be16(0xFFFF); + else + p->port_rcv_errors = + cpu_to_be16((u16) cntrs.port_rcv_errors); + if (cntrs.port_rcv_remphys_errors > 0xFFFFUL) + p->port_rcv_remphys_errors = cpu_to_be16(0xFFFF); + else + p->port_rcv_remphys_errors = + cpu_to_be16((u16)cntrs.port_rcv_remphys_errors); + if (cntrs.port_xmit_discards > 0xFFFFUL) + p->port_xmit_discards = cpu_to_be16(0xFFFF); + else + p->port_xmit_discards = + cpu_to_be16((u16)cntrs.port_xmit_discards); + if (cntrs.local_link_integrity_errors > 0xFUL) + cntrs.local_link_integrity_errors = 0xFUL; + if (cntrs.excessive_buffer_overrun_errors > 0xFUL) + cntrs.excessive_buffer_overrun_errors = 0xFUL; + p->link_overrun_errors = (cntrs.local_link_integrity_errors << 4) | + cntrs.excessive_buffer_overrun_errors; + if (cntrs.vl15_dropped > 0xFFFFUL) + p->vl15_dropped = cpu_to_be16(0xFFFF); + else + p->vl15_dropped = cpu_to_be16((u16)cntrs.vl15_dropped); + if (cntrs.port_xmit_data > 0xFFFFFFFFUL) + p->port_xmit_data = cpu_to_be32(0xFFFFFFFF); + else + p->port_xmit_data = cpu_to_be32((u32)cntrs.port_xmit_data); + if (cntrs.port_rcv_data > 0xFFFFFFFFUL) + p->port_rcv_data = cpu_to_be32(0xFFFFFFFF); + else + p->port_rcv_data = cpu_to_be32((u32)cntrs.port_rcv_data); + if (cntrs.port_xmit_packets > 0xFFFFFFFFUL) + p->port_xmit_packets = cpu_to_be32(0xFFFFFFFF); + else + p->port_xmit_packets = + cpu_to_be32((u32)cntrs.port_xmit_packets); + if (cntrs.port_rcv_packets > 0xFFFFFFFFUL) + p->port_rcv_packets = cpu_to_be32(0xFFFFFFFF); + else + p->port_rcv_packets = + cpu_to_be32((u32) cntrs.port_rcv_packets); + + return reply((struct ib_smp *) pmp); +} + +static int recv_pma_get_portcounters_ext(struct ib_pma_mad *pmp, + struct ib_device *ibdev, u8 port) +{ + struct ib_pma_portcounters_ext *p = + (struct ib_pma_portcounters_ext *)pmp->data; + struct ipath_ibdev *dev = to_idev(ibdev); + u64 swords, rwords, spkts, rpkts, xwait; + u8 port_select = 
p->port_select; + + ipath_snapshot_counters(dev->dd, &swords, &rwords, &spkts, + &rpkts, &xwait); + + /* Adjust counters for any resets done. */ + swords -= dev->z_port_xmit_data; + rwords -= dev->z_port_rcv_data; + spkts -= dev->z_port_xmit_packets; + rpkts -= dev->z_port_rcv_packets; + + memset(pmp->data, 0, sizeof(pmp->data)); + + p->port_select = port_select; + if (pmp->mad_hdr.attr_mod != 0 || + (port_select != port && port_select != 0xFF)) + pmp->mad_hdr.status |= IB_SMP_INVALID_FIELD; + + p->port_xmit_data = cpu_to_be64(swords); + p->port_rcv_data = cpu_to_be64(rwords); + p->port_xmit_packets = cpu_to_be64(spkts); + p->port_rcv_packets = cpu_to_be64(rpkts); + p->port_unicast_xmit_packets = cpu_to_be64(dev->n_unicast_xmit); + p->port_unicast_rcv_packets = cpu_to_be64(dev->n_unicast_rcv); + p->port_multicast_xmit_packets = cpu_to_be64(dev->n_multicast_xmit); + p->port_multicast_rcv_packets = cpu_to_be64(dev->n_multicast_rcv); + + return reply((struct ib_smp *) pmp); +} + +static int recv_pma_set_portcounters(struct ib_pma_mad *pmp, + struct ib_device *ibdev, u8 port) +{ + struct ib_pma_portcounters *p = (struct ib_pma_portcounters *) + pmp->data; + struct ipath_ibdev *dev = to_idev(ibdev); + struct ipath_verbs_counters cntrs; + + /* + * Since the HW doesn't support clearing counters, we save the + * current count and subtract it from future responses. + */ + ipath_get_counters(dev->dd, &cntrs); + + if (p->counter_select & IB_PMA_SEL_SYMBOL_ERROR) + dev->z_symbol_error_counter = cntrs.symbol_error_counter; + + if (p->counter_select & IB_PMA_SEL_LINK_ERROR_RECOVERY) + dev->z_link_error_recovery_counter = + cntrs.link_error_recovery_counter; + + if (p->counter_select & IB_PMA_SEL_LINK_DOWNED) + dev->z_link_downed_counter = cntrs.link_downed_counter; + + if (p->counter_select & IB_PMA_SEL_PORT_RCV_ERRORS) + dev->z_port_rcv_errors = + cntrs.port_rcv_errors + dev->rcv_errors; + + if (p->counter_select & IB_PMA_SEL_PORT_RCV_REMPHYS_ERRORS) + dev->z_port_rcv_remphys_errors = + cntrs.port_rcv_remphys_errors; + + if (p->counter_select & IB_PMA_SEL_PORT_XMIT_DISCARDS) + dev->z_port_xmit_discards = cntrs.port_xmit_discards; + + if (p->counter_select & IB_PMA_SEL_LOCAL_LINK_INTEGRITY_ERRORS) + dev->z_local_link_integrity_errors = + cntrs.local_link_integrity_errors; + + if (p->counter_select & IB_PMA_SEL_EXCESSIVE_BUFFER_OVERRUNS) + dev->z_excessive_buffer_overrun_errors = + cntrs.excessive_buffer_overrun_errors; + + if (p->counter_select & IB_PMA_SEL_PORT_VL15_DROPPED) { + dev->n_vl15_dropped = 0; + dev->z_vl15_dropped = cntrs.vl15_dropped; + } + + if (p->counter_select & IB_PMA_SEL_PORT_XMIT_DATA) + dev->z_port_xmit_data = cntrs.port_xmit_data; + + if (p->counter_select & IB_PMA_SEL_PORT_RCV_DATA) + dev->z_port_rcv_data = cntrs.port_rcv_data; + + if (p->counter_select & IB_PMA_SEL_PORT_XMIT_PACKETS) + dev->z_port_xmit_packets = cntrs.port_xmit_packets; + + if (p->counter_select & IB_PMA_SEL_PORT_RCV_PACKETS) + dev->z_port_rcv_packets = cntrs.port_rcv_packets; + + return recv_pma_get_portcounters(pmp, ibdev, port); +} + +static int recv_pma_set_portcounters_ext(struct ib_pma_mad *pmp, + struct ib_device *ibdev, u8 port) +{ + struct ib_pma_portcounters *p = (struct ib_pma_portcounters *) + pmp->data; + struct ipath_ibdev *dev = to_idev(ibdev); + u64 swords, rwords, spkts, rpkts, xwait; + + ipath_snapshot_counters(dev->dd, &swords, &rwords, &spkts, + &rpkts, &xwait); + + if (p->counter_select & IB_PMA_SELX_PORT_XMIT_DATA) + dev->z_port_xmit_data = swords; + + if (p->counter_select & 
IB_PMA_SELX_PORT_RCV_DATA) + dev->z_port_rcv_data = rwords; + + if (p->counter_select & IB_PMA_SELX_PORT_XMIT_PACKETS) + dev->z_port_xmit_packets = spkts; + + if (p->counter_select & IB_PMA_SELX_PORT_RCV_PACKETS) + dev->z_port_rcv_packets = rpkts; + + if (p->counter_select & IB_PMA_SELX_PORT_UNI_XMIT_PACKETS) + dev->n_unicast_xmit = 0; + + if (p->counter_select & IB_PMA_SELX_PORT_UNI_RCV_PACKETS) + dev->n_unicast_rcv = 0; + + if (p->counter_select & IB_PMA_SELX_PORT_MULTI_XMIT_PACKETS) + dev->n_multicast_xmit = 0; + + if (p->counter_select & IB_PMA_SELX_PORT_MULTI_RCV_PACKETS) + dev->n_multicast_rcv = 0; + + return recv_pma_get_portcounters_ext(pmp, ibdev, port); +} + +static int process_subn(struct ib_device *ibdev, int mad_flags, + u8 port_num, struct ib_mad *in_mad, + struct ib_mad *out_mad) +{ + struct ib_smp *smp = (struct ib_smp *)out_mad; + struct ipath_ibdev *dev = to_idev(ibdev); + int ret; + + *out_mad = *in_mad; + if (smp->class_version != 1) { + smp->status |= IB_SMP_UNSUP_VERSION; + ret = reply(smp); + goto bail; + } + + /* Is the mkey in the process of expiring? */ + if (dev->mkey_lease_timeout && + time_after_eq(jiffies, dev->mkey_lease_timeout)) { + /* Clear timeout and mkey protection field. */ + dev->mkey_lease_timeout = 0; + dev->mkeyprot = 0; + } + + /* + * M_Key checking depends on + * Portinfo:M_Key_protect_bits + */ + if ((mad_flags & IB_MAD_IGNORE_MKEY) == 0 && dev->mkey != 0 && + dev->mkey != smp->mkey && + (smp->method == IB_MGMT_METHOD_SET || + (smp->method == IB_MGMT_METHOD_GET && + dev->mkeyprot >= 2))) { + if (dev->mkey_violations != 0xFFFF) + ++dev->mkey_violations; + if (dev->mkey_lease_timeout || + dev->mkey_lease_period == 0) { + ret = IB_MAD_RESULT_SUCCESS | + IB_MAD_RESULT_CONSUMED; + goto bail; + } + dev->mkey_lease_timeout = jiffies + + dev->mkey_lease_period * HZ; + /* Future: Generate a trap notice. 
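+ * For now the MAD is simply consumed, so no response goes back to the sender.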
*/ + ret = IB_MAD_RESULT_SUCCESS | IB_MAD_RESULT_CONSUMED; + goto bail; + } else if (dev->mkey_lease_timeout) + dev->mkey_lease_timeout = 0; + + switch (smp->method) { + case IB_MGMT_METHOD_GET: + switch (smp->attr_id) { + case IB_SMP_ATTR_NODE_DESC: + ret = recv_subn_get_nodedescription(smp, ibdev); + goto bail; + case IB_SMP_ATTR_NODE_INFO: + ret = recv_subn_get_nodeinfo(smp, ibdev, port_num); + goto bail; + case IB_SMP_ATTR_GUID_INFO: + ret = recv_subn_get_guidinfo(smp, ibdev); + goto bail; + case IB_SMP_ATTR_PORT_INFO: + ret = recv_subn_get_portinfo(smp, ibdev, port_num); + goto bail; + case IB_SMP_ATTR_PKEY_TABLE: + ret = recv_subn_get_pkeytable(smp, ibdev); + goto bail; + case IB_SMP_ATTR_SM_INFO: + if (dev->port_cap_flags & IB_PORT_SM_DISABLED) { + ret = IB_MAD_RESULT_SUCCESS | + IB_MAD_RESULT_CONSUMED; + goto bail; + } + if (dev->port_cap_flags & IB_PORT_SM) { + ret = IB_MAD_RESULT_SUCCESS; + goto bail; + } + /* FALLTHROUGH */ + default: + smp->status |= IB_SMP_UNSUP_METH_ATTR; + ret = reply(smp); + goto bail; + } + + case IB_MGMT_METHOD_SET: + switch (smp->attr_id) { + case IB_SMP_ATTR_GUID_INFO: + ret = recv_subn_set_guidinfo(smp, ibdev); + goto bail; + case IB_SMP_ATTR_PORT_INFO: + ret = recv_subn_set_portinfo(smp, ibdev, port_num); + goto bail; + case IB_SMP_ATTR_PKEY_TABLE: + ret = recv_subn_set_pkeytable(smp, ibdev, port_num); + goto bail; + case IB_SMP_ATTR_SM_INFO: + if (dev->port_cap_flags & IB_PORT_SM_DISABLED) { + ret = IB_MAD_RESULT_SUCCESS | + IB_MAD_RESULT_CONSUMED; + goto bail; + } + if (dev->port_cap_flags & IB_PORT_SM) { + ret = IB_MAD_RESULT_SUCCESS; + goto bail; + } + /* FALLTHROUGH */ + default: + smp->status |= IB_SMP_UNSUP_METH_ATTR; + ret = reply(smp); + goto bail; + } + + case IB_MGMT_METHOD_TRAP: + case IB_MGMT_METHOD_REPORT: + case IB_MGMT_METHOD_REPORT_RESP: + case IB_MGMT_METHOD_TRAP_REPRESS: + case IB_MGMT_METHOD_GET_RESP: + /* + * The ib_mad module will call us to process responses + * before checking for other consumers. + * Just tell the caller to process it normally. 
+ */ + ret = IB_MAD_RESULT_SUCCESS; + goto bail; + default: + smp->status |= IB_SMP_UNSUP_METHOD; + ret = reply(smp); + } + +bail: + return ret; +} + +static int process_perf(struct ib_device *ibdev, u8 port_num, + struct ib_mad *in_mad, + struct ib_mad *out_mad) +{ + struct ib_pma_mad *pmp = (struct ib_pma_mad *)out_mad; + int ret; + + *out_mad = *in_mad; + if (pmp->mad_hdr.class_version != 1) { + pmp->mad_hdr.status |= IB_SMP_UNSUP_VERSION; + ret = reply((struct ib_smp *) pmp); + goto bail; + } + + switch (pmp->mad_hdr.method) { + case IB_MGMT_METHOD_GET: + switch (pmp->mad_hdr.attr_id) { + case IB_PMA_CLASS_PORT_INFO: + ret = recv_pma_get_classportinfo(pmp); + goto bail; + case IB_PMA_PORT_SAMPLES_CONTROL: + ret = recv_pma_get_portsamplescontrol(pmp, ibdev, + port_num); + goto bail; + case IB_PMA_PORT_SAMPLES_RESULT: + ret = recv_pma_get_portsamplesresult(pmp, ibdev); + goto bail; + case IB_PMA_PORT_SAMPLES_RESULT_EXT: + ret = recv_pma_get_portsamplesresult_ext(pmp, + ibdev); + goto bail; + case IB_PMA_PORT_COUNTERS: + ret = recv_pma_get_portcounters(pmp, ibdev, + port_num); + goto bail; + case IB_PMA_PORT_COUNTERS_EXT: + ret = recv_pma_get_portcounters_ext(pmp, ibdev, + port_num); + goto bail; + default: + pmp->mad_hdr.status |= IB_SMP_UNSUP_METH_ATTR; + ret = reply((struct ib_smp *) pmp); + goto bail; + } + + case IB_MGMT_METHOD_SET: + switch (pmp->mad_hdr.attr_id) { + case IB_PMA_PORT_SAMPLES_CONTROL: + ret = recv_pma_set_portsamplescontrol(pmp, ibdev, + port_num); + goto bail; + case IB_PMA_PORT_COUNTERS: + ret = recv_pma_set_portcounters(pmp, ibdev, + port_num); + goto bail; + case IB_PMA_PORT_COUNTERS_EXT: + ret = recv_pma_set_portcounters_ext(pmp, ibdev, + port_num); + goto bail; + default: + pmp->mad_hdr.status |= IB_SMP_UNSUP_METH_ATTR; + ret = reply((struct ib_smp *) pmp); + goto bail; + } + + case IB_MGMT_METHOD_GET_RESP: + /* + * The ib_mad module will call us to process responses + * before checking for other consumers. + * Just tell the caller to process it normally. + */ + ret = IB_MAD_RESULT_SUCCESS; + goto bail; + default: + pmp->mad_hdr.status |= IB_SMP_UNSUP_METHOD; + ret = reply((struct ib_smp *) pmp); + } + +bail: + return ret; +} + +/** + * ipath_process_mad - process an incoming MAD packet + * @ibdev: the infiniband device this packet came in on + * @mad_flags: MAD flags + * @port_num: the port number this packet came in on + * @in_wc: the work completion entry for this packet + * @in_grh: the global route header for this packet + * @in_mad: the incoming MAD + * @out_mad: any outgoing MAD reply + * + * Returns IB_MAD_RESULT_SUCCESS if this is a MAD that we are not + * interested in processing. + * + * Note that the verbs framework has already done the MAD sanity checks, + * and hop count/pointer updating for IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE + * MADs. + * + * This is called by the ib_mad module. 
+ */ +int ipath_process_mad(struct ib_device *ibdev, int mad_flags, u8 port_num, + struct ib_wc *in_wc, struct ib_grh *in_grh, + struct ib_mad *in_mad, struct ib_mad *out_mad) +{ + int ret; + + switch (in_mad->mad_hdr.mgmt_class) { + case IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE: + case IB_MGMT_CLASS_SUBN_LID_ROUTED: + ret = process_subn(ibdev, mad_flags, port_num, + in_mad, out_mad); + goto bail; + case IB_MGMT_CLASS_PERF_MGMT: + ret = process_perf(ibdev, port_num, in_mad, out_mad); + goto bail; + default: + ret = IB_MAD_RESULT_SUCCESS; + } + +bail: + return ret; +} diff --git a/kernel/drivers/infiniband/hw/ipath/ipath_mmap.c b/kernel/drivers/infiniband/hw/ipath/ipath_mmap.c new file mode 100644 index 000000000..e73274229 --- /dev/null +++ b/kernel/drivers/infiniband/hw/ipath/ipath_mmap.c @@ -0,0 +1,174 @@ +/* + * Copyright (c) 2006, 2007 QLogic Corporation. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include +#include +#include +#include +#include +#include + +#include "ipath_verbs.h" + +/** + * ipath_release_mmap_info - free mmap info structure + * @ref: a pointer to the kref within struct ipath_mmap_info + */ +void ipath_release_mmap_info(struct kref *ref) +{ + struct ipath_mmap_info *ip = + container_of(ref, struct ipath_mmap_info, ref); + struct ipath_ibdev *dev = to_idev(ip->context->device); + + spin_lock_irq(&dev->pending_lock); + list_del(&ip->pending_mmaps); + spin_unlock_irq(&dev->pending_lock); + + vfree(ip->obj); + kfree(ip); +} + +/* + * open and close keep track of how many times the CQ is mapped, + * to avoid releasing it. + */ +static void ipath_vma_open(struct vm_area_struct *vma) +{ + struct ipath_mmap_info *ip = vma->vm_private_data; + + kref_get(&ip->ref); +} + +static void ipath_vma_close(struct vm_area_struct *vma) +{ + struct ipath_mmap_info *ip = vma->vm_private_data; + + kref_put(&ip->ref, ipath_release_mmap_info); +} + +static const struct vm_operations_struct ipath_vm_ops = { + .open = ipath_vma_open, + .close = ipath_vma_close, +}; + +/** + * ipath_mmap - create a new mmap region + * @context: the IB user context of the process making the mmap() call + * @vma: the VMA to be initialized + * Return zero if the mmap is OK. Otherwise, return an errno. 
+ */ +int ipath_mmap(struct ib_ucontext *context, struct vm_area_struct *vma) +{ + struct ipath_ibdev *dev = to_idev(context->device); + unsigned long offset = vma->vm_pgoff << PAGE_SHIFT; + unsigned long size = vma->vm_end - vma->vm_start; + struct ipath_mmap_info *ip, *pp; + int ret = -EINVAL; + + /* + * Search the device's list of objects waiting for a mmap call. + * Normally, this list is very short since a call to create a + * CQ, QP, or SRQ is soon followed by a call to mmap(). + */ + spin_lock_irq(&dev->pending_lock); + list_for_each_entry_safe(ip, pp, &dev->pending_mmaps, + pending_mmaps) { + /* Only the creator is allowed to mmap the object */ + if (context != ip->context || (__u64) offset != ip->offset) + continue; + /* Don't allow a mmap larger than the object. */ + if (size > ip->size) + break; + + list_del_init(&ip->pending_mmaps); + spin_unlock_irq(&dev->pending_lock); + + ret = remap_vmalloc_range(vma, ip->obj, 0); + if (ret) + goto done; + vma->vm_ops = &ipath_vm_ops; + vma->vm_private_data = ip; + ipath_vma_open(vma); + goto done; + } + spin_unlock_irq(&dev->pending_lock); +done: + return ret; +} + +/* + * Allocate information for ipath_mmap + */ +struct ipath_mmap_info *ipath_create_mmap_info(struct ipath_ibdev *dev, + u32 size, + struct ib_ucontext *context, + void *obj) { + struct ipath_mmap_info *ip; + + ip = kmalloc(sizeof *ip, GFP_KERNEL); + if (!ip) + goto bail; + + size = PAGE_ALIGN(size); + + spin_lock_irq(&dev->mmap_offset_lock); + if (dev->mmap_offset == 0) + dev->mmap_offset = PAGE_SIZE; + ip->offset = dev->mmap_offset; + dev->mmap_offset += size; + spin_unlock_irq(&dev->mmap_offset_lock); + + INIT_LIST_HEAD(&ip->pending_mmaps); + ip->size = size; + ip->context = context; + ip->obj = obj; + kref_init(&ip->ref); + +bail: + return ip; +} + +void ipath_update_mmap_info(struct ipath_ibdev *dev, + struct ipath_mmap_info *ip, + u32 size, void *obj) { + size = PAGE_ALIGN(size); + + spin_lock_irq(&dev->mmap_offset_lock); + if (dev->mmap_offset == 0) + dev->mmap_offset = PAGE_SIZE; + ip->offset = dev->mmap_offset; + dev->mmap_offset += size; + spin_unlock_irq(&dev->mmap_offset_lock); + + ip->size = size; + ip->obj = obj; +} diff --git a/kernel/drivers/infiniband/hw/ipath/ipath_mr.c b/kernel/drivers/infiniband/hw/ipath/ipath_mr.c new file mode 100644 index 000000000..c7278f6a8 --- /dev/null +++ b/kernel/drivers/infiniband/hw/ipath/ipath_mr.c @@ -0,0 +1,425 @@ +/* + * Copyright (c) 2006, 2007 QLogic Corporation. All rights reserved. + * Copyright (c) 2005, 2006 PathScale, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include + +#include +#include +#include + +#include "ipath_verbs.h" + +/* Fast memory region */ +struct ipath_fmr { + struct ib_fmr ibfmr; + u8 page_shift; + struct ipath_mregion mr; /* must be last */ +}; + +static inline struct ipath_fmr *to_ifmr(struct ib_fmr *ibfmr) +{ + return container_of(ibfmr, struct ipath_fmr, ibfmr); +} + +/** + * ipath_get_dma_mr - get a DMA memory region + * @pd: protection domain for this memory region + * @acc: access flags + * + * Returns the memory region on success, otherwise returns an errno. + * Note that all DMA addresses should be created via the + * struct ib_dma_mapping_ops functions (see ipath_dma.c). + */ +struct ib_mr *ipath_get_dma_mr(struct ib_pd *pd, int acc) +{ + struct ipath_mr *mr; + struct ib_mr *ret; + + mr = kzalloc(sizeof *mr, GFP_KERNEL); + if (!mr) { + ret = ERR_PTR(-ENOMEM); + goto bail; + } + + mr->mr.access_flags = acc; + ret = &mr->ibmr; + +bail: + return ret; +} + +static struct ipath_mr *alloc_mr(int count, + struct ipath_lkey_table *lk_table) +{ + struct ipath_mr *mr; + int m, i = 0; + + /* Allocate struct plus pointers to first level page tables. */ + m = (count + IPATH_SEGSZ - 1) / IPATH_SEGSZ; + mr = kmalloc(sizeof *mr + m * sizeof mr->mr.map[0], GFP_KERNEL); + if (!mr) + goto done; + + /* Allocate first level page tables. */ + for (; i < m; i++) { + mr->mr.map[i] = kmalloc(sizeof *mr->mr.map[0], GFP_KERNEL); + if (!mr->mr.map[i]) + goto bail; + } + mr->mr.mapsz = m; + + /* + * ib_reg_phys_mr() will initialize mr->ibmr except for + * lkey and rkey. + */ + if (!ipath_alloc_lkey(lk_table, &mr->mr)) + goto bail; + mr->ibmr.rkey = mr->ibmr.lkey = mr->mr.lkey; + + goto done; + +bail: + while (i) { + i--; + kfree(mr->mr.map[i]); + } + kfree(mr); + mr = NULL; + +done: + return mr; +} + +/** + * ipath_reg_phys_mr - register a physical memory region + * @pd: protection domain for this memory region + * @buffer_list: pointer to the list of physical buffers to register + * @num_phys_buf: the number of physical buffers to register + * @iova_start: the starting address passed over IB which maps to this MR + * + * Returns the memory region on success, otherwise returns an errno. 
+ */ +struct ib_mr *ipath_reg_phys_mr(struct ib_pd *pd, + struct ib_phys_buf *buffer_list, + int num_phys_buf, int acc, u64 *iova_start) +{ + struct ipath_mr *mr; + int n, m, i; + struct ib_mr *ret; + + mr = alloc_mr(num_phys_buf, &to_idev(pd->device)->lk_table); + if (mr == NULL) { + ret = ERR_PTR(-ENOMEM); + goto bail; + } + + mr->mr.pd = pd; + mr->mr.user_base = *iova_start; + mr->mr.iova = *iova_start; + mr->mr.length = 0; + mr->mr.offset = 0; + mr->mr.access_flags = acc; + mr->mr.max_segs = num_phys_buf; + mr->umem = NULL; + + m = 0; + n = 0; + for (i = 0; i < num_phys_buf; i++) { + mr->mr.map[m]->segs[n].vaddr = (void *) buffer_list[i].addr; + mr->mr.map[m]->segs[n].length = buffer_list[i].size; + mr->mr.length += buffer_list[i].size; + n++; + if (n == IPATH_SEGSZ) { + m++; + n = 0; + } + } + + ret = &mr->ibmr; + +bail: + return ret; +} + +/** + * ipath_reg_user_mr - register a userspace memory region + * @pd: protection domain for this memory region + * @start: starting userspace address + * @length: length of region to register + * @virt_addr: virtual address to use (from HCA's point of view) + * @mr_access_flags: access flags for this memory region + * @udata: unused by the InfiniPath driver + * + * Returns the memory region on success, otherwise returns an errno. + */ +struct ib_mr *ipath_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, + u64 virt_addr, int mr_access_flags, + struct ib_udata *udata) +{ + struct ipath_mr *mr; + struct ib_umem *umem; + int n, m, entry; + struct scatterlist *sg; + struct ib_mr *ret; + + if (length == 0) { + ret = ERR_PTR(-EINVAL); + goto bail; + } + + umem = ib_umem_get(pd->uobject->context, start, length, + mr_access_flags, 0); + if (IS_ERR(umem)) + return (void *) umem; + + n = umem->nmap; + mr = alloc_mr(n, &to_idev(pd->device)->lk_table); + if (!mr) { + ret = ERR_PTR(-ENOMEM); + ib_umem_release(umem); + goto bail; + } + + mr->mr.pd = pd; + mr->mr.user_base = start; + mr->mr.iova = virt_addr; + mr->mr.length = length; + mr->mr.offset = ib_umem_offset(umem); + mr->mr.access_flags = mr_access_flags; + mr->mr.max_segs = n; + mr->umem = umem; + + m = 0; + n = 0; + for_each_sg(umem->sg_head.sgl, sg, umem->nmap, entry) { + void *vaddr; + + vaddr = page_address(sg_page(sg)); + if (!vaddr) { + ret = ERR_PTR(-EINVAL); + goto bail; + } + mr->mr.map[m]->segs[n].vaddr = vaddr; + mr->mr.map[m]->segs[n].length = umem->page_size; + n++; + if (n == IPATH_SEGSZ) { + m++; + n = 0; + } + } + ret = &mr->ibmr; + +bail: + return ret; +} + +/** + * ipath_dereg_mr - unregister and free a memory region + * @ibmr: the memory region to free + * + * Returns 0 on success. + * + * Note that this is called to free MRs created by ipath_get_dma_mr() + * or ipath_reg_user_mr(). + */ +int ipath_dereg_mr(struct ib_mr *ibmr) +{ + struct ipath_mr *mr = to_imr(ibmr); + int i; + + ipath_free_lkey(&to_idev(ibmr->device)->lk_table, ibmr->lkey); + i = mr->mr.mapsz; + while (i) { + i--; + kfree(mr->mr.map[i]); + } + + if (mr->umem) + ib_umem_release(mr->umem); + + kfree(mr); + return 0; +} + +/** + * ipath_alloc_fmr - allocate a fast memory region + * @pd: the protection domain for this memory region + * @mr_access_flags: access flags for this memory region + * @fmr_attr: fast memory region attributes + * + * Returns the memory region on success, otherwise returns an errno. 
+ */ +struct ib_fmr *ipath_alloc_fmr(struct ib_pd *pd, int mr_access_flags, + struct ib_fmr_attr *fmr_attr) +{ + struct ipath_fmr *fmr; + int m, i = 0; + struct ib_fmr *ret; + + /* Allocate struct plus pointers to first level page tables. */ + m = (fmr_attr->max_pages + IPATH_SEGSZ - 1) / IPATH_SEGSZ; + fmr = kmalloc(sizeof *fmr + m * sizeof fmr->mr.map[0], GFP_KERNEL); + if (!fmr) + goto bail; + + /* Allocate first level page tables. */ + for (; i < m; i++) { + fmr->mr.map[i] = kmalloc(sizeof *fmr->mr.map[0], + GFP_KERNEL); + if (!fmr->mr.map[i]) + goto bail; + } + fmr->mr.mapsz = m; + + /* + * ib_alloc_fmr() will initialize fmr->ibfmr except for lkey & + * rkey. + */ + if (!ipath_alloc_lkey(&to_idev(pd->device)->lk_table, &fmr->mr)) + goto bail; + fmr->ibfmr.rkey = fmr->ibfmr.lkey = fmr->mr.lkey; + /* + * Resources are allocated but no valid mapping (RKEY can't be + * used). + */ + fmr->mr.pd = pd; + fmr->mr.user_base = 0; + fmr->mr.iova = 0; + fmr->mr.length = 0; + fmr->mr.offset = 0; + fmr->mr.access_flags = mr_access_flags; + fmr->mr.max_segs = fmr_attr->max_pages; + fmr->page_shift = fmr_attr->page_shift; + + ret = &fmr->ibfmr; + goto done; + +bail: + while (i) + kfree(fmr->mr.map[--i]); + kfree(fmr); + ret = ERR_PTR(-ENOMEM); + +done: + return ret; +} + +/** + * ipath_map_phys_fmr - set up a fast memory region + * @ibmfr: the fast memory region to set up + * @page_list: the list of pages to associate with the fast memory region + * @list_len: the number of pages to associate with the fast memory region + * @iova: the virtual address of the start of the fast memory region + * + * This may be called from interrupt context. + */ + +int ipath_map_phys_fmr(struct ib_fmr *ibfmr, u64 * page_list, + int list_len, u64 iova) +{ + struct ipath_fmr *fmr = to_ifmr(ibfmr); + struct ipath_lkey_table *rkt; + unsigned long flags; + int m, n, i; + u32 ps; + int ret; + + if (list_len > fmr->mr.max_segs) { + ret = -EINVAL; + goto bail; + } + rkt = &to_idev(ibfmr->device)->lk_table; + spin_lock_irqsave(&rkt->lock, flags); + fmr->mr.user_base = iova; + fmr->mr.iova = iova; + ps = 1 << fmr->page_shift; + fmr->mr.length = list_len * ps; + m = 0; + n = 0; + ps = 1 << fmr->page_shift; + for (i = 0; i < list_len; i++) { + fmr->mr.map[m]->segs[n].vaddr = (void *) page_list[i]; + fmr->mr.map[m]->segs[n].length = ps; + if (++n == IPATH_SEGSZ) { + m++; + n = 0; + } + } + spin_unlock_irqrestore(&rkt->lock, flags); + ret = 0; + +bail: + return ret; +} + +/** + * ipath_unmap_fmr - unmap fast memory regions + * @fmr_list: the list of fast memory regions to unmap + * + * Returns 0 on success. + */ +int ipath_unmap_fmr(struct list_head *fmr_list) +{ + struct ipath_fmr *fmr; + struct ipath_lkey_table *rkt; + unsigned long flags; + + list_for_each_entry(fmr, fmr_list, ibfmr.list) { + rkt = &to_idev(fmr->ibfmr.device)->lk_table; + spin_lock_irqsave(&rkt->lock, flags); + fmr->mr.user_base = 0; + fmr->mr.iova = 0; + fmr->mr.length = 0; + spin_unlock_irqrestore(&rkt->lock, flags); + } + return 0; +} + +/** + * ipath_dealloc_fmr - deallocate a fast memory region + * @ibfmr: the fast memory region to deallocate + * + * Returns 0 on success. 
+ */ +int ipath_dealloc_fmr(struct ib_fmr *ibfmr) +{ + struct ipath_fmr *fmr = to_ifmr(ibfmr); + int i; + + ipath_free_lkey(&to_idev(ibfmr->device)->lk_table, ibfmr->lkey); + i = fmr->mr.mapsz; + while (i) + kfree(fmr->mr.map[--i]); + kfree(fmr); + return 0; +} diff --git a/kernel/drivers/infiniband/hw/ipath/ipath_qp.c b/kernel/drivers/infiniband/hw/ipath/ipath_qp.c new file mode 100644 index 000000000..face87602 --- /dev/null +++ b/kernel/drivers/infiniband/hw/ipath/ipath_qp.c @@ -0,0 +1,1080 @@ +/* + * Copyright (c) 2006, 2007, 2008 QLogic Corporation. All rights reserved. + * Copyright (c) 2005, 2006 PathScale, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include +#include +#include +#include + +#include "ipath_verbs.h" +#include "ipath_kernel.h" + +#define BITS_PER_PAGE (PAGE_SIZE*BITS_PER_BYTE) +#define BITS_PER_PAGE_MASK (BITS_PER_PAGE-1) +#define mk_qpn(qpt, map, off) (((map) - (qpt)->map) * BITS_PER_PAGE + \ + (off)) +#define find_next_offset(map, off) find_next_zero_bit((map)->page, \ + BITS_PER_PAGE, off) + +/* + * Convert the AETH credit code into the number of credits. + */ +static u32 credit_table[31] = { + 0, /* 0 */ + 1, /* 1 */ + 2, /* 2 */ + 3, /* 3 */ + 4, /* 4 */ + 6, /* 5 */ + 8, /* 6 */ + 12, /* 7 */ + 16, /* 8 */ + 24, /* 9 */ + 32, /* A */ + 48, /* B */ + 64, /* C */ + 96, /* D */ + 128, /* E */ + 192, /* F */ + 256, /* 10 */ + 384, /* 11 */ + 512, /* 12 */ + 768, /* 13 */ + 1024, /* 14 */ + 1536, /* 15 */ + 2048, /* 16 */ + 3072, /* 17 */ + 4096, /* 18 */ + 6144, /* 19 */ + 8192, /* 1A */ + 12288, /* 1B */ + 16384, /* 1C */ + 24576, /* 1D */ + 32768 /* 1E */ +}; + + +static void get_map_page(struct ipath_qp_table *qpt, struct qpn_map *map) +{ + unsigned long page = get_zeroed_page(GFP_KERNEL); + unsigned long flags; + + /* + * Free the page if someone raced with us installing it. 
+ */ + + spin_lock_irqsave(&qpt->lock, flags); + if (map->page) + free_page(page); + else + map->page = (void *)page; + spin_unlock_irqrestore(&qpt->lock, flags); +} + + +static int alloc_qpn(struct ipath_qp_table *qpt, enum ib_qp_type type) +{ + u32 i, offset, max_scan, qpn; + struct qpn_map *map; + u32 ret = -1; + + if (type == IB_QPT_SMI) + ret = 0; + else if (type == IB_QPT_GSI) + ret = 1; + + if (ret != -1) { + map = &qpt->map[0]; + if (unlikely(!map->page)) { + get_map_page(qpt, map); + if (unlikely(!map->page)) { + ret = -ENOMEM; + goto bail; + } + } + if (!test_and_set_bit(ret, map->page)) + atomic_dec(&map->n_free); + else + ret = -EBUSY; + goto bail; + } + + qpn = qpt->last + 1; + if (qpn >= QPN_MAX) + qpn = 2; + offset = qpn & BITS_PER_PAGE_MASK; + map = &qpt->map[qpn / BITS_PER_PAGE]; + max_scan = qpt->nmaps - !offset; + for (i = 0;;) { + if (unlikely(!map->page)) { + get_map_page(qpt, map); + if (unlikely(!map->page)) + break; + } + if (likely(atomic_read(&map->n_free))) { + do { + if (!test_and_set_bit(offset, map->page)) { + atomic_dec(&map->n_free); + qpt->last = qpn; + ret = qpn; + goto bail; + } + offset = find_next_offset(map, offset); + qpn = mk_qpn(qpt, map, offset); + /* + * This test differs from alloc_pidmap(). + * If find_next_offset() does find a zero + * bit, we don't need to check for QPN + * wrapping around past our starting QPN. + * We just need to be sure we don't loop + * forever. + */ + } while (offset < BITS_PER_PAGE && qpn < QPN_MAX); + } + /* + * In order to keep the number of pages allocated to a + * minimum, we scan the all existing pages before increasing + * the size of the bitmap table. + */ + if (++i > max_scan) { + if (qpt->nmaps == QPNMAP_ENTRIES) + break; + map = &qpt->map[qpt->nmaps++]; + offset = 0; + } else if (map < &qpt->map[qpt->nmaps]) { + ++map; + offset = 0; + } else { + map = &qpt->map[0]; + offset = 2; + } + qpn = mk_qpn(qpt, map, offset); + } + + ret = -ENOMEM; + +bail: + return ret; +} + +static void free_qpn(struct ipath_qp_table *qpt, u32 qpn) +{ + struct qpn_map *map; + + map = qpt->map + qpn / BITS_PER_PAGE; + if (map->page) + clear_bit(qpn & BITS_PER_PAGE_MASK, map->page); + atomic_inc(&map->n_free); +} + +/** + * ipath_alloc_qpn - allocate a QP number + * @qpt: the QP table + * @qp: the QP + * @type: the QP type (IB_QPT_SMI and IB_QPT_GSI are special) + * + * Allocate the next available QPN and put the QP into the hash table. + * The hash table holds a reference to the QP. + */ +static int ipath_alloc_qpn(struct ipath_qp_table *qpt, struct ipath_qp *qp, + enum ib_qp_type type) +{ + unsigned long flags; + int ret; + + ret = alloc_qpn(qpt, type); + if (ret < 0) + goto bail; + qp->ibqp.qp_num = ret; + + /* Add the QP to the hash table. */ + spin_lock_irqsave(&qpt->lock, flags); + + ret %= qpt->max; + qp->next = qpt->table[ret]; + qpt->table[ret] = qp; + atomic_inc(&qp->refcount); + + spin_unlock_irqrestore(&qpt->lock, flags); + ret = 0; + +bail: + return ret; +} + +/** + * ipath_free_qp - remove a QP from the QP table + * @qpt: the QP table + * @qp: the QP to remove + * + * Remove the QP from the table so it can't be found asynchronously by + * the receive interrupt routine. + */ +static void ipath_free_qp(struct ipath_qp_table *qpt, struct ipath_qp *qp) +{ + struct ipath_qp *q, **qpp; + unsigned long flags; + + spin_lock_irqsave(&qpt->lock, flags); + + /* Remove QP from the hash table. 
*/ + qpp = &qpt->table[qp->ibqp.qp_num % qpt->max]; + for (; (q = *qpp) != NULL; qpp = &q->next) { + if (q == qp) { + *qpp = qp->next; + qp->next = NULL; + atomic_dec(&qp->refcount); + break; + } + } + + spin_unlock_irqrestore(&qpt->lock, flags); +} + +/** + * ipath_free_all_qps - check for QPs still in use + * @qpt: the QP table to empty + * + * There should not be any QPs still in use. + * Free memory for table. + */ +unsigned ipath_free_all_qps(struct ipath_qp_table *qpt) +{ + unsigned long flags; + struct ipath_qp *qp; + u32 n, qp_inuse = 0; + + spin_lock_irqsave(&qpt->lock, flags); + for (n = 0; n < qpt->max; n++) { + qp = qpt->table[n]; + qpt->table[n] = NULL; + + for (; qp; qp = qp->next) + qp_inuse++; + } + spin_unlock_irqrestore(&qpt->lock, flags); + + for (n = 0; n < ARRAY_SIZE(qpt->map); n++) + if (qpt->map[n].page) + free_page((unsigned long) qpt->map[n].page); + return qp_inuse; +} + +/** + * ipath_lookup_qpn - return the QP with the given QPN + * @qpt: the QP table + * @qpn: the QP number to look up + * + * The caller is responsible for decrementing the QP reference count + * when done. + */ +struct ipath_qp *ipath_lookup_qpn(struct ipath_qp_table *qpt, u32 qpn) +{ + unsigned long flags; + struct ipath_qp *qp; + + spin_lock_irqsave(&qpt->lock, flags); + + for (qp = qpt->table[qpn % qpt->max]; qp; qp = qp->next) { + if (qp->ibqp.qp_num == qpn) { + atomic_inc(&qp->refcount); + break; + } + } + + spin_unlock_irqrestore(&qpt->lock, flags); + return qp; +} + +/** + * ipath_reset_qp - initialize the QP state to the reset state + * @qp: the QP to reset + * @type: the QP type + */ +static void ipath_reset_qp(struct ipath_qp *qp, enum ib_qp_type type) +{ + qp->remote_qpn = 0; + qp->qkey = 0; + qp->qp_access_flags = 0; + atomic_set(&qp->s_dma_busy, 0); + qp->s_flags &= IPATH_S_SIGNAL_REQ_WR; + qp->s_hdrwords = 0; + qp->s_wqe = NULL; + qp->s_pkt_delay = 0; + qp->s_draining = 0; + qp->s_psn = 0; + qp->r_psn = 0; + qp->r_msn = 0; + if (type == IB_QPT_RC) { + qp->s_state = IB_OPCODE_RC_SEND_LAST; + qp->r_state = IB_OPCODE_RC_SEND_LAST; + } else { + qp->s_state = IB_OPCODE_UC_SEND_LAST; + qp->r_state = IB_OPCODE_UC_SEND_LAST; + } + qp->s_ack_state = IB_OPCODE_RC_ACKNOWLEDGE; + qp->r_nak_state = 0; + qp->r_aflags = 0; + qp->r_flags = 0; + qp->s_rnr_timeout = 0; + qp->s_head = 0; + qp->s_tail = 0; + qp->s_cur = 0; + qp->s_last = 0; + qp->s_ssn = 1; + qp->s_lsn = 0; + memset(qp->s_ack_queue, 0, sizeof(qp->s_ack_queue)); + qp->r_head_ack_queue = 0; + qp->s_tail_ack_queue = 0; + qp->s_num_rd_atomic = 0; + if (qp->r_rq.wq) { + qp->r_rq.wq->head = 0; + qp->r_rq.wq->tail = 0; + } +} + +/** + * ipath_error_qp - put a QP into the error state + * @qp: the QP to put into the error state + * @err: the receive completion error to signal if a RWQE is active + * + * Flushes both send and receive work queues. + * Returns true if last WQE event should be generated. + * The QP s_lock should be held and interrupts disabled. + * If we are already in error state, just return. + */ + +int ipath_error_qp(struct ipath_qp *qp, enum ib_wc_status err) +{ + struct ipath_ibdev *dev = to_idev(qp->ibqp.device); + struct ib_wc wc; + int ret = 0; + + if (qp->state == IB_QPS_ERR) + goto bail; + + qp->state = IB_QPS_ERR; + + spin_lock(&dev->pending_lock); + if (!list_empty(&qp->timerwait)) + list_del_init(&qp->timerwait); + if (!list_empty(&qp->piowait)) + list_del_init(&qp->piowait); + spin_unlock(&dev->pending_lock); + + /* Schedule the sending tasklet to drain the send work queue. 
*/ + if (qp->s_last != qp->s_head) + ipath_schedule_send(qp); + + memset(&wc, 0, sizeof(wc)); + wc.qp = &qp->ibqp; + wc.opcode = IB_WC_RECV; + + if (test_and_clear_bit(IPATH_R_WRID_VALID, &qp->r_aflags)) { + wc.wr_id = qp->r_wr_id; + wc.status = err; + ipath_cq_enter(to_icq(qp->ibqp.recv_cq), &wc, 1); + } + wc.status = IB_WC_WR_FLUSH_ERR; + + if (qp->r_rq.wq) { + struct ipath_rwq *wq; + u32 head; + u32 tail; + + spin_lock(&qp->r_rq.lock); + + /* sanity check pointers before trusting them */ + wq = qp->r_rq.wq; + head = wq->head; + if (head >= qp->r_rq.size) + head = 0; + tail = wq->tail; + if (tail >= qp->r_rq.size) + tail = 0; + while (tail != head) { + wc.wr_id = get_rwqe_ptr(&qp->r_rq, tail)->wr_id; + if (++tail >= qp->r_rq.size) + tail = 0; + ipath_cq_enter(to_icq(qp->ibqp.recv_cq), &wc, 1); + } + wq->tail = tail; + + spin_unlock(&qp->r_rq.lock); + } else if (qp->ibqp.event_handler) + ret = 1; + +bail: + return ret; +} + +/** + * ipath_modify_qp - modify the attributes of a queue pair + * @ibqp: the queue pair who's attributes we're modifying + * @attr: the new attributes + * @attr_mask: the mask of attributes to modify + * @udata: user data for ipathverbs.so + * + * Returns 0 on success, otherwise returns an errno. + */ +int ipath_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, + int attr_mask, struct ib_udata *udata) +{ + struct ipath_ibdev *dev = to_idev(ibqp->device); + struct ipath_qp *qp = to_iqp(ibqp); + enum ib_qp_state cur_state, new_state; + int lastwqe = 0; + int ret; + + spin_lock_irq(&qp->s_lock); + + cur_state = attr_mask & IB_QP_CUR_STATE ? + attr->cur_qp_state : qp->state; + new_state = attr_mask & IB_QP_STATE ? attr->qp_state : cur_state; + + if (!ib_modify_qp_is_ok(cur_state, new_state, ibqp->qp_type, + attr_mask, IB_LINK_LAYER_UNSPECIFIED)) + goto inval; + + if (attr_mask & IB_QP_AV) { + if (attr->ah_attr.dlid == 0 || + attr->ah_attr.dlid >= IPATH_MULTICAST_LID_BASE) + goto inval; + + if ((attr->ah_attr.ah_flags & IB_AH_GRH) && + (attr->ah_attr.grh.sgid_index > 1)) + goto inval; + } + + if (attr_mask & IB_QP_PKEY_INDEX) + if (attr->pkey_index >= ipath_get_npkeys(dev->dd)) + goto inval; + + if (attr_mask & IB_QP_MIN_RNR_TIMER) + if (attr->min_rnr_timer > 31) + goto inval; + + if (attr_mask & IB_QP_PORT) + if (attr->port_num == 0 || + attr->port_num > ibqp->device->phys_port_cnt) + goto inval; + + /* + * don't allow invalid Path MTU values or greater than 2048 + * unless we are configured for a 4KB MTU + */ + if ((attr_mask & IB_QP_PATH_MTU) && + (ib_mtu_enum_to_int(attr->path_mtu) == -1 || + (attr->path_mtu > IB_MTU_2048 && !ipath_mtu4096))) + goto inval; + + if (attr_mask & IB_QP_PATH_MIG_STATE) + if (attr->path_mig_state != IB_MIG_MIGRATED && + attr->path_mig_state != IB_MIG_REARM) + goto inval; + + if (attr_mask & IB_QP_MAX_DEST_RD_ATOMIC) + if (attr->max_dest_rd_atomic > IPATH_MAX_RDMA_ATOMIC) + goto inval; + + switch (new_state) { + case IB_QPS_RESET: + if (qp->state != IB_QPS_RESET) { + qp->state = IB_QPS_RESET; + spin_lock(&dev->pending_lock); + if (!list_empty(&qp->timerwait)) + list_del_init(&qp->timerwait); + if (!list_empty(&qp->piowait)) + list_del_init(&qp->piowait); + spin_unlock(&dev->pending_lock); + qp->s_flags &= ~IPATH_S_ANY_WAIT; + spin_unlock_irq(&qp->s_lock); + /* Stop the sending tasklet */ + tasklet_kill(&qp->s_task); + wait_event(qp->wait_dma, !atomic_read(&qp->s_dma_busy)); + spin_lock_irq(&qp->s_lock); + } + ipath_reset_qp(qp, ibqp->qp_type); + break; + + case IB_QPS_SQD: + qp->s_draining = qp->s_last != qp->s_cur; + qp->state = 
new_state; + break; + + case IB_QPS_SQE: + if (qp->ibqp.qp_type == IB_QPT_RC) + goto inval; + qp->state = new_state; + break; + + case IB_QPS_ERR: + lastwqe = ipath_error_qp(qp, IB_WC_WR_FLUSH_ERR); + break; + + default: + qp->state = new_state; + break; + } + + if (attr_mask & IB_QP_PKEY_INDEX) + qp->s_pkey_index = attr->pkey_index; + + if (attr_mask & IB_QP_DEST_QPN) + qp->remote_qpn = attr->dest_qp_num; + + if (attr_mask & IB_QP_SQ_PSN) { + qp->s_psn = qp->s_next_psn = attr->sq_psn; + qp->s_last_psn = qp->s_next_psn - 1; + } + + if (attr_mask & IB_QP_RQ_PSN) + qp->r_psn = attr->rq_psn; + + if (attr_mask & IB_QP_ACCESS_FLAGS) + qp->qp_access_flags = attr->qp_access_flags; + + if (attr_mask & IB_QP_AV) { + qp->remote_ah_attr = attr->ah_attr; + qp->s_dmult = ipath_ib_rate_to_mult(attr->ah_attr.static_rate); + } + + if (attr_mask & IB_QP_PATH_MTU) + qp->path_mtu = attr->path_mtu; + + if (attr_mask & IB_QP_RETRY_CNT) + qp->s_retry = qp->s_retry_cnt = attr->retry_cnt; + + if (attr_mask & IB_QP_RNR_RETRY) { + qp->s_rnr_retry = attr->rnr_retry; + if (qp->s_rnr_retry > 7) + qp->s_rnr_retry = 7; + qp->s_rnr_retry_cnt = qp->s_rnr_retry; + } + + if (attr_mask & IB_QP_MIN_RNR_TIMER) + qp->r_min_rnr_timer = attr->min_rnr_timer; + + if (attr_mask & IB_QP_TIMEOUT) + qp->timeout = attr->timeout; + + if (attr_mask & IB_QP_QKEY) + qp->qkey = attr->qkey; + + if (attr_mask & IB_QP_MAX_DEST_RD_ATOMIC) + qp->r_max_rd_atomic = attr->max_dest_rd_atomic; + + if (attr_mask & IB_QP_MAX_QP_RD_ATOMIC) + qp->s_max_rd_atomic = attr->max_rd_atomic; + + spin_unlock_irq(&qp->s_lock); + + if (lastwqe) { + struct ib_event ev; + + ev.device = qp->ibqp.device; + ev.element.qp = &qp->ibqp; + ev.event = IB_EVENT_QP_LAST_WQE_REACHED; + qp->ibqp.event_handler(&ev, qp->ibqp.qp_context); + } + ret = 0; + goto bail; + +inval: + spin_unlock_irq(&qp->s_lock); + ret = -EINVAL; + +bail: + return ret; +} + +int ipath_query_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, + int attr_mask, struct ib_qp_init_attr *init_attr) +{ + struct ipath_qp *qp = to_iqp(ibqp); + + attr->qp_state = qp->state; + attr->cur_qp_state = attr->qp_state; + attr->path_mtu = qp->path_mtu; + attr->path_mig_state = 0; + attr->qkey = qp->qkey; + attr->rq_psn = qp->r_psn; + attr->sq_psn = qp->s_next_psn; + attr->dest_qp_num = qp->remote_qpn; + attr->qp_access_flags = qp->qp_access_flags; + attr->cap.max_send_wr = qp->s_size - 1; + attr->cap.max_recv_wr = qp->ibqp.srq ? 
0 : qp->r_rq.size - 1; + attr->cap.max_send_sge = qp->s_max_sge; + attr->cap.max_recv_sge = qp->r_rq.max_sge; + attr->cap.max_inline_data = 0; + attr->ah_attr = qp->remote_ah_attr; + memset(&attr->alt_ah_attr, 0, sizeof(attr->alt_ah_attr)); + attr->pkey_index = qp->s_pkey_index; + attr->alt_pkey_index = 0; + attr->en_sqd_async_notify = 0; + attr->sq_draining = qp->s_draining; + attr->max_rd_atomic = qp->s_max_rd_atomic; + attr->max_dest_rd_atomic = qp->r_max_rd_atomic; + attr->min_rnr_timer = qp->r_min_rnr_timer; + attr->port_num = 1; + attr->timeout = qp->timeout; + attr->retry_cnt = qp->s_retry_cnt; + attr->rnr_retry = qp->s_rnr_retry_cnt; + attr->alt_port_num = 0; + attr->alt_timeout = 0; + + init_attr->event_handler = qp->ibqp.event_handler; + init_attr->qp_context = qp->ibqp.qp_context; + init_attr->send_cq = qp->ibqp.send_cq; + init_attr->recv_cq = qp->ibqp.recv_cq; + init_attr->srq = qp->ibqp.srq; + init_attr->cap = attr->cap; + if (qp->s_flags & IPATH_S_SIGNAL_REQ_WR) + init_attr->sq_sig_type = IB_SIGNAL_REQ_WR; + else + init_attr->sq_sig_type = IB_SIGNAL_ALL_WR; + init_attr->qp_type = qp->ibqp.qp_type; + init_attr->port_num = 1; + return 0; +} + +/** + * ipath_compute_aeth - compute the AETH (syndrome + MSN) + * @qp: the queue pair to compute the AETH for + * + * Returns the AETH. + */ +__be32 ipath_compute_aeth(struct ipath_qp *qp) +{ + u32 aeth = qp->r_msn & IPATH_MSN_MASK; + + if (qp->ibqp.srq) { + /* + * Shared receive queues don't generate credits. + * Set the credit field to the invalid value. + */ + aeth |= IPATH_AETH_CREDIT_INVAL << IPATH_AETH_CREDIT_SHIFT; + } else { + u32 min, max, x; + u32 credits; + struct ipath_rwq *wq = qp->r_rq.wq; + u32 head; + u32 tail; + + /* sanity check pointers before trusting them */ + head = wq->head; + if (head >= qp->r_rq.size) + head = 0; + tail = wq->tail; + if (tail >= qp->r_rq.size) + tail = 0; + /* + * Compute the number of credits available (RWQEs). + * XXX Not holding the r_rq.lock here so there is a small + * chance that the pair of reads are not atomic. + */ + credits = head - tail; + if ((int)credits < 0) + credits += qp->r_rq.size; + /* + * Binary search the credit table to find the code to + * use. + */ + min = 0; + max = 31; + for (;;) { + x = (min + max) / 2; + if (credit_table[x] == credits) + break; + if (credit_table[x] > credits) + max = x; + else if (min == x) + break; + else + min = x; + } + aeth |= x << IPATH_AETH_CREDIT_SHIFT; + } + return cpu_to_be32(aeth); +} + +/** + * ipath_create_qp - create a queue pair for a device + * @ibpd: the protection domain who's device we create the queue pair for + * @init_attr: the attributes of the queue pair + * @udata: unused by InfiniPath + * + * Returns the queue pair on success, otherwise returns an errno. + * + * Called by the ib_create_qp() core verbs function. + */ +struct ib_qp *ipath_create_qp(struct ib_pd *ibpd, + struct ib_qp_init_attr *init_attr, + struct ib_udata *udata) +{ + struct ipath_qp *qp; + int err; + struct ipath_swqe *swq = NULL; + struct ipath_ibdev *dev; + size_t sz; + size_t sg_list_sz; + struct ib_qp *ret; + + if (init_attr->create_flags) { + ret = ERR_PTR(-EINVAL); + goto bail; + } + + if (init_attr->cap.max_send_sge > ib_ipath_max_sges || + init_attr->cap.max_send_wr > ib_ipath_max_qp_wrs) { + ret = ERR_PTR(-EINVAL); + goto bail; + } + + /* Check receive queue parameters if no SRQ is specified. 
*/ + if (!init_attr->srq) { + if (init_attr->cap.max_recv_sge > ib_ipath_max_sges || + init_attr->cap.max_recv_wr > ib_ipath_max_qp_wrs) { + ret = ERR_PTR(-EINVAL); + goto bail; + } + if (init_attr->cap.max_send_sge + + init_attr->cap.max_send_wr + + init_attr->cap.max_recv_sge + + init_attr->cap.max_recv_wr == 0) { + ret = ERR_PTR(-EINVAL); + goto bail; + } + } + + switch (init_attr->qp_type) { + case IB_QPT_UC: + case IB_QPT_RC: + case IB_QPT_UD: + case IB_QPT_SMI: + case IB_QPT_GSI: + sz = sizeof(struct ipath_sge) * + init_attr->cap.max_send_sge + + sizeof(struct ipath_swqe); + swq = vmalloc((init_attr->cap.max_send_wr + 1) * sz); + if (swq == NULL) { + ret = ERR_PTR(-ENOMEM); + goto bail; + } + sz = sizeof(*qp); + sg_list_sz = 0; + if (init_attr->srq) { + struct ipath_srq *srq = to_isrq(init_attr->srq); + + if (srq->rq.max_sge > 1) + sg_list_sz = sizeof(*qp->r_sg_list) * + (srq->rq.max_sge - 1); + } else if (init_attr->cap.max_recv_sge > 1) + sg_list_sz = sizeof(*qp->r_sg_list) * + (init_attr->cap.max_recv_sge - 1); + qp = kmalloc(sz + sg_list_sz, GFP_KERNEL); + if (!qp) { + ret = ERR_PTR(-ENOMEM); + goto bail_swq; + } + if (sg_list_sz && (init_attr->qp_type == IB_QPT_UD || + init_attr->qp_type == IB_QPT_SMI || + init_attr->qp_type == IB_QPT_GSI)) { + qp->r_ud_sg_list = kmalloc(sg_list_sz, GFP_KERNEL); + if (!qp->r_ud_sg_list) { + ret = ERR_PTR(-ENOMEM); + goto bail_qp; + } + } else + qp->r_ud_sg_list = NULL; + if (init_attr->srq) { + sz = 0; + qp->r_rq.size = 0; + qp->r_rq.max_sge = 0; + qp->r_rq.wq = NULL; + init_attr->cap.max_recv_wr = 0; + init_attr->cap.max_recv_sge = 0; + } else { + qp->r_rq.size = init_attr->cap.max_recv_wr + 1; + qp->r_rq.max_sge = init_attr->cap.max_recv_sge; + sz = (sizeof(struct ib_sge) * qp->r_rq.max_sge) + + sizeof(struct ipath_rwqe); + qp->r_rq.wq = vmalloc_user(sizeof(struct ipath_rwq) + + qp->r_rq.size * sz); + if (!qp->r_rq.wq) { + ret = ERR_PTR(-ENOMEM); + goto bail_sg_list; + } + } + + /* + * ib_create_qp() will initialize qp->ibqp + * except for qp->ibqp.qp_num. + */ + spin_lock_init(&qp->s_lock); + spin_lock_init(&qp->r_rq.lock); + atomic_set(&qp->refcount, 0); + init_waitqueue_head(&qp->wait); + init_waitqueue_head(&qp->wait_dma); + tasklet_init(&qp->s_task, ipath_do_send, (unsigned long)qp); + INIT_LIST_HEAD(&qp->piowait); + INIT_LIST_HEAD(&qp->timerwait); + qp->state = IB_QPS_RESET; + qp->s_wq = swq; + qp->s_size = init_attr->cap.max_send_wr + 1; + qp->s_max_sge = init_attr->cap.max_send_sge; + if (init_attr->sq_sig_type == IB_SIGNAL_REQ_WR) + qp->s_flags = IPATH_S_SIGNAL_REQ_WR; + else + qp->s_flags = 0; + dev = to_idev(ibpd->device); + err = ipath_alloc_qpn(&dev->qp_table, qp, + init_attr->qp_type); + if (err) { + ret = ERR_PTR(err); + vfree(qp->r_rq.wq); + goto bail_sg_list; + } + qp->ip = NULL; + qp->s_tx = NULL; + ipath_reset_qp(qp, init_attr->qp_type); + break; + + default: + /* Don't support raw QPs */ + ret = ERR_PTR(-ENOSYS); + goto bail; + } + + init_attr->cap.max_inline_data = 0; + + /* + * Return the address of the RWQ as the offset to mmap. + * See ipath_mmap() for details. 
+ */ + if (udata && udata->outlen >= sizeof(__u64)) { + if (!qp->r_rq.wq) { + __u64 offset = 0; + + err = ib_copy_to_udata(udata, &offset, + sizeof(offset)); + if (err) { + ret = ERR_PTR(err); + goto bail_ip; + } + } else { + u32 s = sizeof(struct ipath_rwq) + + qp->r_rq.size * sz; + + qp->ip = + ipath_create_mmap_info(dev, s, + ibpd->uobject->context, + qp->r_rq.wq); + if (!qp->ip) { + ret = ERR_PTR(-ENOMEM); + goto bail_ip; + } + + err = ib_copy_to_udata(udata, &(qp->ip->offset), + sizeof(qp->ip->offset)); + if (err) { + ret = ERR_PTR(err); + goto bail_ip; + } + } + } + + spin_lock(&dev->n_qps_lock); + if (dev->n_qps_allocated == ib_ipath_max_qps) { + spin_unlock(&dev->n_qps_lock); + ret = ERR_PTR(-ENOMEM); + goto bail_ip; + } + + dev->n_qps_allocated++; + spin_unlock(&dev->n_qps_lock); + + if (qp->ip) { + spin_lock_irq(&dev->pending_lock); + list_add(&qp->ip->pending_mmaps, &dev->pending_mmaps); + spin_unlock_irq(&dev->pending_lock); + } + + ret = &qp->ibqp; + goto bail; + +bail_ip: + if (qp->ip) + kref_put(&qp->ip->ref, ipath_release_mmap_info); + else + vfree(qp->r_rq.wq); + ipath_free_qp(&dev->qp_table, qp); + free_qpn(&dev->qp_table, qp->ibqp.qp_num); +bail_sg_list: + kfree(qp->r_ud_sg_list); +bail_qp: + kfree(qp); +bail_swq: + vfree(swq); +bail: + return ret; +} + +/** + * ipath_destroy_qp - destroy a queue pair + * @ibqp: the queue pair to destroy + * + * Returns 0 on success. + * + * Note that this can be called while the QP is actively sending or + * receiving! + */ +int ipath_destroy_qp(struct ib_qp *ibqp) +{ + struct ipath_qp *qp = to_iqp(ibqp); + struct ipath_ibdev *dev = to_idev(ibqp->device); + + /* Make sure HW and driver activity is stopped. */ + spin_lock_irq(&qp->s_lock); + if (qp->state != IB_QPS_RESET) { + qp->state = IB_QPS_RESET; + spin_lock(&dev->pending_lock); + if (!list_empty(&qp->timerwait)) + list_del_init(&qp->timerwait); + if (!list_empty(&qp->piowait)) + list_del_init(&qp->piowait); + spin_unlock(&dev->pending_lock); + qp->s_flags &= ~IPATH_S_ANY_WAIT; + spin_unlock_irq(&qp->s_lock); + /* Stop the sending tasklet */ + tasklet_kill(&qp->s_task); + wait_event(qp->wait_dma, !atomic_read(&qp->s_dma_busy)); + } else + spin_unlock_irq(&qp->s_lock); + + ipath_free_qp(&dev->qp_table, qp); + + if (qp->s_tx) { + atomic_dec(&qp->refcount); + if (qp->s_tx->txreq.flags & IPATH_SDMA_TXREQ_F_FREEBUF) + kfree(qp->s_tx->txreq.map_addr); + spin_lock_irq(&dev->pending_lock); + list_add(&qp->s_tx->txreq.list, &dev->txreq_free); + spin_unlock_irq(&dev->pending_lock); + qp->s_tx = NULL; + } + + wait_event(qp->wait, !atomic_read(&qp->refcount)); + + /* all user's cleaned up, mark it available */ + free_qpn(&dev->qp_table, qp->ibqp.qp_num); + spin_lock(&dev->n_qps_lock); + dev->n_qps_allocated--; + spin_unlock(&dev->n_qps_lock); + + if (qp->ip) + kref_put(&qp->ip->ref, ipath_release_mmap_info); + else + vfree(qp->r_rq.wq); + kfree(qp->r_ud_sg_list); + vfree(qp->s_wq); + kfree(qp); + return 0; +} + +/** + * ipath_init_qp_table - initialize the QP table for a device + * @idev: the device who's QP table we're initializing + * @size: the size of the QP table + * + * Returns 0 on success, otherwise returns an errno. + */ +int ipath_init_qp_table(struct ipath_ibdev *idev, int size) +{ + int i; + int ret; + + idev->qp_table.last = 1; /* QPN 0 and 1 are special. 
*/ + idev->qp_table.max = size; + idev->qp_table.nmaps = 1; + idev->qp_table.table = kzalloc(size * sizeof(*idev->qp_table.table), + GFP_KERNEL); + if (idev->qp_table.table == NULL) { + ret = -ENOMEM; + goto bail; + } + + for (i = 0; i < ARRAY_SIZE(idev->qp_table.map); i++) { + atomic_set(&idev->qp_table.map[i].n_free, BITS_PER_PAGE); + idev->qp_table.map[i].page = NULL; + } + + ret = 0; + +bail: + return ret; +} + +/** + * ipath_get_credit - flush the send work queue of a QP + * @qp: the qp who's send work queue to flush + * @aeth: the Acknowledge Extended Transport Header + * + * The QP s_lock should be held. + */ +void ipath_get_credit(struct ipath_qp *qp, u32 aeth) +{ + u32 credit = (aeth >> IPATH_AETH_CREDIT_SHIFT) & IPATH_AETH_CREDIT_MASK; + + /* + * If the credit is invalid, we can send + * as many packets as we like. Otherwise, we have to + * honor the credit field. + */ + if (credit == IPATH_AETH_CREDIT_INVAL) + qp->s_lsn = (u32) -1; + else if (qp->s_lsn != (u32) -1) { + /* Compute new LSN (i.e., MSN + credit) */ + credit = (aeth + credit_table[credit]) & IPATH_MSN_MASK; + if (ipath_cmp24(credit, qp->s_lsn) > 0) + qp->s_lsn = credit; + } + + /* Restart sending if it was blocked due to lack of credits. */ + if ((qp->s_flags & IPATH_S_WAIT_SSN_CREDIT) && + qp->s_cur != qp->s_head && + (qp->s_lsn == (u32) -1 || + ipath_cmp24(get_swqe_ptr(qp, qp->s_cur)->ssn, + qp->s_lsn + 1) <= 0)) + ipath_schedule_send(qp); +} diff --git a/kernel/drivers/infiniband/hw/ipath/ipath_rc.c b/kernel/drivers/infiniband/hw/ipath/ipath_rc.c new file mode 100644 index 000000000..79b3dbc97 --- /dev/null +++ b/kernel/drivers/infiniband/hw/ipath/ipath_rc.c @@ -0,0 +1,1969 @@ +/* + * Copyright (c) 2006, 2007, 2008 QLogic Corporation. All rights reserved. + * Copyright (c) 2005, 2006 PathScale, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include + +#include "ipath_verbs.h" +#include "ipath_kernel.h" + +/* cut down ridiculously long IB macro names */ +#define OP(x) IB_OPCODE_RC_##x + +static u32 restart_sge(struct ipath_sge_state *ss, struct ipath_swqe *wqe, + u32 psn, u32 pmtu) +{ + u32 len; + + len = ((psn - wqe->psn) & IPATH_PSN_MASK) * pmtu; + ss->sge = wqe->sg_list[0]; + ss->sg_list = wqe->sg_list + 1; + ss->num_sge = wqe->wr.num_sge; + ipath_skip_sge(ss, len); + return wqe->length - len; +} + +/** + * ipath_init_restart- initialize the qp->s_sge after a restart + * @qp: the QP who's SGE we're restarting + * @wqe: the work queue to initialize the QP's SGE from + * + * The QP s_lock should be held and interrupts disabled. + */ +static void ipath_init_restart(struct ipath_qp *qp, struct ipath_swqe *wqe) +{ + struct ipath_ibdev *dev; + + qp->s_len = restart_sge(&qp->s_sge, wqe, qp->s_psn, + ib_mtu_enum_to_int(qp->path_mtu)); + dev = to_idev(qp->ibqp.device); + spin_lock(&dev->pending_lock); + if (list_empty(&qp->timerwait)) + list_add_tail(&qp->timerwait, + &dev->pending[dev->pending_index]); + spin_unlock(&dev->pending_lock); +} + +/** + * ipath_make_rc_ack - construct a response packet (ACK, NAK, or RDMA read) + * @qp: a pointer to the QP + * @ohdr: a pointer to the IB header being constructed + * @pmtu: the path MTU + * + * Return 1 if constructed; otherwise, return 0. + * Note that we are in the responder's side of the QP context. + * Note the QP s_lock must be held. + */ +static int ipath_make_rc_ack(struct ipath_ibdev *dev, struct ipath_qp *qp, + struct ipath_other_headers *ohdr, u32 pmtu) +{ + struct ipath_ack_entry *e; + u32 hwords; + u32 len; + u32 bth0; + u32 bth2; + + /* Don't send an ACK if we aren't supposed to. */ + if (!(ib_ipath_state_ops[qp->state] & IPATH_PROCESS_RECV_OK)) + goto bail; + + /* header size in 32-bit words LRH+BTH = (8+12)/4. */ + hwords = 5; + + switch (qp->s_ack_state) { + case OP(RDMA_READ_RESPONSE_LAST): + case OP(RDMA_READ_RESPONSE_ONLY): + case OP(ATOMIC_ACKNOWLEDGE): + /* + * We can increment the tail pointer now that the last + * response has been sent instead of only being + * constructed. + */ + if (++qp->s_tail_ack_queue > IPATH_MAX_RDMA_ATOMIC) + qp->s_tail_ack_queue = 0; + /* FALLTHROUGH */ + case OP(SEND_ONLY): + case OP(ACKNOWLEDGE): + /* Check for no next entry in the queue. 
*/ + if (qp->r_head_ack_queue == qp->s_tail_ack_queue) { + if (qp->s_flags & IPATH_S_ACK_PENDING) + goto normal; + qp->s_ack_state = OP(ACKNOWLEDGE); + goto bail; + } + + e = &qp->s_ack_queue[qp->s_tail_ack_queue]; + if (e->opcode == OP(RDMA_READ_REQUEST)) { + /* Copy SGE state in case we need to resend */ + qp->s_ack_rdma_sge = e->rdma_sge; + qp->s_cur_sge = &qp->s_ack_rdma_sge; + len = e->rdma_sge.sge.sge_length; + if (len > pmtu) { + len = pmtu; + qp->s_ack_state = OP(RDMA_READ_RESPONSE_FIRST); + } else { + qp->s_ack_state = OP(RDMA_READ_RESPONSE_ONLY); + e->sent = 1; + } + ohdr->u.aeth = ipath_compute_aeth(qp); + hwords++; + qp->s_ack_rdma_psn = e->psn; + bth2 = qp->s_ack_rdma_psn++ & IPATH_PSN_MASK; + } else { + /* COMPARE_SWAP or FETCH_ADD */ + qp->s_cur_sge = NULL; + len = 0; + qp->s_ack_state = OP(ATOMIC_ACKNOWLEDGE); + ohdr->u.at.aeth = ipath_compute_aeth(qp); + ohdr->u.at.atomic_ack_eth[0] = + cpu_to_be32(e->atomic_data >> 32); + ohdr->u.at.atomic_ack_eth[1] = + cpu_to_be32(e->atomic_data); + hwords += sizeof(ohdr->u.at) / sizeof(u32); + bth2 = e->psn; + e->sent = 1; + } + bth0 = qp->s_ack_state << 24; + break; + + case OP(RDMA_READ_RESPONSE_FIRST): + qp->s_ack_state = OP(RDMA_READ_RESPONSE_MIDDLE); + /* FALLTHROUGH */ + case OP(RDMA_READ_RESPONSE_MIDDLE): + len = qp->s_ack_rdma_sge.sge.sge_length; + if (len > pmtu) + len = pmtu; + else { + ohdr->u.aeth = ipath_compute_aeth(qp); + hwords++; + qp->s_ack_state = OP(RDMA_READ_RESPONSE_LAST); + qp->s_ack_queue[qp->s_tail_ack_queue].sent = 1; + } + bth0 = qp->s_ack_state << 24; + bth2 = qp->s_ack_rdma_psn++ & IPATH_PSN_MASK; + break; + + default: + normal: + /* + * Send a regular ACK. + * Set the s_ack_state so we wait until after sending + * the ACK before setting s_ack_state to ACKNOWLEDGE + * (see above). + */ + qp->s_ack_state = OP(SEND_ONLY); + qp->s_flags &= ~IPATH_S_ACK_PENDING; + qp->s_cur_sge = NULL; + if (qp->s_nak_state) + ohdr->u.aeth = + cpu_to_be32((qp->r_msn & IPATH_MSN_MASK) | + (qp->s_nak_state << + IPATH_AETH_CREDIT_SHIFT)); + else + ohdr->u.aeth = ipath_compute_aeth(qp); + hwords++; + len = 0; + bth0 = OP(ACKNOWLEDGE) << 24; + bth2 = qp->s_ack_psn & IPATH_PSN_MASK; + } + qp->s_hdrwords = hwords; + qp->s_cur_size = len; + ipath_make_ruc_header(dev, qp, ohdr, bth0, bth2); + return 1; + +bail: + return 0; +} + +/** + * ipath_make_rc_req - construct a request packet (SEND, RDMA r/w, ATOMIC) + * @qp: a pointer to the QP + * + * Return 1 if constructed; otherwise, return 0. + */ +int ipath_make_rc_req(struct ipath_qp *qp) +{ + struct ipath_ibdev *dev = to_idev(qp->ibqp.device); + struct ipath_other_headers *ohdr; + struct ipath_sge_state *ss; + struct ipath_swqe *wqe; + u32 hwords; + u32 len; + u32 bth0; + u32 bth2; + u32 pmtu = ib_mtu_enum_to_int(qp->path_mtu); + char newreq; + unsigned long flags; + int ret = 0; + + ohdr = &qp->s_hdr.u.oth; + if (qp->remote_ah_attr.ah_flags & IB_AH_GRH) + ohdr = &qp->s_hdr.u.l.oth; + + /* + * The lock is needed to synchronize between the sending tasklet, + * the receive interrupt handler, and timeout resends. + */ + spin_lock_irqsave(&qp->s_lock, flags); + + /* Sending responses has higher priority over sending requests. 
*/ + if ((qp->r_head_ack_queue != qp->s_tail_ack_queue || + (qp->s_flags & IPATH_S_ACK_PENDING) || + qp->s_ack_state != OP(ACKNOWLEDGE)) && + ipath_make_rc_ack(dev, qp, ohdr, pmtu)) + goto done; + + if (!(ib_ipath_state_ops[qp->state] & IPATH_PROCESS_SEND_OK)) { + if (!(ib_ipath_state_ops[qp->state] & IPATH_FLUSH_SEND)) + goto bail; + /* We are in the error state, flush the work request. */ + if (qp->s_last == qp->s_head) + goto bail; + /* If DMAs are in progress, we can't flush immediately. */ + if (atomic_read(&qp->s_dma_busy)) { + qp->s_flags |= IPATH_S_WAIT_DMA; + goto bail; + } + wqe = get_swqe_ptr(qp, qp->s_last); + ipath_send_complete(qp, wqe, IB_WC_WR_FLUSH_ERR); + goto done; + } + + /* Leave BUSY set until RNR timeout. */ + if (qp->s_rnr_timeout) { + qp->s_flags |= IPATH_S_WAITING; + goto bail; + } + + /* header size in 32-bit words LRH+BTH = (8+12)/4. */ + hwords = 5; + bth0 = 1 << 22; /* Set M bit */ + + /* Send a request. */ + wqe = get_swqe_ptr(qp, qp->s_cur); + switch (qp->s_state) { + default: + if (!(ib_ipath_state_ops[qp->state] & + IPATH_PROCESS_NEXT_SEND_OK)) + goto bail; + /* + * Resend an old request or start a new one. + * + * We keep track of the current SWQE so that + * we don't reset the "furthest progress" state + * if we need to back up. + */ + newreq = 0; + if (qp->s_cur == qp->s_tail) { + /* Check if send work queue is empty. */ + if (qp->s_tail == qp->s_head) + goto bail; + /* + * If a fence is requested, wait for previous + * RDMA read and atomic operations to finish. + */ + if ((wqe->wr.send_flags & IB_SEND_FENCE) && + qp->s_num_rd_atomic) { + qp->s_flags |= IPATH_S_FENCE_PENDING; + goto bail; + } + wqe->psn = qp->s_next_psn; + newreq = 1; + } + /* + * Note that we have to be careful not to modify the + * original work request since we may need to resend + * it. + */ + len = wqe->length; + ss = &qp->s_sge; + bth2 = 0; + switch (wqe->wr.opcode) { + case IB_WR_SEND: + case IB_WR_SEND_WITH_IMM: + /* If no credit, return. */ + if (qp->s_lsn != (u32) -1 && + ipath_cmp24(wqe->ssn, qp->s_lsn + 1) > 0) { + qp->s_flags |= IPATH_S_WAIT_SSN_CREDIT; + goto bail; + } + wqe->lpsn = wqe->psn; + if (len > pmtu) { + wqe->lpsn += (len - 1) / pmtu; + qp->s_state = OP(SEND_FIRST); + len = pmtu; + break; + } + if (wqe->wr.opcode == IB_WR_SEND) + qp->s_state = OP(SEND_ONLY); + else { + qp->s_state = OP(SEND_ONLY_WITH_IMMEDIATE); + /* Immediate data comes after the BTH */ + ohdr->u.imm_data = wqe->wr.ex.imm_data; + hwords += 1; + } + if (wqe->wr.send_flags & IB_SEND_SOLICITED) + bth0 |= 1 << 23; + bth2 = 1 << 31; /* Request ACK. */ + if (++qp->s_cur == qp->s_size) + qp->s_cur = 0; + break; + + case IB_WR_RDMA_WRITE: + if (newreq && qp->s_lsn != (u32) -1) + qp->s_lsn++; + /* FALLTHROUGH */ + case IB_WR_RDMA_WRITE_WITH_IMM: + /* If no credit, return. 
*/ + if (qp->s_lsn != (u32) -1 && + ipath_cmp24(wqe->ssn, qp->s_lsn + 1) > 0) { + qp->s_flags |= IPATH_S_WAIT_SSN_CREDIT; + goto bail; + } + ohdr->u.rc.reth.vaddr = + cpu_to_be64(wqe->wr.wr.rdma.remote_addr); + ohdr->u.rc.reth.rkey = + cpu_to_be32(wqe->wr.wr.rdma.rkey); + ohdr->u.rc.reth.length = cpu_to_be32(len); + hwords += sizeof(struct ib_reth) / sizeof(u32); + wqe->lpsn = wqe->psn; + if (len > pmtu) { + wqe->lpsn += (len - 1) / pmtu; + qp->s_state = OP(RDMA_WRITE_FIRST); + len = pmtu; + break; + } + if (wqe->wr.opcode == IB_WR_RDMA_WRITE) + qp->s_state = OP(RDMA_WRITE_ONLY); + else { + qp->s_state = + OP(RDMA_WRITE_ONLY_WITH_IMMEDIATE); + /* Immediate data comes after RETH */ + ohdr->u.rc.imm_data = wqe->wr.ex.imm_data; + hwords += 1; + if (wqe->wr.send_flags & IB_SEND_SOLICITED) + bth0 |= 1 << 23; + } + bth2 = 1 << 31; /* Request ACK. */ + if (++qp->s_cur == qp->s_size) + qp->s_cur = 0; + break; + + case IB_WR_RDMA_READ: + /* + * Don't allow more operations to be started + * than the QP limits allow. + */ + if (newreq) { + if (qp->s_num_rd_atomic >= + qp->s_max_rd_atomic) { + qp->s_flags |= IPATH_S_RDMAR_PENDING; + goto bail; + } + qp->s_num_rd_atomic++; + if (qp->s_lsn != (u32) -1) + qp->s_lsn++; + /* + * Adjust s_next_psn to count the + * expected number of responses. + */ + if (len > pmtu) + qp->s_next_psn += (len - 1) / pmtu; + wqe->lpsn = qp->s_next_psn++; + } + ohdr->u.rc.reth.vaddr = + cpu_to_be64(wqe->wr.wr.rdma.remote_addr); + ohdr->u.rc.reth.rkey = + cpu_to_be32(wqe->wr.wr.rdma.rkey); + ohdr->u.rc.reth.length = cpu_to_be32(len); + qp->s_state = OP(RDMA_READ_REQUEST); + hwords += sizeof(ohdr->u.rc.reth) / sizeof(u32); + ss = NULL; + len = 0; + if (++qp->s_cur == qp->s_size) + qp->s_cur = 0; + break; + + case IB_WR_ATOMIC_CMP_AND_SWP: + case IB_WR_ATOMIC_FETCH_AND_ADD: + /* + * Don't allow more operations to be started + * than the QP limits allow. + */ + if (newreq) { + if (qp->s_num_rd_atomic >= + qp->s_max_rd_atomic) { + qp->s_flags |= IPATH_S_RDMAR_PENDING; + goto bail; + } + qp->s_num_rd_atomic++; + if (qp->s_lsn != (u32) -1) + qp->s_lsn++; + wqe->lpsn = wqe->psn; + } + if (wqe->wr.opcode == IB_WR_ATOMIC_CMP_AND_SWP) { + qp->s_state = OP(COMPARE_SWAP); + ohdr->u.atomic_eth.swap_data = cpu_to_be64( + wqe->wr.wr.atomic.swap); + ohdr->u.atomic_eth.compare_data = cpu_to_be64( + wqe->wr.wr.atomic.compare_add); + } else { + qp->s_state = OP(FETCH_ADD); + ohdr->u.atomic_eth.swap_data = cpu_to_be64( + wqe->wr.wr.atomic.compare_add); + ohdr->u.atomic_eth.compare_data = 0; + } + ohdr->u.atomic_eth.vaddr[0] = cpu_to_be32( + wqe->wr.wr.atomic.remote_addr >> 32); + ohdr->u.atomic_eth.vaddr[1] = cpu_to_be32( + wqe->wr.wr.atomic.remote_addr); + ohdr->u.atomic_eth.rkey = cpu_to_be32( + wqe->wr.wr.atomic.rkey); + hwords += sizeof(struct ib_atomic_eth) / sizeof(u32); + ss = NULL; + len = 0; + if (++qp->s_cur == qp->s_size) + qp->s_cur = 0; + break; + + default: + goto bail; + } + qp->s_sge.sge = wqe->sg_list[0]; + qp->s_sge.sg_list = wqe->sg_list + 1; + qp->s_sge.num_sge = wqe->wr.num_sge; + qp->s_len = wqe->length; + if (newreq) { + qp->s_tail++; + if (qp->s_tail >= qp->s_size) + qp->s_tail = 0; + } + bth2 |= qp->s_psn & IPATH_PSN_MASK; + if (wqe->wr.opcode == IB_WR_RDMA_READ) + qp->s_psn = wqe->lpsn + 1; + else { + qp->s_psn++; + if (ipath_cmp24(qp->s_psn, qp->s_next_psn) > 0) + qp->s_next_psn = qp->s_psn; + } + /* + * Put the QP on the pending list so lost ACKs will cause + * a retry. More than one request can be pending so the + * QP may already be on the dev->pending list. 
+ */ + spin_lock(&dev->pending_lock); + if (list_empty(&qp->timerwait)) + list_add_tail(&qp->timerwait, + &dev->pending[dev->pending_index]); + spin_unlock(&dev->pending_lock); + break; + + case OP(RDMA_READ_RESPONSE_FIRST): + /* + * This case can only happen if a send is restarted. + * See ipath_restart_rc(). + */ + ipath_init_restart(qp, wqe); + /* FALLTHROUGH */ + case OP(SEND_FIRST): + qp->s_state = OP(SEND_MIDDLE); + /* FALLTHROUGH */ + case OP(SEND_MIDDLE): + bth2 = qp->s_psn++ & IPATH_PSN_MASK; + if (ipath_cmp24(qp->s_psn, qp->s_next_psn) > 0) + qp->s_next_psn = qp->s_psn; + ss = &qp->s_sge; + len = qp->s_len; + if (len > pmtu) { + len = pmtu; + break; + } + if (wqe->wr.opcode == IB_WR_SEND) + qp->s_state = OP(SEND_LAST); + else { + qp->s_state = OP(SEND_LAST_WITH_IMMEDIATE); + /* Immediate data comes after the BTH */ + ohdr->u.imm_data = wqe->wr.ex.imm_data; + hwords += 1; + } + if (wqe->wr.send_flags & IB_SEND_SOLICITED) + bth0 |= 1 << 23; + bth2 |= 1 << 31; /* Request ACK. */ + qp->s_cur++; + if (qp->s_cur >= qp->s_size) + qp->s_cur = 0; + break; + + case OP(RDMA_READ_RESPONSE_LAST): + /* + * This case can only happen if a RDMA write is restarted. + * See ipath_restart_rc(). + */ + ipath_init_restart(qp, wqe); + /* FALLTHROUGH */ + case OP(RDMA_WRITE_FIRST): + qp->s_state = OP(RDMA_WRITE_MIDDLE); + /* FALLTHROUGH */ + case OP(RDMA_WRITE_MIDDLE): + bth2 = qp->s_psn++ & IPATH_PSN_MASK; + if (ipath_cmp24(qp->s_psn, qp->s_next_psn) > 0) + qp->s_next_psn = qp->s_psn; + ss = &qp->s_sge; + len = qp->s_len; + if (len > pmtu) { + len = pmtu; + break; + } + if (wqe->wr.opcode == IB_WR_RDMA_WRITE) + qp->s_state = OP(RDMA_WRITE_LAST); + else { + qp->s_state = OP(RDMA_WRITE_LAST_WITH_IMMEDIATE); + /* Immediate data comes after the BTH */ + ohdr->u.imm_data = wqe->wr.ex.imm_data; + hwords += 1; + if (wqe->wr.send_flags & IB_SEND_SOLICITED) + bth0 |= 1 << 23; + } + bth2 |= 1 << 31; /* Request ACK. */ + qp->s_cur++; + if (qp->s_cur >= qp->s_size) + qp->s_cur = 0; + break; + + case OP(RDMA_READ_RESPONSE_MIDDLE): + /* + * This case can only happen if a RDMA read is restarted. + * See ipath_restart_rc(). + */ + ipath_init_restart(qp, wqe); + len = ((qp->s_psn - wqe->psn) & IPATH_PSN_MASK) * pmtu; + ohdr->u.rc.reth.vaddr = + cpu_to_be64(wqe->wr.wr.rdma.remote_addr + len); + ohdr->u.rc.reth.rkey = + cpu_to_be32(wqe->wr.wr.rdma.rkey); + ohdr->u.rc.reth.length = cpu_to_be32(qp->s_len); + qp->s_state = OP(RDMA_READ_REQUEST); + hwords += sizeof(ohdr->u.rc.reth) / sizeof(u32); + bth2 = qp->s_psn & IPATH_PSN_MASK; + qp->s_psn = wqe->lpsn + 1; + ss = NULL; + len = 0; + qp->s_cur++; + if (qp->s_cur == qp->s_size) + qp->s_cur = 0; + break; + } + if (ipath_cmp24(qp->s_psn, qp->s_last_psn + IPATH_PSN_CREDIT - 1) >= 0) + bth2 |= 1 << 31; /* Request ACK. */ + qp->s_len -= len; + qp->s_hdrwords = hwords; + qp->s_cur_sge = ss; + qp->s_cur_size = len; + ipath_make_ruc_header(dev, qp, ohdr, bth0 | (qp->s_state << 24), bth2); +done: + ret = 1; + goto unlock; + +bail: + qp->s_flags &= ~IPATH_S_BUSY; +unlock: + spin_unlock_irqrestore(&qp->s_lock, flags); + return ret; +} + +/** + * send_rc_ack - Construct an ACK packet and send it + * @qp: a pointer to the QP + * + * This is called from ipath_rc_rcv() and only uses the receive + * side QP state. + * Note that RDMA reads and atomics are handled in the + * send side QP state and tasklet. 
+ */ +static void send_rc_ack(struct ipath_qp *qp) +{ + struct ipath_ibdev *dev = to_idev(qp->ibqp.device); + struct ipath_devdata *dd; + u16 lrh0; + u32 bth0; + u32 hwords; + u32 __iomem *piobuf; + struct ipath_ib_header hdr; + struct ipath_other_headers *ohdr; + unsigned long flags; + + spin_lock_irqsave(&qp->s_lock, flags); + + /* Don't send ACK or NAK if a RDMA read or atomic is pending. */ + if (qp->r_head_ack_queue != qp->s_tail_ack_queue || + (qp->s_flags & IPATH_S_ACK_PENDING) || + qp->s_ack_state != OP(ACKNOWLEDGE)) + goto queue_ack; + + spin_unlock_irqrestore(&qp->s_lock, flags); + + /* Don't try to send ACKs if the link isn't ACTIVE */ + dd = dev->dd; + if (!(dd->ipath_flags & IPATH_LINKACTIVE)) + goto done; + + piobuf = ipath_getpiobuf(dd, 0, NULL); + if (!piobuf) { + /* + * We are out of PIO buffers at the moment. + * Pass responsibility for sending the ACK to the + * send tasklet so that when a PIO buffer becomes + * available, the ACK is sent ahead of other outgoing + * packets. + */ + spin_lock_irqsave(&qp->s_lock, flags); + goto queue_ack; + } + + /* Construct the header. */ + ohdr = &hdr.u.oth; + lrh0 = IPATH_LRH_BTH; + /* header size in 32-bit words LRH+BTH+AETH = (8+12+4)/4. */ + hwords = 6; + if (unlikely(qp->remote_ah_attr.ah_flags & IB_AH_GRH)) { + hwords += ipath_make_grh(dev, &hdr.u.l.grh, + &qp->remote_ah_attr.grh, + hwords, 0); + ohdr = &hdr.u.l.oth; + lrh0 = IPATH_LRH_GRH; + } + /* read pkey_index w/o lock (its atomic) */ + bth0 = ipath_get_pkey(dd, qp->s_pkey_index) | + (OP(ACKNOWLEDGE) << 24) | (1 << 22); + if (qp->r_nak_state) + ohdr->u.aeth = cpu_to_be32((qp->r_msn & IPATH_MSN_MASK) | + (qp->r_nak_state << + IPATH_AETH_CREDIT_SHIFT)); + else + ohdr->u.aeth = ipath_compute_aeth(qp); + lrh0 |= qp->remote_ah_attr.sl << 4; + hdr.lrh[0] = cpu_to_be16(lrh0); + hdr.lrh[1] = cpu_to_be16(qp->remote_ah_attr.dlid); + hdr.lrh[2] = cpu_to_be16(hwords + SIZE_OF_CRC); + hdr.lrh[3] = cpu_to_be16(dd->ipath_lid | + qp->remote_ah_attr.src_path_bits); + ohdr->bth[0] = cpu_to_be32(bth0); + ohdr->bth[1] = cpu_to_be32(qp->remote_qpn); + ohdr->bth[2] = cpu_to_be32(qp->r_ack_psn & IPATH_PSN_MASK); + + writeq(hwords + 1, piobuf); + + if (dd->ipath_flags & IPATH_PIO_FLUSH_WC) { + u32 *hdrp = (u32 *) &hdr; + + ipath_flush_wc(); + __iowrite32_copy(piobuf + 2, hdrp, hwords - 1); + ipath_flush_wc(); + __raw_writel(hdrp[hwords - 1], piobuf + hwords + 1); + } else + __iowrite32_copy(piobuf + 2, (u32 *) &hdr, hwords); + + ipath_flush_wc(); + + dev->n_unicast_xmit++; + goto done; + +queue_ack: + if (ib_ipath_state_ops[qp->state] & IPATH_PROCESS_RECV_OK) { + dev->n_rc_qacks++; + qp->s_flags |= IPATH_S_ACK_PENDING; + qp->s_nak_state = qp->r_nak_state; + qp->s_ack_psn = qp->r_ack_psn; + + /* Schedule the send tasklet. */ + ipath_schedule_send(qp); + } + spin_unlock_irqrestore(&qp->s_lock, flags); +done: + return; +} + +/** + * reset_psn - reset the QP state to send starting from PSN + * @qp: the QP + * @psn: the packet sequence number to restart at + * + * This is called from ipath_rc_rcv() to process an incoming RC ACK + * for the given QP. + * Called at interrupt level with the QP s_lock held. + */ +static void reset_psn(struct ipath_qp *qp, u32 psn) +{ + u32 n = qp->s_last; + struct ipath_swqe *wqe = get_swqe_ptr(qp, n); + u32 opcode; + + qp->s_cur = n; + + /* + * If we are starting the request from the beginning, + * let the normal send code handle initialization. 
+ */ + if (ipath_cmp24(psn, wqe->psn) <= 0) { + qp->s_state = OP(SEND_LAST); + goto done; + } + + /* Find the work request opcode corresponding to the given PSN. */ + opcode = wqe->wr.opcode; + for (;;) { + int diff; + + if (++n == qp->s_size) + n = 0; + if (n == qp->s_tail) + break; + wqe = get_swqe_ptr(qp, n); + diff = ipath_cmp24(psn, wqe->psn); + if (diff < 0) + break; + qp->s_cur = n; + /* + * If we are starting the request from the beginning, + * let the normal send code handle initialization. + */ + if (diff == 0) { + qp->s_state = OP(SEND_LAST); + goto done; + } + opcode = wqe->wr.opcode; + } + + /* + * Set the state to restart in the middle of a request. + * Don't change the s_sge, s_cur_sge, or s_cur_size. + * See ipath_make_rc_req(). + */ + switch (opcode) { + case IB_WR_SEND: + case IB_WR_SEND_WITH_IMM: + qp->s_state = OP(RDMA_READ_RESPONSE_FIRST); + break; + + case IB_WR_RDMA_WRITE: + case IB_WR_RDMA_WRITE_WITH_IMM: + qp->s_state = OP(RDMA_READ_RESPONSE_LAST); + break; + + case IB_WR_RDMA_READ: + qp->s_state = OP(RDMA_READ_RESPONSE_MIDDLE); + break; + + default: + /* + * This case shouldn't happen since its only + * one PSN per req. + */ + qp->s_state = OP(SEND_LAST); + } +done: + qp->s_psn = psn; +} + +/** + * ipath_restart_rc - back up requester to resend the last un-ACKed request + * @qp: the QP to restart + * @psn: packet sequence number for the request + * @wc: the work completion request + * + * The QP s_lock should be held and interrupts disabled. + */ +void ipath_restart_rc(struct ipath_qp *qp, u32 psn) +{ + struct ipath_swqe *wqe = get_swqe_ptr(qp, qp->s_last); + struct ipath_ibdev *dev; + + if (qp->s_retry == 0) { + ipath_send_complete(qp, wqe, IB_WC_RETRY_EXC_ERR); + ipath_error_qp(qp, IB_WC_WR_FLUSH_ERR); + goto bail; + } + qp->s_retry--; + + /* + * Remove the QP from the timeout queue. + * Note: it may already have been removed by ipath_ib_timer(). + */ + dev = to_idev(qp->ibqp.device); + spin_lock(&dev->pending_lock); + if (!list_empty(&qp->timerwait)) + list_del_init(&qp->timerwait); + if (!list_empty(&qp->piowait)) + list_del_init(&qp->piowait); + spin_unlock(&dev->pending_lock); + + if (wqe->wr.opcode == IB_WR_RDMA_READ) + dev->n_rc_resends++; + else + dev->n_rc_resends += (qp->s_psn - psn) & IPATH_PSN_MASK; + + reset_psn(qp, psn); + ipath_schedule_send(qp); + +bail: + return; +} + +static inline void update_last_psn(struct ipath_qp *qp, u32 psn) +{ + qp->s_last_psn = psn; +} + +/** + * do_rc_ack - process an incoming RC ACK + * @qp: the QP the ACK came in on + * @psn: the packet sequence number of the ACK + * @opcode: the opcode of the request that resulted in the ACK + * + * This is called from ipath_rc_rcv_resp() to process an incoming RC ACK + * for the given QP. + * Called at interrupt level with the QP s_lock held and interrupts disabled. + * Returns 1 if OK, 0 if current operation should be aborted (NAK). + */ +static int do_rc_ack(struct ipath_qp *qp, u32 aeth, u32 psn, int opcode, + u64 val) +{ + struct ipath_ibdev *dev = to_idev(qp->ibqp.device); + struct ib_wc wc; + enum ib_wc_status status; + struct ipath_swqe *wqe; + int ret = 0; + u32 ack_psn; + int diff; + + /* + * Remove the QP from the timeout queue (or RNR timeout queue). + * If ipath_ib_timer() has already removed it, + * it's OK since we hold the QP s_lock and ipath_restart_rc() + * just won't find anything to restart if we ACK everything. 
+ */ + spin_lock(&dev->pending_lock); + if (!list_empty(&qp->timerwait)) + list_del_init(&qp->timerwait); + spin_unlock(&dev->pending_lock); + + /* + * Note that NAKs implicitly ACK outstanding SEND and RDMA write + * requests and implicitly NAK RDMA read and atomic requests issued + * before the NAK'ed request. The MSN won't include the NAK'ed + * request but will include an ACK'ed request(s). + */ + ack_psn = psn; + if (aeth >> 29) + ack_psn--; + wqe = get_swqe_ptr(qp, qp->s_last); + + /* + * The MSN might be for a later WQE than the PSN indicates so + * only complete WQEs that the PSN finishes. + */ + while ((diff = ipath_cmp24(ack_psn, wqe->lpsn)) >= 0) { + /* + * RDMA_READ_RESPONSE_ONLY is a special case since + * we want to generate completion events for everything + * before the RDMA read, copy the data, then generate + * the completion for the read. + */ + if (wqe->wr.opcode == IB_WR_RDMA_READ && + opcode == OP(RDMA_READ_RESPONSE_ONLY) && + diff == 0) { + ret = 1; + goto bail; + } + /* + * If this request is a RDMA read or atomic, and the ACK is + * for a later operation, this ACK NAKs the RDMA read or + * atomic. In other words, only a RDMA_READ_LAST or ONLY + * can ACK a RDMA read and likewise for atomic ops. Note + * that the NAK case can only happen if relaxed ordering is + * used and requests are sent after an RDMA read or atomic + * is sent but before the response is received. + */ + if ((wqe->wr.opcode == IB_WR_RDMA_READ && + (opcode != OP(RDMA_READ_RESPONSE_LAST) || diff != 0)) || + ((wqe->wr.opcode == IB_WR_ATOMIC_CMP_AND_SWP || + wqe->wr.opcode == IB_WR_ATOMIC_FETCH_AND_ADD) && + (opcode != OP(ATOMIC_ACKNOWLEDGE) || diff != 0))) { + /* + * The last valid PSN seen is the previous + * request's. + */ + update_last_psn(qp, wqe->psn - 1); + /* Retry this request. */ + ipath_restart_rc(qp, wqe->psn); + /* + * No need to process the ACK/NAK since we are + * restarting an earlier request. + */ + goto bail; + } + if (wqe->wr.opcode == IB_WR_ATOMIC_CMP_AND_SWP || + wqe->wr.opcode == IB_WR_ATOMIC_FETCH_AND_ADD) + *(u64 *) wqe->sg_list[0].vaddr = val; + if (qp->s_num_rd_atomic && + (wqe->wr.opcode == IB_WR_RDMA_READ || + wqe->wr.opcode == IB_WR_ATOMIC_CMP_AND_SWP || + wqe->wr.opcode == IB_WR_ATOMIC_FETCH_AND_ADD)) { + qp->s_num_rd_atomic--; + /* Restart sending task if fence is complete */ + if (((qp->s_flags & IPATH_S_FENCE_PENDING) && + !qp->s_num_rd_atomic) || + qp->s_flags & IPATH_S_RDMAR_PENDING) + ipath_schedule_send(qp); + } + /* Post a send completion queue entry if requested. */ + if (!(qp->s_flags & IPATH_S_SIGNAL_REQ_WR) || + (wqe->wr.send_flags & IB_SEND_SIGNALED)) { + memset(&wc, 0, sizeof wc); + wc.wr_id = wqe->wr.wr_id; + wc.status = IB_WC_SUCCESS; + wc.opcode = ib_ipath_wc_opcode[wqe->wr.opcode]; + wc.byte_len = wqe->length; + wc.qp = &qp->ibqp; + wc.src_qp = qp->remote_qpn; + wc.slid = qp->remote_ah_attr.dlid; + wc.sl = qp->remote_ah_attr.sl; + ipath_cq_enter(to_icq(qp->ibqp.send_cq), &wc, 0); + } + qp->s_retry = qp->s_retry_cnt; + /* + * If we are completing a request which is in the process of + * being resent, we can stop resending it since we know the + * responder has already seen it. 
+ */ + if (qp->s_last == qp->s_cur) { + if (++qp->s_cur >= qp->s_size) + qp->s_cur = 0; + qp->s_last = qp->s_cur; + if (qp->s_last == qp->s_tail) + break; + wqe = get_swqe_ptr(qp, qp->s_cur); + qp->s_state = OP(SEND_LAST); + qp->s_psn = wqe->psn; + } else { + if (++qp->s_last >= qp->s_size) + qp->s_last = 0; + if (qp->state == IB_QPS_SQD && qp->s_last == qp->s_cur) + qp->s_draining = 0; + if (qp->s_last == qp->s_tail) + break; + wqe = get_swqe_ptr(qp, qp->s_last); + } + } + + switch (aeth >> 29) { + case 0: /* ACK */ + dev->n_rc_acks++; + /* If this is a partial ACK, reset the retransmit timer. */ + if (qp->s_last != qp->s_tail) { + spin_lock(&dev->pending_lock); + if (list_empty(&qp->timerwait)) + list_add_tail(&qp->timerwait, + &dev->pending[dev->pending_index]); + spin_unlock(&dev->pending_lock); + /* + * If we get a partial ACK for a resent operation, + * we can stop resending the earlier packets and + * continue with the next packet the receiver wants. + */ + if (ipath_cmp24(qp->s_psn, psn) <= 0) { + reset_psn(qp, psn + 1); + ipath_schedule_send(qp); + } + } else if (ipath_cmp24(qp->s_psn, psn) <= 0) { + qp->s_state = OP(SEND_LAST); + qp->s_psn = psn + 1; + } + ipath_get_credit(qp, aeth); + qp->s_rnr_retry = qp->s_rnr_retry_cnt; + qp->s_retry = qp->s_retry_cnt; + update_last_psn(qp, psn); + ret = 1; + goto bail; + + case 1: /* RNR NAK */ + dev->n_rnr_naks++; + if (qp->s_last == qp->s_tail) + goto bail; + if (qp->s_rnr_retry == 0) { + status = IB_WC_RNR_RETRY_EXC_ERR; + goto class_b; + } + if (qp->s_rnr_retry_cnt < 7) + qp->s_rnr_retry--; + + /* The last valid PSN is the previous PSN. */ + update_last_psn(qp, psn - 1); + + if (wqe->wr.opcode == IB_WR_RDMA_READ) + dev->n_rc_resends++; + else + dev->n_rc_resends += + (qp->s_psn - psn) & IPATH_PSN_MASK; + + reset_psn(qp, psn); + + qp->s_rnr_timeout = + ib_ipath_rnr_table[(aeth >> IPATH_AETH_CREDIT_SHIFT) & + IPATH_AETH_CREDIT_MASK]; + ipath_insert_rnr_queue(qp); + ipath_schedule_send(qp); + goto bail; + + case 3: /* NAK */ + if (qp->s_last == qp->s_tail) + goto bail; + /* The last valid PSN is the previous PSN. */ + update_last_psn(qp, psn - 1); + switch ((aeth >> IPATH_AETH_CREDIT_SHIFT) & + IPATH_AETH_CREDIT_MASK) { + case 0: /* PSN sequence error */ + dev->n_seq_naks++; + /* + * Back up to the responder's expected PSN. + * Note that we might get a NAK in the middle of an + * RDMA READ response which terminates the RDMA + * READ. + */ + ipath_restart_rc(qp, psn); + break; + + case 1: /* Invalid Request */ + status = IB_WC_REM_INV_REQ_ERR; + dev->n_other_naks++; + goto class_b; + + case 2: /* Remote Access Error */ + status = IB_WC_REM_ACCESS_ERR; + dev->n_other_naks++; + goto class_b; + + case 3: /* Remote Operation Error */ + status = IB_WC_REM_OP_ERR; + dev->n_other_naks++; + class_b: + ipath_send_complete(qp, wqe, status); + ipath_error_qp(qp, IB_WC_WR_FLUSH_ERR); + break; + + default: + /* Ignore other reserved NAK error codes */ + goto reserved; + } + qp->s_rnr_retry = qp->s_rnr_retry_cnt; + goto bail; + + default: /* 2: reserved */ + reserved: + /* Ignore reserved NAK codes. 
*/ + goto bail; + } + +bail: + return ret; +} + +/** + * ipath_rc_rcv_resp - process an incoming RC response packet + * @dev: the device this packet came in on + * @ohdr: the other headers for this packet + * @data: the packet data + * @tlen: the packet length + * @qp: the QP for this packet + * @opcode: the opcode for this packet + * @psn: the packet sequence number for this packet + * @hdrsize: the header length + * @pmtu: the path MTU + * @header_in_data: true if part of the header data is in the data buffer + * + * This is called from ipath_rc_rcv() to process an incoming RC response + * packet for the given QP. + * Called at interrupt level. + */ +static inline void ipath_rc_rcv_resp(struct ipath_ibdev *dev, + struct ipath_other_headers *ohdr, + void *data, u32 tlen, + struct ipath_qp *qp, + u32 opcode, + u32 psn, u32 hdrsize, u32 pmtu, + int header_in_data) +{ + struct ipath_swqe *wqe; + enum ib_wc_status status; + unsigned long flags; + int diff; + u32 pad; + u32 aeth; + u64 val; + + spin_lock_irqsave(&qp->s_lock, flags); + + /* Double check we can process this now that we hold the s_lock. */ + if (!(ib_ipath_state_ops[qp->state] & IPATH_PROCESS_RECV_OK)) + goto ack_done; + + /* Ignore invalid responses. */ + if (ipath_cmp24(psn, qp->s_next_psn) >= 0) + goto ack_done; + + /* Ignore duplicate responses. */ + diff = ipath_cmp24(psn, qp->s_last_psn); + if (unlikely(diff <= 0)) { + /* Update credits for "ghost" ACKs */ + if (diff == 0 && opcode == OP(ACKNOWLEDGE)) { + if (!header_in_data) + aeth = be32_to_cpu(ohdr->u.aeth); + else { + aeth = be32_to_cpu(((__be32 *) data)[0]); + data += sizeof(__be32); + } + if ((aeth >> 29) == 0) + ipath_get_credit(qp, aeth); + } + goto ack_done; + } + + if (unlikely(qp->s_last == qp->s_tail)) + goto ack_done; + wqe = get_swqe_ptr(qp, qp->s_last); + status = IB_WC_SUCCESS; + + switch (opcode) { + case OP(ACKNOWLEDGE): + case OP(ATOMIC_ACKNOWLEDGE): + case OP(RDMA_READ_RESPONSE_FIRST): + if (!header_in_data) + aeth = be32_to_cpu(ohdr->u.aeth); + else { + aeth = be32_to_cpu(((__be32 *) data)[0]); + data += sizeof(__be32); + } + if (opcode == OP(ATOMIC_ACKNOWLEDGE)) { + if (!header_in_data) { + __be32 *p = ohdr->u.at.atomic_ack_eth; + + val = ((u64) be32_to_cpu(p[0]) << 32) | + be32_to_cpu(p[1]); + } else + val = be64_to_cpu(((__be64 *) data)[0]); + } else + val = 0; + if (!do_rc_ack(qp, aeth, psn, opcode, val) || + opcode != OP(RDMA_READ_RESPONSE_FIRST)) + goto ack_done; + hdrsize += 4; + wqe = get_swqe_ptr(qp, qp->s_last); + if (unlikely(wqe->wr.opcode != IB_WR_RDMA_READ)) + goto ack_op_err; + qp->r_flags &= ~IPATH_R_RDMAR_SEQ; + /* + * If this is a response to a resent RDMA read, we + * have to be careful to copy the data to the right + * location. + */ + qp->s_rdma_read_len = restart_sge(&qp->s_rdma_read_sge, + wqe, psn, pmtu); + goto read_middle; + + case OP(RDMA_READ_RESPONSE_MIDDLE): + /* no AETH, no ACK */ + if (unlikely(ipath_cmp24(psn, qp->s_last_psn + 1))) { + dev->n_rdma_seq++; + if (qp->r_flags & IPATH_R_RDMAR_SEQ) + goto ack_done; + qp->r_flags |= IPATH_R_RDMAR_SEQ; + ipath_restart_rc(qp, qp->s_last_psn + 1); + goto ack_done; + } + if (unlikely(wqe->wr.opcode != IB_WR_RDMA_READ)) + goto ack_op_err; + read_middle: + if (unlikely(tlen != (hdrsize + pmtu + 4))) + goto ack_len_err; + if (unlikely(pmtu >= qp->s_rdma_read_len)) + goto ack_len_err; + + /* We got a response so update the timeout. 
*/ + spin_lock(&dev->pending_lock); + if (qp->s_rnr_timeout == 0 && !list_empty(&qp->timerwait)) + list_move_tail(&qp->timerwait, + &dev->pending[dev->pending_index]); + spin_unlock(&dev->pending_lock); + + if (opcode == OP(RDMA_READ_RESPONSE_MIDDLE)) + qp->s_retry = qp->s_retry_cnt; + + /* + * Update the RDMA receive state but do the copy w/o + * holding the locks and blocking interrupts. + */ + qp->s_rdma_read_len -= pmtu; + update_last_psn(qp, psn); + spin_unlock_irqrestore(&qp->s_lock, flags); + ipath_copy_sge(&qp->s_rdma_read_sge, data, pmtu); + goto bail; + + case OP(RDMA_READ_RESPONSE_ONLY): + if (!header_in_data) + aeth = be32_to_cpu(ohdr->u.aeth); + else + aeth = be32_to_cpu(((__be32 *) data)[0]); + if (!do_rc_ack(qp, aeth, psn, opcode, 0)) + goto ack_done; + /* Get the number of bytes the message was padded by. */ + pad = (be32_to_cpu(ohdr->bth[0]) >> 20) & 3; + /* + * Check that the data size is >= 0 && <= pmtu. + * Remember to account for the AETH header (4) and + * ICRC (4). + */ + if (unlikely(tlen < (hdrsize + pad + 8))) + goto ack_len_err; + /* + * If this is a response to a resent RDMA read, we + * have to be careful to copy the data to the right + * location. + */ + wqe = get_swqe_ptr(qp, qp->s_last); + qp->s_rdma_read_len = restart_sge(&qp->s_rdma_read_sge, + wqe, psn, pmtu); + goto read_last; + + case OP(RDMA_READ_RESPONSE_LAST): + /* ACKs READ req. */ + if (unlikely(ipath_cmp24(psn, qp->s_last_psn + 1))) { + dev->n_rdma_seq++; + if (qp->r_flags & IPATH_R_RDMAR_SEQ) + goto ack_done; + qp->r_flags |= IPATH_R_RDMAR_SEQ; + ipath_restart_rc(qp, qp->s_last_psn + 1); + goto ack_done; + } + if (unlikely(wqe->wr.opcode != IB_WR_RDMA_READ)) + goto ack_op_err; + /* Get the number of bytes the message was padded by. */ + pad = (be32_to_cpu(ohdr->bth[0]) >> 20) & 3; + /* + * Check that the data size is >= 1 && <= pmtu. + * Remember to account for the AETH header (4) and + * ICRC (4). + */ + if (unlikely(tlen <= (hdrsize + pad + 8))) + goto ack_len_err; + read_last: + tlen -= hdrsize + pad + 8; + if (unlikely(tlen != qp->s_rdma_read_len)) + goto ack_len_err; + if (!header_in_data) + aeth = be32_to_cpu(ohdr->u.aeth); + else { + aeth = be32_to_cpu(((__be32 *) data)[0]); + data += sizeof(__be32); + } + ipath_copy_sge(&qp->s_rdma_read_sge, data, tlen); + (void) do_rc_ack(qp, aeth, psn, + OP(RDMA_READ_RESPONSE_LAST), 0); + goto ack_done; + } + +ack_op_err: + status = IB_WC_LOC_QP_OP_ERR; + goto ack_err; + +ack_len_err: + status = IB_WC_LOC_LEN_ERR; +ack_err: + ipath_send_complete(qp, wqe, status); + ipath_error_qp(qp, IB_WC_WR_FLUSH_ERR); +ack_done: + spin_unlock_irqrestore(&qp->s_lock, flags); +bail: + return; +} + +/** + * ipath_rc_rcv_error - process an incoming duplicate or error RC packet + * @dev: the device this packet came in on + * @ohdr: the other headers for this packet + * @data: the packet data + * @qp: the QP for this packet + * @opcode: the opcode for this packet + * @psn: the packet sequence number for this packet + * @diff: the difference between the PSN and the expected PSN + * @header_in_data: true if part of the header data is in the data buffer + * + * This is called from ipath_rc_rcv() to process an unexpected + * incoming RC packet for the given QP. + * Called at interrupt level. + * Return 1 if no more processing is needed; otherwise return 0 to + * schedule a response to be sent. 
+ */ +static inline int ipath_rc_rcv_error(struct ipath_ibdev *dev, + struct ipath_other_headers *ohdr, + void *data, + struct ipath_qp *qp, + u32 opcode, + u32 psn, + int diff, + int header_in_data) +{ + struct ipath_ack_entry *e; + u8 i, prev; + int old_req; + unsigned long flags; + + if (diff > 0) { + /* + * Packet sequence error. + * A NAK will ACK earlier sends and RDMA writes. + * Don't queue the NAK if we already sent one. + */ + if (!qp->r_nak_state) { + qp->r_nak_state = IB_NAK_PSN_ERROR; + /* Use the expected PSN. */ + qp->r_ack_psn = qp->r_psn; + goto send_ack; + } + goto done; + } + + /* + * Handle a duplicate request. Don't re-execute SEND, RDMA + * write or atomic op. Don't NAK errors, just silently drop + * the duplicate request. Note that r_sge, r_len, and + * r_rcv_len may be in use so don't modify them. + * + * We are supposed to ACK the earliest duplicate PSN but we + * can coalesce an outstanding duplicate ACK. We have to + * send the earliest so that RDMA reads can be restarted at + * the requester's expected PSN. + * + * First, find where this duplicate PSN falls within the + * ACKs previously sent. + */ + psn &= IPATH_PSN_MASK; + e = NULL; + old_req = 1; + + spin_lock_irqsave(&qp->s_lock, flags); + /* Double check we can process this now that we hold the s_lock. */ + if (!(ib_ipath_state_ops[qp->state] & IPATH_PROCESS_RECV_OK)) + goto unlock_done; + + for (i = qp->r_head_ack_queue; ; i = prev) { + if (i == qp->s_tail_ack_queue) + old_req = 0; + if (i) + prev = i - 1; + else + prev = IPATH_MAX_RDMA_ATOMIC; + if (prev == qp->r_head_ack_queue) { + e = NULL; + break; + } + e = &qp->s_ack_queue[prev]; + if (!e->opcode) { + e = NULL; + break; + } + if (ipath_cmp24(psn, e->psn) >= 0) { + if (prev == qp->s_tail_ack_queue) + old_req = 0; + break; + } + } + switch (opcode) { + case OP(RDMA_READ_REQUEST): { + struct ib_reth *reth; + u32 offset; + u32 len; + + /* + * If we didn't find the RDMA read request in the ack queue, + * or the send tasklet is already backed up to send an + * earlier entry, we can ignore this request. + */ + if (!e || e->opcode != OP(RDMA_READ_REQUEST) || old_req) + goto unlock_done; + /* RETH comes after BTH */ + if (!header_in_data) + reth = &ohdr->u.rc.reth; + else { + reth = (struct ib_reth *)data; + data += sizeof(*reth); + } + /* + * Address range must be a subset of the original + * request and start on pmtu boundaries. + * We reuse the old ack_queue slot since the requester + * should not back up and request an earlier PSN for the + * same request. + */ + offset = ((psn - e->psn) & IPATH_PSN_MASK) * + ib_mtu_enum_to_int(qp->path_mtu); + len = be32_to_cpu(reth->length); + if (unlikely(offset + len > e->rdma_sge.sge.sge_length)) + goto unlock_done; + if (len != 0) { + u32 rkey = be32_to_cpu(reth->rkey); + u64 vaddr = be64_to_cpu(reth->vaddr); + int ok; + + ok = ipath_rkey_ok(qp, &e->rdma_sge, + len, vaddr, rkey, + IB_ACCESS_REMOTE_READ); + if (unlikely(!ok)) + goto unlock_done; + } else { + e->rdma_sge.sg_list = NULL; + e->rdma_sge.num_sge = 0; + e->rdma_sge.sge.mr = NULL; + e->rdma_sge.sge.vaddr = NULL; + e->rdma_sge.sge.length = 0; + e->rdma_sge.sge.sge_length = 0; + } + e->psn = psn; + qp->s_ack_state = OP(ACKNOWLEDGE); + qp->s_tail_ack_queue = prev; + break; + } + + case OP(COMPARE_SWAP): + case OP(FETCH_ADD): { + /* + * If we didn't find the atomic request in the ack queue + * or the send tasklet is already backed up to send an + * earlier entry, we can ignore this request. 
+ */ + if (!e || e->opcode != (u8) opcode || old_req) + goto unlock_done; + qp->s_ack_state = OP(ACKNOWLEDGE); + qp->s_tail_ack_queue = prev; + break; + } + + default: + if (old_req) + goto unlock_done; + /* + * Resend the most recent ACK if this request is + * after all the previous RDMA reads and atomics. + */ + if (i == qp->r_head_ack_queue) { + spin_unlock_irqrestore(&qp->s_lock, flags); + qp->r_nak_state = 0; + qp->r_ack_psn = qp->r_psn - 1; + goto send_ack; + } + /* + * Try to send a simple ACK to work around a Mellanox bug + * which doesn't accept a RDMA read response or atomic + * response as an ACK for earlier SENDs or RDMA writes. + */ + if (qp->r_head_ack_queue == qp->s_tail_ack_queue && + !(qp->s_flags & IPATH_S_ACK_PENDING) && + qp->s_ack_state == OP(ACKNOWLEDGE)) { + spin_unlock_irqrestore(&qp->s_lock, flags); + qp->r_nak_state = 0; + qp->r_ack_psn = qp->s_ack_queue[i].psn - 1; + goto send_ack; + } + /* + * Resend the RDMA read or atomic op which + * ACKs this duplicate request. + */ + qp->s_ack_state = OP(ACKNOWLEDGE); + qp->s_tail_ack_queue = i; + break; + } + qp->r_nak_state = 0; + ipath_schedule_send(qp); + +unlock_done: + spin_unlock_irqrestore(&qp->s_lock, flags); +done: + return 1; + +send_ack: + return 0; +} + +void ipath_rc_error(struct ipath_qp *qp, enum ib_wc_status err) +{ + unsigned long flags; + int lastwqe; + + spin_lock_irqsave(&qp->s_lock, flags); + lastwqe = ipath_error_qp(qp, err); + spin_unlock_irqrestore(&qp->s_lock, flags); + + if (lastwqe) { + struct ib_event ev; + + ev.device = qp->ibqp.device; + ev.element.qp = &qp->ibqp; + ev.event = IB_EVENT_QP_LAST_WQE_REACHED; + qp->ibqp.event_handler(&ev, qp->ibqp.qp_context); + } +} + +static inline void ipath_update_ack_queue(struct ipath_qp *qp, unsigned n) +{ + unsigned next; + + next = n + 1; + if (next > IPATH_MAX_RDMA_ATOMIC) + next = 0; + if (n == qp->s_tail_ack_queue) { + qp->s_tail_ack_queue = next; + qp->s_ack_state = OP(ACKNOWLEDGE); + } +} + +/** + * ipath_rc_rcv - process an incoming RC packet + * @dev: the device this packet came in on + * @hdr: the header of this packet + * @has_grh: true if the header has a GRH + * @data: the packet data + * @tlen: the packet length + * @qp: the QP for this packet + * + * This is called from ipath_qp_rcv() to process an incoming RC packet + * for the given QP. + * Called at interrupt level. + */ +void ipath_rc_rcv(struct ipath_ibdev *dev, struct ipath_ib_header *hdr, + int has_grh, void *data, u32 tlen, struct ipath_qp *qp) +{ + struct ipath_other_headers *ohdr; + u32 opcode; + u32 hdrsize; + u32 psn; + u32 pad; + struct ib_wc wc; + u32 pmtu = ib_mtu_enum_to_int(qp->path_mtu); + int diff; + struct ib_reth *reth; + int header_in_data; + unsigned long flags; + + /* Validate the SLID. See Ch. 9.6.1.5 */ + if (unlikely(be16_to_cpu(hdr->lrh[3]) != qp->remote_ah_attr.dlid)) + goto done; + + /* Check for GRH */ + if (!has_grh) { + ohdr = &hdr->u.oth; + hdrsize = 8 + 12; /* LRH + BTH */ + psn = be32_to_cpu(ohdr->bth[2]); + header_in_data = 0; + } else { + ohdr = &hdr->u.l.oth; + hdrsize = 8 + 40 + 12; /* LRH + GRH + BTH */ + /* + * The header with GRH is 60 bytes and the core driver sets + * the eager header buffer size to 56 bytes so the last 4 + * bytes of the BTH header (PSN) is in the data buffer. + */ + header_in_data = dev->dd->ipath_rcvhdrentsize == 16; + if (header_in_data) { + psn = be32_to_cpu(((__be32 *) data)[0]); + data += sizeof(__be32); + } else + psn = be32_to_cpu(ohdr->bth[2]); + } + + /* + * Process responses (ACKs) before anything else. 
Note that the + * packet sequence number will be for something in the send work + * queue rather than the expected receive packet sequence number. + * In other words, this QP is the requester. + */ + opcode = be32_to_cpu(ohdr->bth[0]) >> 24; + if (opcode >= OP(RDMA_READ_RESPONSE_FIRST) && + opcode <= OP(ATOMIC_ACKNOWLEDGE)) { + ipath_rc_rcv_resp(dev, ohdr, data, tlen, qp, opcode, psn, + hdrsize, pmtu, header_in_data); + goto done; + } + + /* Compute 24 bits worth of difference. */ + diff = ipath_cmp24(psn, qp->r_psn); + if (unlikely(diff)) { + if (ipath_rc_rcv_error(dev, ohdr, data, qp, opcode, + psn, diff, header_in_data)) + goto done; + goto send_ack; + } + + /* Check for opcode sequence errors. */ + switch (qp->r_state) { + case OP(SEND_FIRST): + case OP(SEND_MIDDLE): + if (opcode == OP(SEND_MIDDLE) || + opcode == OP(SEND_LAST) || + opcode == OP(SEND_LAST_WITH_IMMEDIATE)) + break; + goto nack_inv; + + case OP(RDMA_WRITE_FIRST): + case OP(RDMA_WRITE_MIDDLE): + if (opcode == OP(RDMA_WRITE_MIDDLE) || + opcode == OP(RDMA_WRITE_LAST) || + opcode == OP(RDMA_WRITE_LAST_WITH_IMMEDIATE)) + break; + goto nack_inv; + + default: + if (opcode == OP(SEND_MIDDLE) || + opcode == OP(SEND_LAST) || + opcode == OP(SEND_LAST_WITH_IMMEDIATE) || + opcode == OP(RDMA_WRITE_MIDDLE) || + opcode == OP(RDMA_WRITE_LAST) || + opcode == OP(RDMA_WRITE_LAST_WITH_IMMEDIATE)) + goto nack_inv; + /* + * Note that it is up to the requester to not send a new + * RDMA read or atomic operation before receiving an ACK + * for the previous operation. + */ + break; + } + + memset(&wc, 0, sizeof wc); + + /* OK, process the packet. */ + switch (opcode) { + case OP(SEND_FIRST): + if (!ipath_get_rwqe(qp, 0)) + goto rnr_nak; + qp->r_rcv_len = 0; + /* FALLTHROUGH */ + case OP(SEND_MIDDLE): + case OP(RDMA_WRITE_MIDDLE): + send_middle: + /* Check for invalid length PMTU or posted rwqe len. */ + if (unlikely(tlen != (hdrsize + pmtu + 4))) + goto nack_inv; + qp->r_rcv_len += pmtu; + if (unlikely(qp->r_rcv_len > qp->r_len)) + goto nack_inv; + ipath_copy_sge(&qp->r_sge, data, pmtu); + break; + + case OP(RDMA_WRITE_LAST_WITH_IMMEDIATE): + /* consume RWQE */ + if (!ipath_get_rwqe(qp, 1)) + goto rnr_nak; + goto send_last_imm; + + case OP(SEND_ONLY): + case OP(SEND_ONLY_WITH_IMMEDIATE): + if (!ipath_get_rwqe(qp, 0)) + goto rnr_nak; + qp->r_rcv_len = 0; + if (opcode == OP(SEND_ONLY)) + goto send_last; + /* FALLTHROUGH */ + case OP(SEND_LAST_WITH_IMMEDIATE): + send_last_imm: + if (header_in_data) { + wc.ex.imm_data = *(__be32 *) data; + data += sizeof(__be32); + } else { + /* Immediate data comes after BTH */ + wc.ex.imm_data = ohdr->u.imm_data; + } + hdrsize += 4; + wc.wc_flags = IB_WC_WITH_IMM; + /* FALLTHROUGH */ + case OP(SEND_LAST): + case OP(RDMA_WRITE_LAST): + send_last: + /* Get the number of bytes the message was padded by. */ + pad = (be32_to_cpu(ohdr->bth[0]) >> 20) & 3; + /* Check for invalid length. */ + /* XXX LAST len should be >= 1 */ + if (unlikely(tlen < (hdrsize + pad + 4))) + goto nack_inv; + /* Don't count the CRC. 
*/ + tlen -= (hdrsize + pad + 4); + wc.byte_len = tlen + qp->r_rcv_len; + if (unlikely(wc.byte_len > qp->r_len)) + goto nack_inv; + ipath_copy_sge(&qp->r_sge, data, tlen); + qp->r_msn++; + if (!test_and_clear_bit(IPATH_R_WRID_VALID, &qp->r_aflags)) + break; + wc.wr_id = qp->r_wr_id; + wc.status = IB_WC_SUCCESS; + if (opcode == OP(RDMA_WRITE_LAST_WITH_IMMEDIATE) || + opcode == OP(RDMA_WRITE_ONLY_WITH_IMMEDIATE)) + wc.opcode = IB_WC_RECV_RDMA_WITH_IMM; + else + wc.opcode = IB_WC_RECV; + wc.qp = &qp->ibqp; + wc.src_qp = qp->remote_qpn; + wc.slid = qp->remote_ah_attr.dlid; + wc.sl = qp->remote_ah_attr.sl; + /* Signal completion event if the solicited bit is set. */ + ipath_cq_enter(to_icq(qp->ibqp.recv_cq), &wc, + (ohdr->bth[0] & + cpu_to_be32(1 << 23)) != 0); + break; + + case OP(RDMA_WRITE_FIRST): + case OP(RDMA_WRITE_ONLY): + case OP(RDMA_WRITE_ONLY_WITH_IMMEDIATE): + if (unlikely(!(qp->qp_access_flags & + IB_ACCESS_REMOTE_WRITE))) + goto nack_inv; + /* consume RWQE */ + /* RETH comes after BTH */ + if (!header_in_data) + reth = &ohdr->u.rc.reth; + else { + reth = (struct ib_reth *)data; + data += sizeof(*reth); + } + hdrsize += sizeof(*reth); + qp->r_len = be32_to_cpu(reth->length); + qp->r_rcv_len = 0; + if (qp->r_len != 0) { + u32 rkey = be32_to_cpu(reth->rkey); + u64 vaddr = be64_to_cpu(reth->vaddr); + int ok; + + /* Check rkey & NAK */ + ok = ipath_rkey_ok(qp, &qp->r_sge, + qp->r_len, vaddr, rkey, + IB_ACCESS_REMOTE_WRITE); + if (unlikely(!ok)) + goto nack_acc; + } else { + qp->r_sge.sg_list = NULL; + qp->r_sge.sge.mr = NULL; + qp->r_sge.sge.vaddr = NULL; + qp->r_sge.sge.length = 0; + qp->r_sge.sge.sge_length = 0; + } + if (opcode == OP(RDMA_WRITE_FIRST)) + goto send_middle; + else if (opcode == OP(RDMA_WRITE_ONLY)) + goto send_last; + if (!ipath_get_rwqe(qp, 1)) + goto rnr_nak; + goto send_last_imm; + + case OP(RDMA_READ_REQUEST): { + struct ipath_ack_entry *e; + u32 len; + u8 next; + + if (unlikely(!(qp->qp_access_flags & + IB_ACCESS_REMOTE_READ))) + goto nack_inv; + next = qp->r_head_ack_queue + 1; + if (next > IPATH_MAX_RDMA_ATOMIC) + next = 0; + spin_lock_irqsave(&qp->s_lock, flags); + /* Double check we can process this while holding the s_lock. */ + if (!(ib_ipath_state_ops[qp->state] & IPATH_PROCESS_RECV_OK)) + goto unlock; + if (unlikely(next == qp->s_tail_ack_queue)) { + if (!qp->s_ack_queue[next].sent) + goto nack_inv_unlck; + ipath_update_ack_queue(qp, next); + } + e = &qp->s_ack_queue[qp->r_head_ack_queue]; + /* RETH comes after BTH */ + if (!header_in_data) + reth = &ohdr->u.rc.reth; + else { + reth = (struct ib_reth *)data; + data += sizeof(*reth); + } + len = be32_to_cpu(reth->length); + if (len) { + u32 rkey = be32_to_cpu(reth->rkey); + u64 vaddr = be64_to_cpu(reth->vaddr); + int ok; + + /* Check rkey & NAK */ + ok = ipath_rkey_ok(qp, &e->rdma_sge, len, vaddr, + rkey, IB_ACCESS_REMOTE_READ); + if (unlikely(!ok)) + goto nack_acc_unlck; + /* + * Update the next expected PSN. We add 1 later + * below, so only add the remainder here. + */ + if (len > pmtu) + qp->r_psn += (len - 1) / pmtu; + } else { + e->rdma_sge.sg_list = NULL; + e->rdma_sge.num_sge = 0; + e->rdma_sge.sge.mr = NULL; + e->rdma_sge.sge.vaddr = NULL; + e->rdma_sge.sge.length = 0; + e->rdma_sge.sge.sge_length = 0; + } + e->opcode = opcode; + e->sent = 0; + e->psn = psn; + /* + * We need to increment the MSN here instead of when we + * finish sending the result since a duplicate request would + * increment it more than once. 
+ */ + qp->r_msn++; + qp->r_psn++; + qp->r_state = opcode; + qp->r_nak_state = 0; + qp->r_head_ack_queue = next; + + /* Schedule the send tasklet. */ + ipath_schedule_send(qp); + + goto unlock; + } + + case OP(COMPARE_SWAP): + case OP(FETCH_ADD): { + struct ib_atomic_eth *ateth; + struct ipath_ack_entry *e; + u64 vaddr; + atomic64_t *maddr; + u64 sdata; + u32 rkey; + u8 next; + + if (unlikely(!(qp->qp_access_flags & + IB_ACCESS_REMOTE_ATOMIC))) + goto nack_inv; + next = qp->r_head_ack_queue + 1; + if (next > IPATH_MAX_RDMA_ATOMIC) + next = 0; + spin_lock_irqsave(&qp->s_lock, flags); + /* Double check we can process this while holding the s_lock. */ + if (!(ib_ipath_state_ops[qp->state] & IPATH_PROCESS_RECV_OK)) + goto unlock; + if (unlikely(next == qp->s_tail_ack_queue)) { + if (!qp->s_ack_queue[next].sent) + goto nack_inv_unlck; + ipath_update_ack_queue(qp, next); + } + if (!header_in_data) + ateth = &ohdr->u.atomic_eth; + else + ateth = (struct ib_atomic_eth *)data; + vaddr = ((u64) be32_to_cpu(ateth->vaddr[0]) << 32) | + be32_to_cpu(ateth->vaddr[1]); + if (unlikely(vaddr & (sizeof(u64) - 1))) + goto nack_inv_unlck; + rkey = be32_to_cpu(ateth->rkey); + /* Check rkey & NAK */ + if (unlikely(!ipath_rkey_ok(qp, &qp->r_sge, + sizeof(u64), vaddr, rkey, + IB_ACCESS_REMOTE_ATOMIC))) + goto nack_acc_unlck; + /* Perform atomic OP and save result. */ + maddr = (atomic64_t *) qp->r_sge.sge.vaddr; + sdata = be64_to_cpu(ateth->swap_data); + e = &qp->s_ack_queue[qp->r_head_ack_queue]; + e->atomic_data = (opcode == OP(FETCH_ADD)) ? + (u64) atomic64_add_return(sdata, maddr) - sdata : + (u64) cmpxchg((u64 *) qp->r_sge.sge.vaddr, + be64_to_cpu(ateth->compare_data), + sdata); + e->opcode = opcode; + e->sent = 0; + e->psn = psn & IPATH_PSN_MASK; + qp->r_msn++; + qp->r_psn++; + qp->r_state = opcode; + qp->r_nak_state = 0; + qp->r_head_ack_queue = next; + + /* Schedule the send tasklet. */ + ipath_schedule_send(qp); + + goto unlock; + } + + default: + /* NAK unknown opcodes. */ + goto nack_inv; + } + qp->r_psn++; + qp->r_state = opcode; + qp->r_ack_psn = psn; + qp->r_nak_state = 0; + /* Send an ACK if requested or required. */ + if (psn & (1 << 31)) + goto send_ack; + goto done; + +rnr_nak: + qp->r_nak_state = IB_RNR_NAK | qp->r_min_rnr_timer; + qp->r_ack_psn = qp->r_psn; + goto send_ack; + +nack_inv_unlck: + spin_unlock_irqrestore(&qp->s_lock, flags); +nack_inv: + ipath_rc_error(qp, IB_WC_LOC_QP_OP_ERR); + qp->r_nak_state = IB_NAK_INVALID_REQUEST; + qp->r_ack_psn = qp->r_psn; + goto send_ack; + +nack_acc_unlck: + spin_unlock_irqrestore(&qp->s_lock, flags); +nack_acc: + ipath_rc_error(qp, IB_WC_LOC_PROT_ERR); + qp->r_nak_state = IB_NAK_REMOTE_ACCESS_ERROR; + qp->r_ack_psn = qp->r_psn; +send_ack: + send_rc_ack(qp); + goto done; + +unlock: + spin_unlock_irqrestore(&qp->s_lock, flags); +done: + return; +} diff --git a/kernel/drivers/infiniband/hw/ipath/ipath_registers.h b/kernel/drivers/infiniband/hw/ipath/ipath_registers.h new file mode 100644 index 000000000..8f44d0cf3 --- /dev/null +++ b/kernel/drivers/infiniband/hw/ipath/ipath_registers.h @@ -0,0 +1,512 @@ +/* + * Copyright (c) 2006, 2007 QLogic Corporation. All rights reserved. + * Copyright (c) 2003, 2004, 2005, 2006 PathScale, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. 
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef _IPATH_REGISTERS_H +#define _IPATH_REGISTERS_H + +/* + * This file should only be included by kernel source, and by the diags. It + * defines the registers, and their contents, for InfiniPath chips. + */ + +/* + * These are the InfiniPath register and buffer bit definitions, + * that are visible to software, and needed only by the kernel + * and diag code. A few, that are visible to protocol and user + * code are in ipath_common.h. Some bits are specific + * to a given chip implementation, and have been moved to the + * chip-specific source file + */ + +/* kr_revision bits */ +#define INFINIPATH_R_CHIPREVMINOR_MASK 0xFF +#define INFINIPATH_R_CHIPREVMINOR_SHIFT 0 +#define INFINIPATH_R_CHIPREVMAJOR_MASK 0xFF +#define INFINIPATH_R_CHIPREVMAJOR_SHIFT 8 +#define INFINIPATH_R_ARCH_MASK 0xFF +#define INFINIPATH_R_ARCH_SHIFT 16 +#define INFINIPATH_R_SOFTWARE_MASK 0xFF +#define INFINIPATH_R_SOFTWARE_SHIFT 24 +#define INFINIPATH_R_BOARDID_MASK 0xFF +#define INFINIPATH_R_BOARDID_SHIFT 32 + +/* kr_control bits */ +#define INFINIPATH_C_FREEZEMODE 0x00000002 +#define INFINIPATH_C_LINKENABLE 0x00000004 + +/* kr_sendctrl bits */ +#define INFINIPATH_S_DISARMPIOBUF_SHIFT 16 +#define INFINIPATH_S_UPDTHRESH_SHIFT 24 +#define INFINIPATH_S_UPDTHRESH_MASK 0x1f + +#define IPATH_S_ABORT 0 +#define IPATH_S_PIOINTBUFAVAIL 1 +#define IPATH_S_PIOBUFAVAILUPD 2 +#define IPATH_S_PIOENABLE 3 +#define IPATH_S_SDMAINTENABLE 9 +#define IPATH_S_SDMASINGLEDESCRIPTOR 10 +#define IPATH_S_SDMAENABLE 11 +#define IPATH_S_SDMAHALT 12 +#define IPATH_S_DISARM 31 + +#define INFINIPATH_S_ABORT (1U << IPATH_S_ABORT) +#define INFINIPATH_S_PIOINTBUFAVAIL (1U << IPATH_S_PIOINTBUFAVAIL) +#define INFINIPATH_S_PIOBUFAVAILUPD (1U << IPATH_S_PIOBUFAVAILUPD) +#define INFINIPATH_S_PIOENABLE (1U << IPATH_S_PIOENABLE) +#define INFINIPATH_S_SDMAINTENABLE (1U << IPATH_S_SDMAINTENABLE) +#define INFINIPATH_S_SDMASINGLEDESCRIPTOR \ + (1U << IPATH_S_SDMASINGLEDESCRIPTOR) +#define INFINIPATH_S_SDMAENABLE (1U << IPATH_S_SDMAENABLE) +#define INFINIPATH_S_SDMAHALT (1U << IPATH_S_SDMAHALT) +#define INFINIPATH_S_DISARM (1U << IPATH_S_DISARM) + +/* kr_rcvctrl bits that are the same on multiple chips */ +#define INFINIPATH_R_PORTENABLE_SHIFT 0 +#define INFINIPATH_R_QPMAP_ENABLE (1ULL << 38) + +/* kr_intstatus, kr_intclear, 
kr_intmask bits */ +#define INFINIPATH_I_SDMAINT 0x8000000000000000ULL +#define INFINIPATH_I_SDMADISABLED 0x4000000000000000ULL +#define INFINIPATH_I_ERROR 0x0000000080000000ULL +#define INFINIPATH_I_SPIOSENT 0x0000000040000000ULL +#define INFINIPATH_I_SPIOBUFAVAIL 0x0000000020000000ULL +#define INFINIPATH_I_GPIO 0x0000000010000000ULL +#define INFINIPATH_I_JINT 0x0000000004000000ULL + +/* kr_errorstatus, kr_errorclear, kr_errormask bits */ +#define INFINIPATH_E_RFORMATERR 0x0000000000000001ULL +#define INFINIPATH_E_RVCRC 0x0000000000000002ULL +#define INFINIPATH_E_RICRC 0x0000000000000004ULL +#define INFINIPATH_E_RMINPKTLEN 0x0000000000000008ULL +#define INFINIPATH_E_RMAXPKTLEN 0x0000000000000010ULL +#define INFINIPATH_E_RLONGPKTLEN 0x0000000000000020ULL +#define INFINIPATH_E_RSHORTPKTLEN 0x0000000000000040ULL +#define INFINIPATH_E_RUNEXPCHAR 0x0000000000000080ULL +#define INFINIPATH_E_RUNSUPVL 0x0000000000000100ULL +#define INFINIPATH_E_REBP 0x0000000000000200ULL +#define INFINIPATH_E_RIBFLOW 0x0000000000000400ULL +#define INFINIPATH_E_RBADVERSION 0x0000000000000800ULL +#define INFINIPATH_E_RRCVEGRFULL 0x0000000000001000ULL +#define INFINIPATH_E_RRCVHDRFULL 0x0000000000002000ULL +#define INFINIPATH_E_RBADTID 0x0000000000004000ULL +#define INFINIPATH_E_RHDRLEN 0x0000000000008000ULL +#define INFINIPATH_E_RHDR 0x0000000000010000ULL +#define INFINIPATH_E_RIBLOSTLINK 0x0000000000020000ULL +#define INFINIPATH_E_SENDSPECIALTRIGGER 0x0000000008000000ULL +#define INFINIPATH_E_SDMADISABLED 0x0000000010000000ULL +#define INFINIPATH_E_SMINPKTLEN 0x0000000020000000ULL +#define INFINIPATH_E_SMAXPKTLEN 0x0000000040000000ULL +#define INFINIPATH_E_SUNDERRUN 0x0000000080000000ULL +#define INFINIPATH_E_SPKTLEN 0x0000000100000000ULL +#define INFINIPATH_E_SDROPPEDSMPPKT 0x0000000200000000ULL +#define INFINIPATH_E_SDROPPEDDATAPKT 0x0000000400000000ULL +#define INFINIPATH_E_SPIOARMLAUNCH 0x0000000800000000ULL +#define INFINIPATH_E_SUNEXPERRPKTNUM 0x0000001000000000ULL +#define INFINIPATH_E_SUNSUPVL 0x0000002000000000ULL +#define INFINIPATH_E_SENDBUFMISUSE 0x0000004000000000ULL +#define INFINIPATH_E_SDMAGENMISMATCH 0x0000008000000000ULL +#define INFINIPATH_E_SDMAOUTOFBOUND 0x0000010000000000ULL +#define INFINIPATH_E_SDMATAILOUTOFBOUND 0x0000020000000000ULL +#define INFINIPATH_E_SDMABASE 0x0000040000000000ULL +#define INFINIPATH_E_SDMA1STDESC 0x0000080000000000ULL +#define INFINIPATH_E_SDMARPYTAG 0x0000100000000000ULL +#define INFINIPATH_E_SDMADWEN 0x0000200000000000ULL +#define INFINIPATH_E_SDMAMISSINGDW 0x0000400000000000ULL +#define INFINIPATH_E_SDMAUNEXPDATA 0x0000800000000000ULL +#define INFINIPATH_E_IBSTATUSCHANGED 0x0001000000000000ULL +#define INFINIPATH_E_INVALIDADDR 0x0002000000000000ULL +#define INFINIPATH_E_RESET 0x0004000000000000ULL +#define INFINIPATH_E_HARDWARE 0x0008000000000000ULL +#define INFINIPATH_E_SDMADESCADDRMISALIGN 0x0010000000000000ULL +#define INFINIPATH_E_INVALIDEEPCMD 0x0020000000000000ULL + +/* + * this is used to print "common" packet errors only when the + * __IPATH_ERRPKTDBG bit is set in ipath_debug. 
+ */ +#define INFINIPATH_E_PKTERRS ( INFINIPATH_E_SPKTLEN \ + | INFINIPATH_E_SDROPPEDDATAPKT | INFINIPATH_E_RVCRC \ + | INFINIPATH_E_RICRC | INFINIPATH_E_RSHORTPKTLEN \ + | INFINIPATH_E_REBP ) + +/* Convenience for decoding Send DMA errors */ +#define INFINIPATH_E_SDMAERRS ( \ + INFINIPATH_E_SDMAGENMISMATCH | INFINIPATH_E_SDMAOUTOFBOUND | \ + INFINIPATH_E_SDMATAILOUTOFBOUND | INFINIPATH_E_SDMABASE | \ + INFINIPATH_E_SDMA1STDESC | INFINIPATH_E_SDMARPYTAG | \ + INFINIPATH_E_SDMADWEN | INFINIPATH_E_SDMAMISSINGDW | \ + INFINIPATH_E_SDMAUNEXPDATA | \ + INFINIPATH_E_SDMADESCADDRMISALIGN | \ + INFINIPATH_E_SDMADISABLED | \ + INFINIPATH_E_SENDBUFMISUSE) + +/* kr_hwerrclear, kr_hwerrmask, kr_hwerrstatus, bits */ +/* TXEMEMPARITYERR bit 0: PIObuf, 1: PIOpbc, 2: launchfifo + * RXEMEMPARITYERR bit 0: rcvbuf, 1: lookupq, 2: expTID, 3: eagerTID + * bit 4: flag buffer, 5: datainfo, 6: header info */ +#define INFINIPATH_HWE_TXEMEMPARITYERR_MASK 0xFULL +#define INFINIPATH_HWE_TXEMEMPARITYERR_SHIFT 40 +#define INFINIPATH_HWE_RXEMEMPARITYERR_MASK 0x7FULL +#define INFINIPATH_HWE_RXEMEMPARITYERR_SHIFT 44 +#define INFINIPATH_HWE_IBCBUSTOSPCPARITYERR 0x4000000000000000ULL +#define INFINIPATH_HWE_IBCBUSFRSPCPARITYERR 0x8000000000000000ULL +/* txe mem parity errors (shift by INFINIPATH_HWE_TXEMEMPARITYERR_SHIFT) */ +#define INFINIPATH_HWE_TXEMEMPARITYERR_PIOBUF 0x1ULL +#define INFINIPATH_HWE_TXEMEMPARITYERR_PIOPBC 0x2ULL +#define INFINIPATH_HWE_TXEMEMPARITYERR_PIOLAUNCHFIFO 0x4ULL +/* rxe mem parity errors (shift by INFINIPATH_HWE_RXEMEMPARITYERR_SHIFT) */ +#define INFINIPATH_HWE_RXEMEMPARITYERR_RCVBUF 0x01ULL +#define INFINIPATH_HWE_RXEMEMPARITYERR_LOOKUPQ 0x02ULL +#define INFINIPATH_HWE_RXEMEMPARITYERR_EXPTID 0x04ULL +#define INFINIPATH_HWE_RXEMEMPARITYERR_EAGERTID 0x08ULL +#define INFINIPATH_HWE_RXEMEMPARITYERR_FLAGBUF 0x10ULL +#define INFINIPATH_HWE_RXEMEMPARITYERR_DATAINFO 0x20ULL +#define INFINIPATH_HWE_RXEMEMPARITYERR_HDRINFO 0x40ULL +/* waldo specific -- find the rest in ipath_6110.c */ +#define INFINIPATH_HWE_RXDSYNCMEMPARITYERR 0x0000000400000000ULL +/* 6120/7220 specific -- find the rest in ipath_6120.c and ipath_7220.c */ +#define INFINIPATH_HWE_MEMBISTFAILED 0x0040000000000000ULL + +/* kr_hwdiagctrl bits */ +#define INFINIPATH_DC_FORCETXEMEMPARITYERR_MASK 0xFULL +#define INFINIPATH_DC_FORCETXEMEMPARITYERR_SHIFT 40 +#define INFINIPATH_DC_FORCERXEMEMPARITYERR_MASK 0x7FULL +#define INFINIPATH_DC_FORCERXEMEMPARITYERR_SHIFT 44 +#define INFINIPATH_DC_FORCERXDSYNCMEMPARITYERR 0x0000000400000000ULL +#define INFINIPATH_DC_COUNTERDISABLE 0x1000000000000000ULL +#define INFINIPATH_DC_COUNTERWREN 0x2000000000000000ULL +#define INFINIPATH_DC_FORCEIBCBUSTOSPCPARITYERR 0x4000000000000000ULL +#define INFINIPATH_DC_FORCEIBCBUSFRSPCPARITYERR 0x8000000000000000ULL + +/* kr_ibcctrl bits */ +#define INFINIPATH_IBCC_FLOWCTRLPERIOD_MASK 0xFFULL +#define INFINIPATH_IBCC_FLOWCTRLPERIOD_SHIFT 0 +#define INFINIPATH_IBCC_FLOWCTRLWATERMARK_MASK 0xFFULL +#define INFINIPATH_IBCC_FLOWCTRLWATERMARK_SHIFT 8 +#define INFINIPATH_IBCC_LINKINITCMD_MASK 0x3ULL +#define INFINIPATH_IBCC_LINKINITCMD_DISABLE 1 +/* cycle through TS1/TS2 till OK */ +#define INFINIPATH_IBCC_LINKINITCMD_POLL 2 +/* wait for TS1, then go on */ +#define INFINIPATH_IBCC_LINKINITCMD_SLEEP 3 +#define INFINIPATH_IBCC_LINKINITCMD_SHIFT 16 +#define INFINIPATH_IBCC_LINKCMD_MASK 0x3ULL +#define INFINIPATH_IBCC_LINKCMD_DOWN 1 /* move to 0x11 */ +#define INFINIPATH_IBCC_LINKCMD_ARMED 2 /* move to 0x21 */ +#define INFINIPATH_IBCC_LINKCMD_ACTIVE 3 /* move to 0x31 */ +#define 
INFINIPATH_IBCC_LINKCMD_SHIFT 18 +#define INFINIPATH_IBCC_MAXPKTLEN_MASK 0x7FFULL +#define INFINIPATH_IBCC_MAXPKTLEN_SHIFT 20 +#define INFINIPATH_IBCC_PHYERRTHRESHOLD_MASK 0xFULL +#define INFINIPATH_IBCC_PHYERRTHRESHOLD_SHIFT 32 +#define INFINIPATH_IBCC_OVERRUNTHRESHOLD_MASK 0xFULL +#define INFINIPATH_IBCC_OVERRUNTHRESHOLD_SHIFT 36 +#define INFINIPATH_IBCC_CREDITSCALE_MASK 0x7ULL +#define INFINIPATH_IBCC_CREDITSCALE_SHIFT 40 +#define INFINIPATH_IBCC_LOOPBACK 0x8000000000000000ULL +#define INFINIPATH_IBCC_LINKDOWNDEFAULTSTATE 0x4000000000000000ULL + +/* kr_ibcstatus bits */ +#define INFINIPATH_IBCS_LINKTRAININGSTATE_SHIFT 0 +#define INFINIPATH_IBCS_LINKSTATE_MASK 0x7 + +#define INFINIPATH_IBCS_TXREADY 0x40000000 +#define INFINIPATH_IBCS_TXCREDITOK 0x80000000 +/* link training states (shift by + INFINIPATH_IBCS_LINKTRAININGSTATE_SHIFT) */ +#define INFINIPATH_IBCS_LT_STATE_DISABLED 0x00 +#define INFINIPATH_IBCS_LT_STATE_LINKUP 0x01 +#define INFINIPATH_IBCS_LT_STATE_POLLACTIVE 0x02 +#define INFINIPATH_IBCS_LT_STATE_POLLQUIET 0x03 +#define INFINIPATH_IBCS_LT_STATE_SLEEPDELAY 0x04 +#define INFINIPATH_IBCS_LT_STATE_SLEEPQUIET 0x05 +#define INFINIPATH_IBCS_LT_STATE_CFGDEBOUNCE 0x08 +#define INFINIPATH_IBCS_LT_STATE_CFGRCVFCFG 0x09 +#define INFINIPATH_IBCS_LT_STATE_CFGWAITRMT 0x0a +#define INFINIPATH_IBCS_LT_STATE_CFGIDLE 0x0b +#define INFINIPATH_IBCS_LT_STATE_RECOVERRETRAIN 0x0c +#define INFINIPATH_IBCS_LT_STATE_RECOVERWAITRMT 0x0e +#define INFINIPATH_IBCS_LT_STATE_RECOVERIDLE 0x0f +/* link state machine states (shift by ibcs_ls_shift) */ +#define INFINIPATH_IBCS_L_STATE_DOWN 0x0 +#define INFINIPATH_IBCS_L_STATE_INIT 0x1 +#define INFINIPATH_IBCS_L_STATE_ARM 0x2 +#define INFINIPATH_IBCS_L_STATE_ACTIVE 0x3 +#define INFINIPATH_IBCS_L_STATE_ACT_DEFER 0x4 + + +/* kr_extstatus bits */ +#define INFINIPATH_EXTS_SERDESPLLLOCK 0x1 +#define INFINIPATH_EXTS_GPIOIN_MASK 0xFFFFULL +#define INFINIPATH_EXTS_GPIOIN_SHIFT 48 + +/* kr_extctrl bits */ +#define INFINIPATH_EXTC_GPIOINVERT_MASK 0xFFFFULL +#define INFINIPATH_EXTC_GPIOINVERT_SHIFT 32 +#define INFINIPATH_EXTC_GPIOOE_MASK 0xFFFFULL +#define INFINIPATH_EXTC_GPIOOE_SHIFT 48 +#define INFINIPATH_EXTC_SERDESENABLE 0x80000000ULL +#define INFINIPATH_EXTC_SERDESCONNECT 0x40000000ULL +#define INFINIPATH_EXTC_SERDESENTRUNKING 0x20000000ULL +#define INFINIPATH_EXTC_SERDESDISRXFIFO 0x10000000ULL +#define INFINIPATH_EXTC_SERDESENPLPBK1 0x08000000ULL +#define INFINIPATH_EXTC_SERDESENPLPBK2 0x04000000ULL +#define INFINIPATH_EXTC_SERDESENENCDEC 0x02000000ULL +#define INFINIPATH_EXTC_LED1SECPORT_ON 0x00000020ULL +#define INFINIPATH_EXTC_LED2SECPORT_ON 0x00000010ULL +#define INFINIPATH_EXTC_LED1PRIPORT_ON 0x00000008ULL +#define INFINIPATH_EXTC_LED2PRIPORT_ON 0x00000004ULL +#define INFINIPATH_EXTC_LEDGBLOK_ON 0x00000002ULL +#define INFINIPATH_EXTC_LEDGBLERR_OFF 0x00000001ULL + +/* kr_partitionkey bits */ +#define INFINIPATH_PKEY_SIZE 16 +#define INFINIPATH_PKEY_MASK 0xFFFF +#define INFINIPATH_PKEY_DEFAULT_PKEY 0xFFFF + +/* kr_serdesconfig0 bits */ +#define INFINIPATH_SERDC0_RESET_MASK 0xfULL /* overal reset bits */ +#define INFINIPATH_SERDC0_RESET_PLL 0x10000000ULL /* pll reset */ +/* tx idle enables (per lane) */ +#define INFINIPATH_SERDC0_TXIDLE 0xF000ULL +/* rx detect enables (per lane) */ +#define INFINIPATH_SERDC0_RXDETECT_EN 0xF0000ULL +/* L1 Power down; use with RXDETECT, Otherwise not used on IB side */ +#define INFINIPATH_SERDC0_L1PWR_DN 0xF0ULL + +/* common kr_xgxsconfig bits (or safe in all, even if not implemented) */ +#define INFINIPATH_XGXS_RX_POL_SHIFT 19 
+#define INFINIPATH_XGXS_RX_POL_MASK 0xfULL + + +/* + * IPATH_PIO_MAXIBHDR is the max IB header size allowed for in our + * PIO send buffers. This is well beyond anything currently + * defined in the InfiniBand spec. + */ +#define IPATH_PIO_MAXIBHDR 128 + +typedef u64 ipath_err_t; + +/* The following change with the type of device, so + * need to be part of the ipath_devdata struct, or + * we could have problems plugging in devices of + * different types (e.g. one HT, one PCIE) + * in one system, to be managed by one driver. + * On the other hand, this file is may also be included + * by other code, so leave the declarations here + * temporarily. Minor footprint issue if common-model + * linker used, none if C89+ linker used. + */ + +/* mask of defined bits for various registers */ +extern u64 infinipath_i_bitsextant; +extern ipath_err_t infinipath_e_bitsextant, infinipath_hwe_bitsextant; + +/* masks that are different in various chips, or only exist in some chips */ +extern u32 infinipath_i_rcvavail_mask, infinipath_i_rcvurg_mask; + +/* + * These are the infinipath general register numbers (not offsets). + * The kernel registers are used directly, those beyond the kernel + * registers are calculated from one of the base registers. The use of + * an integer type doesn't allow type-checking as thorough as, say, + * an enum but allows for better hiding of chip differences. + */ +typedef const u16 ipath_kreg, /* infinipath general registers */ + ipath_creg, /* infinipath counter registers */ + ipath_sreg; /* kernel-only, infinipath send registers */ + +/* + * These are the chip registers common to all infinipath chips, and + * used both by the kernel and the diagnostics or other user code. + * They are all implemented such that 64 bit accesses work. + * Some implement no more than 32 bits. Because 64 bit reads + * require 2 HT cmds on opteron, we access those with 32 bit + * reads for efficiency (they are written as 64 bits, since + * the extra 32 bits are nearly free on writes, and it slightly reduces + * complexity). The rest are all accessed as 64 bits. 
+ */ +struct ipath_kregs { + /* These are the 32 bit group */ + ipath_kreg kr_control; + ipath_kreg kr_counterregbase; + ipath_kreg kr_intmask; + ipath_kreg kr_intstatus; + ipath_kreg kr_pagealign; + ipath_kreg kr_portcnt; + ipath_kreg kr_rcvtidbase; + ipath_kreg kr_rcvtidcnt; + ipath_kreg kr_rcvegrbase; + ipath_kreg kr_rcvegrcnt; + ipath_kreg kr_scratch; + ipath_kreg kr_sendctrl; + ipath_kreg kr_sendpiobufbase; + ipath_kreg kr_sendpiobufcnt; + ipath_kreg kr_sendpiosize; + ipath_kreg kr_sendregbase; + ipath_kreg kr_userregbase; + /* These are the 64 bit group */ + ipath_kreg kr_debugport; + ipath_kreg kr_debugportselect; + ipath_kreg kr_errorclear; + ipath_kreg kr_errormask; + ipath_kreg kr_errorstatus; + ipath_kreg kr_extctrl; + ipath_kreg kr_extstatus; + ipath_kreg kr_gpio_clear; + ipath_kreg kr_gpio_mask; + ipath_kreg kr_gpio_out; + ipath_kreg kr_gpio_status; + ipath_kreg kr_hwdiagctrl; + ipath_kreg kr_hwerrclear; + ipath_kreg kr_hwerrmask; + ipath_kreg kr_hwerrstatus; + ipath_kreg kr_ibcctrl; + ipath_kreg kr_ibcstatus; + ipath_kreg kr_intblocked; + ipath_kreg kr_intclear; + ipath_kreg kr_interruptconfig; + ipath_kreg kr_mdio; + ipath_kreg kr_partitionkey; + ipath_kreg kr_rcvbthqp; + ipath_kreg kr_rcvbufbase; + ipath_kreg kr_rcvbufsize; + ipath_kreg kr_rcvctrl; + ipath_kreg kr_rcvhdrcnt; + ipath_kreg kr_rcvhdrentsize; + ipath_kreg kr_rcvhdrsize; + ipath_kreg kr_rcvintmembase; + ipath_kreg kr_rcvintmemsize; + ipath_kreg kr_revision; + ipath_kreg kr_sendbuffererror; + ipath_kreg kr_sendpioavailaddr; + ipath_kreg kr_serdesconfig0; + ipath_kreg kr_serdesconfig1; + ipath_kreg kr_serdesstatus; + ipath_kreg kr_txintmembase; + ipath_kreg kr_txintmemsize; + ipath_kreg kr_xgxsconfig; + ipath_kreg kr_ibpllcfg; + /* use these two (and the following N ports) only with + * ipath_k*_kreg64_port(); not *kreg64() */ + ipath_kreg kr_rcvhdraddr; + ipath_kreg kr_rcvhdrtailaddr; + + /* remaining registers are not present on all types of infinipath + chips */ + ipath_kreg kr_rcvpktledcnt; + ipath_kreg kr_pcierbuftestreg0; + ipath_kreg kr_pcierbuftestreg1; + ipath_kreg kr_pcieq0serdesconfig0; + ipath_kreg kr_pcieq0serdesconfig1; + ipath_kreg kr_pcieq0serdesstatus; + ipath_kreg kr_pcieq1serdesconfig0; + ipath_kreg kr_pcieq1serdesconfig1; + ipath_kreg kr_pcieq1serdesstatus; + ipath_kreg kr_hrtbt_guid; + ipath_kreg kr_ibcddrctrl; + ipath_kreg kr_ibcddrstatus; + ipath_kreg kr_jintreload; + + /* send dma related regs */ + ipath_kreg kr_senddmabase; + ipath_kreg kr_senddmalengen; + ipath_kreg kr_senddmatail; + ipath_kreg kr_senddmahead; + ipath_kreg kr_senddmaheadaddr; + ipath_kreg kr_senddmabufmask0; + ipath_kreg kr_senddmabufmask1; + ipath_kreg kr_senddmabufmask2; + ipath_kreg kr_senddmastatus; + + /* SerDes related regs (IBA7220-only) */ + ipath_kreg kr_ibserdesctrl; + ipath_kreg kr_ib_epbacc; + ipath_kreg kr_ib_epbtrans; + ipath_kreg kr_pcie_epbacc; + ipath_kreg kr_pcie_epbtrans; + ipath_kreg kr_ib_ddsrxeq; +}; + +struct ipath_cregs { + ipath_creg cr_badformatcnt; + ipath_creg cr_erricrccnt; + ipath_creg cr_errlinkcnt; + ipath_creg cr_errlpcrccnt; + ipath_creg cr_errpkey; + ipath_creg cr_errrcvflowctrlcnt; + ipath_creg cr_err_rlencnt; + ipath_creg cr_errslencnt; + ipath_creg cr_errtidfull; + ipath_creg cr_errtidvalid; + ipath_creg cr_errvcrccnt; + ipath_creg cr_ibstatuschange; + ipath_creg cr_intcnt; + ipath_creg cr_invalidrlencnt; + ipath_creg cr_invalidslencnt; + ipath_creg cr_lbflowstallcnt; + ipath_creg cr_iblinkdowncnt; + ipath_creg cr_iblinkerrrecovcnt; + ipath_creg cr_ibsymbolerrcnt; + ipath_creg 
cr_pktrcvcnt; + ipath_creg cr_pktrcvflowctrlcnt; + ipath_creg cr_pktsendcnt; + ipath_creg cr_pktsendflowcnt; + ipath_creg cr_portovflcnt; + ipath_creg cr_rcvebpcnt; + ipath_creg cr_rcvovflcnt; + ipath_creg cr_rxdroppktcnt; + ipath_creg cr_senddropped; + ipath_creg cr_sendstallcnt; + ipath_creg cr_sendunderruncnt; + ipath_creg cr_unsupvlcnt; + ipath_creg cr_wordrcvcnt; + ipath_creg cr_wordsendcnt; + ipath_creg cr_vl15droppedpktcnt; + ipath_creg cr_rxotherlocalphyerrcnt; + ipath_creg cr_excessbufferovflcnt; + ipath_creg cr_locallinkintegrityerrcnt; + ipath_creg cr_rxvlerrcnt; + ipath_creg cr_rxdlidfltrcnt; + ipath_creg cr_psstat; + ipath_creg cr_psstart; + ipath_creg cr_psinterval; + ipath_creg cr_psrcvdatacount; + ipath_creg cr_psrcvpktscount; + ipath_creg cr_psxmitdatacount; + ipath_creg cr_psxmitpktscount; + ipath_creg cr_psxmitwaitcount; +}; + +#endif /* _IPATH_REGISTERS_H */ diff --git a/kernel/drivers/infiniband/hw/ipath/ipath_ruc.c b/kernel/drivers/infiniband/hw/ipath/ipath_ruc.c new file mode 100644 index 000000000..1f95bbaf7 --- /dev/null +++ b/kernel/drivers/infiniband/hw/ipath/ipath_ruc.c @@ -0,0 +1,734 @@ +/* + * Copyright (c) 2006, 2007, 2008 QLogic Corporation. All rights reserved. + * Copyright (c) 2005, 2006 PathScale, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include +#include + +#include "ipath_verbs.h" +#include "ipath_kernel.h" + +/* + * Convert the AETH RNR timeout code into the number of milliseconds. + */ +const u32 ib_ipath_rnr_table[32] = { + 656, /* 0 */ + 1, /* 1 */ + 1, /* 2 */ + 1, /* 3 */ + 1, /* 4 */ + 1, /* 5 */ + 1, /* 6 */ + 1, /* 7 */ + 1, /* 8 */ + 1, /* 9 */ + 1, /* A */ + 1, /* B */ + 1, /* C */ + 1, /* D */ + 2, /* E */ + 2, /* F */ + 3, /* 10 */ + 4, /* 11 */ + 6, /* 12 */ + 8, /* 13 */ + 11, /* 14 */ + 16, /* 15 */ + 21, /* 16 */ + 31, /* 17 */ + 41, /* 18 */ + 62, /* 19 */ + 82, /* 1A */ + 123, /* 1B */ + 164, /* 1C */ + 246, /* 1D */ + 328, /* 1E */ + 492 /* 1F */ +}; + +/** + * ipath_insert_rnr_queue - put QP on the RNR timeout list for the device + * @qp: the QP + * + * Called with the QP s_lock held and interrupts disabled. + * XXX Use a simple list for now. 
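+ * (Note, as an aid to reading the insertion loop below: the list is
+ * kept delta-encoded, i.e. each queued QP's s_rnr_timeout holds only
+ * the time remaining beyond the entry ahead of it, which is what the
+ * subtractions during insertion maintain.)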
We might need a priority + * queue if we have lots of QPs waiting for RNR timeouts + * but that should be rare. + */ +void ipath_insert_rnr_queue(struct ipath_qp *qp) +{ + struct ipath_ibdev *dev = to_idev(qp->ibqp.device); + + /* We already did a spin_lock_irqsave(), so just use spin_lock */ + spin_lock(&dev->pending_lock); + if (list_empty(&dev->rnrwait)) + list_add(&qp->timerwait, &dev->rnrwait); + else { + struct list_head *l = &dev->rnrwait; + struct ipath_qp *nqp = list_entry(l->next, struct ipath_qp, + timerwait); + + while (qp->s_rnr_timeout >= nqp->s_rnr_timeout) { + qp->s_rnr_timeout -= nqp->s_rnr_timeout; + l = l->next; + if (l->next == &dev->rnrwait) { + nqp = NULL; + break; + } + nqp = list_entry(l->next, struct ipath_qp, + timerwait); + } + if (nqp) + nqp->s_rnr_timeout -= qp->s_rnr_timeout; + list_add(&qp->timerwait, l); + } + spin_unlock(&dev->pending_lock); +} + +/** + * ipath_init_sge - Validate a RWQE and fill in the SGE state + * @qp: the QP + * + * Return 1 if OK. + */ +int ipath_init_sge(struct ipath_qp *qp, struct ipath_rwqe *wqe, + u32 *lengthp, struct ipath_sge_state *ss) +{ + int i, j, ret; + struct ib_wc wc; + + *lengthp = 0; + for (i = j = 0; i < wqe->num_sge; i++) { + if (wqe->sg_list[i].length == 0) + continue; + /* Check LKEY */ + if (!ipath_lkey_ok(qp, j ? &ss->sg_list[j - 1] : &ss->sge, + &wqe->sg_list[i], IB_ACCESS_LOCAL_WRITE)) + goto bad_lkey; + *lengthp += wqe->sg_list[i].length; + j++; + } + ss->num_sge = j; + ret = 1; + goto bail; + +bad_lkey: + memset(&wc, 0, sizeof(wc)); + wc.wr_id = wqe->wr_id; + wc.status = IB_WC_LOC_PROT_ERR; + wc.opcode = IB_WC_RECV; + wc.qp = &qp->ibqp; + /* Signal solicited completion event. */ + ipath_cq_enter(to_icq(qp->ibqp.recv_cq), &wc, 1); + ret = 0; +bail: + return ret; +} + +/** + * ipath_get_rwqe - copy the next RWQE into the QP's RWQE + * @qp: the QP + * @wr_id_only: update qp->r_wr_id only, not qp->r_sge + * + * Return 0 if no RWQE is available, otherwise return 1. + * + * Can be called from interrupt level. + */ +int ipath_get_rwqe(struct ipath_qp *qp, int wr_id_only) +{ + unsigned long flags; + struct ipath_rq *rq; + struct ipath_rwq *wq; + struct ipath_srq *srq; + struct ipath_rwqe *wqe; + void (*handler)(struct ib_event *, void *); + u32 tail; + int ret; + + if (qp->ibqp.srq) { + srq = to_isrq(qp->ibqp.srq); + handler = srq->ibsrq.event_handler; + rq = &srq->rq; + } else { + srq = NULL; + handler = NULL; + rq = &qp->r_rq; + } + + spin_lock_irqsave(&rq->lock, flags); + if (!(ib_ipath_state_ops[qp->state] & IPATH_PROCESS_RECV_OK)) { + ret = 0; + goto unlock; + } + + wq = rq->wq; + tail = wq->tail; + /* Validate tail before using it since it is user writable. */ + if (tail >= rq->size) + tail = 0; + do { + if (unlikely(tail == wq->head)) { + ret = 0; + goto unlock; + } + /* Make sure entry is read after head index is read. */ + smp_rmb(); + wqe = get_rwqe_ptr(rq, tail); + if (++tail >= rq->size) + tail = 0; + if (wr_id_only) + break; + qp->r_sge.sg_list = qp->r_sg_list; + } while (!ipath_init_sge(qp, wqe, &qp->r_len, &qp->r_sge)); + qp->r_wr_id = wqe->wr_id; + wq->tail = tail; + + ret = 1; + set_bit(IPATH_R_WRID_VALID, &qp->r_aflags); + if (handler) { + u32 n; + + /* + * validate head pointer value and compute + * the number of remaining WQEs. 
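+ * For example (illustrative numbers only): with rq->size = 256,
+ * tail = 250 and head = 10, head < tail, so n = 10 + 256 - 250 = 16
+ * WQEs remain, and that count is then compared against srq->limit
+ * below.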
+ */ + n = wq->head; + if (n >= rq->size) + n = 0; + if (n < tail) + n += rq->size - tail; + else + n -= tail; + if (n < srq->limit) { + struct ib_event ev; + + srq->limit = 0; + spin_unlock_irqrestore(&rq->lock, flags); + ev.device = qp->ibqp.device; + ev.element.srq = qp->ibqp.srq; + ev.event = IB_EVENT_SRQ_LIMIT_REACHED; + handler(&ev, srq->ibsrq.srq_context); + goto bail; + } + } +unlock: + spin_unlock_irqrestore(&rq->lock, flags); +bail: + return ret; +} + +/** + * ipath_ruc_loopback - handle UC and RC lookback requests + * @sqp: the sending QP + * + * This is called from ipath_do_send() to + * forward a WQE addressed to the same HCA. + * Note that although we are single threaded due to the tasklet, we still + * have to protect against post_send(). We don't have to worry about + * receive interrupts since this is a connected protocol and all packets + * will pass through here. + */ +static void ipath_ruc_loopback(struct ipath_qp *sqp) +{ + struct ipath_ibdev *dev = to_idev(sqp->ibqp.device); + struct ipath_qp *qp; + struct ipath_swqe *wqe; + struct ipath_sge *sge; + unsigned long flags; + struct ib_wc wc; + u64 sdata; + atomic64_t *maddr; + enum ib_wc_status send_status; + + /* + * Note that we check the responder QP state after + * checking the requester's state. + */ + qp = ipath_lookup_qpn(&dev->qp_table, sqp->remote_qpn); + + spin_lock_irqsave(&sqp->s_lock, flags); + + /* Return if we are already busy processing a work request. */ + if ((sqp->s_flags & (IPATH_S_BUSY | IPATH_S_ANY_WAIT)) || + !(ib_ipath_state_ops[sqp->state] & IPATH_PROCESS_OR_FLUSH_SEND)) + goto unlock; + + sqp->s_flags |= IPATH_S_BUSY; + +again: + if (sqp->s_last == sqp->s_head) + goto clr_busy; + wqe = get_swqe_ptr(sqp, sqp->s_last); + + /* Return if it is not OK to start a new work reqeust. */ + if (!(ib_ipath_state_ops[sqp->state] & IPATH_PROCESS_NEXT_SEND_OK)) { + if (!(ib_ipath_state_ops[sqp->state] & IPATH_FLUSH_SEND)) + goto clr_busy; + /* We are in the error state, flush the work request. */ + send_status = IB_WC_WR_FLUSH_ERR; + goto flush_send; + } + + /* + * We can rely on the entry not changing without the s_lock + * being held until we update s_last. + * We increment s_cur to indicate s_last is in progress. + */ + if (sqp->s_last == sqp->s_cur) { + if (++sqp->s_cur >= sqp->s_size) + sqp->s_cur = 0; + } + spin_unlock_irqrestore(&sqp->s_lock, flags); + + if (!qp || !(ib_ipath_state_ops[qp->state] & IPATH_PROCESS_RECV_OK)) { + dev->n_pkt_drops++; + /* + * For RC, the requester would timeout and retry so + * shortcut the timeouts and just signal too many retries. 
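+ * For UC there is no acknowledgement or retry protocol, so the
+ * dropped work request is completed as a success instead.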
+ */ + if (sqp->ibqp.qp_type == IB_QPT_RC) + send_status = IB_WC_RETRY_EXC_ERR; + else + send_status = IB_WC_SUCCESS; + goto serr; + } + + memset(&wc, 0, sizeof wc); + send_status = IB_WC_SUCCESS; + + sqp->s_sge.sge = wqe->sg_list[0]; + sqp->s_sge.sg_list = wqe->sg_list + 1; + sqp->s_sge.num_sge = wqe->wr.num_sge; + sqp->s_len = wqe->length; + switch (wqe->wr.opcode) { + case IB_WR_SEND_WITH_IMM: + wc.wc_flags = IB_WC_WITH_IMM; + wc.ex.imm_data = wqe->wr.ex.imm_data; + /* FALLTHROUGH */ + case IB_WR_SEND: + if (!ipath_get_rwqe(qp, 0)) + goto rnr_nak; + break; + + case IB_WR_RDMA_WRITE_WITH_IMM: + if (unlikely(!(qp->qp_access_flags & IB_ACCESS_REMOTE_WRITE))) + goto inv_err; + wc.wc_flags = IB_WC_WITH_IMM; + wc.ex.imm_data = wqe->wr.ex.imm_data; + if (!ipath_get_rwqe(qp, 1)) + goto rnr_nak; + /* FALLTHROUGH */ + case IB_WR_RDMA_WRITE: + if (unlikely(!(qp->qp_access_flags & IB_ACCESS_REMOTE_WRITE))) + goto inv_err; + if (wqe->length == 0) + break; + if (unlikely(!ipath_rkey_ok(qp, &qp->r_sge, wqe->length, + wqe->wr.wr.rdma.remote_addr, + wqe->wr.wr.rdma.rkey, + IB_ACCESS_REMOTE_WRITE))) + goto acc_err; + break; + + case IB_WR_RDMA_READ: + if (unlikely(!(qp->qp_access_flags & IB_ACCESS_REMOTE_READ))) + goto inv_err; + if (unlikely(!ipath_rkey_ok(qp, &sqp->s_sge, wqe->length, + wqe->wr.wr.rdma.remote_addr, + wqe->wr.wr.rdma.rkey, + IB_ACCESS_REMOTE_READ))) + goto acc_err; + qp->r_sge.sge = wqe->sg_list[0]; + qp->r_sge.sg_list = wqe->sg_list + 1; + qp->r_sge.num_sge = wqe->wr.num_sge; + break; + + case IB_WR_ATOMIC_CMP_AND_SWP: + case IB_WR_ATOMIC_FETCH_AND_ADD: + if (unlikely(!(qp->qp_access_flags & IB_ACCESS_REMOTE_ATOMIC))) + goto inv_err; + if (unlikely(!ipath_rkey_ok(qp, &qp->r_sge, sizeof(u64), + wqe->wr.wr.atomic.remote_addr, + wqe->wr.wr.atomic.rkey, + IB_ACCESS_REMOTE_ATOMIC))) + goto acc_err; + /* Perform atomic OP and save result. */ + maddr = (atomic64_t *) qp->r_sge.sge.vaddr; + sdata = wqe->wr.wr.atomic.compare_add; + *(u64 *) sqp->s_sge.sge.vaddr = + (wqe->wr.opcode == IB_WR_ATOMIC_FETCH_AND_ADD) ? + (u64) atomic64_add_return(sdata, maddr) - sdata : + (u64) cmpxchg((u64 *) qp->r_sge.sge.vaddr, + sdata, wqe->wr.wr.atomic.swap); + goto send_comp; + + default: + send_status = IB_WC_LOC_QP_OP_ERR; + goto serr; + } + + sge = &sqp->s_sge.sge; + while (sqp->s_len) { + u32 len = sqp->s_len; + + if (len > sge->length) + len = sge->length; + if (len > sge->sge_length) + len = sge->sge_length; + BUG_ON(len == 0); + ipath_copy_sge(&qp->r_sge, sge->vaddr, len); + sge->vaddr += len; + sge->length -= len; + sge->sge_length -= len; + if (sge->sge_length == 0) { + if (--sqp->s_sge.num_sge) + *sge = *sqp->s_sge.sg_list++; + } else if (sge->length == 0 && sge->mr != NULL) { + if (++sge->n >= IPATH_SEGSZ) { + if (++sge->m >= sge->mr->mapsz) + break; + sge->n = 0; + } + sge->vaddr = + sge->mr->map[sge->m]->segs[sge->n].vaddr; + sge->length = + sge->mr->map[sge->m]->segs[sge->n].length; + } + sqp->s_len -= len; + } + + if (!test_and_clear_bit(IPATH_R_WRID_VALID, &qp->r_aflags)) + goto send_comp; + + if (wqe->wr.opcode == IB_WR_RDMA_WRITE_WITH_IMM) + wc.opcode = IB_WC_RECV_RDMA_WITH_IMM; + else + wc.opcode = IB_WC_RECV; + wc.wr_id = qp->r_wr_id; + wc.status = IB_WC_SUCCESS; + wc.byte_len = wqe->length; + wc.qp = &qp->ibqp; + wc.src_qp = qp->remote_qpn; + wc.slid = qp->remote_ah_attr.dlid; + wc.sl = qp->remote_ah_attr.sl; + wc.port_num = 1; + /* Signal completion event if the solicited bit is set. 
*/ + ipath_cq_enter(to_icq(qp->ibqp.recv_cq), &wc, + wqe->wr.send_flags & IB_SEND_SOLICITED); + +send_comp: + spin_lock_irqsave(&sqp->s_lock, flags); +flush_send: + sqp->s_rnr_retry = sqp->s_rnr_retry_cnt; + ipath_send_complete(sqp, wqe, send_status); + goto again; + +rnr_nak: + /* Handle RNR NAK */ + if (qp->ibqp.qp_type == IB_QPT_UC) + goto send_comp; + /* + * Note: we don't need the s_lock held since the BUSY flag + * makes this single threaded. + */ + if (sqp->s_rnr_retry == 0) { + send_status = IB_WC_RNR_RETRY_EXC_ERR; + goto serr; + } + if (sqp->s_rnr_retry_cnt < 7) + sqp->s_rnr_retry--; + spin_lock_irqsave(&sqp->s_lock, flags); + if (!(ib_ipath_state_ops[sqp->state] & IPATH_PROCESS_RECV_OK)) + goto clr_busy; + sqp->s_flags |= IPATH_S_WAITING; + dev->n_rnr_naks++; + sqp->s_rnr_timeout = ib_ipath_rnr_table[qp->r_min_rnr_timer]; + ipath_insert_rnr_queue(sqp); + goto clr_busy; + +inv_err: + send_status = IB_WC_REM_INV_REQ_ERR; + wc.status = IB_WC_LOC_QP_OP_ERR; + goto err; + +acc_err: + send_status = IB_WC_REM_ACCESS_ERR; + wc.status = IB_WC_LOC_PROT_ERR; +err: + /* responder goes to error state */ + ipath_rc_error(qp, wc.status); + +serr: + spin_lock_irqsave(&sqp->s_lock, flags); + ipath_send_complete(sqp, wqe, send_status); + if (sqp->ibqp.qp_type == IB_QPT_RC) { + int lastwqe = ipath_error_qp(sqp, IB_WC_WR_FLUSH_ERR); + + sqp->s_flags &= ~IPATH_S_BUSY; + spin_unlock_irqrestore(&sqp->s_lock, flags); + if (lastwqe) { + struct ib_event ev; + + ev.device = sqp->ibqp.device; + ev.element.qp = &sqp->ibqp; + ev.event = IB_EVENT_QP_LAST_WQE_REACHED; + sqp->ibqp.event_handler(&ev, sqp->ibqp.qp_context); + } + goto done; + } +clr_busy: + sqp->s_flags &= ~IPATH_S_BUSY; +unlock: + spin_unlock_irqrestore(&sqp->s_lock, flags); +done: + if (qp && atomic_dec_and_test(&qp->refcount)) + wake_up(&qp->wait); +} + +static void want_buffer(struct ipath_devdata *dd, struct ipath_qp *qp) +{ + if (!(dd->ipath_flags & IPATH_HAS_SEND_DMA) || + qp->ibqp.qp_type == IB_QPT_SMI) { + unsigned long flags; + + spin_lock_irqsave(&dd->ipath_sendctrl_lock, flags); + dd->ipath_sendctrl |= INFINIPATH_S_PIOINTBUFAVAIL; + ipath_write_kreg(dd, dd->ipath_kregs->kr_sendctrl, + dd->ipath_sendctrl); + ipath_read_kreg64(dd, dd->ipath_kregs->kr_scratch); + spin_unlock_irqrestore(&dd->ipath_sendctrl_lock, flags); + } +} + +/** + * ipath_no_bufs_available - tell the layer driver we need buffers + * @qp: the QP that caused the problem + * @dev: the device we ran out of buffers on + * + * Called when we run out of PIO buffers. + * If we are now in the error state, return zero to flush the + * send work request. + */ +static int ipath_no_bufs_available(struct ipath_qp *qp, + struct ipath_ibdev *dev) +{ + unsigned long flags; + int ret = 1; + + /* + * Note that as soon as want_buffer() is called and + * possibly before it returns, ipath_ib_piobufavail() + * could be called. Therefore, put QP on the piowait list before + * enabling the PIO avail interrupt. 
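+ * Otherwise the buffer-available notification could fire before the
+ * QP is on the list and the wakeup would be lost.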
+ */ + spin_lock_irqsave(&qp->s_lock, flags); + if (ib_ipath_state_ops[qp->state] & IPATH_PROCESS_SEND_OK) { + dev->n_piowait++; + qp->s_flags |= IPATH_S_WAITING; + qp->s_flags &= ~IPATH_S_BUSY; + spin_lock(&dev->pending_lock); + if (list_empty(&qp->piowait)) + list_add_tail(&qp->piowait, &dev->piowait); + spin_unlock(&dev->pending_lock); + } else + ret = 0; + spin_unlock_irqrestore(&qp->s_lock, flags); + if (ret) + want_buffer(dev->dd, qp); + return ret; +} + +/** + * ipath_make_grh - construct a GRH header + * @dev: a pointer to the ipath device + * @hdr: a pointer to the GRH header being constructed + * @grh: the global route address to send to + * @hwords: the number of 32 bit words of header being sent + * @nwords: the number of 32 bit words of data being sent + * + * Return the size of the header in 32 bit words. + */ +u32 ipath_make_grh(struct ipath_ibdev *dev, struct ib_grh *hdr, + struct ib_global_route *grh, u32 hwords, u32 nwords) +{ + hdr->version_tclass_flow = + cpu_to_be32((6 << 28) | + (grh->traffic_class << 20) | + grh->flow_label); + hdr->paylen = cpu_to_be16((hwords - 2 + nwords + SIZE_OF_CRC) << 2); + /* next_hdr is defined by C8-7 in ch. 8.4.1 */ + hdr->next_hdr = 0x1B; + hdr->hop_limit = grh->hop_limit; + /* The SGID is 32-bit aligned. */ + hdr->sgid.global.subnet_prefix = dev->gid_prefix; + hdr->sgid.global.interface_id = dev->dd->ipath_guid; + hdr->dgid = grh->dgid; + + /* GRH header size in 32-bit words. */ + return sizeof(struct ib_grh) / sizeof(u32); +} + +void ipath_make_ruc_header(struct ipath_ibdev *dev, struct ipath_qp *qp, + struct ipath_other_headers *ohdr, + u32 bth0, u32 bth2) +{ + u16 lrh0; + u32 nwords; + u32 extra_bytes; + + /* Construct the header. */ + extra_bytes = -qp->s_cur_size & 3; + nwords = (qp->s_cur_size + extra_bytes) >> 2; + lrh0 = IPATH_LRH_BTH; + if (unlikely(qp->remote_ah_attr.ah_flags & IB_AH_GRH)) { + qp->s_hdrwords += ipath_make_grh(dev, &qp->s_hdr.u.l.grh, + &qp->remote_ah_attr.grh, + qp->s_hdrwords, nwords); + lrh0 = IPATH_LRH_GRH; + } + lrh0 |= qp->remote_ah_attr.sl << 4; + qp->s_hdr.lrh[0] = cpu_to_be16(lrh0); + qp->s_hdr.lrh[1] = cpu_to_be16(qp->remote_ah_attr.dlid); + qp->s_hdr.lrh[2] = cpu_to_be16(qp->s_hdrwords + nwords + SIZE_OF_CRC); + qp->s_hdr.lrh[3] = cpu_to_be16(dev->dd->ipath_lid | + qp->remote_ah_attr.src_path_bits); + bth0 |= ipath_get_pkey(dev->dd, qp->s_pkey_index); + bth0 |= extra_bytes << 20; + ohdr->bth[0] = cpu_to_be32(bth0 | (1 << 22)); + ohdr->bth[1] = cpu_to_be32(qp->remote_qpn); + ohdr->bth[2] = cpu_to_be32(bth2); +} + +/** + * ipath_do_send - perform a send on a QP + * @data: contains a pointer to the QP + * + * Process entries in the send work queue until credit or queue is + * exhausted. Only allow one CPU to send a packet per QP (tasklet). + * Otherwise, two threads could send packets out of order. + */ +void ipath_do_send(unsigned long data) +{ + struct ipath_qp *qp = (struct ipath_qp *)data; + struct ipath_ibdev *dev = to_idev(qp->ibqp.device); + int (*make_req)(struct ipath_qp *qp); + unsigned long flags; + + if ((qp->ibqp.qp_type == IB_QPT_RC || + qp->ibqp.qp_type == IB_QPT_UC) && + qp->remote_ah_attr.dlid == dev->dd->ipath_lid) { + ipath_ruc_loopback(qp); + goto bail; + } + + if (qp->ibqp.qp_type == IB_QPT_RC) + make_req = ipath_make_rc_req; + else if (qp->ibqp.qp_type == IB_QPT_UC) + make_req = ipath_make_uc_req; + else + make_req = ipath_make_ud_req; + + spin_lock_irqsave(&qp->s_lock, flags); + + /* Return if we are already busy processing a work request. 
*/ + if ((qp->s_flags & (IPATH_S_BUSY | IPATH_S_ANY_WAIT)) || + !(ib_ipath_state_ops[qp->state] & IPATH_PROCESS_OR_FLUSH_SEND)) { + spin_unlock_irqrestore(&qp->s_lock, flags); + goto bail; + } + + qp->s_flags |= IPATH_S_BUSY; + + spin_unlock_irqrestore(&qp->s_lock, flags); + +again: + /* Check for a constructed packet to be sent. */ + if (qp->s_hdrwords != 0) { + /* + * If no PIO bufs are available, return. An interrupt will + * call ipath_ib_piobufavail() when one is available. + */ + if (ipath_verbs_send(qp, &qp->s_hdr, qp->s_hdrwords, + qp->s_cur_sge, qp->s_cur_size)) { + if (ipath_no_bufs_available(qp, dev)) + goto bail; + } + dev->n_unicast_xmit++; + /* Record that we sent the packet and s_hdr is empty. */ + qp->s_hdrwords = 0; + } + + if (make_req(qp)) + goto again; + +bail:; +} + +/* + * This should be called with s_lock held. + */ +void ipath_send_complete(struct ipath_qp *qp, struct ipath_swqe *wqe, + enum ib_wc_status status) +{ + u32 old_last, last; + + if (!(ib_ipath_state_ops[qp->state] & IPATH_PROCESS_OR_FLUSH_SEND)) + return; + + /* See ch. 11.2.4.1 and 10.7.3.1 */ + if (!(qp->s_flags & IPATH_S_SIGNAL_REQ_WR) || + (wqe->wr.send_flags & IB_SEND_SIGNALED) || + status != IB_WC_SUCCESS) { + struct ib_wc wc; + + memset(&wc, 0, sizeof wc); + wc.wr_id = wqe->wr.wr_id; + wc.status = status; + wc.opcode = ib_ipath_wc_opcode[wqe->wr.opcode]; + wc.qp = &qp->ibqp; + if (status == IB_WC_SUCCESS) + wc.byte_len = wqe->length; + ipath_cq_enter(to_icq(qp->ibqp.send_cq), &wc, + status != IB_WC_SUCCESS); + } + + old_last = last = qp->s_last; + if (++last >= qp->s_size) + last = 0; + qp->s_last = last; + if (qp->s_cur == old_last) + qp->s_cur = last; + if (qp->s_tail == old_last) + qp->s_tail = last; + if (qp->state == IB_QPS_SQD && last == qp->s_cur) + qp->s_draining = 0; +} diff --git a/kernel/drivers/infiniband/hw/ipath/ipath_sdma.c b/kernel/drivers/infiniband/hw/ipath/ipath_sdma.c new file mode 100644 index 000000000..17a517766 --- /dev/null +++ b/kernel/drivers/infiniband/hw/ipath/ipath_sdma.c @@ -0,0 +1,818 @@ +/* + * Copyright (c) 2007, 2008 QLogic Corporation. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include +#include + +#include "ipath_kernel.h" +#include "ipath_verbs.h" +#include "ipath_common.h" + +#define SDMA_DESCQ_SZ PAGE_SIZE /* 256 entries per 4KB page */ + +static void vl15_watchdog_enq(struct ipath_devdata *dd) +{ + /* ipath_sdma_lock must already be held */ + if (atomic_inc_return(&dd->ipath_sdma_vl15_count) == 1) { + unsigned long interval = (HZ + 19) / 20; + dd->ipath_sdma_vl15_timer.expires = jiffies + interval; + add_timer(&dd->ipath_sdma_vl15_timer); + } +} + +static void vl15_watchdog_deq(struct ipath_devdata *dd) +{ + /* ipath_sdma_lock must already be held */ + if (atomic_dec_return(&dd->ipath_sdma_vl15_count) != 0) { + unsigned long interval = (HZ + 19) / 20; + mod_timer(&dd->ipath_sdma_vl15_timer, jiffies + interval); + } else { + del_timer(&dd->ipath_sdma_vl15_timer); + } +} + +static void vl15_watchdog_timeout(unsigned long opaque) +{ + struct ipath_devdata *dd = (struct ipath_devdata *)opaque; + + if (atomic_read(&dd->ipath_sdma_vl15_count) != 0) { + ipath_dbg("vl15 watchdog timeout - clearing\n"); + ipath_cancel_sends(dd, 1); + ipath_hol_down(dd); + } else { + ipath_dbg("vl15 watchdog timeout - " + "condition already cleared\n"); + } +} + +static void unmap_desc(struct ipath_devdata *dd, unsigned head) +{ + __le64 *descqp = &dd->ipath_sdma_descq[head].qw[0]; + u64 desc[2]; + dma_addr_t addr; + size_t len; + + desc[0] = le64_to_cpu(descqp[0]); + desc[1] = le64_to_cpu(descqp[1]); + + addr = (desc[1] << 32) | (desc[0] >> 32); + len = (desc[0] >> 14) & (0x7ffULL << 2); + dma_unmap_single(&dd->pcidev->dev, addr, len, DMA_TO_DEVICE); +} + +/* + * ipath_sdma_lock should be locked before calling this. + */ +int ipath_sdma_make_progress(struct ipath_devdata *dd) +{ + struct list_head *lp = NULL; + struct ipath_sdma_txreq *txp = NULL; + u16 dmahead; + u16 start_idx = 0; + int progress = 0; + + if (!list_empty(&dd->ipath_sdma_activelist)) { + lp = dd->ipath_sdma_activelist.next; + txp = list_entry(lp, struct ipath_sdma_txreq, list); + start_idx = txp->start_idx; + } + + /* + * Read the SDMA head register in order to know that the + * interrupt clear has been written to the chip. + * Otherwise, we may not get an interrupt for the last + * descriptor in the queue. + */ + dmahead = (u16)ipath_read_kreg32(dd, dd->ipath_kregs->kr_senddmahead); + /* sanity check return value for error handling (chip reset, etc.) 
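+ * (a head index at or beyond ipath_sdma_descq_cnt means the value
+ * read back cannot be trusted, so bail out without advancing the
+ * ring)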
*/ + if (dmahead >= dd->ipath_sdma_descq_cnt) + goto done; + + while (dd->ipath_sdma_descq_head != dmahead) { + if (txp && txp->flags & IPATH_SDMA_TXREQ_F_FREEDESC && + dd->ipath_sdma_descq_head == start_idx) { + unmap_desc(dd, dd->ipath_sdma_descq_head); + start_idx++; + if (start_idx == dd->ipath_sdma_descq_cnt) + start_idx = 0; + } + + /* increment free count and head */ + dd->ipath_sdma_descq_removed++; + if (++dd->ipath_sdma_descq_head == dd->ipath_sdma_descq_cnt) + dd->ipath_sdma_descq_head = 0; + + if (txp && txp->next_descq_idx == dd->ipath_sdma_descq_head) { + /* move to notify list */ + if (txp->flags & IPATH_SDMA_TXREQ_F_VL15) + vl15_watchdog_deq(dd); + list_move_tail(lp, &dd->ipath_sdma_notifylist); + if (!list_empty(&dd->ipath_sdma_activelist)) { + lp = dd->ipath_sdma_activelist.next; + txp = list_entry(lp, struct ipath_sdma_txreq, + list); + start_idx = txp->start_idx; + } else { + lp = NULL; + txp = NULL; + } + } + progress = 1; + } + + if (progress) + tasklet_hi_schedule(&dd->ipath_sdma_notify_task); + +done: + return progress; +} + +static void ipath_sdma_notify(struct ipath_devdata *dd, struct list_head *list) +{ + struct ipath_sdma_txreq *txp, *txp_next; + + list_for_each_entry_safe(txp, txp_next, list, list) { + list_del_init(&txp->list); + + if (txp->callback) + (*txp->callback)(txp->callback_cookie, + txp->callback_status); + } +} + +static void sdma_notify_taskbody(struct ipath_devdata *dd) +{ + unsigned long flags; + struct list_head list; + + INIT_LIST_HEAD(&list); + + spin_lock_irqsave(&dd->ipath_sdma_lock, flags); + + list_splice_init(&dd->ipath_sdma_notifylist, &list); + + spin_unlock_irqrestore(&dd->ipath_sdma_lock, flags); + + ipath_sdma_notify(dd, &list); + + /* + * The IB verbs layer needs to see the callback before getting + * the call to ipath_ib_piobufavail() because the callback + * handles releasing resources the next send will need. + * Otherwise, we could do these calls in + * ipath_sdma_make_progress(). 
+ */ + ipath_ib_piobufavail(dd->verbs_dev); +} + +static void sdma_notify_task(unsigned long opaque) +{ + struct ipath_devdata *dd = (struct ipath_devdata *)opaque; + + if (!test_bit(IPATH_SDMA_SHUTDOWN, &dd->ipath_sdma_status)) + sdma_notify_taskbody(dd); +} + +static void dump_sdma_state(struct ipath_devdata *dd) +{ + unsigned long reg; + + reg = ipath_read_kreg64(dd, dd->ipath_kregs->kr_senddmastatus); + ipath_cdbg(VERBOSE, "kr_senddmastatus: 0x%016lx\n", reg); + + reg = ipath_read_kreg64(dd, dd->ipath_kregs->kr_sendctrl); + ipath_cdbg(VERBOSE, "kr_sendctrl: 0x%016lx\n", reg); + + reg = ipath_read_kreg64(dd, dd->ipath_kregs->kr_senddmabufmask0); + ipath_cdbg(VERBOSE, "kr_senddmabufmask0: 0x%016lx\n", reg); + + reg = ipath_read_kreg64(dd, dd->ipath_kregs->kr_senddmabufmask1); + ipath_cdbg(VERBOSE, "kr_senddmabufmask1: 0x%016lx\n", reg); + + reg = ipath_read_kreg64(dd, dd->ipath_kregs->kr_senddmabufmask2); + ipath_cdbg(VERBOSE, "kr_senddmabufmask2: 0x%016lx\n", reg); + + reg = ipath_read_kreg64(dd, dd->ipath_kregs->kr_senddmatail); + ipath_cdbg(VERBOSE, "kr_senddmatail: 0x%016lx\n", reg); + + reg = ipath_read_kreg64(dd, dd->ipath_kregs->kr_senddmahead); + ipath_cdbg(VERBOSE, "kr_senddmahead: 0x%016lx\n", reg); +} + +static void sdma_abort_task(unsigned long opaque) +{ + struct ipath_devdata *dd = (struct ipath_devdata *) opaque; + u64 status; + unsigned long flags; + + if (test_bit(IPATH_SDMA_SHUTDOWN, &dd->ipath_sdma_status)) + return; + + spin_lock_irqsave(&dd->ipath_sdma_lock, flags); + + status = dd->ipath_sdma_status & IPATH_SDMA_ABORT_MASK; + + /* nothing to do */ + if (status == IPATH_SDMA_ABORT_NONE) + goto unlock; + + /* ipath_sdma_abort() is done, waiting for interrupt */ + if (status == IPATH_SDMA_ABORT_DISARMED) { + if (time_before(jiffies, dd->ipath_sdma_abort_intr_timeout)) + goto resched_noprint; + /* give up, intr got lost somewhere */ + ipath_dbg("give up waiting for SDMADISABLED intr\n"); + __set_bit(IPATH_SDMA_DISABLED, &dd->ipath_sdma_status); + status = IPATH_SDMA_ABORT_ABORTED; + } + + /* everything is stopped, time to clean up and restart */ + if (status == IPATH_SDMA_ABORT_ABORTED) { + struct ipath_sdma_txreq *txp, *txpnext; + u64 hwstatus; + int notify = 0; + + hwstatus = ipath_read_kreg64(dd, + dd->ipath_kregs->kr_senddmastatus); + + if ((hwstatus & (IPATH_SDMA_STATUS_SCORE_BOARD_DRAIN_IN_PROG | + IPATH_SDMA_STATUS_ABORT_IN_PROG | + IPATH_SDMA_STATUS_INTERNAL_SDMA_ENABLE)) || + !(hwstatus & IPATH_SDMA_STATUS_SCB_EMPTY)) { + if (dd->ipath_sdma_reset_wait > 0) { + /* not done shutting down sdma */ + --dd->ipath_sdma_reset_wait; + goto resched; + } + ipath_cdbg(VERBOSE, "gave up waiting for quiescent " + "status after SDMA reset, continuing\n"); + dump_sdma_state(dd); + } + + /* dequeue all "sent" requests */ + list_for_each_entry_safe(txp, txpnext, + &dd->ipath_sdma_activelist, list) { + txp->callback_status = IPATH_SDMA_TXREQ_S_ABORTED; + if (txp->flags & IPATH_SDMA_TXREQ_F_VL15) + vl15_watchdog_deq(dd); + list_move_tail(&txp->list, &dd->ipath_sdma_notifylist); + notify = 1; + } + if (notify) + tasklet_hi_schedule(&dd->ipath_sdma_notify_task); + + /* reset our notion of head and tail */ + dd->ipath_sdma_descq_tail = 0; + dd->ipath_sdma_descq_head = 0; + dd->ipath_sdma_head_dma[0] = 0; + dd->ipath_sdma_generation = 0; + dd->ipath_sdma_descq_removed = dd->ipath_sdma_descq_added; + + /* Reset SendDmaLenGen */ + ipath_write_kreg(dd, dd->ipath_kregs->kr_senddmalengen, + (u64) dd->ipath_sdma_descq_cnt | (1ULL << 18)); + + /* done with sdma state for a bit */ + 
spin_unlock_irqrestore(&dd->ipath_sdma_lock, flags); + + /* + * Don't restart sdma here (with the exception + * below). Wait until link is up to ACTIVE. VL15 MADs + * used to bring the link up use PIO, and multiple link + * transitions otherwise cause the sdma engine to be + * stopped and started multiple times. + * The disable is done here, including the shadow, + * so the state is kept consistent. + * See ipath_restart_sdma() for the actual starting + * of sdma. + */ + spin_lock_irqsave(&dd->ipath_sendctrl_lock, flags); + dd->ipath_sendctrl &= ~INFINIPATH_S_SDMAENABLE; + ipath_write_kreg(dd, dd->ipath_kregs->kr_sendctrl, + dd->ipath_sendctrl); + ipath_read_kreg64(dd, dd->ipath_kregs->kr_scratch); + spin_unlock_irqrestore(&dd->ipath_sendctrl_lock, flags); + + /* make sure I see next message */ + dd->ipath_sdma_abort_jiffies = 0; + + /* + * Not everything that takes SDMA offline is a link + * status change. If the link was up, restart SDMA. + */ + if (dd->ipath_flags & IPATH_LINKACTIVE) + ipath_restart_sdma(dd); + + goto done; + } + +resched: + /* + * for now, keep spinning + * JAG - this is bad to just have default be a loop without + * state change + */ + if (time_after(jiffies, dd->ipath_sdma_abort_jiffies)) { + ipath_dbg("looping with status 0x%08lx\n", + dd->ipath_sdma_status); + dd->ipath_sdma_abort_jiffies = jiffies + 5 * HZ; + } +resched_noprint: + spin_unlock_irqrestore(&dd->ipath_sdma_lock, flags); + if (!test_bit(IPATH_SDMA_SHUTDOWN, &dd->ipath_sdma_status)) + tasklet_hi_schedule(&dd->ipath_sdma_abort_task); + return; + +unlock: + spin_unlock_irqrestore(&dd->ipath_sdma_lock, flags); +done: + return; +} + +/* + * This is called from interrupt context. + */ +void ipath_sdma_intr(struct ipath_devdata *dd) +{ + unsigned long flags; + + spin_lock_irqsave(&dd->ipath_sdma_lock, flags); + + (void) ipath_sdma_make_progress(dd); + + spin_unlock_irqrestore(&dd->ipath_sdma_lock, flags); +} + +static int alloc_sdma(struct ipath_devdata *dd) +{ + int ret = 0; + + /* Allocate memory for SendDMA descriptor FIFO */ + dd->ipath_sdma_descq = dma_alloc_coherent(&dd->pcidev->dev, + SDMA_DESCQ_SZ, &dd->ipath_sdma_descq_phys, GFP_KERNEL); + + if (!dd->ipath_sdma_descq) { + ipath_dev_err(dd, "failed to allocate SendDMA descriptor " + "FIFO memory\n"); + ret = -ENOMEM; + goto done; + } + + dd->ipath_sdma_descq_cnt = + SDMA_DESCQ_SZ / sizeof(struct ipath_sdma_desc); + + /* Allocate memory for DMA of head register to memory */ + dd->ipath_sdma_head_dma = dma_alloc_coherent(&dd->pcidev->dev, + PAGE_SIZE, &dd->ipath_sdma_head_phys, GFP_KERNEL); + if (!dd->ipath_sdma_head_dma) { + ipath_dev_err(dd, "failed to allocate SendDMA head memory\n"); + ret = -ENOMEM; + goto cleanup_descq; + } + dd->ipath_sdma_head_dma[0] = 0; + + init_timer(&dd->ipath_sdma_vl15_timer); + dd->ipath_sdma_vl15_timer.function = vl15_watchdog_timeout; + dd->ipath_sdma_vl15_timer.data = (unsigned long)dd; + atomic_set(&dd->ipath_sdma_vl15_count, 0); + + goto done; + +cleanup_descq: + dma_free_coherent(&dd->pcidev->dev, SDMA_DESCQ_SZ, + (void *)dd->ipath_sdma_descq, dd->ipath_sdma_descq_phys); + dd->ipath_sdma_descq = NULL; + dd->ipath_sdma_descq_phys = 0; +done: + return ret; +} + +int setup_sdma(struct ipath_devdata *dd) +{ + int ret = 0; + unsigned i, n; + u64 tmp64; + u64 senddmabufmask[3] = { 0 }; + unsigned long flags; + + ret = alloc_sdma(dd); + if (ret) + goto done; + + if (!dd->ipath_sdma_descq) { + ipath_dev_err(dd, "SendDMA memory not allocated\n"); + goto done; + } + + /* + * Set initial status as if we had been up, then gone 
down. + * This lets initial start on transition to ACTIVE be the + * same as restart after link flap. + */ + dd->ipath_sdma_status = IPATH_SDMA_ABORT_ABORTED; + dd->ipath_sdma_abort_jiffies = 0; + dd->ipath_sdma_generation = 0; + dd->ipath_sdma_descq_tail = 0; + dd->ipath_sdma_descq_head = 0; + dd->ipath_sdma_descq_removed = 0; + dd->ipath_sdma_descq_added = 0; + + /* Set SendDmaBase */ + ipath_write_kreg(dd, dd->ipath_kregs->kr_senddmabase, + dd->ipath_sdma_descq_phys); + /* Set SendDmaLenGen */ + tmp64 = dd->ipath_sdma_descq_cnt; + tmp64 |= 1<<18; /* enable generation checking */ + ipath_write_kreg(dd, dd->ipath_kregs->kr_senddmalengen, tmp64); + /* Set SendDmaTail */ + ipath_write_kreg(dd, dd->ipath_kregs->kr_senddmatail, + dd->ipath_sdma_descq_tail); + /* Set SendDmaHeadAddr */ + ipath_write_kreg(dd, dd->ipath_kregs->kr_senddmaheadaddr, + dd->ipath_sdma_head_phys); + + /* + * Reserve all the former "kernel" piobufs, using high number range + * so we get as many 4K buffers as possible + */ + n = dd->ipath_piobcnt2k + dd->ipath_piobcnt4k; + i = dd->ipath_lastport_piobuf + dd->ipath_pioreserved; + ipath_chg_pioavailkernel(dd, i, n - i , 0); + for (; i < n; ++i) { + unsigned word = i / 64; + unsigned bit = i & 63; + BUG_ON(word >= 3); + senddmabufmask[word] |= 1ULL << bit; + } + ipath_write_kreg(dd, dd->ipath_kregs->kr_senddmabufmask0, + senddmabufmask[0]); + ipath_write_kreg(dd, dd->ipath_kregs->kr_senddmabufmask1, + senddmabufmask[1]); + ipath_write_kreg(dd, dd->ipath_kregs->kr_senddmabufmask2, + senddmabufmask[2]); + + INIT_LIST_HEAD(&dd->ipath_sdma_activelist); + INIT_LIST_HEAD(&dd->ipath_sdma_notifylist); + + tasklet_init(&dd->ipath_sdma_notify_task, sdma_notify_task, + (unsigned long) dd); + tasklet_init(&dd->ipath_sdma_abort_task, sdma_abort_task, + (unsigned long) dd); + + /* + * No use to turn on SDMA here, as link is probably not ACTIVE + * Just mark it RUNNING and enable the interrupt, and let the + * ipath_restart_sdma() on link transition to ACTIVE actually + * enable it. 
+ */ + spin_lock_irqsave(&dd->ipath_sendctrl_lock, flags); + dd->ipath_sendctrl |= INFINIPATH_S_SDMAINTENABLE; + ipath_write_kreg(dd, dd->ipath_kregs->kr_sendctrl, dd->ipath_sendctrl); + ipath_read_kreg64(dd, dd->ipath_kregs->kr_scratch); + __set_bit(IPATH_SDMA_RUNNING, &dd->ipath_sdma_status); + spin_unlock_irqrestore(&dd->ipath_sendctrl_lock, flags); + +done: + return ret; +} + +void teardown_sdma(struct ipath_devdata *dd) +{ + struct ipath_sdma_txreq *txp, *txpnext; + unsigned long flags; + dma_addr_t sdma_head_phys = 0; + dma_addr_t sdma_descq_phys = 0; + void *sdma_descq = NULL; + void *sdma_head_dma = NULL; + + spin_lock_irqsave(&dd->ipath_sdma_lock, flags); + __clear_bit(IPATH_SDMA_RUNNING, &dd->ipath_sdma_status); + __set_bit(IPATH_SDMA_ABORTING, &dd->ipath_sdma_status); + __set_bit(IPATH_SDMA_SHUTDOWN, &dd->ipath_sdma_status); + spin_unlock_irqrestore(&dd->ipath_sdma_lock, flags); + + tasklet_kill(&dd->ipath_sdma_abort_task); + tasklet_kill(&dd->ipath_sdma_notify_task); + + /* turn off sdma */ + spin_lock_irqsave(&dd->ipath_sendctrl_lock, flags); + dd->ipath_sendctrl &= ~INFINIPATH_S_SDMAENABLE; + ipath_write_kreg(dd, dd->ipath_kregs->kr_sendctrl, + dd->ipath_sendctrl); + ipath_read_kreg64(dd, dd->ipath_kregs->kr_scratch); + spin_unlock_irqrestore(&dd->ipath_sendctrl_lock, flags); + + spin_lock_irqsave(&dd->ipath_sdma_lock, flags); + /* dequeue all "sent" requests */ + list_for_each_entry_safe(txp, txpnext, &dd->ipath_sdma_activelist, + list) { + txp->callback_status = IPATH_SDMA_TXREQ_S_SHUTDOWN; + if (txp->flags & IPATH_SDMA_TXREQ_F_VL15) + vl15_watchdog_deq(dd); + list_move_tail(&txp->list, &dd->ipath_sdma_notifylist); + } + spin_unlock_irqrestore(&dd->ipath_sdma_lock, flags); + + sdma_notify_taskbody(dd); + + del_timer_sync(&dd->ipath_sdma_vl15_timer); + + spin_lock_irqsave(&dd->ipath_sdma_lock, flags); + + dd->ipath_sdma_abort_jiffies = 0; + + ipath_write_kreg(dd, dd->ipath_kregs->kr_senddmabase, 0); + ipath_write_kreg(dd, dd->ipath_kregs->kr_senddmalengen, 0); + ipath_write_kreg(dd, dd->ipath_kregs->kr_senddmatail, 0); + ipath_write_kreg(dd, dd->ipath_kregs->kr_senddmaheadaddr, 0); + ipath_write_kreg(dd, dd->ipath_kregs->kr_senddmabufmask0, 0); + ipath_write_kreg(dd, dd->ipath_kregs->kr_senddmabufmask1, 0); + ipath_write_kreg(dd, dd->ipath_kregs->kr_senddmabufmask2, 0); + + if (dd->ipath_sdma_head_dma) { + sdma_head_dma = (void *) dd->ipath_sdma_head_dma; + sdma_head_phys = dd->ipath_sdma_head_phys; + dd->ipath_sdma_head_dma = NULL; + dd->ipath_sdma_head_phys = 0; + } + + if (dd->ipath_sdma_descq) { + sdma_descq = dd->ipath_sdma_descq; + sdma_descq_phys = dd->ipath_sdma_descq_phys; + dd->ipath_sdma_descq = NULL; + dd->ipath_sdma_descq_phys = 0; + } + + spin_unlock_irqrestore(&dd->ipath_sdma_lock, flags); + + if (sdma_head_dma) + dma_free_coherent(&dd->pcidev->dev, PAGE_SIZE, + sdma_head_dma, sdma_head_phys); + + if (sdma_descq) + dma_free_coherent(&dd->pcidev->dev, SDMA_DESCQ_SZ, + sdma_descq, sdma_descq_phys); +} + +/* + * [Re]start SDMA, if we use it, and it's not already OK. + * This is called on transition to link ACTIVE, either the first or + * subsequent times. 
+ */ +void ipath_restart_sdma(struct ipath_devdata *dd) +{ + unsigned long flags; + int needed = 1; + + if (!(dd->ipath_flags & IPATH_HAS_SEND_DMA)) + goto bail; + + /* + * First, make sure we should, which is to say, + * check that we are "RUNNING" (not in teardown) + * and not "SHUTDOWN" + */ + spin_lock_irqsave(&dd->ipath_sdma_lock, flags); + if (!test_bit(IPATH_SDMA_RUNNING, &dd->ipath_sdma_status) + || test_bit(IPATH_SDMA_SHUTDOWN, &dd->ipath_sdma_status)) + needed = 0; + else { + __clear_bit(IPATH_SDMA_DISABLED, &dd->ipath_sdma_status); + __clear_bit(IPATH_SDMA_DISARMED, &dd->ipath_sdma_status); + __clear_bit(IPATH_SDMA_ABORTING, &dd->ipath_sdma_status); + } + spin_unlock_irqrestore(&dd->ipath_sdma_lock, flags); + if (!needed) { + ipath_dbg("invalid attempt to restart SDMA, status 0x%08lx\n", + dd->ipath_sdma_status); + goto bail; + } + spin_lock_irqsave(&dd->ipath_sendctrl_lock, flags); + /* + * First clear, just to be safe. Enable is only done + * in chip on 0->1 transition + */ + dd->ipath_sendctrl &= ~INFINIPATH_S_SDMAENABLE; + ipath_write_kreg(dd, dd->ipath_kregs->kr_sendctrl, dd->ipath_sendctrl); + ipath_read_kreg64(dd, dd->ipath_kregs->kr_scratch); + dd->ipath_sendctrl |= INFINIPATH_S_SDMAENABLE; + ipath_write_kreg(dd, dd->ipath_kregs->kr_sendctrl, dd->ipath_sendctrl); + ipath_read_kreg64(dd, dd->ipath_kregs->kr_scratch); + spin_unlock_irqrestore(&dd->ipath_sendctrl_lock, flags); + + /* notify upper layers */ + ipath_ib_piobufavail(dd->verbs_dev); + +bail: + return; +} + +static inline void make_sdma_desc(struct ipath_devdata *dd, + u64 *sdmadesc, u64 addr, u64 dwlen, u64 dwoffset) +{ + WARN_ON(addr & 3); + /* SDmaPhyAddr[47:32] */ + sdmadesc[1] = addr >> 32; + /* SDmaPhyAddr[31:0] */ + sdmadesc[0] = (addr & 0xfffffffcULL) << 32; + /* SDmaGeneration[1:0] */ + sdmadesc[0] |= (dd->ipath_sdma_generation & 3ULL) << 30; + /* SDmaDwordCount[10:0] */ + sdmadesc[0] |= (dwlen & 0x7ffULL) << 16; + /* SDmaBufOffset[12:2] */ + sdmadesc[0] |= dwoffset & 0x7ffULL; +} + +/* + * This function queues one IB packet onto the send DMA queue per call. + * The caller is responsible for checking: + * 1) The number of send DMA descriptor entries is less than the size of + * the descriptor queue. + * 2) The IB SGE addresses and lengths are 32-bit aligned + * (except possibly the last SGE's length) + * 3) The SGE addresses are suitable for passing to dma_map_single(). 
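+ * Return 0 on success, or a negative errno: -EMSGSIZE if the packet
+ * exceeds ipath_ibmaxlen, -EBUSY while an SDMA abort is in progress,
+ * -ENOBUFS if there are not enough free descriptors, and -EIO on a
+ * DMA mapping failure.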
+ */ +int ipath_sdma_verbs_send(struct ipath_devdata *dd, + struct ipath_sge_state *ss, u32 dwords, + struct ipath_verbs_txreq *tx) +{ + + unsigned long flags; + struct ipath_sge *sge; + int ret = 0; + u16 tail; + __le64 *descqp; + u64 sdmadesc[2]; + u32 dwoffset; + dma_addr_t addr; + + if ((tx->map_len + (dwords<<2)) > dd->ipath_ibmaxlen) { + ipath_dbg("packet size %X > ibmax %X, fail\n", + tx->map_len + (dwords<<2), dd->ipath_ibmaxlen); + ret = -EMSGSIZE; + goto fail; + } + + spin_lock_irqsave(&dd->ipath_sdma_lock, flags); + +retry: + if (unlikely(test_bit(IPATH_SDMA_ABORTING, &dd->ipath_sdma_status))) { + ret = -EBUSY; + goto unlock; + } + + if (tx->txreq.sg_count > ipath_sdma_descq_freecnt(dd)) { + if (ipath_sdma_make_progress(dd)) + goto retry; + ret = -ENOBUFS; + goto unlock; + } + + addr = dma_map_single(&dd->pcidev->dev, tx->txreq.map_addr, + tx->map_len, DMA_TO_DEVICE); + if (dma_mapping_error(&dd->pcidev->dev, addr)) + goto ioerr; + + dwoffset = tx->map_len >> 2; + make_sdma_desc(dd, sdmadesc, (u64) addr, dwoffset, 0); + + /* SDmaFirstDesc */ + sdmadesc[0] |= 1ULL << 12; + if (tx->txreq.flags & IPATH_SDMA_TXREQ_F_USELARGEBUF) + sdmadesc[0] |= 1ULL << 14; /* SDmaUseLargeBuf */ + + /* write to the descq */ + tail = dd->ipath_sdma_descq_tail; + descqp = &dd->ipath_sdma_descq[tail].qw[0]; + *descqp++ = cpu_to_le64(sdmadesc[0]); + *descqp++ = cpu_to_le64(sdmadesc[1]); + + if (tx->txreq.flags & IPATH_SDMA_TXREQ_F_FREEDESC) + tx->txreq.start_idx = tail; + + /* increment the tail */ + if (++tail == dd->ipath_sdma_descq_cnt) { + tail = 0; + descqp = &dd->ipath_sdma_descq[0].qw[0]; + ++dd->ipath_sdma_generation; + } + + sge = &ss->sge; + while (dwords) { + u32 dw; + u32 len; + + len = dwords << 2; + if (len > sge->length) + len = sge->length; + if (len > sge->sge_length) + len = sge->sge_length; + BUG_ON(len == 0); + dw = (len + 3) >> 2; + addr = dma_map_single(&dd->pcidev->dev, sge->vaddr, dw << 2, + DMA_TO_DEVICE); + if (dma_mapping_error(&dd->pcidev->dev, addr)) + goto unmap; + make_sdma_desc(dd, sdmadesc, (u64) addr, dw, dwoffset); + /* SDmaUseLargeBuf has to be set in every descriptor */ + if (tx->txreq.flags & IPATH_SDMA_TXREQ_F_USELARGEBUF) + sdmadesc[0] |= 1ULL << 14; + /* write to the descq */ + *descqp++ = cpu_to_le64(sdmadesc[0]); + *descqp++ = cpu_to_le64(sdmadesc[1]); + + /* increment the tail */ + if (++tail == dd->ipath_sdma_descq_cnt) { + tail = 0; + descqp = &dd->ipath_sdma_descq[0].qw[0]; + ++dd->ipath_sdma_generation; + } + sge->vaddr += len; + sge->length -= len; + sge->sge_length -= len; + if (sge->sge_length == 0) { + if (--ss->num_sge) + *sge = *ss->sg_list++; + } else if (sge->length == 0 && sge->mr != NULL) { + if (++sge->n >= IPATH_SEGSZ) { + if (++sge->m >= sge->mr->mapsz) + break; + sge->n = 0; + } + sge->vaddr = + sge->mr->map[sge->m]->segs[sge->n].vaddr; + sge->length = + sge->mr->map[sge->m]->segs[sge->n].length; + } + + dwoffset += dw; + dwords -= dw; + } + + if (!tail) + descqp = &dd->ipath_sdma_descq[dd->ipath_sdma_descq_cnt].qw[0]; + descqp -= 2; + /* SDmaLastDesc */ + descqp[0] |= cpu_to_le64(1ULL << 11); + if (tx->txreq.flags & IPATH_SDMA_TXREQ_F_INTREQ) { + /* SDmaIntReq */ + descqp[0] |= cpu_to_le64(1ULL << 15); + } + + /* Commit writes to memory and advance the tail on the chip */ + wmb(); + ipath_write_kreg(dd, dd->ipath_kregs->kr_senddmatail, tail); + + tx->txreq.next_descq_idx = tail; + tx->txreq.callback_status = IPATH_SDMA_TXREQ_S_OK; + dd->ipath_sdma_descq_tail = tail; + dd->ipath_sdma_descq_added += tx->txreq.sg_count; + 
list_add_tail(&tx->txreq.list, &dd->ipath_sdma_activelist); + if (tx->txreq.flags & IPATH_SDMA_TXREQ_F_VL15) + vl15_watchdog_enq(dd); + goto unlock; + +unmap: + while (tail != dd->ipath_sdma_descq_tail) { + if (!tail) + tail = dd->ipath_sdma_descq_cnt - 1; + else + tail--; + unmap_desc(dd, tail); + } +ioerr: + ret = -EIO; +unlock: + spin_unlock_irqrestore(&dd->ipath_sdma_lock, flags); +fail: + return ret; +} diff --git a/kernel/drivers/infiniband/hw/ipath/ipath_srq.c b/kernel/drivers/infiniband/hw/ipath/ipath_srq.c new file mode 100644 index 000000000..26271984b --- /dev/null +++ b/kernel/drivers/infiniband/hw/ipath/ipath_srq.c @@ -0,0 +1,380 @@ +/* + * Copyright (c) 2006, 2007, 2008 QLogic Corporation. All rights reserved. + * Copyright (c) 2005, 2006 PathScale, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include +#include +#include + +#include "ipath_verbs.h" + +/** + * ipath_post_srq_receive - post a receive on a shared receive queue + * @ibsrq: the SRQ to post the receive on + * @wr: the list of work requests to post + * @bad_wr: the first WR to cause a problem is put here + * + * This may be called from interrupt context. + */ +int ipath_post_srq_receive(struct ib_srq *ibsrq, struct ib_recv_wr *wr, + struct ib_recv_wr **bad_wr) +{ + struct ipath_srq *srq = to_isrq(ibsrq); + struct ipath_rwq *wq; + unsigned long flags; + int ret; + + for (; wr; wr = wr->next) { + struct ipath_rwqe *wqe; + u32 next; + int i; + + if ((unsigned) wr->num_sge > srq->rq.max_sge) { + *bad_wr = wr; + ret = -EINVAL; + goto bail; + } + + spin_lock_irqsave(&srq->rq.lock, flags); + wq = srq->rq.wq; + next = wq->head + 1; + if (next >= srq->rq.size) + next = 0; + if (next == wq->tail) { + spin_unlock_irqrestore(&srq->rq.lock, flags); + *bad_wr = wr; + ret = -ENOMEM; + goto bail; + } + + wqe = get_rwqe_ptr(&srq->rq, wq->head); + wqe->wr_id = wr->wr_id; + wqe->num_sge = wr->num_sge; + for (i = 0; i < wr->num_sge; i++) + wqe->sg_list[i] = wr->sg_list[i]; + /* Make sure queue entry is written before the head index. 
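+ * (This smp_wmb() pairs with the smp_rmb() in ipath_get_rwqe(), so a
+ * consumer that observes the new head also observes the entry
+ * contents.)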
*/ + smp_wmb(); + wq->head = next; + spin_unlock_irqrestore(&srq->rq.lock, flags); + } + ret = 0; + +bail: + return ret; +} + +/** + * ipath_create_srq - create a shared receive queue + * @ibpd: the protection domain of the SRQ to create + * @srq_init_attr: the attributes of the SRQ + * @udata: data from libipathverbs when creating a user SRQ + */ +struct ib_srq *ipath_create_srq(struct ib_pd *ibpd, + struct ib_srq_init_attr *srq_init_attr, + struct ib_udata *udata) +{ + struct ipath_ibdev *dev = to_idev(ibpd->device); + struct ipath_srq *srq; + u32 sz; + struct ib_srq *ret; + + if (srq_init_attr->srq_type != IB_SRQT_BASIC) { + ret = ERR_PTR(-ENOSYS); + goto done; + } + + if (srq_init_attr->attr.max_wr == 0) { + ret = ERR_PTR(-EINVAL); + goto done; + } + + if ((srq_init_attr->attr.max_sge > ib_ipath_max_srq_sges) || + (srq_init_attr->attr.max_wr > ib_ipath_max_srq_wrs)) { + ret = ERR_PTR(-EINVAL); + goto done; + } + + srq = kmalloc(sizeof(*srq), GFP_KERNEL); + if (!srq) { + ret = ERR_PTR(-ENOMEM); + goto done; + } + + /* + * Need to use vmalloc() if we want to support large #s of entries. + */ + srq->rq.size = srq_init_attr->attr.max_wr + 1; + srq->rq.max_sge = srq_init_attr->attr.max_sge; + sz = sizeof(struct ib_sge) * srq->rq.max_sge + + sizeof(struct ipath_rwqe); + srq->rq.wq = vmalloc_user(sizeof(struct ipath_rwq) + srq->rq.size * sz); + if (!srq->rq.wq) { + ret = ERR_PTR(-ENOMEM); + goto bail_srq; + } + + /* + * Return the address of the RWQ as the offset to mmap. + * See ipath_mmap() for details. + */ + if (udata && udata->outlen >= sizeof(__u64)) { + int err; + u32 s = sizeof(struct ipath_rwq) + srq->rq.size * sz; + + srq->ip = + ipath_create_mmap_info(dev, s, + ibpd->uobject->context, + srq->rq.wq); + if (!srq->ip) { + ret = ERR_PTR(-ENOMEM); + goto bail_wq; + } + + err = ib_copy_to_udata(udata, &srq->ip->offset, + sizeof(srq->ip->offset)); + if (err) { + ret = ERR_PTR(err); + goto bail_ip; + } + } else + srq->ip = NULL; + + /* + * ib_create_srq() will initialize srq->ibsrq. + */ + spin_lock_init(&srq->rq.lock); + srq->rq.wq->head = 0; + srq->rq.wq->tail = 0; + srq->limit = srq_init_attr->attr.srq_limit; + + spin_lock(&dev->n_srqs_lock); + if (dev->n_srqs_allocated == ib_ipath_max_srqs) { + spin_unlock(&dev->n_srqs_lock); + ret = ERR_PTR(-ENOMEM); + goto bail_ip; + } + + dev->n_srqs_allocated++; + spin_unlock(&dev->n_srqs_lock); + + if (srq->ip) { + spin_lock_irq(&dev->pending_lock); + list_add(&srq->ip->pending_mmaps, &dev->pending_mmaps); + spin_unlock_irq(&dev->pending_lock); + } + + ret = &srq->ibsrq; + goto done; + +bail_ip: + kfree(srq->ip); +bail_wq: + vfree(srq->rq.wq); +bail_srq: + kfree(srq); +done: + return ret; +} + +/** + * ipath_modify_srq - modify a shared receive queue + * @ibsrq: the SRQ to modify + * @attr: the new attributes of the SRQ + * @attr_mask: indicates which attributes to modify + * @udata: user data for ipathverbs.so + */ +int ipath_modify_srq(struct ib_srq *ibsrq, struct ib_srq_attr *attr, + enum ib_srq_attr_mask attr_mask, + struct ib_udata *udata) +{ + struct ipath_srq *srq = to_isrq(ibsrq); + struct ipath_rwq *wq; + int ret = 0; + + if (attr_mask & IB_SRQ_MAX_WR) { + struct ipath_rwq *owq; + struct ipath_rwqe *p; + u32 sz, size, n, head, tail; + + /* Check that the requested sizes are below the limits. */ + if ((attr->max_wr > ib_ipath_max_srq_wrs) || + ((attr_mask & IB_SRQ_LIMIT) ? 
+ attr->srq_limit : srq->limit) > attr->max_wr) { + ret = -EINVAL; + goto bail; + } + + sz = sizeof(struct ipath_rwqe) + + srq->rq.max_sge * sizeof(struct ib_sge); + size = attr->max_wr + 1; + wq = vmalloc_user(sizeof(struct ipath_rwq) + size * sz); + if (!wq) { + ret = -ENOMEM; + goto bail; + } + + /* Check that we can write the offset to mmap. */ + if (udata && udata->inlen >= sizeof(__u64)) { + __u64 offset_addr; + __u64 offset = 0; + + ret = ib_copy_from_udata(&offset_addr, udata, + sizeof(offset_addr)); + if (ret) + goto bail_free; + udata->outbuf = + (void __user *) (unsigned long) offset_addr; + ret = ib_copy_to_udata(udata, &offset, + sizeof(offset)); + if (ret) + goto bail_free; + } + + spin_lock_irq(&srq->rq.lock); + /* + * validate head pointer value and compute + * the number of remaining WQEs. + */ + owq = srq->rq.wq; + head = owq->head; + if (head >= srq->rq.size) + head = 0; + tail = owq->tail; + if (tail >= srq->rq.size) + tail = 0; + n = head; + if (n < tail) + n += srq->rq.size - tail; + else + n -= tail; + if (size <= n) { + ret = -EINVAL; + goto bail_unlock; + } + n = 0; + p = wq->wq; + while (tail != head) { + struct ipath_rwqe *wqe; + int i; + + wqe = get_rwqe_ptr(&srq->rq, tail); + p->wr_id = wqe->wr_id; + p->num_sge = wqe->num_sge; + for (i = 0; i < wqe->num_sge; i++) + p->sg_list[i] = wqe->sg_list[i]; + n++; + p = (struct ipath_rwqe *)((char *) p + sz); + if (++tail >= srq->rq.size) + tail = 0; + } + srq->rq.wq = wq; + srq->rq.size = size; + wq->head = n; + wq->tail = 0; + if (attr_mask & IB_SRQ_LIMIT) + srq->limit = attr->srq_limit; + spin_unlock_irq(&srq->rq.lock); + + vfree(owq); + + if (srq->ip) { + struct ipath_mmap_info *ip = srq->ip; + struct ipath_ibdev *dev = to_idev(srq->ibsrq.device); + u32 s = sizeof(struct ipath_rwq) + size * sz; + + ipath_update_mmap_info(dev, ip, s, wq); + + /* + * Return the offset to mmap. + * See ipath_mmap() for details. 
+ */ + if (udata && udata->inlen >= sizeof(__u64)) { + ret = ib_copy_to_udata(udata, &ip->offset, + sizeof(ip->offset)); + if (ret) + goto bail; + } + + spin_lock_irq(&dev->pending_lock); + if (list_empty(&ip->pending_mmaps)) + list_add(&ip->pending_mmaps, + &dev->pending_mmaps); + spin_unlock_irq(&dev->pending_lock); + } + } else if (attr_mask & IB_SRQ_LIMIT) { + spin_lock_irq(&srq->rq.lock); + if (attr->srq_limit >= srq->rq.size) + ret = -EINVAL; + else + srq->limit = attr->srq_limit; + spin_unlock_irq(&srq->rq.lock); + } + goto bail; + +bail_unlock: + spin_unlock_irq(&srq->rq.lock); +bail_free: + vfree(wq); +bail: + return ret; +} + +int ipath_query_srq(struct ib_srq *ibsrq, struct ib_srq_attr *attr) +{ + struct ipath_srq *srq = to_isrq(ibsrq); + + attr->max_wr = srq->rq.size - 1; + attr->max_sge = srq->rq.max_sge; + attr->srq_limit = srq->limit; + return 0; +} + +/** + * ipath_destroy_srq - destroy a shared receive queue + * @ibsrq: the SRQ to destroy + */ +int ipath_destroy_srq(struct ib_srq *ibsrq) +{ + struct ipath_srq *srq = to_isrq(ibsrq); + struct ipath_ibdev *dev = to_idev(ibsrq->device); + + spin_lock(&dev->n_srqs_lock); + dev->n_srqs_allocated--; + spin_unlock(&dev->n_srqs_lock); + if (srq->ip) + kref_put(&srq->ip->ref, ipath_release_mmap_info); + else + vfree(srq->rq.wq); + kfree(srq); + + return 0; +} diff --git a/kernel/drivers/infiniband/hw/ipath/ipath_stats.c b/kernel/drivers/infiniband/hw/ipath/ipath_stats.c new file mode 100644 index 000000000..f63e143e3 --- /dev/null +++ b/kernel/drivers/infiniband/hw/ipath/ipath_stats.c @@ -0,0 +1,347 @@ +/* + * Copyright (c) 2006, 2007, 2008 QLogic Corporation. All rights reserved. + * Copyright (c) 2003, 2004, 2005, 2006 PathScale, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "ipath_kernel.h" + +struct infinipath_stats ipath_stats; + +/** + * ipath_snap_cntr - snapshot a chip counter + * @dd: the infinipath device + * @creg: the counter to snapshot + * + * called from add_timer and user counter read calls, to deal with + * counters that wrap in "human time". The words sent and received, and + * the packets sent and received are all that we worry about. 
For now, + * at least, we don't worry about error counters, because if they wrap + * that quickly, we probably don't care. We may eventually just make this + * handle all the counters. word counters can wrap in about 20 seconds + * of full bandwidth traffic, packet counters in a few hours. + */ + +u64 ipath_snap_cntr(struct ipath_devdata *dd, ipath_creg creg) +{ + u32 val, reg64 = 0; + u64 val64; + unsigned long t0, t1; + u64 ret; + + t0 = jiffies; + /* If fast increment counters are only 32 bits, snapshot them, + * and maintain them as 64bit values in the driver */ + if (!(dd->ipath_flags & IPATH_32BITCOUNTERS) && + (creg == dd->ipath_cregs->cr_wordsendcnt || + creg == dd->ipath_cregs->cr_wordrcvcnt || + creg == dd->ipath_cregs->cr_pktsendcnt || + creg == dd->ipath_cregs->cr_pktrcvcnt)) { + val64 = ipath_read_creg(dd, creg); + val = val64 == ~0ULL ? ~0U : 0; + reg64 = 1; + } else /* val64 just to keep gcc quiet... */ + val64 = val = ipath_read_creg32(dd, creg); + /* + * See if a second has passed. This is just a way to detect things + * that are quite broken. Normally this should take just a few + * cycles (the check is for long enough that we don't care if we get + * pre-empted.) An Opteron HT O read timeout is 4 seconds with + * normal NB values + */ + t1 = jiffies; + if (time_before(t0 + HZ, t1) && val == -1) { + ipath_dev_err(dd, "Error! Read counter 0x%x timed out\n", + creg); + ret = 0ULL; + goto bail; + } + if (reg64) { + ret = val64; + goto bail; + } + + if (creg == dd->ipath_cregs->cr_wordsendcnt) { + if (val != dd->ipath_lastsword) { + dd->ipath_sword += val - dd->ipath_lastsword; + dd->ipath_lastsword = val; + } + val64 = dd->ipath_sword; + } else if (creg == dd->ipath_cregs->cr_wordrcvcnt) { + if (val != dd->ipath_lastrword) { + dd->ipath_rword += val - dd->ipath_lastrword; + dd->ipath_lastrword = val; + } + val64 = dd->ipath_rword; + } else if (creg == dd->ipath_cregs->cr_pktsendcnt) { + if (val != dd->ipath_lastspkts) { + dd->ipath_spkts += val - dd->ipath_lastspkts; + dd->ipath_lastspkts = val; + } + val64 = dd->ipath_spkts; + } else if (creg == dd->ipath_cregs->cr_pktrcvcnt) { + if (val != dd->ipath_lastrpkts) { + dd->ipath_rpkts += val - dd->ipath_lastrpkts; + dd->ipath_lastrpkts = val; + } + val64 = dd->ipath_rpkts; + } else if (creg == dd->ipath_cregs->cr_ibsymbolerrcnt) { + if (dd->ibdeltainprog) + val64 -= val64 - dd->ibsymsnap; + val64 -= dd->ibsymdelta; + } else if (creg == dd->ipath_cregs->cr_iblinkerrrecovcnt) { + if (dd->ibdeltainprog) + val64 -= val64 - dd->iblnkerrsnap; + val64 -= dd->iblnkerrdelta; + } else + val64 = (u64) val; + + ret = val64; + +bail: + return ret; +} + +/** + * ipath_qcheck - print delta of egrfull/hdrqfull errors for kernel ports + * @dd: the infinipath device + * + * print the delta of egrfull/hdrqfull errors for kernel ports no more than + * every 5 seconds. User processes are printed at close, but kernel doesn't + * close, so... 
Separate routine so may call from other places someday, and + * so function name when printed by _IPATH_INFO is meaningfull + */ +static void ipath_qcheck(struct ipath_devdata *dd) +{ + static u64 last_tot_hdrqfull; + struct ipath_portdata *pd = dd->ipath_pd[0]; + size_t blen = 0; + char buf[128]; + u32 hdrqtail; + + *buf = 0; + if (pd->port_hdrqfull != dd->ipath_p0_hdrqfull) { + blen = snprintf(buf, sizeof buf, "port 0 hdrqfull %u", + pd->port_hdrqfull - + dd->ipath_p0_hdrqfull); + dd->ipath_p0_hdrqfull = pd->port_hdrqfull; + } + if (ipath_stats.sps_etidfull != dd->ipath_last_tidfull) { + blen += snprintf(buf + blen, sizeof buf - blen, + "%srcvegrfull %llu", + blen ? ", " : "", + (unsigned long long) + (ipath_stats.sps_etidfull - + dd->ipath_last_tidfull)); + dd->ipath_last_tidfull = ipath_stats.sps_etidfull; + } + + /* + * this is actually the number of hdrq full interrupts, not actual + * events, but at the moment that's mostly what I'm interested in. + * Actual count, etc. is in the counters, if needed. For production + * users this won't ordinarily be printed. + */ + + if ((ipath_debug & (__IPATH_PKTDBG | __IPATH_DBG)) && + ipath_stats.sps_hdrqfull != last_tot_hdrqfull) { + blen += snprintf(buf + blen, sizeof buf - blen, + "%shdrqfull %llu (all ports)", + blen ? ", " : "", + (unsigned long long) + (ipath_stats.sps_hdrqfull - + last_tot_hdrqfull)); + last_tot_hdrqfull = ipath_stats.sps_hdrqfull; + } + if (blen) + ipath_dbg("%s\n", buf); + + hdrqtail = ipath_get_hdrqtail(pd); + if (pd->port_head != hdrqtail) { + if (dd->ipath_lastport0rcv_cnt == + ipath_stats.sps_port0pkts) { + ipath_cdbg(PKT, "missing rcv interrupts? " + "port0 hd=%x tl=%x; port0pkts %llx; write" + " hd (w/intr)\n", + pd->port_head, hdrqtail, + (unsigned long long) + ipath_stats.sps_port0pkts); + ipath_write_ureg(dd, ur_rcvhdrhead, hdrqtail | + dd->ipath_rhdrhead_intr_off, pd->port_port); + } + dd->ipath_lastport0rcv_cnt = ipath_stats.sps_port0pkts; + } +} + +static void ipath_chk_errormask(struct ipath_devdata *dd) +{ + static u32 fixed; + u32 ctrl; + unsigned long errormask; + unsigned long hwerrs; + + if (!dd->ipath_errormask || !(dd->ipath_flags & IPATH_INITTED)) + return; + + errormask = ipath_read_kreg64(dd, dd->ipath_kregs->kr_errormask); + + if (errormask == dd->ipath_errormask) + return; + fixed++; + + hwerrs = ipath_read_kreg64(dd, dd->ipath_kregs->kr_hwerrstatus); + ctrl = ipath_read_kreg32(dd, dd->ipath_kregs->kr_control); + + ipath_write_kreg(dd, dd->ipath_kregs->kr_errormask, + dd->ipath_errormask); + + if ((hwerrs & dd->ipath_hwerrmask) || + (ctrl & INFINIPATH_C_FREEZEMODE)) { + /* force re-interrupt of pending events, just in case */ + ipath_write_kreg(dd, dd->ipath_kregs->kr_hwerrclear, 0ULL); + ipath_write_kreg(dd, dd->ipath_kregs->kr_errorclear, 0ULL); + ipath_write_kreg(dd, dd->ipath_kregs->kr_intclear, 0ULL); + dev_info(&dd->pcidev->dev, + "errormask fixed(%u) %lx -> %lx, ctrl %x hwerr %lx\n", + fixed, errormask, (unsigned long)dd->ipath_errormask, + ctrl, hwerrs); + } else + ipath_dbg("errormask fixed(%u) %lx -> %lx, no freeze\n", + fixed, errormask, + (unsigned long)dd->ipath_errormask); +} + + +/** + * ipath_get_faststats - get word counters from chip before they overflow + * @opaque - contains a pointer to the infinipath device ipath_devdata + * + * called from add_timer + */ +void ipath_get_faststats(unsigned long opaque) +{ + struct ipath_devdata *dd = (struct ipath_devdata *) opaque; + int i; + static unsigned cnt; + unsigned long flags; + u64 traffic_wds; + + /* + * don't access the chip 
while running diags, or memory diags can + * fail + */ + if (!dd->ipath_kregbase || !(dd->ipath_flags & IPATH_INITTED) || + ipath_diag_inuse) + /* but re-arm the timer, for diags case; won't hurt other */ + goto done; + + /* + * We now try to maintain a "active timer", based on traffic + * exceeding a threshold, so we need to check the word-counts + * even if they are 64-bit. + */ + traffic_wds = ipath_snap_cntr(dd, dd->ipath_cregs->cr_wordsendcnt) + + ipath_snap_cntr(dd, dd->ipath_cregs->cr_wordrcvcnt); + spin_lock_irqsave(&dd->ipath_eep_st_lock, flags); + traffic_wds -= dd->ipath_traffic_wds; + dd->ipath_traffic_wds += traffic_wds; + if (traffic_wds >= IPATH_TRAFFIC_ACTIVE_THRESHOLD) + atomic_add(5, &dd->ipath_active_time); /* S/B #define */ + spin_unlock_irqrestore(&dd->ipath_eep_st_lock, flags); + + if (dd->ipath_flags & IPATH_32BITCOUNTERS) { + ipath_snap_cntr(dd, dd->ipath_cregs->cr_pktsendcnt); + ipath_snap_cntr(dd, dd->ipath_cregs->cr_pktrcvcnt); + } + + ipath_qcheck(dd); + + /* + * deal with repeat error suppression. Doesn't really matter if + * last error was almost a full interval ago, or just a few usecs + * ago; still won't get more than 2 per interval. We may want + * longer intervals for this eventually, could do with mod, counter + * or separate timer. Also see code in ipath_handle_errors() and + * ipath_handle_hwerrors(). + */ + + if (dd->ipath_lasterror) + dd->ipath_lasterror = 0; + if (dd->ipath_lasthwerror) + dd->ipath_lasthwerror = 0; + if (dd->ipath_maskederrs + && time_after(jiffies, dd->ipath_unmasktime)) { + char ebuf[256]; + int iserr; + iserr = ipath_decode_err(dd, ebuf, sizeof ebuf, + dd->ipath_maskederrs); + if (dd->ipath_maskederrs & + ~(INFINIPATH_E_RRCVEGRFULL | INFINIPATH_E_RRCVHDRFULL | + INFINIPATH_E_PKTERRS)) + ipath_dev_err(dd, "Re-enabling masked errors " + "(%s)\n", ebuf); + else { + /* + * rcvegrfull and rcvhdrqfull are "normal", for some + * types of processes (mostly benchmarks) that send + * huge numbers of messages, while not processing + * them. So only complain about these at debug + * level. + */ + if (iserr) + ipath_dbg( + "Re-enabling queue full errors (%s)\n", + ebuf); + else + ipath_cdbg(ERRPKT, "Re-enabling packet" + " problem interrupt (%s)\n", ebuf); + } + + /* re-enable masked errors */ + dd->ipath_errormask |= dd->ipath_maskederrs; + ipath_write_kreg(dd, dd->ipath_kregs->kr_errormask, + dd->ipath_errormask); + dd->ipath_maskederrs = 0; + } + + /* limit qfull messages to ~one per minute per port */ + if ((++cnt & 0x10)) { + for (i = (int) dd->ipath_cfgports; --i >= 0; ) { + struct ipath_portdata *pd = dd->ipath_pd[i]; + + if (pd && pd->port_lastrcvhdrqtail != -1) + pd->port_lastrcvhdrqtail = -1; + } + } + + ipath_chk_errormask(dd); +done: + mod_timer(&dd->ipath_stats_timer, jiffies + HZ * 5); +} diff --git a/kernel/drivers/infiniband/hw/ipath/ipath_sysfs.c b/kernel/drivers/infiniband/hw/ipath/ipath_sysfs.c new file mode 100644 index 000000000..75558f33f --- /dev/null +++ b/kernel/drivers/infiniband/hw/ipath/ipath_sysfs.c @@ -0,0 +1,1238 @@ +/* + * Copyright (c) 2006, 2007, 2008 QLogic Corporation. All rights reserved. + * Copyright (c) 2006 PathScale, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. 
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include +#include + +#include "ipath_kernel.h" +#include "ipath_verbs.h" +#include "ipath_common.h" + +/** + * ipath_parse_ushort - parse an unsigned short value in an arbitrary base + * @str: the string containing the number + * @valp: where to put the result + * + * returns the number of bytes consumed, or negative value on error + */ +int ipath_parse_ushort(const char *str, unsigned short *valp) +{ + unsigned long val; + char *end; + int ret; + + if (!isdigit(str[0])) { + ret = -EINVAL; + goto bail; + } + + val = simple_strtoul(str, &end, 0); + + if (val > 0xffff) { + ret = -EINVAL; + goto bail; + } + + *valp = val; + + ret = end + 1 - str; + if (ret == 0) + ret = -EINVAL; + +bail: + return ret; +} + +static ssize_t show_version(struct device_driver *dev, char *buf) +{ + /* The string printed here is already newline-terminated. */ + return scnprintf(buf, PAGE_SIZE, "%s", ib_ipath_version); +} + +static ssize_t show_num_units(struct device_driver *dev, char *buf) +{ + return scnprintf(buf, PAGE_SIZE, "%d\n", + ipath_count_units(NULL, NULL, NULL)); +} + +static ssize_t show_status(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + struct ipath_devdata *dd = dev_get_drvdata(dev); + ssize_t ret; + + if (!dd->ipath_statusp) { + ret = -EINVAL; + goto bail; + } + + ret = scnprintf(buf, PAGE_SIZE, "0x%llx\n", + (unsigned long long) *(dd->ipath_statusp)); + +bail: + return ret; +} + +static const char *ipath_status_str[] = { + "Initted", + "Disabled", + "Admin_Disabled", + "", /* This used to be the old "OIB_SMA" status. */ + "", /* This used to be the old "SMA" status. 
*/ + "Present", + "IB_link_up", + "IB_configured", + "NoIBcable", + "Fatal_Hardware_Error", + NULL, +}; + +static ssize_t show_status_str(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + struct ipath_devdata *dd = dev_get_drvdata(dev); + int i, any; + u64 s; + ssize_t ret; + + if (!dd->ipath_statusp) { + ret = -EINVAL; + goto bail; + } + + s = *(dd->ipath_statusp); + *buf = '\0'; + for (any = i = 0; s && ipath_status_str[i]; i++) { + if (s & 1) { + if (any && strlcat(buf, " ", PAGE_SIZE) >= + PAGE_SIZE) + /* overflow */ + break; + if (strlcat(buf, ipath_status_str[i], + PAGE_SIZE) >= PAGE_SIZE) + break; + any = 1; + } + s >>= 1; + } + if (any) + strlcat(buf, "\n", PAGE_SIZE); + + ret = strlen(buf); + +bail: + return ret; +} + +static ssize_t show_boardversion(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + struct ipath_devdata *dd = dev_get_drvdata(dev); + /* The string printed here is already newline-terminated. */ + return scnprintf(buf, PAGE_SIZE, "%s", dd->ipath_boardversion); +} + +static ssize_t show_localbus_info(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + struct ipath_devdata *dd = dev_get_drvdata(dev); + /* The string printed here is already newline-terminated. */ + return scnprintf(buf, PAGE_SIZE, "%s", dd->ipath_lbus_info); +} + +static ssize_t show_lmc(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + struct ipath_devdata *dd = dev_get_drvdata(dev); + + return scnprintf(buf, PAGE_SIZE, "%u\n", dd->ipath_lmc); +} + +static ssize_t store_lmc(struct device *dev, + struct device_attribute *attr, + const char *buf, + size_t count) +{ + struct ipath_devdata *dd = dev_get_drvdata(dev); + u16 lmc = 0; + int ret; + + ret = ipath_parse_ushort(buf, &lmc); + if (ret < 0) + goto invalid; + + if (lmc > 7) { + ret = -EINVAL; + goto invalid; + } + + ipath_set_lid(dd, dd->ipath_lid, lmc); + + goto bail; +invalid: + ipath_dev_err(dd, "attempt to set invalid LMC %u\n", lmc); +bail: + return ret; +} + +static ssize_t show_lid(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + struct ipath_devdata *dd = dev_get_drvdata(dev); + + return scnprintf(buf, PAGE_SIZE, "0x%x\n", dd->ipath_lid); +} + +static ssize_t store_lid(struct device *dev, + struct device_attribute *attr, + const char *buf, + size_t count) +{ + struct ipath_devdata *dd = dev_get_drvdata(dev); + u16 lid = 0; + int ret; + + ret = ipath_parse_ushort(buf, &lid); + if (ret < 0) + goto invalid; + + if (lid == 0 || lid >= IPATH_MULTICAST_LID_BASE) { + ret = -EINVAL; + goto invalid; + } + + ipath_set_lid(dd, lid, dd->ipath_lmc); + + goto bail; +invalid: + ipath_dev_err(dd, "attempt to set invalid LID 0x%x\n", lid); +bail: + return ret; +} + +static ssize_t show_mlid(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + struct ipath_devdata *dd = dev_get_drvdata(dev); + + return scnprintf(buf, PAGE_SIZE, "0x%x\n", dd->ipath_mlid); +} + +static ssize_t store_mlid(struct device *dev, + struct device_attribute *attr, + const char *buf, + size_t count) +{ + struct ipath_devdata *dd = dev_get_drvdata(dev); + u16 mlid; + int ret; + + ret = ipath_parse_ushort(buf, &mlid); + if (ret < 0 || mlid < IPATH_MULTICAST_LID_BASE) + goto invalid; + + dd->ipath_mlid = mlid; + + goto bail; +invalid: + ipath_dev_err(dd, "attempt to set invalid MLID\n"); +bail: + return ret; +} + +static ssize_t show_guid(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + struct ipath_devdata *dd = dev_get_drvdata(dev); + u8 *guid; + 
+ guid = (u8 *) & (dd->ipath_guid); + + return scnprintf(buf, PAGE_SIZE, + "%02x:%02x:%02x:%02x:%02x:%02x:%02x:%02x\n", + guid[0], guid[1], guid[2], guid[3], + guid[4], guid[5], guid[6], guid[7]); +} + +static ssize_t store_guid(struct device *dev, + struct device_attribute *attr, + const char *buf, + size_t count) +{ + struct ipath_devdata *dd = dev_get_drvdata(dev); + ssize_t ret; + unsigned short guid[8]; + __be64 new_guid; + u8 *ng; + int i; + + if (sscanf(buf, "%hx:%hx:%hx:%hx:%hx:%hx:%hx:%hx", + &guid[0], &guid[1], &guid[2], &guid[3], + &guid[4], &guid[5], &guid[6], &guid[7]) != 8) + goto invalid; + + ng = (u8 *) &new_guid; + + for (i = 0; i < 8; i++) { + if (guid[i] > 0xff) + goto invalid; + ng[i] = guid[i]; + } + + if (new_guid == 0) + goto invalid; + + dd->ipath_guid = new_guid; + dd->ipath_nguid = 1; + if (dd->verbs_dev) + dd->verbs_dev->ibdev.node_guid = new_guid; + + ret = strlen(buf); + goto bail; + +invalid: + ipath_dev_err(dd, "attempt to set invalid GUID\n"); + ret = -EINVAL; + +bail: + return ret; +} + +static ssize_t show_nguid(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + struct ipath_devdata *dd = dev_get_drvdata(dev); + + return scnprintf(buf, PAGE_SIZE, "%u\n", dd->ipath_nguid); +} + +static ssize_t show_nports(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + struct ipath_devdata *dd = dev_get_drvdata(dev); + + /* Return the number of user ports available. */ + return scnprintf(buf, PAGE_SIZE, "%u\n", dd->ipath_cfgports - 1); +} + +static ssize_t show_serial(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + struct ipath_devdata *dd = dev_get_drvdata(dev); + + buf[sizeof dd->ipath_serial] = '\0'; + memcpy(buf, dd->ipath_serial, sizeof dd->ipath_serial); + strcat(buf, "\n"); + return strlen(buf); +} + +static ssize_t show_unit(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + struct ipath_devdata *dd = dev_get_drvdata(dev); + + return scnprintf(buf, PAGE_SIZE, "%u\n", dd->ipath_unit); +} + +static ssize_t show_jint_max_packets(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + struct ipath_devdata *dd = dev_get_drvdata(dev); + + return scnprintf(buf, PAGE_SIZE, "%hu\n", dd->ipath_jint_max_packets); +} + +static ssize_t store_jint_max_packets(struct device *dev, + struct device_attribute *attr, + const char *buf, + size_t count) +{ + struct ipath_devdata *dd = dev_get_drvdata(dev); + u16 v = 0; + int ret; + + ret = ipath_parse_ushort(buf, &v); + if (ret < 0) + ipath_dev_err(dd, "invalid jint_max_packets.\n"); + else + dd->ipath_f_config_jint(dd, dd->ipath_jint_idle_ticks, v); + + return ret; +} + +static ssize_t show_jint_idle_ticks(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + struct ipath_devdata *dd = dev_get_drvdata(dev); + + return scnprintf(buf, PAGE_SIZE, "%hu\n", dd->ipath_jint_idle_ticks); +} + +static ssize_t store_jint_idle_ticks(struct device *dev, + struct device_attribute *attr, + const char *buf, + size_t count) +{ + struct ipath_devdata *dd = dev_get_drvdata(dev); + u16 v = 0; + int ret; + + ret = ipath_parse_ushort(buf, &v); + if (ret < 0) + ipath_dev_err(dd, "invalid jint_idle_ticks.\n"); + else + dd->ipath_f_config_jint(dd, v, dd->ipath_jint_max_packets); + + return ret; +} + +#define DEVICE_COUNTER(name, attr) \ + static ssize_t show_counter_##name(struct device *dev, \ + struct device_attribute *attr, \ + char *buf) \ + { \ + struct ipath_devdata *dd = dev_get_drvdata(dev); \ + return scnprintf(\ + buf, PAGE_SIZE, 
"%llu\n", (unsigned long long) \ + ipath_snap_cntr( \ + dd, offsetof(struct infinipath_counters, \ + attr) / sizeof(u64))); \ + } \ + static DEVICE_ATTR(name, S_IRUGO, show_counter_##name, NULL); + +DEVICE_COUNTER(ib_link_downeds, IBLinkDownedCnt); +DEVICE_COUNTER(ib_link_err_recoveries, IBLinkErrRecoveryCnt); +DEVICE_COUNTER(ib_status_changes, IBStatusChangeCnt); +DEVICE_COUNTER(ib_symbol_errs, IBSymbolErrCnt); +DEVICE_COUNTER(lb_flow_stalls, LBFlowStallCnt); +DEVICE_COUNTER(lb_ints, LBIntCnt); +DEVICE_COUNTER(rx_bad_formats, RxBadFormatCnt); +DEVICE_COUNTER(rx_buf_ovfls, RxBufOvflCnt); +DEVICE_COUNTER(rx_data_pkts, RxDataPktCnt); +DEVICE_COUNTER(rx_dropped_pkts, RxDroppedPktCnt); +DEVICE_COUNTER(rx_dwords, RxDwordCnt); +DEVICE_COUNTER(rx_ebps, RxEBPCnt); +DEVICE_COUNTER(rx_flow_ctrl_errs, RxFlowCtrlErrCnt); +DEVICE_COUNTER(rx_flow_pkts, RxFlowPktCnt); +DEVICE_COUNTER(rx_icrc_errs, RxICRCErrCnt); +DEVICE_COUNTER(rx_len_errs, RxLenErrCnt); +DEVICE_COUNTER(rx_link_problems, RxLinkProblemCnt); +DEVICE_COUNTER(rx_lpcrc_errs, RxLPCRCErrCnt); +DEVICE_COUNTER(rx_max_min_len_errs, RxMaxMinLenErrCnt); +DEVICE_COUNTER(rx_p0_hdr_egr_ovfls, RxP0HdrEgrOvflCnt); +DEVICE_COUNTER(rx_p1_hdr_egr_ovfls, RxP1HdrEgrOvflCnt); +DEVICE_COUNTER(rx_p2_hdr_egr_ovfls, RxP2HdrEgrOvflCnt); +DEVICE_COUNTER(rx_p3_hdr_egr_ovfls, RxP3HdrEgrOvflCnt); +DEVICE_COUNTER(rx_p4_hdr_egr_ovfls, RxP4HdrEgrOvflCnt); +DEVICE_COUNTER(rx_p5_hdr_egr_ovfls, RxP5HdrEgrOvflCnt); +DEVICE_COUNTER(rx_p6_hdr_egr_ovfls, RxP6HdrEgrOvflCnt); +DEVICE_COUNTER(rx_p7_hdr_egr_ovfls, RxP7HdrEgrOvflCnt); +DEVICE_COUNTER(rx_p8_hdr_egr_ovfls, RxP8HdrEgrOvflCnt); +DEVICE_COUNTER(rx_pkey_mismatches, RxPKeyMismatchCnt); +DEVICE_COUNTER(rx_tid_full_errs, RxTIDFullErrCnt); +DEVICE_COUNTER(rx_tid_valid_errs, RxTIDValidErrCnt); +DEVICE_COUNTER(rx_vcrc_errs, RxVCRCErrCnt); +DEVICE_COUNTER(tx_data_pkts, TxDataPktCnt); +DEVICE_COUNTER(tx_dropped_pkts, TxDroppedPktCnt); +DEVICE_COUNTER(tx_dwords, TxDwordCnt); +DEVICE_COUNTER(tx_flow_pkts, TxFlowPktCnt); +DEVICE_COUNTER(tx_flow_stalls, TxFlowStallCnt); +DEVICE_COUNTER(tx_len_errs, TxLenErrCnt); +DEVICE_COUNTER(tx_max_min_len_errs, TxMaxMinLenErrCnt); +DEVICE_COUNTER(tx_underruns, TxUnderrunCnt); +DEVICE_COUNTER(tx_unsup_vl_errs, TxUnsupVLErrCnt); + +static struct attribute *dev_counter_attributes[] = { + &dev_attr_ib_link_downeds.attr, + &dev_attr_ib_link_err_recoveries.attr, + &dev_attr_ib_status_changes.attr, + &dev_attr_ib_symbol_errs.attr, + &dev_attr_lb_flow_stalls.attr, + &dev_attr_lb_ints.attr, + &dev_attr_rx_bad_formats.attr, + &dev_attr_rx_buf_ovfls.attr, + &dev_attr_rx_data_pkts.attr, + &dev_attr_rx_dropped_pkts.attr, + &dev_attr_rx_dwords.attr, + &dev_attr_rx_ebps.attr, + &dev_attr_rx_flow_ctrl_errs.attr, + &dev_attr_rx_flow_pkts.attr, + &dev_attr_rx_icrc_errs.attr, + &dev_attr_rx_len_errs.attr, + &dev_attr_rx_link_problems.attr, + &dev_attr_rx_lpcrc_errs.attr, + &dev_attr_rx_max_min_len_errs.attr, + &dev_attr_rx_p0_hdr_egr_ovfls.attr, + &dev_attr_rx_p1_hdr_egr_ovfls.attr, + &dev_attr_rx_p2_hdr_egr_ovfls.attr, + &dev_attr_rx_p3_hdr_egr_ovfls.attr, + &dev_attr_rx_p4_hdr_egr_ovfls.attr, + &dev_attr_rx_p5_hdr_egr_ovfls.attr, + &dev_attr_rx_p6_hdr_egr_ovfls.attr, + &dev_attr_rx_p7_hdr_egr_ovfls.attr, + &dev_attr_rx_p8_hdr_egr_ovfls.attr, + &dev_attr_rx_pkey_mismatches.attr, + &dev_attr_rx_tid_full_errs.attr, + &dev_attr_rx_tid_valid_errs.attr, + &dev_attr_rx_vcrc_errs.attr, + &dev_attr_tx_data_pkts.attr, + &dev_attr_tx_dropped_pkts.attr, + &dev_attr_tx_dwords.attr, + &dev_attr_tx_flow_pkts.attr, + 
&dev_attr_tx_flow_stalls.attr, + &dev_attr_tx_len_errs.attr, + &dev_attr_tx_max_min_len_errs.attr, + &dev_attr_tx_underruns.attr, + &dev_attr_tx_unsup_vl_errs.attr, + NULL +}; + +static struct attribute_group dev_counter_attr_group = { + .name = "counters", + .attrs = dev_counter_attributes +}; + +static ssize_t store_reset(struct device *dev, + struct device_attribute *attr, + const char *buf, + size_t count) +{ + struct ipath_devdata *dd = dev_get_drvdata(dev); + int ret; + + if (count < 5 || memcmp(buf, "reset", 5)) { + ret = -EINVAL; + goto bail; + } + + if (dd->ipath_flags & IPATH_DISABLED) { + /* + * post-reset init would re-enable interrupts, etc. + * so don't allow reset on disabled devices. Not + * perfect error, but about the best choice. + */ + dev_info(dev,"Unit %d is disabled, can't reset\n", + dd->ipath_unit); + ret = -EINVAL; + goto bail; + } + ret = ipath_reset_device(dd->ipath_unit); +bail: + return ret<0 ? ret : count; +} + +static ssize_t store_link_state(struct device *dev, + struct device_attribute *attr, + const char *buf, + size_t count) +{ + struct ipath_devdata *dd = dev_get_drvdata(dev); + int ret, r; + u16 state; + + ret = ipath_parse_ushort(buf, &state); + if (ret < 0) + goto invalid; + + r = ipath_set_linkstate(dd, state); + if (r < 0) { + ret = r; + goto bail; + } + + goto bail; +invalid: + ipath_dev_err(dd, "attempt to set invalid link state\n"); +bail: + return ret; +} + +static ssize_t show_mtu(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + struct ipath_devdata *dd = dev_get_drvdata(dev); + return scnprintf(buf, PAGE_SIZE, "%u\n", dd->ipath_ibmtu); +} + +static ssize_t store_mtu(struct device *dev, + struct device_attribute *attr, + const char *buf, + size_t count) +{ + struct ipath_devdata *dd = dev_get_drvdata(dev); + ssize_t ret; + u16 mtu = 0; + int r; + + ret = ipath_parse_ushort(buf, &mtu); + if (ret < 0) + goto invalid; + + r = ipath_set_mtu(dd, mtu); + if (r < 0) + ret = r; + + goto bail; +invalid: + ipath_dev_err(dd, "attempt to set invalid MTU\n"); +bail: + return ret; +} + +static ssize_t show_enabled(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + struct ipath_devdata *dd = dev_get_drvdata(dev); + return scnprintf(buf, PAGE_SIZE, "%u\n", + (dd->ipath_flags & IPATH_DISABLED) ? 
0 : 1); +} + +static ssize_t store_enabled(struct device *dev, + struct device_attribute *attr, + const char *buf, + size_t count) +{ + struct ipath_devdata *dd = dev_get_drvdata(dev); + ssize_t ret; + u16 enable = 0; + + ret = ipath_parse_ushort(buf, &enable); + if (ret < 0) { + ipath_dev_err(dd, "attempt to use non-numeric on enable\n"); + goto bail; + } + + if (enable) { + if (!(dd->ipath_flags & IPATH_DISABLED)) + goto bail; + + dev_info(dev, "Enabling unit %d\n", dd->ipath_unit); + /* same as post-reset */ + ret = ipath_init_chip(dd, 1); + if (ret) + ipath_dev_err(dd, "Failed to enable unit %d\n", + dd->ipath_unit); + else { + dd->ipath_flags &= ~IPATH_DISABLED; + *dd->ipath_statusp &= ~IPATH_STATUS_ADMIN_DISABLED; + } + } + else if (!(dd->ipath_flags & IPATH_DISABLED)) { + dev_info(dev, "Disabling unit %d\n", dd->ipath_unit); + ipath_shutdown_device(dd); + dd->ipath_flags |= IPATH_DISABLED; + *dd->ipath_statusp |= IPATH_STATUS_ADMIN_DISABLED; + } + +bail: + return ret; +} + +static ssize_t store_rx_pol_inv(struct device *dev, + struct device_attribute *attr, + const char *buf, + size_t count) +{ + struct ipath_devdata *dd = dev_get_drvdata(dev); + int ret, r; + u16 val; + + ret = ipath_parse_ushort(buf, &val); + if (ret < 0) + goto invalid; + + r = ipath_set_rx_pol_inv(dd, val); + if (r < 0) { + ret = r; + goto bail; + } + + goto bail; +invalid: + ipath_dev_err(dd, "attempt to set invalid Rx Polarity invert\n"); +bail: + return ret; +} + +static ssize_t store_led_override(struct device *dev, + struct device_attribute *attr, + const char *buf, + size_t count) +{ + struct ipath_devdata *dd = dev_get_drvdata(dev); + int ret; + u16 val; + + ret = ipath_parse_ushort(buf, &val); + if (ret > 0) + ipath_set_led_override(dd, val); + else + ipath_dev_err(dd, "attempt to set invalid LED override\n"); + return ret; +} + +static ssize_t show_logged_errs(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + struct ipath_devdata *dd = dev_get_drvdata(dev); + int idx, count; + + /* force consistency with actual EEPROM */ + if (ipath_update_eeprom_log(dd) != 0) + return -ENXIO; + + count = 0; + for (idx = 0; idx < IPATH_EEP_LOG_CNT; ++idx) { + count += scnprintf(buf + count, PAGE_SIZE - count, "%d%c", + dd->ipath_eep_st_errs[idx], + idx == (IPATH_EEP_LOG_CNT - 1) ? '\n' : ' '); + } + + return count; +} + +/* + * New sysfs entries to control various IB config. These all turn into + * accesses via ipath_f_get/set_ib_cfg. + * + * Get/Set heartbeat enable. Or of 1=enabled, 2=auto + */ +static ssize_t show_hrtbt_enb(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + struct ipath_devdata *dd = dev_get_drvdata(dev); + int ret; + + ret = dd->ipath_f_get_ib_cfg(dd, IPATH_IB_CFG_HRTBT); + if (ret >= 0) + ret = scnprintf(buf, PAGE_SIZE, "%d\n", ret); + return ret; +} + +static ssize_t store_hrtbt_enb(struct device *dev, + struct device_attribute *attr, + const char *buf, + size_t count) +{ + struct ipath_devdata *dd = dev_get_drvdata(dev); + int ret, r; + u16 val; + + ret = ipath_parse_ushort(buf, &val); + if (ret >= 0 && val > 3) + ret = -EINVAL; + if (ret < 0) { + ipath_dev_err(dd, "attempt to set invalid Heartbeat enable\n"); + goto bail; + } + + /* + * Set the "intentional" heartbeat enable per either of + * "Enable" and "Auto", as these are normally set together. + * This bit is consulted when leaving loopback mode, + * because entering loopback mode overrides it and automatically + * disables heartbeat. 
+ */ + r = dd->ipath_f_set_ib_cfg(dd, IPATH_IB_CFG_HRTBT, val); + if (r < 0) + ret = r; + else if (val == IPATH_IB_HRTBT_OFF) + dd->ipath_flags |= IPATH_NO_HRTBT; + else + dd->ipath_flags &= ~IPATH_NO_HRTBT; + +bail: + return ret; +} + +/* + * Get/Set Link-widths enabled. Or of 1=1x, 2=4x (this is human/IB centric, + * _not_ the particular encoding of any given chip) + */ +static ssize_t show_lwid_enb(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + struct ipath_devdata *dd = dev_get_drvdata(dev); + int ret; + + ret = dd->ipath_f_get_ib_cfg(dd, IPATH_IB_CFG_LWID_ENB); + if (ret >= 0) + ret = scnprintf(buf, PAGE_SIZE, "%d\n", ret); + return ret; +} + +static ssize_t store_lwid_enb(struct device *dev, + struct device_attribute *attr, + const char *buf, + size_t count) +{ + struct ipath_devdata *dd = dev_get_drvdata(dev); + int ret, r; + u16 val; + + ret = ipath_parse_ushort(buf, &val); + if (ret >= 0 && (val == 0 || val > 3)) + ret = -EINVAL; + if (ret < 0) { + ipath_dev_err(dd, + "attempt to set invalid Link Width (enable)\n"); + goto bail; + } + + r = dd->ipath_f_set_ib_cfg(dd, IPATH_IB_CFG_LWID_ENB, val); + if (r < 0) + ret = r; + +bail: + return ret; +} + +/* Get current link width */ +static ssize_t show_lwid(struct device *dev, + struct device_attribute *attr, + char *buf) + +{ + struct ipath_devdata *dd = dev_get_drvdata(dev); + int ret; + + ret = dd->ipath_f_get_ib_cfg(dd, IPATH_IB_CFG_LWID); + if (ret >= 0) + ret = scnprintf(buf, PAGE_SIZE, "%d\n", ret); + return ret; +} + +/* + * Get/Set Link-speeds enabled. Or of 1=SDR 2=DDR. + */ +static ssize_t show_spd_enb(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + struct ipath_devdata *dd = dev_get_drvdata(dev); + int ret; + + ret = dd->ipath_f_get_ib_cfg(dd, IPATH_IB_CFG_SPD_ENB); + if (ret >= 0) + ret = scnprintf(buf, PAGE_SIZE, "%d\n", ret); + return ret; +} + +static ssize_t store_spd_enb(struct device *dev, + struct device_attribute *attr, + const char *buf, + size_t count) +{ + struct ipath_devdata *dd = dev_get_drvdata(dev); + int ret, r; + u16 val; + + ret = ipath_parse_ushort(buf, &val); + if (ret >= 0 && (val == 0 || val > (IPATH_IB_SDR | IPATH_IB_DDR))) + ret = -EINVAL; + if (ret < 0) { + ipath_dev_err(dd, + "attempt to set invalid Link Speed (enable)\n"); + goto bail; + } + + r = dd->ipath_f_set_ib_cfg(dd, IPATH_IB_CFG_SPD_ENB, val); + if (r < 0) + ret = r; + +bail: + return ret; +} + +/* Get current link speed */ +static ssize_t show_spd(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + struct ipath_devdata *dd = dev_get_drvdata(dev); + int ret; + + ret = dd->ipath_f_get_ib_cfg(dd, IPATH_IB_CFG_SPD); + if (ret >= 0) + ret = scnprintf(buf, PAGE_SIZE, "%d\n", ret); + return ret; +} + +/* + * Get/Set RX polarity-invert enable. 0=no, 1=yes. 
+ */ +static ssize_t show_rx_polinv_enb(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + struct ipath_devdata *dd = dev_get_drvdata(dev); + int ret; + + ret = dd->ipath_f_get_ib_cfg(dd, IPATH_IB_CFG_RXPOL_ENB); + if (ret >= 0) + ret = scnprintf(buf, PAGE_SIZE, "%d\n", ret); + return ret; +} + +static ssize_t store_rx_polinv_enb(struct device *dev, + struct device_attribute *attr, + const char *buf, + size_t count) +{ + struct ipath_devdata *dd = dev_get_drvdata(dev); + int ret, r; + u16 val; + + ret = ipath_parse_ushort(buf, &val); + if (ret >= 0 && val > 1) { + ipath_dev_err(dd, + "attempt to set invalid Rx Polarity (enable)\n"); + ret = -EINVAL; + goto bail; + } + + r = dd->ipath_f_set_ib_cfg(dd, IPATH_IB_CFG_RXPOL_ENB, val); + if (r < 0) + ret = r; + +bail: + return ret; +} + +/* + * Get/Set RX lane-reversal enable. 0=no, 1=yes. + */ +static ssize_t show_lanerev_enb(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + struct ipath_devdata *dd = dev_get_drvdata(dev); + int ret; + + ret = dd->ipath_f_get_ib_cfg(dd, IPATH_IB_CFG_LREV_ENB); + if (ret >= 0) + ret = scnprintf(buf, PAGE_SIZE, "%d\n", ret); + return ret; +} + +static ssize_t store_lanerev_enb(struct device *dev, + struct device_attribute *attr, + const char *buf, + size_t count) +{ + struct ipath_devdata *dd = dev_get_drvdata(dev); + int ret, r; + u16 val; + + ret = ipath_parse_ushort(buf, &val); + if (ret >= 0 && val > 1) { + ret = -EINVAL; + ipath_dev_err(dd, + "attempt to set invalid Lane reversal (enable)\n"); + goto bail; + } + + r = dd->ipath_f_set_ib_cfg(dd, IPATH_IB_CFG_LREV_ENB, val); + if (r < 0) + ret = r; + +bail: + return ret; +} + +static DRIVER_ATTR(num_units, S_IRUGO, show_num_units, NULL); +static DRIVER_ATTR(version, S_IRUGO, show_version, NULL); + +static struct attribute *driver_attributes[] = { + &driver_attr_num_units.attr, + &driver_attr_version.attr, + NULL +}; + +static struct attribute_group driver_attr_group = { + .attrs = driver_attributes +}; + +static ssize_t store_tempsense(struct device *dev, + struct device_attribute *attr, + const char *buf, + size_t count) +{ + struct ipath_devdata *dd = dev_get_drvdata(dev); + int ret, stat; + u16 val; + + ret = ipath_parse_ushort(buf, &val); + if (ret <= 0) { + ipath_dev_err(dd, "attempt to set invalid tempsense config\n"); + goto bail; + } + /* If anything but the highest limit, enable T_CRIT_A "interrupt" */ + stat = ipath_tempsense_write(dd, 9, (val == 0x7f7f) ? 0x80 : 0); + if (stat) { + ipath_dev_err(dd, "Unable to set tempsense config\n"); + ret = -1; + goto bail; + } + stat = ipath_tempsense_write(dd, 0xB, (u8) (val & 0xFF)); + if (stat) { + ipath_dev_err(dd, "Unable to set local Tcrit\n"); + ret = -1; + goto bail; + } + stat = ipath_tempsense_write(dd, 0xD, (u8) (val >> 8)); + if (stat) { + ipath_dev_err(dd, "Unable to set remote Tcrit\n"); + ret = -1; + goto bail; + } + +bail: + return ret; +} + +/* + * dump tempsense regs. in decimal, to ease shell-scripts. 
+ */ +static ssize_t show_tempsense(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + struct ipath_devdata *dd = dev_get_drvdata(dev); + int ret; + int idx; + u8 regvals[8]; + + ret = -ENXIO; + for (idx = 0; idx < 8; ++idx) { + if (idx == 6) + continue; + ret = ipath_tempsense_read(dd, idx); + if (ret < 0) + break; + regvals[idx] = ret; + } + if (idx == 8) + ret = scnprintf(buf, PAGE_SIZE, "%d %d %02X %02X %d %d\n", + *(signed char *)(regvals), + *(signed char *)(regvals + 1), + regvals[2], regvals[3], + *(signed char *)(regvals + 5), + *(signed char *)(regvals + 7)); + return ret; +} + +const struct attribute_group *ipath_driver_attr_groups[] = { + &driver_attr_group, + NULL, +}; + +static DEVICE_ATTR(guid, S_IWUSR | S_IRUGO, show_guid, store_guid); +static DEVICE_ATTR(lmc, S_IWUSR | S_IRUGO, show_lmc, store_lmc); +static DEVICE_ATTR(lid, S_IWUSR | S_IRUGO, show_lid, store_lid); +static DEVICE_ATTR(link_state, S_IWUSR, NULL, store_link_state); +static DEVICE_ATTR(mlid, S_IWUSR | S_IRUGO, show_mlid, store_mlid); +static DEVICE_ATTR(mtu, S_IWUSR | S_IRUGO, show_mtu, store_mtu); +static DEVICE_ATTR(enabled, S_IWUSR | S_IRUGO, show_enabled, store_enabled); +static DEVICE_ATTR(nguid, S_IRUGO, show_nguid, NULL); +static DEVICE_ATTR(nports, S_IRUGO, show_nports, NULL); +static DEVICE_ATTR(reset, S_IWUSR, NULL, store_reset); +static DEVICE_ATTR(serial, S_IRUGO, show_serial, NULL); +static DEVICE_ATTR(status, S_IRUGO, show_status, NULL); +static DEVICE_ATTR(status_str, S_IRUGO, show_status_str, NULL); +static DEVICE_ATTR(boardversion, S_IRUGO, show_boardversion, NULL); +static DEVICE_ATTR(unit, S_IRUGO, show_unit, NULL); +static DEVICE_ATTR(rx_pol_inv, S_IWUSR, NULL, store_rx_pol_inv); +static DEVICE_ATTR(led_override, S_IWUSR, NULL, store_led_override); +static DEVICE_ATTR(logged_errors, S_IRUGO, show_logged_errs, NULL); +static DEVICE_ATTR(localbus_info, S_IRUGO, show_localbus_info, NULL); +static DEVICE_ATTR(jint_max_packets, S_IWUSR | S_IRUGO, + show_jint_max_packets, store_jint_max_packets); +static DEVICE_ATTR(jint_idle_ticks, S_IWUSR | S_IRUGO, + show_jint_idle_ticks, store_jint_idle_ticks); +static DEVICE_ATTR(tempsense, S_IWUSR | S_IRUGO, + show_tempsense, store_tempsense); + +static struct attribute *dev_attributes[] = { + &dev_attr_guid.attr, + &dev_attr_lmc.attr, + &dev_attr_lid.attr, + &dev_attr_link_state.attr, + &dev_attr_mlid.attr, + &dev_attr_mtu.attr, + &dev_attr_nguid.attr, + &dev_attr_nports.attr, + &dev_attr_serial.attr, + &dev_attr_status.attr, + &dev_attr_status_str.attr, + &dev_attr_boardversion.attr, + &dev_attr_unit.attr, + &dev_attr_enabled.attr, + &dev_attr_rx_pol_inv.attr, + &dev_attr_led_override.attr, + &dev_attr_logged_errors.attr, + &dev_attr_tempsense.attr, + &dev_attr_localbus_info.attr, + NULL +}; + +static struct attribute_group dev_attr_group = { + .attrs = dev_attributes +}; + +static DEVICE_ATTR(hrtbt_enable, S_IWUSR | S_IRUGO, show_hrtbt_enb, + store_hrtbt_enb); +static DEVICE_ATTR(link_width_enable, S_IWUSR | S_IRUGO, show_lwid_enb, + store_lwid_enb); +static DEVICE_ATTR(link_width, S_IRUGO, show_lwid, NULL); +static DEVICE_ATTR(link_speed_enable, S_IWUSR | S_IRUGO, show_spd_enb, + store_spd_enb); +static DEVICE_ATTR(link_speed, S_IRUGO, show_spd, NULL); +static DEVICE_ATTR(rx_pol_inv_enable, S_IWUSR | S_IRUGO, show_rx_polinv_enb, + store_rx_polinv_enb); +static DEVICE_ATTR(rx_lane_rev_enable, S_IWUSR | S_IRUGO, show_lanerev_enb, + store_lanerev_enb); + +static struct attribute *dev_ibcfg_attributes[] = { + &dev_attr_hrtbt_enable.attr, 
+ &dev_attr_link_width_enable.attr, + &dev_attr_link_width.attr, + &dev_attr_link_speed_enable.attr, + &dev_attr_link_speed.attr, + &dev_attr_rx_pol_inv_enable.attr, + &dev_attr_rx_lane_rev_enable.attr, + NULL +}; + +static struct attribute_group dev_ibcfg_attr_group = { + .attrs = dev_ibcfg_attributes +}; + +/** + * ipath_expose_reset - create a device reset file + * @dev: the device structure + * + * Only expose a file that lets us reset the device after someone + * enters diag mode. A device reset is quite likely to crash the + * machine entirely, so we don't want to normally make it + * available. + * + * Called with ipath_mutex held. + */ +int ipath_expose_reset(struct device *dev) +{ + static int exposed; + int ret; + + if (!exposed) { + ret = device_create_file(dev, &dev_attr_reset); + exposed = 1; + } + else + ret = 0; + + return ret; +} + +int ipath_device_create_group(struct device *dev, struct ipath_devdata *dd) +{ + int ret; + + ret = sysfs_create_group(&dev->kobj, &dev_attr_group); + if (ret) + goto bail; + + ret = sysfs_create_group(&dev->kobj, &dev_counter_attr_group); + if (ret) + goto bail_attrs; + + if (dd->ipath_flags & IPATH_HAS_MULT_IB_SPEED) { + ret = device_create_file(dev, &dev_attr_jint_idle_ticks); + if (ret) + goto bail_counter; + ret = device_create_file(dev, &dev_attr_jint_max_packets); + if (ret) + goto bail_idle; + + ret = sysfs_create_group(&dev->kobj, &dev_ibcfg_attr_group); + if (ret) + goto bail_max; + } + + return 0; + +bail_max: + device_remove_file(dev, &dev_attr_jint_max_packets); +bail_idle: + device_remove_file(dev, &dev_attr_jint_idle_ticks); +bail_counter: + sysfs_remove_group(&dev->kobj, &dev_counter_attr_group); +bail_attrs: + sysfs_remove_group(&dev->kobj, &dev_attr_group); +bail: + return ret; +} + +void ipath_device_remove_group(struct device *dev, struct ipath_devdata *dd) +{ + sysfs_remove_group(&dev->kobj, &dev_counter_attr_group); + + if (dd->ipath_flags & IPATH_HAS_MULT_IB_SPEED) { + sysfs_remove_group(&dev->kobj, &dev_ibcfg_attr_group); + device_remove_file(dev, &dev_attr_jint_idle_ticks); + device_remove_file(dev, &dev_attr_jint_max_packets); + } + + sysfs_remove_group(&dev->kobj, &dev_attr_group); + + device_remove_file(dev, &dev_attr_reset); +} diff --git a/kernel/drivers/infiniband/hw/ipath/ipath_uc.c b/kernel/drivers/infiniband/hw/ipath/ipath_uc.c new file mode 100644 index 000000000..22e60998f --- /dev/null +++ b/kernel/drivers/infiniband/hw/ipath/ipath_uc.c @@ -0,0 +1,547 @@ +/* + * Copyright (c) 2006, 2007, 2008 QLogic Corporation. All rights reserved. + * Copyright (c) 2005, 2006 PathScale, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "ipath_verbs.h" +#include "ipath_kernel.h" + +/* cut down ridiculously long IB macro names */ +#define OP(x) IB_OPCODE_UC_##x + +/** + * ipath_make_uc_req - construct a request packet (SEND, RDMA write) + * @qp: a pointer to the QP + * + * Return 1 if constructed; otherwise, return 0. + */ +int ipath_make_uc_req(struct ipath_qp *qp) +{ + struct ipath_other_headers *ohdr; + struct ipath_swqe *wqe; + unsigned long flags; + u32 hwords; + u32 bth0; + u32 len; + u32 pmtu = ib_mtu_enum_to_int(qp->path_mtu); + int ret = 0; + + spin_lock_irqsave(&qp->s_lock, flags); + + if (!(ib_ipath_state_ops[qp->state] & IPATH_PROCESS_SEND_OK)) { + if (!(ib_ipath_state_ops[qp->state] & IPATH_FLUSH_SEND)) + goto bail; + /* We are in the error state, flush the work request. */ + if (qp->s_last == qp->s_head) + goto bail; + /* If DMAs are in progress, we can't flush immediately. */ + if (atomic_read(&qp->s_dma_busy)) { + qp->s_flags |= IPATH_S_WAIT_DMA; + goto bail; + } + wqe = get_swqe_ptr(qp, qp->s_last); + ipath_send_complete(qp, wqe, IB_WC_WR_FLUSH_ERR); + goto done; + } + + ohdr = &qp->s_hdr.u.oth; + if (qp->remote_ah_attr.ah_flags & IB_AH_GRH) + ohdr = &qp->s_hdr.u.l.oth; + + /* header size in 32-bit words LRH+BTH = (8+12)/4. */ + hwords = 5; + bth0 = 1 << 22; /* Set M bit */ + + /* Get the next send request. */ + wqe = get_swqe_ptr(qp, qp->s_cur); + qp->s_wqe = NULL; + switch (qp->s_state) { + default: + if (!(ib_ipath_state_ops[qp->state] & + IPATH_PROCESS_NEXT_SEND_OK)) + goto bail; + /* Check if send work queue is empty. */ + if (qp->s_cur == qp->s_head) + goto bail; + /* + * Start a new request. 
+ */ + qp->s_psn = wqe->psn = qp->s_next_psn; + qp->s_sge.sge = wqe->sg_list[0]; + qp->s_sge.sg_list = wqe->sg_list + 1; + qp->s_sge.num_sge = wqe->wr.num_sge; + qp->s_len = len = wqe->length; + switch (wqe->wr.opcode) { + case IB_WR_SEND: + case IB_WR_SEND_WITH_IMM: + if (len > pmtu) { + qp->s_state = OP(SEND_FIRST); + len = pmtu; + break; + } + if (wqe->wr.opcode == IB_WR_SEND) + qp->s_state = OP(SEND_ONLY); + else { + qp->s_state = + OP(SEND_ONLY_WITH_IMMEDIATE); + /* Immediate data comes after the BTH */ + ohdr->u.imm_data = wqe->wr.ex.imm_data; + hwords += 1; + } + if (wqe->wr.send_flags & IB_SEND_SOLICITED) + bth0 |= 1 << 23; + qp->s_wqe = wqe; + if (++qp->s_cur >= qp->s_size) + qp->s_cur = 0; + break; + + case IB_WR_RDMA_WRITE: + case IB_WR_RDMA_WRITE_WITH_IMM: + ohdr->u.rc.reth.vaddr = + cpu_to_be64(wqe->wr.wr.rdma.remote_addr); + ohdr->u.rc.reth.rkey = + cpu_to_be32(wqe->wr.wr.rdma.rkey); + ohdr->u.rc.reth.length = cpu_to_be32(len); + hwords += sizeof(struct ib_reth) / 4; + if (len > pmtu) { + qp->s_state = OP(RDMA_WRITE_FIRST); + len = pmtu; + break; + } + if (wqe->wr.opcode == IB_WR_RDMA_WRITE) + qp->s_state = OP(RDMA_WRITE_ONLY); + else { + qp->s_state = + OP(RDMA_WRITE_ONLY_WITH_IMMEDIATE); + /* Immediate data comes after the RETH */ + ohdr->u.rc.imm_data = wqe->wr.ex.imm_data; + hwords += 1; + if (wqe->wr.send_flags & IB_SEND_SOLICITED) + bth0 |= 1 << 23; + } + qp->s_wqe = wqe; + if (++qp->s_cur >= qp->s_size) + qp->s_cur = 0; + break; + + default: + goto bail; + } + break; + + case OP(SEND_FIRST): + qp->s_state = OP(SEND_MIDDLE); + /* FALLTHROUGH */ + case OP(SEND_MIDDLE): + len = qp->s_len; + if (len > pmtu) { + len = pmtu; + break; + } + if (wqe->wr.opcode == IB_WR_SEND) + qp->s_state = OP(SEND_LAST); + else { + qp->s_state = OP(SEND_LAST_WITH_IMMEDIATE); + /* Immediate data comes after the BTH */ + ohdr->u.imm_data = wqe->wr.ex.imm_data; + hwords += 1; + } + if (wqe->wr.send_flags & IB_SEND_SOLICITED) + bth0 |= 1 << 23; + qp->s_wqe = wqe; + if (++qp->s_cur >= qp->s_size) + qp->s_cur = 0; + break; + + case OP(RDMA_WRITE_FIRST): + qp->s_state = OP(RDMA_WRITE_MIDDLE); + /* FALLTHROUGH */ + case OP(RDMA_WRITE_MIDDLE): + len = qp->s_len; + if (len > pmtu) { + len = pmtu; + break; + } + if (wqe->wr.opcode == IB_WR_RDMA_WRITE) + qp->s_state = OP(RDMA_WRITE_LAST); + else { + qp->s_state = + OP(RDMA_WRITE_LAST_WITH_IMMEDIATE); + /* Immediate data comes after the BTH */ + ohdr->u.imm_data = wqe->wr.ex.imm_data; + hwords += 1; + if (wqe->wr.send_flags & IB_SEND_SOLICITED) + bth0 |= 1 << 23; + } + qp->s_wqe = wqe; + if (++qp->s_cur >= qp->s_size) + qp->s_cur = 0; + break; + } + qp->s_len -= len; + qp->s_hdrwords = hwords; + qp->s_cur_sge = &qp->s_sge; + qp->s_cur_size = len; + ipath_make_ruc_header(to_idev(qp->ibqp.device), + qp, ohdr, bth0 | (qp->s_state << 24), + qp->s_next_psn++ & IPATH_PSN_MASK); +done: + ret = 1; + goto unlock; + +bail: + qp->s_flags &= ~IPATH_S_BUSY; +unlock: + spin_unlock_irqrestore(&qp->s_lock, flags); + return ret; +} + +/** + * ipath_uc_rcv - handle an incoming UC packet + * @dev: the device the packet came in on + * @hdr: the header of the packet + * @has_grh: true if the packet has a GRH + * @data: the packet data + * @tlen: the length of the packet + * @qp: the QP for this packet. + * + * This is called from ipath_qp_rcv() to process an incoming UC packet + * for the given QP. + * Called at interrupt level. 
+ */ +void ipath_uc_rcv(struct ipath_ibdev *dev, struct ipath_ib_header *hdr, + int has_grh, void *data, u32 tlen, struct ipath_qp *qp) +{ + struct ipath_other_headers *ohdr; + int opcode; + u32 hdrsize; + u32 psn; + u32 pad; + struct ib_wc wc; + u32 pmtu = ib_mtu_enum_to_int(qp->path_mtu); + struct ib_reth *reth; + int header_in_data; + + /* Validate the SLID. See Ch. 9.6.1.5 */ + if (unlikely(be16_to_cpu(hdr->lrh[3]) != qp->remote_ah_attr.dlid)) + goto done; + + /* Check for GRH */ + if (!has_grh) { + ohdr = &hdr->u.oth; + hdrsize = 8 + 12; /* LRH + BTH */ + psn = be32_to_cpu(ohdr->bth[2]); + header_in_data = 0; + } else { + ohdr = &hdr->u.l.oth; + hdrsize = 8 + 40 + 12; /* LRH + GRH + BTH */ + /* + * The header with GRH is 60 bytes and the + * core driver sets the eager header buffer + * size to 56 bytes so the last 4 bytes of + * the BTH header (PSN) is in the data buffer. + */ + header_in_data = dev->dd->ipath_rcvhdrentsize == 16; + if (header_in_data) { + psn = be32_to_cpu(((__be32 *) data)[0]); + data += sizeof(__be32); + } else + psn = be32_to_cpu(ohdr->bth[2]); + } + /* + * The opcode is in the low byte when its in network order + * (top byte when in host order). + */ + opcode = be32_to_cpu(ohdr->bth[0]) >> 24; + + memset(&wc, 0, sizeof wc); + + /* Compare the PSN verses the expected PSN. */ + if (unlikely(ipath_cmp24(psn, qp->r_psn) != 0)) { + /* + * Handle a sequence error. + * Silently drop any current message. + */ + qp->r_psn = psn; + inv: + qp->r_state = OP(SEND_LAST); + switch (opcode) { + case OP(SEND_FIRST): + case OP(SEND_ONLY): + case OP(SEND_ONLY_WITH_IMMEDIATE): + goto send_first; + + case OP(RDMA_WRITE_FIRST): + case OP(RDMA_WRITE_ONLY): + case OP(RDMA_WRITE_ONLY_WITH_IMMEDIATE): + goto rdma_first; + + default: + dev->n_pkt_drops++; + goto done; + } + } + + /* Check for opcode sequence errors. */ + switch (qp->r_state) { + case OP(SEND_FIRST): + case OP(SEND_MIDDLE): + if (opcode == OP(SEND_MIDDLE) || + opcode == OP(SEND_LAST) || + opcode == OP(SEND_LAST_WITH_IMMEDIATE)) + break; + goto inv; + + case OP(RDMA_WRITE_FIRST): + case OP(RDMA_WRITE_MIDDLE): + if (opcode == OP(RDMA_WRITE_MIDDLE) || + opcode == OP(RDMA_WRITE_LAST) || + opcode == OP(RDMA_WRITE_LAST_WITH_IMMEDIATE)) + break; + goto inv; + + default: + if (opcode == OP(SEND_FIRST) || + opcode == OP(SEND_ONLY) || + opcode == OP(SEND_ONLY_WITH_IMMEDIATE) || + opcode == OP(RDMA_WRITE_FIRST) || + opcode == OP(RDMA_WRITE_ONLY) || + opcode == OP(RDMA_WRITE_ONLY_WITH_IMMEDIATE)) + break; + goto inv; + } + + /* OK, process the packet. */ + switch (opcode) { + case OP(SEND_FIRST): + case OP(SEND_ONLY): + case OP(SEND_ONLY_WITH_IMMEDIATE): + send_first: + if (qp->r_flags & IPATH_R_REUSE_SGE) { + qp->r_flags &= ~IPATH_R_REUSE_SGE; + qp->r_sge = qp->s_rdma_read_sge; + } else if (!ipath_get_rwqe(qp, 0)) { + dev->n_pkt_drops++; + goto done; + } + /* Save the WQE so we can reuse it in case of an error. */ + qp->s_rdma_read_sge = qp->r_sge; + qp->r_rcv_len = 0; + if (opcode == OP(SEND_ONLY)) + goto send_last; + else if (opcode == OP(SEND_ONLY_WITH_IMMEDIATE)) + goto send_last_imm; + /* FALLTHROUGH */ + case OP(SEND_MIDDLE): + /* Check for invalid length PMTU or posted rwqe len. 
*/ + if (unlikely(tlen != (hdrsize + pmtu + 4))) { + qp->r_flags |= IPATH_R_REUSE_SGE; + dev->n_pkt_drops++; + goto done; + } + qp->r_rcv_len += pmtu; + if (unlikely(qp->r_rcv_len > qp->r_len)) { + qp->r_flags |= IPATH_R_REUSE_SGE; + dev->n_pkt_drops++; + goto done; + } + ipath_copy_sge(&qp->r_sge, data, pmtu); + break; + + case OP(SEND_LAST_WITH_IMMEDIATE): + send_last_imm: + if (header_in_data) { + wc.ex.imm_data = *(__be32 *) data; + data += sizeof(__be32); + } else { + /* Immediate data comes after BTH */ + wc.ex.imm_data = ohdr->u.imm_data; + } + hdrsize += 4; + wc.wc_flags = IB_WC_WITH_IMM; + /* FALLTHROUGH */ + case OP(SEND_LAST): + send_last: + /* Get the number of bytes the message was padded by. */ + pad = (be32_to_cpu(ohdr->bth[0]) >> 20) & 3; + /* Check for invalid length. */ + /* XXX LAST len should be >= 1 */ + if (unlikely(tlen < (hdrsize + pad + 4))) { + qp->r_flags |= IPATH_R_REUSE_SGE; + dev->n_pkt_drops++; + goto done; + } + /* Don't count the CRC. */ + tlen -= (hdrsize + pad + 4); + wc.byte_len = tlen + qp->r_rcv_len; + if (unlikely(wc.byte_len > qp->r_len)) { + qp->r_flags |= IPATH_R_REUSE_SGE; + dev->n_pkt_drops++; + goto done; + } + wc.opcode = IB_WC_RECV; + last_imm: + ipath_copy_sge(&qp->r_sge, data, tlen); + wc.wr_id = qp->r_wr_id; + wc.status = IB_WC_SUCCESS; + wc.qp = &qp->ibqp; + wc.src_qp = qp->remote_qpn; + wc.slid = qp->remote_ah_attr.dlid; + wc.sl = qp->remote_ah_attr.sl; + /* Signal completion event if the solicited bit is set. */ + ipath_cq_enter(to_icq(qp->ibqp.recv_cq), &wc, + (ohdr->bth[0] & + cpu_to_be32(1 << 23)) != 0); + break; + + case OP(RDMA_WRITE_FIRST): + case OP(RDMA_WRITE_ONLY): + case OP(RDMA_WRITE_ONLY_WITH_IMMEDIATE): /* consume RWQE */ + rdma_first: + /* RETH comes after BTH */ + if (!header_in_data) + reth = &ohdr->u.rc.reth; + else { + reth = (struct ib_reth *)data; + data += sizeof(*reth); + } + hdrsize += sizeof(*reth); + qp->r_len = be32_to_cpu(reth->length); + qp->r_rcv_len = 0; + if (qp->r_len != 0) { + u32 rkey = be32_to_cpu(reth->rkey); + u64 vaddr = be64_to_cpu(reth->vaddr); + int ok; + + /* Check rkey */ + ok = ipath_rkey_ok(qp, &qp->r_sge, qp->r_len, + vaddr, rkey, + IB_ACCESS_REMOTE_WRITE); + if (unlikely(!ok)) { + dev->n_pkt_drops++; + goto done; + } + } else { + qp->r_sge.sg_list = NULL; + qp->r_sge.sge.mr = NULL; + qp->r_sge.sge.vaddr = NULL; + qp->r_sge.sge.length = 0; + qp->r_sge.sge.sge_length = 0; + } + if (unlikely(!(qp->qp_access_flags & + IB_ACCESS_REMOTE_WRITE))) { + dev->n_pkt_drops++; + goto done; + } + if (opcode == OP(RDMA_WRITE_ONLY)) + goto rdma_last; + else if (opcode == OP(RDMA_WRITE_ONLY_WITH_IMMEDIATE)) + goto rdma_last_imm; + /* FALLTHROUGH */ + case OP(RDMA_WRITE_MIDDLE): + /* Check for invalid length PMTU or posted rwqe len. */ + if (unlikely(tlen != (hdrsize + pmtu + 4))) { + dev->n_pkt_drops++; + goto done; + } + qp->r_rcv_len += pmtu; + if (unlikely(qp->r_rcv_len > qp->r_len)) { + dev->n_pkt_drops++; + goto done; + } + ipath_copy_sge(&qp->r_sge, data, pmtu); + break; + + case OP(RDMA_WRITE_LAST_WITH_IMMEDIATE): + rdma_last_imm: + if (header_in_data) { + wc.ex.imm_data = *(__be32 *) data; + data += sizeof(__be32); + } else { + /* Immediate data comes after BTH */ + wc.ex.imm_data = ohdr->u.imm_data; + } + hdrsize += 4; + wc.wc_flags = IB_WC_WITH_IMM; + + /* Get the number of bytes the message was padded by. */ + pad = (be32_to_cpu(ohdr->bth[0]) >> 20) & 3; + /* Check for invalid length. 
*/ + /* XXX LAST len should be >= 1 */ + if (unlikely(tlen < (hdrsize + pad + 4))) { + dev->n_pkt_drops++; + goto done; + } + /* Don't count the CRC. */ + tlen -= (hdrsize + pad + 4); + if (unlikely(tlen + qp->r_rcv_len != qp->r_len)) { + dev->n_pkt_drops++; + goto done; + } + if (qp->r_flags & IPATH_R_REUSE_SGE) + qp->r_flags &= ~IPATH_R_REUSE_SGE; + else if (!ipath_get_rwqe(qp, 1)) { + dev->n_pkt_drops++; + goto done; + } + wc.byte_len = qp->r_len; + wc.opcode = IB_WC_RECV_RDMA_WITH_IMM; + goto last_imm; + + case OP(RDMA_WRITE_LAST): + rdma_last: + /* Get the number of bytes the message was padded by. */ + pad = (be32_to_cpu(ohdr->bth[0]) >> 20) & 3; + /* Check for invalid length. */ + /* XXX LAST len should be >= 1 */ + if (unlikely(tlen < (hdrsize + pad + 4))) { + dev->n_pkt_drops++; + goto done; + } + /* Don't count the CRC. */ + tlen -= (hdrsize + pad + 4); + if (unlikely(tlen + qp->r_rcv_len != qp->r_len)) { + dev->n_pkt_drops++; + goto done; + } + ipath_copy_sge(&qp->r_sge, data, tlen); + break; + + default: + /* Drop packet for unknown opcodes. */ + dev->n_pkt_drops++; + goto done; + } + qp->r_psn++; + qp->r_state = opcode; +done: + return; +} diff --git a/kernel/drivers/infiniband/hw/ipath/ipath_ud.c b/kernel/drivers/infiniband/hw/ipath/ipath_ud.c new file mode 100644 index 000000000..e8a2a9152 --- /dev/null +++ b/kernel/drivers/infiniband/hw/ipath/ipath_ud.c @@ -0,0 +1,580 @@ +/* + * Copyright (c) 2006, 2007, 2008 QLogic Corporation. All rights reserved. + * Copyright (c) 2005, 2006 PathScale, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include +#include + +#include "ipath_verbs.h" +#include "ipath_kernel.h" + +/** + * ipath_ud_loopback - handle send on loopback QPs + * @sqp: the sending QP + * @swqe: the send work request + * + * This is called from ipath_make_ud_req() to forward a WQE addressed + * to the same HCA. + * Note that the receive interrupt handler may be calling ipath_ud_rcv() + * while this is being called. 
+ */ +static void ipath_ud_loopback(struct ipath_qp *sqp, struct ipath_swqe *swqe) +{ + struct ipath_ibdev *dev = to_idev(sqp->ibqp.device); + struct ipath_qp *qp; + struct ib_ah_attr *ah_attr; + unsigned long flags; + struct ipath_rq *rq; + struct ipath_srq *srq; + struct ipath_sge_state rsge; + struct ipath_sge *sge; + struct ipath_rwq *wq; + struct ipath_rwqe *wqe; + void (*handler)(struct ib_event *, void *); + struct ib_wc wc; + u32 tail; + u32 rlen; + u32 length; + + qp = ipath_lookup_qpn(&dev->qp_table, swqe->wr.wr.ud.remote_qpn); + if (!qp || !(ib_ipath_state_ops[qp->state] & IPATH_PROCESS_RECV_OK)) { + dev->n_pkt_drops++; + goto done; + } + + /* + * Check that the qkey matches (except for QP0, see 9.6.1.4.1). + * Qkeys with the high order bit set mean use the + * qkey from the QP context instead of the WR (see 10.2.5). + */ + if (unlikely(qp->ibqp.qp_num && + ((int) swqe->wr.wr.ud.remote_qkey < 0 ? + sqp->qkey : swqe->wr.wr.ud.remote_qkey) != qp->qkey)) { + /* XXX OK to lose a count once in a while. */ + dev->qkey_violations++; + dev->n_pkt_drops++; + goto drop; + } + + /* + * A GRH is expected to precede the data even if not + * present on the wire. + */ + length = swqe->length; + memset(&wc, 0, sizeof wc); + wc.byte_len = length + sizeof(struct ib_grh); + + if (swqe->wr.opcode == IB_WR_SEND_WITH_IMM) { + wc.wc_flags = IB_WC_WITH_IMM; + wc.ex.imm_data = swqe->wr.ex.imm_data; + } + + /* + * This would be a lot simpler if we could call ipath_get_rwqe() + * but that uses state that the receive interrupt handler uses + * so we would need to lock out receive interrupts while doing + * local loopback. + */ + if (qp->ibqp.srq) { + srq = to_isrq(qp->ibqp.srq); + handler = srq->ibsrq.event_handler; + rq = &srq->rq; + } else { + srq = NULL; + handler = NULL; + rq = &qp->r_rq; + } + + /* + * Get the next work request entry to find where to put the data. + * Note that it is safe to drop the lock after changing rq->tail + * since ipath_post_receive() won't fill the empty slot. + */ + spin_lock_irqsave(&rq->lock, flags); + wq = rq->wq; + tail = wq->tail; + /* Validate tail before using it since it is user writable. */ + if (tail >= rq->size) + tail = 0; + if (unlikely(tail == wq->head)) { + spin_unlock_irqrestore(&rq->lock, flags); + dev->n_pkt_drops++; + goto drop; + } + wqe = get_rwqe_ptr(rq, tail); + rsge.sg_list = qp->r_ud_sg_list; + if (!ipath_init_sge(qp, wqe, &rlen, &rsge)) { + spin_unlock_irqrestore(&rq->lock, flags); + dev->n_pkt_drops++; + goto drop; + } + /* Silently drop packets which are too big. */ + if (wc.byte_len > rlen) { + spin_unlock_irqrestore(&rq->lock, flags); + dev->n_pkt_drops++; + goto drop; + } + if (++tail >= rq->size) + tail = 0; + wq->tail = tail; + wc.wr_id = wqe->wr_id; + if (handler) { + u32 n; + + /* + * validate head pointer value and compute + * the number of remaining WQEs. 
+ */ + n = wq->head; + if (n >= rq->size) + n = 0; + if (n < tail) + n += rq->size - tail; + else + n -= tail; + if (n < srq->limit) { + struct ib_event ev; + + srq->limit = 0; + spin_unlock_irqrestore(&rq->lock, flags); + ev.device = qp->ibqp.device; + ev.element.srq = qp->ibqp.srq; + ev.event = IB_EVENT_SRQ_LIMIT_REACHED; + handler(&ev, srq->ibsrq.srq_context); + } else + spin_unlock_irqrestore(&rq->lock, flags); + } else + spin_unlock_irqrestore(&rq->lock, flags); + + ah_attr = &to_iah(swqe->wr.wr.ud.ah)->attr; + if (ah_attr->ah_flags & IB_AH_GRH) { + ipath_copy_sge(&rsge, &ah_attr->grh, sizeof(struct ib_grh)); + wc.wc_flags |= IB_WC_GRH; + } else + ipath_skip_sge(&rsge, sizeof(struct ib_grh)); + sge = swqe->sg_list; + while (length) { + u32 len = sge->length; + + if (len > length) + len = length; + if (len > sge->sge_length) + len = sge->sge_length; + BUG_ON(len == 0); + ipath_copy_sge(&rsge, sge->vaddr, len); + sge->vaddr += len; + sge->length -= len; + sge->sge_length -= len; + if (sge->sge_length == 0) { + if (--swqe->wr.num_sge) + sge++; + } else if (sge->length == 0 && sge->mr != NULL) { + if (++sge->n >= IPATH_SEGSZ) { + if (++sge->m >= sge->mr->mapsz) + break; + sge->n = 0; + } + sge->vaddr = + sge->mr->map[sge->m]->segs[sge->n].vaddr; + sge->length = + sge->mr->map[sge->m]->segs[sge->n].length; + } + length -= len; + } + wc.status = IB_WC_SUCCESS; + wc.opcode = IB_WC_RECV; + wc.qp = &qp->ibqp; + wc.src_qp = sqp->ibqp.qp_num; + /* XXX do we know which pkey matched? Only needed for GSI. */ + wc.pkey_index = 0; + wc.slid = dev->dd->ipath_lid | + (ah_attr->src_path_bits & + ((1 << dev->dd->ipath_lmc) - 1)); + wc.sl = ah_attr->sl; + wc.dlid_path_bits = + ah_attr->dlid & ((1 << dev->dd->ipath_lmc) - 1); + wc.port_num = 1; + /* Signal completion event if the solicited bit is set. */ + ipath_cq_enter(to_icq(qp->ibqp.recv_cq), &wc, + swqe->wr.send_flags & IB_SEND_SOLICITED); +drop: + if (atomic_dec_and_test(&qp->refcount)) + wake_up(&qp->wait); +done:; +} + +/** + * ipath_make_ud_req - construct a UD request packet + * @qp: the QP + * + * Return 1 if constructed; otherwise, return 0. + */ +int ipath_make_ud_req(struct ipath_qp *qp) +{ + struct ipath_ibdev *dev = to_idev(qp->ibqp.device); + struct ipath_other_headers *ohdr; + struct ib_ah_attr *ah_attr; + struct ipath_swqe *wqe; + unsigned long flags; + u32 nwords; + u32 extra_bytes; + u32 bth0; + u16 lrh0; + u16 lid; + int ret = 0; + int next_cur; + + spin_lock_irqsave(&qp->s_lock, flags); + + if (!(ib_ipath_state_ops[qp->state] & IPATH_PROCESS_NEXT_SEND_OK)) { + if (!(ib_ipath_state_ops[qp->state] & IPATH_FLUSH_SEND)) + goto bail; + /* We are in the error state, flush the work request. */ + if (qp->s_last == qp->s_head) + goto bail; + /* If DMAs are in progress, we can't flush immediately. */ + if (atomic_read(&qp->s_dma_busy)) { + qp->s_flags |= IPATH_S_WAIT_DMA; + goto bail; + } + wqe = get_swqe_ptr(qp, qp->s_last); + ipath_send_complete(qp, wqe, IB_WC_WR_FLUSH_ERR); + goto done; + } + + if (qp->s_cur == qp->s_head) + goto bail; + + wqe = get_swqe_ptr(qp, qp->s_cur); + next_cur = qp->s_cur + 1; + if (next_cur >= qp->s_size) + next_cur = 0; + + /* Construct the header. 
*/ + ah_attr = &to_iah(wqe->wr.wr.ud.ah)->attr; + if (ah_attr->dlid >= IPATH_MULTICAST_LID_BASE) { + if (ah_attr->dlid != IPATH_PERMISSIVE_LID) + dev->n_multicast_xmit++; + else + dev->n_unicast_xmit++; + } else { + dev->n_unicast_xmit++; + lid = ah_attr->dlid & ~((1 << dev->dd->ipath_lmc) - 1); + if (unlikely(lid == dev->dd->ipath_lid)) { + /* + * If DMAs are in progress, we can't generate + * a completion for the loopback packet since + * it would be out of order. + * XXX Instead of waiting, we could queue a + * zero length descriptor so we get a callback. + */ + if (atomic_read(&qp->s_dma_busy)) { + qp->s_flags |= IPATH_S_WAIT_DMA; + goto bail; + } + qp->s_cur = next_cur; + spin_unlock_irqrestore(&qp->s_lock, flags); + ipath_ud_loopback(qp, wqe); + spin_lock_irqsave(&qp->s_lock, flags); + ipath_send_complete(qp, wqe, IB_WC_SUCCESS); + goto done; + } + } + + qp->s_cur = next_cur; + extra_bytes = -wqe->length & 3; + nwords = (wqe->length + extra_bytes) >> 2; + + /* header size in 32-bit words LRH+BTH+DETH = (8+12+8)/4. */ + qp->s_hdrwords = 7; + qp->s_cur_size = wqe->length; + qp->s_cur_sge = &qp->s_sge; + qp->s_dmult = ah_attr->static_rate; + qp->s_wqe = wqe; + qp->s_sge.sge = wqe->sg_list[0]; + qp->s_sge.sg_list = wqe->sg_list + 1; + qp->s_sge.num_sge = wqe->wr.num_sge; + + if (ah_attr->ah_flags & IB_AH_GRH) { + /* Header size in 32-bit words. */ + qp->s_hdrwords += ipath_make_grh(dev, &qp->s_hdr.u.l.grh, + &ah_attr->grh, + qp->s_hdrwords, nwords); + lrh0 = IPATH_LRH_GRH; + ohdr = &qp->s_hdr.u.l.oth; + /* + * Don't worry about sending to locally attached multicast + * QPs. It is unspecified by the spec. what happens. + */ + } else { + /* Header size in 32-bit words. */ + lrh0 = IPATH_LRH_BTH; + ohdr = &qp->s_hdr.u.oth; + } + if (wqe->wr.opcode == IB_WR_SEND_WITH_IMM) { + qp->s_hdrwords++; + ohdr->u.ud.imm_data = wqe->wr.ex.imm_data; + bth0 = IB_OPCODE_UD_SEND_ONLY_WITH_IMMEDIATE << 24; + } else + bth0 = IB_OPCODE_UD_SEND_ONLY << 24; + lrh0 |= ah_attr->sl << 4; + if (qp->ibqp.qp_type == IB_QPT_SMI) + lrh0 |= 0xF000; /* Set VL (see ch. 13.5.3.1) */ + qp->s_hdr.lrh[0] = cpu_to_be16(lrh0); + qp->s_hdr.lrh[1] = cpu_to_be16(ah_attr->dlid); /* DEST LID */ + qp->s_hdr.lrh[2] = cpu_to_be16(qp->s_hdrwords + nwords + + SIZE_OF_CRC); + lid = dev->dd->ipath_lid; + if (lid) { + lid |= ah_attr->src_path_bits & + ((1 << dev->dd->ipath_lmc) - 1); + qp->s_hdr.lrh[3] = cpu_to_be16(lid); + } else + qp->s_hdr.lrh[3] = IB_LID_PERMISSIVE; + if (wqe->wr.send_flags & IB_SEND_SOLICITED) + bth0 |= 1 << 23; + bth0 |= extra_bytes << 20; + bth0 |= qp->ibqp.qp_type == IB_QPT_SMI ? IPATH_DEFAULT_P_KEY : + ipath_get_pkey(dev->dd, qp->s_pkey_index); + ohdr->bth[0] = cpu_to_be32(bth0); + /* + * Use the multicast QP if the destination LID is a multicast LID. + */ + ohdr->bth[1] = ah_attr->dlid >= IPATH_MULTICAST_LID_BASE && + ah_attr->dlid != IPATH_PERMISSIVE_LID ? + cpu_to_be32(IPATH_MULTICAST_QPN) : + cpu_to_be32(wqe->wr.wr.ud.remote_qpn); + ohdr->bth[2] = cpu_to_be32(qp->s_next_psn++ & IPATH_PSN_MASK); + /* + * Qkeys with the high order bit set mean use the + * qkey from the QP context instead of the WR (see 10.2.5). + */ + ohdr->u.ud.deth[0] = cpu_to_be32((int)wqe->wr.wr.ud.remote_qkey < 0 ? 
+ qp->qkey : wqe->wr.wr.ud.remote_qkey); + ohdr->u.ud.deth[1] = cpu_to_be32(qp->ibqp.qp_num); + +done: + ret = 1; + goto unlock; + +bail: + qp->s_flags &= ~IPATH_S_BUSY; +unlock: + spin_unlock_irqrestore(&qp->s_lock, flags); + return ret; +} + +/** + * ipath_ud_rcv - receive an incoming UD packet + * @dev: the device the packet came in on + * @hdr: the packet header + * @has_grh: true if the packet has a GRH + * @data: the packet data + * @tlen: the packet length + * @qp: the QP the packet came on + * + * This is called from ipath_qp_rcv() to process an incoming UD packet + * for the given QP. + * Called at interrupt level. + */ +void ipath_ud_rcv(struct ipath_ibdev *dev, struct ipath_ib_header *hdr, + int has_grh, void *data, u32 tlen, struct ipath_qp *qp) +{ + struct ipath_other_headers *ohdr; + int opcode; + u32 hdrsize; + u32 pad; + struct ib_wc wc; + u32 qkey; + u32 src_qp; + u16 dlid; + int header_in_data; + + /* Check for GRH */ + if (!has_grh) { + ohdr = &hdr->u.oth; + hdrsize = 8 + 12 + 8; /* LRH + BTH + DETH */ + qkey = be32_to_cpu(ohdr->u.ud.deth[0]); + src_qp = be32_to_cpu(ohdr->u.ud.deth[1]); + header_in_data = 0; + } else { + ohdr = &hdr->u.l.oth; + hdrsize = 8 + 40 + 12 + 8; /* LRH + GRH + BTH + DETH */ + /* + * The header with GRH is 68 bytes and the core driver sets + * the eager header buffer size to 56 bytes so the last 12 + * bytes of the IB header is in the data buffer. + */ + header_in_data = dev->dd->ipath_rcvhdrentsize == 16; + if (header_in_data) { + qkey = be32_to_cpu(((__be32 *) data)[1]); + src_qp = be32_to_cpu(((__be32 *) data)[2]); + data += 12; + } else { + qkey = be32_to_cpu(ohdr->u.ud.deth[0]); + src_qp = be32_to_cpu(ohdr->u.ud.deth[1]); + } + } + src_qp &= IPATH_QPN_MASK; + + /* + * Check that the permissive LID is only used on QP0 + * and the QKEY matches (see 9.6.1.4.1 and 9.6.1.5.1). + */ + if (qp->ibqp.qp_num) { + if (unlikely(hdr->lrh[1] == IB_LID_PERMISSIVE || + hdr->lrh[3] == IB_LID_PERMISSIVE)) { + dev->n_pkt_drops++; + goto bail; + } + if (unlikely(qkey != qp->qkey)) { + /* XXX OK to lose a count once in a while. */ + dev->qkey_violations++; + dev->n_pkt_drops++; + goto bail; + } + } else if (hdr->lrh[1] == IB_LID_PERMISSIVE || + hdr->lrh[3] == IB_LID_PERMISSIVE) { + struct ib_smp *smp = (struct ib_smp *) data; + + if (smp->mgmt_class != IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE) { + dev->n_pkt_drops++; + goto bail; + } + } + + /* + * The opcode is in the low byte when its in network order + * (top byte when in host order). + */ + opcode = be32_to_cpu(ohdr->bth[0]) >> 24; + if (qp->ibqp.qp_num > 1 && + opcode == IB_OPCODE_UD_SEND_ONLY_WITH_IMMEDIATE) { + if (header_in_data) { + wc.ex.imm_data = *(__be32 *) data; + data += sizeof(__be32); + } else + wc.ex.imm_data = ohdr->u.ud.imm_data; + wc.wc_flags = IB_WC_WITH_IMM; + hdrsize += sizeof(u32); + } else if (opcode == IB_OPCODE_UD_SEND_ONLY) { + wc.ex.imm_data = 0; + wc.wc_flags = 0; + } else { + dev->n_pkt_drops++; + goto bail; + } + + /* Get the number of bytes the message was padded by. */ + pad = (be32_to_cpu(ohdr->bth[0]) >> 20) & 3; + if (unlikely(tlen < (hdrsize + pad + 4))) { + /* Drop incomplete packets. */ + dev->n_pkt_drops++; + goto bail; + } + tlen -= hdrsize + pad + 4; + + /* Drop invalid MAD packets (see 13.5.3.1). 
*/ + if (unlikely((qp->ibqp.qp_num == 0 && + (tlen != 256 || + (be16_to_cpu(hdr->lrh[0]) >> 12) != 15)) || + (qp->ibqp.qp_num == 1 && + (tlen != 256 || + (be16_to_cpu(hdr->lrh[0]) >> 12) == 15)))) { + dev->n_pkt_drops++; + goto bail; + } + + /* + * A GRH is expected to precede the data even if not + * present on the wire. + */ + wc.byte_len = tlen + sizeof(struct ib_grh); + + /* + * Get the next work request entry to find where to put the data. + */ + if (qp->r_flags & IPATH_R_REUSE_SGE) + qp->r_flags &= ~IPATH_R_REUSE_SGE; + else if (!ipath_get_rwqe(qp, 0)) { + /* + * Count VL15 packets dropped due to no receive buffer. + * Otherwise, count them as buffer overruns since usually, + * the HW will be able to receive packets even if there are + * no QPs with posted receive buffers. + */ + if (qp->ibqp.qp_num == 0) + dev->n_vl15_dropped++; + else + dev->rcv_errors++; + goto bail; + } + /* Silently drop packets which are too big. */ + if (wc.byte_len > qp->r_len) { + qp->r_flags |= IPATH_R_REUSE_SGE; + dev->n_pkt_drops++; + goto bail; + } + if (has_grh) { + ipath_copy_sge(&qp->r_sge, &hdr->u.l.grh, + sizeof(struct ib_grh)); + wc.wc_flags |= IB_WC_GRH; + } else + ipath_skip_sge(&qp->r_sge, sizeof(struct ib_grh)); + ipath_copy_sge(&qp->r_sge, data, + wc.byte_len - sizeof(struct ib_grh)); + if (!test_and_clear_bit(IPATH_R_WRID_VALID, &qp->r_aflags)) + goto bail; + wc.wr_id = qp->r_wr_id; + wc.status = IB_WC_SUCCESS; + wc.opcode = IB_WC_RECV; + wc.vendor_err = 0; + wc.qp = &qp->ibqp; + wc.src_qp = src_qp; + /* XXX do we know which pkey matched? Only needed for GSI. */ + wc.pkey_index = 0; + wc.slid = be16_to_cpu(hdr->lrh[3]); + wc.sl = (be16_to_cpu(hdr->lrh[0]) >> 4) & 0xF; + dlid = be16_to_cpu(hdr->lrh[1]); + /* + * Save the LMC lower bits if the destination LID is a unicast LID. + */ + wc.dlid_path_bits = dlid >= IPATH_MULTICAST_LID_BASE ? 0 : + dlid & ((1 << dev->dd->ipath_lmc) - 1); + wc.port_num = 1; + /* Signal completion event if the solicited bit is set. */ + ipath_cq_enter(to_icq(qp->ibqp.recv_cq), &wc, + (ohdr->bth[0] & + cpu_to_be32(1 << 23)) != 0); + +bail:; +} diff --git a/kernel/drivers/infiniband/hw/ipath/ipath_user_pages.c b/kernel/drivers/infiniband/hw/ipath/ipath_user_pages.c new file mode 100644 index 000000000..1da1252dc --- /dev/null +++ b/kernel/drivers/infiniband/hw/ipath/ipath_user_pages.c @@ -0,0 +1,229 @@ +/* + * Copyright (c) 2006, 2007 QLogic Corporation. All rights reserved. + * Copyright (c) 2003, 2004, 2005, 2006 PathScale, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include +#include +#include +#include + +#include "ipath_kernel.h" + +static void __ipath_release_user_pages(struct page **p, size_t num_pages, + int dirty) +{ + size_t i; + + for (i = 0; i < num_pages; i++) { + ipath_cdbg(MM, "%lu/%lu put_page %p\n", (unsigned long) i, + (unsigned long) num_pages, p[i]); + if (dirty) + set_page_dirty_lock(p[i]); + put_page(p[i]); + } +} + +/* call with current->mm->mmap_sem held */ +static int __ipath_get_user_pages(unsigned long start_page, size_t num_pages, + struct page **p) +{ + unsigned long lock_limit; + size_t got; + int ret; + + lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT; + + if (num_pages > lock_limit) { + ret = -ENOMEM; + goto bail; + } + + ipath_cdbg(VERBOSE, "pin %lx pages from vaddr %lx\n", + (unsigned long) num_pages, start_page); + + for (got = 0; got < num_pages; got += ret) { + ret = get_user_pages(current, current->mm, + start_page + got * PAGE_SIZE, + num_pages - got, 1, 1, + p + got, NULL); + if (ret < 0) + goto bail_release; + } + + current->mm->pinned_vm += num_pages; + + ret = 0; + goto bail; + +bail_release: + __ipath_release_user_pages(p, got, 0); +bail: + return ret; +} + +/** + * ipath_map_page - a safety wrapper around pci_map_page() + * + * A dma_addr of all 0's is interpreted by the chip as "disabled". + * Unfortunately, it can also be a valid dma_addr returned on some + * architectures. + * + * The powerpc iommu assigns dma_addrs in ascending order, so we don't + * have to bother with retries or mapping a dummy page to insure we + * don't just get the same mapping again. + * + * I'm sure we won't be so lucky with other iommu's, so FIXME. + */ +dma_addr_t ipath_map_page(struct pci_dev *hwdev, struct page *page, + unsigned long offset, size_t size, int direction) +{ + dma_addr_t phys; + + phys = pci_map_page(hwdev, page, offset, size, direction); + + if (phys == 0) { + pci_unmap_page(hwdev, phys, size, direction); + phys = pci_map_page(hwdev, page, offset, size, direction); + /* + * FIXME: If we get 0 again, we should keep this page, + * map another, then free the 0 page. + */ + } + + return phys; +} + +/** + * ipath_map_single - a safety wrapper around pci_map_single() + * + * Same idea as ipath_map_page(). + */ +dma_addr_t ipath_map_single(struct pci_dev *hwdev, void *ptr, size_t size, + int direction) +{ + dma_addr_t phys; + + phys = pci_map_single(hwdev, ptr, size, direction); + + if (phys == 0) { + pci_unmap_single(hwdev, phys, size, direction); + phys = pci_map_single(hwdev, ptr, size, direction); + /* + * FIXME: If we get 0 again, we should keep this page, + * map another, then free the 0 page. + */ + } + + return phys; +} + +/** + * ipath_get_user_pages - lock user pages into memory + * @start_page: the start page + * @num_pages: the number of pages + * @p: the output page structures + * + * This function takes a given start page (page aligned user virtual + * address) and pins it and the following specified number of pages. 
For + * now, num_pages is always 1, but that will probably change at some point + * (because caller is doing expected sends on a single virtually contiguous + * buffer, so we can do all pages at once). + */ +int ipath_get_user_pages(unsigned long start_page, size_t num_pages, + struct page **p) +{ + int ret; + + down_write(¤t->mm->mmap_sem); + + ret = __ipath_get_user_pages(start_page, num_pages, p); + + up_write(¤t->mm->mmap_sem); + + return ret; +} + +void ipath_release_user_pages(struct page **p, size_t num_pages) +{ + down_write(¤t->mm->mmap_sem); + + __ipath_release_user_pages(p, num_pages, 1); + + current->mm->pinned_vm -= num_pages; + + up_write(¤t->mm->mmap_sem); +} + +struct ipath_user_pages_work { + struct work_struct work; + struct mm_struct *mm; + unsigned long num_pages; +}; + +static void user_pages_account(struct work_struct *_work) +{ + struct ipath_user_pages_work *work = + container_of(_work, struct ipath_user_pages_work, work); + + down_write(&work->mm->mmap_sem); + work->mm->pinned_vm -= work->num_pages; + up_write(&work->mm->mmap_sem); + mmput(work->mm); + kfree(work); +} + +void ipath_release_user_pages_on_close(struct page **p, size_t num_pages) +{ + struct ipath_user_pages_work *work; + struct mm_struct *mm; + + __ipath_release_user_pages(p, num_pages, 1); + + mm = get_task_mm(current); + if (!mm) + return; + + work = kmalloc(sizeof(*work), GFP_KERNEL); + if (!work) + goto bail_mm; + + INIT_WORK(&work->work, user_pages_account); + work->mm = mm; + work->num_pages = num_pages; + + queue_work(ib_wq, &work->work); + return; + +bail_mm: + mmput(mm); + return; +} diff --git a/kernel/drivers/infiniband/hw/ipath/ipath_user_sdma.c b/kernel/drivers/infiniband/hw/ipath/ipath_user_sdma.c new file mode 100644 index 000000000..cc04b7ba3 --- /dev/null +++ b/kernel/drivers/infiniband/hw/ipath/ipath_user_sdma.c @@ -0,0 +1,875 @@ +/* + * Copyright (c) 2007, 2008 QLogic Corporation. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "ipath_kernel.h" +#include "ipath_user_sdma.h" + +/* minimum size of header */ +#define IPATH_USER_SDMA_MIN_HEADER_LENGTH 64 +/* expected size of headers (for dma_pool) */ +#define IPATH_USER_SDMA_EXP_HEADER_LENGTH 64 +/* length mask in PBC (lower 11 bits) */ +#define IPATH_PBC_LENGTH_MASK ((1 << 11) - 1) + +struct ipath_user_sdma_pkt { + u8 naddr; /* dimension of addr (1..3) ... */ + u32 counter; /* sdma pkts queued counter for this entry */ + u64 added; /* global descq number of entries */ + + struct { + u32 offset; /* offset for kvaddr, addr */ + u32 length; /* length in page */ + u8 put_page; /* should we put_page? */ + u8 dma_mapped; /* is page dma_mapped? */ + struct page *page; /* may be NULL (coherent mem) */ + void *kvaddr; /* FIXME: only for pio hack */ + dma_addr_t addr; + } addr[4]; /* max pages, any more and we coalesce */ + struct list_head list; /* list element */ +}; + +struct ipath_user_sdma_queue { + /* + * pkts sent to dma engine are queued on this + * list head. the type of the elements of this + * list are struct ipath_user_sdma_pkt... + */ + struct list_head sent; + + /* headers with expected length are allocated from here... */ + char header_cache_name[64]; + struct dma_pool *header_cache; + + /* packets are allocated from the slab cache... */ + char pkt_slab_name[64]; + struct kmem_cache *pkt_slab; + + /* as packets go on the queued queue, they are counted... */ + u32 counter; + u32 sent_counter; + + /* dma page table */ + struct rb_root dma_pages_root; + + /* protect everything above... */ + struct mutex lock; +}; + +struct ipath_user_sdma_queue * +ipath_user_sdma_queue_create(struct device *dev, int unit, int port, int sport) +{ + struct ipath_user_sdma_queue *pq = + kmalloc(sizeof(struct ipath_user_sdma_queue), GFP_KERNEL); + + if (!pq) + goto done; + + pq->counter = 0; + pq->sent_counter = 0; + INIT_LIST_HEAD(&pq->sent); + + mutex_init(&pq->lock); + + snprintf(pq->pkt_slab_name, sizeof(pq->pkt_slab_name), + "ipath-user-sdma-pkts-%u-%02u.%02u", unit, port, sport); + pq->pkt_slab = kmem_cache_create(pq->pkt_slab_name, + sizeof(struct ipath_user_sdma_pkt), + 0, 0, NULL); + + if (!pq->pkt_slab) + goto err_kfree; + + snprintf(pq->header_cache_name, sizeof(pq->header_cache_name), + "ipath-user-sdma-headers-%u-%02u.%02u", unit, port, sport); + pq->header_cache = dma_pool_create(pq->header_cache_name, + dev, + IPATH_USER_SDMA_EXP_HEADER_LENGTH, + 4, 0); + if (!pq->header_cache) + goto err_slab; + + pq->dma_pages_root = RB_ROOT; + + goto done; + +err_slab: + kmem_cache_destroy(pq->pkt_slab); +err_kfree: + kfree(pq); + pq = NULL; + +done: + return pq; +} + +static void ipath_user_sdma_init_frag(struct ipath_user_sdma_pkt *pkt, + int i, size_t offset, size_t len, + int put_page, int dma_mapped, + struct page *page, + void *kvaddr, dma_addr_t dma_addr) +{ + pkt->addr[i].offset = offset; + pkt->addr[i].length = len; + pkt->addr[i].put_page = put_page; + pkt->addr[i].dma_mapped = dma_mapped; + pkt->addr[i].page = page; + pkt->addr[i].kvaddr = kvaddr; + pkt->addr[i].addr = dma_addr; +} + +static void ipath_user_sdma_init_header(struct ipath_user_sdma_pkt *pkt, + u32 counter, size_t offset, + size_t len, int dma_mapped, + struct page *page, + void *kvaddr, dma_addr_t dma_addr) +{ + pkt->naddr = 1; + pkt->counter = counter; + ipath_user_sdma_init_frag(pkt, 0, offset, len, 0, dma_mapped, page, + kvaddr, dma_addr); +} + +/* we've too 
many pages in the iovec, coalesce to a single page */ +static int ipath_user_sdma_coalesce(const struct ipath_devdata *dd, + struct ipath_user_sdma_pkt *pkt, + const struct iovec *iov, + unsigned long niov) { + int ret = 0; + struct page *page = alloc_page(GFP_KERNEL); + void *mpage_save; + char *mpage; + int i; + int len = 0; + dma_addr_t dma_addr; + + if (!page) { + ret = -ENOMEM; + goto done; + } + + mpage = kmap(page); + mpage_save = mpage; + for (i = 0; i < niov; i++) { + int cfur; + + cfur = copy_from_user(mpage, + iov[i].iov_base, iov[i].iov_len); + if (cfur) { + ret = -EFAULT; + goto free_unmap; + } + + mpage += iov[i].iov_len; + len += iov[i].iov_len; + } + + dma_addr = dma_map_page(&dd->pcidev->dev, page, 0, len, + DMA_TO_DEVICE); + if (dma_mapping_error(&dd->pcidev->dev, dma_addr)) { + ret = -ENOMEM; + goto free_unmap; + } + + ipath_user_sdma_init_frag(pkt, 1, 0, len, 0, 1, page, mpage_save, + dma_addr); + pkt->naddr = 2; + + goto done; + +free_unmap: + kunmap(page); + __free_page(page); +done: + return ret; +} + +/* how many pages in this iovec element? */ +static int ipath_user_sdma_num_pages(const struct iovec *iov) +{ + const unsigned long addr = (unsigned long) iov->iov_base; + const unsigned long len = iov->iov_len; + const unsigned long spage = addr & PAGE_MASK; + const unsigned long epage = (addr + len - 1) & PAGE_MASK; + + return 1 + ((epage - spage) >> PAGE_SHIFT); +} + +/* truncate length to page boundary */ +static int ipath_user_sdma_page_length(unsigned long addr, unsigned long len) +{ + const unsigned long offset = addr & ~PAGE_MASK; + + return ((offset + len) > PAGE_SIZE) ? (PAGE_SIZE - offset) : len; +} + +static void ipath_user_sdma_free_pkt_frag(struct device *dev, + struct ipath_user_sdma_queue *pq, + struct ipath_user_sdma_pkt *pkt, + int frag) +{ + const int i = frag; + + if (pkt->addr[i].page) { + if (pkt->addr[i].dma_mapped) + dma_unmap_page(dev, + pkt->addr[i].addr, + pkt->addr[i].length, + DMA_TO_DEVICE); + + if (pkt->addr[i].kvaddr) + kunmap(pkt->addr[i].page); + + if (pkt->addr[i].put_page) + put_page(pkt->addr[i].page); + else + __free_page(pkt->addr[i].page); + } else if (pkt->addr[i].kvaddr) + /* free coherent mem from cache... */ + dma_pool_free(pq->header_cache, + pkt->addr[i].kvaddr, pkt->addr[i].addr); +} + +/* return number of pages pinned... */ +static int ipath_user_sdma_pin_pages(const struct ipath_devdata *dd, + struct ipath_user_sdma_pkt *pkt, + unsigned long addr, int tlen, int npages) +{ + struct page *pages[2]; + int j; + int ret; + + ret = get_user_pages_fast(addr, npages, 0, pages); + if (ret != npages) { + int i; + + for (i = 0; i < ret; i++) + put_page(pages[i]); + + ret = -ENOMEM; + goto done; + } + + for (j = 0; j < npages; j++) { + /* map the pages... 
*/ + const int flen = + ipath_user_sdma_page_length(addr, tlen); + dma_addr_t dma_addr = + dma_map_page(&dd->pcidev->dev, + pages[j], 0, flen, DMA_TO_DEVICE); + unsigned long fofs = addr & ~PAGE_MASK; + + if (dma_mapping_error(&dd->pcidev->dev, dma_addr)) { + ret = -ENOMEM; + goto done; + } + + ipath_user_sdma_init_frag(pkt, pkt->naddr, fofs, flen, 1, 1, + pages[j], kmap(pages[j]), + dma_addr); + + pkt->naddr++; + addr += flen; + tlen -= flen; + } + +done: + return ret; +} + +static int ipath_user_sdma_pin_pkt(const struct ipath_devdata *dd, + struct ipath_user_sdma_queue *pq, + struct ipath_user_sdma_pkt *pkt, + const struct iovec *iov, + unsigned long niov) +{ + int ret = 0; + unsigned long idx; + + for (idx = 0; idx < niov; idx++) { + const int npages = ipath_user_sdma_num_pages(iov + idx); + const unsigned long addr = (unsigned long) iov[idx].iov_base; + + ret = ipath_user_sdma_pin_pages(dd, pkt, + addr, iov[idx].iov_len, + npages); + if (ret < 0) + goto free_pkt; + } + + goto done; + +free_pkt: + for (idx = 0; idx < pkt->naddr; idx++) + ipath_user_sdma_free_pkt_frag(&dd->pcidev->dev, pq, pkt, idx); + +done: + return ret; +} + +static int ipath_user_sdma_init_payload(const struct ipath_devdata *dd, + struct ipath_user_sdma_queue *pq, + struct ipath_user_sdma_pkt *pkt, + const struct iovec *iov, + unsigned long niov, int npages) +{ + int ret = 0; + + if (npages >= ARRAY_SIZE(pkt->addr)) + ret = ipath_user_sdma_coalesce(dd, pkt, iov, niov); + else + ret = ipath_user_sdma_pin_pkt(dd, pq, pkt, iov, niov); + + return ret; +} + +/* free a packet list -- return counter value of last packet */ +static void ipath_user_sdma_free_pkt_list(struct device *dev, + struct ipath_user_sdma_queue *pq, + struct list_head *list) +{ + struct ipath_user_sdma_pkt *pkt, *pkt_next; + + list_for_each_entry_safe(pkt, pkt_next, list, list) { + int i; + + for (i = 0; i < pkt->naddr; i++) + ipath_user_sdma_free_pkt_frag(dev, pq, pkt, i); + + kmem_cache_free(pq->pkt_slab, pkt); + } +} + +/* + * copy headers, coalesce etc -- pq->lock must be held + * + * we queue all the packets to list, returning the + * number of bytes total. list must be empty initially, + * as, if there is an error we clean it... 
+ */ +static int ipath_user_sdma_queue_pkts(const struct ipath_devdata *dd, + struct ipath_user_sdma_queue *pq, + struct list_head *list, + const struct iovec *iov, + unsigned long niov, + int maxpkts) +{ + unsigned long idx = 0; + int ret = 0; + int npkts = 0; + struct page *page = NULL; + __le32 *pbc; + dma_addr_t dma_addr; + struct ipath_user_sdma_pkt *pkt = NULL; + size_t len; + size_t nw; + u32 counter = pq->counter; + int dma_mapped = 0; + + while (idx < niov && npkts < maxpkts) { + const unsigned long addr = (unsigned long) iov[idx].iov_base; + const unsigned long idx_save = idx; + unsigned pktnw; + unsigned pktnwc; + int nfrags = 0; + int npages = 0; + int cfur; + + dma_mapped = 0; + len = iov[idx].iov_len; + nw = len >> 2; + page = NULL; + + pkt = kmem_cache_alloc(pq->pkt_slab, GFP_KERNEL); + if (!pkt) { + ret = -ENOMEM; + goto free_list; + } + + if (len < IPATH_USER_SDMA_MIN_HEADER_LENGTH || + len > PAGE_SIZE || len & 3 || addr & 3) { + ret = -EINVAL; + goto free_pkt; + } + + if (len == IPATH_USER_SDMA_EXP_HEADER_LENGTH) + pbc = dma_pool_alloc(pq->header_cache, GFP_KERNEL, + &dma_addr); + else + pbc = NULL; + + if (!pbc) { + page = alloc_page(GFP_KERNEL); + if (!page) { + ret = -ENOMEM; + goto free_pkt; + } + pbc = kmap(page); + } + + cfur = copy_from_user(pbc, iov[idx].iov_base, len); + if (cfur) { + ret = -EFAULT; + goto free_pbc; + } + + /* + * this assignment is a bit strange. it's because the + * the pbc counts the number of 32 bit words in the full + * packet _except_ the first word of the pbc itself... + */ + pktnwc = nw - 1; + + /* + * pktnw computation yields the number of 32 bit words + * that the caller has indicated in the PBC. note that + * this is one less than the total number of words that + * goes to the send DMA engine as the first 32 bit word + * of the PBC itself is not counted. Armed with this count, + * we can verify that the packet is consistent with the + * iovec lengths. 
+ */ + pktnw = le32_to_cpu(*pbc) & IPATH_PBC_LENGTH_MASK; + if (pktnw < pktnwc || pktnw > pktnwc + (PAGE_SIZE >> 2)) { + ret = -EINVAL; + goto free_pbc; + } + + + idx++; + while (pktnwc < pktnw && idx < niov) { + const size_t slen = iov[idx].iov_len; + const unsigned long faddr = + (unsigned long) iov[idx].iov_base; + + if (slen & 3 || faddr & 3 || !slen || + slen > PAGE_SIZE) { + ret = -EINVAL; + goto free_pbc; + } + + npages++; + if ((faddr & PAGE_MASK) != + ((faddr + slen - 1) & PAGE_MASK)) + npages++; + + pktnwc += slen >> 2; + idx++; + nfrags++; + } + + if (pktnwc != pktnw) { + ret = -EINVAL; + goto free_pbc; + } + + if (page) { + dma_addr = dma_map_page(&dd->pcidev->dev, + page, 0, len, DMA_TO_DEVICE); + if (dma_mapping_error(&dd->pcidev->dev, dma_addr)) { + ret = -ENOMEM; + goto free_pbc; + } + + dma_mapped = 1; + } + + ipath_user_sdma_init_header(pkt, counter, 0, len, dma_mapped, + page, pbc, dma_addr); + + if (nfrags) { + ret = ipath_user_sdma_init_payload(dd, pq, pkt, + iov + idx_save + 1, + nfrags, npages); + if (ret < 0) + goto free_pbc_dma; + } + + counter++; + npkts++; + + list_add_tail(&pkt->list, list); + } + + ret = idx; + goto done; + +free_pbc_dma: + if (dma_mapped) + dma_unmap_page(&dd->pcidev->dev, dma_addr, len, DMA_TO_DEVICE); +free_pbc: + if (page) { + kunmap(page); + __free_page(page); + } else + dma_pool_free(pq->header_cache, pbc, dma_addr); +free_pkt: + kmem_cache_free(pq->pkt_slab, pkt); +free_list: + ipath_user_sdma_free_pkt_list(&dd->pcidev->dev, pq, list); +done: + return ret; +} + +static void ipath_user_sdma_set_complete_counter(struct ipath_user_sdma_queue *pq, + u32 c) +{ + pq->sent_counter = c; +} + +/* try to clean out queue -- needs pq->lock */ +static int ipath_user_sdma_queue_clean(const struct ipath_devdata *dd, + struct ipath_user_sdma_queue *pq) +{ + struct list_head free_list; + struct ipath_user_sdma_pkt *pkt; + struct ipath_user_sdma_pkt *pkt_prev; + int ret = 0; + + INIT_LIST_HEAD(&free_list); + + list_for_each_entry_safe(pkt, pkt_prev, &pq->sent, list) { + s64 descd = dd->ipath_sdma_descq_removed - pkt->added; + + if (descd < 0) + break; + + list_move_tail(&pkt->list, &free_list); + + /* one more packet cleaned */ + ret++; + } + + if (!list_empty(&free_list)) { + u32 counter; + + pkt = list_entry(free_list.prev, + struct ipath_user_sdma_pkt, list); + counter = pkt->counter; + + ipath_user_sdma_free_pkt_list(&dd->pcidev->dev, pq, &free_list); + ipath_user_sdma_set_complete_counter(pq, counter); + } + + return ret; +} + +void ipath_user_sdma_queue_destroy(struct ipath_user_sdma_queue *pq) +{ + if (!pq) + return; + + kmem_cache_destroy(pq->pkt_slab); + dma_pool_destroy(pq->header_cache); + kfree(pq); +} + +/* clean descriptor queue, returns > 0 if some elements cleaned */ +static int ipath_user_sdma_hwqueue_clean(struct ipath_devdata *dd) +{ + int ret; + unsigned long flags; + + spin_lock_irqsave(&dd->ipath_sdma_lock, flags); + ret = ipath_sdma_make_progress(dd); + spin_unlock_irqrestore(&dd->ipath_sdma_lock, flags); + + return ret; +} + +/* we're in close, drain packets so that we can cleanup successfully... 
*/ +void ipath_user_sdma_queue_drain(struct ipath_devdata *dd, + struct ipath_user_sdma_queue *pq) +{ + int i; + + if (!pq) + return; + + for (i = 0; i < 100; i++) { + mutex_lock(&pq->lock); + if (list_empty(&pq->sent)) { + mutex_unlock(&pq->lock); + break; + } + ipath_user_sdma_hwqueue_clean(dd); + ipath_user_sdma_queue_clean(dd, pq); + mutex_unlock(&pq->lock); + msleep(10); + } + + if (!list_empty(&pq->sent)) { + struct list_head free_list; + + printk(KERN_INFO "drain: lists not empty: forcing!\n"); + INIT_LIST_HEAD(&free_list); + mutex_lock(&pq->lock); + list_splice_init(&pq->sent, &free_list); + ipath_user_sdma_free_pkt_list(&dd->pcidev->dev, pq, &free_list); + mutex_unlock(&pq->lock); + } +} + +static inline __le64 ipath_sdma_make_desc0(struct ipath_devdata *dd, + u64 addr, u64 dwlen, u64 dwoffset) +{ + return cpu_to_le64(/* SDmaPhyAddr[31:0] */ + ((addr & 0xfffffffcULL) << 32) | + /* SDmaGeneration[1:0] */ + ((dd->ipath_sdma_generation & 3ULL) << 30) | + /* SDmaDwordCount[10:0] */ + ((dwlen & 0x7ffULL) << 16) | + /* SDmaBufOffset[12:2] */ + (dwoffset & 0x7ffULL)); +} + +static inline __le64 ipath_sdma_make_first_desc0(__le64 descq) +{ + return descq | cpu_to_le64(1ULL << 12); +} + +static inline __le64 ipath_sdma_make_last_desc0(__le64 descq) +{ + /* last */ /* dma head */ + return descq | cpu_to_le64(1ULL << 11 | 1ULL << 13); +} + +static inline __le64 ipath_sdma_make_desc1(u64 addr) +{ + /* SDmaPhyAddr[47:32] */ + return cpu_to_le64(addr >> 32); +} + +static void ipath_user_sdma_send_frag(struct ipath_devdata *dd, + struct ipath_user_sdma_pkt *pkt, int idx, + unsigned ofs, u16 tail) +{ + const u64 addr = (u64) pkt->addr[idx].addr + + (u64) pkt->addr[idx].offset; + const u64 dwlen = (u64) pkt->addr[idx].length / 4; + __le64 *descqp; + __le64 descq0; + + descqp = &dd->ipath_sdma_descq[tail].qw[0]; + + descq0 = ipath_sdma_make_desc0(dd, addr, dwlen, ofs); + if (idx == 0) + descq0 = ipath_sdma_make_first_desc0(descq0); + if (idx == pkt->naddr - 1) + descq0 = ipath_sdma_make_last_desc0(descq0); + + descqp[0] = descq0; + descqp[1] = ipath_sdma_make_desc1(addr); +} + +/* pq->lock must be held, get packets on the wire... */ +static int ipath_user_sdma_push_pkts(struct ipath_devdata *dd, + struct ipath_user_sdma_queue *pq, + struct list_head *pktlist) +{ + int ret = 0; + unsigned long flags; + u16 tail; + + if (list_empty(pktlist)) + return 0; + + if (unlikely(!(dd->ipath_flags & IPATH_LINKACTIVE))) + return -ECOMM; + + spin_lock_irqsave(&dd->ipath_sdma_lock, flags); + + if (unlikely(dd->ipath_sdma_status & IPATH_SDMA_ABORT_MASK)) { + ret = -ECOMM; + goto unlock; + } + + tail = dd->ipath_sdma_descq_tail; + while (!list_empty(pktlist)) { + struct ipath_user_sdma_pkt *pkt = + list_entry(pktlist->next, struct ipath_user_sdma_pkt, + list); + int i; + unsigned ofs = 0; + u16 dtail = tail; + + if (pkt->naddr > ipath_sdma_descq_freecnt(dd)) + goto unlock_check_tail; + + for (i = 0; i < pkt->naddr; i++) { + ipath_user_sdma_send_frag(dd, pkt, i, ofs, tail); + ofs += pkt->addr[i].length >> 2; + + if (++tail == dd->ipath_sdma_descq_cnt) { + tail = 0; + ++dd->ipath_sdma_generation; + } + } + + if ((ofs<<2) > dd->ipath_ibmaxlen) { + ipath_dbg("packet size %X > ibmax %X, fail\n", + ofs<<2, dd->ipath_ibmaxlen); + ret = -EMSGSIZE; + goto unlock; + } + + /* + * if the packet is >= 2KB mtu equivalent, we have to use + * the large buffers, and have to mark each descriptor as + * part of a large buffer packet. 
+ */ + if (ofs >= IPATH_SMALLBUF_DWORDS) { + for (i = 0; i < pkt->naddr; i++) { + dd->ipath_sdma_descq[dtail].qw[0] |= + cpu_to_le64(1ULL << 14); + if (++dtail == dd->ipath_sdma_descq_cnt) + dtail = 0; + } + } + + dd->ipath_sdma_descq_added += pkt->naddr; + pkt->added = dd->ipath_sdma_descq_added; + list_move_tail(&pkt->list, &pq->sent); + ret++; + } + +unlock_check_tail: + /* advance the tail on the chip if necessary */ + if (dd->ipath_sdma_descq_tail != tail) { + wmb(); + ipath_write_kreg(dd, dd->ipath_kregs->kr_senddmatail, tail); + dd->ipath_sdma_descq_tail = tail; + } + +unlock: + spin_unlock_irqrestore(&dd->ipath_sdma_lock, flags); + + return ret; +} + +int ipath_user_sdma_writev(struct ipath_devdata *dd, + struct ipath_user_sdma_queue *pq, + const struct iovec *iov, + unsigned long dim) +{ + int ret = 0; + struct list_head list; + int npkts = 0; + + INIT_LIST_HEAD(&list); + + mutex_lock(&pq->lock); + + if (dd->ipath_sdma_descq_added != dd->ipath_sdma_descq_removed) { + ipath_user_sdma_hwqueue_clean(dd); + ipath_user_sdma_queue_clean(dd, pq); + } + + while (dim) { + const int mxp = 8; + + ret = ipath_user_sdma_queue_pkts(dd, pq, &list, iov, dim, mxp); + if (ret <= 0) + goto done_unlock; + else { + dim -= ret; + iov += ret; + } + + /* force packets onto the sdma hw queue... */ + if (!list_empty(&list)) { + /* + * lazily clean hw queue. the 4 is a guess of about + * how many sdma descriptors a packet will take (it + * doesn't have to be perfect). + */ + if (ipath_sdma_descq_freecnt(dd) < ret * 4) { + ipath_user_sdma_hwqueue_clean(dd); + ipath_user_sdma_queue_clean(dd, pq); + } + + ret = ipath_user_sdma_push_pkts(dd, pq, &list); + if (ret < 0) + goto done_unlock; + else { + npkts += ret; + pq->counter += ret; + + if (!list_empty(&list)) + goto done_unlock; + } + } + } + +done_unlock: + if (!list_empty(&list)) + ipath_user_sdma_free_pkt_list(&dd->pcidev->dev, pq, &list); + mutex_unlock(&pq->lock); + + return (ret < 0) ? ret : npkts; +} + +int ipath_user_sdma_make_progress(struct ipath_devdata *dd, + struct ipath_user_sdma_queue *pq) +{ + int ret = 0; + + mutex_lock(&pq->lock); + ipath_user_sdma_hwqueue_clean(dd); + ret = ipath_user_sdma_queue_clean(dd, pq); + mutex_unlock(&pq->lock); + + return ret; +} + +u32 ipath_user_sdma_complete_counter(const struct ipath_user_sdma_queue *pq) +{ + return pq->sent_counter; +} + +u32 ipath_user_sdma_inflight_counter(struct ipath_user_sdma_queue *pq) +{ + return pq->counter; +} + diff --git a/kernel/drivers/infiniband/hw/ipath/ipath_user_sdma.h b/kernel/drivers/infiniband/hw/ipath/ipath_user_sdma.h new file mode 100644 index 000000000..fc76316c4 --- /dev/null +++ b/kernel/drivers/infiniband/hw/ipath/ipath_user_sdma.h @@ -0,0 +1,52 @@ +/* + * Copyright (c) 2007, 2008 QLogic Corporation. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. 
+ * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include + +struct ipath_user_sdma_queue; + +struct ipath_user_sdma_queue * +ipath_user_sdma_queue_create(struct device *dev, int unit, int port, int sport); +void ipath_user_sdma_queue_destroy(struct ipath_user_sdma_queue *pq); + +int ipath_user_sdma_writev(struct ipath_devdata *dd, + struct ipath_user_sdma_queue *pq, + const struct iovec *iov, + unsigned long dim); + +int ipath_user_sdma_make_progress(struct ipath_devdata *dd, + struct ipath_user_sdma_queue *pq); + +void ipath_user_sdma_queue_drain(struct ipath_devdata *dd, + struct ipath_user_sdma_queue *pq); + +u32 ipath_user_sdma_complete_counter(const struct ipath_user_sdma_queue *pq); +u32 ipath_user_sdma_inflight_counter(struct ipath_user_sdma_queue *pq); diff --git a/kernel/drivers/infiniband/hw/ipath/ipath_verbs.c b/kernel/drivers/infiniband/hw/ipath/ipath_verbs.c new file mode 100644 index 000000000..44ea93904 --- /dev/null +++ b/kernel/drivers/infiniband/hw/ipath/ipath_verbs.c @@ -0,0 +1,2342 @@ +/* + * Copyright (c) 2006, 2007, 2008 QLogic Corporation. All rights reserved. + * Copyright (c) 2005, 2006 PathScale, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include +#include +#include +#include +#include +#include +#include + +#include "ipath_kernel.h" +#include "ipath_verbs.h" +#include "ipath_common.h" + +static unsigned int ib_ipath_qp_table_size = 251; +module_param_named(qp_table_size, ib_ipath_qp_table_size, uint, S_IRUGO); +MODULE_PARM_DESC(qp_table_size, "QP table size"); + +unsigned int ib_ipath_lkey_table_size = 12; +module_param_named(lkey_table_size, ib_ipath_lkey_table_size, uint, + S_IRUGO); +MODULE_PARM_DESC(lkey_table_size, + "LKEY table size in bits (2^n, 1 <= n <= 23)"); + +static unsigned int ib_ipath_max_pds = 0xFFFF; +module_param_named(max_pds, ib_ipath_max_pds, uint, S_IWUSR | S_IRUGO); +MODULE_PARM_DESC(max_pds, + "Maximum number of protection domains to support"); + +static unsigned int ib_ipath_max_ahs = 0xFFFF; +module_param_named(max_ahs, ib_ipath_max_ahs, uint, S_IWUSR | S_IRUGO); +MODULE_PARM_DESC(max_ahs, "Maximum number of address handles to support"); + +unsigned int ib_ipath_max_cqes = 0x2FFFF; +module_param_named(max_cqes, ib_ipath_max_cqes, uint, S_IWUSR | S_IRUGO); +MODULE_PARM_DESC(max_cqes, + "Maximum number of completion queue entries to support"); + +unsigned int ib_ipath_max_cqs = 0x1FFFF; +module_param_named(max_cqs, ib_ipath_max_cqs, uint, S_IWUSR | S_IRUGO); +MODULE_PARM_DESC(max_cqs, "Maximum number of completion queues to support"); + +unsigned int ib_ipath_max_qp_wrs = 0x3FFF; +module_param_named(max_qp_wrs, ib_ipath_max_qp_wrs, uint, + S_IWUSR | S_IRUGO); +MODULE_PARM_DESC(max_qp_wrs, "Maximum number of QP WRs to support"); + +unsigned int ib_ipath_max_qps = 16384; +module_param_named(max_qps, ib_ipath_max_qps, uint, S_IWUSR | S_IRUGO); +MODULE_PARM_DESC(max_qps, "Maximum number of QPs to support"); + +unsigned int ib_ipath_max_sges = 0x60; +module_param_named(max_sges, ib_ipath_max_sges, uint, S_IWUSR | S_IRUGO); +MODULE_PARM_DESC(max_sges, "Maximum number of SGEs to support"); + +unsigned int ib_ipath_max_mcast_grps = 16384; +module_param_named(max_mcast_grps, ib_ipath_max_mcast_grps, uint, + S_IWUSR | S_IRUGO); +MODULE_PARM_DESC(max_mcast_grps, + "Maximum number of multicast groups to support"); + +unsigned int ib_ipath_max_mcast_qp_attached = 16; +module_param_named(max_mcast_qp_attached, ib_ipath_max_mcast_qp_attached, + uint, S_IWUSR | S_IRUGO); +MODULE_PARM_DESC(max_mcast_qp_attached, + "Maximum number of attached QPs to support"); + +unsigned int ib_ipath_max_srqs = 1024; +module_param_named(max_srqs, ib_ipath_max_srqs, uint, S_IWUSR | S_IRUGO); +MODULE_PARM_DESC(max_srqs, "Maximum number of SRQs to support"); + +unsigned int ib_ipath_max_srq_sges = 128; +module_param_named(max_srq_sges, ib_ipath_max_srq_sges, + uint, S_IWUSR | S_IRUGO); +MODULE_PARM_DESC(max_srq_sges, "Maximum number of SRQ SGEs to support"); + +unsigned int ib_ipath_max_srq_wrs = 0x1FFFF; +module_param_named(max_srq_wrs, ib_ipath_max_srq_wrs, + uint, S_IWUSR | S_IRUGO); +MODULE_PARM_DESC(max_srq_wrs, "Maximum number of SRQ WRs support"); + +static unsigned int ib_ipath_disable_sma; +module_param_named(disable_sma, ib_ipath_disable_sma, uint, S_IWUSR | S_IRUGO); +MODULE_PARM_DESC(disable_sma, "Disable the SMA"); + +/* + * Note that it is OK to post send work requests in the SQE and ERR + * states; ipath_do_send() will process them and generate error + * completions as per IB 1.2 C10-96. 
+ */ +const int ib_ipath_state_ops[IB_QPS_ERR + 1] = { + [IB_QPS_RESET] = 0, + [IB_QPS_INIT] = IPATH_POST_RECV_OK, + [IB_QPS_RTR] = IPATH_POST_RECV_OK | IPATH_PROCESS_RECV_OK, + [IB_QPS_RTS] = IPATH_POST_RECV_OK | IPATH_PROCESS_RECV_OK | + IPATH_POST_SEND_OK | IPATH_PROCESS_SEND_OK | + IPATH_PROCESS_NEXT_SEND_OK, + [IB_QPS_SQD] = IPATH_POST_RECV_OK | IPATH_PROCESS_RECV_OK | + IPATH_POST_SEND_OK | IPATH_PROCESS_SEND_OK, + [IB_QPS_SQE] = IPATH_POST_RECV_OK | IPATH_PROCESS_RECV_OK | + IPATH_POST_SEND_OK | IPATH_FLUSH_SEND, + [IB_QPS_ERR] = IPATH_POST_RECV_OK | IPATH_FLUSH_RECV | + IPATH_POST_SEND_OK | IPATH_FLUSH_SEND, +}; + +struct ipath_ucontext { + struct ib_ucontext ibucontext; +}; + +static inline struct ipath_ucontext *to_iucontext(struct ib_ucontext + *ibucontext) +{ + return container_of(ibucontext, struct ipath_ucontext, ibucontext); +} + +/* + * Translate ib_wr_opcode into ib_wc_opcode. + */ +const enum ib_wc_opcode ib_ipath_wc_opcode[] = { + [IB_WR_RDMA_WRITE] = IB_WC_RDMA_WRITE, + [IB_WR_RDMA_WRITE_WITH_IMM] = IB_WC_RDMA_WRITE, + [IB_WR_SEND] = IB_WC_SEND, + [IB_WR_SEND_WITH_IMM] = IB_WC_SEND, + [IB_WR_RDMA_READ] = IB_WC_RDMA_READ, + [IB_WR_ATOMIC_CMP_AND_SWP] = IB_WC_COMP_SWAP, + [IB_WR_ATOMIC_FETCH_AND_ADD] = IB_WC_FETCH_ADD +}; + +/* + * System image GUID. + */ +static __be64 sys_image_guid; + +/** + * ipath_copy_sge - copy data to SGE memory + * @ss: the SGE state + * @data: the data to copy + * @length: the length of the data + */ +void ipath_copy_sge(struct ipath_sge_state *ss, void *data, u32 length) +{ + struct ipath_sge *sge = &ss->sge; + + while (length) { + u32 len = sge->length; + + if (len > length) + len = length; + if (len > sge->sge_length) + len = sge->sge_length; + BUG_ON(len == 0); + memcpy(sge->vaddr, data, len); + sge->vaddr += len; + sge->length -= len; + sge->sge_length -= len; + if (sge->sge_length == 0) { + if (--ss->num_sge) + *sge = *ss->sg_list++; + } else if (sge->length == 0 && sge->mr != NULL) { + if (++sge->n >= IPATH_SEGSZ) { + if (++sge->m >= sge->mr->mapsz) + break; + sge->n = 0; + } + sge->vaddr = + sge->mr->map[sge->m]->segs[sge->n].vaddr; + sge->length = + sge->mr->map[sge->m]->segs[sge->n].length; + } + data += len; + length -= len; + } +} + +/** + * ipath_skip_sge - skip over SGE memory - XXX almost dup of prev func + * @ss: the SGE state + * @length: the number of bytes to skip + */ +void ipath_skip_sge(struct ipath_sge_state *ss, u32 length) +{ + struct ipath_sge *sge = &ss->sge; + + while (length) { + u32 len = sge->length; + + if (len > length) + len = length; + if (len > sge->sge_length) + len = sge->sge_length; + BUG_ON(len == 0); + sge->vaddr += len; + sge->length -= len; + sge->sge_length -= len; + if (sge->sge_length == 0) { + if (--ss->num_sge) + *sge = *ss->sg_list++; + } else if (sge->length == 0 && sge->mr != NULL) { + if (++sge->n >= IPATH_SEGSZ) { + if (++sge->m >= sge->mr->mapsz) + break; + sge->n = 0; + } + sge->vaddr = + sge->mr->map[sge->m]->segs[sge->n].vaddr; + sge->length = + sge->mr->map[sge->m]->segs[sge->n].length; + } + length -= len; + } +} + +/* + * Count the number of DMA descriptors needed to send length bytes of data. + * Don't modify the ipath_sge_state to get the count. + * Return zero if any of the segments is not aligned. 
+ */ +static u32 ipath_count_sge(struct ipath_sge_state *ss, u32 length) +{ + struct ipath_sge *sg_list = ss->sg_list; + struct ipath_sge sge = ss->sge; + u8 num_sge = ss->num_sge; + u32 ndesc = 1; /* count the header */ + + while (length) { + u32 len = sge.length; + + if (len > length) + len = length; + if (len > sge.sge_length) + len = sge.sge_length; + BUG_ON(len == 0); + if (((long) sge.vaddr & (sizeof(u32) - 1)) || + (len != length && (len & (sizeof(u32) - 1)))) { + ndesc = 0; + break; + } + ndesc++; + sge.vaddr += len; + sge.length -= len; + sge.sge_length -= len; + if (sge.sge_length == 0) { + if (--num_sge) + sge = *sg_list++; + } else if (sge.length == 0 && sge.mr != NULL) { + if (++sge.n >= IPATH_SEGSZ) { + if (++sge.m >= sge.mr->mapsz) + break; + sge.n = 0; + } + sge.vaddr = + sge.mr->map[sge.m]->segs[sge.n].vaddr; + sge.length = + sge.mr->map[sge.m]->segs[sge.n].length; + } + length -= len; + } + return ndesc; +} + +/* + * Copy from the SGEs to the data buffer. + */ +static void ipath_copy_from_sge(void *data, struct ipath_sge_state *ss, + u32 length) +{ + struct ipath_sge *sge = &ss->sge; + + while (length) { + u32 len = sge->length; + + if (len > length) + len = length; + if (len > sge->sge_length) + len = sge->sge_length; + BUG_ON(len == 0); + memcpy(data, sge->vaddr, len); + sge->vaddr += len; + sge->length -= len; + sge->sge_length -= len; + if (sge->sge_length == 0) { + if (--ss->num_sge) + *sge = *ss->sg_list++; + } else if (sge->length == 0 && sge->mr != NULL) { + if (++sge->n >= IPATH_SEGSZ) { + if (++sge->m >= sge->mr->mapsz) + break; + sge->n = 0; + } + sge->vaddr = + sge->mr->map[sge->m]->segs[sge->n].vaddr; + sge->length = + sge->mr->map[sge->m]->segs[sge->n].length; + } + data += len; + length -= len; + } +} + +/** + * ipath_post_one_send - post one RC, UC, or UD send work request + * @qp: the QP to post on + * @wr: the work request to send + */ +static int ipath_post_one_send(struct ipath_qp *qp, struct ib_send_wr *wr) +{ + struct ipath_swqe *wqe; + u32 next; + int i; + int j; + int acc; + int ret; + unsigned long flags; + struct ipath_devdata *dd = to_idev(qp->ibqp.device)->dd; + + spin_lock_irqsave(&qp->s_lock, flags); + + if (qp->ibqp.qp_type != IB_QPT_SMI && + !(dd->ipath_flags & IPATH_LINKACTIVE)) { + ret = -ENETDOWN; + goto bail; + } + + /* Check that state is OK to post send. */ + if (unlikely(!(ib_ipath_state_ops[qp->state] & IPATH_POST_SEND_OK))) + goto bail_inval; + + /* IB spec says that num_sge == 0 is OK. */ + if (wr->num_sge > qp->s_max_sge) + goto bail_inval; + + /* + * Don't allow RDMA reads or atomic operations on UC or + * undefined operations. + * Make sure buffer is large enough to hold the result for atomics. 
+ */ + if (qp->ibqp.qp_type == IB_QPT_UC) { + if ((unsigned) wr->opcode >= IB_WR_RDMA_READ) + goto bail_inval; + } else if (qp->ibqp.qp_type == IB_QPT_UD) { + /* Check UD opcode */ + if (wr->opcode != IB_WR_SEND && + wr->opcode != IB_WR_SEND_WITH_IMM) + goto bail_inval; + /* Check UD destination address PD */ + if (qp->ibqp.pd != wr->wr.ud.ah->pd) + goto bail_inval; + } else if ((unsigned) wr->opcode > IB_WR_ATOMIC_FETCH_AND_ADD) + goto bail_inval; + else if (wr->opcode >= IB_WR_ATOMIC_CMP_AND_SWP && + (wr->num_sge == 0 || + wr->sg_list[0].length < sizeof(u64) || + wr->sg_list[0].addr & (sizeof(u64) - 1))) + goto bail_inval; + else if (wr->opcode >= IB_WR_RDMA_READ && !qp->s_max_rd_atomic) + goto bail_inval; + + next = qp->s_head + 1; + if (next >= qp->s_size) + next = 0; + if (next == qp->s_last) { + ret = -ENOMEM; + goto bail; + } + + wqe = get_swqe_ptr(qp, qp->s_head); + wqe->wr = *wr; + wqe->length = 0; + if (wr->num_sge) { + acc = wr->opcode >= IB_WR_RDMA_READ ? + IB_ACCESS_LOCAL_WRITE : 0; + for (i = 0, j = 0; i < wr->num_sge; i++) { + u32 length = wr->sg_list[i].length; + int ok; + + if (length == 0) + continue; + ok = ipath_lkey_ok(qp, &wqe->sg_list[j], + &wr->sg_list[i], acc); + if (!ok) + goto bail_inval; + wqe->length += length; + j++; + } + wqe->wr.num_sge = j; + } + if (qp->ibqp.qp_type == IB_QPT_UC || + qp->ibqp.qp_type == IB_QPT_RC) { + if (wqe->length > 0x80000000U) + goto bail_inval; + } else if (wqe->length > to_idev(qp->ibqp.device)->dd->ipath_ibmtu) + goto bail_inval; + wqe->ssn = qp->s_ssn++; + qp->s_head = next; + + ret = 0; + goto bail; + +bail_inval: + ret = -EINVAL; +bail: + spin_unlock_irqrestore(&qp->s_lock, flags); + return ret; +} + +/** + * ipath_post_send - post a send on a QP + * @ibqp: the QP to post the send on + * @wr: the list of work requests to post + * @bad_wr: the first bad WR is put here + * + * This may be called from interrupt context. + */ +static int ipath_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr, + struct ib_send_wr **bad_wr) +{ + struct ipath_qp *qp = to_iqp(ibqp); + int err = 0; + + for (; wr; wr = wr->next) { + err = ipath_post_one_send(qp, wr); + if (err) { + *bad_wr = wr; + goto bail; + } + } + + /* Try to do the send work in the caller's context. */ + ipath_do_send((unsigned long) qp); + +bail: + return err; +} + +/** + * ipath_post_receive - post a receive on a QP + * @ibqp: the QP to post the receive on + * @wr: the WR to post + * @bad_wr: the first bad WR is put here + * + * This may be called from interrupt context. + */ +static int ipath_post_receive(struct ib_qp *ibqp, struct ib_recv_wr *wr, + struct ib_recv_wr **bad_wr) +{ + struct ipath_qp *qp = to_iqp(ibqp); + struct ipath_rwq *wq = qp->r_rq.wq; + unsigned long flags; + int ret; + + /* Check that state is OK to post receive. 
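+ * (IPATH_POST_RECV_OK must be set for the current state and the QP must
+ * own a receive queue, i.e. r_rq.wq is non-NULL.)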
*/ + if (!(ib_ipath_state_ops[qp->state] & IPATH_POST_RECV_OK) || !wq) { + *bad_wr = wr; + ret = -EINVAL; + goto bail; + } + + for (; wr; wr = wr->next) { + struct ipath_rwqe *wqe; + u32 next; + int i; + + if ((unsigned) wr->num_sge > qp->r_rq.max_sge) { + *bad_wr = wr; + ret = -EINVAL; + goto bail; + } + + spin_lock_irqsave(&qp->r_rq.lock, flags); + next = wq->head + 1; + if (next >= qp->r_rq.size) + next = 0; + if (next == wq->tail) { + spin_unlock_irqrestore(&qp->r_rq.lock, flags); + *bad_wr = wr; + ret = -ENOMEM; + goto bail; + } + + wqe = get_rwqe_ptr(&qp->r_rq, wq->head); + wqe->wr_id = wr->wr_id; + wqe->num_sge = wr->num_sge; + for (i = 0; i < wr->num_sge; i++) + wqe->sg_list[i] = wr->sg_list[i]; + /* Make sure queue entry is written before the head index. */ + smp_wmb(); + wq->head = next; + spin_unlock_irqrestore(&qp->r_rq.lock, flags); + } + ret = 0; + +bail: + return ret; +} + +/** + * ipath_qp_rcv - processing an incoming packet on a QP + * @dev: the device the packet came on + * @hdr: the packet header + * @has_grh: true if the packet has a GRH + * @data: the packet data + * @tlen: the packet length + * @qp: the QP the packet came on + * + * This is called from ipath_ib_rcv() to process an incoming packet + * for the given QP. + * Called at interrupt level. + */ +static void ipath_qp_rcv(struct ipath_ibdev *dev, + struct ipath_ib_header *hdr, int has_grh, + void *data, u32 tlen, struct ipath_qp *qp) +{ + /* Check for valid receive state. */ + if (!(ib_ipath_state_ops[qp->state] & IPATH_PROCESS_RECV_OK)) { + dev->n_pkt_drops++; + return; + } + + switch (qp->ibqp.qp_type) { + case IB_QPT_SMI: + case IB_QPT_GSI: + if (ib_ipath_disable_sma) + break; + /* FALLTHROUGH */ + case IB_QPT_UD: + ipath_ud_rcv(dev, hdr, has_grh, data, tlen, qp); + break; + + case IB_QPT_RC: + ipath_rc_rcv(dev, hdr, has_grh, data, tlen, qp); + break; + + case IB_QPT_UC: + ipath_uc_rcv(dev, hdr, has_grh, data, tlen, qp); + break; + + default: + break; + } +} + +/** + * ipath_ib_rcv - process an incoming packet + * @arg: the device pointer + * @rhdr: the header of the packet + * @data: the packet data + * @tlen: the packet length + * + * This is called from ipath_kreceive() to process an incoming packet at + * interrupt level. Tlen is the length of the header + data + CRC in bytes. + */ +void ipath_ib_rcv(struct ipath_ibdev *dev, void *rhdr, void *data, + u32 tlen) +{ + struct ipath_ib_header *hdr = rhdr; + struct ipath_other_headers *ohdr; + struct ipath_qp *qp; + u32 qp_num; + int lnh; + u8 opcode; + u16 lid; + + if (unlikely(dev == NULL)) + goto bail; + + if (unlikely(tlen < 24)) { /* LRH+BTH+CRC */ + dev->rcv_errors++; + goto bail; + } + + /* Check for a valid destination LID (see ch. 7.11.1). */ + lid = be16_to_cpu(hdr->lrh[1]); + if (lid < IPATH_MULTICAST_LID_BASE) { + lid &= ~((1 << dev->dd->ipath_lmc) - 1); + if (unlikely(lid != dev->dd->ipath_lid)) { + dev->rcv_errors++; + goto bail; + } + } + + /* Check for GRH */ + lnh = be16_to_cpu(hdr->lrh[0]) & 3; + if (lnh == IPATH_LRH_BTH) + ohdr = &hdr->u.oth; + else if (lnh == IPATH_LRH_GRH) + ohdr = &hdr->u.l.oth; + else { + dev->rcv_errors++; + goto bail; + } + + opcode = (be32_to_cpu(ohdr->bth[0]) >> 24) & 0x7f; + dev->opstats[opcode].n_bytes += tlen; + dev->opstats[opcode].n_packets++; + + /* Get the destination QP number. 
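+ * (BTH dword 1 carries the QPN; multicast packets are fanned out to every
+ * QP attached to the GID, unicast packets go to the QP looked up in the
+ * QPN hash table.)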
*/ + qp_num = be32_to_cpu(ohdr->bth[1]) & IPATH_QPN_MASK; + if (qp_num == IPATH_MULTICAST_QPN) { + struct ipath_mcast *mcast; + struct ipath_mcast_qp *p; + + if (lnh != IPATH_LRH_GRH) { + dev->n_pkt_drops++; + goto bail; + } + mcast = ipath_mcast_find(&hdr->u.l.grh.dgid); + if (mcast == NULL) { + dev->n_pkt_drops++; + goto bail; + } + dev->n_multicast_rcv++; + list_for_each_entry_rcu(p, &mcast->qp_list, list) + ipath_qp_rcv(dev, hdr, 1, data, tlen, p->qp); + /* + * Notify ipath_multicast_detach() if it is waiting for us + * to finish. + */ + if (atomic_dec_return(&mcast->refcount) <= 1) + wake_up(&mcast->wait); + } else { + qp = ipath_lookup_qpn(&dev->qp_table, qp_num); + if (qp) { + dev->n_unicast_rcv++; + ipath_qp_rcv(dev, hdr, lnh == IPATH_LRH_GRH, data, + tlen, qp); + /* + * Notify ipath_destroy_qp() if it is waiting + * for us to finish. + */ + if (atomic_dec_and_test(&qp->refcount)) + wake_up(&qp->wait); + } else + dev->n_pkt_drops++; + } + +bail:; +} + +/** + * ipath_ib_timer - verbs timer + * @arg: the device pointer + * + * This is called from ipath_do_rcv_timer() at interrupt level to check for + * QPs which need retransmits and to collect performance numbers. + */ +static void ipath_ib_timer(struct ipath_ibdev *dev) +{ + struct ipath_qp *resend = NULL; + struct ipath_qp *rnr = NULL; + struct list_head *last; + struct ipath_qp *qp; + unsigned long flags; + + if (dev == NULL) + return; + + spin_lock_irqsave(&dev->pending_lock, flags); + /* Start filling the next pending queue. */ + if (++dev->pending_index >= ARRAY_SIZE(dev->pending)) + dev->pending_index = 0; + /* Save any requests still in the new queue, they have timed out. */ + last = &dev->pending[dev->pending_index]; + while (!list_empty(last)) { + qp = list_entry(last->next, struct ipath_qp, timerwait); + list_del_init(&qp->timerwait); + qp->timer_next = resend; + resend = qp; + atomic_inc(&qp->refcount); + } + last = &dev->rnrwait; + if (!list_empty(last)) { + qp = list_entry(last->next, struct ipath_qp, timerwait); + if (--qp->s_rnr_timeout == 0) { + do { + list_del_init(&qp->timerwait); + qp->timer_next = rnr; + rnr = qp; + atomic_inc(&qp->refcount); + if (list_empty(last)) + break; + qp = list_entry(last->next, struct ipath_qp, + timerwait); + } while (qp->s_rnr_timeout == 0); + } + } + /* + * We should only be in the started state if pma_sample_start != 0 + */ + if (dev->pma_sample_status == IB_PMA_SAMPLE_STATUS_STARTED && + --dev->pma_sample_start == 0) { + dev->pma_sample_status = IB_PMA_SAMPLE_STATUS_RUNNING; + ipath_snapshot_counters(dev->dd, &dev->ipath_sword, + &dev->ipath_rword, + &dev->ipath_spkts, + &dev->ipath_rpkts, + &dev->ipath_xmit_wait); + } + if (dev->pma_sample_status == IB_PMA_SAMPLE_STATUS_RUNNING) { + if (dev->pma_sample_interval == 0) { + u64 ta, tb, tc, td, te; + + dev->pma_sample_status = IB_PMA_SAMPLE_STATUS_DONE; + ipath_snapshot_counters(dev->dd, &ta, &tb, + &tc, &td, &te); + + dev->ipath_sword = ta - dev->ipath_sword; + dev->ipath_rword = tb - dev->ipath_rword; + dev->ipath_spkts = tc - dev->ipath_spkts; + dev->ipath_rpkts = td - dev->ipath_rpkts; + dev->ipath_xmit_wait = te - dev->ipath_xmit_wait; + } + else + dev->pma_sample_interval--; + } + spin_unlock_irqrestore(&dev->pending_lock, flags); + + /* XXX What if timer fires again while this is running? 
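+ * Each QP moved onto the local resend/rnr lists above holds an extra
+ * reference, so it cannot go away while it is processed below.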
*/ + while (resend != NULL) { + qp = resend; + resend = qp->timer_next; + + spin_lock_irqsave(&qp->s_lock, flags); + if (qp->s_last != qp->s_tail && + ib_ipath_state_ops[qp->state] & IPATH_PROCESS_SEND_OK) { + dev->n_timeouts++; + ipath_restart_rc(qp, qp->s_last_psn + 1); + } + spin_unlock_irqrestore(&qp->s_lock, flags); + + /* Notify ipath_destroy_qp() if it is waiting. */ + if (atomic_dec_and_test(&qp->refcount)) + wake_up(&qp->wait); + } + while (rnr != NULL) { + qp = rnr; + rnr = qp->timer_next; + + spin_lock_irqsave(&qp->s_lock, flags); + if (ib_ipath_state_ops[qp->state] & IPATH_PROCESS_SEND_OK) + ipath_schedule_send(qp); + spin_unlock_irqrestore(&qp->s_lock, flags); + + /* Notify ipath_destroy_qp() if it is waiting. */ + if (atomic_dec_and_test(&qp->refcount)) + wake_up(&qp->wait); + } +} + +static void update_sge(struct ipath_sge_state *ss, u32 length) +{ + struct ipath_sge *sge = &ss->sge; + + sge->vaddr += length; + sge->length -= length; + sge->sge_length -= length; + if (sge->sge_length == 0) { + if (--ss->num_sge) + *sge = *ss->sg_list++; + } else if (sge->length == 0 && sge->mr != NULL) { + if (++sge->n >= IPATH_SEGSZ) { + if (++sge->m >= sge->mr->mapsz) + return; + sge->n = 0; + } + sge->vaddr = sge->mr->map[sge->m]->segs[sge->n].vaddr; + sge->length = sge->mr->map[sge->m]->segs[sge->n].length; + } +} + +#ifdef __LITTLE_ENDIAN +static inline u32 get_upper_bits(u32 data, u32 shift) +{ + return data >> shift; +} + +static inline u32 set_upper_bits(u32 data, u32 shift) +{ + return data << shift; +} + +static inline u32 clear_upper_bytes(u32 data, u32 n, u32 off) +{ + data <<= ((sizeof(u32) - n) * BITS_PER_BYTE); + data >>= ((sizeof(u32) - n - off) * BITS_PER_BYTE); + return data; +} +#else +static inline u32 get_upper_bits(u32 data, u32 shift) +{ + return data << shift; +} + +static inline u32 set_upper_bits(u32 data, u32 shift) +{ + return data >> shift; +} + +static inline u32 clear_upper_bytes(u32 data, u32 n, u32 off) +{ + data >>= ((sizeof(u32) - n) * BITS_PER_BYTE); + data <<= ((sizeof(u32) - n - off) * BITS_PER_BYTE); + return data; +} +#endif + +static void copy_io(u32 __iomem *piobuf, struct ipath_sge_state *ss, + u32 length, unsigned flush_wc) +{ + u32 extra = 0; + u32 data = 0; + u32 last; + + while (1) { + u32 len = ss->sge.length; + u32 off; + + if (len > length) + len = length; + if (len > ss->sge.sge_length) + len = ss->sge.sge_length; + BUG_ON(len == 0); + /* If the source address is not aligned, try to align it. */ + off = (unsigned long)ss->sge.vaddr & (sizeof(u32) - 1); + if (off) { + u32 *addr = (u32 *)((unsigned long)ss->sge.vaddr & + ~(sizeof(u32) - 1)); + u32 v = get_upper_bits(*addr, off * BITS_PER_BYTE); + u32 y; + + y = sizeof(u32) - off; + if (len > y) + len = y; + if (len + extra >= sizeof(u32)) { + data |= set_upper_bits(v, extra * + BITS_PER_BYTE); + len = sizeof(u32) - extra; + if (len == length) { + last = data; + break; + } + __raw_writel(data, piobuf); + piobuf++; + extra = 0; + data = 0; + } else { + /* Clear unused upper bytes */ + data |= clear_upper_bytes(v, len, extra); + if (len == length) { + last = data; + break; + } + extra += len; + } + } else if (extra) { + /* Source address is aligned. 
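+ * Merge the leftover 'extra' bytes already gathered in 'data' with each
+ * aligned word so that full dwords can be written to the PIO buffer.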
*/ + u32 *addr = (u32 *) ss->sge.vaddr; + int shift = extra * BITS_PER_BYTE; + int ushift = 32 - shift; + u32 l = len; + + while (l >= sizeof(u32)) { + u32 v = *addr; + + data |= set_upper_bits(v, shift); + __raw_writel(data, piobuf); + data = get_upper_bits(v, ushift); + piobuf++; + addr++; + l -= sizeof(u32); + } + /* + * We still have 'extra' number of bytes leftover. + */ + if (l) { + u32 v = *addr; + + if (l + extra >= sizeof(u32)) { + data |= set_upper_bits(v, shift); + len -= l + extra - sizeof(u32); + if (len == length) { + last = data; + break; + } + __raw_writel(data, piobuf); + piobuf++; + extra = 0; + data = 0; + } else { + /* Clear unused upper bytes */ + data |= clear_upper_bytes(v, l, + extra); + if (len == length) { + last = data; + break; + } + extra += l; + } + } else if (len == length) { + last = data; + break; + } + } else if (len == length) { + u32 w; + + /* + * Need to round up for the last dword in the + * packet. + */ + w = (len + 3) >> 2; + __iowrite32_copy(piobuf, ss->sge.vaddr, w - 1); + piobuf += w - 1; + last = ((u32 *) ss->sge.vaddr)[w - 1]; + break; + } else { + u32 w = len >> 2; + + __iowrite32_copy(piobuf, ss->sge.vaddr, w); + piobuf += w; + + extra = len & (sizeof(u32) - 1); + if (extra) { + u32 v = ((u32 *) ss->sge.vaddr)[w]; + + /* Clear unused upper bytes */ + data = clear_upper_bytes(v, extra, 0); + } + } + update_sge(ss, len); + length -= len; + } + /* Update address before sending packet. */ + update_sge(ss, length); + if (flush_wc) { + /* must flush early everything before trigger word */ + ipath_flush_wc(); + __raw_writel(last, piobuf); + /* be sure trigger word is written */ + ipath_flush_wc(); + } else + __raw_writel(last, piobuf); +} + +/* + * Convert IB rate to delay multiplier. + */ +unsigned ipath_ib_rate_to_mult(enum ib_rate rate) +{ + switch (rate) { + case IB_RATE_2_5_GBPS: return 8; + case IB_RATE_5_GBPS: return 4; + case IB_RATE_10_GBPS: return 2; + case IB_RATE_20_GBPS: return 1; + default: return 0; + } +} + +/* + * Convert delay multiplier to IB rate + */ +static enum ib_rate ipath_mult_to_ib_rate(unsigned mult) +{ + switch (mult) { + case 8: return IB_RATE_2_5_GBPS; + case 4: return IB_RATE_5_GBPS; + case 2: return IB_RATE_10_GBPS; + case 1: return IB_RATE_20_GBPS; + default: return IB_RATE_PORT_CURRENT; + } +} + +static inline struct ipath_verbs_txreq *get_txreq(struct ipath_ibdev *dev) +{ + struct ipath_verbs_txreq *tx = NULL; + unsigned long flags; + + spin_lock_irqsave(&dev->pending_lock, flags); + if (!list_empty(&dev->txreq_free)) { + struct list_head *l = dev->txreq_free.next; + + list_del(l); + tx = list_entry(l, struct ipath_verbs_txreq, txreq.list); + } + spin_unlock_irqrestore(&dev->pending_lock, flags); + return tx; +} + +static inline void put_txreq(struct ipath_ibdev *dev, + struct ipath_verbs_txreq *tx) +{ + unsigned long flags; + + spin_lock_irqsave(&dev->pending_lock, flags); + list_add(&tx->txreq.list, &dev->txreq_free); + spin_unlock_irqrestore(&dev->pending_lock, flags); +} + +static void sdma_complete(void *cookie, int status) +{ + struct ipath_verbs_txreq *tx = cookie; + struct ipath_qp *qp = tx->qp; + struct ipath_ibdev *dev = to_idev(qp->ibqp.device); + unsigned long flags; + enum ib_wc_status ibs = status == IPATH_SDMA_TXREQ_S_OK ? 
+ IB_WC_SUCCESS : IB_WC_WR_FLUSH_ERR; + + if (atomic_dec_and_test(&qp->s_dma_busy)) { + spin_lock_irqsave(&qp->s_lock, flags); + if (tx->wqe) + ipath_send_complete(qp, tx->wqe, ibs); + if ((ib_ipath_state_ops[qp->state] & IPATH_FLUSH_SEND && + qp->s_last != qp->s_head) || + (qp->s_flags & IPATH_S_WAIT_DMA)) + ipath_schedule_send(qp); + spin_unlock_irqrestore(&qp->s_lock, flags); + wake_up(&qp->wait_dma); + } else if (tx->wqe) { + spin_lock_irqsave(&qp->s_lock, flags); + ipath_send_complete(qp, tx->wqe, ibs); + spin_unlock_irqrestore(&qp->s_lock, flags); + } + + if (tx->txreq.flags & IPATH_SDMA_TXREQ_F_FREEBUF) + kfree(tx->txreq.map_addr); + put_txreq(dev, tx); + + if (atomic_dec_and_test(&qp->refcount)) + wake_up(&qp->wait); +} + +static void decrement_dma_busy(struct ipath_qp *qp) +{ + unsigned long flags; + + if (atomic_dec_and_test(&qp->s_dma_busy)) { + spin_lock_irqsave(&qp->s_lock, flags); + if ((ib_ipath_state_ops[qp->state] & IPATH_FLUSH_SEND && + qp->s_last != qp->s_head) || + (qp->s_flags & IPATH_S_WAIT_DMA)) + ipath_schedule_send(qp); + spin_unlock_irqrestore(&qp->s_lock, flags); + wake_up(&qp->wait_dma); + } +} + +/* + * Compute the number of clock cycles of delay before sending the next packet. + * The multipliers reflect the number of clocks for the fastest rate so + * one tick at 4xDDR is 8 ticks at 1xSDR. + * If the destination port will take longer to receive a packet than + * the outgoing link can send it, we need to delay sending the next packet + * by the difference in time it takes the receiver to receive and the sender + * to send this packet. + * Note that this delay is always correct for UC and RC but not always + * optimal for UD. For UD, the destination HCA can be different for each + * packet, in which case, we could send packets to a different destination + * while "waiting" for the delay. The overhead for doing this without + * HW support is more than just paying the cost of delaying some packets + * unnecessarily. + */ +static inline unsigned ipath_pkt_delay(u32 plen, u8 snd_mult, u8 rcv_mult) +{ + return (rcv_mult > snd_mult) ? + (plen * (rcv_mult - snd_mult) + 1) >> 1 : 0; +} + +static int ipath_verbs_send_dma(struct ipath_qp *qp, + struct ipath_ib_header *hdr, u32 hdrwords, + struct ipath_sge_state *ss, u32 len, + u32 plen, u32 dwords) +{ + struct ipath_ibdev *dev = to_idev(qp->ibqp.device); + struct ipath_devdata *dd = dev->dd; + struct ipath_verbs_txreq *tx; + u32 *piobuf; + u32 control; + u32 ndesc; + int ret; + + tx = qp->s_tx; + if (tx) { + qp->s_tx = NULL; + /* resend previously constructed packet */ + atomic_inc(&qp->s_dma_busy); + ret = ipath_sdma_verbs_send(dd, tx->ss, tx->len, tx); + if (ret) { + qp->s_tx = tx; + decrement_dma_busy(qp); + } + goto bail; + } + + tx = get_txreq(dev); + if (!tx) { + ret = -EBUSY; + goto bail; + } + + /* + * Get the saved delay count we computed for the previous packet + * and save the delay count for this packet to be used next time + * we get here. 
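+ * (The delay ends up in the PBC control word so the chip paces packets
+ * for destinations slower than the local link; see ipath_pkt_delay().)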
+ */ + control = qp->s_pkt_delay; + qp->s_pkt_delay = ipath_pkt_delay(plen, dd->delay_mult, qp->s_dmult); + + tx->qp = qp; + atomic_inc(&qp->refcount); + tx->wqe = qp->s_wqe; + tx->txreq.callback = sdma_complete; + tx->txreq.callback_cookie = tx; + tx->txreq.flags = IPATH_SDMA_TXREQ_F_HEADTOHOST | + IPATH_SDMA_TXREQ_F_INTREQ | IPATH_SDMA_TXREQ_F_FREEDESC; + if (plen + 1 >= IPATH_SMALLBUF_DWORDS) + tx->txreq.flags |= IPATH_SDMA_TXREQ_F_USELARGEBUF; + + /* VL15 packets bypass credit check */ + if ((be16_to_cpu(hdr->lrh[0]) >> 12) == 15) { + control |= 1ULL << 31; + tx->txreq.flags |= IPATH_SDMA_TXREQ_F_VL15; + } + + if (len) { + /* + * Don't try to DMA if it takes more descriptors than + * the queue holds. + */ + ndesc = ipath_count_sge(ss, len); + if (ndesc >= dd->ipath_sdma_descq_cnt) + ndesc = 0; + } else + ndesc = 1; + if (ndesc) { + tx->hdr.pbc[0] = cpu_to_le32(plen); + tx->hdr.pbc[1] = cpu_to_le32(control); + memcpy(&tx->hdr.hdr, hdr, hdrwords << 2); + tx->txreq.sg_count = ndesc; + tx->map_len = (hdrwords + 2) << 2; + tx->txreq.map_addr = &tx->hdr; + atomic_inc(&qp->s_dma_busy); + ret = ipath_sdma_verbs_send(dd, ss, dwords, tx); + if (ret) { + /* save ss and length in dwords */ + tx->ss = ss; + tx->len = dwords; + qp->s_tx = tx; + decrement_dma_busy(qp); + } + goto bail; + } + + /* Allocate a buffer and copy the header and payload to it. */ + tx->map_len = (plen + 1) << 2; + piobuf = kmalloc(tx->map_len, GFP_ATOMIC); + if (unlikely(piobuf == NULL)) { + ret = -EBUSY; + goto err_tx; + } + tx->txreq.map_addr = piobuf; + tx->txreq.flags |= IPATH_SDMA_TXREQ_F_FREEBUF; + tx->txreq.sg_count = 1; + + *piobuf++ = (__force u32) cpu_to_le32(plen); + *piobuf++ = (__force u32) cpu_to_le32(control); + memcpy(piobuf, hdr, hdrwords << 2); + ipath_copy_from_sge(piobuf + hdrwords, ss, len); + + atomic_inc(&qp->s_dma_busy); + ret = ipath_sdma_verbs_send(dd, NULL, 0, tx); + /* + * If we couldn't queue the DMA request, save the info + * and try again later rather than destroying the + * buffer and undoing the side effects of the copy. + */ + if (ret) { + tx->ss = NULL; + tx->len = 0; + qp->s_tx = tx; + decrement_dma_busy(qp); + } + dev->n_unaligned++; + goto bail; + +err_tx: + if (atomic_dec_and_test(&qp->refcount)) + wake_up(&qp->wait); + put_txreq(dev, tx); +bail: + return ret; +} + +static int ipath_verbs_send_pio(struct ipath_qp *qp, + struct ipath_ib_header *ibhdr, u32 hdrwords, + struct ipath_sge_state *ss, u32 len, + u32 plen, u32 dwords) +{ + struct ipath_devdata *dd = to_idev(qp->ibqp.device)->dd; + u32 *hdr = (u32 *) ibhdr; + u32 __iomem *piobuf; + unsigned flush_wc; + u32 control; + int ret; + unsigned long flags; + + piobuf = ipath_getpiobuf(dd, plen, NULL); + if (unlikely(piobuf == NULL)) { + ret = -EBUSY; + goto bail; + } + + /* + * Get the saved delay count we computed for the previous packet + * and save the delay count for this packet to be used next time + * we get here. + */ + control = qp->s_pkt_delay; + qp->s_pkt_delay = ipath_pkt_delay(plen, dd->delay_mult, qp->s_dmult); + + /* VL15 packets bypass credit check */ + if ((be16_to_cpu(ibhdr->lrh[0]) >> 12) == 15) + control |= 1ULL << 31; + + /* + * Write the length to the control qword plus any needed flags. + * We have to flush after the PBC for correctness on some cpus + * or WC buffer can be written out of order. 
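+ * (The PBC occupies the first two dwords of the PIO buffer: the packet
+ * length in the low word and the control flags, including the VL15
+ * credit-bypass bit, in the high word.)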
+ */ + writeq(((u64) control << 32) | plen, piobuf); + piobuf += 2; + + flush_wc = dd->ipath_flags & IPATH_PIO_FLUSH_WC; + if (len == 0) { + /* + * If there is just the header portion, must flush before + * writing last word of header for correctness, and after + * the last header word (trigger word). + */ + if (flush_wc) { + ipath_flush_wc(); + __iowrite32_copy(piobuf, hdr, hdrwords - 1); + ipath_flush_wc(); + __raw_writel(hdr[hdrwords - 1], piobuf + hdrwords - 1); + ipath_flush_wc(); + } else + __iowrite32_copy(piobuf, hdr, hdrwords); + goto done; + } + + if (flush_wc) + ipath_flush_wc(); + __iowrite32_copy(piobuf, hdr, hdrwords); + piobuf += hdrwords; + + /* The common case is aligned and contained in one segment. */ + if (likely(ss->num_sge == 1 && len <= ss->sge.length && + !((unsigned long)ss->sge.vaddr & (sizeof(u32) - 1)))) { + u32 *addr = (u32 *) ss->sge.vaddr; + + /* Update address before sending packet. */ + update_sge(ss, len); + if (flush_wc) { + __iowrite32_copy(piobuf, addr, dwords - 1); + /* must flush early everything before trigger word */ + ipath_flush_wc(); + __raw_writel(addr[dwords - 1], piobuf + dwords - 1); + /* be sure trigger word is written */ + ipath_flush_wc(); + } else + __iowrite32_copy(piobuf, addr, dwords); + goto done; + } + copy_io(piobuf, ss, len, flush_wc); +done: + if (qp->s_wqe) { + spin_lock_irqsave(&qp->s_lock, flags); + ipath_send_complete(qp, qp->s_wqe, IB_WC_SUCCESS); + spin_unlock_irqrestore(&qp->s_lock, flags); + } + ret = 0; +bail: + return ret; +} + +/** + * ipath_verbs_send - send a packet + * @qp: the QP to send on + * @hdr: the packet header + * @hdrwords: the number of 32-bit words in the header + * @ss: the SGE to send + * @len: the length of the packet in bytes + */ +int ipath_verbs_send(struct ipath_qp *qp, struct ipath_ib_header *hdr, + u32 hdrwords, struct ipath_sge_state *ss, u32 len) +{ + struct ipath_devdata *dd = to_idev(qp->ibqp.device)->dd; + u32 plen; + int ret; + u32 dwords = (len + 3) >> 2; + + /* + * Calculate the send buffer trigger address. + * The +1 counts for the pbc control dword following the pbc length. + */ + plen = hdrwords + dwords + 1; + + /* + * VL15 packets (IB_QPT_SMI) will always use PIO, so we + * can defer SDMA restart until link goes ACTIVE without + * worrying about just how we got there. + */ + if (qp->ibqp.qp_type == IB_QPT_SMI || + !(dd->ipath_flags & IPATH_HAS_SEND_DMA)) + ret = ipath_verbs_send_pio(qp, hdr, hdrwords, ss, len, + plen, dwords); + else + ret = ipath_verbs_send_dma(qp, hdr, hdrwords, ss, len, + plen, dwords); + + return ret; +} + +int ipath_snapshot_counters(struct ipath_devdata *dd, u64 *swords, + u64 *rwords, u64 *spkts, u64 *rpkts, + u64 *xmit_wait) +{ + int ret; + + if (!(dd->ipath_flags & IPATH_INITTED)) { + /* no hardware, freeze, etc. */ + ret = -EINVAL; + goto bail; + } + *swords = ipath_snap_cntr(dd, dd->ipath_cregs->cr_wordsendcnt); + *rwords = ipath_snap_cntr(dd, dd->ipath_cregs->cr_wordrcvcnt); + *spkts = ipath_snap_cntr(dd, dd->ipath_cregs->cr_pktsendcnt); + *rpkts = ipath_snap_cntr(dd, dd->ipath_cregs->cr_pktrcvcnt); + *xmit_wait = ipath_snap_cntr(dd, dd->ipath_cregs->cr_sendstallcnt); + + ret = 0; + +bail: + return ret; +} + +/** + * ipath_get_counters - get various chip counters + * @dd: the infinipath device + * @cntrs: counters are placed here + * + * Return the counters needed by recv_pma_get_portcounters(). 
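+ * Counters that a given chip revision lacks are synthesized from the
+ * software error counts kept in the ipath_devdata.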
+ */ +int ipath_get_counters(struct ipath_devdata *dd, + struct ipath_verbs_counters *cntrs) +{ + struct ipath_cregs const *crp = dd->ipath_cregs; + int ret; + + if (!(dd->ipath_flags & IPATH_INITTED)) { + /* no hardware, freeze, etc. */ + ret = -EINVAL; + goto bail; + } + cntrs->symbol_error_counter = + ipath_snap_cntr(dd, crp->cr_ibsymbolerrcnt); + cntrs->link_error_recovery_counter = + ipath_snap_cntr(dd, crp->cr_iblinkerrrecovcnt); + /* + * The link downed counter counts when the other side downs the + * connection. We add in the number of times we downed the link + * due to local link integrity errors to compensate. + */ + cntrs->link_downed_counter = + ipath_snap_cntr(dd, crp->cr_iblinkdowncnt); + cntrs->port_rcv_errors = + ipath_snap_cntr(dd, crp->cr_rxdroppktcnt) + + ipath_snap_cntr(dd, crp->cr_rcvovflcnt) + + ipath_snap_cntr(dd, crp->cr_portovflcnt) + + ipath_snap_cntr(dd, crp->cr_err_rlencnt) + + ipath_snap_cntr(dd, crp->cr_invalidrlencnt) + + ipath_snap_cntr(dd, crp->cr_errlinkcnt) + + ipath_snap_cntr(dd, crp->cr_erricrccnt) + + ipath_snap_cntr(dd, crp->cr_errvcrccnt) + + ipath_snap_cntr(dd, crp->cr_errlpcrccnt) + + ipath_snap_cntr(dd, crp->cr_badformatcnt) + + dd->ipath_rxfc_unsupvl_errs; + if (crp->cr_rxotherlocalphyerrcnt) + cntrs->port_rcv_errors += + ipath_snap_cntr(dd, crp->cr_rxotherlocalphyerrcnt); + if (crp->cr_rxvlerrcnt) + cntrs->port_rcv_errors += + ipath_snap_cntr(dd, crp->cr_rxvlerrcnt); + cntrs->port_rcv_remphys_errors = + ipath_snap_cntr(dd, crp->cr_rcvebpcnt); + cntrs->port_xmit_discards = ipath_snap_cntr(dd, crp->cr_unsupvlcnt); + cntrs->port_xmit_data = ipath_snap_cntr(dd, crp->cr_wordsendcnt); + cntrs->port_rcv_data = ipath_snap_cntr(dd, crp->cr_wordrcvcnt); + cntrs->port_xmit_packets = ipath_snap_cntr(dd, crp->cr_pktsendcnt); + cntrs->port_rcv_packets = ipath_snap_cntr(dd, crp->cr_pktrcvcnt); + cntrs->local_link_integrity_errors = + crp->cr_locallinkintegrityerrcnt ? + ipath_snap_cntr(dd, crp->cr_locallinkintegrityerrcnt) : + ((dd->ipath_flags & IPATH_GPIO_ERRINTRS) ? + dd->ipath_lli_errs : dd->ipath_lli_errors); + cntrs->excessive_buffer_overrun_errors = + crp->cr_excessbufferovflcnt ? + ipath_snap_cntr(dd, crp->cr_excessbufferovflcnt) : + dd->ipath_overrun_thresh_errs; + cntrs->vl15_dropped = crp->cr_vl15droppedpktcnt ? + ipath_snap_cntr(dd, crp->cr_vl15droppedpktcnt) : 0; + + ret = 0; + +bail: + return ret; +} + +/** + * ipath_ib_piobufavail - callback when a PIO buffer is available + * @arg: the device pointer + * + * This is called from ipath_intr() at interrupt level when a PIO buffer is + * available after ipath_verbs_send() returned an error that no buffers were + * available. Return 1 if we consumed all the PIO buffers and we still have + * QPs waiting for buffers (for now, just restart the send tasklet and + * return zero). 
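+ * QPs are unlinked from the piowait list under pending_lock with a
+ * reference held, then rescheduled outside the lock.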
+ */ +int ipath_ib_piobufavail(struct ipath_ibdev *dev) +{ + struct list_head *list; + struct ipath_qp *qplist; + struct ipath_qp *qp; + unsigned long flags; + + if (dev == NULL) + goto bail; + + list = &dev->piowait; + qplist = NULL; + + spin_lock_irqsave(&dev->pending_lock, flags); + while (!list_empty(list)) { + qp = list_entry(list->next, struct ipath_qp, piowait); + list_del_init(&qp->piowait); + qp->pio_next = qplist; + qplist = qp; + atomic_inc(&qp->refcount); + } + spin_unlock_irqrestore(&dev->pending_lock, flags); + + while (qplist != NULL) { + qp = qplist; + qplist = qp->pio_next; + + spin_lock_irqsave(&qp->s_lock, flags); + if (ib_ipath_state_ops[qp->state] & IPATH_PROCESS_SEND_OK) + ipath_schedule_send(qp); + spin_unlock_irqrestore(&qp->s_lock, flags); + + /* Notify ipath_destroy_qp() if it is waiting. */ + if (atomic_dec_and_test(&qp->refcount)) + wake_up(&qp->wait); + } + +bail: + return 0; +} + +static int ipath_query_device(struct ib_device *ibdev, + struct ib_device_attr *props) +{ + struct ipath_ibdev *dev = to_idev(ibdev); + + memset(props, 0, sizeof(*props)); + + props->device_cap_flags = IB_DEVICE_BAD_PKEY_CNTR | + IB_DEVICE_BAD_QKEY_CNTR | IB_DEVICE_SHUTDOWN_PORT | + IB_DEVICE_SYS_IMAGE_GUID | IB_DEVICE_RC_RNR_NAK_GEN | + IB_DEVICE_PORT_ACTIVE_EVENT | IB_DEVICE_SRQ_RESIZE; + props->page_size_cap = PAGE_SIZE; + props->vendor_id = + IPATH_SRC_OUI_1 << 16 | IPATH_SRC_OUI_2 << 8 | IPATH_SRC_OUI_3; + props->vendor_part_id = dev->dd->ipath_deviceid; + props->hw_ver = dev->dd->ipath_pcirev; + + props->sys_image_guid = dev->sys_image_guid; + + props->max_mr_size = ~0ull; + props->max_qp = ib_ipath_max_qps; + props->max_qp_wr = ib_ipath_max_qp_wrs; + props->max_sge = ib_ipath_max_sges; + props->max_cq = ib_ipath_max_cqs; + props->max_ah = ib_ipath_max_ahs; + props->max_cqe = ib_ipath_max_cqes; + props->max_mr = dev->lk_table.max; + props->max_fmr = dev->lk_table.max; + props->max_map_per_fmr = 32767; + props->max_pd = ib_ipath_max_pds; + props->max_qp_rd_atom = IPATH_MAX_RDMA_ATOMIC; + props->max_qp_init_rd_atom = 255; + /* props->max_res_rd_atom */ + props->max_srq = ib_ipath_max_srqs; + props->max_srq_wr = ib_ipath_max_srq_wrs; + props->max_srq_sge = ib_ipath_max_srq_sges; + /* props->local_ca_ack_delay */ + props->atomic_cap = IB_ATOMIC_GLOB; + props->max_pkeys = ipath_get_npkeys(dev->dd); + props->max_mcast_grp = ib_ipath_max_mcast_grps; + props->max_mcast_qp_attach = ib_ipath_max_mcast_qp_attached; + props->max_total_mcast_qp_attach = props->max_mcast_qp_attach * + props->max_mcast_grp; + + return 0; +} + +const u8 ipath_cvt_physportstate[32] = { + [INFINIPATH_IBCS_LT_STATE_DISABLED] = IB_PHYSPORTSTATE_DISABLED, + [INFINIPATH_IBCS_LT_STATE_LINKUP] = IB_PHYSPORTSTATE_LINKUP, + [INFINIPATH_IBCS_LT_STATE_POLLACTIVE] = IB_PHYSPORTSTATE_POLL, + [INFINIPATH_IBCS_LT_STATE_POLLQUIET] = IB_PHYSPORTSTATE_POLL, + [INFINIPATH_IBCS_LT_STATE_SLEEPDELAY] = IB_PHYSPORTSTATE_SLEEP, + [INFINIPATH_IBCS_LT_STATE_SLEEPQUIET] = IB_PHYSPORTSTATE_SLEEP, + [INFINIPATH_IBCS_LT_STATE_CFGDEBOUNCE] = + IB_PHYSPORTSTATE_CFG_TRAIN, + [INFINIPATH_IBCS_LT_STATE_CFGRCVFCFG] = + IB_PHYSPORTSTATE_CFG_TRAIN, + [INFINIPATH_IBCS_LT_STATE_CFGWAITRMT] = + IB_PHYSPORTSTATE_CFG_TRAIN, + [INFINIPATH_IBCS_LT_STATE_CFGIDLE] = IB_PHYSPORTSTATE_CFG_TRAIN, + [INFINIPATH_IBCS_LT_STATE_RECOVERRETRAIN] = + IB_PHYSPORTSTATE_LINK_ERR_RECOVER, + [INFINIPATH_IBCS_LT_STATE_RECOVERWAITRMT] = + IB_PHYSPORTSTATE_LINK_ERR_RECOVER, + [INFINIPATH_IBCS_LT_STATE_RECOVERIDLE] = + IB_PHYSPORTSTATE_LINK_ERR_RECOVER, + [0x10] = 
IB_PHYSPORTSTATE_CFG_TRAIN, + [0x11] = IB_PHYSPORTSTATE_CFG_TRAIN, + [0x12] = IB_PHYSPORTSTATE_CFG_TRAIN, + [0x13] = IB_PHYSPORTSTATE_CFG_TRAIN, + [0x14] = IB_PHYSPORTSTATE_CFG_TRAIN, + [0x15] = IB_PHYSPORTSTATE_CFG_TRAIN, + [0x16] = IB_PHYSPORTSTATE_CFG_TRAIN, + [0x17] = IB_PHYSPORTSTATE_CFG_TRAIN +}; + +u32 ipath_get_cr_errpkey(struct ipath_devdata *dd) +{ + return ipath_read_creg32(dd, dd->ipath_cregs->cr_errpkey); +} + +static int ipath_query_port(struct ib_device *ibdev, + u8 port, struct ib_port_attr *props) +{ + struct ipath_ibdev *dev = to_idev(ibdev); + struct ipath_devdata *dd = dev->dd; + enum ib_mtu mtu; + u16 lid = dd->ipath_lid; + u64 ibcstat; + + memset(props, 0, sizeof(*props)); + props->lid = lid ? lid : be16_to_cpu(IB_LID_PERMISSIVE); + props->lmc = dd->ipath_lmc; + props->sm_lid = dev->sm_lid; + props->sm_sl = dev->sm_sl; + ibcstat = dd->ipath_lastibcstat; + /* map LinkState to IB portinfo values. */ + props->state = ipath_ib_linkstate(dd, ibcstat) + 1; + + /* See phys_state_show() */ + props->phys_state = /* MEA: assumes shift == 0 */ + ipath_cvt_physportstate[dd->ipath_lastibcstat & + dd->ibcs_lts_mask]; + props->port_cap_flags = dev->port_cap_flags; + props->gid_tbl_len = 1; + props->max_msg_sz = 0x80000000; + props->pkey_tbl_len = ipath_get_npkeys(dd); + props->bad_pkey_cntr = ipath_get_cr_errpkey(dd) - + dev->z_pkey_violations; + props->qkey_viol_cntr = dev->qkey_violations; + props->active_width = dd->ipath_link_width_active; + /* See rate_show() */ + props->active_speed = dd->ipath_link_speed_active; + props->max_vl_num = 1; /* VLCap = VL0 */ + props->init_type_reply = 0; + + props->max_mtu = ipath_mtu4096 ? IB_MTU_4096 : IB_MTU_2048; + switch (dd->ipath_ibmtu) { + case 4096: + mtu = IB_MTU_4096; + break; + case 2048: + mtu = IB_MTU_2048; + break; + case 1024: + mtu = IB_MTU_1024; + break; + case 512: + mtu = IB_MTU_512; + break; + case 256: + mtu = IB_MTU_256; + break; + default: + mtu = IB_MTU_2048; + } + props->active_mtu = mtu; + props->subnet_timeout = dev->subnet_timeout; + + return 0; +} + +static int ipath_modify_device(struct ib_device *device, + int device_modify_mask, + struct ib_device_modify *device_modify) +{ + int ret; + + if (device_modify_mask & ~(IB_DEVICE_MODIFY_SYS_IMAGE_GUID | + IB_DEVICE_MODIFY_NODE_DESC)) { + ret = -EOPNOTSUPP; + goto bail; + } + + if (device_modify_mask & IB_DEVICE_MODIFY_NODE_DESC) + memcpy(device->node_desc, device_modify->node_desc, 64); + + if (device_modify_mask & IB_DEVICE_MODIFY_SYS_IMAGE_GUID) + to_idev(device)->sys_image_guid = + cpu_to_be64(device_modify->sys_image_guid); + + ret = 0; + +bail: + return ret; +} + +static int ipath_modify_port(struct ib_device *ibdev, + u8 port, int port_modify_mask, + struct ib_port_modify *props) +{ + struct ipath_ibdev *dev = to_idev(ibdev); + + dev->port_cap_flags |= props->set_port_cap_mask; + dev->port_cap_flags &= ~props->clr_port_cap_mask; + if (port_modify_mask & IB_PORT_SHUTDOWN) + ipath_set_linkstate(dev->dd, IPATH_IB_LINKDOWN); + if (port_modify_mask & IB_PORT_RESET_QKEY_CNTR) + dev->qkey_violations = 0; + return 0; +} + +static int ipath_query_gid(struct ib_device *ibdev, u8 port, + int index, union ib_gid *gid) +{ + struct ipath_ibdev *dev = to_idev(ibdev); + int ret; + + if (index >= 1) { + ret = -EINVAL; + goto bail; + } + gid->global.subnet_prefix = dev->gid_prefix; + gid->global.interface_id = dev->dd->ipath_guid; + + ret = 0; + +bail: + return ret; +} + +static struct ib_pd *ipath_alloc_pd(struct ib_device *ibdev, + struct ib_ucontext *context, + struct ib_udata 
*udata) +{ + struct ipath_ibdev *dev = to_idev(ibdev); + struct ipath_pd *pd; + struct ib_pd *ret; + + /* + * This is actually totally arbitrary. Some correctness tests + * assume there's a maximum number of PDs that can be allocated. + * We don't actually have this limit, but we fail the test if + * we allow allocations of more than we report for this value. + */ + + pd = kmalloc(sizeof *pd, GFP_KERNEL); + if (!pd) { + ret = ERR_PTR(-ENOMEM); + goto bail; + } + + spin_lock(&dev->n_pds_lock); + if (dev->n_pds_allocated == ib_ipath_max_pds) { + spin_unlock(&dev->n_pds_lock); + kfree(pd); + ret = ERR_PTR(-ENOMEM); + goto bail; + } + + dev->n_pds_allocated++; + spin_unlock(&dev->n_pds_lock); + + /* ib_alloc_pd() will initialize pd->ibpd. */ + pd->user = udata != NULL; + + ret = &pd->ibpd; + +bail: + return ret; +} + +static int ipath_dealloc_pd(struct ib_pd *ibpd) +{ + struct ipath_pd *pd = to_ipd(ibpd); + struct ipath_ibdev *dev = to_idev(ibpd->device); + + spin_lock(&dev->n_pds_lock); + dev->n_pds_allocated--; + spin_unlock(&dev->n_pds_lock); + + kfree(pd); + + return 0; +} + +/** + * ipath_create_ah - create an address handle + * @pd: the protection domain + * @ah_attr: the attributes of the AH + * + * This may be called from interrupt context. + */ +static struct ib_ah *ipath_create_ah(struct ib_pd *pd, + struct ib_ah_attr *ah_attr) +{ + struct ipath_ah *ah; + struct ib_ah *ret; + struct ipath_ibdev *dev = to_idev(pd->device); + unsigned long flags; + + /* A multicast address requires a GRH (see ch. 8.4.1). */ + if (ah_attr->dlid >= IPATH_MULTICAST_LID_BASE && + ah_attr->dlid != IPATH_PERMISSIVE_LID && + !(ah_attr->ah_flags & IB_AH_GRH)) { + ret = ERR_PTR(-EINVAL); + goto bail; + } + + if (ah_attr->dlid == 0) { + ret = ERR_PTR(-EINVAL); + goto bail; + } + + if (ah_attr->port_num < 1 || + ah_attr->port_num > pd->device->phys_port_cnt) { + ret = ERR_PTR(-EINVAL); + goto bail; + } + + ah = kmalloc(sizeof *ah, GFP_ATOMIC); + if (!ah) { + ret = ERR_PTR(-ENOMEM); + goto bail; + } + + spin_lock_irqsave(&dev->n_ahs_lock, flags); + if (dev->n_ahs_allocated == ib_ipath_max_ahs) { + spin_unlock_irqrestore(&dev->n_ahs_lock, flags); + kfree(ah); + ret = ERR_PTR(-ENOMEM); + goto bail; + } + + dev->n_ahs_allocated++; + spin_unlock_irqrestore(&dev->n_ahs_lock, flags); + + /* ib_create_ah() will initialize ah->ibah. */ + ah->attr = *ah_attr; + ah->attr.static_rate = ipath_ib_rate_to_mult(ah_attr->static_rate); + + ret = &ah->ibah; + +bail: + return ret; +} + +/** + * ipath_destroy_ah - destroy an address handle + * @ibah: the AH to destroy + * + * This may be called from interrupt context. 
+ */ +static int ipath_destroy_ah(struct ib_ah *ibah) +{ + struct ipath_ibdev *dev = to_idev(ibah->device); + struct ipath_ah *ah = to_iah(ibah); + unsigned long flags; + + spin_lock_irqsave(&dev->n_ahs_lock, flags); + dev->n_ahs_allocated--; + spin_unlock_irqrestore(&dev->n_ahs_lock, flags); + + kfree(ah); + + return 0; +} + +static int ipath_query_ah(struct ib_ah *ibah, struct ib_ah_attr *ah_attr) +{ + struct ipath_ah *ah = to_iah(ibah); + + *ah_attr = ah->attr; + ah_attr->static_rate = ipath_mult_to_ib_rate(ah->attr.static_rate); + + return 0; +} + +/** + * ipath_get_npkeys - return the size of the PKEY table for port 0 + * @dd: the infinipath device + */ +unsigned ipath_get_npkeys(struct ipath_devdata *dd) +{ + return ARRAY_SIZE(dd->ipath_pd[0]->port_pkeys); +} + +/** + * ipath_get_pkey - return the indexed PKEY from the port PKEY table + * @dd: the infinipath device + * @index: the PKEY index + */ +unsigned ipath_get_pkey(struct ipath_devdata *dd, unsigned index) +{ + unsigned ret; + + /* always a kernel port, no locking needed */ + if (index >= ARRAY_SIZE(dd->ipath_pd[0]->port_pkeys)) + ret = 0; + else + ret = dd->ipath_pd[0]->port_pkeys[index]; + + return ret; +} + +static int ipath_query_pkey(struct ib_device *ibdev, u8 port, u16 index, + u16 *pkey) +{ + struct ipath_ibdev *dev = to_idev(ibdev); + int ret; + + if (index >= ipath_get_npkeys(dev->dd)) { + ret = -EINVAL; + goto bail; + } + + *pkey = ipath_get_pkey(dev->dd, index); + ret = 0; + +bail: + return ret; +} + +/** + * ipath_alloc_ucontext - allocate a ucontest + * @ibdev: the infiniband device + * @udata: not used by the InfiniPath driver + */ + +static struct ib_ucontext *ipath_alloc_ucontext(struct ib_device *ibdev, + struct ib_udata *udata) +{ + struct ipath_ucontext *context; + struct ib_ucontext *ret; + + context = kmalloc(sizeof *context, GFP_KERNEL); + if (!context) { + ret = ERR_PTR(-ENOMEM); + goto bail; + } + + ret = &context->ibucontext; + +bail: + return ret; +} + +static int ipath_dealloc_ucontext(struct ib_ucontext *context) +{ + kfree(to_iucontext(context)); + return 0; +} + +static int ipath_verbs_register_sysfs(struct ib_device *dev); + +static void __verbs_timer(unsigned long arg) +{ + struct ipath_devdata *dd = (struct ipath_devdata *) arg; + + /* Handle verbs layer timeouts. */ + ipath_ib_timer(dd->verbs_dev); + + mod_timer(&dd->verbs_timer, jiffies + 1); +} + +static int enable_timer(struct ipath_devdata *dd) +{ + /* + * Early chips had a design flaw where the chip and kernel idea + * of the tail register don't always agree, and therefore we won't + * get an interrupt on the next packet received. + * If the board supports per packet receive interrupts, use it. + * Otherwise, the timer function periodically checks for packets + * to cover this case. + * Either way, the timer is needed for verbs layer related + * processing. 
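+ * (The timer is armed to fire every jiffy; __verbs_timer() re-arms itself
+ * and calls ipath_ib_timer() for retransmit and PMA bookkeeping.)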
+ */ + if (dd->ipath_flags & IPATH_GPIO_INTR) { + ipath_write_kreg(dd, dd->ipath_kregs->kr_debugportselect, + 0x2074076542310ULL); + /* Enable GPIO bit 2 interrupt */ + dd->ipath_gpio_mask |= (u64) (1 << IPATH_GPIO_PORT0_BIT); + ipath_write_kreg(dd, dd->ipath_kregs->kr_gpio_mask, + dd->ipath_gpio_mask); + } + + init_timer(&dd->verbs_timer); + dd->verbs_timer.function = __verbs_timer; + dd->verbs_timer.data = (unsigned long)dd; + dd->verbs_timer.expires = jiffies + 1; + add_timer(&dd->verbs_timer); + + return 0; +} + +static int disable_timer(struct ipath_devdata *dd) +{ + /* Disable GPIO bit 2 interrupt */ + if (dd->ipath_flags & IPATH_GPIO_INTR) { + /* Disable GPIO bit 2 interrupt */ + dd->ipath_gpio_mask &= ~((u64) (1 << IPATH_GPIO_PORT0_BIT)); + ipath_write_kreg(dd, dd->ipath_kregs->kr_gpio_mask, + dd->ipath_gpio_mask); + /* + * We might want to undo changes to debugportselect, + * but how? + */ + } + + del_timer_sync(&dd->verbs_timer); + + return 0; +} + +/** + * ipath_register_ib_device - register our device with the infiniband core + * @dd: the device data structure + * Return the allocated ipath_ibdev pointer or NULL on error. + */ +int ipath_register_ib_device(struct ipath_devdata *dd) +{ + struct ipath_verbs_counters cntrs; + struct ipath_ibdev *idev; + struct ib_device *dev; + struct ipath_verbs_txreq *tx; + unsigned i; + int ret; + + idev = (struct ipath_ibdev *)ib_alloc_device(sizeof *idev); + if (idev == NULL) { + ret = -ENOMEM; + goto bail; + } + + dev = &idev->ibdev; + + if (dd->ipath_sdma_descq_cnt) { + tx = kmalloc(dd->ipath_sdma_descq_cnt * sizeof *tx, + GFP_KERNEL); + if (tx == NULL) { + ret = -ENOMEM; + goto err_tx; + } + } else + tx = NULL; + idev->txreq_bufs = tx; + + /* Only need to initialize non-zero fields. */ + spin_lock_init(&idev->n_pds_lock); + spin_lock_init(&idev->n_ahs_lock); + spin_lock_init(&idev->n_cqs_lock); + spin_lock_init(&idev->n_qps_lock); + spin_lock_init(&idev->n_srqs_lock); + spin_lock_init(&idev->n_mcast_grps_lock); + + spin_lock_init(&idev->qp_table.lock); + spin_lock_init(&idev->lk_table.lock); + idev->sm_lid = __constant_be16_to_cpu(IB_LID_PERMISSIVE); + /* Set the prefix to the default value (see ch. 4.1.1) */ + idev->gid_prefix = __constant_cpu_to_be64(0xfe80000000000000ULL); + + ret = ipath_init_qp_table(idev, ib_ipath_qp_table_size); + if (ret) + goto err_qp; + + /* + * The top ib_ipath_lkey_table_size bits are used to index the + * table. The lower 8 bits can be owned by the user (copied from + * the LKEY). The remaining bits act as a generation number or tag. 
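+ * The table therefore holds 1 << ib_ipath_lkey_table_size entries,
+ * allocated zeroed just below.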
+ */ + idev->lk_table.max = 1 << ib_ipath_lkey_table_size; + idev->lk_table.table = kzalloc(idev->lk_table.max * + sizeof(*idev->lk_table.table), + GFP_KERNEL); + if (idev->lk_table.table == NULL) { + ret = -ENOMEM; + goto err_lk; + } + INIT_LIST_HEAD(&idev->pending_mmaps); + spin_lock_init(&idev->pending_lock); + idev->mmap_offset = PAGE_SIZE; + spin_lock_init(&idev->mmap_offset_lock); + INIT_LIST_HEAD(&idev->pending[0]); + INIT_LIST_HEAD(&idev->pending[1]); + INIT_LIST_HEAD(&idev->pending[2]); + INIT_LIST_HEAD(&idev->piowait); + INIT_LIST_HEAD(&idev->rnrwait); + INIT_LIST_HEAD(&idev->txreq_free); + idev->pending_index = 0; + idev->port_cap_flags = + IB_PORT_SYS_IMAGE_GUID_SUP | IB_PORT_CLIENT_REG_SUP; + if (dd->ipath_flags & IPATH_HAS_LINK_LATENCY) + idev->port_cap_flags |= IB_PORT_LINK_LATENCY_SUP; + idev->pma_counter_select[0] = IB_PMA_PORT_XMIT_DATA; + idev->pma_counter_select[1] = IB_PMA_PORT_RCV_DATA; + idev->pma_counter_select[2] = IB_PMA_PORT_XMIT_PKTS; + idev->pma_counter_select[3] = IB_PMA_PORT_RCV_PKTS; + idev->pma_counter_select[4] = IB_PMA_PORT_XMIT_WAIT; + + /* Snapshot current HW counters to "clear" them. */ + ipath_get_counters(dd, &cntrs); + idev->z_symbol_error_counter = cntrs.symbol_error_counter; + idev->z_link_error_recovery_counter = + cntrs.link_error_recovery_counter; + idev->z_link_downed_counter = cntrs.link_downed_counter; + idev->z_port_rcv_errors = cntrs.port_rcv_errors; + idev->z_port_rcv_remphys_errors = + cntrs.port_rcv_remphys_errors; + idev->z_port_xmit_discards = cntrs.port_xmit_discards; + idev->z_port_xmit_data = cntrs.port_xmit_data; + idev->z_port_rcv_data = cntrs.port_rcv_data; + idev->z_port_xmit_packets = cntrs.port_xmit_packets; + idev->z_port_rcv_packets = cntrs.port_rcv_packets; + idev->z_local_link_integrity_errors = + cntrs.local_link_integrity_errors; + idev->z_excessive_buffer_overrun_errors = + cntrs.excessive_buffer_overrun_errors; + idev->z_vl15_dropped = cntrs.vl15_dropped; + + for (i = 0; i < dd->ipath_sdma_descq_cnt; i++, tx++) + list_add(&tx->txreq.list, &idev->txreq_free); + + /* + * The system image GUID is supposed to be the same for all + * IB HCAs in a single system but since there can be other + * device types in the system, we can't be sure this is unique. 
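+ * The first ipath device to register donates its GUID; any later devices
+ * simply reuse that value.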
+ */ + if (!sys_image_guid) + sys_image_guid = dd->ipath_guid; + idev->sys_image_guid = sys_image_guid; + idev->ib_unit = dd->ipath_unit; + idev->dd = dd; + + strlcpy(dev->name, "ipath%d", IB_DEVICE_NAME_MAX); + dev->owner = THIS_MODULE; + dev->node_guid = dd->ipath_guid; + dev->uverbs_abi_ver = IPATH_UVERBS_ABI_VERSION; + dev->uverbs_cmd_mask = + (1ull << IB_USER_VERBS_CMD_GET_CONTEXT) | + (1ull << IB_USER_VERBS_CMD_QUERY_DEVICE) | + (1ull << IB_USER_VERBS_CMD_QUERY_PORT) | + (1ull << IB_USER_VERBS_CMD_ALLOC_PD) | + (1ull << IB_USER_VERBS_CMD_DEALLOC_PD) | + (1ull << IB_USER_VERBS_CMD_CREATE_AH) | + (1ull << IB_USER_VERBS_CMD_DESTROY_AH) | + (1ull << IB_USER_VERBS_CMD_QUERY_AH) | + (1ull << IB_USER_VERBS_CMD_REG_MR) | + (1ull << IB_USER_VERBS_CMD_DEREG_MR) | + (1ull << IB_USER_VERBS_CMD_CREATE_COMP_CHANNEL) | + (1ull << IB_USER_VERBS_CMD_CREATE_CQ) | + (1ull << IB_USER_VERBS_CMD_RESIZE_CQ) | + (1ull << IB_USER_VERBS_CMD_DESTROY_CQ) | + (1ull << IB_USER_VERBS_CMD_POLL_CQ) | + (1ull << IB_USER_VERBS_CMD_REQ_NOTIFY_CQ) | + (1ull << IB_USER_VERBS_CMD_CREATE_QP) | + (1ull << IB_USER_VERBS_CMD_QUERY_QP) | + (1ull << IB_USER_VERBS_CMD_MODIFY_QP) | + (1ull << IB_USER_VERBS_CMD_DESTROY_QP) | + (1ull << IB_USER_VERBS_CMD_POST_SEND) | + (1ull << IB_USER_VERBS_CMD_POST_RECV) | + (1ull << IB_USER_VERBS_CMD_ATTACH_MCAST) | + (1ull << IB_USER_VERBS_CMD_DETACH_MCAST) | + (1ull << IB_USER_VERBS_CMD_CREATE_SRQ) | + (1ull << IB_USER_VERBS_CMD_MODIFY_SRQ) | + (1ull << IB_USER_VERBS_CMD_QUERY_SRQ) | + (1ull << IB_USER_VERBS_CMD_DESTROY_SRQ) | + (1ull << IB_USER_VERBS_CMD_POST_SRQ_RECV); + dev->node_type = RDMA_NODE_IB_CA; + dev->phys_port_cnt = 1; + dev->num_comp_vectors = 1; + dev->dma_device = &dd->pcidev->dev; + dev->query_device = ipath_query_device; + dev->modify_device = ipath_modify_device; + dev->query_port = ipath_query_port; + dev->modify_port = ipath_modify_port; + dev->query_pkey = ipath_query_pkey; + dev->query_gid = ipath_query_gid; + dev->alloc_ucontext = ipath_alloc_ucontext; + dev->dealloc_ucontext = ipath_dealloc_ucontext; + dev->alloc_pd = ipath_alloc_pd; + dev->dealloc_pd = ipath_dealloc_pd; + dev->create_ah = ipath_create_ah; + dev->destroy_ah = ipath_destroy_ah; + dev->query_ah = ipath_query_ah; + dev->create_srq = ipath_create_srq; + dev->modify_srq = ipath_modify_srq; + dev->query_srq = ipath_query_srq; + dev->destroy_srq = ipath_destroy_srq; + dev->create_qp = ipath_create_qp; + dev->modify_qp = ipath_modify_qp; + dev->query_qp = ipath_query_qp; + dev->destroy_qp = ipath_destroy_qp; + dev->post_send = ipath_post_send; + dev->post_recv = ipath_post_receive; + dev->post_srq_recv = ipath_post_srq_receive; + dev->create_cq = ipath_create_cq; + dev->destroy_cq = ipath_destroy_cq; + dev->resize_cq = ipath_resize_cq; + dev->poll_cq = ipath_poll_cq; + dev->req_notify_cq = ipath_req_notify_cq; + dev->get_dma_mr = ipath_get_dma_mr; + dev->reg_phys_mr = ipath_reg_phys_mr; + dev->reg_user_mr = ipath_reg_user_mr; + dev->dereg_mr = ipath_dereg_mr; + dev->alloc_fmr = ipath_alloc_fmr; + dev->map_phys_fmr = ipath_map_phys_fmr; + dev->unmap_fmr = ipath_unmap_fmr; + dev->dealloc_fmr = ipath_dealloc_fmr; + dev->attach_mcast = ipath_multicast_attach; + dev->detach_mcast = ipath_multicast_detach; + dev->process_mad = ipath_process_mad; + dev->mmap = ipath_mmap; + dev->dma_ops = &ipath_dma_mapping_ops; + + snprintf(dev->node_desc, sizeof(dev->node_desc), + IPATH_IDSTR " %s", init_utsname()->nodename); + + ret = ib_register_device(dev, NULL); + if (ret) + goto err_reg; + + ret = 
ipath_verbs_register_sysfs(dev); + if (ret) + goto err_class; + + enable_timer(dd); + + goto bail; + +err_class: + ib_unregister_device(dev); +err_reg: + kfree(idev->lk_table.table); +err_lk: + kfree(idev->qp_table.table); +err_qp: + kfree(idev->txreq_bufs); +err_tx: + ib_dealloc_device(dev); + ipath_dev_err(dd, "cannot register verbs: %d!\n", -ret); + idev = NULL; + +bail: + dd->verbs_dev = idev; + return ret; +} + +void ipath_unregister_ib_device(struct ipath_ibdev *dev) +{ + struct ib_device *ibdev = &dev->ibdev; + u32 qps_inuse; + + ib_unregister_device(ibdev); + + disable_timer(dev->dd); + + if (!list_empty(&dev->pending[0]) || + !list_empty(&dev->pending[1]) || + !list_empty(&dev->pending[2])) + ipath_dev_err(dev->dd, "pending list not empty!\n"); + if (!list_empty(&dev->piowait)) + ipath_dev_err(dev->dd, "piowait list not empty!\n"); + if (!list_empty(&dev->rnrwait)) + ipath_dev_err(dev->dd, "rnrwait list not empty!\n"); + if (!ipath_mcast_tree_empty()) + ipath_dev_err(dev->dd, "multicast table memory leak!\n"); + /* + * Note that ipath_unregister_ib_device() can be called before all + * the QPs are destroyed! + */ + qps_inuse = ipath_free_all_qps(&dev->qp_table); + if (qps_inuse) + ipath_dev_err(dev->dd, "QP memory leak! %u still in use\n", + qps_inuse); + kfree(dev->qp_table.table); + kfree(dev->lk_table.table); + kfree(dev->txreq_bufs); + ib_dealloc_device(ibdev); +} + +static ssize_t show_rev(struct device *device, struct device_attribute *attr, + char *buf) +{ + struct ipath_ibdev *dev = + container_of(device, struct ipath_ibdev, ibdev.dev); + + return sprintf(buf, "%x\n", dev->dd->ipath_pcirev); +} + +static ssize_t show_hca(struct device *device, struct device_attribute *attr, + char *buf) +{ + struct ipath_ibdev *dev = + container_of(device, struct ipath_ibdev, ibdev.dev); + int ret; + + ret = dev->dd->ipath_f_get_boardname(dev->dd, buf, 128); + if (ret < 0) + goto bail; + strcat(buf, "\n"); + ret = strlen(buf); + +bail: + return ret; +} + +static ssize_t show_stats(struct device *device, struct device_attribute *attr, + char *buf) +{ + struct ipath_ibdev *dev = + container_of(device, struct ipath_ibdev, ibdev.dev); + int i; + int len; + + len = sprintf(buf, + "RC resends %d\n" + "RC no QACK %d\n" + "RC ACKs %d\n" + "RC SEQ NAKs %d\n" + "RC RDMA seq %d\n" + "RC RNR NAKs %d\n" + "RC OTH NAKs %d\n" + "RC timeouts %d\n" + "RC RDMA dup %d\n" + "piobuf wait %d\n" + "unaligned %d\n" + "PKT drops %d\n" + "WQE errs %d\n", + dev->n_rc_resends, dev->n_rc_qacks, dev->n_rc_acks, + dev->n_seq_naks, dev->n_rdma_seq, dev->n_rnr_naks, + dev->n_other_naks, dev->n_timeouts, + dev->n_rdma_dup_busy, dev->n_piowait, dev->n_unaligned, + dev->n_pkt_drops, dev->n_wqe_errs); + for (i = 0; i < ARRAY_SIZE(dev->opstats); i++) { + const struct ipath_opcode_stats *si = &dev->opstats[i]; + + if (!si->n_packets && !si->n_bytes) + continue; + len += sprintf(buf + len, "%02x %llu/%llu\n", i, + (unsigned long long) si->n_packets, + (unsigned long long) si->n_bytes); + } + return len; +} + +static DEVICE_ATTR(hw_rev, S_IRUGO, show_rev, NULL); +static DEVICE_ATTR(hca_type, S_IRUGO, show_hca, NULL); +static DEVICE_ATTR(board_id, S_IRUGO, show_hca, NULL); +static DEVICE_ATTR(stats, S_IRUGO, show_stats, NULL); + +static struct device_attribute *ipath_class_attributes[] = { + &dev_attr_hw_rev, + &dev_attr_hca_type, + &dev_attr_board_id, + &dev_attr_stats +}; + +static int ipath_verbs_register_sysfs(struct ib_device *dev) +{ + int i; + int ret; + + for (i = 0; i < ARRAY_SIZE(ipath_class_attributes); ++i) { + ret = 
device_create_file(&dev->dev,
+ ipath_class_attributes[i]);
+ if (ret)
+ goto bail;
+ }
+ return 0;
+bail:
+ for (i = 0; i < ARRAY_SIZE(ipath_class_attributes); ++i)
+ device_remove_file(&dev->dev, ipath_class_attributes[i]);
+ return ret;
+}
diff --git a/kernel/drivers/infiniband/hw/ipath/ipath_verbs.h b/kernel/drivers/infiniband/hw/ipath/ipath_verbs.h
new file mode 100644
index 000000000..ae6cff4ab
--- /dev/null
+++ b/kernel/drivers/infiniband/hw/ipath/ipath_verbs.h
@@ -0,0 +1,936 @@
+/*
+ * Copyright (c) 2006, 2007, 2008 QLogic Corporation. All rights reserved.
+ * Copyright (c) 2005, 2006 PathScale, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef IPATH_VERBS_H
+#define IPATH_VERBS_H
+
+#include <linux/types.h>
+#include <linux/spinlock.h>
+#include <linux/kernel.h>
+#include <linux/interrupt.h>
+#include <linux/kref.h>
+#include <rdma/ib_pack.h>
+#include <rdma/ib_user_verbs.h>
+
+#include "ipath_kernel.h"
+
+#define IPATH_MAX_RDMA_ATOMIC 4
+
+#define QPN_MAX (1 << 24)
+#define QPNMAP_ENTRIES (QPN_MAX / PAGE_SIZE / BITS_PER_BYTE)
+
+/*
+ * Increment this value if any changes that break userspace ABI
+ * compatibility are made.
+ */
+#define IPATH_UVERBS_ABI_VERSION 2
+
+/*
+ * Define an ib_cq_notify value that is not valid so we know when CQ
+ * notifications are armed.
+ */
+#define IB_CQ_NONE (IB_CQ_NEXT_COMP + 1)
+
+/* AETH NAK opcode values */
+#define IB_RNR_NAK 0x20
+#define IB_NAK_PSN_ERROR 0x60
+#define IB_NAK_INVALID_REQUEST 0x61
+#define IB_NAK_REMOTE_ACCESS_ERROR 0x62
+#define IB_NAK_REMOTE_OPERATIONAL_ERROR 0x63
+#define IB_NAK_INVALID_RD_REQUEST 0x64
+
+/* Flags for checking QP state (see ib_ipath_state_ops[]) */
+#define IPATH_POST_SEND_OK 0x01
+#define IPATH_POST_RECV_OK 0x02
+#define IPATH_PROCESS_RECV_OK 0x04
+#define IPATH_PROCESS_SEND_OK 0x08
+#define IPATH_PROCESS_NEXT_SEND_OK 0x10
+#define IPATH_FLUSH_SEND 0x20
+#define IPATH_FLUSH_RECV 0x40
+#define IPATH_PROCESS_OR_FLUSH_SEND \
+ (IPATH_PROCESS_SEND_OK | IPATH_FLUSH_SEND)
+
+/* IB Performance Manager status values */
+#define IB_PMA_SAMPLE_STATUS_DONE 0x00
+#define IB_PMA_SAMPLE_STATUS_STARTED 0x01
+#define IB_PMA_SAMPLE_STATUS_RUNNING 0x02
+
+/* Mandatory IB performance counter select values.
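+ * (stored in big-endian form to match the encoding used in PMA MADs).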
*/ +#define IB_PMA_PORT_XMIT_DATA cpu_to_be16(0x0001) +#define IB_PMA_PORT_RCV_DATA cpu_to_be16(0x0002) +#define IB_PMA_PORT_XMIT_PKTS cpu_to_be16(0x0003) +#define IB_PMA_PORT_RCV_PKTS cpu_to_be16(0x0004) +#define IB_PMA_PORT_XMIT_WAIT cpu_to_be16(0x0005) + +struct ib_reth { + __be64 vaddr; + __be32 rkey; + __be32 length; +} __attribute__ ((packed)); + +struct ib_atomic_eth { + __be32 vaddr[2]; /* unaligned so access as 2 32-bit words */ + __be32 rkey; + __be64 swap_data; + __be64 compare_data; +} __attribute__ ((packed)); + +struct ipath_other_headers { + __be32 bth[3]; + union { + struct { + __be32 deth[2]; + __be32 imm_data; + } ud; + struct { + struct ib_reth reth; + __be32 imm_data; + } rc; + struct { + __be32 aeth; + __be32 atomic_ack_eth[2]; + } at; + __be32 imm_data; + __be32 aeth; + struct ib_atomic_eth atomic_eth; + } u; +} __attribute__ ((packed)); + +/* + * Note that UD packets with a GRH header are 8+40+12+8 = 68 bytes + * long (72 w/ imm_data). Only the first 56 bytes of the IB header + * will be in the eager header buffer. The remaining 12 or 16 bytes + * are in the data buffer. + */ +struct ipath_ib_header { + __be16 lrh[4]; + union { + struct { + struct ib_grh grh; + struct ipath_other_headers oth; + } l; + struct ipath_other_headers oth; + } u; +} __attribute__ ((packed)); + +struct ipath_pio_header { + __le32 pbc[2]; + struct ipath_ib_header hdr; +} __attribute__ ((packed)); + +/* + * There is one struct ipath_mcast for each multicast GID. + * All attached QPs are then stored as a list of + * struct ipath_mcast_qp. + */ +struct ipath_mcast_qp { + struct list_head list; + struct ipath_qp *qp; +}; + +struct ipath_mcast { + struct rb_node rb_node; + union ib_gid mgid; + struct list_head qp_list; + wait_queue_head_t wait; + atomic_t refcount; + int n_attached; +}; + +/* Protection domain */ +struct ipath_pd { + struct ib_pd ibpd; + int user; /* non-zero if created from user space */ +}; + +/* Address Handle */ +struct ipath_ah { + struct ib_ah ibah; + struct ib_ah_attr attr; +}; + +/* + * This structure is used by ipath_mmap() to validate an offset + * when an mmap() request is made. The vm_area_struct then uses + * this as its vm_private_data. + */ +struct ipath_mmap_info { + struct list_head pending_mmaps; + struct ib_ucontext *context; + void *obj; + __u64 offset; + struct kref ref; + unsigned size; +}; + +/* + * This structure is used to contain the head pointer, tail pointer, + * and completion queue entries as a single memory allocation so + * it can be mmap'ed into user space. + */ +struct ipath_cq_wc { + u32 head; /* index of next entry to fill */ + u32 tail; /* index of next ib_poll_cq() entry */ + union { + /* these are actually size ibcq.cqe + 1 */ + struct ib_uverbs_wc uqueue[0]; + struct ib_wc kqueue[0]; + }; +}; + +/* + * The completion queue structure. + */ +struct ipath_cq { + struct ib_cq ibcq; + struct tasklet_struct comptask; + spinlock_t lock; + u8 notify; + u8 triggered; + struct ipath_cq_wc *queue; + struct ipath_mmap_info *ip; +}; + +/* + * A segment is a linear region of low physical memory. + * XXX Maybe we should use phys addr here and kmap()/kunmap(). + * Used by the verbs layer. + */ +struct ipath_seg { + void *vaddr; + size_t length; +}; + +/* The number of ipath_segs that fit in a page. 
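+ * Each struct ipath_segarray below is sized so that it fits in a single
+ * page.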
*/ +#define IPATH_SEGSZ (PAGE_SIZE / sizeof (struct ipath_seg)) + +struct ipath_segarray { + struct ipath_seg segs[IPATH_SEGSZ]; +}; + +struct ipath_mregion { + struct ib_pd *pd; /* shares refcnt of ibmr.pd */ + u64 user_base; /* User's address for this region */ + u64 iova; /* IB start address of this region */ + size_t length; + u32 lkey; + u32 offset; /* offset (bytes) to start of region */ + int access_flags; + u32 max_segs; /* number of ipath_segs in all the arrays */ + u32 mapsz; /* size of the map array */ + struct ipath_segarray *map[0]; /* the segments */ +}; + +/* + * These keep track of the copy progress within a memory region. + * Used by the verbs layer. + */ +struct ipath_sge { + struct ipath_mregion *mr; + void *vaddr; /* kernel virtual address of segment */ + u32 sge_length; /* length of the SGE */ + u32 length; /* remaining length of the segment */ + u16 m; /* current index: mr->map[m] */ + u16 n; /* current index: mr->map[m]->segs[n] */ +}; + +/* Memory region */ +struct ipath_mr { + struct ib_mr ibmr; + struct ib_umem *umem; + struct ipath_mregion mr; /* must be last */ +}; + +/* + * Send work request queue entry. + * The size of the sg_list is determined when the QP is created and stored + * in qp->s_max_sge. + */ +struct ipath_swqe { + struct ib_send_wr wr; /* don't use wr.sg_list */ + u32 psn; /* first packet sequence number */ + u32 lpsn; /* last packet sequence number */ + u32 ssn; /* send sequence number */ + u32 length; /* total length of data in sg_list */ + struct ipath_sge sg_list[0]; +}; + +/* + * Receive work request queue entry. + * The size of the sg_list is determined when the QP (or SRQ) is created + * and stored in qp->r_rq.max_sge (or srq->rq.max_sge). + */ +struct ipath_rwqe { + u64 wr_id; + u8 num_sge; + struct ib_sge sg_list[0]; +}; + +/* + * This structure is used to contain the head pointer, tail pointer, + * and receive work queue entries as a single memory allocation so + * it can be mmap'ed into user space. + * Note that the wq array elements are variable size so you can't + * just index into the array to get the N'th element; + * use get_rwqe_ptr() instead. + */ +struct ipath_rwq { + u32 head; /* new work requests posted to the head */ + u32 tail; /* receives pull requests from here. */ + struct ipath_rwqe wq[0]; +}; + +struct ipath_rq { + struct ipath_rwq *wq; + spinlock_t lock; + u32 size; /* size of RWQE array */ + u8 max_sge; +}; + +struct ipath_srq { + struct ib_srq ibsrq; + struct ipath_rq rq; + struct ipath_mmap_info *ip; + /* send signal when number of RWQEs < limit */ + u32 limit; +}; + +struct ipath_sge_state { + struct ipath_sge *sg_list; /* next SGE to be used if any */ + struct ipath_sge sge; /* progress state for the current SGE */ + u8 num_sge; + u8 static_rate; +}; + +/* + * This structure holds the information that the send tasklet needs + * to send a RDMA read response or atomic operation. + */ +struct ipath_ack_entry { + u8 opcode; + u8 sent; + u32 psn; + union { + struct ipath_sge_state rdma_sge; + u64 atomic_data; + }; +}; + +/* + * Variables prefixed with s_ are for the requester (sender). + * Variables prefixed with r_ are for the responder (receiver). + * Variables prefixed with ack_ are for responder replies. + * + * Common variables are protected by both r_rq.lock and s_lock in that order + * which only happens in modify_qp() or changing the QP 'state'. 
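+ *
+ * A minimal sketch of what that ordering looks like at a call site
+ * (illustrative only, not a quote from the driver):
+ *
+ *	spin_lock_irqsave(&qp->r_rq.lock, flags);
+ *	spin_lock(&qp->s_lock);
+ *	... update responder and requester state together ...
+ *	spin_unlock(&qp->s_lock);
+ *	spin_unlock_irqrestore(&qp->r_rq.lock, flags);
+ *
+ * Taking the two locks in the opposite order anywhere else would set up
+ * an ABBA deadlock against this path.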
+ */ +struct ipath_qp { + struct ib_qp ibqp; + struct ipath_qp *next; /* link list for QPN hash table */ + struct ipath_qp *timer_next; /* link list for ipath_ib_timer() */ + struct ipath_qp *pio_next; /* link for ipath_ib_piobufavail() */ + struct list_head piowait; /* link for wait PIO buf */ + struct list_head timerwait; /* link for waiting for timeouts */ + struct ib_ah_attr remote_ah_attr; + struct ipath_ib_header s_hdr; /* next packet header to send */ + atomic_t refcount; + wait_queue_head_t wait; + wait_queue_head_t wait_dma; + struct tasklet_struct s_task; + struct ipath_mmap_info *ip; + struct ipath_sge_state *s_cur_sge; + struct ipath_verbs_txreq *s_tx; + struct ipath_sge_state s_sge; /* current send request data */ + struct ipath_ack_entry s_ack_queue[IPATH_MAX_RDMA_ATOMIC + 1]; + struct ipath_sge_state s_ack_rdma_sge; + struct ipath_sge_state s_rdma_read_sge; + struct ipath_sge_state r_sge; /* current receive data */ + spinlock_t s_lock; + atomic_t s_dma_busy; + u16 s_pkt_delay; + u16 s_hdrwords; /* size of s_hdr in 32 bit words */ + u32 s_cur_size; /* size of send packet in bytes */ + u32 s_len; /* total length of s_sge */ + u32 s_rdma_read_len; /* total length of s_rdma_read_sge */ + u32 s_next_psn; /* PSN for next request */ + u32 s_last_psn; /* last response PSN processed */ + u32 s_psn; /* current packet sequence number */ + u32 s_ack_rdma_psn; /* PSN for sending RDMA read responses */ + u32 s_ack_psn; /* PSN for acking sends and RDMA writes */ + u32 s_rnr_timeout; /* number of milliseconds for RNR timeout */ + u32 r_ack_psn; /* PSN for next ACK or atomic ACK */ + u64 r_wr_id; /* ID for current receive WQE */ + unsigned long r_aflags; + u32 r_len; /* total length of r_sge */ + u32 r_rcv_len; /* receive data len processed */ + u32 r_psn; /* expected rcv packet sequence number */ + u32 r_msn; /* message sequence number */ + u8 state; /* QP state */ + u8 s_state; /* opcode of last packet sent */ + u8 s_ack_state; /* opcode of packet to ACK */ + u8 s_nak_state; /* non-zero if NAK is pending */ + u8 r_state; /* opcode of last packet received */ + u8 r_nak_state; /* non-zero if NAK is pending */ + u8 r_min_rnr_timer; /* retry timeout value for RNR NAKs */ + u8 r_flags; + u8 r_max_rd_atomic; /* max number of RDMA read/atomic to receive */ + u8 r_head_ack_queue; /* index into s_ack_queue[] */ + u8 qp_access_flags; + u8 s_max_sge; /* size of s_wq->sg_list */ + u8 s_retry_cnt; /* number of times to retry */ + u8 s_rnr_retry_cnt; + u8 s_retry; /* requester retry counter */ + u8 s_rnr_retry; /* requester RNR retry counter */ + u8 s_pkey_index; /* PKEY index to use */ + u8 s_max_rd_atomic; /* max number of RDMA read/atomic to send */ + u8 s_num_rd_atomic; /* number of RDMA read/atomic pending */ + u8 s_tail_ack_queue; /* index into s_ack_queue[] */ + u8 s_flags; + u8 s_dmult; + u8 s_draining; + u8 timeout; /* Timeout for this QP */ + enum ib_mtu path_mtu; + u32 remote_qpn; + u32 qkey; /* QKEY for this QP (for UD or RD) */ + u32 s_size; /* send work queue size */ + u32 s_head; /* new entries added here */ + u32 s_tail; /* next entry to process */ + u32 s_cur; /* current work queue entry */ + u32 s_last; /* last un-ACK'ed entry */ + u32 s_ssn; /* SSN of tail entry */ + u32 s_lsn; /* limit sequence number (credit) */ + struct ipath_swqe *s_wq; /* send work queue */ + struct ipath_swqe *s_wqe; + struct ipath_sge *r_ud_sg_list; + struct ipath_rq r_rq; /* receive work queue */ + struct ipath_sge r_sg_list[0]; /* verified SGEs */ +}; + +/* + * Atomic bit definitions for r_aflags. 
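+ *
+ * These are bit numbers for the test_bit()/set_bit() family (r_aflags is
+ * an unsigned long), not masks like r_flags and s_flags below. A minimal
+ * illustrative use, assuming a local struct ib_wc named "wc" (a sketch,
+ * not lifted from the driver):
+ *
+ *	if (test_and_clear_bit(IPATH_R_WRID_VALID, &qp->r_aflags))
+ *		ipath_cq_enter(to_icq(qp->ibqp.recv_cq), &wc, 0);
+ *
+ * so the receive WQE recorded in qp->r_wr_id is completed at most once.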
+ */ +#define IPATH_R_WRID_VALID 0 + +/* + * Bit definitions for r_flags. + */ +#define IPATH_R_REUSE_SGE 0x01 +#define IPATH_R_RDMAR_SEQ 0x02 + +/* + * Bit definitions for s_flags. + * + * IPATH_S_FENCE_PENDING - waiting for all prior RDMA read or atomic SWQEs + * before processing the next SWQE + * IPATH_S_RDMAR_PENDING - waiting for any RDMA read or atomic SWQEs + * before processing the next SWQE + * IPATH_S_WAITING - waiting for RNR timeout or send buffer available. + * IPATH_S_WAIT_SSN_CREDIT - waiting for RC credits to process next SWQE + * IPATH_S_WAIT_DMA - waiting for send DMA queue to drain before generating + * next send completion entry not via send DMA. + */ +#define IPATH_S_SIGNAL_REQ_WR 0x01 +#define IPATH_S_FENCE_PENDING 0x02 +#define IPATH_S_RDMAR_PENDING 0x04 +#define IPATH_S_ACK_PENDING 0x08 +#define IPATH_S_BUSY 0x10 +#define IPATH_S_WAITING 0x20 +#define IPATH_S_WAIT_SSN_CREDIT 0x40 +#define IPATH_S_WAIT_DMA 0x80 + +#define IPATH_S_ANY_WAIT (IPATH_S_FENCE_PENDING | IPATH_S_RDMAR_PENDING | \ + IPATH_S_WAITING | IPATH_S_WAIT_SSN_CREDIT | IPATH_S_WAIT_DMA) + +#define IPATH_PSN_CREDIT 512 + +/* + * Since struct ipath_swqe is not a fixed size, we can't simply index into + * struct ipath_qp.s_wq. This function does the array index computation. + */ +static inline struct ipath_swqe *get_swqe_ptr(struct ipath_qp *qp, + unsigned n) +{ + return (struct ipath_swqe *)((char *)qp->s_wq + + (sizeof(struct ipath_swqe) + + qp->s_max_sge * + sizeof(struct ipath_sge)) * n); +} + +/* + * Since struct ipath_rwqe is not a fixed size, we can't simply index into + * struct ipath_rwq.wq. This function does the array index computation. + */ +static inline struct ipath_rwqe *get_rwqe_ptr(struct ipath_rq *rq, + unsigned n) +{ + return (struct ipath_rwqe *) + ((char *) rq->wq->wq + + (sizeof(struct ipath_rwqe) + + rq->max_sge * sizeof(struct ib_sge)) * n); +} + +/* + * QPN-map pages start out as NULL, they get allocated upon + * first use and are never deallocated. This way, + * large bitmaps are not allocated unless large numbers of QPs are used. + */ +struct qpn_map { + atomic_t n_free; + void *page; +}; + +struct ipath_qp_table { + spinlock_t lock; + u32 last; /* last QP number allocated */ + u32 max; /* size of the hash table */ + u32 nmaps; /* size of the map table */ + struct ipath_qp **table; + /* bit map of free numbers */ + struct qpn_map map[QPNMAP_ENTRIES]; +}; + +struct ipath_lkey_table { + spinlock_t lock; + u32 next; /* next unused index (speeds search) */ + u32 gen; /* generation count */ + u32 max; /* size of the table */ + struct ipath_mregion **table; +}; + +struct ipath_opcode_stats { + u64 n_packets; /* number of packets */ + u64 n_bytes; /* total number of bytes */ +}; + +struct ipath_ibdev { + struct ib_device ibdev; + struct ipath_devdata *dd; + struct list_head pending_mmaps; + spinlock_t mmap_offset_lock; + u32 mmap_offset; + int ib_unit; /* This is the device number */ + u16 sm_lid; /* in host order */ + u8 sm_sl; + u8 mkeyprot; + /* non-zero when timer is set */ + unsigned long mkey_lease_timeout; + + /* The following fields are really per port. 
*/ + struct ipath_qp_table qp_table; + struct ipath_lkey_table lk_table; + struct list_head pending[3]; /* FIFO of QPs waiting for ACKs */ + struct list_head piowait; /* list for wait PIO buf */ + struct list_head txreq_free; + void *txreq_bufs; + /* list of QPs waiting for RNR timer */ + struct list_head rnrwait; + spinlock_t pending_lock; + __be64 sys_image_guid; /* in network order */ + __be64 gid_prefix; /* in network order */ + __be64 mkey; + + u32 n_pds_allocated; /* number of PDs allocated for device */ + spinlock_t n_pds_lock; + u32 n_ahs_allocated; /* number of AHs allocated for device */ + spinlock_t n_ahs_lock; + u32 n_cqs_allocated; /* number of CQs allocated for device */ + spinlock_t n_cqs_lock; + u32 n_qps_allocated; /* number of QPs allocated for device */ + spinlock_t n_qps_lock; + u32 n_srqs_allocated; /* number of SRQs allocated for device */ + spinlock_t n_srqs_lock; + u32 n_mcast_grps_allocated; /* number of mcast groups allocated */ + spinlock_t n_mcast_grps_lock; + + u64 ipath_sword; /* total dwords sent (sample result) */ + u64 ipath_rword; /* total dwords received (sample result) */ + u64 ipath_spkts; /* total packets sent (sample result) */ + u64 ipath_rpkts; /* total packets received (sample result) */ + /* # of ticks no data sent (sample result) */ + u64 ipath_xmit_wait; + u64 rcv_errors; /* # of packets with SW detected rcv errs */ + u64 n_unicast_xmit; /* total unicast packets sent */ + u64 n_unicast_rcv; /* total unicast packets received */ + u64 n_multicast_xmit; /* total multicast packets sent */ + u64 n_multicast_rcv; /* total multicast packets received */ + u64 z_symbol_error_counter; /* starting count for PMA */ + u64 z_link_error_recovery_counter; /* starting count for PMA */ + u64 z_link_downed_counter; /* starting count for PMA */ + u64 z_port_rcv_errors; /* starting count for PMA */ + u64 z_port_rcv_remphys_errors; /* starting count for PMA */ + u64 z_port_xmit_discards; /* starting count for PMA */ + u64 z_port_xmit_data; /* starting count for PMA */ + u64 z_port_rcv_data; /* starting count for PMA */ + u64 z_port_xmit_packets; /* starting count for PMA */ + u64 z_port_rcv_packets; /* starting count for PMA */ + u32 z_pkey_violations; /* starting count for PMA */ + u32 z_local_link_integrity_errors; /* starting count for PMA */ + u32 z_excessive_buffer_overrun_errors; /* starting count for PMA */ + u32 z_vl15_dropped; /* starting count for PMA */ + u32 n_rc_resends; + u32 n_rc_acks; + u32 n_rc_qacks; + u32 n_seq_naks; + u32 n_rdma_seq; + u32 n_rnr_naks; + u32 n_other_naks; + u32 n_timeouts; + u32 n_pkt_drops; + u32 n_vl15_dropped; + u32 n_wqe_errs; + u32 n_rdma_dup_busy; + u32 n_piowait; + u32 n_unaligned; + u32 port_cap_flags; + u32 pma_sample_start; + u32 pma_sample_interval; + __be16 pma_counter_select[5]; + u16 pma_tag; + u16 qkey_violations; + u16 mkey_violations; + u16 mkey_lease_period; + u16 pending_index; /* which pending queue is active */ + u8 pma_sample_status; + u8 subnet_timeout; + u8 vl_high_limit; + struct ipath_opcode_stats opstats[128]; +}; + +struct ipath_verbs_counters { + u64 symbol_error_counter; + u64 link_error_recovery_counter; + u64 link_downed_counter; + u64 port_rcv_errors; + u64 port_rcv_remphys_errors; + u64 port_xmit_discards; + u64 port_xmit_data; + u64 port_rcv_data; + u64 port_xmit_packets; + u64 port_rcv_packets; + u32 local_link_integrity_errors; + u32 excessive_buffer_overrun_errors; + u32 vl15_dropped; +}; + +struct ipath_verbs_txreq { + struct ipath_qp *qp; + struct ipath_swqe *wqe; + u32 map_len; + u32 len; + 
struct ipath_sge_state *ss; + struct ipath_pio_header hdr; + struct ipath_sdma_txreq txreq; +}; + +static inline struct ipath_mr *to_imr(struct ib_mr *ibmr) +{ + return container_of(ibmr, struct ipath_mr, ibmr); +} + +static inline struct ipath_pd *to_ipd(struct ib_pd *ibpd) +{ + return container_of(ibpd, struct ipath_pd, ibpd); +} + +static inline struct ipath_ah *to_iah(struct ib_ah *ibah) +{ + return container_of(ibah, struct ipath_ah, ibah); +} + +static inline struct ipath_cq *to_icq(struct ib_cq *ibcq) +{ + return container_of(ibcq, struct ipath_cq, ibcq); +} + +static inline struct ipath_srq *to_isrq(struct ib_srq *ibsrq) +{ + return container_of(ibsrq, struct ipath_srq, ibsrq); +} + +static inline struct ipath_qp *to_iqp(struct ib_qp *ibqp) +{ + return container_of(ibqp, struct ipath_qp, ibqp); +} + +static inline struct ipath_ibdev *to_idev(struct ib_device *ibdev) +{ + return container_of(ibdev, struct ipath_ibdev, ibdev); +} + +/* + * This must be called with s_lock held. + */ +static inline void ipath_schedule_send(struct ipath_qp *qp) +{ + if (qp->s_flags & IPATH_S_ANY_WAIT) + qp->s_flags &= ~IPATH_S_ANY_WAIT; + if (!(qp->s_flags & IPATH_S_BUSY)) + tasklet_hi_schedule(&qp->s_task); +} + +int ipath_process_mad(struct ib_device *ibdev, + int mad_flags, + u8 port_num, + struct ib_wc *in_wc, + struct ib_grh *in_grh, + struct ib_mad *in_mad, struct ib_mad *out_mad); + +/* + * Compare the lower 24 bits of the two values. + * Returns an integer <, ==, or > than zero. + */ +static inline int ipath_cmp24(u32 a, u32 b) +{ + return (((int) a) - ((int) b)) << 8; +} + +struct ipath_mcast *ipath_mcast_find(union ib_gid *mgid); + +int ipath_snapshot_counters(struct ipath_devdata *dd, u64 *swords, + u64 *rwords, u64 *spkts, u64 *rpkts, + u64 *xmit_wait); + +int ipath_get_counters(struct ipath_devdata *dd, + struct ipath_verbs_counters *cntrs); + +int ipath_multicast_attach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid); + +int ipath_multicast_detach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid); + +int ipath_mcast_tree_empty(void); + +__be32 ipath_compute_aeth(struct ipath_qp *qp); + +struct ipath_qp *ipath_lookup_qpn(struct ipath_qp_table *qpt, u32 qpn); + +struct ib_qp *ipath_create_qp(struct ib_pd *ibpd, + struct ib_qp_init_attr *init_attr, + struct ib_udata *udata); + +int ipath_destroy_qp(struct ib_qp *ibqp); + +int ipath_error_qp(struct ipath_qp *qp, enum ib_wc_status err); + +int ipath_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, + int attr_mask, struct ib_udata *udata); + +int ipath_query_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, + int attr_mask, struct ib_qp_init_attr *init_attr); + +unsigned ipath_free_all_qps(struct ipath_qp_table *qpt); + +int ipath_init_qp_table(struct ipath_ibdev *idev, int size); + +void ipath_get_credit(struct ipath_qp *qp, u32 aeth); + +unsigned ipath_ib_rate_to_mult(enum ib_rate rate); + +int ipath_verbs_send(struct ipath_qp *qp, struct ipath_ib_header *hdr, + u32 hdrwords, struct ipath_sge_state *ss, u32 len); + +void ipath_copy_sge(struct ipath_sge_state *ss, void *data, u32 length); + +void ipath_skip_sge(struct ipath_sge_state *ss, u32 length); + +void ipath_uc_rcv(struct ipath_ibdev *dev, struct ipath_ib_header *hdr, + int has_grh, void *data, u32 tlen, struct ipath_qp *qp); + +void ipath_rc_rcv(struct ipath_ibdev *dev, struct ipath_ib_header *hdr, + int has_grh, void *data, u32 tlen, struct ipath_qp *qp); + +void ipath_restart_rc(struct ipath_qp *qp, u32 psn); + +void ipath_rc_error(struct ipath_qp *qp, enum ib_wc_status err); + +int 
ipath_post_ud_send(struct ipath_qp *qp, struct ib_send_wr *wr); + +void ipath_ud_rcv(struct ipath_ibdev *dev, struct ipath_ib_header *hdr, + int has_grh, void *data, u32 tlen, struct ipath_qp *qp); + +int ipath_alloc_lkey(struct ipath_lkey_table *rkt, + struct ipath_mregion *mr); + +void ipath_free_lkey(struct ipath_lkey_table *rkt, u32 lkey); + +int ipath_lkey_ok(struct ipath_qp *qp, struct ipath_sge *isge, + struct ib_sge *sge, int acc); + +int ipath_rkey_ok(struct ipath_qp *qp, struct ipath_sge_state *ss, + u32 len, u64 vaddr, u32 rkey, int acc); + +int ipath_post_srq_receive(struct ib_srq *ibsrq, struct ib_recv_wr *wr, + struct ib_recv_wr **bad_wr); + +struct ib_srq *ipath_create_srq(struct ib_pd *ibpd, + struct ib_srq_init_attr *srq_init_attr, + struct ib_udata *udata); + +int ipath_modify_srq(struct ib_srq *ibsrq, struct ib_srq_attr *attr, + enum ib_srq_attr_mask attr_mask, + struct ib_udata *udata); + +int ipath_query_srq(struct ib_srq *ibsrq, struct ib_srq_attr *attr); + +int ipath_destroy_srq(struct ib_srq *ibsrq); + +void ipath_cq_enter(struct ipath_cq *cq, struct ib_wc *entry, int sig); + +int ipath_poll_cq(struct ib_cq *ibcq, int num_entries, struct ib_wc *entry); + +struct ib_cq *ipath_create_cq(struct ib_device *ibdev, int entries, int comp_vector, + struct ib_ucontext *context, + struct ib_udata *udata); + +int ipath_destroy_cq(struct ib_cq *ibcq); + +int ipath_req_notify_cq(struct ib_cq *ibcq, enum ib_cq_notify_flags notify_flags); + +int ipath_resize_cq(struct ib_cq *ibcq, int cqe, struct ib_udata *udata); + +struct ib_mr *ipath_get_dma_mr(struct ib_pd *pd, int acc); + +struct ib_mr *ipath_reg_phys_mr(struct ib_pd *pd, + struct ib_phys_buf *buffer_list, + int num_phys_buf, int acc, u64 *iova_start); + +struct ib_mr *ipath_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, + u64 virt_addr, int mr_access_flags, + struct ib_udata *udata); + +int ipath_dereg_mr(struct ib_mr *ibmr); + +struct ib_fmr *ipath_alloc_fmr(struct ib_pd *pd, int mr_access_flags, + struct ib_fmr_attr *fmr_attr); + +int ipath_map_phys_fmr(struct ib_fmr *ibfmr, u64 * page_list, + int list_len, u64 iova); + +int ipath_unmap_fmr(struct list_head *fmr_list); + +int ipath_dealloc_fmr(struct ib_fmr *ibfmr); + +void ipath_release_mmap_info(struct kref *ref); + +struct ipath_mmap_info *ipath_create_mmap_info(struct ipath_ibdev *dev, + u32 size, + struct ib_ucontext *context, + void *obj); + +void ipath_update_mmap_info(struct ipath_ibdev *dev, + struct ipath_mmap_info *ip, + u32 size, void *obj); + +int ipath_mmap(struct ib_ucontext *context, struct vm_area_struct *vma); + +void ipath_insert_rnr_queue(struct ipath_qp *qp); + +int ipath_init_sge(struct ipath_qp *qp, struct ipath_rwqe *wqe, + u32 *lengthp, struct ipath_sge_state *ss); + +int ipath_get_rwqe(struct ipath_qp *qp, int wr_id_only); + +u32 ipath_make_grh(struct ipath_ibdev *dev, struct ib_grh *hdr, + struct ib_global_route *grh, u32 hwords, u32 nwords); + +void ipath_make_ruc_header(struct ipath_ibdev *dev, struct ipath_qp *qp, + struct ipath_other_headers *ohdr, + u32 bth0, u32 bth2); + +void ipath_do_send(unsigned long data); + +void ipath_send_complete(struct ipath_qp *qp, struct ipath_swqe *wqe, + enum ib_wc_status status); + +int ipath_make_rc_req(struct ipath_qp *qp); + +int ipath_make_uc_req(struct ipath_qp *qp); + +int ipath_make_ud_req(struct ipath_qp *qp); + +int ipath_register_ib_device(struct ipath_devdata *); + +void ipath_unregister_ib_device(struct ipath_ibdev *); + +void ipath_ib_rcv(struct ipath_ibdev *, void *, void *, u32); + +int 
ipath_ib_piobufavail(struct ipath_ibdev *); + +unsigned ipath_get_npkeys(struct ipath_devdata *); + +u32 ipath_get_cr_errpkey(struct ipath_devdata *); + +unsigned ipath_get_pkey(struct ipath_devdata *, unsigned); + +extern const enum ib_wc_opcode ib_ipath_wc_opcode[]; + +/* + * Below converts HCA-specific LinkTrainingState to IB PhysPortState + * values. + */ +extern const u8 ipath_cvt_physportstate[]; +#define IB_PHYSPORTSTATE_SLEEP 1 +#define IB_PHYSPORTSTATE_POLL 2 +#define IB_PHYSPORTSTATE_DISABLED 3 +#define IB_PHYSPORTSTATE_CFG_TRAIN 4 +#define IB_PHYSPORTSTATE_LINKUP 5 +#define IB_PHYSPORTSTATE_LINK_ERR_RECOVER 6 + +extern const int ib_ipath_state_ops[]; + +extern unsigned int ib_ipath_lkey_table_size; + +extern unsigned int ib_ipath_max_cqes; + +extern unsigned int ib_ipath_max_cqs; + +extern unsigned int ib_ipath_max_qp_wrs; + +extern unsigned int ib_ipath_max_qps; + +extern unsigned int ib_ipath_max_sges; + +extern unsigned int ib_ipath_max_mcast_grps; + +extern unsigned int ib_ipath_max_mcast_qp_attached; + +extern unsigned int ib_ipath_max_srqs; + +extern unsigned int ib_ipath_max_srq_sges; + +extern unsigned int ib_ipath_max_srq_wrs; + +extern const u32 ib_ipath_rnr_table[]; + +extern struct ib_dma_mapping_ops ipath_dma_mapping_ops; + +#endif /* IPATH_VERBS_H */ diff --git a/kernel/drivers/infiniband/hw/ipath/ipath_verbs_mcast.c b/kernel/drivers/infiniband/hw/ipath/ipath_verbs_mcast.c new file mode 100644 index 000000000..6216ea923 --- /dev/null +++ b/kernel/drivers/infiniband/hw/ipath/ipath_verbs_mcast.c @@ -0,0 +1,364 @@ +/* + * Copyright (c) 2006, 2007 QLogic Corporation. All rights reserved. + * Copyright (c) 2005, 2006 PathScale, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include +#include +#include + +#include "ipath_verbs.h" + +/* + * Global table of GID to attached QPs. + * The table is global to all ipath devices since a send from one QP/device + * needs to be locally routed to any locally attached QPs on the same + * or different device. 
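+ *
+ * Lookups return with a reference held on the ipath_mcast, so a receive
+ * path is expected to do something along these lines (an illustrative
+ * sketch only, with the delivery step elided and "hdr" assumed to be an
+ * ipath_ib_header carrying a GRH; this is not lifted from ipath_verbs.c):
+ *
+ *	mcast = ipath_mcast_find(&hdr->u.l.grh.dgid);
+ *	if (mcast != NULL) {
+ *		list_for_each_entry_rcu(p, &mcast->qp_list, list)
+ *			... hand the packet to p->qp ...
+ *		if (atomic_dec_return(&mcast->refcount) <= 1)
+ *			wake_up(&mcast->wait);
+ *	}
+ *
+ * The wake_up() pairs with the wait_event() calls in
+ * ipath_multicast_detach().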
+ */ +static struct rb_root mcast_tree; +static DEFINE_SPINLOCK(mcast_lock); + +/** + * ipath_mcast_qp_alloc - alloc a struct to link a QP to mcast GID struct + * @qp: the QP to link + */ +static struct ipath_mcast_qp *ipath_mcast_qp_alloc(struct ipath_qp *qp) +{ + struct ipath_mcast_qp *mqp; + + mqp = kmalloc(sizeof *mqp, GFP_KERNEL); + if (!mqp) + goto bail; + + mqp->qp = qp; + atomic_inc(&qp->refcount); + +bail: + return mqp; +} + +static void ipath_mcast_qp_free(struct ipath_mcast_qp *mqp) +{ + struct ipath_qp *qp = mqp->qp; + + /* Notify ipath_destroy_qp() if it is waiting. */ + if (atomic_dec_and_test(&qp->refcount)) + wake_up(&qp->wait); + + kfree(mqp); +} + +/** + * ipath_mcast_alloc - allocate the multicast GID structure + * @mgid: the multicast GID + * + * A list of QPs will be attached to this structure. + */ +static struct ipath_mcast *ipath_mcast_alloc(union ib_gid *mgid) +{ + struct ipath_mcast *mcast; + + mcast = kmalloc(sizeof *mcast, GFP_KERNEL); + if (!mcast) + goto bail; + + mcast->mgid = *mgid; + INIT_LIST_HEAD(&mcast->qp_list); + init_waitqueue_head(&mcast->wait); + atomic_set(&mcast->refcount, 0); + mcast->n_attached = 0; + +bail: + return mcast; +} + +static void ipath_mcast_free(struct ipath_mcast *mcast) +{ + struct ipath_mcast_qp *p, *tmp; + + list_for_each_entry_safe(p, tmp, &mcast->qp_list, list) + ipath_mcast_qp_free(p); + + kfree(mcast); +} + +/** + * ipath_mcast_find - search the global table for the given multicast GID + * @mgid: the multicast GID to search for + * + * Returns NULL if not found. + * + * The caller is responsible for decrementing the reference count if found. + */ +struct ipath_mcast *ipath_mcast_find(union ib_gid *mgid) +{ + struct rb_node *n; + unsigned long flags; + struct ipath_mcast *mcast; + + spin_lock_irqsave(&mcast_lock, flags); + n = mcast_tree.rb_node; + while (n) { + int ret; + + mcast = rb_entry(n, struct ipath_mcast, rb_node); + + ret = memcmp(mgid->raw, mcast->mgid.raw, + sizeof(union ib_gid)); + if (ret < 0) + n = n->rb_left; + else if (ret > 0) + n = n->rb_right; + else { + atomic_inc(&mcast->refcount); + spin_unlock_irqrestore(&mcast_lock, flags); + goto bail; + } + } + spin_unlock_irqrestore(&mcast_lock, flags); + + mcast = NULL; + +bail: + return mcast; +} + +/** + * ipath_mcast_add - insert mcast GID into table and attach QP struct + * @mcast: the mcast GID table + * @mqp: the QP to attach + * + * Return zero if both were added. Return EEXIST if the GID was already in + * the table but the QP was added. Return ESRCH if the QP was already + * attached and neither structure was added. + */ +static int ipath_mcast_add(struct ipath_ibdev *dev, + struct ipath_mcast *mcast, + struct ipath_mcast_qp *mqp) +{ + struct rb_node **n = &mcast_tree.rb_node; + struct rb_node *pn = NULL; + int ret; + + spin_lock_irq(&mcast_lock); + + while (*n) { + struct ipath_mcast *tmcast; + struct ipath_mcast_qp *p; + + pn = *n; + tmcast = rb_entry(pn, struct ipath_mcast, rb_node); + + ret = memcmp(mcast->mgid.raw, tmcast->mgid.raw, + sizeof(union ib_gid)); + if (ret < 0) { + n = &pn->rb_left; + continue; + } + if (ret > 0) { + n = &pn->rb_right; + continue; + } + + /* Search the QP list to see if this is already there. 
*/ + list_for_each_entry_rcu(p, &tmcast->qp_list, list) { + if (p->qp == mqp->qp) { + ret = ESRCH; + goto bail; + } + } + if (tmcast->n_attached == ib_ipath_max_mcast_qp_attached) { + ret = ENOMEM; + goto bail; + } + + tmcast->n_attached++; + + list_add_tail_rcu(&mqp->list, &tmcast->qp_list); + ret = EEXIST; + goto bail; + } + + spin_lock(&dev->n_mcast_grps_lock); + if (dev->n_mcast_grps_allocated == ib_ipath_max_mcast_grps) { + spin_unlock(&dev->n_mcast_grps_lock); + ret = ENOMEM; + goto bail; + } + + dev->n_mcast_grps_allocated++; + spin_unlock(&dev->n_mcast_grps_lock); + + mcast->n_attached++; + + list_add_tail_rcu(&mqp->list, &mcast->qp_list); + + atomic_inc(&mcast->refcount); + rb_link_node(&mcast->rb_node, pn, n); + rb_insert_color(&mcast->rb_node, &mcast_tree); + + ret = 0; + +bail: + spin_unlock_irq(&mcast_lock); + + return ret; +} + +int ipath_multicast_attach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid) +{ + struct ipath_qp *qp = to_iqp(ibqp); + struct ipath_ibdev *dev = to_idev(ibqp->device); + struct ipath_mcast *mcast; + struct ipath_mcast_qp *mqp; + int ret; + + /* + * Allocate data structures since its better to do this outside of + * spin locks and it will most likely be needed. + */ + mcast = ipath_mcast_alloc(gid); + if (mcast == NULL) { + ret = -ENOMEM; + goto bail; + } + mqp = ipath_mcast_qp_alloc(qp); + if (mqp == NULL) { + ipath_mcast_free(mcast); + ret = -ENOMEM; + goto bail; + } + switch (ipath_mcast_add(dev, mcast, mqp)) { + case ESRCH: + /* Neither was used: can't attach the same QP twice. */ + ipath_mcast_qp_free(mqp); + ipath_mcast_free(mcast); + ret = -EINVAL; + goto bail; + case EEXIST: /* The mcast wasn't used */ + ipath_mcast_free(mcast); + break; + case ENOMEM: + /* Exceeded the maximum number of mcast groups. */ + ipath_mcast_qp_free(mqp); + ipath_mcast_free(mcast); + ret = -ENOMEM; + goto bail; + default: + break; + } + + ret = 0; + +bail: + return ret; +} + +int ipath_multicast_detach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid) +{ + struct ipath_qp *qp = to_iqp(ibqp); + struct ipath_ibdev *dev = to_idev(ibqp->device); + struct ipath_mcast *mcast = NULL; + struct ipath_mcast_qp *p, *tmp; + struct rb_node *n; + int last = 0; + int ret; + + spin_lock_irq(&mcast_lock); + + /* Find the GID in the mcast table. */ + n = mcast_tree.rb_node; + while (1) { + if (n == NULL) { + spin_unlock_irq(&mcast_lock); + ret = -EINVAL; + goto bail; + } + + mcast = rb_entry(n, struct ipath_mcast, rb_node); + ret = memcmp(gid->raw, mcast->mgid.raw, + sizeof(union ib_gid)); + if (ret < 0) + n = n->rb_left; + else if (ret > 0) + n = n->rb_right; + else + break; + } + + /* Search the QP list. */ + list_for_each_entry_safe(p, tmp, &mcast->qp_list, list) { + if (p->qp != qp) + continue; + /* + * We found it, so remove it, but don't poison the forward + * link until we are sure there are no list walkers. + */ + list_del_rcu(&p->list); + mcast->n_attached--; + + /* If this was the last attached QP, remove the GID too. */ + if (list_empty(&mcast->qp_list)) { + rb_erase(&mcast->rb_node, &mcast_tree); + last = 1; + } + break; + } + + spin_unlock_irq(&mcast_lock); + + if (p) { + /* + * Wait for any list walkers to finish before freeing the + * list element. 
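+ * The reference counting is what makes this safe: ipath_mcast_add()
+ * takes one reference when it links the GID into the tree, and every
+ * successful ipath_mcast_find() takes another, so waiting for the
+ * count to drop to one (or to zero once the tree's own reference has
+ * been dropped, in the "last" case below) guarantees that no reader
+ * still holds a pointer into the entry being freed.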
+ */ + wait_event(mcast->wait, atomic_read(&mcast->refcount) <= 1); + ipath_mcast_qp_free(p); + } + if (last) { + atomic_dec(&mcast->refcount); + wait_event(mcast->wait, !atomic_read(&mcast->refcount)); + ipath_mcast_free(mcast); + spin_lock_irq(&dev->n_mcast_grps_lock); + dev->n_mcast_grps_allocated--; + spin_unlock_irq(&dev->n_mcast_grps_lock); + } + + ret = 0; + +bail: + return ret; +} + +int ipath_mcast_tree_empty(void) +{ + return mcast_tree.rb_node == NULL; +} diff --git a/kernel/drivers/infiniband/hw/ipath/ipath_wc_ppc64.c b/kernel/drivers/infiniband/hw/ipath/ipath_wc_ppc64.c new file mode 100644 index 000000000..1a7e20a75 --- /dev/null +++ b/kernel/drivers/infiniband/hw/ipath/ipath_wc_ppc64.c @@ -0,0 +1,49 @@ +/* + * Copyright (c) 2006, 2007 QLogic Corporation. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +/* + * This file is conditionally built on PowerPC only. Otherwise weak symbol + * versions of the functions exported from here are used. + */ + +#include "ipath_kernel.h" + +/** + * ipath_enable_wc - enable write combining for MMIO writes to the device + * @dd: infinipath device + * + * Nothing to do on PowerPC, so just return without error. + */ +int ipath_enable_wc(struct ipath_devdata *dd) +{ + return 0; +} diff --git a/kernel/drivers/infiniband/hw/ipath/ipath_wc_x86_64.c b/kernel/drivers/infiniband/hw/ipath/ipath_wc_x86_64.c new file mode 100644 index 000000000..4ad0b932d --- /dev/null +++ b/kernel/drivers/infiniband/hw/ipath/ipath_wc_x86_64.c @@ -0,0 +1,169 @@ +/* + * Copyright (c) 2006, 2007 QLogic Corporation. All rights reserved. + * Copyright (c) 2003, 2004, 2005, 2006 PathScale, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. 
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +/* + * This file is conditionally built on x86_64 only. Otherwise weak symbol + * versions of the functions exported from here are used. + */ + +#include +#include +#include + +#include "ipath_kernel.h" + +/** + * ipath_enable_wc - enable write combining for MMIO writes to the device + * @dd: infinipath device + * + * This routine is x86_64-specific; it twiddles the CPU's MTRRs to enable + * write combining. + */ +int ipath_enable_wc(struct ipath_devdata *dd) +{ + int ret = 0; + u64 pioaddr, piolen; + unsigned bits; + const unsigned long addr = pci_resource_start(dd->pcidev, 0); + const size_t len = pci_resource_len(dd->pcidev, 0); + + /* + * Set the PIO buffers to be WCCOMB, so we get HT bursts to the + * chip. Linux (possibly the hardware) requires it to be on a power + * of 2 address matching the length (which has to be a power of 2). + * For rev1, that means the base address, for rev2, it will be just + * the PIO buffers themselves. + * For chips with two sets of buffers, the calculations are + * somewhat more complicated; we need to sum, and the piobufbase + * register has both offsets, 2K in low 32 bits, 4K in high 32 bits. + * The buffers are still packed, so a single range covers both. 
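+ *
+ * Rough numeric illustration of the fixups below (made-up values, only
+ * to show the rounding performed): a combined PIO span of 0x2e000 bytes
+ * is not a power of two, so it is rounded up to 0x40000; if the base
+ * then fails to sit on a 0x40000 boundary, the code tries to align the
+ * base downwards (doubling the length) so that mtrr_add(), which only
+ * accepts power-of-two sizes on matching boundaries, will take the
+ * range.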
+ */ + if (dd->ipath_piobcnt2k && dd->ipath_piobcnt4k) { /* 2 sizes */ + unsigned long pio2kbase, pio4kbase; + pio2kbase = dd->ipath_piobufbase & 0xffffffffUL; + pio4kbase = (dd->ipath_piobufbase >> 32) & 0xffffffffUL; + if (pio2kbase < pio4kbase) { /* all, for now */ + pioaddr = addr + pio2kbase; + piolen = pio4kbase - pio2kbase + + dd->ipath_piobcnt4k * dd->ipath_4kalign; + } else { + pioaddr = addr + pio4kbase; + piolen = pio2kbase - pio4kbase + + dd->ipath_piobcnt2k * dd->ipath_palign; + } + } else { /* single buffer size (2K, currently) */ + pioaddr = addr + dd->ipath_piobufbase; + piolen = dd->ipath_piobcnt2k * dd->ipath_palign + + dd->ipath_piobcnt4k * dd->ipath_4kalign; + } + + for (bits = 0; !(piolen & (1ULL << bits)); bits++) + /* do nothing */ ; + + if (piolen != (1ULL << bits)) { + piolen >>= bits; + while (piolen >>= 1) + bits++; + piolen = 1ULL << (bits + 1); + } + if (pioaddr & (piolen - 1)) { + u64 atmp; + ipath_dbg("pioaddr %llx not on right boundary for size " + "%llx, fixing\n", + (unsigned long long) pioaddr, + (unsigned long long) piolen); + atmp = pioaddr & ~(piolen - 1); + if (atmp < addr || (atmp + piolen) > (addr + len)) { + ipath_dev_err(dd, "No way to align address/size " + "(%llx/%llx), no WC mtrr\n", + (unsigned long long) atmp, + (unsigned long long) piolen << 1); + ret = -ENODEV; + } else { + ipath_dbg("changing WC base from %llx to %llx, " + "len from %llx to %llx\n", + (unsigned long long) pioaddr, + (unsigned long long) atmp, + (unsigned long long) piolen, + (unsigned long long) piolen << 1); + pioaddr = atmp; + piolen <<= 1; + } + } + + if (!ret) { + int cookie; + ipath_cdbg(VERBOSE, "Setting mtrr for chip to WC " + "(addr %llx, len=0x%llx)\n", + (unsigned long long) pioaddr, + (unsigned long long) piolen); + cookie = mtrr_add(pioaddr, piolen, MTRR_TYPE_WRCOMB, 0); + if (cookie < 0) { + { + dev_info(&dd->pcidev->dev, + "mtrr_add() WC for PIO bufs " + "failed (%d)\n", + cookie); + ret = -EINVAL; + } + } else { + ipath_cdbg(VERBOSE, "Set mtrr for chip to WC, " + "cookie is %d\n", cookie); + dd->ipath_wc_cookie = cookie; + dd->ipath_wc_base = (unsigned long) pioaddr; + dd->ipath_wc_len = (unsigned long) piolen; + } + } + + return ret; +} + +/** + * ipath_disable_wc - disable write combining for MMIO writes to the device + * @dd: infinipath device + */ +void ipath_disable_wc(struct ipath_devdata *dd) +{ + if (dd->ipath_wc_cookie) { + int r; + ipath_cdbg(VERBOSE, "undoing WCCOMB on pio buffers\n"); + r = mtrr_del(dd->ipath_wc_cookie, dd->ipath_wc_base, + dd->ipath_wc_len); + if (r < 0) + dev_info(&dd->pcidev->dev, + "mtrr_del(%lx, %lx, %lx) failed: %d\n", + dd->ipath_wc_cookie, dd->ipath_wc_base, + dd->ipath_wc_len, r); + dd->ipath_wc_cookie = 0; /* even on failure */ + } +} diff --git a/kernel/drivers/infiniband/hw/mlx4/Kconfig b/kernel/drivers/infiniband/hw/mlx4/Kconfig new file mode 100644 index 000000000..fc01deac1 --- /dev/null +++ b/kernel/drivers/infiniband/hw/mlx4/Kconfig @@ -0,0 +1,10 @@ +config MLX4_INFINIBAND + tristate "Mellanox ConnectX HCA support" + depends on NETDEVICES && ETHERNET && PCI && INET + select NET_VENDOR_MELLANOX + select MLX4_CORE + ---help--- + This driver provides low-level InfiniBand support for + Mellanox ConnectX PCI Express host channel adapters (HCAs). + This is required to use InfiniBand protocols such as + IP-over-IB or SRP with these devices. 
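(A brief, hedged aside on how a tristate option like this is consumed, not part
of the patch itself: code normally tests such a symbol with the standard
IS_ENABLED() helper, which is true for both =y and =m builds. The helper
function name below is hypothetical.)

	#include <linux/kconfig.h>
	#include <linux/printk.h>

	static void report_mlx4_ib_support(void)
	{
		/* True whether mlx4_ib is built in or configured as a module. */
		if (IS_ENABLED(CONFIG_MLX4_INFINIBAND))
			pr_info("mlx4_ib support is enabled in this kernel config\n");
	}

Because the option selects MLX4_CORE and NET_VENDOR_MELLANOX, enabling it pulls
in the underlying ConnectX core driver automatically.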
diff --git a/kernel/drivers/infiniband/hw/mlx4/Makefile b/kernel/drivers/infiniband/hw/mlx4/Makefile new file mode 100644 index 000000000..f4213b3a8 --- /dev/null +++ b/kernel/drivers/infiniband/hw/mlx4/Makefile @@ -0,0 +1,3 @@ +obj-$(CONFIG_MLX4_INFINIBAND) += mlx4_ib.o + +mlx4_ib-y := ah.o cq.o doorbell.o mad.o main.o mr.o qp.o srq.o mcg.o cm.o alias_GUID.o sysfs.o diff --git a/kernel/drivers/infiniband/hw/mlx4/ah.c b/kernel/drivers/infiniband/hw/mlx4/ah.c new file mode 100644 index 000000000..f50a54622 --- /dev/null +++ b/kernel/drivers/infiniband/hw/mlx4/ah.c @@ -0,0 +1,178 @@ +/* + * Copyright (c) 2007 Cisco Systems, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include +#include + +#include +#include +#include +#include + +#include "mlx4_ib.h" + +static struct ib_ah *create_ib_ah(struct ib_pd *pd, struct ib_ah_attr *ah_attr, + struct mlx4_ib_ah *ah) +{ + struct mlx4_dev *dev = to_mdev(pd->device)->dev; + + ah->av.ib.port_pd = cpu_to_be32(to_mpd(pd)->pdn | (ah_attr->port_num << 24)); + ah->av.ib.g_slid = ah_attr->src_path_bits; + if (ah_attr->ah_flags & IB_AH_GRH) { + ah->av.ib.g_slid |= 0x80; + ah->av.ib.gid_index = ah_attr->grh.sgid_index; + ah->av.ib.hop_limit = ah_attr->grh.hop_limit; + ah->av.ib.sl_tclass_flowlabel |= + cpu_to_be32((ah_attr->grh.traffic_class << 20) | + ah_attr->grh.flow_label); + memcpy(ah->av.ib.dgid, ah_attr->grh.dgid.raw, 16); + } + + ah->av.ib.dlid = cpu_to_be16(ah_attr->dlid); + if (ah_attr->static_rate) { + ah->av.ib.stat_rate = ah_attr->static_rate + MLX4_STAT_RATE_OFFSET; + while (ah->av.ib.stat_rate > IB_RATE_2_5_GBPS + MLX4_STAT_RATE_OFFSET && + !(1 << ah->av.ib.stat_rate & dev->caps.stat_rate_support)) + --ah->av.ib.stat_rate; + } + ah->av.ib.sl_tclass_flowlabel = cpu_to_be32(ah_attr->sl << 28); + + return &ah->ibah; +} + +static struct ib_ah *create_iboe_ah(struct ib_pd *pd, struct ib_ah_attr *ah_attr, + struct mlx4_ib_ah *ah) +{ + struct mlx4_ib_dev *ibdev = to_mdev(pd->device); + struct mlx4_dev *dev = ibdev->dev; + int is_mcast = 0; + struct in6_addr in6; + u16 vlan_tag; + + memcpy(&in6, ah_attr->grh.dgid.raw, sizeof(in6)); + if (rdma_is_multicast_addr(&in6)) { + is_mcast = 1; + rdma_get_mcast_mac(&in6, ah->av.eth.mac); + } else { + memcpy(ah->av.eth.mac, ah_attr->dmac, ETH_ALEN); + } + vlan_tag = ah_attr->vlan_id; + if (vlan_tag < 0x1000) + vlan_tag |= (ah_attr->sl & 7) << 13; + ah->av.eth.port_pd = cpu_to_be32(to_mpd(pd)->pdn | (ah_attr->port_num << 24)); + ah->av.eth.gid_index = ah_attr->grh.sgid_index; + ah->av.eth.vlan = cpu_to_be16(vlan_tag); + if (ah_attr->static_rate) { + ah->av.eth.stat_rate = ah_attr->static_rate + MLX4_STAT_RATE_OFFSET; + while (ah->av.eth.stat_rate > IB_RATE_2_5_GBPS + MLX4_STAT_RATE_OFFSET && + !(1 << ah->av.eth.stat_rate & dev->caps.stat_rate_support)) + --ah->av.eth.stat_rate; + } + + /* + * HW requires multicast LID so we just choose one. + */ + if (is_mcast) + ah->av.ib.dlid = cpu_to_be16(0xc000); + + memcpy(ah->av.eth.dgid, ah_attr->grh.dgid.raw, 16); + ah->av.eth.sl_tclass_flowlabel = cpu_to_be32(ah_attr->sl << 29); + + return &ah->ibah; +} + +struct ib_ah *mlx4_ib_create_ah(struct ib_pd *pd, struct ib_ah_attr *ah_attr) +{ + struct mlx4_ib_ah *ah; + struct ib_ah *ret; + + ah = kzalloc(sizeof *ah, GFP_ATOMIC); + if (!ah) + return ERR_PTR(-ENOMEM); + + if (rdma_port_get_link_layer(pd->device, ah_attr->port_num) == IB_LINK_LAYER_ETHERNET) { + if (!(ah_attr->ah_flags & IB_AH_GRH)) { + ret = ERR_PTR(-EINVAL); + } else { + /* + * TBD: need to handle the case when we get + * called in an atomic context and there we + * might sleep. We don't expect this + * currently since we're working with link + * local addresses which we can translate + * without going to sleep. 
+ */ + ret = create_iboe_ah(pd, ah_attr, ah); + } + + if (IS_ERR(ret)) + kfree(ah); + + return ret; + } else + return create_ib_ah(pd, ah_attr, ah); /* never fails */ +} + +int mlx4_ib_query_ah(struct ib_ah *ibah, struct ib_ah_attr *ah_attr) +{ + struct mlx4_ib_ah *ah = to_mah(ibah); + enum rdma_link_layer ll; + + memset(ah_attr, 0, sizeof *ah_attr); + ah_attr->sl = be32_to_cpu(ah->av.ib.sl_tclass_flowlabel) >> 28; + ah_attr->port_num = be32_to_cpu(ah->av.ib.port_pd) >> 24; + ll = rdma_port_get_link_layer(ibah->device, ah_attr->port_num); + ah_attr->dlid = ll == IB_LINK_LAYER_INFINIBAND ? be16_to_cpu(ah->av.ib.dlid) : 0; + if (ah->av.ib.stat_rate) + ah_attr->static_rate = ah->av.ib.stat_rate - MLX4_STAT_RATE_OFFSET; + ah_attr->src_path_bits = ah->av.ib.g_slid & 0x7F; + + if (mlx4_ib_ah_grh_present(ah)) { + ah_attr->ah_flags = IB_AH_GRH; + + ah_attr->grh.traffic_class = + be32_to_cpu(ah->av.ib.sl_tclass_flowlabel) >> 20; + ah_attr->grh.flow_label = + be32_to_cpu(ah->av.ib.sl_tclass_flowlabel) & 0xfffff; + ah_attr->grh.hop_limit = ah->av.ib.hop_limit; + ah_attr->grh.sgid_index = ah->av.ib.gid_index; + memcpy(ah_attr->grh.dgid.raw, ah->av.ib.dgid, 16); + } + + return 0; +} + +int mlx4_ib_destroy_ah(struct ib_ah *ah) +{ + kfree(to_mah(ah)); + return 0; +} diff --git a/kernel/drivers/infiniband/hw/mlx4/alias_GUID.c b/kernel/drivers/infiniband/hw/mlx4/alias_GUID.c new file mode 100644 index 000000000..0f00204d2 --- /dev/null +++ b/kernel/drivers/infiniband/hw/mlx4/alias_GUID.c @@ -0,0 +1,901 @@ +/* + * Copyright (c) 2012 Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + /***********************************************************/ +/*This file support the handling of the Alias GUID feature. */ +/***********************************************************/ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "mlx4_ib.h" + +/* +The driver keeps the current state of all guids, as they are in the HW. +Whenever we receive an smp mad GUIDInfo record, the data will be cached. 
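+
+As a worked illustration of the indexing used throughout this file (derived
+from the arithmetic in the functions below, nothing new): each GUIDInfo
+record, or "block", carries NUM_ALIAS_GUID_IN_REC = 8 alias GUIDs, so the
+entry administered for function (slave) N lives at
+
+	block_num = N / NUM_ALIAS_GUID_IN_REC;	(VF 13 -> block 1)
+	index     = N % NUM_ALIAS_GUID_IN_REC;	(VF 13 -> entry 5)
+
+and conversely slave_id = block_num * NUM_ALIAS_GUID_IN_REC + index.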
+*/ + +struct mlx4_alias_guid_work_context { + u8 port; + struct mlx4_ib_dev *dev ; + struct ib_sa_query *sa_query; + struct completion done; + int query_id; + struct list_head list; + int block_num; + ib_sa_comp_mask guid_indexes; + u8 method; +}; + +struct mlx4_next_alias_guid_work { + u8 port; + u8 block_num; + u8 method; + struct mlx4_sriov_alias_guid_info_rec_det rec_det; +}; + +static int get_low_record_time_index(struct mlx4_ib_dev *dev, u8 port, + int *resched_delay_sec); + +void mlx4_ib_update_cache_on_guid_change(struct mlx4_ib_dev *dev, int block_num, + u8 port_num, u8 *p_data) +{ + int i; + u64 guid_indexes; + int slave_id; + int port_index = port_num - 1; + + if (!mlx4_is_master(dev->dev)) + return; + + guid_indexes = be64_to_cpu((__force __be64) dev->sriov.alias_guid. + ports_guid[port_num - 1]. + all_rec_per_port[block_num].guid_indexes); + pr_debug("port: %d, guid_indexes: 0x%llx\n", port_num, guid_indexes); + + for (i = 0; i < NUM_ALIAS_GUID_IN_REC; i++) { + /* The location of the specific index starts from bit number 4 + * until bit num 11 */ + if (test_bit(i + 4, (unsigned long *)&guid_indexes)) { + slave_id = (block_num * NUM_ALIAS_GUID_IN_REC) + i ; + if (slave_id >= dev->dev->num_slaves) { + pr_debug("The last slave: %d\n", slave_id); + return; + } + + /* cache the guid: */ + memcpy(&dev->sriov.demux[port_index].guid_cache[slave_id], + &p_data[i * GUID_REC_SIZE], + GUID_REC_SIZE); + } else + pr_debug("Guid number: %d in block: %d" + " was not updated\n", i, block_num); + } +} + +static __be64 get_cached_alias_guid(struct mlx4_ib_dev *dev, int port, int index) +{ + if (index >= NUM_ALIAS_GUID_PER_PORT) { + pr_err("%s: ERROR: asked for index:%d\n", __func__, index); + return (__force __be64) -1; + } + return *(__be64 *)&dev->sriov.demux[port - 1].guid_cache[index]; +} + + +ib_sa_comp_mask mlx4_ib_get_aguid_comp_mask_from_ix(int index) +{ + return IB_SA_COMP_MASK(4 + index); +} + +void mlx4_ib_slave_alias_guid_event(struct mlx4_ib_dev *dev, int slave, + int port, int slave_init) +{ + __be64 curr_guid, required_guid; + int record_num = slave / 8; + int index = slave % 8; + int port_index = port - 1; + unsigned long flags; + int do_work = 0; + + spin_lock_irqsave(&dev->sriov.alias_guid.ag_work_lock, flags); + if (dev->sriov.alias_guid.ports_guid[port_index].state_flags & + GUID_STATE_NEED_PORT_INIT) + goto unlock; + if (!slave_init) { + curr_guid = *(__be64 *)&dev->sriov. + alias_guid.ports_guid[port_index]. + all_rec_per_port[record_num]. + all_recs[GUID_REC_SIZE * index]; + if (curr_guid == cpu_to_be64(MLX4_GUID_FOR_DELETE_VAL) || + !curr_guid) + goto unlock; + required_guid = cpu_to_be64(MLX4_GUID_FOR_DELETE_VAL); + } else { + required_guid = mlx4_get_admin_guid(dev->dev, slave, port); + if (required_guid == cpu_to_be64(MLX4_GUID_FOR_DELETE_VAL)) + goto unlock; + } + *(__be64 *)&dev->sriov.alias_guid.ports_guid[port_index]. + all_rec_per_port[record_num]. + all_recs[GUID_REC_SIZE * index] = required_guid; + dev->sriov.alias_guid.ports_guid[port_index]. + all_rec_per_port[record_num].guid_indexes + |= mlx4_ib_get_aguid_comp_mask_from_ix(index); + dev->sriov.alias_guid.ports_guid[port_index]. + all_rec_per_port[record_num].status + = MLX4_GUID_INFO_STATUS_IDLE; + /* set to run immediately */ + dev->sriov.alias_guid.ports_guid[port_index]. + all_rec_per_port[record_num].time_to_run = 0; + dev->sriov.alias_guid.ports_guid[port_index]. + all_rec_per_port[record_num]. 
+ guids_retry_schedule[index] = 0; + do_work = 1; +unlock: + spin_unlock_irqrestore(&dev->sriov.alias_guid.ag_work_lock, flags); + + if (do_work) + mlx4_ib_init_alias_guid_work(dev, port_index); +} + +/* + * Whenever new GUID is set/unset (guid table change) create event and + * notify the relevant slave (master also should be notified). + * If the GUID value is not as we have in the cache the slave will not be + * updated; in this case it waits for the smp_snoop or the port management + * event to call the function and to update the slave. + * block_number - the index of the block (16 blocks available) + * port_number - 1 or 2 + */ +void mlx4_ib_notify_slaves_on_guid_change(struct mlx4_ib_dev *dev, + int block_num, u8 port_num, + u8 *p_data) +{ + int i; + u64 guid_indexes; + int slave_id; + enum slave_port_state new_state; + enum slave_port_state prev_state; + __be64 tmp_cur_ag, form_cache_ag; + enum slave_port_gen_event gen_event; + struct mlx4_sriov_alias_guid_info_rec_det *rec; + unsigned long flags; + __be64 required_value; + + if (!mlx4_is_master(dev->dev)) + return; + + rec = &dev->sriov.alias_guid.ports_guid[port_num - 1]. + all_rec_per_port[block_num]; + guid_indexes = be64_to_cpu((__force __be64) dev->sriov.alias_guid. + ports_guid[port_num - 1]. + all_rec_per_port[block_num].guid_indexes); + pr_debug("port: %d, guid_indexes: 0x%llx\n", port_num, guid_indexes); + + /*calculate the slaves and notify them*/ + for (i = 0; i < NUM_ALIAS_GUID_IN_REC; i++) { + /* the location of the specific index runs from bits 4..11 */ + if (!(test_bit(i + 4, (unsigned long *)&guid_indexes))) + continue; + + slave_id = (block_num * NUM_ALIAS_GUID_IN_REC) + i ; + if (slave_id >= dev->dev->persist->num_vfs + 1) + return; + tmp_cur_ag = *(__be64 *)&p_data[i * GUID_REC_SIZE]; + form_cache_ag = get_cached_alias_guid(dev, port_num, + (NUM_ALIAS_GUID_IN_REC * block_num) + i); + /* + * Check if guid is not the same as in the cache, + * If it is different, wait for the snoop_smp or the port mgmt + * change event to update the slave on its port state change + */ + if (tmp_cur_ag != form_cache_ag) + continue; + + spin_lock_irqsave(&dev->sriov.alias_guid.ag_work_lock, flags); + required_value = *(__be64 *)&rec->all_recs[i * GUID_REC_SIZE]; + + if (required_value == cpu_to_be64(MLX4_GUID_FOR_DELETE_VAL)) + required_value = 0; + + if (tmp_cur_ag == required_value) { + rec->guid_indexes = rec->guid_indexes & + ~mlx4_ib_get_aguid_comp_mask_from_ix(i); + } else { + /* may notify port down if value is 0 */ + if (tmp_cur_ag != MLX4_NOT_SET_GUID) { + spin_unlock_irqrestore(&dev->sriov. 
+ alias_guid.ag_work_lock, flags); + continue; + } + } + spin_unlock_irqrestore(&dev->sriov.alias_guid.ag_work_lock, + flags); + mlx4_gen_guid_change_eqe(dev->dev, slave_id, port_num); + /*2 cases: Valid GUID, and Invalid Guid*/ + + if (tmp_cur_ag != MLX4_NOT_SET_GUID) { /*valid GUID*/ + prev_state = mlx4_get_slave_port_state(dev->dev, slave_id, port_num); + new_state = set_and_calc_slave_port_state(dev->dev, slave_id, port_num, + MLX4_PORT_STATE_IB_PORT_STATE_EVENT_GID_VALID, + &gen_event); + pr_debug("slave: %d, port: %d prev_port_state: %d," + " new_port_state: %d, gen_event: %d\n", + slave_id, port_num, prev_state, new_state, gen_event); + if (gen_event == SLAVE_PORT_GEN_EVENT_UP) { + pr_debug("sending PORT_UP event to slave: %d, port: %d\n", + slave_id, port_num); + mlx4_gen_port_state_change_eqe(dev->dev, slave_id, + port_num, MLX4_PORT_CHANGE_SUBTYPE_ACTIVE); + } + } else { /* request to invalidate GUID */ + set_and_calc_slave_port_state(dev->dev, slave_id, port_num, + MLX4_PORT_STATE_IB_EVENT_GID_INVALID, + &gen_event); + if (gen_event == SLAVE_PORT_GEN_EVENT_DOWN) { + pr_debug("sending PORT DOWN event to slave: %d, port: %d\n", + slave_id, port_num); + mlx4_gen_port_state_change_eqe(dev->dev, + slave_id, + port_num, + MLX4_PORT_CHANGE_SUBTYPE_DOWN); + } + } + } +} + +static void aliasguid_query_handler(int status, + struct ib_sa_guidinfo_rec *guid_rec, + void *context) +{ + struct mlx4_ib_dev *dev; + struct mlx4_alias_guid_work_context *cb_ctx = context; + u8 port_index ; + int i; + struct mlx4_sriov_alias_guid_info_rec_det *rec; + unsigned long flags, flags1; + ib_sa_comp_mask declined_guid_indexes = 0; + ib_sa_comp_mask applied_guid_indexes = 0; + unsigned int resched_delay_sec = 0; + + if (!context) + return; + + dev = cb_ctx->dev; + port_index = cb_ctx->port - 1; + rec = &dev->sriov.alias_guid.ports_guid[port_index]. + all_rec_per_port[cb_ctx->block_num]; + + if (status) { + pr_debug("(port: %d) failed: status = %d\n", + cb_ctx->port, status); + rec->time_to_run = ktime_get_real_ns() + 1 * NSEC_PER_SEC; + goto out; + } + + if (guid_rec->block_num != cb_ctx->block_num) { + pr_err("block num mismatch: %d != %d\n", + cb_ctx->block_num, guid_rec->block_num); + goto out; + } + + pr_debug("lid/port: %d/%d, block_num: %d\n", + be16_to_cpu(guid_rec->lid), cb_ctx->port, + guid_rec->block_num); + + rec = &dev->sriov.alias_guid.ports_guid[port_index]. + all_rec_per_port[guid_rec->block_num]; + + spin_lock_irqsave(&dev->sriov.alias_guid.ag_work_lock, flags); + for (i = 0 ; i < NUM_ALIAS_GUID_IN_REC; i++) { + __be64 sm_response, required_val; + + if (!(cb_ctx->guid_indexes & + mlx4_ib_get_aguid_comp_mask_from_ix(i))) + continue; + sm_response = *(__be64 *)&guid_rec->guid_info_list + [i * GUID_REC_SIZE]; + required_val = *(__be64 *)&rec->all_recs[i * GUID_REC_SIZE]; + if (cb_ctx->method == MLX4_GUID_INFO_RECORD_DELETE) { + if (required_val == + cpu_to_be64(MLX4_GUID_FOR_DELETE_VAL)) + goto next_entry; + + /* A new value was set till we got the response */ + pr_debug("need to set new value %llx, record num %d, block_num:%d\n", + be64_to_cpu(required_val), + i, guid_rec->block_num); + goto entry_declined; + } + + /* check if the SM didn't assign one of the records. + * if it didn't, re-ask for. + */ + if (sm_response == MLX4_NOT_SET_GUID) { + if (rec->guids_retry_schedule[i] == 0) + mlx4_ib_warn(&dev->ib_dev, + "%s:Record num %d in block_num: %d was declined by SM\n", + __func__, i, + guid_rec->block_num); + goto entry_declined; + } else { + /* properly assigned record. 
*/ + /* We save the GUID we just got from the SM in the + * admin_guid in order to be persistent, and in the + * request from the sm the process will ask for the same GUID */ + if (required_val && + sm_response != required_val) { + /* Warn only on first retry */ + if (rec->guids_retry_schedule[i] == 0) + mlx4_ib_warn(&dev->ib_dev, "%s: Failed to set" + " admin guid after SysAdmin " + "configuration. " + "Record num %d in block_num:%d " + "was declined by SM, " + "new val(0x%llx) was kept, SM returned (0x%llx)\n", + __func__, i, + guid_rec->block_num, + be64_to_cpu(required_val), + be64_to_cpu(sm_response)); + goto entry_declined; + } else { + *(__be64 *)&rec->all_recs[i * GUID_REC_SIZE] = + sm_response; + if (required_val == 0) + mlx4_set_admin_guid(dev->dev, + sm_response, + (guid_rec->block_num + * NUM_ALIAS_GUID_IN_REC) + i, + cb_ctx->port); + goto next_entry; + } + } +entry_declined: + declined_guid_indexes |= mlx4_ib_get_aguid_comp_mask_from_ix(i); + rec->guids_retry_schedule[i] = + (rec->guids_retry_schedule[i] == 0) ? 1 : + min((unsigned int)60, + rec->guids_retry_schedule[i] * 2); + /* using the minimum value among all entries in that record */ + resched_delay_sec = (resched_delay_sec == 0) ? + rec->guids_retry_schedule[i] : + min(resched_delay_sec, + rec->guids_retry_schedule[i]); + continue; + +next_entry: + rec->guids_retry_schedule[i] = 0; + } + + applied_guid_indexes = cb_ctx->guid_indexes & ~declined_guid_indexes; + if (declined_guid_indexes || + rec->guid_indexes & ~(applied_guid_indexes)) { + pr_debug("record=%d wasn't fully set, guid_indexes=0x%llx applied_indexes=0x%llx, declined_indexes=0x%llx\n", + guid_rec->block_num, + be64_to_cpu((__force __be64)rec->guid_indexes), + be64_to_cpu((__force __be64)applied_guid_indexes), + be64_to_cpu((__force __be64)declined_guid_indexes)); + rec->time_to_run = ktime_get_real_ns() + + resched_delay_sec * NSEC_PER_SEC; + } else { + rec->status = MLX4_GUID_INFO_STATUS_SET; + } + spin_unlock_irqrestore(&dev->sriov.alias_guid.ag_work_lock, flags); + /* + The func is call here to close the cases when the + sm doesn't send smp, so in the sa response the driver + notifies the slave. + */ + mlx4_ib_notify_slaves_on_guid_change(dev, guid_rec->block_num, + cb_ctx->port, + guid_rec->guid_info_list); +out: + spin_lock_irqsave(&dev->sriov.going_down_lock, flags); + spin_lock_irqsave(&dev->sriov.alias_guid.ag_work_lock, flags1); + if (!dev->sriov.is_going_down) { + get_low_record_time_index(dev, port_index, &resched_delay_sec); + queue_delayed_work(dev->sriov.alias_guid.ports_guid[port_index].wq, + &dev->sriov.alias_guid.ports_guid[port_index]. + alias_guid_work, + msecs_to_jiffies(resched_delay_sec * 1000)); + } + if (cb_ctx->sa_query) { + list_del(&cb_ctx->list); + kfree(cb_ctx); + } else + complete(&cb_ctx->done); + spin_unlock_irqrestore(&dev->sriov.alias_guid.ag_work_lock, flags1); + spin_unlock_irqrestore(&dev->sriov.going_down_lock, flags); +} + +static void invalidate_guid_record(struct mlx4_ib_dev *dev, u8 port, int index) +{ + int i; + u64 cur_admin_val; + ib_sa_comp_mask comp_mask = 0; + + dev->sriov.alias_guid.ports_guid[port - 1].all_rec_per_port[index].status + = MLX4_GUID_INFO_STATUS_SET; + + /* calculate the comp_mask for that record.*/ + for (i = 0; i < NUM_ALIAS_GUID_IN_REC; i++) { + cur_admin_val = + *(u64 *)&dev->sriov.alias_guid.ports_guid[port - 1]. 
+ all_rec_per_port[index].all_recs[GUID_REC_SIZE * i]; + /* + check the admin value: if it's for delete (~00LL) or + it is the first guid of the first record (hw guid) or + the records is not in ownership of the sysadmin and the sm doesn't + need to assign GUIDs, then don't put it up for assignment. + */ + if (MLX4_GUID_FOR_DELETE_VAL == cur_admin_val || + (!index && !i)) + continue; + comp_mask |= mlx4_ib_get_aguid_comp_mask_from_ix(i); + } + dev->sriov.alias_guid.ports_guid[port - 1]. + all_rec_per_port[index].guid_indexes |= comp_mask; + if (dev->sriov.alias_guid.ports_guid[port - 1]. + all_rec_per_port[index].guid_indexes) + dev->sriov.alias_guid.ports_guid[port - 1]. + all_rec_per_port[index].status = MLX4_GUID_INFO_STATUS_IDLE; + +} + +static int set_guid_rec(struct ib_device *ibdev, + struct mlx4_next_alias_guid_work *rec) +{ + int err; + struct mlx4_ib_dev *dev = to_mdev(ibdev); + struct ib_sa_guidinfo_rec guid_info_rec; + ib_sa_comp_mask comp_mask; + struct ib_port_attr attr; + struct mlx4_alias_guid_work_context *callback_context; + unsigned long resched_delay, flags, flags1; + u8 port = rec->port + 1; + int index = rec->block_num; + struct mlx4_sriov_alias_guid_info_rec_det *rec_det = &rec->rec_det; + struct list_head *head = + &dev->sriov.alias_guid.ports_guid[port - 1].cb_list; + + err = __mlx4_ib_query_port(ibdev, port, &attr, 1); + if (err) { + pr_debug("mlx4_ib_query_port failed (err: %d), port: %d\n", + err, port); + return err; + } + /*check the port was configured by the sm, otherwise no need to send */ + if (attr.state != IB_PORT_ACTIVE) { + pr_debug("port %d not active...rescheduling\n", port); + resched_delay = 5 * HZ; + err = -EAGAIN; + goto new_schedule; + } + + callback_context = kmalloc(sizeof *callback_context, GFP_KERNEL); + if (!callback_context) { + err = -ENOMEM; + resched_delay = HZ * 5; + goto new_schedule; + } + callback_context->port = port; + callback_context->dev = dev; + callback_context->block_num = index; + callback_context->guid_indexes = rec_det->guid_indexes; + callback_context->method = rec->method; + + memset(&guid_info_rec, 0, sizeof (struct ib_sa_guidinfo_rec)); + + guid_info_rec.lid = cpu_to_be16(attr.lid); + guid_info_rec.block_num = index; + + memcpy(guid_info_rec.guid_info_list, rec_det->all_recs, + GUID_REC_SIZE * NUM_ALIAS_GUID_IN_REC); + comp_mask = IB_SA_GUIDINFO_REC_LID | IB_SA_GUIDINFO_REC_BLOCK_NUM | + rec_det->guid_indexes; + + init_completion(&callback_context->done); + spin_lock_irqsave(&dev->sriov.alias_guid.ag_work_lock, flags1); + list_add_tail(&callback_context->list, head); + spin_unlock_irqrestore(&dev->sriov.alias_guid.ag_work_lock, flags1); + + callback_context->query_id = + ib_sa_guid_info_rec_query(dev->sriov.alias_guid.sa_client, + ibdev, port, &guid_info_rec, + comp_mask, rec->method, 1000, + GFP_KERNEL, aliasguid_query_handler, + callback_context, + &callback_context->sa_query); + if (callback_context->query_id < 0) { + pr_debug("ib_sa_guid_info_rec_query failed, query_id: " + "%d. 
will reschedule to the next 1 sec.\n", + callback_context->query_id); + spin_lock_irqsave(&dev->sriov.alias_guid.ag_work_lock, flags1); + list_del(&callback_context->list); + kfree(callback_context); + spin_unlock_irqrestore(&dev->sriov.alias_guid.ag_work_lock, flags1); + resched_delay = 1 * HZ; + err = -EAGAIN; + goto new_schedule; + } + err = 0; + goto out; + +new_schedule: + spin_lock_irqsave(&dev->sriov.going_down_lock, flags); + spin_lock_irqsave(&dev->sriov.alias_guid.ag_work_lock, flags1); + invalidate_guid_record(dev, port, index); + if (!dev->sriov.is_going_down) { + queue_delayed_work(dev->sriov.alias_guid.ports_guid[port - 1].wq, + &dev->sriov.alias_guid.ports_guid[port - 1].alias_guid_work, + resched_delay); + } + spin_unlock_irqrestore(&dev->sriov.alias_guid.ag_work_lock, flags1); + spin_unlock_irqrestore(&dev->sriov.going_down_lock, flags); + +out: + return err; +} + +static void mlx4_ib_guid_port_init(struct mlx4_ib_dev *dev, int port) +{ + int j, k, entry; + __be64 guid; + + /*Check if the SM doesn't need to assign the GUIDs*/ + for (j = 0; j < NUM_ALIAS_GUID_REC_IN_PORT; j++) { + for (k = 0; k < NUM_ALIAS_GUID_IN_REC; k++) { + entry = j * NUM_ALIAS_GUID_IN_REC + k; + /* no request for the 0 entry (hw guid) */ + if (!entry || entry > dev->dev->persist->num_vfs || + !mlx4_is_slave_active(dev->dev, entry)) + continue; + guid = mlx4_get_admin_guid(dev->dev, entry, port); + *(__be64 *)&dev->sriov.alias_guid.ports_guid[port - 1]. + all_rec_per_port[j].all_recs + [GUID_REC_SIZE * k] = guid; + pr_debug("guid was set, entry=%d, val=0x%llx, port=%d\n", + entry, + be64_to_cpu(guid), + port); + } + } +} +void mlx4_ib_invalidate_all_guid_record(struct mlx4_ib_dev *dev, int port) +{ + int i; + unsigned long flags, flags1; + + pr_debug("port %d\n", port); + + spin_lock_irqsave(&dev->sriov.going_down_lock, flags); + spin_lock_irqsave(&dev->sriov.alias_guid.ag_work_lock, flags1); + + if (dev->sriov.alias_guid.ports_guid[port - 1].state_flags & + GUID_STATE_NEED_PORT_INIT) { + mlx4_ib_guid_port_init(dev, port); + dev->sriov.alias_guid.ports_guid[port - 1].state_flags &= + (~GUID_STATE_NEED_PORT_INIT); + } + for (i = 0; i < NUM_ALIAS_GUID_REC_IN_PORT; i++) + invalidate_guid_record(dev, port, i); + + if (mlx4_is_master(dev->dev) && !dev->sriov.is_going_down) { + /* + make sure no work waits in the queue, if the work is already + queued(not on the timer) the cancel will fail. That is not a problem + because we just want the work started. + */ + cancel_delayed_work(&dev->sriov.alias_guid. + ports_guid[port - 1].alias_guid_work); + queue_delayed_work(dev->sriov.alias_guid.ports_guid[port - 1].wq, + &dev->sriov.alias_guid.ports_guid[port - 1].alias_guid_work, + 0); + } + spin_unlock_irqrestore(&dev->sriov.alias_guid.ag_work_lock, flags1); + spin_unlock_irqrestore(&dev->sriov.going_down_lock, flags); +} + +static void set_required_record(struct mlx4_ib_dev *dev, u8 port, + struct mlx4_next_alias_guid_work *next_rec, + int record_index) +{ + int i; + int lowset_time_entry = -1; + int lowest_time = 0; + ib_sa_comp_mask delete_guid_indexes = 0; + ib_sa_comp_mask set_guid_indexes = 0; + struct mlx4_sriov_alias_guid_info_rec_det *rec = + &dev->sriov.alias_guid.ports_guid[port]. 
+ all_rec_per_port[record_index]; + + for (i = 0; i < NUM_ALIAS_GUID_IN_REC; i++) { + if (!(rec->guid_indexes & + mlx4_ib_get_aguid_comp_mask_from_ix(i))) + continue; + + if (*(__be64 *)&rec->all_recs[i * GUID_REC_SIZE] == + cpu_to_be64(MLX4_GUID_FOR_DELETE_VAL)) + delete_guid_indexes |= + mlx4_ib_get_aguid_comp_mask_from_ix(i); + else + set_guid_indexes |= + mlx4_ib_get_aguid_comp_mask_from_ix(i); + + if (lowset_time_entry == -1 || rec->guids_retry_schedule[i] <= + lowest_time) { + lowset_time_entry = i; + lowest_time = rec->guids_retry_schedule[i]; + } + } + + memcpy(&next_rec->rec_det, rec, sizeof(*rec)); + next_rec->port = port; + next_rec->block_num = record_index; + + if (*(__be64 *)&rec->all_recs[lowset_time_entry * GUID_REC_SIZE] == + cpu_to_be64(MLX4_GUID_FOR_DELETE_VAL)) { + next_rec->rec_det.guid_indexes = delete_guid_indexes; + next_rec->method = MLX4_GUID_INFO_RECORD_DELETE; + } else { + next_rec->rec_det.guid_indexes = set_guid_indexes; + next_rec->method = MLX4_GUID_INFO_RECORD_SET; + } +} + +/* return index of record that should be updated based on lowest + * rescheduled time + */ +static int get_low_record_time_index(struct mlx4_ib_dev *dev, u8 port, + int *resched_delay_sec) +{ + int record_index = -1; + u64 low_record_time = 0; + struct mlx4_sriov_alias_guid_info_rec_det rec; + int j; + + for (j = 0; j < NUM_ALIAS_GUID_REC_IN_PORT; j++) { + rec = dev->sriov.alias_guid.ports_guid[port]. + all_rec_per_port[j]; + if (rec.status == MLX4_GUID_INFO_STATUS_IDLE && + rec.guid_indexes) { + if (record_index == -1 || + rec.time_to_run < low_record_time) { + record_index = j; + low_record_time = rec.time_to_run; + } + } + } + if (resched_delay_sec) { + u64 curr_time = ktime_get_real_ns(); + + *resched_delay_sec = (low_record_time < curr_time) ? 
0 : + div_u64((low_record_time - curr_time), NSEC_PER_SEC); + } + + return record_index; +} + +/* The function returns the next record that was + * not configured (or failed to be configured) */ +static int get_next_record_to_update(struct mlx4_ib_dev *dev, u8 port, + struct mlx4_next_alias_guid_work *rec) +{ + unsigned long flags; + int record_index; + int ret = 0; + + spin_lock_irqsave(&dev->sriov.alias_guid.ag_work_lock, flags); + record_index = get_low_record_time_index(dev, port, NULL); + + if (record_index < 0) { + ret = -ENOENT; + goto out; + } + + set_required_record(dev, port, rec, record_index); +out: + spin_unlock_irqrestore(&dev->sriov.alias_guid.ag_work_lock, flags); + return ret; +} + +static void alias_guid_work(struct work_struct *work) +{ + struct delayed_work *delay = to_delayed_work(work); + int ret = 0; + struct mlx4_next_alias_guid_work *rec; + struct mlx4_sriov_alias_guid_port_rec_det *sriov_alias_port = + container_of(delay, struct mlx4_sriov_alias_guid_port_rec_det, + alias_guid_work); + struct mlx4_sriov_alias_guid *sriov_alias_guid = sriov_alias_port->parent; + struct mlx4_ib_sriov *ib_sriov = container_of(sriov_alias_guid, + struct mlx4_ib_sriov, + alias_guid); + struct mlx4_ib_dev *dev = container_of(ib_sriov, struct mlx4_ib_dev, sriov); + + rec = kzalloc(sizeof *rec, GFP_KERNEL); + if (!rec) { + pr_err("alias_guid_work: No Memory\n"); + return; + } + + pr_debug("starting [port: %d]...\n", sriov_alias_port->port + 1); + ret = get_next_record_to_update(dev, sriov_alias_port->port, rec); + if (ret) { + pr_debug("No more records to update.\n"); + goto out; + } + + set_guid_rec(&dev->ib_dev, rec); +out: + kfree(rec); +} + + +void mlx4_ib_init_alias_guid_work(struct mlx4_ib_dev *dev, int port) +{ + unsigned long flags, flags1; + + if (!mlx4_is_master(dev->dev)) + return; + spin_lock_irqsave(&dev->sriov.going_down_lock, flags); + spin_lock_irqsave(&dev->sriov.alias_guid.ag_work_lock, flags1); + if (!dev->sriov.is_going_down) { + /* If there is pending one should cancell then run, otherwise + * won't run till previous one is ended as same work + * struct is used. + */ + cancel_delayed_work(&dev->sriov.alias_guid.ports_guid[port]. 
+ alias_guid_work); + queue_delayed_work(dev->sriov.alias_guid.ports_guid[port].wq, + &dev->sriov.alias_guid.ports_guid[port].alias_guid_work, 0); + } + spin_unlock_irqrestore(&dev->sriov.alias_guid.ag_work_lock, flags1); + spin_unlock_irqrestore(&dev->sriov.going_down_lock, flags); +} + +void mlx4_ib_destroy_alias_guid_service(struct mlx4_ib_dev *dev) +{ + int i; + struct mlx4_ib_sriov *sriov = &dev->sriov; + struct mlx4_alias_guid_work_context *cb_ctx; + struct mlx4_sriov_alias_guid_port_rec_det *det; + struct ib_sa_query *sa_query; + unsigned long flags; + + for (i = 0 ; i < dev->num_ports; i++) { + cancel_delayed_work(&dev->sriov.alias_guid.ports_guid[i].alias_guid_work); + det = &sriov->alias_guid.ports_guid[i]; + spin_lock_irqsave(&sriov->alias_guid.ag_work_lock, flags); + while (!list_empty(&det->cb_list)) { + cb_ctx = list_entry(det->cb_list.next, + struct mlx4_alias_guid_work_context, + list); + sa_query = cb_ctx->sa_query; + cb_ctx->sa_query = NULL; + list_del(&cb_ctx->list); + spin_unlock_irqrestore(&sriov->alias_guid.ag_work_lock, flags); + ib_sa_cancel_query(cb_ctx->query_id, sa_query); + wait_for_completion(&cb_ctx->done); + kfree(cb_ctx); + spin_lock_irqsave(&sriov->alias_guid.ag_work_lock, flags); + } + spin_unlock_irqrestore(&sriov->alias_guid.ag_work_lock, flags); + } + for (i = 0 ; i < dev->num_ports; i++) { + flush_workqueue(dev->sriov.alias_guid.ports_guid[i].wq); + destroy_workqueue(dev->sriov.alias_guid.ports_guid[i].wq); + } + ib_sa_unregister_client(dev->sriov.alias_guid.sa_client); + kfree(dev->sriov.alias_guid.sa_client); +} + +int mlx4_ib_init_alias_guid_service(struct mlx4_ib_dev *dev) +{ + char alias_wq_name[15]; + int ret = 0; + int i, j; + union ib_gid gid; + + if (!mlx4_is_master(dev->dev)) + return 0; + dev->sriov.alias_guid.sa_client = + kzalloc(sizeof *dev->sriov.alias_guid.sa_client, GFP_KERNEL); + if (!dev->sriov.alias_guid.sa_client) + return -ENOMEM; + + ib_sa_register_client(dev->sriov.alias_guid.sa_client); + + spin_lock_init(&dev->sriov.alias_guid.ag_work_lock); + + for (i = 1; i <= dev->num_ports; ++i) { + if (dev->ib_dev.query_gid(&dev->ib_dev , i, 0, &gid)) { + ret = -EFAULT; + goto err_unregister; + } + } + + for (i = 0 ; i < dev->num_ports; i++) { + memset(&dev->sriov.alias_guid.ports_guid[i], 0, + sizeof (struct mlx4_sriov_alias_guid_port_rec_det)); + dev->sriov.alias_guid.ports_guid[i].state_flags |= + GUID_STATE_NEED_PORT_INIT; + for (j = 0; j < NUM_ALIAS_GUID_REC_IN_PORT; j++) { + /* mark each val as it was deleted */ + memset(dev->sriov.alias_guid.ports_guid[i]. + all_rec_per_port[j].all_recs, 0xFF, + sizeof(dev->sriov.alias_guid.ports_guid[i]. 
+ all_rec_per_port[j].all_recs)); + } + INIT_LIST_HEAD(&dev->sriov.alias_guid.ports_guid[i].cb_list); + /*prepare the records, set them to be allocated by sm*/ + if (mlx4_ib_sm_guid_assign) + for (j = 1; j < NUM_ALIAS_GUID_PER_PORT; j++) + mlx4_set_admin_guid(dev->dev, 0, j, i + 1); + for (j = 0 ; j < NUM_ALIAS_GUID_REC_IN_PORT; j++) + invalidate_guid_record(dev, i + 1, j); + + dev->sriov.alias_guid.ports_guid[i].parent = &dev->sriov.alias_guid; + dev->sriov.alias_guid.ports_guid[i].port = i; + + snprintf(alias_wq_name, sizeof alias_wq_name, "alias_guid%d", i); + dev->sriov.alias_guid.ports_guid[i].wq = + create_singlethread_workqueue(alias_wq_name); + if (!dev->sriov.alias_guid.ports_guid[i].wq) { + ret = -ENOMEM; + goto err_thread; + } + INIT_DELAYED_WORK(&dev->sriov.alias_guid.ports_guid[i].alias_guid_work, + alias_guid_work); + } + return 0; + +err_thread: + for (--i; i >= 0; i--) { + destroy_workqueue(dev->sriov.alias_guid.ports_guid[i].wq); + dev->sriov.alias_guid.ports_guid[i].wq = NULL; + } + +err_unregister: + ib_sa_unregister_client(dev->sriov.alias_guid.sa_client); + kfree(dev->sriov.alias_guid.sa_client); + dev->sriov.alias_guid.sa_client = NULL; + pr_err("init_alias_guid_service: Failed. (ret:%d)\n", ret); + return ret; +} diff --git a/kernel/drivers/infiniband/hw/mlx4/cm.c b/kernel/drivers/infiniband/hw/mlx4/cm.c new file mode 100644 index 000000000..39a488889 --- /dev/null +++ b/kernel/drivers/infiniband/hw/mlx4/cm.c @@ -0,0 +1,478 @@ +/* + * Copyright (c) 2012 Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include + +#include +#include +#include +#include + +#include "mlx4_ib.h" + +#define CM_CLEANUP_CACHE_TIMEOUT (5 * HZ) + +struct id_map_entry { + struct rb_node node; + + u32 sl_cm_id; + u32 pv_cm_id; + int slave_id; + int scheduled_delete; + struct mlx4_ib_dev *dev; + + struct list_head list; + struct delayed_work timeout; +}; + +struct cm_generic_msg { + struct ib_mad_hdr hdr; + + __be32 local_comm_id; + __be32 remote_comm_id; +}; + +struct cm_sidr_generic_msg { + struct ib_mad_hdr hdr; + __be32 request_id; +}; + +struct cm_req_msg { + unsigned char unused[0x60]; + union ib_gid primary_path_sgid; +}; + + +static void set_local_comm_id(struct ib_mad *mad, u32 cm_id) +{ + if (mad->mad_hdr.attr_id == CM_SIDR_REQ_ATTR_ID) { + struct cm_sidr_generic_msg *msg = + (struct cm_sidr_generic_msg *)mad; + msg->request_id = cpu_to_be32(cm_id); + } else if (mad->mad_hdr.attr_id == CM_SIDR_REP_ATTR_ID) { + pr_err("trying to set local_comm_id in SIDR_REP\n"); + return; + } else { + struct cm_generic_msg *msg = (struct cm_generic_msg *)mad; + msg->local_comm_id = cpu_to_be32(cm_id); + } +} + +static u32 get_local_comm_id(struct ib_mad *mad) +{ + if (mad->mad_hdr.attr_id == CM_SIDR_REQ_ATTR_ID) { + struct cm_sidr_generic_msg *msg = + (struct cm_sidr_generic_msg *)mad; + return be32_to_cpu(msg->request_id); + } else if (mad->mad_hdr.attr_id == CM_SIDR_REP_ATTR_ID) { + pr_err("trying to set local_comm_id in SIDR_REP\n"); + return -1; + } else { + struct cm_generic_msg *msg = (struct cm_generic_msg *)mad; + return be32_to_cpu(msg->local_comm_id); + } +} + +static void set_remote_comm_id(struct ib_mad *mad, u32 cm_id) +{ + if (mad->mad_hdr.attr_id == CM_SIDR_REP_ATTR_ID) { + struct cm_sidr_generic_msg *msg = + (struct cm_sidr_generic_msg *)mad; + msg->request_id = cpu_to_be32(cm_id); + } else if (mad->mad_hdr.attr_id == CM_SIDR_REQ_ATTR_ID) { + pr_err("trying to set remote_comm_id in SIDR_REQ\n"); + return; + } else { + struct cm_generic_msg *msg = (struct cm_generic_msg *)mad; + msg->remote_comm_id = cpu_to_be32(cm_id); + } +} + +static u32 get_remote_comm_id(struct ib_mad *mad) +{ + if (mad->mad_hdr.attr_id == CM_SIDR_REP_ATTR_ID) { + struct cm_sidr_generic_msg *msg = + (struct cm_sidr_generic_msg *)mad; + return be32_to_cpu(msg->request_id); + } else if (mad->mad_hdr.attr_id == CM_SIDR_REQ_ATTR_ID) { + pr_err("trying to set remote_comm_id in SIDR_REQ\n"); + return -1; + } else { + struct cm_generic_msg *msg = (struct cm_generic_msg *)mad; + return be32_to_cpu(msg->remote_comm_id); + } +} + +static union ib_gid gid_from_req_msg(struct ib_device *ibdev, struct ib_mad *mad) +{ + struct cm_req_msg *msg = (struct cm_req_msg *)mad; + + return msg->primary_path_sgid; +} + +/* Lock should be taken before called */ +static struct id_map_entry * +id_map_find_by_sl_id(struct ib_device *ibdev, u32 slave_id, u32 sl_cm_id) +{ + struct rb_root *sl_id_map = &to_mdev(ibdev)->sriov.sl_id_map; + struct rb_node *node = sl_id_map->rb_node; + + while (node) { + struct id_map_entry *id_map_entry = + rb_entry(node, struct id_map_entry, node); + + if (id_map_entry->sl_cm_id > sl_cm_id) + node = node->rb_left; + else if (id_map_entry->sl_cm_id < sl_cm_id) + node = node->rb_right; + else if (id_map_entry->slave_id > slave_id) + node = node->rb_left; + else if (id_map_entry->slave_id < slave_id) + node = node->rb_right; + else + return id_map_entry; + } + return NULL; +} + +static void id_map_ent_timeout(struct work_struct *work) +{ + struct delayed_work *delay = to_delayed_work(work); + struct id_map_entry *ent = 
container_of(delay, struct id_map_entry, timeout); + struct id_map_entry *db_ent, *found_ent; + struct mlx4_ib_dev *dev = ent->dev; + struct mlx4_ib_sriov *sriov = &dev->sriov; + struct rb_root *sl_id_map = &sriov->sl_id_map; + int pv_id = (int) ent->pv_cm_id; + + spin_lock(&sriov->id_map_lock); + db_ent = (struct id_map_entry *)idr_find(&sriov->pv_id_table, pv_id); + if (!db_ent) + goto out; + found_ent = id_map_find_by_sl_id(&dev->ib_dev, ent->slave_id, ent->sl_cm_id); + if (found_ent && found_ent == ent) + rb_erase(&found_ent->node, sl_id_map); + idr_remove(&sriov->pv_id_table, pv_id); + +out: + list_del(&ent->list); + spin_unlock(&sriov->id_map_lock); + kfree(ent); +} + +static void id_map_find_del(struct ib_device *ibdev, int pv_cm_id) +{ + struct mlx4_ib_sriov *sriov = &to_mdev(ibdev)->sriov; + struct rb_root *sl_id_map = &sriov->sl_id_map; + struct id_map_entry *ent, *found_ent; + + spin_lock(&sriov->id_map_lock); + ent = (struct id_map_entry *)idr_find(&sriov->pv_id_table, pv_cm_id); + if (!ent) + goto out; + found_ent = id_map_find_by_sl_id(ibdev, ent->slave_id, ent->sl_cm_id); + if (found_ent && found_ent == ent) + rb_erase(&found_ent->node, sl_id_map); + idr_remove(&sriov->pv_id_table, pv_cm_id); +out: + spin_unlock(&sriov->id_map_lock); +} + +static void sl_id_map_add(struct ib_device *ibdev, struct id_map_entry *new) +{ + struct rb_root *sl_id_map = &to_mdev(ibdev)->sriov.sl_id_map; + struct rb_node **link = &sl_id_map->rb_node, *parent = NULL; + struct id_map_entry *ent; + int slave_id = new->slave_id; + int sl_cm_id = new->sl_cm_id; + + ent = id_map_find_by_sl_id(ibdev, slave_id, sl_cm_id); + if (ent) { + pr_debug("overriding existing sl_id_map entry (cm_id = %x)\n", + sl_cm_id); + + rb_replace_node(&ent->node, &new->node, sl_id_map); + return; + } + + /* Go to the bottom of the tree */ + while (*link) { + parent = *link; + ent = rb_entry(parent, struct id_map_entry, node); + + if (ent->sl_cm_id > sl_cm_id || (ent->sl_cm_id == sl_cm_id && ent->slave_id > slave_id)) + link = &(*link)->rb_left; + else + link = &(*link)->rb_right; + } + + rb_link_node(&new->node, parent, link); + rb_insert_color(&new->node, sl_id_map); +} + +static struct id_map_entry * +id_map_alloc(struct ib_device *ibdev, int slave_id, u32 sl_cm_id) +{ + int ret; + struct id_map_entry *ent; + struct mlx4_ib_sriov *sriov = &to_mdev(ibdev)->sriov; + + ent = kmalloc(sizeof (struct id_map_entry), GFP_KERNEL); + if (!ent) { + mlx4_ib_warn(ibdev, "Couldn't allocate id cache entry - out of memory\n"); + return ERR_PTR(-ENOMEM); + } + + ent->sl_cm_id = sl_cm_id; + ent->slave_id = slave_id; + ent->scheduled_delete = 0; + ent->dev = to_mdev(ibdev); + INIT_DELAYED_WORK(&ent->timeout, id_map_ent_timeout); + + idr_preload(GFP_KERNEL); + spin_lock(&to_mdev(ibdev)->sriov.id_map_lock); + + ret = idr_alloc_cyclic(&sriov->pv_id_table, ent, 0, 0, GFP_NOWAIT); + if (ret >= 0) { + ent->pv_cm_id = (u32)ret; + sl_id_map_add(ibdev, ent); + list_add_tail(&ent->list, &sriov->cm_list); + } + + spin_unlock(&sriov->id_map_lock); + idr_preload_end(); + + if (ret >= 0) + return ent; + + /*error flow*/ + kfree(ent); + mlx4_ib_warn(ibdev, "No more space in the idr (err:0x%x)\n", ret); + return ERR_PTR(-ENOMEM); +} + +static struct id_map_entry * +id_map_get(struct ib_device *ibdev, int *pv_cm_id, int sl_cm_id, int slave_id) +{ + struct id_map_entry *ent; + struct mlx4_ib_sriov *sriov = &to_mdev(ibdev)->sriov; + + spin_lock(&sriov->id_map_lock); + if (*pv_cm_id == -1) { + ent = id_map_find_by_sl_id(ibdev, sl_cm_id, slave_id); + if (ent) + 
*pv_cm_id = (int) ent->pv_cm_id; + } else + ent = (struct id_map_entry *)idr_find(&sriov->pv_id_table, *pv_cm_id); + spin_unlock(&sriov->id_map_lock); + + return ent; +} + +static void schedule_delayed(struct ib_device *ibdev, struct id_map_entry *id) +{ + struct mlx4_ib_sriov *sriov = &to_mdev(ibdev)->sriov; + unsigned long flags; + + spin_lock(&sriov->id_map_lock); + spin_lock_irqsave(&sriov->going_down_lock, flags); + /*make sure that there is no schedule inside the scheduled work.*/ + if (!sriov->is_going_down) { + id->scheduled_delete = 1; + schedule_delayed_work(&id->timeout, CM_CLEANUP_CACHE_TIMEOUT); + } + spin_unlock_irqrestore(&sriov->going_down_lock, flags); + spin_unlock(&sriov->id_map_lock); +} + +int mlx4_ib_multiplex_cm_handler(struct ib_device *ibdev, int port, int slave_id, + struct ib_mad *mad) +{ + struct id_map_entry *id; + u32 sl_cm_id; + int pv_cm_id = -1; + + if (mad->mad_hdr.attr_id == CM_REQ_ATTR_ID || + mad->mad_hdr.attr_id == CM_REP_ATTR_ID || + mad->mad_hdr.attr_id == CM_SIDR_REQ_ATTR_ID) { + sl_cm_id = get_local_comm_id(mad); + id = id_map_alloc(ibdev, slave_id, sl_cm_id); + if (IS_ERR(id)) { + mlx4_ib_warn(ibdev, "%s: id{slave: %d, sl_cm_id: 0x%x} Failed to id_map_alloc\n", + __func__, slave_id, sl_cm_id); + return PTR_ERR(id); + } + } else if (mad->mad_hdr.attr_id == CM_REJ_ATTR_ID || + mad->mad_hdr.attr_id == CM_SIDR_REP_ATTR_ID) { + return 0; + } else { + sl_cm_id = get_local_comm_id(mad); + id = id_map_get(ibdev, &pv_cm_id, slave_id, sl_cm_id); + } + + if (!id) { + pr_debug("id{slave: %d, sl_cm_id: 0x%x} is NULL!\n", + slave_id, sl_cm_id); + return -EINVAL; + } + + set_local_comm_id(mad, id->pv_cm_id); + + if (mad->mad_hdr.attr_id == CM_DREQ_ATTR_ID) + schedule_delayed(ibdev, id); + else if (mad->mad_hdr.attr_id == CM_DREP_ATTR_ID) + id_map_find_del(ibdev, pv_cm_id); + + return 0; +} + +int mlx4_ib_demux_cm_handler(struct ib_device *ibdev, int port, int *slave, + struct ib_mad *mad) +{ + u32 pv_cm_id; + struct id_map_entry *id; + + if (mad->mad_hdr.attr_id == CM_REQ_ATTR_ID || + mad->mad_hdr.attr_id == CM_SIDR_REQ_ATTR_ID) { + union ib_gid gid; + + if (!slave) + return 0; + + gid = gid_from_req_msg(ibdev, mad); + *slave = mlx4_ib_find_real_gid(ibdev, port, gid.global.interface_id); + if (*slave < 0) { + mlx4_ib_warn(ibdev, "failed matching slave_id by gid (0x%llx)\n", + be64_to_cpu(gid.global.interface_id)); + return -ENOENT; + } + return 0; + } + + pv_cm_id = get_remote_comm_id(mad); + id = id_map_get(ibdev, (int *)&pv_cm_id, -1, -1); + + if (!id) { + pr_debug("Couldn't find an entry for pv_cm_id 0x%x\n", pv_cm_id); + return -ENOENT; + } + + if (slave) + *slave = id->slave_id; + set_remote_comm_id(mad, id->sl_cm_id); + + if (mad->mad_hdr.attr_id == CM_DREQ_ATTR_ID) + schedule_delayed(ibdev, id); + else if (mad->mad_hdr.attr_id == CM_REJ_ATTR_ID || + mad->mad_hdr.attr_id == CM_DREP_ATTR_ID) { + id_map_find_del(ibdev, (int) pv_cm_id); + } + + return 0; +} + +void mlx4_ib_cm_paravirt_init(struct mlx4_ib_dev *dev) +{ + spin_lock_init(&dev->sriov.id_map_lock); + INIT_LIST_HEAD(&dev->sriov.cm_list); + dev->sriov.sl_id_map = RB_ROOT; + idr_init(&dev->sriov.pv_id_table); +} + +/* slave = -1 ==> all slaves */ +/* TBD -- call paravirt clean for single slave. 
Need for slave RESET event */ +void mlx4_ib_cm_paravirt_clean(struct mlx4_ib_dev *dev, int slave) +{ + struct mlx4_ib_sriov *sriov = &dev->sriov; + struct rb_root *sl_id_map = &sriov->sl_id_map; + struct list_head lh; + struct rb_node *nd; + int need_flush = 1; + struct id_map_entry *map, *tmp_map; + /* cancel all delayed work queue entries */ + INIT_LIST_HEAD(&lh); + spin_lock(&sriov->id_map_lock); + list_for_each_entry_safe(map, tmp_map, &dev->sriov.cm_list, list) { + if (slave < 0 || slave == map->slave_id) { + if (map->scheduled_delete) + need_flush &= !!cancel_delayed_work(&map->timeout); + } + } + + spin_unlock(&sriov->id_map_lock); + + if (!need_flush) + flush_scheduled_work(); /* make sure all timers were flushed */ + + /* now, remove all leftover entries from databases*/ + spin_lock(&sriov->id_map_lock); + if (slave < 0) { + while (rb_first(sl_id_map)) { + struct id_map_entry *ent = + rb_entry(rb_first(sl_id_map), + struct id_map_entry, node); + + rb_erase(&ent->node, sl_id_map); + idr_remove(&sriov->pv_id_table, (int) ent->pv_cm_id); + } + list_splice_init(&dev->sriov.cm_list, &lh); + } else { + /* first, move nodes belonging to slave to db remove list */ + nd = rb_first(sl_id_map); + while (nd) { + struct id_map_entry *ent = + rb_entry(nd, struct id_map_entry, node); + nd = rb_next(nd); + if (ent->slave_id == slave) + list_move_tail(&ent->list, &lh); + } + /* remove those nodes from databases */ + list_for_each_entry_safe(map, tmp_map, &lh, list) { + rb_erase(&map->node, sl_id_map); + idr_remove(&sriov->pv_id_table, (int) map->pv_cm_id); + } + + /* add remaining nodes from cm_list */ + list_for_each_entry_safe(map, tmp_map, &dev->sriov.cm_list, list) { + if (slave == map->slave_id) + list_move_tail(&map->list, &lh); + } + } + + spin_unlock(&sriov->id_map_lock); + + /* free any map entries left behind due to cancel_delayed_work above */ + list_for_each_entry_safe(map, tmp_map, &lh, list) { + list_del(&map->list); + kfree(map); + } +} diff --git a/kernel/drivers/infiniband/hw/mlx4/cq.c b/kernel/drivers/infiniband/hw/mlx4/cq.c new file mode 100644 index 000000000..0176caa57 --- /dev/null +++ b/kernel/drivers/infiniband/hw/mlx4/cq.c @@ -0,0 +1,983 @@ +/* + * Copyright (c) 2007 Cisco Systems, Inc. All rights reserved. + * Copyright (c) 2007, 2008 Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include +#include +#include +#include + +#include "mlx4_ib.h" +#include "user.h" + +static void mlx4_ib_cq_comp(struct mlx4_cq *cq) +{ + struct ib_cq *ibcq = &to_mibcq(cq)->ibcq; + ibcq->comp_handler(ibcq, ibcq->cq_context); +} + +static void mlx4_ib_cq_event(struct mlx4_cq *cq, enum mlx4_event type) +{ + struct ib_event event; + struct ib_cq *ibcq; + + if (type != MLX4_EVENT_TYPE_CQ_ERROR) { + pr_warn("Unexpected event type %d " + "on CQ %06x\n", type, cq->cqn); + return; + } + + ibcq = &to_mibcq(cq)->ibcq; + if (ibcq->event_handler) { + event.device = ibcq->device; + event.event = IB_EVENT_CQ_ERR; + event.element.cq = ibcq; + ibcq->event_handler(&event, ibcq->cq_context); + } +} + +static void *get_cqe_from_buf(struct mlx4_ib_cq_buf *buf, int n) +{ + return mlx4_buf_offset(&buf->buf, n * buf->entry_size); +} + +static void *get_cqe(struct mlx4_ib_cq *cq, int n) +{ + return get_cqe_from_buf(&cq->buf, n); +} + +static void *get_sw_cqe(struct mlx4_ib_cq *cq, int n) +{ + struct mlx4_cqe *cqe = get_cqe(cq, n & cq->ibcq.cqe); + struct mlx4_cqe *tcqe = ((cq->buf.entry_size == 64) ? (cqe + 1) : cqe); + + return (!!(tcqe->owner_sr_opcode & MLX4_CQE_OWNER_MASK) ^ + !!(n & (cq->ibcq.cqe + 1))) ? NULL : cqe; +} + +static struct mlx4_cqe *next_cqe_sw(struct mlx4_ib_cq *cq) +{ + return get_sw_cqe(cq, cq->mcq.cons_index); +} + +int mlx4_ib_modify_cq(struct ib_cq *cq, u16 cq_count, u16 cq_period) +{ + struct mlx4_ib_cq *mcq = to_mcq(cq); + struct mlx4_ib_dev *dev = to_mdev(cq->device); + + return mlx4_cq_modify(dev->dev, &mcq->mcq, cq_count, cq_period); +} + +static int mlx4_ib_alloc_cq_buf(struct mlx4_ib_dev *dev, struct mlx4_ib_cq_buf *buf, int nent) +{ + int err; + + err = mlx4_buf_alloc(dev->dev, nent * dev->dev->caps.cqe_size, + PAGE_SIZE * 2, &buf->buf, GFP_KERNEL); + + if (err) + goto out; + + buf->entry_size = dev->dev->caps.cqe_size; + err = mlx4_mtt_init(dev->dev, buf->buf.npages, buf->buf.page_shift, + &buf->mtt); + if (err) + goto err_buf; + + err = mlx4_buf_write_mtt(dev->dev, &buf->mtt, &buf->buf, GFP_KERNEL); + if (err) + goto err_mtt; + + return 0; + +err_mtt: + mlx4_mtt_cleanup(dev->dev, &buf->mtt); + +err_buf: + mlx4_buf_free(dev->dev, nent * buf->entry_size, &buf->buf); + +out: + return err; +} + +static void mlx4_ib_free_cq_buf(struct mlx4_ib_dev *dev, struct mlx4_ib_cq_buf *buf, int cqe) +{ + mlx4_buf_free(dev->dev, (cqe + 1) * buf->entry_size, &buf->buf); +} + +static int mlx4_ib_get_cq_umem(struct mlx4_ib_dev *dev, struct ib_ucontext *context, + struct mlx4_ib_cq_buf *buf, struct ib_umem **umem, + u64 buf_addr, int cqe) +{ + int err; + int cqe_size = dev->dev->caps.cqe_size; + + *umem = ib_umem_get(context, buf_addr, cqe * cqe_size, + IB_ACCESS_LOCAL_WRITE, 1); + if (IS_ERR(*umem)) + return PTR_ERR(*umem); + + err = mlx4_mtt_init(dev->dev, ib_umem_page_count(*umem), + ilog2((*umem)->page_size), &buf->mtt); + if (err) + goto err_buf; + + err = mlx4_ib_umem_write_mtt(dev, &buf->mtt, *umem); + if (err) + goto err_mtt; + + return 0; + +err_mtt: + mlx4_mtt_cleanup(dev->dev, &buf->mtt); + +err_buf: + ib_umem_release(*umem); + + return err; +} + +struct ib_cq *mlx4_ib_create_cq(struct ib_device *ibdev, int entries, int vector, + struct ib_ucontext *context, + struct ib_udata *udata) +{ + struct mlx4_ib_dev 
*dev = to_mdev(ibdev); + struct mlx4_ib_cq *cq; + struct mlx4_uar *uar; + int err; + + if (entries < 1 || entries > dev->dev->caps.max_cqes) + return ERR_PTR(-EINVAL); + + cq = kmalloc(sizeof *cq, GFP_KERNEL); + if (!cq) + return ERR_PTR(-ENOMEM); + + entries = roundup_pow_of_two(entries + 1); + cq->ibcq.cqe = entries - 1; + mutex_init(&cq->resize_mutex); + spin_lock_init(&cq->lock); + cq->resize_buf = NULL; + cq->resize_umem = NULL; + INIT_LIST_HEAD(&cq->send_qp_list); + INIT_LIST_HEAD(&cq->recv_qp_list); + + if (context) { + struct mlx4_ib_create_cq ucmd; + + if (ib_copy_from_udata(&ucmd, udata, sizeof ucmd)) { + err = -EFAULT; + goto err_cq; + } + + err = mlx4_ib_get_cq_umem(dev, context, &cq->buf, &cq->umem, + ucmd.buf_addr, entries); + if (err) + goto err_cq; + + err = mlx4_ib_db_map_user(to_mucontext(context), ucmd.db_addr, + &cq->db); + if (err) + goto err_mtt; + + uar = &to_mucontext(context)->uar; + } else { + err = mlx4_db_alloc(dev->dev, &cq->db, 1, GFP_KERNEL); + if (err) + goto err_cq; + + cq->mcq.set_ci_db = cq->db.db; + cq->mcq.arm_db = cq->db.db + 1; + *cq->mcq.set_ci_db = 0; + *cq->mcq.arm_db = 0; + + err = mlx4_ib_alloc_cq_buf(dev, &cq->buf, entries); + if (err) + goto err_db; + + uar = &dev->priv_uar; + } + + if (dev->eq_table) + vector = dev->eq_table[vector % ibdev->num_comp_vectors]; + + err = mlx4_cq_alloc(dev->dev, entries, &cq->buf.mtt, uar, + cq->db.dma, &cq->mcq, vector, 0, 0); + if (err) + goto err_dbmap; + + if (context) + cq->mcq.tasklet_ctx.comp = mlx4_ib_cq_comp; + else + cq->mcq.comp = mlx4_ib_cq_comp; + cq->mcq.event = mlx4_ib_cq_event; + + if (context) + if (ib_copy_to_udata(udata, &cq->mcq.cqn, sizeof (__u32))) { + err = -EFAULT; + goto err_dbmap; + } + + return &cq->ibcq; + +err_dbmap: + if (context) + mlx4_ib_db_unmap_user(to_mucontext(context), &cq->db); + +err_mtt: + mlx4_mtt_cleanup(dev->dev, &cq->buf.mtt); + + if (context) + ib_umem_release(cq->umem); + else + mlx4_ib_free_cq_buf(dev, &cq->buf, cq->ibcq.cqe); + +err_db: + if (!context) + mlx4_db_free(dev->dev, &cq->db); + +err_cq: + kfree(cq); + + return ERR_PTR(err); +} + +static int mlx4_alloc_resize_buf(struct mlx4_ib_dev *dev, struct mlx4_ib_cq *cq, + int entries) +{ + int err; + + if (cq->resize_buf) + return -EBUSY; + + cq->resize_buf = kmalloc(sizeof *cq->resize_buf, GFP_ATOMIC); + if (!cq->resize_buf) + return -ENOMEM; + + err = mlx4_ib_alloc_cq_buf(dev, &cq->resize_buf->buf, entries); + if (err) { + kfree(cq->resize_buf); + cq->resize_buf = NULL; + return err; + } + + cq->resize_buf->cqe = entries - 1; + + return 0; +} + +static int mlx4_alloc_resize_umem(struct mlx4_ib_dev *dev, struct mlx4_ib_cq *cq, + int entries, struct ib_udata *udata) +{ + struct mlx4_ib_resize_cq ucmd; + int err; + + if (cq->resize_umem) + return -EBUSY; + + if (ib_copy_from_udata(&ucmd, udata, sizeof ucmd)) + return -EFAULT; + + cq->resize_buf = kmalloc(sizeof *cq->resize_buf, GFP_ATOMIC); + if (!cq->resize_buf) + return -ENOMEM; + + err = mlx4_ib_get_cq_umem(dev, cq->umem->context, &cq->resize_buf->buf, + &cq->resize_umem, ucmd.buf_addr, entries); + if (err) { + kfree(cq->resize_buf); + cq->resize_buf = NULL; + return err; + } + + cq->resize_buf->cqe = entries - 1; + + return 0; +} + +static int mlx4_ib_get_outstanding_cqes(struct mlx4_ib_cq *cq) +{ + u32 i; + + i = cq->mcq.cons_index; + while (get_sw_cqe(cq, i)) + ++i; + + return i - cq->mcq.cons_index; +} + +static void mlx4_ib_cq_resize_copy_cqes(struct mlx4_ib_cq *cq) +{ + struct mlx4_cqe *cqe, *new_cqe; + int i; + int cqe_size = cq->buf.entry_size; + int 
cqe_inc = cqe_size == 64 ? 1 : 0; + + i = cq->mcq.cons_index; + cqe = get_cqe(cq, i & cq->ibcq.cqe); + cqe += cqe_inc; + + while ((cqe->owner_sr_opcode & MLX4_CQE_OPCODE_MASK) != MLX4_CQE_OPCODE_RESIZE) { + new_cqe = get_cqe_from_buf(&cq->resize_buf->buf, + (i + 1) & cq->resize_buf->cqe); + memcpy(new_cqe, get_cqe(cq, i & cq->ibcq.cqe), cqe_size); + new_cqe += cqe_inc; + + new_cqe->owner_sr_opcode = (cqe->owner_sr_opcode & ~MLX4_CQE_OWNER_MASK) | + (((i + 1) & (cq->resize_buf->cqe + 1)) ? MLX4_CQE_OWNER_MASK : 0); + cqe = get_cqe(cq, ++i & cq->ibcq.cqe); + cqe += cqe_inc; + } + ++cq->mcq.cons_index; +} + +int mlx4_ib_resize_cq(struct ib_cq *ibcq, int entries, struct ib_udata *udata) +{ + struct mlx4_ib_dev *dev = to_mdev(ibcq->device); + struct mlx4_ib_cq *cq = to_mcq(ibcq); + struct mlx4_mtt mtt; + int outst_cqe; + int err; + + mutex_lock(&cq->resize_mutex); + if (entries < 1 || entries > dev->dev->caps.max_cqes) { + err = -EINVAL; + goto out; + } + + entries = roundup_pow_of_two(entries + 1); + if (entries == ibcq->cqe + 1) { + err = 0; + goto out; + } + + if (entries > dev->dev->caps.max_cqes + 1) { + err = -EINVAL; + goto out; + } + + if (ibcq->uobject) { + err = mlx4_alloc_resize_umem(dev, cq, entries, udata); + if (err) + goto out; + } else { + /* Can't be smaller than the number of outstanding CQEs */ + outst_cqe = mlx4_ib_get_outstanding_cqes(cq); + if (entries < outst_cqe + 1) { + err = -EINVAL; + goto out; + } + + err = mlx4_alloc_resize_buf(dev, cq, entries); + if (err) + goto out; + } + + mtt = cq->buf.mtt; + + err = mlx4_cq_resize(dev->dev, &cq->mcq, entries, &cq->resize_buf->buf.mtt); + if (err) + goto err_buf; + + mlx4_mtt_cleanup(dev->dev, &mtt); + if (ibcq->uobject) { + cq->buf = cq->resize_buf->buf; + cq->ibcq.cqe = cq->resize_buf->cqe; + ib_umem_release(cq->umem); + cq->umem = cq->resize_umem; + + kfree(cq->resize_buf); + cq->resize_buf = NULL; + cq->resize_umem = NULL; + } else { + struct mlx4_ib_cq_buf tmp_buf; + int tmp_cqe = 0; + + spin_lock_irq(&cq->lock); + if (cq->resize_buf) { + mlx4_ib_cq_resize_copy_cqes(cq); + tmp_buf = cq->buf; + tmp_cqe = cq->ibcq.cqe; + cq->buf = cq->resize_buf->buf; + cq->ibcq.cqe = cq->resize_buf->cqe; + + kfree(cq->resize_buf); + cq->resize_buf = NULL; + } + spin_unlock_irq(&cq->lock); + + if (tmp_cqe) + mlx4_ib_free_cq_buf(dev, &tmp_buf, tmp_cqe); + } + + goto out; + +err_buf: + mlx4_mtt_cleanup(dev->dev, &cq->resize_buf->buf.mtt); + if (!ibcq->uobject) + mlx4_ib_free_cq_buf(dev, &cq->resize_buf->buf, + cq->resize_buf->cqe); + + kfree(cq->resize_buf); + cq->resize_buf = NULL; + + if (cq->resize_umem) { + ib_umem_release(cq->resize_umem); + cq->resize_umem = NULL; + } + +out: + mutex_unlock(&cq->resize_mutex); + + return err; +} + +int mlx4_ib_destroy_cq(struct ib_cq *cq) +{ + struct mlx4_ib_dev *dev = to_mdev(cq->device); + struct mlx4_ib_cq *mcq = to_mcq(cq); + + mlx4_cq_free(dev->dev, &mcq->mcq); + mlx4_mtt_cleanup(dev->dev, &mcq->buf.mtt); + + if (cq->uobject) { + mlx4_ib_db_unmap_user(to_mucontext(cq->uobject->context), &mcq->db); + ib_umem_release(mcq->umem); + } else { + mlx4_ib_free_cq_buf(dev, &mcq->buf, cq->cqe); + mlx4_db_free(dev->dev, &mcq->db); + } + + kfree(mcq); + + return 0; +} + +static void dump_cqe(void *cqe) +{ + __be32 *buf = cqe; + + pr_debug("CQE contents %08x %08x %08x %08x %08x %08x %08x %08x\n", + be32_to_cpu(buf[0]), be32_to_cpu(buf[1]), be32_to_cpu(buf[2]), + be32_to_cpu(buf[3]), be32_to_cpu(buf[4]), be32_to_cpu(buf[5]), + be32_to_cpu(buf[6]), be32_to_cpu(buf[7])); +} + +static void 
mlx4_ib_handle_error_cqe(struct mlx4_err_cqe *cqe, + struct ib_wc *wc) +{ + if (cqe->syndrome == MLX4_CQE_SYNDROME_LOCAL_QP_OP_ERR) { + pr_debug("local QP operation err " + "(QPN %06x, WQE index %x, vendor syndrome %02x, " + "opcode = %02x)\n", + be32_to_cpu(cqe->my_qpn), be16_to_cpu(cqe->wqe_index), + cqe->vendor_err_syndrome, + cqe->owner_sr_opcode & ~MLX4_CQE_OWNER_MASK); + dump_cqe(cqe); + } + + switch (cqe->syndrome) { + case MLX4_CQE_SYNDROME_LOCAL_LENGTH_ERR: + wc->status = IB_WC_LOC_LEN_ERR; + break; + case MLX4_CQE_SYNDROME_LOCAL_QP_OP_ERR: + wc->status = IB_WC_LOC_QP_OP_ERR; + break; + case MLX4_CQE_SYNDROME_LOCAL_PROT_ERR: + wc->status = IB_WC_LOC_PROT_ERR; + break; + case MLX4_CQE_SYNDROME_WR_FLUSH_ERR: + wc->status = IB_WC_WR_FLUSH_ERR; + break; + case MLX4_CQE_SYNDROME_MW_BIND_ERR: + wc->status = IB_WC_MW_BIND_ERR; + break; + case MLX4_CQE_SYNDROME_BAD_RESP_ERR: + wc->status = IB_WC_BAD_RESP_ERR; + break; + case MLX4_CQE_SYNDROME_LOCAL_ACCESS_ERR: + wc->status = IB_WC_LOC_ACCESS_ERR; + break; + case MLX4_CQE_SYNDROME_REMOTE_INVAL_REQ_ERR: + wc->status = IB_WC_REM_INV_REQ_ERR; + break; + case MLX4_CQE_SYNDROME_REMOTE_ACCESS_ERR: + wc->status = IB_WC_REM_ACCESS_ERR; + break; + case MLX4_CQE_SYNDROME_REMOTE_OP_ERR: + wc->status = IB_WC_REM_OP_ERR; + break; + case MLX4_CQE_SYNDROME_TRANSPORT_RETRY_EXC_ERR: + wc->status = IB_WC_RETRY_EXC_ERR; + break; + case MLX4_CQE_SYNDROME_RNR_RETRY_EXC_ERR: + wc->status = IB_WC_RNR_RETRY_EXC_ERR; + break; + case MLX4_CQE_SYNDROME_REMOTE_ABORTED_ERR: + wc->status = IB_WC_REM_ABORT_ERR; + break; + default: + wc->status = IB_WC_GENERAL_ERR; + break; + } + + wc->vendor_err = cqe->vendor_err_syndrome; +} + +static int mlx4_ib_ipoib_csum_ok(__be16 status, __be16 checksum) +{ + return ((status & cpu_to_be16(MLX4_CQE_STATUS_IPV4 | + MLX4_CQE_STATUS_IPV4F | + MLX4_CQE_STATUS_IPV4OPT | + MLX4_CQE_STATUS_IPV6 | + MLX4_CQE_STATUS_IPOK)) == + cpu_to_be16(MLX4_CQE_STATUS_IPV4 | + MLX4_CQE_STATUS_IPOK)) && + (status & cpu_to_be16(MLX4_CQE_STATUS_UDP | + MLX4_CQE_STATUS_TCP)) && + checksum == cpu_to_be16(0xffff); +} + +static int use_tunnel_data(struct mlx4_ib_qp *qp, struct mlx4_ib_cq *cq, struct ib_wc *wc, + unsigned tail, struct mlx4_cqe *cqe, int is_eth) +{ + struct mlx4_ib_proxy_sqp_hdr *hdr; + + ib_dma_sync_single_for_cpu(qp->ibqp.device, + qp->sqp_proxy_rcv[tail].map, + sizeof (struct mlx4_ib_proxy_sqp_hdr), + DMA_FROM_DEVICE); + hdr = (struct mlx4_ib_proxy_sqp_hdr *) (qp->sqp_proxy_rcv[tail].addr); + wc->pkey_index = be16_to_cpu(hdr->tun.pkey_index); + wc->src_qp = be32_to_cpu(hdr->tun.flags_src_qp) & 0xFFFFFF; + wc->wc_flags |= (hdr->tun.g_ml_path & 0x80) ? (IB_WC_GRH) : 0; + wc->dlid_path_bits = 0; + + if (is_eth) { + wc->vlan_id = be16_to_cpu(hdr->tun.sl_vid); + memcpy(&(wc->smac[0]), (char *)&hdr->tun.mac_31_0, 4); + memcpy(&(wc->smac[4]), (char *)&hdr->tun.slid_mac_47_32, 2); + wc->wc_flags |= (IB_WC_WITH_VLAN | IB_WC_WITH_SMAC); + } else { + wc->slid = be16_to_cpu(hdr->tun.slid_mac_47_32); + wc->sl = (u8) (be16_to_cpu(hdr->tun.sl_vid) >> 12); + } + + return 0; +} + +static void mlx4_ib_qp_sw_comp(struct mlx4_ib_qp *qp, int num_entries, + struct ib_wc *wc, int *npolled, int is_send) +{ + struct mlx4_ib_wq *wq; + unsigned cur; + int i; + + wq = is_send ? 
&qp->sq : &qp->rq; + cur = wq->head - wq->tail; + + if (cur == 0) + return; + + for (i = 0; i < cur && *npolled < num_entries; i++) { + wc->wr_id = wq->wrid[wq->tail & (wq->wqe_cnt - 1)]; + wc->status = IB_WC_WR_FLUSH_ERR; + wc->vendor_err = MLX4_CQE_SYNDROME_WR_FLUSH_ERR; + wq->tail++; + (*npolled)++; + wc->qp = &qp->ibqp; + wc++; + } +} + +static void mlx4_ib_poll_sw_comp(struct mlx4_ib_cq *cq, int num_entries, + struct ib_wc *wc, int *npolled) +{ + struct mlx4_ib_qp *qp; + + *npolled = 0; + /* Find uncompleted WQEs belonging to that cq and retrun + * simulated FLUSH_ERR completions + */ + list_for_each_entry(qp, &cq->send_qp_list, cq_send_list) { + mlx4_ib_qp_sw_comp(qp, num_entries, wc, npolled, 1); + if (*npolled >= num_entries) + goto out; + } + + list_for_each_entry(qp, &cq->recv_qp_list, cq_recv_list) { + mlx4_ib_qp_sw_comp(qp, num_entries, wc + *npolled, npolled, 0); + if (*npolled >= num_entries) + goto out; + } + +out: + return; +} + +static int mlx4_ib_poll_one(struct mlx4_ib_cq *cq, + struct mlx4_ib_qp **cur_qp, + struct ib_wc *wc) +{ + struct mlx4_cqe *cqe; + struct mlx4_qp *mqp; + struct mlx4_ib_wq *wq; + struct mlx4_ib_srq *srq; + struct mlx4_srq *msrq = NULL; + int is_send; + int is_error; + int is_eth; + u32 g_mlpath_rqpn; + u16 wqe_ctr; + unsigned tail = 0; + +repoll: + cqe = next_cqe_sw(cq); + if (!cqe) + return -EAGAIN; + + if (cq->buf.entry_size == 64) + cqe++; + + ++cq->mcq.cons_index; + + /* + * Make sure we read CQ entry contents after we've checked the + * ownership bit. + */ + rmb(); + + is_send = cqe->owner_sr_opcode & MLX4_CQE_IS_SEND_MASK; + is_error = (cqe->owner_sr_opcode & MLX4_CQE_OPCODE_MASK) == + MLX4_CQE_OPCODE_ERROR; + + if (unlikely((cqe->owner_sr_opcode & MLX4_CQE_OPCODE_MASK) == MLX4_OPCODE_NOP && + is_send)) { + pr_warn("Completion for NOP opcode detected!\n"); + return -EINVAL; + } + + /* Resize CQ in progress */ + if (unlikely((cqe->owner_sr_opcode & MLX4_CQE_OPCODE_MASK) == MLX4_CQE_OPCODE_RESIZE)) { + if (cq->resize_buf) { + struct mlx4_ib_dev *dev = to_mdev(cq->ibcq.device); + + mlx4_ib_free_cq_buf(dev, &cq->buf, cq->ibcq.cqe); + cq->buf = cq->resize_buf->buf; + cq->ibcq.cqe = cq->resize_buf->cqe; + + kfree(cq->resize_buf); + cq->resize_buf = NULL; + } + + goto repoll; + } + + if (!*cur_qp || + (be32_to_cpu(cqe->vlan_my_qpn) & MLX4_CQE_QPN_MASK) != (*cur_qp)->mqp.qpn) { + /* + * We do not have to take the QP table lock here, + * because CQs will be locked while QPs are removed + * from the table. 
+ */ + mqp = __mlx4_qp_lookup(to_mdev(cq->ibcq.device)->dev, + be32_to_cpu(cqe->vlan_my_qpn)); + if (unlikely(!mqp)) { + pr_warn("CQ %06x with entry for unknown QPN %06x\n", + cq->mcq.cqn, be32_to_cpu(cqe->vlan_my_qpn) & MLX4_CQE_QPN_MASK); + return -EINVAL; + } + + *cur_qp = to_mibqp(mqp); + } + + wc->qp = &(*cur_qp)->ibqp; + + if (wc->qp->qp_type == IB_QPT_XRC_TGT) { + u32 srq_num; + g_mlpath_rqpn = be32_to_cpu(cqe->g_mlpath_rqpn); + srq_num = g_mlpath_rqpn & 0xffffff; + /* SRQ is also in the radix tree */ + msrq = mlx4_srq_lookup(to_mdev(cq->ibcq.device)->dev, + srq_num); + if (unlikely(!msrq)) { + pr_warn("CQ %06x with entry for unknown SRQN %06x\n", + cq->mcq.cqn, srq_num); + return -EINVAL; + } + } + + if (is_send) { + wq = &(*cur_qp)->sq; + if (!(*cur_qp)->sq_signal_bits) { + wqe_ctr = be16_to_cpu(cqe->wqe_index); + wq->tail += (u16) (wqe_ctr - (u16) wq->tail); + } + wc->wr_id = wq->wrid[wq->tail & (wq->wqe_cnt - 1)]; + ++wq->tail; + } else if ((*cur_qp)->ibqp.srq) { + srq = to_msrq((*cur_qp)->ibqp.srq); + wqe_ctr = be16_to_cpu(cqe->wqe_index); + wc->wr_id = srq->wrid[wqe_ctr]; + mlx4_ib_free_srq_wqe(srq, wqe_ctr); + } else if (msrq) { + srq = to_mibsrq(msrq); + wqe_ctr = be16_to_cpu(cqe->wqe_index); + wc->wr_id = srq->wrid[wqe_ctr]; + mlx4_ib_free_srq_wqe(srq, wqe_ctr); + } else { + wq = &(*cur_qp)->rq; + tail = wq->tail & (wq->wqe_cnt - 1); + wc->wr_id = wq->wrid[tail]; + ++wq->tail; + } + + if (unlikely(is_error)) { + mlx4_ib_handle_error_cqe((struct mlx4_err_cqe *) cqe, wc); + return 0; + } + + wc->status = IB_WC_SUCCESS; + + if (is_send) { + wc->wc_flags = 0; + switch (cqe->owner_sr_opcode & MLX4_CQE_OPCODE_MASK) { + case MLX4_OPCODE_RDMA_WRITE_IMM: + wc->wc_flags |= IB_WC_WITH_IMM; + case MLX4_OPCODE_RDMA_WRITE: + wc->opcode = IB_WC_RDMA_WRITE; + break; + case MLX4_OPCODE_SEND_IMM: + wc->wc_flags |= IB_WC_WITH_IMM; + case MLX4_OPCODE_SEND: + case MLX4_OPCODE_SEND_INVAL: + wc->opcode = IB_WC_SEND; + break; + case MLX4_OPCODE_RDMA_READ: + wc->opcode = IB_WC_RDMA_READ; + wc->byte_len = be32_to_cpu(cqe->byte_cnt); + break; + case MLX4_OPCODE_ATOMIC_CS: + wc->opcode = IB_WC_COMP_SWAP; + wc->byte_len = 8; + break; + case MLX4_OPCODE_ATOMIC_FA: + wc->opcode = IB_WC_FETCH_ADD; + wc->byte_len = 8; + break; + case MLX4_OPCODE_MASKED_ATOMIC_CS: + wc->opcode = IB_WC_MASKED_COMP_SWAP; + wc->byte_len = 8; + break; + case MLX4_OPCODE_MASKED_ATOMIC_FA: + wc->opcode = IB_WC_MASKED_FETCH_ADD; + wc->byte_len = 8; + break; + case MLX4_OPCODE_BIND_MW: + wc->opcode = IB_WC_BIND_MW; + break; + case MLX4_OPCODE_LSO: + wc->opcode = IB_WC_LSO; + break; + case MLX4_OPCODE_FMR: + wc->opcode = IB_WC_FAST_REG_MR; + break; + case MLX4_OPCODE_LOCAL_INVAL: + wc->opcode = IB_WC_LOCAL_INV; + break; + } + } else { + wc->byte_len = be32_to_cpu(cqe->byte_cnt); + + switch (cqe->owner_sr_opcode & MLX4_CQE_OPCODE_MASK) { + case MLX4_RECV_OPCODE_RDMA_WRITE_IMM: + wc->opcode = IB_WC_RECV_RDMA_WITH_IMM; + wc->wc_flags = IB_WC_WITH_IMM; + wc->ex.imm_data = cqe->immed_rss_invalid; + break; + case MLX4_RECV_OPCODE_SEND_INVAL: + wc->opcode = IB_WC_RECV; + wc->wc_flags = IB_WC_WITH_INVALIDATE; + wc->ex.invalidate_rkey = be32_to_cpu(cqe->immed_rss_invalid); + break; + case MLX4_RECV_OPCODE_SEND: + wc->opcode = IB_WC_RECV; + wc->wc_flags = 0; + break; + case MLX4_RECV_OPCODE_SEND_IMM: + wc->opcode = IB_WC_RECV; + wc->wc_flags = IB_WC_WITH_IMM; + wc->ex.imm_data = cqe->immed_rss_invalid; + break; + } + + is_eth = (rdma_port_get_link_layer(wc->qp->device, + (*cur_qp)->port) == + IB_LINK_LAYER_ETHERNET); + if 
(mlx4_is_mfunc(to_mdev(cq->ibcq.device)->dev)) { + if ((*cur_qp)->mlx4_ib_qp_type & + (MLX4_IB_QPT_PROXY_SMI_OWNER | + MLX4_IB_QPT_PROXY_SMI | MLX4_IB_QPT_PROXY_GSI)) + return use_tunnel_data(*cur_qp, cq, wc, tail, + cqe, is_eth); + } + + wc->slid = be16_to_cpu(cqe->rlid); + g_mlpath_rqpn = be32_to_cpu(cqe->g_mlpath_rqpn); + wc->src_qp = g_mlpath_rqpn & 0xffffff; + wc->dlid_path_bits = (g_mlpath_rqpn >> 24) & 0x7f; + wc->wc_flags |= g_mlpath_rqpn & 0x80000000 ? IB_WC_GRH : 0; + wc->pkey_index = be32_to_cpu(cqe->immed_rss_invalid) & 0x7f; + wc->wc_flags |= mlx4_ib_ipoib_csum_ok(cqe->status, + cqe->checksum) ? IB_WC_IP_CSUM_OK : 0; + if (is_eth) { + wc->sl = be16_to_cpu(cqe->sl_vid) >> 13; + if (be32_to_cpu(cqe->vlan_my_qpn) & + MLX4_CQE_VLAN_PRESENT_MASK) { + wc->vlan_id = be16_to_cpu(cqe->sl_vid) & + MLX4_CQE_VID_MASK; + } else { + wc->vlan_id = 0xffff; + } + memcpy(wc->smac, cqe->smac, ETH_ALEN); + wc->wc_flags |= (IB_WC_WITH_VLAN | IB_WC_WITH_SMAC); + } else { + wc->sl = be16_to_cpu(cqe->sl_vid) >> 12; + wc->vlan_id = 0xffff; + } + } + + return 0; +} + +int mlx4_ib_poll_cq(struct ib_cq *ibcq, int num_entries, struct ib_wc *wc) +{ + struct mlx4_ib_cq *cq = to_mcq(ibcq); + struct mlx4_ib_qp *cur_qp = NULL; + unsigned long flags; + int npolled; + int err = 0; + struct mlx4_ib_dev *mdev = to_mdev(cq->ibcq.device); + + spin_lock_irqsave(&cq->lock, flags); + if (mdev->dev->persist->state & MLX4_DEVICE_STATE_INTERNAL_ERROR) { + mlx4_ib_poll_sw_comp(cq, num_entries, wc, &npolled); + goto out; + } + + for (npolled = 0; npolled < num_entries; ++npolled) { + err = mlx4_ib_poll_one(cq, &cur_qp, wc + npolled); + if (err) + break; + } + + mlx4_cq_set_ci(&cq->mcq); + +out: + spin_unlock_irqrestore(&cq->lock, flags); + + if (err == 0 || err == -EAGAIN) + return npolled; + else + return err; +} + +int mlx4_ib_arm_cq(struct ib_cq *ibcq, enum ib_cq_notify_flags flags) +{ + mlx4_cq_arm(&to_mcq(ibcq)->mcq, + (flags & IB_CQ_SOLICITED_MASK) == IB_CQ_SOLICITED ? + MLX4_CQ_DB_REQ_NOT_SOL : MLX4_CQ_DB_REQ_NOT, + to_mdev(ibcq->device)->uar_map, + MLX4_GET_DOORBELL_LOCK(&to_mdev(ibcq->device)->uar_lock)); + + return 0; +} + +void __mlx4_ib_cq_clean(struct mlx4_ib_cq *cq, u32 qpn, struct mlx4_ib_srq *srq) +{ + u32 prod_index; + int nfreed = 0; + struct mlx4_cqe *cqe, *dest; + u8 owner_bit; + int cqe_inc = cq->buf.entry_size == 64 ? 1 : 0; + + /* + * First we need to find the current producer index, so we + * know where to start cleaning from. It doesn't matter if HW + * adds new entries after this loop -- the QP we're worried + * about is already in RESET, so the new entries won't come + * from our QP and therefore don't need to be checked. + */ + for (prod_index = cq->mcq.cons_index; get_sw_cqe(cq, prod_index); ++prod_index) + if (prod_index == cq->mcq.cons_index + cq->ibcq.cqe) + break; + + /* + * Now sweep backwards through the CQ, removing CQ entries + * that match our QP by copying older entries on top of them. 
+ */ + while ((int) --prod_index - (int) cq->mcq.cons_index >= 0) { + cqe = get_cqe(cq, prod_index & cq->ibcq.cqe); + cqe += cqe_inc; + + if ((be32_to_cpu(cqe->vlan_my_qpn) & MLX4_CQE_QPN_MASK) == qpn) { + if (srq && !(cqe->owner_sr_opcode & MLX4_CQE_IS_SEND_MASK)) + mlx4_ib_free_srq_wqe(srq, be16_to_cpu(cqe->wqe_index)); + ++nfreed; + } else if (nfreed) { + dest = get_cqe(cq, (prod_index + nfreed) & cq->ibcq.cqe); + dest += cqe_inc; + + owner_bit = dest->owner_sr_opcode & MLX4_CQE_OWNER_MASK; + memcpy(dest, cqe, sizeof *cqe); + dest->owner_sr_opcode = owner_bit | + (dest->owner_sr_opcode & ~MLX4_CQE_OWNER_MASK); + } + } + + if (nfreed) { + cq->mcq.cons_index += nfreed; + /* + * Make sure update of buffer contents is done before + * updating consumer index. + */ + wmb(); + mlx4_cq_set_ci(&cq->mcq); + } +} + +void mlx4_ib_cq_clean(struct mlx4_ib_cq *cq, u32 qpn, struct mlx4_ib_srq *srq) +{ + spin_lock_irq(&cq->lock); + __mlx4_ib_cq_clean(cq, qpn, srq); + spin_unlock_irq(&cq->lock); +} diff --git a/kernel/drivers/infiniband/hw/mlx4/doorbell.c b/kernel/drivers/infiniband/hw/mlx4/doorbell.c new file mode 100644 index 000000000..c51740986 --- /dev/null +++ b/kernel/drivers/infiniband/hw/mlx4/doorbell.c @@ -0,0 +1,96 @@ +/* + * Copyright (c) 2007 Cisco Systems, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include + +#include "mlx4_ib.h" + +struct mlx4_ib_user_db_page { + struct list_head list; + struct ib_umem *umem; + unsigned long user_virt; + int refcnt; +}; + +int mlx4_ib_db_map_user(struct mlx4_ib_ucontext *context, unsigned long virt, + struct mlx4_db *db) +{ + struct mlx4_ib_user_db_page *page; + int err = 0; + + mutex_lock(&context->db_page_mutex); + + list_for_each_entry(page, &context->db_page_list, list) + if (page->user_virt == (virt & PAGE_MASK)) + goto found; + + page = kmalloc(sizeof *page, GFP_KERNEL); + if (!page) { + err = -ENOMEM; + goto out; + } + + page->user_virt = (virt & PAGE_MASK); + page->refcnt = 0; + page->umem = ib_umem_get(&context->ibucontext, virt & PAGE_MASK, + PAGE_SIZE, 0, 0); + if (IS_ERR(page->umem)) { + err = PTR_ERR(page->umem); + kfree(page); + goto out; + } + + list_add(&page->list, &context->db_page_list); + +found: + db->dma = sg_dma_address(page->umem->sg_head.sgl) + (virt & ~PAGE_MASK); + db->u.user_page = page; + ++page->refcnt; + +out: + mutex_unlock(&context->db_page_mutex); + + return err; +} + +void mlx4_ib_db_unmap_user(struct mlx4_ib_ucontext *context, struct mlx4_db *db) +{ + mutex_lock(&context->db_page_mutex); + + if (!--db->u.user_page->refcnt) { + list_del(&db->u.user_page->list); + ib_umem_release(db->u.user_page->umem); + kfree(db->u.user_page); + } + + mutex_unlock(&context->db_page_mutex); +} diff --git a/kernel/drivers/infiniband/hw/mlx4/mad.c b/kernel/drivers/infiniband/hw/mlx4/mad.c new file mode 100644 index 000000000..9cd2b002d --- /dev/null +++ b/kernel/drivers/infiniband/hw/mlx4/mad.c @@ -0,0 +1,2185 @@ +/* + * Copyright (c) 2007 Cisco Systems, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include +#include +#include +#include + +#include +#include +#include +#include + +#include "mlx4_ib.h" + +enum { + MLX4_IB_VENDOR_CLASS1 = 0x9, + MLX4_IB_VENDOR_CLASS2 = 0xa +}; + +#define MLX4_TUN_SEND_WRID_SHIFT 34 +#define MLX4_TUN_QPN_SHIFT 32 +#define MLX4_TUN_WRID_RECV (((u64) 1) << MLX4_TUN_SEND_WRID_SHIFT) +#define MLX4_TUN_SET_WRID_QPN(a) (((u64) ((a) & 0x3)) << MLX4_TUN_QPN_SHIFT) + +#define MLX4_TUN_IS_RECV(a) (((a) >> MLX4_TUN_SEND_WRID_SHIFT) & 0x1) +#define MLX4_TUN_WRID_QPN(a) (((a) >> MLX4_TUN_QPN_SHIFT) & 0x3) + + /* Port mgmt change event handling */ + +#define GET_BLK_PTR_FROM_EQE(eqe) be32_to_cpu(eqe->event.port_mgmt_change.params.tbl_change_info.block_ptr) +#define GET_MASK_FROM_EQE(eqe) be32_to_cpu(eqe->event.port_mgmt_change.params.tbl_change_info.tbl_entries_mask) +#define NUM_IDX_IN_PKEY_TBL_BLK 32 +#define GUID_TBL_ENTRY_SIZE 8 /* size in bytes */ +#define GUID_TBL_BLK_NUM_ENTRIES 8 +#define GUID_TBL_BLK_SIZE (GUID_TBL_ENTRY_SIZE * GUID_TBL_BLK_NUM_ENTRIES) + +/* Counters should be saturate once they reach their maximum value */ +#define ASSIGN_32BIT_COUNTER(counter, value) do {\ + if ((value) > U32_MAX) \ + counter = cpu_to_be32(U32_MAX); \ + else \ + counter = cpu_to_be32(value); \ +} while (0) + +struct mlx4_mad_rcv_buf { + struct ib_grh grh; + u8 payload[256]; +} __packed; + +struct mlx4_mad_snd_buf { + u8 payload[256]; +} __packed; + +struct mlx4_tunnel_mad { + struct ib_grh grh; + struct mlx4_ib_tunnel_header hdr; + struct ib_mad mad; +} __packed; + +struct mlx4_rcv_tunnel_mad { + struct mlx4_rcv_tunnel_hdr hdr; + struct ib_grh grh; + struct ib_mad mad; +} __packed; + +static void handle_client_rereg_event(struct mlx4_ib_dev *dev, u8 port_num); +static void handle_lid_change_event(struct mlx4_ib_dev *dev, u8 port_num); +static void __propagate_pkey_ev(struct mlx4_ib_dev *dev, int port_num, + int block, u32 change_bitmap); + +__be64 mlx4_ib_gen_node_guid(void) +{ +#define NODE_GUID_HI ((u64) (((u64)IB_OPENIB_OUI) << 40)) + return cpu_to_be64(NODE_GUID_HI | prandom_u32()); +} + +__be64 mlx4_ib_get_new_demux_tid(struct mlx4_ib_demux_ctx *ctx) +{ + return cpu_to_be64(atomic_inc_return(&ctx->tid)) | + cpu_to_be64(0xff00000000000000LL); +} + +int mlx4_MAD_IFC(struct mlx4_ib_dev *dev, int mad_ifc_flags, + int port, struct ib_wc *in_wc, struct ib_grh *in_grh, + void *in_mad, void *response_mad) +{ + struct mlx4_cmd_mailbox *inmailbox, *outmailbox; + void *inbox; + int err; + u32 in_modifier = port; + u8 op_modifier = 0; + + inmailbox = mlx4_alloc_cmd_mailbox(dev->dev); + if (IS_ERR(inmailbox)) + return PTR_ERR(inmailbox); + inbox = inmailbox->buf; + + outmailbox = mlx4_alloc_cmd_mailbox(dev->dev); + if (IS_ERR(outmailbox)) { + mlx4_free_cmd_mailbox(dev->dev, inmailbox); + return PTR_ERR(outmailbox); + } + + memcpy(inbox, in_mad, 256); + + /* + * Key check traps can't be generated unless we have in_wc to + * tell us where to send the trap. 
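+ * When in_wc is absent we therefore also set the "ignore key"
+ * op_modifier bits below (0x1 skips the M_Key check, 0x2 the B_Key
+ * check); bit 0x8 additionally selects the network view on
+ * multi-function devices.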
+ */ + if ((mad_ifc_flags & MLX4_MAD_IFC_IGNORE_MKEY) || !in_wc) + op_modifier |= 0x1; + if ((mad_ifc_flags & MLX4_MAD_IFC_IGNORE_BKEY) || !in_wc) + op_modifier |= 0x2; + if (mlx4_is_mfunc(dev->dev) && + (mad_ifc_flags & MLX4_MAD_IFC_NET_VIEW || in_wc)) + op_modifier |= 0x8; + + if (in_wc) { + struct { + __be32 my_qpn; + u32 reserved1; + __be32 rqpn; + u8 sl; + u8 g_path; + u16 reserved2[2]; + __be16 pkey; + u32 reserved3[11]; + u8 grh[40]; + } *ext_info; + + memset(inbox + 256, 0, 256); + ext_info = inbox + 256; + + ext_info->my_qpn = cpu_to_be32(in_wc->qp->qp_num); + ext_info->rqpn = cpu_to_be32(in_wc->src_qp); + ext_info->sl = in_wc->sl << 4; + ext_info->g_path = in_wc->dlid_path_bits | + (in_wc->wc_flags & IB_WC_GRH ? 0x80 : 0); + ext_info->pkey = cpu_to_be16(in_wc->pkey_index); + + if (in_grh) + memcpy(ext_info->grh, in_grh, 40); + + op_modifier |= 0x4; + + in_modifier |= in_wc->slid << 16; + } + + err = mlx4_cmd_box(dev->dev, inmailbox->dma, outmailbox->dma, in_modifier, + mlx4_is_master(dev->dev) ? (op_modifier & ~0x8) : op_modifier, + MLX4_CMD_MAD_IFC, MLX4_CMD_TIME_CLASS_C, + (op_modifier & 0x8) ? MLX4_CMD_NATIVE : MLX4_CMD_WRAPPED); + + if (!err) + memcpy(response_mad, outmailbox->buf, 256); + + mlx4_free_cmd_mailbox(dev->dev, inmailbox); + mlx4_free_cmd_mailbox(dev->dev, outmailbox); + + return err; +} + +static void update_sm_ah(struct mlx4_ib_dev *dev, u8 port_num, u16 lid, u8 sl) +{ + struct ib_ah *new_ah; + struct ib_ah_attr ah_attr; + unsigned long flags; + + if (!dev->send_agent[port_num - 1][0]) + return; + + memset(&ah_attr, 0, sizeof ah_attr); + ah_attr.dlid = lid; + ah_attr.sl = sl; + ah_attr.port_num = port_num; + + new_ah = ib_create_ah(dev->send_agent[port_num - 1][0]->qp->pd, + &ah_attr); + if (IS_ERR(new_ah)) + return; + + spin_lock_irqsave(&dev->sm_lock, flags); + if (dev->sm_ah[port_num - 1]) + ib_destroy_ah(dev->sm_ah[port_num - 1]); + dev->sm_ah[port_num - 1] = new_ah; + spin_unlock_irqrestore(&dev->sm_lock, flags); +} + +/* + * Snoop SM MADs for port info, GUID info, and P_Key table sets, so we can + * synthesize LID change, Client-Rereg, GID change, and P_Key change events. + */ +static void smp_snoop(struct ib_device *ibdev, u8 port_num, struct ib_mad *mad, + u16 prev_lid) +{ + struct ib_port_info *pinfo; + u16 lid; + __be16 *base; + u32 bn, pkey_change_bitmap; + int i; + + + struct mlx4_ib_dev *dev = to_mdev(ibdev); + if ((mad->mad_hdr.mgmt_class == IB_MGMT_CLASS_SUBN_LID_ROUTED || + mad->mad_hdr.mgmt_class == IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE) && + mad->mad_hdr.method == IB_MGMT_METHOD_SET) + switch (mad->mad_hdr.attr_id) { + case IB_SMP_ATTR_PORT_INFO: + pinfo = (struct ib_port_info *) ((struct ib_smp *) mad)->data; + lid = be16_to_cpu(pinfo->lid); + + update_sm_ah(dev, port_num, + be16_to_cpu(pinfo->sm_lid), + pinfo->neighbormtu_mastersmsl & 0xf); + + if (pinfo->clientrereg_resv_subnetto & 0x80) + handle_client_rereg_event(dev, port_num); + + if (prev_lid != lid) + handle_lid_change_event(dev, port_num); + break; + + case IB_SMP_ATTR_PKEY_TABLE: + if (!mlx4_is_mfunc(dev->dev)) { + mlx4_ib_dispatch_event(dev, port_num, + IB_EVENT_PKEY_CHANGE); + break; + } + + /* at this point, we are running in the master. + * Slaves do not receive SMPs. 
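+ * The code below diffs the incoming P_Key block against the cached
+ * physical table, builds a per-entry change bitmap, and (unless
+ * SR-IOV is going down) propagates the event to every slave whose
+ * virtual P_Key table references one of the changed entries.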
+ */ + bn = be32_to_cpu(((struct ib_smp *)mad)->attr_mod) & 0xFFFF; + base = (__be16 *) &(((struct ib_smp *)mad)->data[0]); + pkey_change_bitmap = 0; + for (i = 0; i < 32; i++) { + pr_debug("PKEY[%d] = x%x\n", + i + bn*32, be16_to_cpu(base[i])); + if (be16_to_cpu(base[i]) != + dev->pkeys.phys_pkey_cache[port_num - 1][i + bn*32]) { + pkey_change_bitmap |= (1 << i); + dev->pkeys.phys_pkey_cache[port_num - 1][i + bn*32] = + be16_to_cpu(base[i]); + } + } + pr_debug("PKEY Change event: port=%d, " + "block=0x%x, change_bitmap=0x%x\n", + port_num, bn, pkey_change_bitmap); + + if (pkey_change_bitmap) { + mlx4_ib_dispatch_event(dev, port_num, + IB_EVENT_PKEY_CHANGE); + if (!dev->sriov.is_going_down) + __propagate_pkey_ev(dev, port_num, bn, + pkey_change_bitmap); + } + break; + + case IB_SMP_ATTR_GUID_INFO: + /* paravirtualized master's guid is guid 0 -- does not change */ + if (!mlx4_is_master(dev->dev)) + mlx4_ib_dispatch_event(dev, port_num, + IB_EVENT_GID_CHANGE); + /*if master, notify relevant slaves*/ + if (mlx4_is_master(dev->dev) && + !dev->sriov.is_going_down) { + bn = be32_to_cpu(((struct ib_smp *)mad)->attr_mod); + mlx4_ib_update_cache_on_guid_change(dev, bn, port_num, + (u8 *)(&((struct ib_smp *)mad)->data)); + mlx4_ib_notify_slaves_on_guid_change(dev, bn, port_num, + (u8 *)(&((struct ib_smp *)mad)->data)); + } + break; + + default: + break; + } +} + +static void __propagate_pkey_ev(struct mlx4_ib_dev *dev, int port_num, + int block, u32 change_bitmap) +{ + int i, ix, slave, err; + int have_event = 0; + + for (slave = 0; slave < dev->dev->caps.sqp_demux; slave++) { + if (slave == mlx4_master_func_num(dev->dev)) + continue; + if (!mlx4_is_slave_active(dev->dev, slave)) + continue; + + have_event = 0; + for (i = 0; i < 32; i++) { + if (!(change_bitmap & (1 << i))) + continue; + for (ix = 0; + ix < dev->dev->caps.pkey_table_len[port_num]; ix++) { + if (dev->pkeys.virt2phys_pkey[slave][port_num - 1] + [ix] == i + 32 * block) { + err = mlx4_gen_pkey_eqe(dev->dev, slave, port_num); + pr_debug("propagate_pkey_ev: slave %d," + " port %d, ix %d (%d)\n", + slave, port_num, ix, err); + have_event = 1; + break; + } + } + if (have_event) + break; + } + } +} + +static void node_desc_override(struct ib_device *dev, + struct ib_mad *mad) +{ + unsigned long flags; + + if ((mad->mad_hdr.mgmt_class == IB_MGMT_CLASS_SUBN_LID_ROUTED || + mad->mad_hdr.mgmt_class == IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE) && + mad->mad_hdr.method == IB_MGMT_METHOD_GET_RESP && + mad->mad_hdr.attr_id == IB_SMP_ATTR_NODE_DESC) { + spin_lock_irqsave(&to_mdev(dev)->sm_lock, flags); + memcpy(((struct ib_smp *) mad)->data, dev->node_desc, 64); + spin_unlock_irqrestore(&to_mdev(dev)->sm_lock, flags); + } +} + +static void forward_trap(struct mlx4_ib_dev *dev, u8 port_num, struct ib_mad *mad) +{ + int qpn = mad->mad_hdr.mgmt_class != IB_MGMT_CLASS_SUBN_LID_ROUTED; + struct ib_mad_send_buf *send_buf; + struct ib_mad_agent *agent = dev->send_agent[port_num - 1][qpn]; + int ret; + unsigned long flags; + + if (agent) { + send_buf = ib_create_send_mad(agent, qpn, 0, 0, IB_MGMT_MAD_HDR, + IB_MGMT_MAD_DATA, GFP_ATOMIC); + if (IS_ERR(send_buf)) + return; + /* + * We rely here on the fact that MLX QPs don't use the + * address handle after the send is posted (this is + * wrong following the IB spec strictly, but we know + * it's OK for our devices). 
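+ * That is presumably why dev->sm_ah can simply be borrowed under
+ * sm_lock for the duration of ib_post_send_mad() without taking an
+ * extra reference on it.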
+ */ + spin_lock_irqsave(&dev->sm_lock, flags); + memcpy(send_buf->mad, mad, sizeof *mad); + if ((send_buf->ah = dev->sm_ah[port_num - 1])) + ret = ib_post_send_mad(send_buf, NULL); + else + ret = -EINVAL; + spin_unlock_irqrestore(&dev->sm_lock, flags); + + if (ret) + ib_free_send_mad(send_buf); + } +} + +static int mlx4_ib_demux_sa_handler(struct ib_device *ibdev, int port, int slave, + struct ib_sa_mad *sa_mad) +{ + int ret = 0; + + /* dispatch to different sa handlers */ + switch (be16_to_cpu(sa_mad->mad_hdr.attr_id)) { + case IB_SA_ATTR_MC_MEMBER_REC: + ret = mlx4_ib_mcg_demux_handler(ibdev, port, slave, sa_mad); + break; + default: + break; + } + return ret; +} + +int mlx4_ib_find_real_gid(struct ib_device *ibdev, u8 port, __be64 guid) +{ + struct mlx4_ib_dev *dev = to_mdev(ibdev); + int i; + + for (i = 0; i < dev->dev->caps.sqp_demux; i++) { + if (dev->sriov.demux[port - 1].guid_cache[i] == guid) + return i; + } + return -1; +} + + +static int find_slave_port_pkey_ix(struct mlx4_ib_dev *dev, int slave, + u8 port, u16 pkey, u16 *ix) +{ + int i, ret; + u8 unassigned_pkey_ix, pkey_ix, partial_ix = 0xFF; + u16 slot_pkey; + + if (slave == mlx4_master_func_num(dev->dev)) + return ib_find_cached_pkey(&dev->ib_dev, port, pkey, ix); + + unassigned_pkey_ix = dev->dev->phys_caps.pkey_phys_table_len[port] - 1; + + for (i = 0; i < dev->dev->caps.pkey_table_len[port]; i++) { + if (dev->pkeys.virt2phys_pkey[slave][port - 1][i] == unassigned_pkey_ix) + continue; + + pkey_ix = dev->pkeys.virt2phys_pkey[slave][port - 1][i]; + + ret = ib_get_cached_pkey(&dev->ib_dev, port, pkey_ix, &slot_pkey); + if (ret) + continue; + if ((slot_pkey & 0x7FFF) == (pkey & 0x7FFF)) { + if (slot_pkey & 0x8000) { + *ix = (u16) pkey_ix; + return 0; + } else { + /* take first partial pkey index found */ + if (partial_ix == 0xFF) + partial_ix = pkey_ix; + } + } + } + + if (partial_ix < 0xFF) { + *ix = (u16) partial_ix; + return 0; + } + + return -EINVAL; +} + +int mlx4_ib_send_to_slave(struct mlx4_ib_dev *dev, int slave, u8 port, + enum ib_qp_type dest_qpt, struct ib_wc *wc, + struct ib_grh *grh, struct ib_mad *mad) +{ + struct ib_sge list; + struct ib_send_wr wr, *bad_wr; + struct mlx4_ib_demux_pv_ctx *tun_ctx; + struct mlx4_ib_demux_pv_qp *tun_qp; + struct mlx4_rcv_tunnel_mad *tun_mad; + struct ib_ah_attr attr; + struct ib_ah *ah; + struct ib_qp *src_qp = NULL; + unsigned tun_tx_ix = 0; + int dqpn; + int ret = 0; + u16 tun_pkey_ix; + u16 cached_pkey; + u8 is_eth = dev->dev->caps.port_type[port] == MLX4_PORT_TYPE_ETH; + + if (dest_qpt > IB_QPT_GSI) + return -EINVAL; + + tun_ctx = dev->sriov.demux[port-1].tun[slave]; + + /* check if proxy qp created */ + if (!tun_ctx || tun_ctx->state != DEMUX_PV_STATE_ACTIVE) + return -EAGAIN; + + if (!dest_qpt) + tun_qp = &tun_ctx->qp[0]; + else + tun_qp = &tun_ctx->qp[1]; + + /* compute P_Key index to put in tunnel header for slave */ + if (dest_qpt) { + u16 pkey_ix; + ret = ib_get_cached_pkey(&dev->ib_dev, port, wc->pkey_index, &cached_pkey); + if (ret) + return -EINVAL; + + ret = find_slave_port_pkey_ix(dev, slave, port, cached_pkey, &pkey_ix); + if (ret) + return -EINVAL; + tun_pkey_ix = pkey_ix; + } else + tun_pkey_ix = dev->pkeys.virt2phys_pkey[slave][port - 1][0]; + + dqpn = dev->dev->phys_caps.base_proxy_sqpn + 8 * slave + port + (dest_qpt * 2) - 1; + + /* get tunnel tx data buf for slave */ + src_qp = tun_qp->qp; + + /* create ah. Just need an empty one with the port num for the post send. 
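+ * (For RoCE ports the destination GID from the incoming GRH is also
+ * copied into the AH; see the is_eth branch below.)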
+ * The driver will set the force loopback bit in post_send */ + memset(&attr, 0, sizeof attr); + attr.port_num = port; + if (is_eth) { + memcpy(&attr.grh.dgid.raw[0], &grh->dgid.raw[0], 16); + attr.ah_flags = IB_AH_GRH; + } + ah = ib_create_ah(tun_ctx->pd, &attr); + if (IS_ERR(ah)) + return -ENOMEM; + + /* allocate tunnel tx buf after pass failure returns */ + spin_lock(&tun_qp->tx_lock); + if (tun_qp->tx_ix_head - tun_qp->tx_ix_tail >= + (MLX4_NUM_TUNNEL_BUFS - 1)) + ret = -EAGAIN; + else + tun_tx_ix = (++tun_qp->tx_ix_head) & (MLX4_NUM_TUNNEL_BUFS - 1); + spin_unlock(&tun_qp->tx_lock); + if (ret) + goto out; + + tun_mad = (struct mlx4_rcv_tunnel_mad *) (tun_qp->tx_ring[tun_tx_ix].buf.addr); + if (tun_qp->tx_ring[tun_tx_ix].ah) + ib_destroy_ah(tun_qp->tx_ring[tun_tx_ix].ah); + tun_qp->tx_ring[tun_tx_ix].ah = ah; + ib_dma_sync_single_for_cpu(&dev->ib_dev, + tun_qp->tx_ring[tun_tx_ix].buf.map, + sizeof (struct mlx4_rcv_tunnel_mad), + DMA_TO_DEVICE); + + /* copy over to tunnel buffer */ + if (grh) + memcpy(&tun_mad->grh, grh, sizeof *grh); + memcpy(&tun_mad->mad, mad, sizeof *mad); + + /* adjust tunnel data */ + tun_mad->hdr.pkey_index = cpu_to_be16(tun_pkey_ix); + tun_mad->hdr.flags_src_qp = cpu_to_be32(wc->src_qp & 0xFFFFFF); + tun_mad->hdr.g_ml_path = (grh && (wc->wc_flags & IB_WC_GRH)) ? 0x80 : 0; + + if (is_eth) { + u16 vlan = 0; + if (mlx4_get_slave_default_vlan(dev->dev, port, slave, &vlan, + NULL)) { + /* VST mode */ + if (vlan != wc->vlan_id) + /* Packet vlan is not the VST-assigned vlan. + * Drop the packet. + */ + goto out; + else + /* Remove the vlan tag before forwarding + * the packet to the VF. + */ + vlan = 0xffff; + } else { + vlan = wc->vlan_id; + } + + tun_mad->hdr.sl_vid = cpu_to_be16(vlan); + memcpy((char *)&tun_mad->hdr.mac_31_0, &(wc->smac[0]), 4); + memcpy((char *)&tun_mad->hdr.slid_mac_47_32, &(wc->smac[4]), 2); + } else { + tun_mad->hdr.sl_vid = cpu_to_be16(((u16)(wc->sl)) << 12); + tun_mad->hdr.slid_mac_47_32 = cpu_to_be16(wc->slid); + } + + ib_dma_sync_single_for_device(&dev->ib_dev, + tun_qp->tx_ring[tun_tx_ix].buf.map, + sizeof (struct mlx4_rcv_tunnel_mad), + DMA_TO_DEVICE); + + list.addr = tun_qp->tx_ring[tun_tx_ix].buf.map; + list.length = sizeof (struct mlx4_rcv_tunnel_mad); + list.lkey = tun_ctx->mr->lkey; + + wr.wr.ud.ah = ah; + wr.wr.ud.port_num = port; + wr.wr.ud.remote_qkey = IB_QP_SET_QKEY; + wr.wr.ud.remote_qpn = dqpn; + wr.next = NULL; + wr.wr_id = ((u64) tun_tx_ix) | MLX4_TUN_SET_WRID_QPN(dest_qpt); + wr.sg_list = &list; + wr.num_sge = 1; + wr.opcode = IB_WR_SEND; + wr.send_flags = IB_SEND_SIGNALED; + + ret = ib_post_send(src_qp, &wr, &bad_wr); +out: + if (ret) + ib_destroy_ah(ah); + return ret; +} + +static int mlx4_ib_demux_mad(struct ib_device *ibdev, u8 port, + struct ib_wc *wc, struct ib_grh *grh, + struct ib_mad *mad) +{ + struct mlx4_ib_dev *dev = to_mdev(ibdev); + int err; + int slave; + u8 *slave_id; + int is_eth = 0; + + if (rdma_port_get_link_layer(ibdev, port) == IB_LINK_LAYER_INFINIBAND) + is_eth = 0; + else + is_eth = 1; + + if (is_eth) { + if (!(wc->wc_flags & IB_WC_GRH)) { + mlx4_ib_warn(ibdev, "RoCE grh not present.\n"); + return -EINVAL; + } + if (mad->mad_hdr.mgmt_class != IB_MGMT_CLASS_CM) { + mlx4_ib_warn(ibdev, "RoCE mgmt class is not CM\n"); + return -EINVAL; + } + if (mlx4_get_slave_from_roce_gid(dev->dev, port, grh->dgid.raw, &slave)) { + mlx4_ib_warn(ibdev, "failed matching grh\n"); + return -ENOENT; + } + if (slave >= dev->dev->caps.sqp_demux) { + mlx4_ib_warn(ibdev, "slave id: %d is bigger than allowed:%d\n", + slave, 
dev->dev->caps.sqp_demux); + return -ENOENT; + } + + if (mlx4_ib_demux_cm_handler(ibdev, port, NULL, mad)) + return 0; + + err = mlx4_ib_send_to_slave(dev, slave, port, wc->qp->qp_type, wc, grh, mad); + if (err) + pr_debug("failed sending to slave %d via tunnel qp (%d)\n", + slave, err); + return 0; + } + + /* Initially assume that this mad is for us */ + slave = mlx4_master_func_num(dev->dev); + + /* See if the slave id is encoded in a response mad */ + if (mad->mad_hdr.method & 0x80) { + slave_id = (u8 *) &mad->mad_hdr.tid; + slave = *slave_id; + if (slave != 255) /*255 indicates the dom0*/ + *slave_id = 0; /* remap tid */ + } + + /* If a grh is present, we demux according to it */ + if (wc->wc_flags & IB_WC_GRH) { + slave = mlx4_ib_find_real_gid(ibdev, port, grh->dgid.global.interface_id); + if (slave < 0) { + mlx4_ib_warn(ibdev, "failed matching grh\n"); + return -ENOENT; + } + } + /* Class-specific handling */ + switch (mad->mad_hdr.mgmt_class) { + case IB_MGMT_CLASS_SUBN_LID_ROUTED: + case IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE: + /* 255 indicates the dom0 */ + if (slave != 255 && slave != mlx4_master_func_num(dev->dev)) { + if (!mlx4_vf_smi_enabled(dev->dev, slave, port)) + return -EPERM; + /* for a VF. drop unsolicited MADs */ + if (!(mad->mad_hdr.method & IB_MGMT_METHOD_RESP)) { + mlx4_ib_warn(ibdev, "demux QP0. rejecting unsolicited mad for slave %d class 0x%x, method 0x%x\n", + slave, mad->mad_hdr.mgmt_class, + mad->mad_hdr.method); + return -EINVAL; + } + } + break; + case IB_MGMT_CLASS_SUBN_ADM: + if (mlx4_ib_demux_sa_handler(ibdev, port, slave, + (struct ib_sa_mad *) mad)) + return 0; + break; + case IB_MGMT_CLASS_CM: + if (mlx4_ib_demux_cm_handler(ibdev, port, &slave, mad)) + return 0; + break; + case IB_MGMT_CLASS_DEVICE_MGMT: + if (mad->mad_hdr.method != IB_MGMT_METHOD_GET_RESP) + return 0; + break; + default: + /* Drop unsupported classes for slaves in tunnel mode */ + if (slave != mlx4_master_func_num(dev->dev)) { + pr_debug("dropping unsupported ingress mad from class:%d " + "for slave:%d\n", mad->mad_hdr.mgmt_class, slave); + return 0; + } + } + /*make sure that no slave==255 was not handled yet.*/ + if (slave >= dev->dev->caps.sqp_demux) { + mlx4_ib_warn(ibdev, "slave id: %d is bigger than allowed:%d\n", + slave, dev->dev->caps.sqp_demux); + return -ENOENT; + } + + err = mlx4_ib_send_to_slave(dev, slave, port, wc->qp->qp_type, wc, grh, mad); + if (err) + pr_debug("failed sending to slave %d via tunnel qp (%d)\n", + slave, err); + return 0; +} + +static int ib_process_mad(struct ib_device *ibdev, int mad_flags, u8 port_num, + struct ib_wc *in_wc, struct ib_grh *in_grh, + struct ib_mad *in_mad, struct ib_mad *out_mad) +{ + u16 slid, prev_lid = 0; + int err; + struct ib_port_attr pattr; + + if (in_wc && in_wc->qp->qp_num) { + pr_debug("received MAD: slid:%d sqpn:%d " + "dlid_bits:%d dqpn:%d wc_flags:0x%x, cls %x, mtd %x, atr %x\n", + in_wc->slid, in_wc->src_qp, + in_wc->dlid_path_bits, + in_wc->qp->qp_num, + in_wc->wc_flags, + in_mad->mad_hdr.mgmt_class, in_mad->mad_hdr.method, + be16_to_cpu(in_mad->mad_hdr.attr_id)); + if (in_wc->wc_flags & IB_WC_GRH) { + pr_debug("sgid_hi:0x%016llx sgid_lo:0x%016llx\n", + be64_to_cpu(in_grh->sgid.global.subnet_prefix), + be64_to_cpu(in_grh->sgid.global.interface_id)); + pr_debug("dgid_hi:0x%016llx dgid_lo:0x%016llx\n", + be64_to_cpu(in_grh->dgid.global.subnet_prefix), + be64_to_cpu(in_grh->dgid.global.interface_id)); + } + } + + slid = in_wc ? 
in_wc->slid : be16_to_cpu(IB_LID_PERMISSIVE); + + if (in_mad->mad_hdr.method == IB_MGMT_METHOD_TRAP && slid == 0) { + forward_trap(to_mdev(ibdev), port_num, in_mad); + return IB_MAD_RESULT_SUCCESS | IB_MAD_RESULT_CONSUMED; + } + + if (in_mad->mad_hdr.mgmt_class == IB_MGMT_CLASS_SUBN_LID_ROUTED || + in_mad->mad_hdr.mgmt_class == IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE) { + if (in_mad->mad_hdr.method != IB_MGMT_METHOD_GET && + in_mad->mad_hdr.method != IB_MGMT_METHOD_SET && + in_mad->mad_hdr.method != IB_MGMT_METHOD_TRAP_REPRESS) + return IB_MAD_RESULT_SUCCESS; + + /* + * Don't process SMInfo queries -- the SMA can't handle them. + */ + if (in_mad->mad_hdr.attr_id == IB_SMP_ATTR_SM_INFO) + return IB_MAD_RESULT_SUCCESS; + } else if (in_mad->mad_hdr.mgmt_class == IB_MGMT_CLASS_PERF_MGMT || + in_mad->mad_hdr.mgmt_class == MLX4_IB_VENDOR_CLASS1 || + in_mad->mad_hdr.mgmt_class == MLX4_IB_VENDOR_CLASS2 || + in_mad->mad_hdr.mgmt_class == IB_MGMT_CLASS_CONG_MGMT) { + if (in_mad->mad_hdr.method != IB_MGMT_METHOD_GET && + in_mad->mad_hdr.method != IB_MGMT_METHOD_SET) + return IB_MAD_RESULT_SUCCESS; + } else + return IB_MAD_RESULT_SUCCESS; + + if ((in_mad->mad_hdr.mgmt_class == IB_MGMT_CLASS_SUBN_LID_ROUTED || + in_mad->mad_hdr.mgmt_class == IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE) && + in_mad->mad_hdr.method == IB_MGMT_METHOD_SET && + in_mad->mad_hdr.attr_id == IB_SMP_ATTR_PORT_INFO && + !ib_query_port(ibdev, port_num, &pattr)) + prev_lid = pattr.lid; + + err = mlx4_MAD_IFC(to_mdev(ibdev), + (mad_flags & IB_MAD_IGNORE_MKEY ? MLX4_MAD_IFC_IGNORE_MKEY : 0) | + (mad_flags & IB_MAD_IGNORE_BKEY ? MLX4_MAD_IFC_IGNORE_BKEY : 0) | + MLX4_MAD_IFC_NET_VIEW, + port_num, in_wc, in_grh, in_mad, out_mad); + if (err) + return IB_MAD_RESULT_FAILURE; + + if (!out_mad->mad_hdr.status) { + if (!(to_mdev(ibdev)->dev->caps.flags & MLX4_DEV_CAP_FLAG_PORT_MNG_CHG_EV)) + smp_snoop(ibdev, port_num, in_mad, prev_lid); + /* slaves get node desc from FW */ + if (!mlx4_is_slave(to_mdev(ibdev)->dev)) + node_desc_override(ibdev, out_mad); + } + + /* set return bit in status of directed route responses */ + if (in_mad->mad_hdr.mgmt_class == IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE) + out_mad->mad_hdr.status |= cpu_to_be16(1 << 15); + + if (in_mad->mad_hdr.method == IB_MGMT_METHOD_TRAP_REPRESS) + /* no response for trap repress */ + return IB_MAD_RESULT_SUCCESS | IB_MAD_RESULT_CONSUMED; + + return IB_MAD_RESULT_SUCCESS | IB_MAD_RESULT_REPLY; +} + +static void edit_counter(struct mlx4_counter *cnt, + struct ib_pma_portcounters *pma_cnt) +{ + ASSIGN_32BIT_COUNTER(pma_cnt->port_xmit_data, + (be64_to_cpu(cnt->tx_bytes) >> 2)); + ASSIGN_32BIT_COUNTER(pma_cnt->port_rcv_data, + (be64_to_cpu(cnt->rx_bytes) >> 2)); + ASSIGN_32BIT_COUNTER(pma_cnt->port_xmit_packets, + be64_to_cpu(cnt->tx_frames)); + ASSIGN_32BIT_COUNTER(pma_cnt->port_rcv_packets, + be64_to_cpu(cnt->rx_frames)); +} + +static int iboe_process_mad(struct ib_device *ibdev, int mad_flags, u8 port_num, + struct ib_wc *in_wc, struct ib_grh *in_grh, + struct ib_mad *in_mad, struct ib_mad *out_mad) +{ + struct mlx4_cmd_mailbox *mailbox; + struct mlx4_ib_dev *dev = to_mdev(ibdev); + int err; + u32 inmod = dev->counters[port_num - 1] & 0xffff; + u8 mode; + + if (in_mad->mad_hdr.mgmt_class != IB_MGMT_CLASS_PERF_MGMT) + return -EINVAL; + + mailbox = mlx4_alloc_cmd_mailbox(dev->dev); + if (IS_ERR(mailbox)) + return IB_MAD_RESULT_FAILURE; + + err = mlx4_cmd_box(dev->dev, 0, mailbox->dma, inmod, 0, + MLX4_CMD_QUERY_IF_STAT, MLX4_CMD_TIME_CLASS_C, + MLX4_CMD_WRAPPED); + if (err) + err = IB_MAD_RESULT_FAILURE; + 
else { + memset(out_mad->data, 0, sizeof out_mad->data); + mode = ((struct mlx4_counter *)mailbox->buf)->counter_mode; + switch (mode & 0xf) { + case 0: + edit_counter(mailbox->buf, + (void *)(out_mad->data + 40)); + err = IB_MAD_RESULT_SUCCESS | IB_MAD_RESULT_REPLY; + break; + default: + err = IB_MAD_RESULT_FAILURE; + } + } + + mlx4_free_cmd_mailbox(dev->dev, mailbox); + + return err; +} + +int mlx4_ib_process_mad(struct ib_device *ibdev, int mad_flags, u8 port_num, + struct ib_wc *in_wc, struct ib_grh *in_grh, + struct ib_mad *in_mad, struct ib_mad *out_mad) +{ + switch (rdma_port_get_link_layer(ibdev, port_num)) { + case IB_LINK_LAYER_INFINIBAND: + return ib_process_mad(ibdev, mad_flags, port_num, in_wc, + in_grh, in_mad, out_mad); + case IB_LINK_LAYER_ETHERNET: + return iboe_process_mad(ibdev, mad_flags, port_num, in_wc, + in_grh, in_mad, out_mad); + default: + return -EINVAL; + } +} + +static void send_handler(struct ib_mad_agent *agent, + struct ib_mad_send_wc *mad_send_wc) +{ + if (mad_send_wc->send_buf->context[0]) + ib_destroy_ah(mad_send_wc->send_buf->context[0]); + ib_free_send_mad(mad_send_wc->send_buf); +} + +int mlx4_ib_mad_init(struct mlx4_ib_dev *dev) +{ + struct ib_mad_agent *agent; + int p, q; + int ret; + enum rdma_link_layer ll; + + for (p = 0; p < dev->num_ports; ++p) { + ll = rdma_port_get_link_layer(&dev->ib_dev, p + 1); + for (q = 0; q <= 1; ++q) { + if (ll == IB_LINK_LAYER_INFINIBAND) { + agent = ib_register_mad_agent(&dev->ib_dev, p + 1, + q ? IB_QPT_GSI : IB_QPT_SMI, + NULL, 0, send_handler, + NULL, NULL, 0); + if (IS_ERR(agent)) { + ret = PTR_ERR(agent); + goto err; + } + dev->send_agent[p][q] = agent; + } else + dev->send_agent[p][q] = NULL; + } + } + + return 0; + +err: + for (p = 0; p < dev->num_ports; ++p) + for (q = 0; q <= 1; ++q) + if (dev->send_agent[p][q]) + ib_unregister_mad_agent(dev->send_agent[p][q]); + + return ret; +} + +void mlx4_ib_mad_cleanup(struct mlx4_ib_dev *dev) +{ + struct ib_mad_agent *agent; + int p, q; + + for (p = 0; p < dev->num_ports; ++p) { + for (q = 0; q <= 1; ++q) { + agent = dev->send_agent[p][q]; + if (agent) { + dev->send_agent[p][q] = NULL; + ib_unregister_mad_agent(agent); + } + } + + if (dev->sm_ah[p]) + ib_destroy_ah(dev->sm_ah[p]); + } +} + +static void handle_lid_change_event(struct mlx4_ib_dev *dev, u8 port_num) +{ + mlx4_ib_dispatch_event(dev, port_num, IB_EVENT_LID_CHANGE); + + if (mlx4_is_master(dev->dev) && !dev->sriov.is_going_down) + mlx4_gen_slaves_port_mgt_ev(dev->dev, port_num, + MLX4_EQ_PORT_INFO_LID_CHANGE_MASK); +} + +static void handle_client_rereg_event(struct mlx4_ib_dev *dev, u8 port_num) +{ + /* re-configure the alias-guid and mcg's */ + if (mlx4_is_master(dev->dev)) { + mlx4_ib_invalidate_all_guid_record(dev, port_num); + + if (!dev->sriov.is_going_down) { + mlx4_ib_mcg_port_cleanup(&dev->sriov.demux[port_num - 1], 0); + mlx4_gen_slaves_port_mgt_ev(dev->dev, port_num, + MLX4_EQ_PORT_INFO_CLIENT_REREG_MASK); + } + } + mlx4_ib_dispatch_event(dev, port_num, IB_EVENT_CLIENT_REREGISTER); +} + +static void propagate_pkey_ev(struct mlx4_ib_dev *dev, int port_num, + struct mlx4_eqe *eqe) +{ + __propagate_pkey_ev(dev, port_num, GET_BLK_PTR_FROM_EQE(eqe), + GET_MASK_FROM_EQE(eqe)); +} + +static void handle_slaves_guid_change(struct mlx4_ib_dev *dev, u8 port_num, + u32 guid_tbl_blk_num, u32 change_bitmap) +{ + struct ib_smp *in_mad = NULL; + struct ib_smp *out_mad = NULL; + u16 i; + + if (!mlx4_is_mfunc(dev->dev) || !mlx4_is_master(dev->dev)) + return; + + in_mad = kmalloc(sizeof *in_mad, GFP_KERNEL); + out_mad 
= kmalloc(sizeof *out_mad, GFP_KERNEL); + if (!in_mad || !out_mad) { + mlx4_ib_warn(&dev->ib_dev, "failed to allocate memory for guid info mads\n"); + goto out; + } + + guid_tbl_blk_num *= 4; + + for (i = 0; i < 4; i++) { + if (change_bitmap && (!((change_bitmap >> (8 * i)) & 0xff))) + continue; + memset(in_mad, 0, sizeof *in_mad); + memset(out_mad, 0, sizeof *out_mad); + + in_mad->base_version = 1; + in_mad->mgmt_class = IB_MGMT_CLASS_SUBN_LID_ROUTED; + in_mad->class_version = 1; + in_mad->method = IB_MGMT_METHOD_GET; + in_mad->attr_id = IB_SMP_ATTR_GUID_INFO; + in_mad->attr_mod = cpu_to_be32(guid_tbl_blk_num + i); + + if (mlx4_MAD_IFC(dev, + MLX4_MAD_IFC_IGNORE_KEYS | MLX4_MAD_IFC_NET_VIEW, + port_num, NULL, NULL, in_mad, out_mad)) { + mlx4_ib_warn(&dev->ib_dev, "Failed in get GUID INFO MAD_IFC\n"); + goto out; + } + + mlx4_ib_update_cache_on_guid_change(dev, guid_tbl_blk_num + i, + port_num, + (u8 *)(&((struct ib_smp *)out_mad)->data)); + mlx4_ib_notify_slaves_on_guid_change(dev, guid_tbl_blk_num + i, + port_num, + (u8 *)(&((struct ib_smp *)out_mad)->data)); + } + +out: + kfree(in_mad); + kfree(out_mad); + return; +} + +void handle_port_mgmt_change_event(struct work_struct *work) +{ + struct ib_event_work *ew = container_of(work, struct ib_event_work, work); + struct mlx4_ib_dev *dev = ew->ib_dev; + struct mlx4_eqe *eqe = &(ew->ib_eqe); + u8 port = eqe->event.port_mgmt_change.port; + u32 changed_attr; + u32 tbl_block; + u32 change_bitmap; + + switch (eqe->subtype) { + case MLX4_DEV_PMC_SUBTYPE_PORT_INFO: + changed_attr = be32_to_cpu(eqe->event.port_mgmt_change.params.port_info.changed_attr); + + /* Update the SM ah - This should be done before handling + the other changed attributes so that MADs can be sent to the SM */ + if (changed_attr & MSTR_SM_CHANGE_MASK) { + u16 lid = be16_to_cpu(eqe->event.port_mgmt_change.params.port_info.mstr_sm_lid); + u8 sl = eqe->event.port_mgmt_change.params.port_info.mstr_sm_sl & 0xf; + update_sm_ah(dev, port, lid, sl); + } + + /* Check if it is a lid change event */ + if (changed_attr & MLX4_EQ_PORT_INFO_LID_CHANGE_MASK) + handle_lid_change_event(dev, port); + + /* Generate GUID changed event */ + if (changed_attr & MLX4_EQ_PORT_INFO_GID_PFX_CHANGE_MASK) { + mlx4_ib_dispatch_event(dev, port, IB_EVENT_GID_CHANGE); + /*if master, notify all slaves*/ + if (mlx4_is_master(dev->dev)) + mlx4_gen_slaves_port_mgt_ev(dev->dev, port, + MLX4_EQ_PORT_INFO_GID_PFX_CHANGE_MASK); + } + + if (changed_attr & MLX4_EQ_PORT_INFO_CLIENT_REREG_MASK) + handle_client_rereg_event(dev, port); + break; + + case MLX4_DEV_PMC_SUBTYPE_PKEY_TABLE: + mlx4_ib_dispatch_event(dev, port, IB_EVENT_PKEY_CHANGE); + if (mlx4_is_master(dev->dev) && !dev->sriov.is_going_down) + propagate_pkey_ev(dev, port, eqe); + break; + case MLX4_DEV_PMC_SUBTYPE_GUID_INFO: + /* paravirtualized master's guid is guid 0 -- does not change */ + if (!mlx4_is_master(dev->dev)) + mlx4_ib_dispatch_event(dev, port, IB_EVENT_GID_CHANGE); + /*if master, notify relevant slaves*/ + else if (!dev->sriov.is_going_down) { + tbl_block = GET_BLK_PTR_FROM_EQE(eqe); + change_bitmap = GET_MASK_FROM_EQE(eqe); + handle_slaves_guid_change(dev, port, tbl_block, change_bitmap); + } + break; + default: + pr_warn("Unsupported subtype 0x%x for " + "Port Management Change event\n", eqe->subtype); + } + + kfree(ew); +} + +void mlx4_ib_dispatch_event(struct mlx4_ib_dev *dev, u8 port_num, + enum ib_event_type type) +{ + struct ib_event event; + + event.device = &dev->ib_dev; + event.element.port_num = port_num; + event.event = type; + + 
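+	/* ib_dispatch_event() fans the event out to every handler that was
+	 * registered on this device via ib_register_event_handler(). */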
ib_dispatch_event(&event); +} + +static void mlx4_ib_tunnel_comp_handler(struct ib_cq *cq, void *arg) +{ + unsigned long flags; + struct mlx4_ib_demux_pv_ctx *ctx = cq->cq_context; + struct mlx4_ib_dev *dev = to_mdev(ctx->ib_dev); + spin_lock_irqsave(&dev->sriov.going_down_lock, flags); + if (!dev->sriov.is_going_down && ctx->state == DEMUX_PV_STATE_ACTIVE) + queue_work(ctx->wq, &ctx->work); + spin_unlock_irqrestore(&dev->sriov.going_down_lock, flags); +} + +static int mlx4_ib_post_pv_qp_buf(struct mlx4_ib_demux_pv_ctx *ctx, + struct mlx4_ib_demux_pv_qp *tun_qp, + int index) +{ + struct ib_sge sg_list; + struct ib_recv_wr recv_wr, *bad_recv_wr; + int size; + + size = (tun_qp->qp->qp_type == IB_QPT_UD) ? + sizeof (struct mlx4_tunnel_mad) : sizeof (struct mlx4_mad_rcv_buf); + + sg_list.addr = tun_qp->ring[index].map; + sg_list.length = size; + sg_list.lkey = ctx->mr->lkey; + + recv_wr.next = NULL; + recv_wr.sg_list = &sg_list; + recv_wr.num_sge = 1; + recv_wr.wr_id = (u64) index | MLX4_TUN_WRID_RECV | + MLX4_TUN_SET_WRID_QPN(tun_qp->proxy_qpt); + ib_dma_sync_single_for_device(ctx->ib_dev, tun_qp->ring[index].map, + size, DMA_FROM_DEVICE); + return ib_post_recv(tun_qp->qp, &recv_wr, &bad_recv_wr); +} + +static int mlx4_ib_multiplex_sa_handler(struct ib_device *ibdev, int port, + int slave, struct ib_sa_mad *sa_mad) +{ + int ret = 0; + + /* dispatch to different sa handlers */ + switch (be16_to_cpu(sa_mad->mad_hdr.attr_id)) { + case IB_SA_ATTR_MC_MEMBER_REC: + ret = mlx4_ib_mcg_multiplex_handler(ibdev, port, slave, sa_mad); + break; + default: + break; + } + return ret; +} + +static int is_proxy_qp0(struct mlx4_ib_dev *dev, int qpn, int slave) +{ + int proxy_start = dev->dev->phys_caps.base_proxy_sqpn + 8 * slave; + + return (qpn >= proxy_start && qpn <= proxy_start + 1); +} + + +int mlx4_ib_send_to_wire(struct mlx4_ib_dev *dev, int slave, u8 port, + enum ib_qp_type dest_qpt, u16 pkey_index, + u32 remote_qpn, u32 qkey, struct ib_ah_attr *attr, + u8 *s_mac, struct ib_mad *mad) +{ + struct ib_sge list; + struct ib_send_wr wr, *bad_wr; + struct mlx4_ib_demux_pv_ctx *sqp_ctx; + struct mlx4_ib_demux_pv_qp *sqp; + struct mlx4_mad_snd_buf *sqp_mad; + struct ib_ah *ah; + struct ib_qp *send_qp = NULL; + unsigned wire_tx_ix = 0; + int ret = 0; + u16 wire_pkey_ix; + int src_qpnum; + u8 sgid_index; + + + sqp_ctx = dev->sriov.sqps[port-1]; + + /* check if proxy qp created */ + if (!sqp_ctx || sqp_ctx->state != DEMUX_PV_STATE_ACTIVE) + return -EAGAIN; + + if (dest_qpt == IB_QPT_SMI) { + src_qpnum = 0; + sqp = &sqp_ctx->qp[0]; + wire_pkey_ix = dev->pkeys.virt2phys_pkey[slave][port - 1][0]; + } else { + src_qpnum = 1; + sqp = &sqp_ctx->qp[1]; + wire_pkey_ix = dev->pkeys.virt2phys_pkey[slave][port - 1][pkey_index]; + } + + send_qp = sqp->qp; + + /* create ah */ + sgid_index = attr->grh.sgid_index; + attr->grh.sgid_index = 0; + ah = ib_create_ah(sqp_ctx->pd, attr); + if (IS_ERR(ah)) + return -ENOMEM; + attr->grh.sgid_index = sgid_index; + to_mah(ah)->av.ib.gid_index = sgid_index; + /* get rid of force-loopback bit */ + to_mah(ah)->av.ib.port_pd &= cpu_to_be32(0x7FFFFFFF); + spin_lock(&sqp->tx_lock); + if (sqp->tx_ix_head - sqp->tx_ix_tail >= + (MLX4_NUM_TUNNEL_BUFS - 1)) + ret = -EAGAIN; + else + wire_tx_ix = (++sqp->tx_ix_head) & (MLX4_NUM_TUNNEL_BUFS - 1); + spin_unlock(&sqp->tx_lock); + if (ret) + goto out; + + sqp_mad = (struct mlx4_mad_snd_buf *) (sqp->tx_ring[wire_tx_ix].buf.addr); + if (sqp->tx_ring[wire_tx_ix].ah) + ib_destroy_ah(sqp->tx_ring[wire_tx_ix].ah); + sqp->tx_ring[wire_tx_ix].ah = ah; + 
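+	/* Park the AH in the tx ring slot so the send-completion handler can
+	 * destroy it once the wire send has finished; the slot's previous AH,
+	 * if any, was released just above. */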
ib_dma_sync_single_for_cpu(&dev->ib_dev, + sqp->tx_ring[wire_tx_ix].buf.map, + sizeof (struct mlx4_mad_snd_buf), + DMA_TO_DEVICE); + + memcpy(&sqp_mad->payload, mad, sizeof *mad); + + ib_dma_sync_single_for_device(&dev->ib_dev, + sqp->tx_ring[wire_tx_ix].buf.map, + sizeof (struct mlx4_mad_snd_buf), + DMA_TO_DEVICE); + + list.addr = sqp->tx_ring[wire_tx_ix].buf.map; + list.length = sizeof (struct mlx4_mad_snd_buf); + list.lkey = sqp_ctx->mr->lkey; + + wr.wr.ud.ah = ah; + wr.wr.ud.port_num = port; + wr.wr.ud.pkey_index = wire_pkey_ix; + wr.wr.ud.remote_qkey = qkey; + wr.wr.ud.remote_qpn = remote_qpn; + wr.next = NULL; + wr.wr_id = ((u64) wire_tx_ix) | MLX4_TUN_SET_WRID_QPN(src_qpnum); + wr.sg_list = &list; + wr.num_sge = 1; + wr.opcode = IB_WR_SEND; + wr.send_flags = IB_SEND_SIGNALED; + if (s_mac) + memcpy(to_mah(ah)->av.eth.s_mac, s_mac, 6); + + + ret = ib_post_send(send_qp, &wr, &bad_wr); +out: + if (ret) + ib_destroy_ah(ah); + return ret; +} + +static int get_slave_base_gid_ix(struct mlx4_ib_dev *dev, int slave, int port) +{ + if (rdma_port_get_link_layer(&dev->ib_dev, port) == IB_LINK_LAYER_INFINIBAND) + return slave; + return mlx4_get_base_gid_ix(dev->dev, slave, port); +} + +static void fill_in_real_sgid_index(struct mlx4_ib_dev *dev, int slave, int port, + struct ib_ah_attr *ah_attr) +{ + if (rdma_port_get_link_layer(&dev->ib_dev, port) == IB_LINK_LAYER_INFINIBAND) + ah_attr->grh.sgid_index = slave; + else + ah_attr->grh.sgid_index += get_slave_base_gid_ix(dev, slave, port); +} + +static void mlx4_ib_multiplex_mad(struct mlx4_ib_demux_pv_ctx *ctx, struct ib_wc *wc) +{ + struct mlx4_ib_dev *dev = to_mdev(ctx->ib_dev); + struct mlx4_ib_demux_pv_qp *tun_qp = &ctx->qp[MLX4_TUN_WRID_QPN(wc->wr_id)]; + int wr_ix = wc->wr_id & (MLX4_NUM_TUNNEL_BUFS - 1); + struct mlx4_tunnel_mad *tunnel = tun_qp->ring[wr_ix].addr; + struct mlx4_ib_ah ah; + struct ib_ah_attr ah_attr; + u8 *slave_id; + int slave; + int port; + + /* Get slave that sent this packet */ + if (wc->src_qp < dev->dev->phys_caps.base_proxy_sqpn || + wc->src_qp >= dev->dev->phys_caps.base_proxy_sqpn + 8 * MLX4_MFUNC_MAX || + (wc->src_qp & 0x1) != ctx->port - 1 || + wc->src_qp & 0x4) { + mlx4_ib_warn(ctx->ib_dev, "can't multiplex bad sqp:%d\n", wc->src_qp); + return; + } + slave = ((wc->src_qp & ~0x7) - dev->dev->phys_caps.base_proxy_sqpn) / 8; + if (slave != ctx->slave) { + mlx4_ib_warn(ctx->ib_dev, "can't multiplex bad sqp:%d: " + "belongs to another slave\n", wc->src_qp); + return; + } + + /* Map transaction ID */ + ib_dma_sync_single_for_cpu(ctx->ib_dev, tun_qp->ring[wr_ix].map, + sizeof (struct mlx4_tunnel_mad), + DMA_FROM_DEVICE); + switch (tunnel->mad.mad_hdr.method) { + case IB_MGMT_METHOD_SET: + case IB_MGMT_METHOD_GET: + case IB_MGMT_METHOD_REPORT: + case IB_SA_METHOD_GET_TABLE: + case IB_SA_METHOD_DELETE: + case IB_SA_METHOD_GET_MULTI: + case IB_SA_METHOD_GET_TRACE_TBL: + slave_id = (u8 *) &tunnel->mad.mad_hdr.tid; + if (*slave_id) { + mlx4_ib_warn(ctx->ib_dev, "egress mad has non-null tid msb:%d " + "class:%d slave:%d\n", *slave_id, + tunnel->mad.mad_hdr.mgmt_class, slave); + return; + } else + *slave_id = slave; + default: + /* nothing */; + } + + /* Class-specific handling */ + switch (tunnel->mad.mad_hdr.mgmt_class) { + case IB_MGMT_CLASS_SUBN_LID_ROUTED: + case IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE: + if (slave != mlx4_master_func_num(dev->dev) && + !mlx4_vf_smi_enabled(dev->dev, slave, ctx->port)) + return; + break; + case IB_MGMT_CLASS_SUBN_ADM: + if (mlx4_ib_multiplex_sa_handler(ctx->ib_dev, ctx->port, slave, + (struct 
ib_sa_mad *) &tunnel->mad)) + return; + break; + case IB_MGMT_CLASS_CM: + if (mlx4_ib_multiplex_cm_handler(ctx->ib_dev, ctx->port, slave, + (struct ib_mad *) &tunnel->mad)) + return; + break; + case IB_MGMT_CLASS_DEVICE_MGMT: + if (tunnel->mad.mad_hdr.method != IB_MGMT_METHOD_GET && + tunnel->mad.mad_hdr.method != IB_MGMT_METHOD_SET) + return; + break; + default: + /* Drop unsupported classes for slaves in tunnel mode */ + if (slave != mlx4_master_func_num(dev->dev)) { + mlx4_ib_warn(ctx->ib_dev, "dropping unsupported egress mad from class:%d " + "for slave:%d\n", tunnel->mad.mad_hdr.mgmt_class, slave); + return; + } + } + + /* We are using standard ib_core services to send the mad, so generate a + * stadard address handle by decoding the tunnelled mlx4_ah fields */ + memcpy(&ah.av, &tunnel->hdr.av, sizeof (struct mlx4_av)); + ah.ibah.device = ctx->ib_dev; + mlx4_ib_query_ah(&ah.ibah, &ah_attr); + if (ah_attr.ah_flags & IB_AH_GRH) + fill_in_real_sgid_index(dev, slave, ctx->port, &ah_attr); + + port = mlx4_slave_convert_port(dev->dev, slave, ah_attr.port_num); + if (port < 0) + return; + ah_attr.port_num = port; + memcpy(ah_attr.dmac, tunnel->hdr.mac, 6); + ah_attr.vlan_id = be16_to_cpu(tunnel->hdr.vlan); + /* if slave have default vlan use it */ + mlx4_get_slave_default_vlan(dev->dev, ctx->port, slave, + &ah_attr.vlan_id, &ah_attr.sl); + + mlx4_ib_send_to_wire(dev, slave, ctx->port, + is_proxy_qp0(dev, wc->src_qp, slave) ? + IB_QPT_SMI : IB_QPT_GSI, + be16_to_cpu(tunnel->hdr.pkey_index), + be32_to_cpu(tunnel->hdr.remote_qpn), + be32_to_cpu(tunnel->hdr.qkey), + &ah_attr, wc->smac, &tunnel->mad); +} + +static int mlx4_ib_alloc_pv_bufs(struct mlx4_ib_demux_pv_ctx *ctx, + enum ib_qp_type qp_type, int is_tun) +{ + int i; + struct mlx4_ib_demux_pv_qp *tun_qp; + int rx_buf_size, tx_buf_size; + + if (qp_type > IB_QPT_GSI) + return -EINVAL; + + tun_qp = &ctx->qp[qp_type]; + + tun_qp->ring = kzalloc(sizeof (struct mlx4_ib_buf) * MLX4_NUM_TUNNEL_BUFS, + GFP_KERNEL); + if (!tun_qp->ring) + return -ENOMEM; + + tun_qp->tx_ring = kcalloc(MLX4_NUM_TUNNEL_BUFS, + sizeof (struct mlx4_ib_tun_tx_buf), + GFP_KERNEL); + if (!tun_qp->tx_ring) { + kfree(tun_qp->ring); + tun_qp->ring = NULL; + return -ENOMEM; + } + + if (is_tun) { + rx_buf_size = sizeof (struct mlx4_tunnel_mad); + tx_buf_size = sizeof (struct mlx4_rcv_tunnel_mad); + } else { + rx_buf_size = sizeof (struct mlx4_mad_rcv_buf); + tx_buf_size = sizeof (struct mlx4_mad_snd_buf); + } + + for (i = 0; i < MLX4_NUM_TUNNEL_BUFS; i++) { + tun_qp->ring[i].addr = kmalloc(rx_buf_size, GFP_KERNEL); + if (!tun_qp->ring[i].addr) + goto err; + tun_qp->ring[i].map = ib_dma_map_single(ctx->ib_dev, + tun_qp->ring[i].addr, + rx_buf_size, + DMA_FROM_DEVICE); + if (ib_dma_mapping_error(ctx->ib_dev, tun_qp->ring[i].map)) { + kfree(tun_qp->ring[i].addr); + goto err; + } + } + + for (i = 0; i < MLX4_NUM_TUNNEL_BUFS; i++) { + tun_qp->tx_ring[i].buf.addr = + kmalloc(tx_buf_size, GFP_KERNEL); + if (!tun_qp->tx_ring[i].buf.addr) + goto tx_err; + tun_qp->tx_ring[i].buf.map = + ib_dma_map_single(ctx->ib_dev, + tun_qp->tx_ring[i].buf.addr, + tx_buf_size, + DMA_TO_DEVICE); + if (ib_dma_mapping_error(ctx->ib_dev, + tun_qp->tx_ring[i].buf.map)) { + kfree(tun_qp->tx_ring[i].buf.addr); + goto tx_err; + } + tun_qp->tx_ring[i].ah = NULL; + } + spin_lock_init(&tun_qp->tx_lock); + tun_qp->tx_ix_head = 0; + tun_qp->tx_ix_tail = 0; + tun_qp->proxy_qpt = qp_type; + + return 0; + +tx_err: + while (i > 0) { + --i; + ib_dma_unmap_single(ctx->ib_dev, tun_qp->tx_ring[i].buf.map, + tx_buf_size, 
DMA_TO_DEVICE); + kfree(tun_qp->tx_ring[i].buf.addr); + } + kfree(tun_qp->tx_ring); + tun_qp->tx_ring = NULL; + i = MLX4_NUM_TUNNEL_BUFS; +err: + while (i > 0) { + --i; + ib_dma_unmap_single(ctx->ib_dev, tun_qp->ring[i].map, + rx_buf_size, DMA_FROM_DEVICE); + kfree(tun_qp->ring[i].addr); + } + kfree(tun_qp->ring); + tun_qp->ring = NULL; + return -ENOMEM; +} + +static void mlx4_ib_free_pv_qp_bufs(struct mlx4_ib_demux_pv_ctx *ctx, + enum ib_qp_type qp_type, int is_tun) +{ + int i; + struct mlx4_ib_demux_pv_qp *tun_qp; + int rx_buf_size, tx_buf_size; + + if (qp_type > IB_QPT_GSI) + return; + + tun_qp = &ctx->qp[qp_type]; + if (is_tun) { + rx_buf_size = sizeof (struct mlx4_tunnel_mad); + tx_buf_size = sizeof (struct mlx4_rcv_tunnel_mad); + } else { + rx_buf_size = sizeof (struct mlx4_mad_rcv_buf); + tx_buf_size = sizeof (struct mlx4_mad_snd_buf); + } + + + for (i = 0; i < MLX4_NUM_TUNNEL_BUFS; i++) { + ib_dma_unmap_single(ctx->ib_dev, tun_qp->ring[i].map, + rx_buf_size, DMA_FROM_DEVICE); + kfree(tun_qp->ring[i].addr); + } + + for (i = 0; i < MLX4_NUM_TUNNEL_BUFS; i++) { + ib_dma_unmap_single(ctx->ib_dev, tun_qp->tx_ring[i].buf.map, + tx_buf_size, DMA_TO_DEVICE); + kfree(tun_qp->tx_ring[i].buf.addr); + if (tun_qp->tx_ring[i].ah) + ib_destroy_ah(tun_qp->tx_ring[i].ah); + } + kfree(tun_qp->tx_ring); + kfree(tun_qp->ring); +} + +static void mlx4_ib_tunnel_comp_worker(struct work_struct *work) +{ + struct mlx4_ib_demux_pv_ctx *ctx; + struct mlx4_ib_demux_pv_qp *tun_qp; + struct ib_wc wc; + int ret; + ctx = container_of(work, struct mlx4_ib_demux_pv_ctx, work); + ib_req_notify_cq(ctx->cq, IB_CQ_NEXT_COMP); + + while (ib_poll_cq(ctx->cq, 1, &wc) == 1) { + tun_qp = &ctx->qp[MLX4_TUN_WRID_QPN(wc.wr_id)]; + if (wc.status == IB_WC_SUCCESS) { + switch (wc.opcode) { + case IB_WC_RECV: + mlx4_ib_multiplex_mad(ctx, &wc); + ret = mlx4_ib_post_pv_qp_buf(ctx, tun_qp, + wc.wr_id & + (MLX4_NUM_TUNNEL_BUFS - 1)); + if (ret) + pr_err("Failed reposting tunnel " + "buf:%lld\n", wc.wr_id); + break; + case IB_WC_SEND: + pr_debug("received tunnel send completion:" + "wrid=0x%llx, status=0x%x\n", + wc.wr_id, wc.status); + ib_destroy_ah(tun_qp->tx_ring[wc.wr_id & + (MLX4_NUM_TUNNEL_BUFS - 1)].ah); + tun_qp->tx_ring[wc.wr_id & (MLX4_NUM_TUNNEL_BUFS - 1)].ah + = NULL; + spin_lock(&tun_qp->tx_lock); + tun_qp->tx_ix_tail++; + spin_unlock(&tun_qp->tx_lock); + + break; + default: + break; + } + } else { + pr_debug("mlx4_ib: completion error in tunnel: %d." + " status = %d, wrid = 0x%llx\n", + ctx->slave, wc.status, wc.wr_id); + if (!MLX4_TUN_IS_RECV(wc.wr_id)) { + ib_destroy_ah(tun_qp->tx_ring[wc.wr_id & + (MLX4_NUM_TUNNEL_BUFS - 1)].ah); + tun_qp->tx_ring[wc.wr_id & (MLX4_NUM_TUNNEL_BUFS - 1)].ah + = NULL; + spin_lock(&tun_qp->tx_lock); + tun_qp->tx_ix_tail++; + spin_unlock(&tun_qp->tx_lock); + } + } + } +} + +static void pv_qp_event_handler(struct ib_event *event, void *qp_context) +{ + struct mlx4_ib_demux_pv_ctx *sqp = qp_context; + + /* It's worse than that! He's dead, Jim! 
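+ * An asynchronous error on a para-virtual MAD QP is only reported
+ * here; no recovery is attempted.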
*/ + pr_err("Fatal error (%d) on a MAD QP on port %d\n", + event->event, sqp->port); +} + +static int create_pv_sqp(struct mlx4_ib_demux_pv_ctx *ctx, + enum ib_qp_type qp_type, int create_tun) +{ + int i, ret; + struct mlx4_ib_demux_pv_qp *tun_qp; + struct mlx4_ib_qp_tunnel_init_attr qp_init_attr; + struct ib_qp_attr attr; + int qp_attr_mask_INIT; + + if (qp_type > IB_QPT_GSI) + return -EINVAL; + + tun_qp = &ctx->qp[qp_type]; + + memset(&qp_init_attr, 0, sizeof qp_init_attr); + qp_init_attr.init_attr.send_cq = ctx->cq; + qp_init_attr.init_attr.recv_cq = ctx->cq; + qp_init_attr.init_attr.sq_sig_type = IB_SIGNAL_ALL_WR; + qp_init_attr.init_attr.cap.max_send_wr = MLX4_NUM_TUNNEL_BUFS; + qp_init_attr.init_attr.cap.max_recv_wr = MLX4_NUM_TUNNEL_BUFS; + qp_init_attr.init_attr.cap.max_send_sge = 1; + qp_init_attr.init_attr.cap.max_recv_sge = 1; + if (create_tun) { + qp_init_attr.init_attr.qp_type = IB_QPT_UD; + qp_init_attr.init_attr.create_flags = MLX4_IB_SRIOV_TUNNEL_QP; + qp_init_attr.port = ctx->port; + qp_init_attr.slave = ctx->slave; + qp_init_attr.proxy_qp_type = qp_type; + qp_attr_mask_INIT = IB_QP_STATE | IB_QP_PKEY_INDEX | + IB_QP_QKEY | IB_QP_PORT; + } else { + qp_init_attr.init_attr.qp_type = qp_type; + qp_init_attr.init_attr.create_flags = MLX4_IB_SRIOV_SQP; + qp_attr_mask_INIT = IB_QP_STATE | IB_QP_PKEY_INDEX | IB_QP_QKEY; + } + qp_init_attr.init_attr.port_num = ctx->port; + qp_init_attr.init_attr.qp_context = ctx; + qp_init_attr.init_attr.event_handler = pv_qp_event_handler; + tun_qp->qp = ib_create_qp(ctx->pd, &qp_init_attr.init_attr); + if (IS_ERR(tun_qp->qp)) { + ret = PTR_ERR(tun_qp->qp); + tun_qp->qp = NULL; + pr_err("Couldn't create %s QP (%d)\n", + create_tun ? "tunnel" : "special", ret); + return ret; + } + + memset(&attr, 0, sizeof attr); + attr.qp_state = IB_QPS_INIT; + ret = 0; + if (create_tun) + ret = find_slave_port_pkey_ix(to_mdev(ctx->ib_dev), ctx->slave, + ctx->port, IB_DEFAULT_PKEY_FULL, + &attr.pkey_index); + if (ret || !create_tun) + attr.pkey_index = + to_mdev(ctx->ib_dev)->pkeys.virt2phys_pkey[ctx->slave][ctx->port - 1][0]; + attr.qkey = IB_QP1_QKEY; + attr.port_num = ctx->port; + ret = ib_modify_qp(tun_qp->qp, &attr, qp_attr_mask_INIT); + if (ret) { + pr_err("Couldn't change %s qp state to INIT (%d)\n", + create_tun ? "tunnel" : "special", ret); + goto err_qp; + } + attr.qp_state = IB_QPS_RTR; + ret = ib_modify_qp(tun_qp->qp, &attr, IB_QP_STATE); + if (ret) { + pr_err("Couldn't change %s qp state to RTR (%d)\n", + create_tun ? "tunnel" : "special", ret); + goto err_qp; + } + attr.qp_state = IB_QPS_RTS; + attr.sq_psn = 0; + ret = ib_modify_qp(tun_qp->qp, &attr, IB_QP_STATE | IB_QP_SQ_PSN); + if (ret) { + pr_err("Couldn't change %s qp state to RTS (%d)\n", + create_tun ? 
"tunnel" : "special", ret); + goto err_qp; + } + + for (i = 0; i < MLX4_NUM_TUNNEL_BUFS; i++) { + ret = mlx4_ib_post_pv_qp_buf(ctx, tun_qp, i); + if (ret) { + pr_err(" mlx4_ib_post_pv_buf error" + " (err = %d, i = %d)\n", ret, i); + goto err_qp; + } + } + return 0; + +err_qp: + ib_destroy_qp(tun_qp->qp); + tun_qp->qp = NULL; + return ret; +} + +/* + * IB MAD completion callback for real SQPs + */ +static void mlx4_ib_sqp_comp_worker(struct work_struct *work) +{ + struct mlx4_ib_demux_pv_ctx *ctx; + struct mlx4_ib_demux_pv_qp *sqp; + struct ib_wc wc; + struct ib_grh *grh; + struct ib_mad *mad; + + ctx = container_of(work, struct mlx4_ib_demux_pv_ctx, work); + ib_req_notify_cq(ctx->cq, IB_CQ_NEXT_COMP); + + while (mlx4_ib_poll_cq(ctx->cq, 1, &wc) == 1) { + sqp = &ctx->qp[MLX4_TUN_WRID_QPN(wc.wr_id)]; + if (wc.status == IB_WC_SUCCESS) { + switch (wc.opcode) { + case IB_WC_SEND: + ib_destroy_ah(sqp->tx_ring[wc.wr_id & + (MLX4_NUM_TUNNEL_BUFS - 1)].ah); + sqp->tx_ring[wc.wr_id & (MLX4_NUM_TUNNEL_BUFS - 1)].ah + = NULL; + spin_lock(&sqp->tx_lock); + sqp->tx_ix_tail++; + spin_unlock(&sqp->tx_lock); + break; + case IB_WC_RECV: + mad = (struct ib_mad *) &(((struct mlx4_mad_rcv_buf *) + (sqp->ring[wc.wr_id & + (MLX4_NUM_TUNNEL_BUFS - 1)].addr))->payload); + grh = &(((struct mlx4_mad_rcv_buf *) + (sqp->ring[wc.wr_id & + (MLX4_NUM_TUNNEL_BUFS - 1)].addr))->grh); + mlx4_ib_demux_mad(ctx->ib_dev, ctx->port, &wc, grh, mad); + if (mlx4_ib_post_pv_qp_buf(ctx, sqp, wc.wr_id & + (MLX4_NUM_TUNNEL_BUFS - 1))) + pr_err("Failed reposting SQP " + "buf:%lld\n", wc.wr_id); + break; + default: + BUG_ON(1); + break; + } + } else { + pr_debug("mlx4_ib: completion error in tunnel: %d." + " status = %d, wrid = 0x%llx\n", + ctx->slave, wc.status, wc.wr_id); + if (!MLX4_TUN_IS_RECV(wc.wr_id)) { + ib_destroy_ah(sqp->tx_ring[wc.wr_id & + (MLX4_NUM_TUNNEL_BUFS - 1)].ah); + sqp->tx_ring[wc.wr_id & (MLX4_NUM_TUNNEL_BUFS - 1)].ah + = NULL; + spin_lock(&sqp->tx_lock); + sqp->tx_ix_tail++; + spin_unlock(&sqp->tx_lock); + } + } + } +} + +static int alloc_pv_object(struct mlx4_ib_dev *dev, int slave, int port, + struct mlx4_ib_demux_pv_ctx **ret_ctx) +{ + struct mlx4_ib_demux_pv_ctx *ctx; + + *ret_ctx = NULL; + ctx = kzalloc(sizeof (struct mlx4_ib_demux_pv_ctx), GFP_KERNEL); + if (!ctx) { + pr_err("failed allocating pv resource context " + "for port %d, slave %d\n", port, slave); + return -ENOMEM; + } + + ctx->ib_dev = &dev->ib_dev; + ctx->port = port; + ctx->slave = slave; + *ret_ctx = ctx; + return 0; +} + +static void free_pv_object(struct mlx4_ib_dev *dev, int slave, int port) +{ + if (dev->sriov.demux[port - 1].tun[slave]) { + kfree(dev->sriov.demux[port - 1].tun[slave]); + dev->sriov.demux[port - 1].tun[slave] = NULL; + } +} + +static int create_pv_resources(struct ib_device *ibdev, int slave, int port, + int create_tun, struct mlx4_ib_demux_pv_ctx *ctx) +{ + int ret, cq_size; + + if (ctx->state != DEMUX_PV_STATE_DOWN) + return -EEXIST; + + ctx->state = DEMUX_PV_STATE_STARTING; + /* have QP0 only if link layer is IB */ + if (rdma_port_get_link_layer(ibdev, ctx->port) == + IB_LINK_LAYER_INFINIBAND) + ctx->has_smi = 1; + + if (ctx->has_smi) { + ret = mlx4_ib_alloc_pv_bufs(ctx, IB_QPT_SMI, create_tun); + if (ret) { + pr_err("Failed allocating qp0 tunnel bufs (%d)\n", ret); + goto err_out; + } + } + + ret = mlx4_ib_alloc_pv_bufs(ctx, IB_QPT_GSI, create_tun); + if (ret) { + pr_err("Failed allocating qp1 tunnel bufs (%d)\n", ret); + goto err_out_qp0; + } + + cq_size = 2 * MLX4_NUM_TUNNEL_BUFS; + if (ctx->has_smi) + cq_size *= 2; + 
+ ctx->cq = ib_create_cq(ctx->ib_dev, mlx4_ib_tunnel_comp_handler, + NULL, ctx, cq_size, 0); + if (IS_ERR(ctx->cq)) { + ret = PTR_ERR(ctx->cq); + pr_err("Couldn't create tunnel CQ (%d)\n", ret); + goto err_buf; + } + + ctx->pd = ib_alloc_pd(ctx->ib_dev); + if (IS_ERR(ctx->pd)) { + ret = PTR_ERR(ctx->pd); + pr_err("Couldn't create tunnel PD (%d)\n", ret); + goto err_cq; + } + + ctx->mr = ib_get_dma_mr(ctx->pd, IB_ACCESS_LOCAL_WRITE); + if (IS_ERR(ctx->mr)) { + ret = PTR_ERR(ctx->mr); + pr_err("Couldn't get tunnel DMA MR (%d)\n", ret); + goto err_pd; + } + + if (ctx->has_smi) { + ret = create_pv_sqp(ctx, IB_QPT_SMI, create_tun); + if (ret) { + pr_err("Couldn't create %s QP0 (%d)\n", + create_tun ? "tunnel for" : "", ret); + goto err_mr; + } + } + + ret = create_pv_sqp(ctx, IB_QPT_GSI, create_tun); + if (ret) { + pr_err("Couldn't create %s QP1 (%d)\n", + create_tun ? "tunnel for" : "", ret); + goto err_qp0; + } + + if (create_tun) + INIT_WORK(&ctx->work, mlx4_ib_tunnel_comp_worker); + else + INIT_WORK(&ctx->work, mlx4_ib_sqp_comp_worker); + + ctx->wq = to_mdev(ibdev)->sriov.demux[port - 1].wq; + + ret = ib_req_notify_cq(ctx->cq, IB_CQ_NEXT_COMP); + if (ret) { + pr_err("Couldn't arm tunnel cq (%d)\n", ret); + goto err_wq; + } + ctx->state = DEMUX_PV_STATE_ACTIVE; + return 0; + +err_wq: + ctx->wq = NULL; + ib_destroy_qp(ctx->qp[1].qp); + ctx->qp[1].qp = NULL; + + +err_qp0: + if (ctx->has_smi) + ib_destroy_qp(ctx->qp[0].qp); + ctx->qp[0].qp = NULL; + +err_mr: + ib_dereg_mr(ctx->mr); + ctx->mr = NULL; + +err_pd: + ib_dealloc_pd(ctx->pd); + ctx->pd = NULL; + +err_cq: + ib_destroy_cq(ctx->cq); + ctx->cq = NULL; + +err_buf: + mlx4_ib_free_pv_qp_bufs(ctx, IB_QPT_GSI, create_tun); + +err_out_qp0: + if (ctx->has_smi) + mlx4_ib_free_pv_qp_bufs(ctx, IB_QPT_SMI, create_tun); +err_out: + ctx->state = DEMUX_PV_STATE_DOWN; + return ret; +} + +static void destroy_pv_resources(struct mlx4_ib_dev *dev, int slave, int port, + struct mlx4_ib_demux_pv_ctx *ctx, int flush) +{ + if (!ctx) + return; + if (ctx->state > DEMUX_PV_STATE_DOWN) { + ctx->state = DEMUX_PV_STATE_DOWNING; + if (flush) + flush_workqueue(ctx->wq); + if (ctx->has_smi) { + ib_destroy_qp(ctx->qp[0].qp); + ctx->qp[0].qp = NULL; + mlx4_ib_free_pv_qp_bufs(ctx, IB_QPT_SMI, 1); + } + ib_destroy_qp(ctx->qp[1].qp); + ctx->qp[1].qp = NULL; + mlx4_ib_free_pv_qp_bufs(ctx, IB_QPT_GSI, 1); + ib_dereg_mr(ctx->mr); + ctx->mr = NULL; + ib_dealloc_pd(ctx->pd); + ctx->pd = NULL; + ib_destroy_cq(ctx->cq); + ctx->cq = NULL; + ctx->state = DEMUX_PV_STATE_DOWN; + } +} + +static int mlx4_ib_tunnels_update(struct mlx4_ib_dev *dev, int slave, + int port, int do_init) +{ + int ret = 0; + + if (!do_init) { + clean_vf_mcast(&dev->sriov.demux[port - 1], slave); + /* for master, destroy real sqp resources */ + if (slave == mlx4_master_func_num(dev->dev)) + destroy_pv_resources(dev, slave, port, + dev->sriov.sqps[port - 1], 1); + /* destroy the tunnel qp resources */ + destroy_pv_resources(dev, slave, port, + dev->sriov.demux[port - 1].tun[slave], 1); + return 0; + } + + /* create the tunnel qp resources */ + ret = create_pv_resources(&dev->ib_dev, slave, port, 1, + dev->sriov.demux[port - 1].tun[slave]); + + /* for master, create the real sqp resources */ + if (!ret && slave == mlx4_master_func_num(dev->dev)) + ret = create_pv_resources(&dev->ib_dev, slave, port, 0, + dev->sriov.sqps[port - 1]); + return ret; +} + +void mlx4_ib_tunnels_update_work(struct work_struct *work) +{ + struct mlx4_ib_demux_work *dmxw; + + dmxw = container_of(work, struct mlx4_ib_demux_work, work); + 
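+	/* Build or tear down the para-virtual MAD resources for the
+	 * requested slave/port pair, then free the work item. */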
mlx4_ib_tunnels_update(dmxw->dev, dmxw->slave, (int) dmxw->port, + dmxw->do_init); + kfree(dmxw); + return; +} + +static int mlx4_ib_alloc_demux_ctx(struct mlx4_ib_dev *dev, + struct mlx4_ib_demux_ctx *ctx, + int port) +{ + char name[12]; + int ret = 0; + int i; + + ctx->tun = kcalloc(dev->dev->caps.sqp_demux, + sizeof (struct mlx4_ib_demux_pv_ctx *), GFP_KERNEL); + if (!ctx->tun) + return -ENOMEM; + + ctx->dev = dev; + ctx->port = port; + ctx->ib_dev = &dev->ib_dev; + + for (i = 0; + i < min(dev->dev->caps.sqp_demux, + (u16)(dev->dev->persist->num_vfs + 1)); + i++) { + struct mlx4_active_ports actv_ports = + mlx4_get_active_ports(dev->dev, i); + + if (!test_bit(port - 1, actv_ports.ports)) + continue; + + ret = alloc_pv_object(dev, i, port, &ctx->tun[i]); + if (ret) { + ret = -ENOMEM; + goto err_mcg; + } + } + + ret = mlx4_ib_mcg_port_init(ctx); + if (ret) { + pr_err("Failed initializing mcg para-virt (%d)\n", ret); + goto err_mcg; + } + + snprintf(name, sizeof name, "mlx4_ibt%d", port); + ctx->wq = create_singlethread_workqueue(name); + if (!ctx->wq) { + pr_err("Failed to create tunnelling WQ for port %d\n", port); + ret = -ENOMEM; + goto err_wq; + } + + snprintf(name, sizeof name, "mlx4_ibud%d", port); + ctx->ud_wq = create_singlethread_workqueue(name); + if (!ctx->ud_wq) { + pr_err("Failed to create up/down WQ for port %d\n", port); + ret = -ENOMEM; + goto err_udwq; + } + + return 0; + +err_udwq: + destroy_workqueue(ctx->wq); + ctx->wq = NULL; + +err_wq: + mlx4_ib_mcg_port_cleanup(ctx, 1); +err_mcg: + for (i = 0; i < dev->dev->caps.sqp_demux; i++) + free_pv_object(dev, i, port); + kfree(ctx->tun); + ctx->tun = NULL; + return ret; +} + +static void mlx4_ib_free_sqp_ctx(struct mlx4_ib_demux_pv_ctx *sqp_ctx) +{ + if (sqp_ctx->state > DEMUX_PV_STATE_DOWN) { + sqp_ctx->state = DEMUX_PV_STATE_DOWNING; + flush_workqueue(sqp_ctx->wq); + if (sqp_ctx->has_smi) { + ib_destroy_qp(sqp_ctx->qp[0].qp); + sqp_ctx->qp[0].qp = NULL; + mlx4_ib_free_pv_qp_bufs(sqp_ctx, IB_QPT_SMI, 0); + } + ib_destroy_qp(sqp_ctx->qp[1].qp); + sqp_ctx->qp[1].qp = NULL; + mlx4_ib_free_pv_qp_bufs(sqp_ctx, IB_QPT_GSI, 0); + ib_dereg_mr(sqp_ctx->mr); + sqp_ctx->mr = NULL; + ib_dealloc_pd(sqp_ctx->pd); + sqp_ctx->pd = NULL; + ib_destroy_cq(sqp_ctx->cq); + sqp_ctx->cq = NULL; + sqp_ctx->state = DEMUX_PV_STATE_DOWN; + } +} + +static void mlx4_ib_free_demux_ctx(struct mlx4_ib_demux_ctx *ctx) +{ + int i; + if (ctx) { + struct mlx4_ib_dev *dev = to_mdev(ctx->ib_dev); + mlx4_ib_mcg_port_cleanup(ctx, 1); + for (i = 0; i < dev->dev->caps.sqp_demux; i++) { + if (!ctx->tun[i]) + continue; + if (ctx->tun[i]->state > DEMUX_PV_STATE_DOWN) + ctx->tun[i]->state = DEMUX_PV_STATE_DOWNING; + } + flush_workqueue(ctx->wq); + for (i = 0; i < dev->dev->caps.sqp_demux; i++) { + destroy_pv_resources(dev, i, ctx->port, ctx->tun[i], 0); + free_pv_object(dev, i, ctx->port); + } + kfree(ctx->tun); + destroy_workqueue(ctx->ud_wq); + destroy_workqueue(ctx->wq); + } +} + +static void mlx4_ib_master_tunnels(struct mlx4_ib_dev *dev, int do_init) +{ + int i; + + if (!mlx4_is_master(dev->dev)) + return; + /* initialize or tear down tunnel QPs for the master */ + for (i = 0; i < dev->dev->caps.num_ports; i++) + mlx4_ib_tunnels_update(dev, mlx4_master_func_num(dev->dev), i + 1, do_init); + return; +} + +int mlx4_ib_init_sriov(struct mlx4_ib_dev *dev) +{ + int i = 0; + int err; + + if (!mlx4_is_mfunc(dev->dev)) + return 0; + + dev->sriov.is_going_down = 0; + spin_lock_init(&dev->sriov.going_down_lock); + mlx4_ib_cm_paravirt_init(dev); + + mlx4_ib_warn(&dev->ib_dev, 
"multi-function enabled\n"); + + if (mlx4_is_slave(dev->dev)) { + mlx4_ib_warn(&dev->ib_dev, "operating in qp1 tunnel mode\n"); + return 0; + } + + for (i = 0; i < dev->dev->caps.sqp_demux; i++) { + if (i == mlx4_master_func_num(dev->dev)) + mlx4_put_slave_node_guid(dev->dev, i, dev->ib_dev.node_guid); + else + mlx4_put_slave_node_guid(dev->dev, i, mlx4_ib_gen_node_guid()); + } + + err = mlx4_ib_init_alias_guid_service(dev); + if (err) { + mlx4_ib_warn(&dev->ib_dev, "Failed init alias guid process.\n"); + goto paravirt_err; + } + err = mlx4_ib_device_register_sysfs(dev); + if (err) { + mlx4_ib_warn(&dev->ib_dev, "Failed to register sysfs\n"); + goto sysfs_err; + } + + mlx4_ib_warn(&dev->ib_dev, "initializing demux service for %d qp1 clients\n", + dev->dev->caps.sqp_demux); + for (i = 0; i < dev->num_ports; i++) { + union ib_gid gid; + err = __mlx4_ib_query_gid(&dev->ib_dev, i + 1, 0, &gid, 1); + if (err) + goto demux_err; + dev->sriov.demux[i].guid_cache[0] = gid.global.interface_id; + err = alloc_pv_object(dev, mlx4_master_func_num(dev->dev), i + 1, + &dev->sriov.sqps[i]); + if (err) + goto demux_err; + err = mlx4_ib_alloc_demux_ctx(dev, &dev->sriov.demux[i], i + 1); + if (err) + goto free_pv; + } + mlx4_ib_master_tunnels(dev, 1); + return 0; + +free_pv: + free_pv_object(dev, mlx4_master_func_num(dev->dev), i + 1); +demux_err: + while (--i >= 0) { + free_pv_object(dev, mlx4_master_func_num(dev->dev), i + 1); + mlx4_ib_free_demux_ctx(&dev->sriov.demux[i]); + } + mlx4_ib_device_unregister_sysfs(dev); + +sysfs_err: + mlx4_ib_destroy_alias_guid_service(dev); + +paravirt_err: + mlx4_ib_cm_paravirt_clean(dev, -1); + + return err; +} + +void mlx4_ib_close_sriov(struct mlx4_ib_dev *dev) +{ + int i; + unsigned long flags; + + if (!mlx4_is_mfunc(dev->dev)) + return; + + spin_lock_irqsave(&dev->sriov.going_down_lock, flags); + dev->sriov.is_going_down = 1; + spin_unlock_irqrestore(&dev->sriov.going_down_lock, flags); + if (mlx4_is_master(dev->dev)) { + for (i = 0; i < dev->num_ports; i++) { + flush_workqueue(dev->sriov.demux[i].ud_wq); + mlx4_ib_free_sqp_ctx(dev->sriov.sqps[i]); + kfree(dev->sriov.sqps[i]); + dev->sriov.sqps[i] = NULL; + mlx4_ib_free_demux_ctx(&dev->sriov.demux[i]); + } + + mlx4_ib_cm_paravirt_clean(dev, -1); + mlx4_ib_destroy_alias_guid_service(dev); + mlx4_ib_device_unregister_sysfs(dev); + } +} diff --git a/kernel/drivers/infiniband/hw/mlx4/main.c b/kernel/drivers/infiniband/hw/mlx4/main.c new file mode 100644 index 000000000..cc64400d4 --- /dev/null +++ b/kernel/drivers/infiniband/hw/mlx4/main.c @@ -0,0 +1,2874 @@ +/* + * Copyright (c) 2006, 2007 Cisco Systems, Inc. All rights reserved. + * Copyright (c) 2007, 2008 Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#include +#include +#include + +#include "mlx4_ib.h" +#include "user.h" + +#define DRV_NAME MLX4_IB_DRV_NAME +#define DRV_VERSION "2.2-1" +#define DRV_RELDATE "Feb 2014" + +#define MLX4_IB_FLOW_MAX_PRIO 0xFFF +#define MLX4_IB_FLOW_QPN_MASK 0xFFFFFF +#define MLX4_IB_CARD_REV_A0 0xA0 + +MODULE_AUTHOR("Roland Dreier"); +MODULE_DESCRIPTION("Mellanox ConnectX HCA InfiniBand driver"); +MODULE_LICENSE("Dual BSD/GPL"); +MODULE_VERSION(DRV_VERSION); + +int mlx4_ib_sm_guid_assign = 0; +module_param_named(sm_guid_assign, mlx4_ib_sm_guid_assign, int, 0444); +MODULE_PARM_DESC(sm_guid_assign, "Enable SM alias_GUID assignment if sm_guid_assign > 0 (Default: 0)"); + +static const char mlx4_ib_version[] = + DRV_NAME ": Mellanox ConnectX InfiniBand driver v" + DRV_VERSION " (" DRV_RELDATE ")\n"; + +struct update_gid_work { + struct work_struct work; + union ib_gid gids[128]; + struct mlx4_ib_dev *dev; + int port; +}; + +static void do_slave_init(struct mlx4_ib_dev *ibdev, int slave, int do_init); + +static struct workqueue_struct *wq; + +static void init_query_mad(struct ib_smp *mad) +{ + mad->base_version = 1; + mad->mgmt_class = IB_MGMT_CLASS_SUBN_LID_ROUTED; + mad->class_version = 1; + mad->method = IB_MGMT_METHOD_GET; +} + +static union ib_gid zgid; + +static int check_flow_steering_support(struct mlx4_dev *dev) +{ + int eth_num_ports = 0; + int ib_num_ports = 0; + + int dmfs = dev->caps.steering_mode == MLX4_STEERING_MODE_DEVICE_MANAGED; + + if (dmfs) { + int i; + mlx4_foreach_port(i, dev, MLX4_PORT_TYPE_ETH) + eth_num_ports++; + mlx4_foreach_port(i, dev, MLX4_PORT_TYPE_IB) + ib_num_ports++; + dmfs &= (!ib_num_ports || + (dev->caps.flags2 & MLX4_DEV_CAP_FLAG2_DMFS_IPOIB)) && + (!eth_num_ports || + (dev->caps.flags2 & MLX4_DEV_CAP_FLAG2_FS_EN)); + if (ib_num_ports && mlx4_is_mfunc(dev)) { + pr_warn("Device managed flow steering is unavailable for IB port in multifunction env.\n"); + dmfs = 0; + } + } + return dmfs; +} + +static int num_ib_ports(struct mlx4_dev *dev) +{ + int ib_ports = 0; + int i; + + mlx4_foreach_port(i, dev, MLX4_PORT_TYPE_IB) + ib_ports++; + + return ib_ports; +} + +static int mlx4_ib_query_device(struct ib_device *ibdev, + struct ib_device_attr *props) +{ + struct mlx4_ib_dev *dev = to_mdev(ibdev); + struct ib_smp *in_mad = NULL; + struct ib_smp *out_mad = NULL; + int err = -ENOMEM; + int have_ib_ports; + + in_mad = kzalloc(sizeof *in_mad, GFP_KERNEL); + out_mad = kmalloc(sizeof *out_mad, GFP_KERNEL); + if (!in_mad || !out_mad) + goto out; + + init_query_mad(in_mad); + in_mad->attr_id = IB_SMP_ATTR_NODE_INFO; + + err = mlx4_MAD_IFC(to_mdev(ibdev), MLX4_MAD_IFC_IGNORE_KEYS, + 1, NULL, NULL, in_mad, out_mad); + if (err) + goto out; + + memset(props, 0, sizeof *props); + + have_ib_ports = num_ib_ports(dev->dev); + + props->fw_ver = dev->dev->caps.fw_ver; + props->device_cap_flags = IB_DEVICE_CHANGE_PHY_PORT | + IB_DEVICE_PORT_ACTIVE_EVENT | + 
IB_DEVICE_SYS_IMAGE_GUID | + IB_DEVICE_RC_RNR_NAK_GEN | + IB_DEVICE_BLOCK_MULTICAST_LOOPBACK; + if (dev->dev->caps.flags & MLX4_DEV_CAP_FLAG_BAD_PKEY_CNTR) + props->device_cap_flags |= IB_DEVICE_BAD_PKEY_CNTR; + if (dev->dev->caps.flags & MLX4_DEV_CAP_FLAG_BAD_QKEY_CNTR) + props->device_cap_flags |= IB_DEVICE_BAD_QKEY_CNTR; + if (dev->dev->caps.flags & MLX4_DEV_CAP_FLAG_APM && have_ib_ports) + props->device_cap_flags |= IB_DEVICE_AUTO_PATH_MIG; + if (dev->dev->caps.flags & MLX4_DEV_CAP_FLAG_UD_AV_PORT) + props->device_cap_flags |= IB_DEVICE_UD_AV_PORT_ENFORCE; + if (dev->dev->caps.flags & MLX4_DEV_CAP_FLAG_IPOIB_CSUM) + props->device_cap_flags |= IB_DEVICE_UD_IP_CSUM; + if (dev->dev->caps.max_gso_sz && + (dev->dev->rev_id != MLX4_IB_CARD_REV_A0) && + (dev->dev->caps.flags & MLX4_DEV_CAP_FLAG_BLH)) + props->device_cap_flags |= IB_DEVICE_UD_TSO; + if (dev->dev->caps.bmme_flags & MLX4_BMME_FLAG_RESERVED_LKEY) + props->device_cap_flags |= IB_DEVICE_LOCAL_DMA_LKEY; + if ((dev->dev->caps.bmme_flags & MLX4_BMME_FLAG_LOCAL_INV) && + (dev->dev->caps.bmme_flags & MLX4_BMME_FLAG_REMOTE_INV) && + (dev->dev->caps.bmme_flags & MLX4_BMME_FLAG_FAST_REG_WR)) + props->device_cap_flags |= IB_DEVICE_MEM_MGT_EXTENSIONS; + if (dev->dev->caps.flags & MLX4_DEV_CAP_FLAG_XRC) + props->device_cap_flags |= IB_DEVICE_XRC; + if (dev->dev->caps.flags & MLX4_DEV_CAP_FLAG_MEM_WINDOW) + props->device_cap_flags |= IB_DEVICE_MEM_WINDOW; + if (dev->dev->caps.bmme_flags & MLX4_BMME_FLAG_TYPE_2_WIN) { + if (dev->dev->caps.bmme_flags & MLX4_BMME_FLAG_WIN_TYPE_2B) + props->device_cap_flags |= IB_DEVICE_MEM_WINDOW_TYPE_2B; + else + props->device_cap_flags |= IB_DEVICE_MEM_WINDOW_TYPE_2A; + if (dev->steering_support == MLX4_STEERING_MODE_DEVICE_MANAGED) + props->device_cap_flags |= IB_DEVICE_MANAGED_FLOW_STEERING; + } + + props->vendor_id = be32_to_cpup((__be32 *) (out_mad->data + 36)) & + 0xffffff; + props->vendor_part_id = dev->dev->persist->pdev->device; + props->hw_ver = be32_to_cpup((__be32 *) (out_mad->data + 32)); + memcpy(&props->sys_image_guid, out_mad->data + 4, 8); + + props->max_mr_size = ~0ull; + props->page_size_cap = dev->dev->caps.page_size_cap; + props->max_qp = dev->dev->quotas.qp; + props->max_qp_wr = dev->dev->caps.max_wqes - MLX4_IB_SQ_MAX_SPARE; + props->max_sge = min(dev->dev->caps.max_sq_sg, + dev->dev->caps.max_rq_sg); + props->max_cq = dev->dev->quotas.cq; + props->max_cqe = dev->dev->caps.max_cqes; + props->max_mr = dev->dev->quotas.mpt; + props->max_pd = dev->dev->caps.num_pds - dev->dev->caps.reserved_pds; + props->max_qp_rd_atom = dev->dev->caps.max_qp_dest_rdma; + props->max_qp_init_rd_atom = dev->dev->caps.max_qp_init_rdma; + props->max_res_rd_atom = props->max_qp_rd_atom * props->max_qp; + props->max_srq = dev->dev->quotas.srq; + props->max_srq_wr = dev->dev->caps.max_srq_wqes - 1; + props->max_srq_sge = dev->dev->caps.max_srq_sge; + props->max_fast_reg_page_list_len = MLX4_MAX_FAST_REG_PAGES; + props->local_ca_ack_delay = dev->dev->caps.local_ca_ack_delay; + props->atomic_cap = dev->dev->caps.flags & MLX4_DEV_CAP_FLAG_ATOMIC ? 
+ IB_ATOMIC_HCA : IB_ATOMIC_NONE; + props->masked_atomic_cap = props->atomic_cap; + props->max_pkeys = dev->dev->caps.pkey_table_len[1]; + props->max_mcast_grp = dev->dev->caps.num_mgms + dev->dev->caps.num_amgms; + props->max_mcast_qp_attach = dev->dev->caps.num_qp_per_mgm; + props->max_total_mcast_qp_attach = props->max_mcast_qp_attach * + props->max_mcast_grp; + props->max_map_per_fmr = dev->dev->caps.max_fmr_maps; + +out: + kfree(in_mad); + kfree(out_mad); + + return err; +} + +static enum rdma_link_layer +mlx4_ib_port_link_layer(struct ib_device *device, u8 port_num) +{ + struct mlx4_dev *dev = to_mdev(device)->dev; + + return dev->caps.port_mask[port_num] == MLX4_PORT_TYPE_IB ? + IB_LINK_LAYER_INFINIBAND : IB_LINK_LAYER_ETHERNET; +} + +static int ib_link_query_port(struct ib_device *ibdev, u8 port, + struct ib_port_attr *props, int netw_view) +{ + struct ib_smp *in_mad = NULL; + struct ib_smp *out_mad = NULL; + int ext_active_speed; + int mad_ifc_flags = MLX4_MAD_IFC_IGNORE_KEYS; + int err = -ENOMEM; + + in_mad = kzalloc(sizeof *in_mad, GFP_KERNEL); + out_mad = kmalloc(sizeof *out_mad, GFP_KERNEL); + if (!in_mad || !out_mad) + goto out; + + init_query_mad(in_mad); + in_mad->attr_id = IB_SMP_ATTR_PORT_INFO; + in_mad->attr_mod = cpu_to_be32(port); + + if (mlx4_is_mfunc(to_mdev(ibdev)->dev) && netw_view) + mad_ifc_flags |= MLX4_MAD_IFC_NET_VIEW; + + err = mlx4_MAD_IFC(to_mdev(ibdev), mad_ifc_flags, port, NULL, NULL, + in_mad, out_mad); + if (err) + goto out; + + + props->lid = be16_to_cpup((__be16 *) (out_mad->data + 16)); + props->lmc = out_mad->data[34] & 0x7; + props->sm_lid = be16_to_cpup((__be16 *) (out_mad->data + 18)); + props->sm_sl = out_mad->data[36] & 0xf; + props->state = out_mad->data[32] & 0xf; + props->phys_state = out_mad->data[33] >> 4; + props->port_cap_flags = be32_to_cpup((__be32 *) (out_mad->data + 20)); + if (netw_view) + props->gid_tbl_len = out_mad->data[50]; + else + props->gid_tbl_len = to_mdev(ibdev)->dev->caps.gid_table_len[port]; + props->max_msg_sz = to_mdev(ibdev)->dev->caps.max_msg_sz; + props->pkey_tbl_len = to_mdev(ibdev)->dev->caps.pkey_table_len[port]; + props->bad_pkey_cntr = be16_to_cpup((__be16 *) (out_mad->data + 46)); + props->qkey_viol_cntr = be16_to_cpup((__be16 *) (out_mad->data + 48)); + props->active_width = out_mad->data[31] & 0xf; + props->active_speed = out_mad->data[35] >> 4; + props->max_mtu = out_mad->data[41] & 0xf; + props->active_mtu = out_mad->data[36] >> 4; + props->subnet_timeout = out_mad->data[51] & 0x1f; + props->max_vl_num = out_mad->data[37] >> 4; + props->init_type_reply = out_mad->data[41] >> 4; + + /* Check if extended speeds (EDR/FDR/...) are supported */ + if (props->port_cap_flags & IB_PORT_EXTENDED_SPEEDS_SUP) { + ext_active_speed = out_mad->data[62] >> 4; + + switch (ext_active_speed) { + case 1: + props->active_speed = IB_SPEED_FDR; + break; + case 2: + props->active_speed = IB_SPEED_EDR; + break; + } + } + + /* If reported active speed is QDR, check if is FDR-10 */ + if (props->active_speed == IB_SPEED_QDR) { + init_query_mad(in_mad); + in_mad->attr_id = MLX4_ATTR_EXTENDED_PORT_INFO; + in_mad->attr_mod = cpu_to_be32(port); + + err = mlx4_MAD_IFC(to_mdev(ibdev), mad_ifc_flags, port, + NULL, NULL, in_mad, out_mad); + if (err) + goto out; + + /* Checking LinkSpeedActive for FDR-10 */ + if (out_mad->data[15] & 0x1) + props->active_speed = IB_SPEED_FDR10; + } + + /* Avoid wrong speed value returned by FW if the IB link is down. 
*/ + if (props->state == IB_PORT_DOWN) + props->active_speed = IB_SPEED_SDR; + +out: + kfree(in_mad); + kfree(out_mad); + return err; +} + +static u8 state_to_phys_state(enum ib_port_state state) +{ + return state == IB_PORT_ACTIVE ? 5 : 3; +} + +static int eth_link_query_port(struct ib_device *ibdev, u8 port, + struct ib_port_attr *props, int netw_view) +{ + + struct mlx4_ib_dev *mdev = to_mdev(ibdev); + struct mlx4_ib_iboe *iboe = &mdev->iboe; + struct net_device *ndev; + enum ib_mtu tmp; + struct mlx4_cmd_mailbox *mailbox; + int err = 0; + int is_bonded = mlx4_is_bonded(mdev->dev); + + mailbox = mlx4_alloc_cmd_mailbox(mdev->dev); + if (IS_ERR(mailbox)) + return PTR_ERR(mailbox); + + err = mlx4_cmd_box(mdev->dev, 0, mailbox->dma, port, 0, + MLX4_CMD_QUERY_PORT, MLX4_CMD_TIME_CLASS_B, + MLX4_CMD_WRAPPED); + if (err) + goto out; + + props->active_width = (((u8 *)mailbox->buf)[5] == 0x40) ? + IB_WIDTH_4X : IB_WIDTH_1X; + props->active_speed = IB_SPEED_QDR; + props->port_cap_flags = IB_PORT_CM_SUP | IB_PORT_IP_BASED_GIDS; + props->gid_tbl_len = mdev->dev->caps.gid_table_len[port]; + props->max_msg_sz = mdev->dev->caps.max_msg_sz; + props->pkey_tbl_len = 1; + props->max_mtu = IB_MTU_4096; + props->max_vl_num = 2; + props->state = IB_PORT_DOWN; + props->phys_state = state_to_phys_state(props->state); + props->active_mtu = IB_MTU_256; + if (is_bonded) + rtnl_lock(); /* required to get upper dev */ + spin_lock_bh(&iboe->lock); + ndev = iboe->netdevs[port - 1]; + if (ndev && is_bonded) + ndev = netdev_master_upper_dev_get(ndev); + if (!ndev) + goto out_unlock; + + tmp = iboe_get_mtu(ndev->mtu); + props->active_mtu = tmp ? min(props->max_mtu, tmp) : IB_MTU_256; + + props->state = (netif_running(ndev) && netif_carrier_ok(ndev)) ? + IB_PORT_ACTIVE : IB_PORT_DOWN; + props->phys_state = state_to_phys_state(props->state); +out_unlock: + spin_unlock_bh(&iboe->lock); + if (is_bonded) + rtnl_unlock(); +out: + mlx4_free_cmd_mailbox(mdev->dev, mailbox); + return err; +} + +int __mlx4_ib_query_port(struct ib_device *ibdev, u8 port, + struct ib_port_attr *props, int netw_view) +{ + int err; + + memset(props, 0, sizeof *props); + + err = mlx4_ib_port_link_layer(ibdev, port) == IB_LINK_LAYER_INFINIBAND ? 
+ ib_link_query_port(ibdev, port, props, netw_view) : + eth_link_query_port(ibdev, port, props, netw_view); + + return err; +} + +static int mlx4_ib_query_port(struct ib_device *ibdev, u8 port, + struct ib_port_attr *props) +{ + /* returns host view */ + return __mlx4_ib_query_port(ibdev, port, props, 0); +} + +int __mlx4_ib_query_gid(struct ib_device *ibdev, u8 port, int index, + union ib_gid *gid, int netw_view) +{ + struct ib_smp *in_mad = NULL; + struct ib_smp *out_mad = NULL; + int err = -ENOMEM; + struct mlx4_ib_dev *dev = to_mdev(ibdev); + int clear = 0; + int mad_ifc_flags = MLX4_MAD_IFC_IGNORE_KEYS; + + in_mad = kzalloc(sizeof *in_mad, GFP_KERNEL); + out_mad = kmalloc(sizeof *out_mad, GFP_KERNEL); + if (!in_mad || !out_mad) + goto out; + + init_query_mad(in_mad); + in_mad->attr_id = IB_SMP_ATTR_PORT_INFO; + in_mad->attr_mod = cpu_to_be32(port); + + if (mlx4_is_mfunc(dev->dev) && netw_view) + mad_ifc_flags |= MLX4_MAD_IFC_NET_VIEW; + + err = mlx4_MAD_IFC(dev, mad_ifc_flags, port, NULL, NULL, in_mad, out_mad); + if (err) + goto out; + + memcpy(gid->raw, out_mad->data + 8, 8); + + if (mlx4_is_mfunc(dev->dev) && !netw_view) { + if (index) { + /* For any index > 0, return the null guid */ + err = 0; + clear = 1; + goto out; + } + } + + init_query_mad(in_mad); + in_mad->attr_id = IB_SMP_ATTR_GUID_INFO; + in_mad->attr_mod = cpu_to_be32(index / 8); + + err = mlx4_MAD_IFC(dev, mad_ifc_flags, port, + NULL, NULL, in_mad, out_mad); + if (err) + goto out; + + memcpy(gid->raw + 8, out_mad->data + (index % 8) * 8, 8); + +out: + if (clear) + memset(gid->raw + 8, 0, 8); + kfree(in_mad); + kfree(out_mad); + return err; +} + +static int iboe_query_gid(struct ib_device *ibdev, u8 port, int index, + union ib_gid *gid) +{ + struct mlx4_ib_dev *dev = to_mdev(ibdev); + + *gid = dev->iboe.gid_table[port - 1][index]; + + return 0; +} + +static int mlx4_ib_query_gid(struct ib_device *ibdev, u8 port, int index, + union ib_gid *gid) +{ + if (rdma_port_get_link_layer(ibdev, port) == IB_LINK_LAYER_INFINIBAND) + return __mlx4_ib_query_gid(ibdev, port, index, gid, 0); + else + return iboe_query_gid(ibdev, port, index, gid); +} + +int __mlx4_ib_query_pkey(struct ib_device *ibdev, u8 port, u16 index, + u16 *pkey, int netw_view) +{ + struct ib_smp *in_mad = NULL; + struct ib_smp *out_mad = NULL; + int mad_ifc_flags = MLX4_MAD_IFC_IGNORE_KEYS; + int err = -ENOMEM; + + in_mad = kzalloc(sizeof *in_mad, GFP_KERNEL); + out_mad = kmalloc(sizeof *out_mad, GFP_KERNEL); + if (!in_mad || !out_mad) + goto out; + + init_query_mad(in_mad); + in_mad->attr_id = IB_SMP_ATTR_PKEY_TABLE; + in_mad->attr_mod = cpu_to_be32(index / 32); + + if (mlx4_is_mfunc(to_mdev(ibdev)->dev) && netw_view) + mad_ifc_flags |= MLX4_MAD_IFC_NET_VIEW; + + err = mlx4_MAD_IFC(to_mdev(ibdev), mad_ifc_flags, port, NULL, NULL, + in_mad, out_mad); + if (err) + goto out; + + *pkey = be16_to_cpu(((__be16 *) out_mad->data)[index % 32]); + +out: + kfree(in_mad); + kfree(out_mad); + return err; +} + +static int mlx4_ib_query_pkey(struct ib_device *ibdev, u8 port, u16 index, u16 *pkey) +{ + return __mlx4_ib_query_pkey(ibdev, port, index, pkey, 0); +} + +static int mlx4_ib_modify_device(struct ib_device *ibdev, int mask, + struct ib_device_modify *props) +{ + struct mlx4_cmd_mailbox *mailbox; + unsigned long flags; + + if (mask & ~IB_DEVICE_MODIFY_NODE_DESC) + return -EOPNOTSUPP; + + if (!(mask & IB_DEVICE_MODIFY_NODE_DESC)) + return 0; + + if (mlx4_is_slave(to_mdev(ibdev)->dev)) + return -EOPNOTSUPP; + + spin_lock_irqsave(&to_mdev(ibdev)->sm_lock, flags); + 
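+ /*
+  * Update the cached node description while holding sm_lock; the code
+  * below then tries to push it to firmware as well, ignoring failures.
+  */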
memcpy(ibdev->node_desc, props->node_desc, 64); + spin_unlock_irqrestore(&to_mdev(ibdev)->sm_lock, flags); + + /* + * If possible, pass node desc to FW, so it can generate + * a 144 trap. If cmd fails, just ignore. + */ + mailbox = mlx4_alloc_cmd_mailbox(to_mdev(ibdev)->dev); + if (IS_ERR(mailbox)) + return 0; + + memcpy(mailbox->buf, props->node_desc, 64); + mlx4_cmd(to_mdev(ibdev)->dev, mailbox->dma, 1, 0, + MLX4_CMD_SET_NODE, MLX4_CMD_TIME_CLASS_A, MLX4_CMD_NATIVE); + + mlx4_free_cmd_mailbox(to_mdev(ibdev)->dev, mailbox); + + return 0; +} + +static int mlx4_ib_SET_PORT(struct mlx4_ib_dev *dev, u8 port, int reset_qkey_viols, + u32 cap_mask) +{ + struct mlx4_cmd_mailbox *mailbox; + int err; + + mailbox = mlx4_alloc_cmd_mailbox(dev->dev); + if (IS_ERR(mailbox)) + return PTR_ERR(mailbox); + + if (dev->dev->flags & MLX4_FLAG_OLD_PORT_CMDS) { + *(u8 *) mailbox->buf = !!reset_qkey_viols << 6; + ((__be32 *) mailbox->buf)[2] = cpu_to_be32(cap_mask); + } else { + ((u8 *) mailbox->buf)[3] = !!reset_qkey_viols; + ((__be32 *) mailbox->buf)[1] = cpu_to_be32(cap_mask); + } + + err = mlx4_cmd(dev->dev, mailbox->dma, port, MLX4_SET_PORT_IB_OPCODE, + MLX4_CMD_SET_PORT, MLX4_CMD_TIME_CLASS_B, + MLX4_CMD_WRAPPED); + + mlx4_free_cmd_mailbox(dev->dev, mailbox); + return err; +} + +static int mlx4_ib_modify_port(struct ib_device *ibdev, u8 port, int mask, + struct ib_port_modify *props) +{ + struct mlx4_ib_dev *mdev = to_mdev(ibdev); + u8 is_eth = mdev->dev->caps.port_type[port] == MLX4_PORT_TYPE_ETH; + struct ib_port_attr attr; + u32 cap_mask; + int err; + + /* return OK if this is RoCE. CM calls ib_modify_port() regardless + * of whether port link layer is ETH or IB. For ETH ports, qkey + * violations and port capabilities are not meaningful. + */ + if (is_eth) + return 0; + + mutex_lock(&mdev->cap_mask_mutex); + + err = mlx4_ib_query_port(ibdev, port, &attr); + if (err) + goto out; + + cap_mask = (attr.port_cap_flags | props->set_port_cap_mask) & + ~props->clr_port_cap_mask; + + err = mlx4_ib_SET_PORT(mdev, port, + !!(mask & IB_PORT_RESET_QKEY_CNTR), + cap_mask); + +out: + mutex_unlock(&to_mdev(ibdev)->cap_mask_mutex); + return err; +} + +static struct ib_ucontext *mlx4_ib_alloc_ucontext(struct ib_device *ibdev, + struct ib_udata *udata) +{ + struct mlx4_ib_dev *dev = to_mdev(ibdev); + struct mlx4_ib_ucontext *context; + struct mlx4_ib_alloc_ucontext_resp_v3 resp_v3; + struct mlx4_ib_alloc_ucontext_resp resp; + int err; + + if (!dev->ib_active) + return ERR_PTR(-EAGAIN); + + if (ibdev->uverbs_abi_ver == MLX4_IB_UVERBS_NO_DEV_CAPS_ABI_VERSION) { + resp_v3.qp_tab_size = dev->dev->caps.num_qps; + resp_v3.bf_reg_size = dev->dev->caps.bf_reg_size; + resp_v3.bf_regs_per_page = dev->dev->caps.bf_regs_per_page; + } else { + resp.dev_caps = dev->dev->caps.userspace_caps; + resp.qp_tab_size = dev->dev->caps.num_qps; + resp.bf_reg_size = dev->dev->caps.bf_reg_size; + resp.bf_regs_per_page = dev->dev->caps.bf_regs_per_page; + resp.cqe_size = dev->dev->caps.cqe_size; + } + + context = kmalloc(sizeof *context, GFP_KERNEL); + if (!context) + return ERR_PTR(-ENOMEM); + + err = mlx4_uar_alloc(to_mdev(ibdev)->dev, &context->uar); + if (err) { + kfree(context); + return ERR_PTR(err); + } + + INIT_LIST_HEAD(&context->db_page_list); + mutex_init(&context->db_page_mutex); + + if (ibdev->uverbs_abi_ver == MLX4_IB_UVERBS_NO_DEV_CAPS_ABI_VERSION) + err = ib_copy_to_udata(udata, &resp_v3, sizeof(resp_v3)); + else + err = ib_copy_to_udata(udata, &resp, sizeof(resp)); + + if (err) { + mlx4_uar_free(to_mdev(ibdev)->dev, 
&context->uar); + kfree(context); + return ERR_PTR(-EFAULT); + } + + return &context->ibucontext; +} + +static int mlx4_ib_dealloc_ucontext(struct ib_ucontext *ibcontext) +{ + struct mlx4_ib_ucontext *context = to_mucontext(ibcontext); + + mlx4_uar_free(to_mdev(ibcontext->device)->dev, &context->uar); + kfree(context); + + return 0; +} + +static int mlx4_ib_mmap(struct ib_ucontext *context, struct vm_area_struct *vma) +{ + struct mlx4_ib_dev *dev = to_mdev(context->device); + + if (vma->vm_end - vma->vm_start != PAGE_SIZE) + return -EINVAL; + + if (vma->vm_pgoff == 0) { + vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot); + + if (io_remap_pfn_range(vma, vma->vm_start, + to_mucontext(context)->uar.pfn, + PAGE_SIZE, vma->vm_page_prot)) + return -EAGAIN; + } else if (vma->vm_pgoff == 1 && dev->dev->caps.bf_reg_size != 0) { + vma->vm_page_prot = pgprot_writecombine(vma->vm_page_prot); + + if (io_remap_pfn_range(vma, vma->vm_start, + to_mucontext(context)->uar.pfn + + dev->dev->caps.num_uars, + PAGE_SIZE, vma->vm_page_prot)) + return -EAGAIN; + } else + return -EINVAL; + + return 0; +} + +static struct ib_pd *mlx4_ib_alloc_pd(struct ib_device *ibdev, + struct ib_ucontext *context, + struct ib_udata *udata) +{ + struct mlx4_ib_pd *pd; + int err; + + pd = kmalloc(sizeof *pd, GFP_KERNEL); + if (!pd) + return ERR_PTR(-ENOMEM); + + err = mlx4_pd_alloc(to_mdev(ibdev)->dev, &pd->pdn); + if (err) { + kfree(pd); + return ERR_PTR(err); + } + + if (context) + if (ib_copy_to_udata(udata, &pd->pdn, sizeof (__u32))) { + mlx4_pd_free(to_mdev(ibdev)->dev, pd->pdn); + kfree(pd); + return ERR_PTR(-EFAULT); + } + + return &pd->ibpd; +} + +static int mlx4_ib_dealloc_pd(struct ib_pd *pd) +{ + mlx4_pd_free(to_mdev(pd->device)->dev, to_mpd(pd)->pdn); + kfree(pd); + + return 0; +} + +static struct ib_xrcd *mlx4_ib_alloc_xrcd(struct ib_device *ibdev, + struct ib_ucontext *context, + struct ib_udata *udata) +{ + struct mlx4_ib_xrcd *xrcd; + int err; + + if (!(to_mdev(ibdev)->dev->caps.flags & MLX4_DEV_CAP_FLAG_XRC)) + return ERR_PTR(-ENOSYS); + + xrcd = kmalloc(sizeof *xrcd, GFP_KERNEL); + if (!xrcd) + return ERR_PTR(-ENOMEM); + + err = mlx4_xrcd_alloc(to_mdev(ibdev)->dev, &xrcd->xrcdn); + if (err) + goto err1; + + xrcd->pd = ib_alloc_pd(ibdev); + if (IS_ERR(xrcd->pd)) { + err = PTR_ERR(xrcd->pd); + goto err2; + } + + xrcd->cq = ib_create_cq(ibdev, NULL, NULL, xrcd, 1, 0); + if (IS_ERR(xrcd->cq)) { + err = PTR_ERR(xrcd->cq); + goto err3; + } + + return &xrcd->ibxrcd; + +err3: + ib_dealloc_pd(xrcd->pd); +err2: + mlx4_xrcd_free(to_mdev(ibdev)->dev, xrcd->xrcdn); +err1: + kfree(xrcd); + return ERR_PTR(err); +} + +static int mlx4_ib_dealloc_xrcd(struct ib_xrcd *xrcd) +{ + ib_destroy_cq(to_mxrcd(xrcd)->cq); + ib_dealloc_pd(to_mxrcd(xrcd)->pd); + mlx4_xrcd_free(to_mdev(xrcd->device)->dev, to_mxrcd(xrcd)->xrcdn); + kfree(xrcd); + + return 0; +} + +static int add_gid_entry(struct ib_qp *ibqp, union ib_gid *gid) +{ + struct mlx4_ib_qp *mqp = to_mqp(ibqp); + struct mlx4_ib_dev *mdev = to_mdev(ibqp->device); + struct mlx4_ib_gid_entry *ge; + + ge = kzalloc(sizeof *ge, GFP_KERNEL); + if (!ge) + return -ENOMEM; + + ge->gid = *gid; + if (mlx4_ib_add_mc(mdev, mqp, gid)) { + ge->port = mqp->port; + ge->added = 1; + } + + mutex_lock(&mqp->mutex); + list_add_tail(&ge->list, &mqp->gid_list); + mutex_unlock(&mqp->mutex); + + return 0; +} + +int mlx4_ib_add_mc(struct mlx4_ib_dev *mdev, struct mlx4_ib_qp *mqp, + union ib_gid *gid) +{ + struct net_device *ndev; + int ret = 0; + + if (!mqp->port) + return 0; + + 
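+ /*
+  * Return 1 only when a netdev is currently bound to this QP's port, so
+  * that add_gid_entry() can mark the gid entry as actually added.
+  */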
spin_lock_bh(&mdev->iboe.lock); + ndev = mdev->iboe.netdevs[mqp->port - 1]; + if (ndev) + dev_hold(ndev); + spin_unlock_bh(&mdev->iboe.lock); + + if (ndev) { + ret = 1; + dev_put(ndev); + } + + return ret; +} + +struct mlx4_ib_steering { + struct list_head list; + struct mlx4_flow_reg_id reg_id; + union ib_gid gid; +}; + +static int parse_flow_attr(struct mlx4_dev *dev, + u32 qp_num, + union ib_flow_spec *ib_spec, + struct _rule_hw *mlx4_spec) +{ + enum mlx4_net_trans_rule_id type; + + switch (ib_spec->type) { + case IB_FLOW_SPEC_ETH: + type = MLX4_NET_TRANS_RULE_ID_ETH; + memcpy(mlx4_spec->eth.dst_mac, ib_spec->eth.val.dst_mac, + ETH_ALEN); + memcpy(mlx4_spec->eth.dst_mac_msk, ib_spec->eth.mask.dst_mac, + ETH_ALEN); + mlx4_spec->eth.vlan_tag = ib_spec->eth.val.vlan_tag; + mlx4_spec->eth.vlan_tag_msk = ib_spec->eth.mask.vlan_tag; + break; + case IB_FLOW_SPEC_IB: + type = MLX4_NET_TRANS_RULE_ID_IB; + mlx4_spec->ib.l3_qpn = + cpu_to_be32(qp_num); + mlx4_spec->ib.qpn_mask = + cpu_to_be32(MLX4_IB_FLOW_QPN_MASK); + break; + + + case IB_FLOW_SPEC_IPV4: + type = MLX4_NET_TRANS_RULE_ID_IPV4; + mlx4_spec->ipv4.src_ip = ib_spec->ipv4.val.src_ip; + mlx4_spec->ipv4.src_ip_msk = ib_spec->ipv4.mask.src_ip; + mlx4_spec->ipv4.dst_ip = ib_spec->ipv4.val.dst_ip; + mlx4_spec->ipv4.dst_ip_msk = ib_spec->ipv4.mask.dst_ip; + break; + + case IB_FLOW_SPEC_TCP: + case IB_FLOW_SPEC_UDP: + type = ib_spec->type == IB_FLOW_SPEC_TCP ? + MLX4_NET_TRANS_RULE_ID_TCP : + MLX4_NET_TRANS_RULE_ID_UDP; + mlx4_spec->tcp_udp.dst_port = ib_spec->tcp_udp.val.dst_port; + mlx4_spec->tcp_udp.dst_port_msk = ib_spec->tcp_udp.mask.dst_port; + mlx4_spec->tcp_udp.src_port = ib_spec->tcp_udp.val.src_port; + mlx4_spec->tcp_udp.src_port_msk = ib_spec->tcp_udp.mask.src_port; + break; + + default: + return -EINVAL; + } + if (mlx4_map_sw_to_hw_steering_id(dev, type) < 0 || + mlx4_hw_rule_sz(dev, type) < 0) + return -EINVAL; + mlx4_spec->id = cpu_to_be16(mlx4_map_sw_to_hw_steering_id(dev, type)); + mlx4_spec->size = mlx4_hw_rule_sz(dev, type) >> 2; + return mlx4_hw_rule_sz(dev, type); +} + +struct default_rules { + __u32 mandatory_fields[IB_FLOW_SPEC_SUPPORT_LAYERS]; + __u32 mandatory_not_fields[IB_FLOW_SPEC_SUPPORT_LAYERS]; + __u32 rules_create_list[IB_FLOW_SPEC_SUPPORT_LAYERS]; + __u8 link_layer; +}; +static const struct default_rules default_table[] = { + { + .mandatory_fields = {IB_FLOW_SPEC_IPV4}, + .mandatory_not_fields = {IB_FLOW_SPEC_ETH}, + .rules_create_list = {IB_FLOW_SPEC_IB}, + .link_layer = IB_LINK_LAYER_INFINIBAND + } +}; + +static int __mlx4_ib_default_rules_match(struct ib_qp *qp, + struct ib_flow_attr *flow_attr) +{ + int i, j, k; + void *ib_flow; + const struct default_rules *pdefault_rules = default_table; + u8 link_layer = rdma_port_get_link_layer(qp->device, flow_attr->port); + + for (i = 0; i < ARRAY_SIZE(default_table); i++, pdefault_rules++) { + __u32 field_types[IB_FLOW_SPEC_SUPPORT_LAYERS]; + memset(&field_types, 0, sizeof(field_types)); + + if (link_layer != pdefault_rules->link_layer) + continue; + + ib_flow = flow_attr + 1; + /* we assume the specs are sorted */ + for (j = 0, k = 0; k < IB_FLOW_SPEC_SUPPORT_LAYERS && + j < flow_attr->num_of_specs; k++) { + union ib_flow_spec *current_flow = + (union ib_flow_spec *)ib_flow; + + /* same layer but different type */ + if (((current_flow->type & IB_FLOW_SPEC_LAYER_MASK) == + (pdefault_rules->mandatory_fields[k] & + IB_FLOW_SPEC_LAYER_MASK)) && + (current_flow->type != + pdefault_rules->mandatory_fields[k])) + goto out; + + /* same layer, try match next one */ + if 
(current_flow->type == + pdefault_rules->mandatory_fields[k]) { + j++; + ib_flow += + ((union ib_flow_spec *)ib_flow)->size; + } + } + + ib_flow = flow_attr + 1; + for (j = 0; j < flow_attr->num_of_specs; + j++, ib_flow += ((union ib_flow_spec *)ib_flow)->size) + for (k = 0; k < IB_FLOW_SPEC_SUPPORT_LAYERS; k++) + /* same layer and same type */ + if (((union ib_flow_spec *)ib_flow)->type == + pdefault_rules->mandatory_not_fields[k]) + goto out; + + return i; + } +out: + return -1; +} + +static int __mlx4_ib_create_default_rules( + struct mlx4_ib_dev *mdev, + struct ib_qp *qp, + const struct default_rules *pdefault_rules, + struct _rule_hw *mlx4_spec) { + int size = 0; + int i; + + for (i = 0; i < ARRAY_SIZE(pdefault_rules->rules_create_list); i++) { + int ret; + union ib_flow_spec ib_spec; + switch (pdefault_rules->rules_create_list[i]) { + case 0: + /* no rule */ + continue; + case IB_FLOW_SPEC_IB: + ib_spec.type = IB_FLOW_SPEC_IB; + ib_spec.size = sizeof(struct ib_flow_spec_ib); + + break; + default: + /* invalid rule */ + return -EINVAL; + } + /* We must put empty rule, qpn is being ignored */ + ret = parse_flow_attr(mdev->dev, 0, &ib_spec, + mlx4_spec); + if (ret < 0) { + pr_info("invalid parsing\n"); + return -EINVAL; + } + + mlx4_spec = (void *)mlx4_spec + ret; + size += ret; + } + return size; +} + +static int __mlx4_ib_create_flow(struct ib_qp *qp, struct ib_flow_attr *flow_attr, + int domain, + enum mlx4_net_trans_promisc_mode flow_type, + u64 *reg_id) +{ + int ret, i; + int size = 0; + void *ib_flow; + struct mlx4_ib_dev *mdev = to_mdev(qp->device); + struct mlx4_cmd_mailbox *mailbox; + struct mlx4_net_trans_rule_hw_ctrl *ctrl; + int default_flow; + + static const u16 __mlx4_domain[] = { + [IB_FLOW_DOMAIN_USER] = MLX4_DOMAIN_UVERBS, + [IB_FLOW_DOMAIN_ETHTOOL] = MLX4_DOMAIN_ETHTOOL, + [IB_FLOW_DOMAIN_RFS] = MLX4_DOMAIN_RFS, + [IB_FLOW_DOMAIN_NIC] = MLX4_DOMAIN_NIC, + }; + + if (flow_attr->priority > MLX4_IB_FLOW_MAX_PRIO) { + pr_err("Invalid priority value %d\n", flow_attr->priority); + return -EINVAL; + } + + if (domain >= IB_FLOW_DOMAIN_NUM) { + pr_err("Invalid domain value %d\n", domain); + return -EINVAL; + } + + if (mlx4_map_sw_to_hw_steering_mode(mdev->dev, flow_type) < 0) + return -EINVAL; + + mailbox = mlx4_alloc_cmd_mailbox(mdev->dev); + if (IS_ERR(mailbox)) + return PTR_ERR(mailbox); + ctrl = mailbox->buf; + + ctrl->prio = cpu_to_be16(__mlx4_domain[domain] | + flow_attr->priority); + ctrl->type = mlx4_map_sw_to_hw_steering_mode(mdev->dev, flow_type); + ctrl->port = flow_attr->port; + ctrl->qpn = cpu_to_be32(qp->qp_num); + + ib_flow = flow_attr + 1; + size += sizeof(struct mlx4_net_trans_rule_hw_ctrl); + /* Add default flows */ + default_flow = __mlx4_ib_default_rules_match(qp, flow_attr); + if (default_flow >= 0) { + ret = __mlx4_ib_create_default_rules( + mdev, qp, default_table + default_flow, + mailbox->buf + size); + if (ret < 0) { + mlx4_free_cmd_mailbox(mdev->dev, mailbox); + return -EINVAL; + } + size += ret; + } + for (i = 0; i < flow_attr->num_of_specs; i++) { + ret = parse_flow_attr(mdev->dev, qp->qp_num, ib_flow, + mailbox->buf + size); + if (ret < 0) { + mlx4_free_cmd_mailbox(mdev->dev, mailbox); + return -EINVAL; + } + ib_flow += ((union ib_flow_spec *) ib_flow)->size; + size += ret; + } + + ret = mlx4_cmd_imm(mdev->dev, mailbox->dma, reg_id, size >> 2, 0, + MLX4_QP_FLOW_STEERING_ATTACH, MLX4_CMD_TIME_CLASS_A, + MLX4_CMD_NATIVE); + if (ret == -ENOMEM) + pr_err("mcg table is full. 
Fail to register network rule.\n"); + else if (ret == -ENXIO) + pr_err("Device managed flow steering is disabled. Fail to register network rule.\n"); + else if (ret) + pr_err("Invalid argumant. Fail to register network rule.\n"); + + mlx4_free_cmd_mailbox(mdev->dev, mailbox); + return ret; +} + +static int __mlx4_ib_destroy_flow(struct mlx4_dev *dev, u64 reg_id) +{ + int err; + err = mlx4_cmd(dev, reg_id, 0, 0, + MLX4_QP_FLOW_STEERING_DETACH, MLX4_CMD_TIME_CLASS_A, + MLX4_CMD_NATIVE); + if (err) + pr_err("Fail to detach network rule. registration id = 0x%llx\n", + reg_id); + return err; +} + +static int mlx4_ib_tunnel_steer_add(struct ib_qp *qp, struct ib_flow_attr *flow_attr, + u64 *reg_id) +{ + void *ib_flow; + union ib_flow_spec *ib_spec; + struct mlx4_dev *dev = to_mdev(qp->device)->dev; + int err = 0; + + if (dev->caps.tunnel_offload_mode != MLX4_TUNNEL_OFFLOAD_MODE_VXLAN || + dev->caps.dmfs_high_steer_mode == MLX4_STEERING_DMFS_A0_STATIC) + return 0; /* do nothing */ + + ib_flow = flow_attr + 1; + ib_spec = (union ib_flow_spec *)ib_flow; + + if (ib_spec->type != IB_FLOW_SPEC_ETH || flow_attr->num_of_specs != 1) + return 0; /* do nothing */ + + err = mlx4_tunnel_steer_add(to_mdev(qp->device)->dev, ib_spec->eth.val.dst_mac, + flow_attr->port, qp->qp_num, + MLX4_DOMAIN_UVERBS | (flow_attr->priority & 0xff), + reg_id); + return err; +} + +static struct ib_flow *mlx4_ib_create_flow(struct ib_qp *qp, + struct ib_flow_attr *flow_attr, + int domain) +{ + int err = 0, i = 0, j = 0; + struct mlx4_ib_flow *mflow; + enum mlx4_net_trans_promisc_mode type[2]; + struct mlx4_dev *dev = (to_mdev(qp->device))->dev; + int is_bonded = mlx4_is_bonded(dev); + + memset(type, 0, sizeof(type)); + + mflow = kzalloc(sizeof(*mflow), GFP_KERNEL); + if (!mflow) { + err = -ENOMEM; + goto err_free; + } + + switch (flow_attr->type) { + case IB_FLOW_ATTR_NORMAL: + type[0] = MLX4_FS_REGULAR; + break; + + case IB_FLOW_ATTR_ALL_DEFAULT: + type[0] = MLX4_FS_ALL_DEFAULT; + break; + + case IB_FLOW_ATTR_MC_DEFAULT: + type[0] = MLX4_FS_MC_DEFAULT; + break; + + case IB_FLOW_ATTR_SNIFFER: + type[0] = MLX4_FS_UC_SNIFFER; + type[1] = MLX4_FS_MC_SNIFFER; + break; + + default: + err = -EINVAL; + goto err_free; + } + + while (i < ARRAY_SIZE(type) && type[i]) { + err = __mlx4_ib_create_flow(qp, flow_attr, domain, type[i], + &mflow->reg_id[i].id); + if (err) + goto err_create_flow; + i++; + if (is_bonded) { + /* Application always sees one port so the mirror rule + * must be on port #2 + */ + flow_attr->port = 2; + err = __mlx4_ib_create_flow(qp, flow_attr, + domain, type[j], + &mflow->reg_id[j].mirror); + flow_attr->port = 1; + if (err) + goto err_create_flow; + j++; + } + + } + + if (i < ARRAY_SIZE(type) && flow_attr->type == IB_FLOW_ATTR_NORMAL) { + err = mlx4_ib_tunnel_steer_add(qp, flow_attr, + &mflow->reg_id[i].id); + if (err) + goto err_create_flow; + i++; + if (is_bonded) { + flow_attr->port = 2; + err = mlx4_ib_tunnel_steer_add(qp, flow_attr, + &mflow->reg_id[j].mirror); + flow_attr->port = 1; + if (err) + goto err_create_flow; + j++; + } + /* function to create mirror rule */ + } + + return &mflow->ibflow; + +err_create_flow: + while (i) { + (void)__mlx4_ib_destroy_flow(to_mdev(qp->device)->dev, + mflow->reg_id[i].id); + i--; + } + + while (j) { + (void)__mlx4_ib_destroy_flow(to_mdev(qp->device)->dev, + mflow->reg_id[j].mirror); + j--; + } +err_free: + kfree(mflow); + return ERR_PTR(err); +} + +static int mlx4_ib_destroy_flow(struct ib_flow *flow_id) +{ + int err, ret = 0; + int i = 0; + struct mlx4_ib_dev *mdev = 
to_mdev(flow_id->qp->device); + struct mlx4_ib_flow *mflow = to_mflow(flow_id); + + while (i < ARRAY_SIZE(mflow->reg_id) && mflow->reg_id[i].id) { + err = __mlx4_ib_destroy_flow(mdev->dev, mflow->reg_id[i].id); + if (err) + ret = err; + if (mflow->reg_id[i].mirror) { + err = __mlx4_ib_destroy_flow(mdev->dev, + mflow->reg_id[i].mirror); + if (err) + ret = err; + } + i++; + } + + kfree(mflow); + return ret; +} + +static int mlx4_ib_mcg_attach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid) +{ + int err; + struct mlx4_ib_dev *mdev = to_mdev(ibqp->device); + struct mlx4_dev *dev = mdev->dev; + struct mlx4_ib_qp *mqp = to_mqp(ibqp); + struct mlx4_ib_steering *ib_steering = NULL; + enum mlx4_protocol prot = MLX4_PROT_IB_IPV6; + struct mlx4_flow_reg_id reg_id; + + if (mdev->dev->caps.steering_mode == + MLX4_STEERING_MODE_DEVICE_MANAGED) { + ib_steering = kmalloc(sizeof(*ib_steering), GFP_KERNEL); + if (!ib_steering) + return -ENOMEM; + } + + err = mlx4_multicast_attach(mdev->dev, &mqp->mqp, gid->raw, mqp->port, + !!(mqp->flags & + MLX4_IB_QP_BLOCK_MULTICAST_LOOPBACK), + prot, ®_id.id); + if (err) { + pr_err("multicast attach op failed, err %d\n", err); + goto err_malloc; + } + + reg_id.mirror = 0; + if (mlx4_is_bonded(dev)) { + err = mlx4_multicast_attach(mdev->dev, &mqp->mqp, gid->raw, + (mqp->port == 1) ? 2 : 1, + !!(mqp->flags & + MLX4_IB_QP_BLOCK_MULTICAST_LOOPBACK), + prot, ®_id.mirror); + if (err) + goto err_add; + } + + err = add_gid_entry(ibqp, gid); + if (err) + goto err_add; + + if (ib_steering) { + memcpy(ib_steering->gid.raw, gid->raw, 16); + ib_steering->reg_id = reg_id; + mutex_lock(&mqp->mutex); + list_add(&ib_steering->list, &mqp->steering_rules); + mutex_unlock(&mqp->mutex); + } + return 0; + +err_add: + mlx4_multicast_detach(mdev->dev, &mqp->mqp, gid->raw, + prot, reg_id.id); + if (reg_id.mirror) + mlx4_multicast_detach(mdev->dev, &mqp->mqp, gid->raw, + prot, reg_id.mirror); +err_malloc: + kfree(ib_steering); + + return err; +} + +static struct mlx4_ib_gid_entry *find_gid_entry(struct mlx4_ib_qp *qp, u8 *raw) +{ + struct mlx4_ib_gid_entry *ge; + struct mlx4_ib_gid_entry *tmp; + struct mlx4_ib_gid_entry *ret = NULL; + + list_for_each_entry_safe(ge, tmp, &qp->gid_list, list) { + if (!memcmp(raw, ge->gid.raw, 16)) { + ret = ge; + break; + } + } + + return ret; +} + +static int mlx4_ib_mcg_detach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid) +{ + int err; + struct mlx4_ib_dev *mdev = to_mdev(ibqp->device); + struct mlx4_dev *dev = mdev->dev; + struct mlx4_ib_qp *mqp = to_mqp(ibqp); + struct net_device *ndev; + struct mlx4_ib_gid_entry *ge; + struct mlx4_flow_reg_id reg_id = {0, 0}; + enum mlx4_protocol prot = MLX4_PROT_IB_IPV6; + + if (mdev->dev->caps.steering_mode == + MLX4_STEERING_MODE_DEVICE_MANAGED) { + struct mlx4_ib_steering *ib_steering; + + mutex_lock(&mqp->mutex); + list_for_each_entry(ib_steering, &mqp->steering_rules, list) { + if (!memcmp(ib_steering->gid.raw, gid->raw, 16)) { + list_del(&ib_steering->list); + break; + } + } + mutex_unlock(&mqp->mutex); + if (&ib_steering->list == &mqp->steering_rules) { + pr_err("Couldn't find reg_id for mgid. 
Steering rule is left attached\n"); + return -EINVAL; + } + reg_id = ib_steering->reg_id; + kfree(ib_steering); + } + + err = mlx4_multicast_detach(mdev->dev, &mqp->mqp, gid->raw, + prot, reg_id.id); + if (err) + return err; + + if (mlx4_is_bonded(dev)) { + err = mlx4_multicast_detach(mdev->dev, &mqp->mqp, gid->raw, + prot, reg_id.mirror); + if (err) + return err; + } + + mutex_lock(&mqp->mutex); + ge = find_gid_entry(mqp, gid->raw); + if (ge) { + spin_lock_bh(&mdev->iboe.lock); + ndev = ge->added ? mdev->iboe.netdevs[ge->port - 1] : NULL; + if (ndev) + dev_hold(ndev); + spin_unlock_bh(&mdev->iboe.lock); + if (ndev) + dev_put(ndev); + list_del(&ge->list); + kfree(ge); + } else + pr_warn("could not find mgid entry\n"); + + mutex_unlock(&mqp->mutex); + + return 0; +} + +static int init_node_data(struct mlx4_ib_dev *dev) +{ + struct ib_smp *in_mad = NULL; + struct ib_smp *out_mad = NULL; + int mad_ifc_flags = MLX4_MAD_IFC_IGNORE_KEYS; + int err = -ENOMEM; + + in_mad = kzalloc(sizeof *in_mad, GFP_KERNEL); + out_mad = kmalloc(sizeof *out_mad, GFP_KERNEL); + if (!in_mad || !out_mad) + goto out; + + init_query_mad(in_mad); + in_mad->attr_id = IB_SMP_ATTR_NODE_DESC; + if (mlx4_is_master(dev->dev)) + mad_ifc_flags |= MLX4_MAD_IFC_NET_VIEW; + + err = mlx4_MAD_IFC(dev, mad_ifc_flags, 1, NULL, NULL, in_mad, out_mad); + if (err) + goto out; + + memcpy(dev->ib_dev.node_desc, out_mad->data, 64); + + in_mad->attr_id = IB_SMP_ATTR_NODE_INFO; + + err = mlx4_MAD_IFC(dev, mad_ifc_flags, 1, NULL, NULL, in_mad, out_mad); + if (err) + goto out; + + dev->dev->rev_id = be32_to_cpup((__be32 *) (out_mad->data + 32)); + memcpy(&dev->ib_dev.node_guid, out_mad->data + 12, 8); + +out: + kfree(in_mad); + kfree(out_mad); + return err; +} + +static ssize_t show_hca(struct device *device, struct device_attribute *attr, + char *buf) +{ + struct mlx4_ib_dev *dev = + container_of(device, struct mlx4_ib_dev, ib_dev.dev); + return sprintf(buf, "MT%d\n", dev->dev->persist->pdev->device); +} + +static ssize_t show_fw_ver(struct device *device, struct device_attribute *attr, + char *buf) +{ + struct mlx4_ib_dev *dev = + container_of(device, struct mlx4_ib_dev, ib_dev.dev); + return sprintf(buf, "%d.%d.%d\n", (int) (dev->dev->caps.fw_ver >> 32), + (int) (dev->dev->caps.fw_ver >> 16) & 0xffff, + (int) dev->dev->caps.fw_ver & 0xffff); +} + +static ssize_t show_rev(struct device *device, struct device_attribute *attr, + char *buf) +{ + struct mlx4_ib_dev *dev = + container_of(device, struct mlx4_ib_dev, ib_dev.dev); + return sprintf(buf, "%x\n", dev->dev->rev_id); +} + +static ssize_t show_board(struct device *device, struct device_attribute *attr, + char *buf) +{ + struct mlx4_ib_dev *dev = + container_of(device, struct mlx4_ib_dev, ib_dev.dev); + return sprintf(buf, "%.*s\n", MLX4_BOARD_ID_LEN, + dev->dev->board_id); +} + +static DEVICE_ATTR(hw_rev, S_IRUGO, show_rev, NULL); +static DEVICE_ATTR(fw_ver, S_IRUGO, show_fw_ver, NULL); +static DEVICE_ATTR(hca_type, S_IRUGO, show_hca, NULL); +static DEVICE_ATTR(board_id, S_IRUGO, show_board, NULL); + +static struct device_attribute *mlx4_class_attributes[] = { + &dev_attr_hw_rev, + &dev_attr_fw_ver, + &dev_attr_hca_type, + &dev_attr_board_id +}; + +static void mlx4_addrconf_ifid_eui48(u8 *eui, u16 vlan_id, + struct net_device *dev) +{ + memcpy(eui, dev->dev_addr, 3); + memcpy(eui + 5, dev->dev_addr + 3, 3); + if (vlan_id < 0x1000) { + eui[3] = vlan_id >> 8; + eui[4] = vlan_id & 0xff; + } else { + eui[3] = 0xff; + eui[4] = 0xfe; + } + eui[0] ^= 2; +} + +static void update_gids_task(struct 
work_struct *work) +{ + struct update_gid_work *gw = container_of(work, struct update_gid_work, work); + struct mlx4_cmd_mailbox *mailbox; + union ib_gid *gids; + int err; + struct mlx4_dev *dev = gw->dev->dev; + int is_bonded = mlx4_is_bonded(dev); + + if (!gw->dev->ib_active) + return; + + mailbox = mlx4_alloc_cmd_mailbox(dev); + if (IS_ERR(mailbox)) { + pr_warn("update gid table failed %ld\n", PTR_ERR(mailbox)); + return; + } + + gids = mailbox->buf; + memcpy(gids, gw->gids, sizeof gw->gids); + + err = mlx4_cmd(dev, mailbox->dma, MLX4_SET_PORT_GID_TABLE << 8 | gw->port, + MLX4_SET_PORT_ETH_OPCODE, MLX4_CMD_SET_PORT, + MLX4_CMD_TIME_CLASS_B, MLX4_CMD_WRAPPED); + if (err) + pr_warn("set port command failed\n"); + else + if ((gw->port == 1) || !is_bonded) + mlx4_ib_dispatch_event(gw->dev, + is_bonded ? 1 : gw->port, + IB_EVENT_GID_CHANGE); + + mlx4_free_cmd_mailbox(dev, mailbox); + kfree(gw); +} + +static void reset_gids_task(struct work_struct *work) +{ + struct update_gid_work *gw = + container_of(work, struct update_gid_work, work); + struct mlx4_cmd_mailbox *mailbox; + union ib_gid *gids; + int err; + struct mlx4_dev *dev = gw->dev->dev; + + if (!gw->dev->ib_active) + return; + + mailbox = mlx4_alloc_cmd_mailbox(dev); + if (IS_ERR(mailbox)) { + pr_warn("reset gid table failed\n"); + goto free; + } + + gids = mailbox->buf; + memcpy(gids, gw->gids, sizeof(gw->gids)); + + if (mlx4_ib_port_link_layer(&gw->dev->ib_dev, gw->port) == + IB_LINK_LAYER_ETHERNET) { + err = mlx4_cmd(dev, mailbox->dma, + MLX4_SET_PORT_GID_TABLE << 8 | gw->port, + MLX4_SET_PORT_ETH_OPCODE, MLX4_CMD_SET_PORT, + MLX4_CMD_TIME_CLASS_B, + MLX4_CMD_WRAPPED); + if (err) + pr_warn("set port %d command failed\n", gw->port); + } + + mlx4_free_cmd_mailbox(dev, mailbox); +free: + kfree(gw); +} + +static int update_gid_table(struct mlx4_ib_dev *dev, int port, + union ib_gid *gid, int clear, + int default_gid) +{ + struct update_gid_work *work; + int i; + int need_update = 0; + int free = -1; + int found = -1; + int max_gids; + + if (default_gid) { + free = 0; + } else { + max_gids = dev->dev->caps.gid_table_len[port]; + for (i = 1; i < max_gids; ++i) { + if (!memcmp(&dev->iboe.gid_table[port - 1][i], gid, + sizeof(*gid))) + found = i; + + if (clear) { + if (found >= 0) { + need_update = 1; + dev->iboe.gid_table[port - 1][found] = + zgid; + break; + } + } else { + if (found >= 0) + break; + + if (free < 0 && + !memcmp(&dev->iboe.gid_table[port - 1][i], + &zgid, sizeof(*gid))) + free = i; + } + } + } + + if (found == -1 && !clear && free >= 0) { + dev->iboe.gid_table[port - 1][free] = *gid; + need_update = 1; + } + + if (!need_update) + return 0; + + work = kzalloc(sizeof(*work), GFP_ATOMIC); + if (!work) + return -ENOMEM; + + memcpy(work->gids, dev->iboe.gid_table[port - 1], sizeof(work->gids)); + INIT_WORK(&work->work, update_gids_task); + work->port = port; + work->dev = dev; + queue_work(wq, &work->work); + + return 0; +} + +static void mlx4_make_default_gid(struct net_device *dev, union ib_gid *gid) +{ + gid->global.subnet_prefix = cpu_to_be64(0xfe80000000000000LL); + mlx4_addrconf_ifid_eui48(&gid->raw[8], 0xffff, dev); +} + + +static int reset_gid_table(struct mlx4_ib_dev *dev, u8 port) +{ + struct update_gid_work *work; + + work = kzalloc(sizeof(*work), GFP_ATOMIC); + if (!work) + return -ENOMEM; + + memset(dev->iboe.gid_table[port - 1], 0, sizeof(work->gids)); + memset(work->gids, 0, sizeof(work->gids)); + INIT_WORK(&work->work, reset_gids_task); + work->dev = dev; + work->port = port; + queue_work(wq, &work->work); + 
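+ /*
+  * The cached copy of the port's gid table was cleared above; the queued
+  * work pushes the zeroed table to the device asynchronously.
+  */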
return 0; +} + +static int mlx4_ib_addr_event(int event, struct net_device *event_netdev, + struct mlx4_ib_dev *ibdev, union ib_gid *gid) +{ + struct mlx4_ib_iboe *iboe; + int port = 0; + struct net_device *real_dev = rdma_vlan_dev_real_dev(event_netdev) ? + rdma_vlan_dev_real_dev(event_netdev) : + event_netdev; + union ib_gid default_gid; + + mlx4_make_default_gid(real_dev, &default_gid); + + if (!memcmp(gid, &default_gid, sizeof(*gid))) + return 0; + + if (event != NETDEV_DOWN && event != NETDEV_UP) + return 0; + + if ((real_dev != event_netdev) && + (event == NETDEV_DOWN) && + rdma_link_local_addr((struct in6_addr *)gid)) + return 0; + + iboe = &ibdev->iboe; + spin_lock_bh(&iboe->lock); + + for (port = 1; port <= ibdev->dev->caps.num_ports; ++port) + if ((netif_is_bond_master(real_dev) && + (real_dev == iboe->masters[port - 1])) || + (!netif_is_bond_master(real_dev) && + (real_dev == iboe->netdevs[port - 1]))) + update_gid_table(ibdev, port, gid, + event == NETDEV_DOWN, 0); + + spin_unlock_bh(&iboe->lock); + return 0; + +} + +static u8 mlx4_ib_get_dev_port(struct net_device *dev, + struct mlx4_ib_dev *ibdev) +{ + u8 port = 0; + struct mlx4_ib_iboe *iboe; + struct net_device *real_dev = rdma_vlan_dev_real_dev(dev) ? + rdma_vlan_dev_real_dev(dev) : dev; + + iboe = &ibdev->iboe; + + for (port = 1; port <= ibdev->dev->caps.num_ports; ++port) + if ((netif_is_bond_master(real_dev) && + (real_dev == iboe->masters[port - 1])) || + (!netif_is_bond_master(real_dev) && + (real_dev == iboe->netdevs[port - 1]))) + break; + + if ((port == 0) || (port > ibdev->dev->caps.num_ports)) + return 0; + else + return port; +} + +static int mlx4_ib_inet_event(struct notifier_block *this, unsigned long event, + void *ptr) +{ + struct mlx4_ib_dev *ibdev; + struct in_ifaddr *ifa = ptr; + union ib_gid gid; + struct net_device *event_netdev = ifa->ifa_dev->dev; + + ipv6_addr_set_v4mapped(ifa->ifa_address, (struct in6_addr *)&gid); + + ibdev = container_of(this, struct mlx4_ib_dev, iboe.nb_inet); + + mlx4_ib_addr_event(event, event_netdev, ibdev, &gid); + return NOTIFY_DONE; +} + +#if IS_ENABLED(CONFIG_IPV6) +static int mlx4_ib_inet6_event(struct notifier_block *this, unsigned long event, + void *ptr) +{ + struct mlx4_ib_dev *ibdev; + struct inet6_ifaddr *ifa = ptr; + union ib_gid *gid = (union ib_gid *)&ifa->addr; + struct net_device *event_netdev = ifa->idev->dev; + + ibdev = container_of(this, struct mlx4_ib_dev, iboe.nb_inet6); + + mlx4_ib_addr_event(event, event_netdev, ibdev, gid); + return NOTIFY_DONE; +} +#endif + +#define MLX4_IB_INVALID_MAC ((u64)-1) +static void mlx4_ib_update_qps(struct mlx4_ib_dev *ibdev, + struct net_device *dev, + int port) +{ + u64 new_smac = 0; + u64 release_mac = MLX4_IB_INVALID_MAC; + struct mlx4_ib_qp *qp; + + read_lock(&dev_base_lock); + new_smac = mlx4_mac_to_u64(dev->dev_addr); + read_unlock(&dev_base_lock); + + atomic64_set(&ibdev->iboe.mac[port - 1], new_smac); + + /* no need for update QP1 and mac registration in non-SRIOV */ + if (!mlx4_is_mfunc(ibdev->dev)) + return; + + mutex_lock(&ibdev->qp1_proxy_lock[port - 1]); + qp = ibdev->qp1_proxy[port - 1]; + if (qp) { + int new_smac_index; + u64 old_smac; + struct mlx4_update_qp_params update_params; + + mutex_lock(&qp->mutex); + old_smac = qp->pri.smac; + if (new_smac == old_smac) + goto unlock; + + new_smac_index = mlx4_register_mac(ibdev->dev, port, new_smac); + + if (new_smac_index < 0) + goto unlock; + + update_params.smac_index = new_smac_index; + if (mlx4_update_qp(ibdev->dev, qp->mqp.qpn, MLX4_UPDATE_QP_SMAC, + 
&update_params)) { + release_mac = new_smac; + goto unlock; + } + /* if old port was zero, no mac was yet registered for this QP */ + if (qp->pri.smac_port) + release_mac = old_smac; + qp->pri.smac = new_smac; + qp->pri.smac_port = port; + qp->pri.smac_index = new_smac_index; + } + +unlock: + if (release_mac != MLX4_IB_INVALID_MAC) + mlx4_unregister_mac(ibdev->dev, port, release_mac); + if (qp) + mutex_unlock(&qp->mutex); + mutex_unlock(&ibdev->qp1_proxy_lock[port - 1]); +} + +static void mlx4_ib_get_dev_addr(struct net_device *dev, + struct mlx4_ib_dev *ibdev, u8 port) +{ + struct in_device *in_dev; +#if IS_ENABLED(CONFIG_IPV6) + struct inet6_dev *in6_dev; + union ib_gid *pgid; + struct inet6_ifaddr *ifp; + union ib_gid default_gid; +#endif + union ib_gid gid; + + + if ((port == 0) || (port > ibdev->dev->caps.num_ports)) + return; + + /* IPv4 gids */ + in_dev = in_dev_get(dev); + if (in_dev) { + for_ifa(in_dev) { + /*ifa->ifa_address;*/ + ipv6_addr_set_v4mapped(ifa->ifa_address, + (struct in6_addr *)&gid); + update_gid_table(ibdev, port, &gid, 0, 0); + } + endfor_ifa(in_dev); + in_dev_put(in_dev); + } +#if IS_ENABLED(CONFIG_IPV6) + mlx4_make_default_gid(dev, &default_gid); + /* IPv6 gids */ + in6_dev = in6_dev_get(dev); + if (in6_dev) { + read_lock_bh(&in6_dev->lock); + list_for_each_entry(ifp, &in6_dev->addr_list, if_list) { + pgid = (union ib_gid *)&ifp->addr; + if (!memcmp(pgid, &default_gid, sizeof(*pgid))) + continue; + update_gid_table(ibdev, port, pgid, 0, 0); + } + read_unlock_bh(&in6_dev->lock); + in6_dev_put(in6_dev); + } +#endif +} + +static void mlx4_ib_set_default_gid(struct mlx4_ib_dev *ibdev, + struct net_device *dev, u8 port) +{ + union ib_gid gid; + mlx4_make_default_gid(dev, &gid); + update_gid_table(ibdev, port, &gid, 0, 1); +} + +static int mlx4_ib_init_gid_table(struct mlx4_ib_dev *ibdev) +{ + struct net_device *dev; + struct mlx4_ib_iboe *iboe = &ibdev->iboe; + int i; + int err = 0; + + for (i = 1; i <= ibdev->num_ports; ++i) { + if (rdma_port_get_link_layer(&ibdev->ib_dev, i) == + IB_LINK_LAYER_ETHERNET) { + err = reset_gid_table(ibdev, i); + if (err) + goto out; + } + } + + read_lock(&dev_base_lock); + spin_lock_bh(&iboe->lock); + + for_each_netdev(&init_net, dev) { + u8 port = mlx4_ib_get_dev_port(dev, ibdev); + /* port will be non-zero only for ETH ports */ + if (port) { + mlx4_ib_set_default_gid(ibdev, dev, port); + mlx4_ib_get_dev_addr(dev, ibdev, port); + } + } + + spin_unlock_bh(&iboe->lock); + read_unlock(&dev_base_lock); +out: + return err; +} + +static void mlx4_ib_scan_netdevs(struct mlx4_ib_dev *ibdev, + struct net_device *dev, + unsigned long event) + +{ + struct mlx4_ib_iboe *iboe; + int update_qps_port = -1; + int port; + + iboe = &ibdev->iboe; + + spin_lock_bh(&iboe->lock); + mlx4_foreach_ib_transport_port(port, ibdev->dev) { + enum ib_port_state port_state = IB_PORT_NOP; + struct net_device *old_master = iboe->masters[port - 1]; + struct net_device *curr_netdev; + struct net_device *curr_master; + + iboe->netdevs[port - 1] = + mlx4_get_protocol_dev(ibdev->dev, MLX4_PROT_ETH, port); + if (iboe->netdevs[port - 1]) + mlx4_ib_set_default_gid(ibdev, + iboe->netdevs[port - 1], port); + curr_netdev = iboe->netdevs[port - 1]; + + if (iboe->netdevs[port - 1] && + netif_is_bond_slave(iboe->netdevs[port - 1])) { + iboe->masters[port - 1] = netdev_master_upper_dev_get( + iboe->netdevs[port - 1]); + } else { + iboe->masters[port - 1] = NULL; + } + curr_master = iboe->masters[port - 1]; + + if (dev == iboe->netdevs[port - 1] && + (event == NETDEV_CHANGEADDR || 
event == NETDEV_REGISTER || + event == NETDEV_UP || event == NETDEV_CHANGE)) + update_qps_port = port; + + if (curr_netdev) { + port_state = (netif_running(curr_netdev) && netif_carrier_ok(curr_netdev)) ? + IB_PORT_ACTIVE : IB_PORT_DOWN; + mlx4_ib_set_default_gid(ibdev, curr_netdev, port); + if (curr_master) { + /* if using bonding/team and a slave port is down, we + * don't want the bond IP based gids in the table since + * flows that select port by gid may get the down port. + */ + if (port_state == IB_PORT_DOWN && + !mlx4_is_bonded(ibdev->dev)) { + reset_gid_table(ibdev, port); + mlx4_ib_set_default_gid(ibdev, + curr_netdev, + port); + } else { + /* gids from the upper dev (bond/team) + * should appear in port's gid table + */ + mlx4_ib_get_dev_addr(curr_master, + ibdev, port); + } + } + /* if bonding is used it is possible that we add it to + * masters only after IP address is assigned to the + * net bonding interface. + */ + if (curr_master && (old_master != curr_master)) { + reset_gid_table(ibdev, port); + mlx4_ib_set_default_gid(ibdev, + curr_netdev, port); + mlx4_ib_get_dev_addr(curr_master, ibdev, port); + } + + if (!curr_master && (old_master != curr_master)) { + reset_gid_table(ibdev, port); + mlx4_ib_set_default_gid(ibdev, + curr_netdev, port); + mlx4_ib_get_dev_addr(curr_netdev, ibdev, port); + } + } else { + reset_gid_table(ibdev, port); + } + } + + spin_unlock_bh(&iboe->lock); + + if (update_qps_port > 0) + mlx4_ib_update_qps(ibdev, dev, update_qps_port); +} + +static int mlx4_ib_netdev_event(struct notifier_block *this, + unsigned long event, void *ptr) +{ + struct net_device *dev = netdev_notifier_info_to_dev(ptr); + struct mlx4_ib_dev *ibdev; + + if (!net_eq(dev_net(dev), &init_net)) + return NOTIFY_DONE; + + ibdev = container_of(this, struct mlx4_ib_dev, iboe.nb); + mlx4_ib_scan_netdevs(ibdev, dev, event); + + return NOTIFY_DONE; +} + +static void init_pkeys(struct mlx4_ib_dev *ibdev) +{ + int port; + int slave; + int i; + + if (mlx4_is_master(ibdev->dev)) { + for (slave = 0; slave <= ibdev->dev->persist->num_vfs; + ++slave) { + for (port = 1; port <= ibdev->dev->caps.num_ports; ++port) { + for (i = 0; + i < ibdev->dev->phys_caps.pkey_phys_table_len[port]; + ++i) { + ibdev->pkeys.virt2phys_pkey[slave][port - 1][i] = + /* master has the identity virt2phys pkey mapping */ + (slave == mlx4_master_func_num(ibdev->dev) || !i) ? i : + ibdev->dev->phys_caps.pkey_phys_table_len[port] - 1; + mlx4_sync_pkey_table(ibdev->dev, slave, port, i, + ibdev->pkeys.virt2phys_pkey[slave][port - 1][i]); + } + } + } + /* initialize pkey cache */ + for (port = 1; port <= ibdev->dev->caps.num_ports; ++port) { + for (i = 0; + i < ibdev->dev->phys_caps.pkey_phys_table_len[port]; + ++i) + ibdev->pkeys.phys_pkey_cache[port-1][i] = + (i) ? 
0 : 0xFFFF; + } + } +} + +static void mlx4_ib_alloc_eqs(struct mlx4_dev *dev, struct mlx4_ib_dev *ibdev) +{ + char name[80]; + int eq_per_port = 0; + int added_eqs = 0; + int total_eqs = 0; + int i, j, eq; + + /* Legacy mode or comp_pool is not large enough */ + if (dev->caps.comp_pool == 0 || + dev->caps.num_ports > dev->caps.comp_pool) + return; + + eq_per_port = dev->caps.comp_pool / dev->caps.num_ports; + + /* Init eq table */ + added_eqs = 0; + mlx4_foreach_port(i, dev, MLX4_PORT_TYPE_IB) + added_eqs += eq_per_port; + + total_eqs = dev->caps.num_comp_vectors + added_eqs; + + ibdev->eq_table = kzalloc(total_eqs * sizeof(int), GFP_KERNEL); + if (!ibdev->eq_table) + return; + + ibdev->eq_added = added_eqs; + + eq = 0; + mlx4_foreach_port(i, dev, MLX4_PORT_TYPE_IB) { + for (j = 0; j < eq_per_port; j++) { + snprintf(name, sizeof(name), "mlx4-ib-%d-%d@%s", + i, j, dev->persist->pdev->bus->name); + /* Set IRQ for specific name (per ring) */ + if (mlx4_assign_eq(dev, name, NULL, + &ibdev->eq_table[eq])) { + /* Use legacy (same as mlx4_en driver) */ + pr_warn("Can't allocate EQ %d; reverting to legacy\n", eq); + ibdev->eq_table[eq] = + (eq % dev->caps.num_comp_vectors); + } + eq++; + } + } + + /* Fill the reset of the vector with legacy EQ */ + for (i = 0, eq = added_eqs; i < dev->caps.num_comp_vectors; i++) + ibdev->eq_table[eq++] = i; + + /* Advertise the new number of EQs to clients */ + ibdev->ib_dev.num_comp_vectors = total_eqs; +} + +static void mlx4_ib_free_eqs(struct mlx4_dev *dev, struct mlx4_ib_dev *ibdev) +{ + int i; + + /* no additional eqs were added */ + if (!ibdev->eq_table) + return; + + /* Reset the advertised EQ number */ + ibdev->ib_dev.num_comp_vectors = dev->caps.num_comp_vectors; + + /* Free only the added eqs */ + for (i = 0; i < ibdev->eq_added; i++) { + /* Don't free legacy eqs if used */ + if (ibdev->eq_table[i] <= dev->caps.num_comp_vectors) + continue; + mlx4_release_eq(dev, ibdev->eq_table[i]); + } + + kfree(ibdev->eq_table); +} + +static void *mlx4_ib_add(struct mlx4_dev *dev) +{ + struct mlx4_ib_dev *ibdev; + int num_ports = 0; + int i, j; + int err; + struct mlx4_ib_iboe *iboe; + int ib_num_ports = 0; + int num_req_counters; + + pr_info_once("%s", mlx4_ib_version); + + num_ports = 0; + mlx4_foreach_ib_transport_port(i, dev) + num_ports++; + + /* No point in registering a device with no ports... */ + if (num_ports == 0) + return NULL; + + ibdev = (struct mlx4_ib_dev *) ib_alloc_device(sizeof *ibdev); + if (!ibdev) { + dev_err(&dev->persist->pdev->dev, + "Device struct alloc failed\n"); + return NULL; + } + + iboe = &ibdev->iboe; + + if (mlx4_pd_alloc(dev, &ibdev->priv_pdn)) + goto err_dealloc; + + if (mlx4_uar_alloc(dev, &ibdev->priv_uar)) + goto err_pd; + + ibdev->uar_map = ioremap((phys_addr_t) ibdev->priv_uar.pfn << PAGE_SHIFT, + PAGE_SIZE); + if (!ibdev->uar_map) + goto err_uar; + MLX4_INIT_DOORBELL_LOCK(&ibdev->uar_lock); + + ibdev->dev = dev; + ibdev->bond_next_port = 0; + + strlcpy(ibdev->ib_dev.name, "mlx4_%d", IB_DEVICE_NAME_MAX); + ibdev->ib_dev.owner = THIS_MODULE; + ibdev->ib_dev.node_type = RDMA_NODE_IB_CA; + ibdev->ib_dev.local_dma_lkey = dev->caps.reserved_lkey; + ibdev->num_ports = num_ports; + ibdev->ib_dev.phys_port_cnt = mlx4_is_bonded(dev) ? 
+ 1 : ibdev->num_ports; + ibdev->ib_dev.num_comp_vectors = dev->caps.num_comp_vectors; + ibdev->ib_dev.dma_device = &dev->persist->pdev->dev; + + if (dev->caps.userspace_caps) + ibdev->ib_dev.uverbs_abi_ver = MLX4_IB_UVERBS_ABI_VERSION; + else + ibdev->ib_dev.uverbs_abi_ver = MLX4_IB_UVERBS_NO_DEV_CAPS_ABI_VERSION; + + ibdev->ib_dev.uverbs_cmd_mask = + (1ull << IB_USER_VERBS_CMD_GET_CONTEXT) | + (1ull << IB_USER_VERBS_CMD_QUERY_DEVICE) | + (1ull << IB_USER_VERBS_CMD_QUERY_PORT) | + (1ull << IB_USER_VERBS_CMD_ALLOC_PD) | + (1ull << IB_USER_VERBS_CMD_DEALLOC_PD) | + (1ull << IB_USER_VERBS_CMD_REG_MR) | + (1ull << IB_USER_VERBS_CMD_REREG_MR) | + (1ull << IB_USER_VERBS_CMD_DEREG_MR) | + (1ull << IB_USER_VERBS_CMD_CREATE_COMP_CHANNEL) | + (1ull << IB_USER_VERBS_CMD_CREATE_CQ) | + (1ull << IB_USER_VERBS_CMD_RESIZE_CQ) | + (1ull << IB_USER_VERBS_CMD_DESTROY_CQ) | + (1ull << IB_USER_VERBS_CMD_CREATE_QP) | + (1ull << IB_USER_VERBS_CMD_MODIFY_QP) | + (1ull << IB_USER_VERBS_CMD_QUERY_QP) | + (1ull << IB_USER_VERBS_CMD_DESTROY_QP) | + (1ull << IB_USER_VERBS_CMD_ATTACH_MCAST) | + (1ull << IB_USER_VERBS_CMD_DETACH_MCAST) | + (1ull << IB_USER_VERBS_CMD_CREATE_SRQ) | + (1ull << IB_USER_VERBS_CMD_MODIFY_SRQ) | + (1ull << IB_USER_VERBS_CMD_QUERY_SRQ) | + (1ull << IB_USER_VERBS_CMD_DESTROY_SRQ) | + (1ull << IB_USER_VERBS_CMD_CREATE_XSRQ) | + (1ull << IB_USER_VERBS_CMD_OPEN_QP); + + ibdev->ib_dev.query_device = mlx4_ib_query_device; + ibdev->ib_dev.query_port = mlx4_ib_query_port; + ibdev->ib_dev.get_link_layer = mlx4_ib_port_link_layer; + ibdev->ib_dev.query_gid = mlx4_ib_query_gid; + ibdev->ib_dev.query_pkey = mlx4_ib_query_pkey; + ibdev->ib_dev.modify_device = mlx4_ib_modify_device; + ibdev->ib_dev.modify_port = mlx4_ib_modify_port; + ibdev->ib_dev.alloc_ucontext = mlx4_ib_alloc_ucontext; + ibdev->ib_dev.dealloc_ucontext = mlx4_ib_dealloc_ucontext; + ibdev->ib_dev.mmap = mlx4_ib_mmap; + ibdev->ib_dev.alloc_pd = mlx4_ib_alloc_pd; + ibdev->ib_dev.dealloc_pd = mlx4_ib_dealloc_pd; + ibdev->ib_dev.create_ah = mlx4_ib_create_ah; + ibdev->ib_dev.query_ah = mlx4_ib_query_ah; + ibdev->ib_dev.destroy_ah = mlx4_ib_destroy_ah; + ibdev->ib_dev.create_srq = mlx4_ib_create_srq; + ibdev->ib_dev.modify_srq = mlx4_ib_modify_srq; + ibdev->ib_dev.query_srq = mlx4_ib_query_srq; + ibdev->ib_dev.destroy_srq = mlx4_ib_destroy_srq; + ibdev->ib_dev.post_srq_recv = mlx4_ib_post_srq_recv; + ibdev->ib_dev.create_qp = mlx4_ib_create_qp; + ibdev->ib_dev.modify_qp = mlx4_ib_modify_qp; + ibdev->ib_dev.query_qp = mlx4_ib_query_qp; + ibdev->ib_dev.destroy_qp = mlx4_ib_destroy_qp; + ibdev->ib_dev.post_send = mlx4_ib_post_send; + ibdev->ib_dev.post_recv = mlx4_ib_post_recv; + ibdev->ib_dev.create_cq = mlx4_ib_create_cq; + ibdev->ib_dev.modify_cq = mlx4_ib_modify_cq; + ibdev->ib_dev.resize_cq = mlx4_ib_resize_cq; + ibdev->ib_dev.destroy_cq = mlx4_ib_destroy_cq; + ibdev->ib_dev.poll_cq = mlx4_ib_poll_cq; + ibdev->ib_dev.req_notify_cq = mlx4_ib_arm_cq; + ibdev->ib_dev.get_dma_mr = mlx4_ib_get_dma_mr; + ibdev->ib_dev.reg_user_mr = mlx4_ib_reg_user_mr; + ibdev->ib_dev.rereg_user_mr = mlx4_ib_rereg_user_mr; + ibdev->ib_dev.dereg_mr = mlx4_ib_dereg_mr; + ibdev->ib_dev.alloc_fast_reg_mr = mlx4_ib_alloc_fast_reg_mr; + ibdev->ib_dev.alloc_fast_reg_page_list = mlx4_ib_alloc_fast_reg_page_list; + ibdev->ib_dev.free_fast_reg_page_list = mlx4_ib_free_fast_reg_page_list; + ibdev->ib_dev.attach_mcast = mlx4_ib_mcg_attach; + ibdev->ib_dev.detach_mcast = mlx4_ib_mcg_detach; + ibdev->ib_dev.process_mad = mlx4_ib_process_mad; + + if (!mlx4_is_slave(ibdev->dev)) 
{ + ibdev->ib_dev.alloc_fmr = mlx4_ib_fmr_alloc; + ibdev->ib_dev.map_phys_fmr = mlx4_ib_map_phys_fmr; + ibdev->ib_dev.unmap_fmr = mlx4_ib_unmap_fmr; + ibdev->ib_dev.dealloc_fmr = mlx4_ib_fmr_dealloc; + } + + if (dev->caps.flags & MLX4_DEV_CAP_FLAG_MEM_WINDOW || + dev->caps.bmme_flags & MLX4_BMME_FLAG_TYPE_2_WIN) { + ibdev->ib_dev.alloc_mw = mlx4_ib_alloc_mw; + ibdev->ib_dev.bind_mw = mlx4_ib_bind_mw; + ibdev->ib_dev.dealloc_mw = mlx4_ib_dealloc_mw; + + ibdev->ib_dev.uverbs_cmd_mask |= + (1ull << IB_USER_VERBS_CMD_ALLOC_MW) | + (1ull << IB_USER_VERBS_CMD_DEALLOC_MW); + } + + if (dev->caps.flags & MLX4_DEV_CAP_FLAG_XRC) { + ibdev->ib_dev.alloc_xrcd = mlx4_ib_alloc_xrcd; + ibdev->ib_dev.dealloc_xrcd = mlx4_ib_dealloc_xrcd; + ibdev->ib_dev.uverbs_cmd_mask |= + (1ull << IB_USER_VERBS_CMD_OPEN_XRCD) | + (1ull << IB_USER_VERBS_CMD_CLOSE_XRCD); + } + + if (check_flow_steering_support(dev)) { + ibdev->steering_support = MLX4_STEERING_MODE_DEVICE_MANAGED; + ibdev->ib_dev.create_flow = mlx4_ib_create_flow; + ibdev->ib_dev.destroy_flow = mlx4_ib_destroy_flow; + + ibdev->ib_dev.uverbs_ex_cmd_mask |= + (1ull << IB_USER_VERBS_EX_CMD_CREATE_FLOW) | + (1ull << IB_USER_VERBS_EX_CMD_DESTROY_FLOW); + } + + mlx4_ib_alloc_eqs(dev, ibdev); + + spin_lock_init(&iboe->lock); + + if (init_node_data(ibdev)) + goto err_map; + + num_req_counters = mlx4_is_bonded(dev) ? 1 : ibdev->num_ports; + for (i = 0; i < num_req_counters; ++i) { + mutex_init(&ibdev->qp1_proxy_lock[i]); + if (mlx4_ib_port_link_layer(&ibdev->ib_dev, i + 1) == + IB_LINK_LAYER_ETHERNET) { + err = mlx4_counter_alloc(ibdev->dev, &ibdev->counters[i]); + if (err) + ibdev->counters[i] = -1; + } else { + ibdev->counters[i] = -1; + } + } + if (mlx4_is_bonded(dev)) + for (i = 1; i < ibdev->num_ports ; ++i) + ibdev->counters[i] = ibdev->counters[0]; + + + mlx4_foreach_port(i, dev, MLX4_PORT_TYPE_IB) + ib_num_ports++; + + spin_lock_init(&ibdev->sm_lock); + mutex_init(&ibdev->cap_mask_mutex); + INIT_LIST_HEAD(&ibdev->qp_list); + spin_lock_init(&ibdev->reset_flow_resource_lock); + + if (ibdev->steering_support == MLX4_STEERING_MODE_DEVICE_MANAGED && + ib_num_ports) { + ibdev->steer_qpn_count = MLX4_IB_UC_MAX_NUM_QPS; + err = mlx4_qp_reserve_range(dev, ibdev->steer_qpn_count, + MLX4_IB_UC_STEER_QPN_ALIGN, + &ibdev->steer_qpn_base, 0); + if (err) + goto err_counter; + + ibdev->ib_uc_qpns_bitmap = + kmalloc(BITS_TO_LONGS(ibdev->steer_qpn_count) * + sizeof(long), + GFP_KERNEL); + if (!ibdev->ib_uc_qpns_bitmap) { + dev_err(&dev->persist->pdev->dev, + "bit map alloc failed\n"); + goto err_steer_qp_release; + } + + bitmap_zero(ibdev->ib_uc_qpns_bitmap, ibdev->steer_qpn_count); + + err = mlx4_FLOW_STEERING_IB_UC_QP_RANGE( + dev, ibdev->steer_qpn_base, + ibdev->steer_qpn_base + + ibdev->steer_qpn_count - 1); + if (err) + goto err_steer_free_bitmap; + } + + for (j = 1; j <= ibdev->dev->caps.num_ports; j++) + atomic64_set(&iboe->mac[j - 1], ibdev->dev->caps.def_mac[j]); + + if (ib_register_device(&ibdev->ib_dev, NULL)) + goto err_steer_free_bitmap; + + if (mlx4_ib_mad_init(ibdev)) + goto err_reg; + + if (mlx4_ib_init_sriov(ibdev)) + goto err_mad; + + if (dev->caps.flags & MLX4_DEV_CAP_FLAG_IBOE) { + if (!iboe->nb.notifier_call) { + iboe->nb.notifier_call = mlx4_ib_netdev_event; + err = register_netdevice_notifier(&iboe->nb); + if (err) { + iboe->nb.notifier_call = NULL; + goto err_notif; + } + } + if (!iboe->nb_inet.notifier_call) { + iboe->nb_inet.notifier_call = mlx4_ib_inet_event; + err = register_inetaddr_notifier(&iboe->nb_inet); + if (err) { + 
iboe->nb_inet.notifier_call = NULL; + goto err_notif; + } + } +#if IS_ENABLED(CONFIG_IPV6) + if (!iboe->nb_inet6.notifier_call) { + iboe->nb_inet6.notifier_call = mlx4_ib_inet6_event; + err = register_inet6addr_notifier(&iboe->nb_inet6); + if (err) { + iboe->nb_inet6.notifier_call = NULL; + goto err_notif; + } + } +#endif + if (mlx4_ib_init_gid_table(ibdev)) + goto err_notif; + } + + for (j = 0; j < ARRAY_SIZE(mlx4_class_attributes); ++j) { + if (device_create_file(&ibdev->ib_dev.dev, + mlx4_class_attributes[j])) + goto err_notif; + } + + ibdev->ib_active = true; + + if (mlx4_is_mfunc(ibdev->dev)) + init_pkeys(ibdev); + + /* create paravirt contexts for any VFs which are active */ + if (mlx4_is_master(ibdev->dev)) { + for (j = 0; j < MLX4_MFUNC_MAX; j++) { + if (j == mlx4_master_func_num(ibdev->dev)) + continue; + if (mlx4_is_slave_active(ibdev->dev, j)) + do_slave_init(ibdev, j, 1); + } + } + return ibdev; + +err_notif: + if (ibdev->iboe.nb.notifier_call) { + if (unregister_netdevice_notifier(&ibdev->iboe.nb)) + pr_warn("failure unregistering notifier\n"); + ibdev->iboe.nb.notifier_call = NULL; + } + if (ibdev->iboe.nb_inet.notifier_call) { + if (unregister_inetaddr_notifier(&ibdev->iboe.nb_inet)) + pr_warn("failure unregistering notifier\n"); + ibdev->iboe.nb_inet.notifier_call = NULL; + } +#if IS_ENABLED(CONFIG_IPV6) + if (ibdev->iboe.nb_inet6.notifier_call) { + if (unregister_inet6addr_notifier(&ibdev->iboe.nb_inet6)) + pr_warn("failure unregistering notifier\n"); + ibdev->iboe.nb_inet6.notifier_call = NULL; + } +#endif + flush_workqueue(wq); + + mlx4_ib_close_sriov(ibdev); + +err_mad: + mlx4_ib_mad_cleanup(ibdev); + +err_reg: + ib_unregister_device(&ibdev->ib_dev); + +err_steer_free_bitmap: + kfree(ibdev->ib_uc_qpns_bitmap); + +err_steer_qp_release: + if (ibdev->steering_support == MLX4_STEERING_MODE_DEVICE_MANAGED) + mlx4_qp_release_range(dev, ibdev->steer_qpn_base, + ibdev->steer_qpn_count); +err_counter: + for (; i; --i) + if (ibdev->counters[i - 1] != -1) + mlx4_counter_free(ibdev->dev, ibdev->counters[i - 1]); + +err_map: + iounmap(ibdev->uar_map); + +err_uar: + mlx4_uar_free(dev, &ibdev->priv_uar); + +err_pd: + mlx4_pd_free(dev, ibdev->priv_pdn); + +err_dealloc: + ib_dealloc_device(&ibdev->ib_dev); + + return NULL; +} + +int mlx4_ib_steer_qp_alloc(struct mlx4_ib_dev *dev, int count, int *qpn) +{ + int offset; + + WARN_ON(!dev->ib_uc_qpns_bitmap); + + offset = bitmap_find_free_region(dev->ib_uc_qpns_bitmap, + dev->steer_qpn_count, + get_count_order(count)); + if (offset < 0) + return offset; + + *qpn = dev->steer_qpn_base + offset; + return 0; +} + +void mlx4_ib_steer_qp_free(struct mlx4_ib_dev *dev, u32 qpn, int count) +{ + if (!qpn || + dev->steering_support != MLX4_STEERING_MODE_DEVICE_MANAGED) + return; + + BUG_ON(qpn < dev->steer_qpn_base); + + bitmap_release_region(dev->ib_uc_qpns_bitmap, + qpn - dev->steer_qpn_base, + get_count_order(count)); +} + +int mlx4_ib_steer_qp_reg(struct mlx4_ib_dev *mdev, struct mlx4_ib_qp *mqp, + int is_attach) +{ + int err; + size_t flow_size; + struct ib_flow_attr *flow = NULL; + struct ib_flow_spec_ib *ib_spec; + + if (is_attach) { + flow_size = sizeof(struct ib_flow_attr) + + sizeof(struct ib_flow_spec_ib); + flow = kzalloc(flow_size, GFP_KERNEL); + if (!flow) + return -ENOMEM; + flow->port = mqp->port; + flow->num_of_specs = 1; + flow->size = flow_size; + ib_spec = (struct ib_flow_spec_ib *)(flow + 1); + ib_spec->type = IB_FLOW_SPEC_IB; + ib_spec->size = sizeof(struct ib_flow_spec_ib); + /* Add an empty rule for IB L2 */ + 
memset(&ib_spec->mask, 0, sizeof(ib_spec->mask)); + + err = __mlx4_ib_create_flow(&mqp->ibqp, flow, + IB_FLOW_DOMAIN_NIC, + MLX4_FS_REGULAR, + &mqp->reg_id); + } else { + err = __mlx4_ib_destroy_flow(mdev->dev, mqp->reg_id); + } + kfree(flow); + return err; +} + +static void mlx4_ib_remove(struct mlx4_dev *dev, void *ibdev_ptr) +{ + struct mlx4_ib_dev *ibdev = ibdev_ptr; + int p; + + ibdev->ib_active = false; + flush_workqueue(wq); + + mlx4_ib_close_sriov(ibdev); + mlx4_ib_mad_cleanup(ibdev); + ib_unregister_device(&ibdev->ib_dev); + if (ibdev->iboe.nb.notifier_call) { + if (unregister_netdevice_notifier(&ibdev->iboe.nb)) + pr_warn("failure unregistering notifier\n"); + ibdev->iboe.nb.notifier_call = NULL; + } + + if (ibdev->steering_support == MLX4_STEERING_MODE_DEVICE_MANAGED) { + mlx4_qp_release_range(dev, ibdev->steer_qpn_base, + ibdev->steer_qpn_count); + kfree(ibdev->ib_uc_qpns_bitmap); + } + + if (ibdev->iboe.nb_inet.notifier_call) { + if (unregister_inetaddr_notifier(&ibdev->iboe.nb_inet)) + pr_warn("failure unregistering notifier\n"); + ibdev->iboe.nb_inet.notifier_call = NULL; + } +#if IS_ENABLED(CONFIG_IPV6) + if (ibdev->iboe.nb_inet6.notifier_call) { + if (unregister_inet6addr_notifier(&ibdev->iboe.nb_inet6)) + pr_warn("failure unregistering notifier\n"); + ibdev->iboe.nb_inet6.notifier_call = NULL; + } +#endif + + iounmap(ibdev->uar_map); + for (p = 0; p < ibdev->num_ports; ++p) + if (ibdev->counters[p] != -1) + mlx4_counter_free(ibdev->dev, ibdev->counters[p]); + mlx4_foreach_port(p, dev, MLX4_PORT_TYPE_IB) + mlx4_CLOSE_PORT(dev, p); + + mlx4_ib_free_eqs(dev, ibdev); + + mlx4_uar_free(dev, &ibdev->priv_uar); + mlx4_pd_free(dev, ibdev->priv_pdn); + ib_dealloc_device(&ibdev->ib_dev); +} + +static void do_slave_init(struct mlx4_ib_dev *ibdev, int slave, int do_init) +{ + struct mlx4_ib_demux_work **dm = NULL; + struct mlx4_dev *dev = ibdev->dev; + int i; + unsigned long flags; + struct mlx4_active_ports actv_ports; + unsigned int ports; + unsigned int first_port; + + if (!mlx4_is_master(dev)) + return; + + actv_ports = mlx4_get_active_ports(dev, slave); + ports = bitmap_weight(actv_ports.ports, dev->caps.num_ports); + first_port = find_first_bit(actv_ports.ports, dev->caps.num_ports); + + dm = kcalloc(ports, sizeof(*dm), GFP_ATOMIC); + if (!dm) { + pr_err("failed to allocate memory for tunneling qp update\n"); + goto out; + } + + for (i = 0; i < ports; i++) { + dm[i] = kmalloc(sizeof (struct mlx4_ib_demux_work), GFP_ATOMIC); + if (!dm[i]) { + pr_err("failed to allocate memory for tunneling qp update work struct\n"); + for (i = 0; i < dev->caps.num_ports; i++) { + if (dm[i]) + kfree(dm[i]); + } + goto out; + } + } + /* initialize or tear down tunnel QPs for the slave */ + for (i = 0; i < ports; i++) { + INIT_WORK(&dm[i]->work, mlx4_ib_tunnels_update_work); + dm[i]->port = first_port + i + 1; + dm[i]->slave = slave; + dm[i]->do_init = do_init; + dm[i]->dev = ibdev; + spin_lock_irqsave(&ibdev->sriov.going_down_lock, flags); + if (!ibdev->sriov.is_going_down) + queue_work(ibdev->sriov.demux[i].ud_wq, &dm[i]->work); + spin_unlock_irqrestore(&ibdev->sriov.going_down_lock, flags); + } +out: + kfree(dm); + return; +} + +static void mlx4_ib_handle_catas_error(struct mlx4_ib_dev *ibdev) +{ + struct mlx4_ib_qp *mqp; + unsigned long flags_qp; + unsigned long flags_cq; + struct mlx4_ib_cq *send_mcq, *recv_mcq; + struct list_head cq_notify_list; + struct mlx4_cq *mcq; + unsigned long flags; + + pr_warn("mlx4_ib_handle_catas_error was started\n"); + INIT_LIST_HEAD(&cq_notify_list); + + /* Go 
over qp list reside on that ibdev, sync with create/destroy qp.*/ + spin_lock_irqsave(&ibdev->reset_flow_resource_lock, flags); + + list_for_each_entry(mqp, &ibdev->qp_list, qps_list) { + spin_lock_irqsave(&mqp->sq.lock, flags_qp); + if (mqp->sq.tail != mqp->sq.head) { + send_mcq = to_mcq(mqp->ibqp.send_cq); + spin_lock_irqsave(&send_mcq->lock, flags_cq); + if (send_mcq->mcq.comp && + mqp->ibqp.send_cq->comp_handler) { + if (!send_mcq->mcq.reset_notify_added) { + send_mcq->mcq.reset_notify_added = 1; + list_add_tail(&send_mcq->mcq.reset_notify, + &cq_notify_list); + } + } + spin_unlock_irqrestore(&send_mcq->lock, flags_cq); + } + spin_unlock_irqrestore(&mqp->sq.lock, flags_qp); + /* Now, handle the QP's receive queue */ + spin_lock_irqsave(&mqp->rq.lock, flags_qp); + /* no handling is needed for SRQ */ + if (!mqp->ibqp.srq) { + if (mqp->rq.tail != mqp->rq.head) { + recv_mcq = to_mcq(mqp->ibqp.recv_cq); + spin_lock_irqsave(&recv_mcq->lock, flags_cq); + if (recv_mcq->mcq.comp && + mqp->ibqp.recv_cq->comp_handler) { + if (!recv_mcq->mcq.reset_notify_added) { + recv_mcq->mcq.reset_notify_added = 1; + list_add_tail(&recv_mcq->mcq.reset_notify, + &cq_notify_list); + } + } + spin_unlock_irqrestore(&recv_mcq->lock, + flags_cq); + } + } + spin_unlock_irqrestore(&mqp->rq.lock, flags_qp); + } + + list_for_each_entry(mcq, &cq_notify_list, reset_notify) { + mcq->comp(mcq); + } + spin_unlock_irqrestore(&ibdev->reset_flow_resource_lock, flags); + pr_warn("mlx4_ib_handle_catas_error ended\n"); +} + +static void handle_bonded_port_state_event(struct work_struct *work) +{ + struct ib_event_work *ew = + container_of(work, struct ib_event_work, work); + struct mlx4_ib_dev *ibdev = ew->ib_dev; + enum ib_port_state bonded_port_state = IB_PORT_NOP; + int i; + struct ib_event ibev; + + kfree(ew); + spin_lock_bh(&ibdev->iboe.lock); + for (i = 0; i < MLX4_MAX_PORTS; ++i) { + struct net_device *curr_netdev = ibdev->iboe.netdevs[i]; + enum ib_port_state curr_port_state; + + if (!curr_netdev) + continue; + + curr_port_state = + (netif_running(curr_netdev) && + netif_carrier_ok(curr_netdev)) ? + IB_PORT_ACTIVE : IB_PORT_DOWN; + + bonded_port_state = (bonded_port_state != IB_PORT_ACTIVE) ? + curr_port_state : IB_PORT_ACTIVE; + } + spin_unlock_bh(&ibdev->iboe.lock); + + ibev.device = &ibdev->ib_dev; + ibev.element.port_num = 1; + ibev.event = (bonded_port_state == IB_PORT_ACTIVE) ? 
+ IB_EVENT_PORT_ACTIVE : IB_EVENT_PORT_ERR; + + ib_dispatch_event(&ibev); +} + +static void mlx4_ib_event(struct mlx4_dev *dev, void *ibdev_ptr, + enum mlx4_dev_event event, unsigned long param) +{ + struct ib_event ibev; + struct mlx4_ib_dev *ibdev = to_mdev((struct ib_device *) ibdev_ptr); + struct mlx4_eqe *eqe = NULL; + struct ib_event_work *ew; + int p = 0; + + if (mlx4_is_bonded(dev) && + ((event == MLX4_DEV_EVENT_PORT_UP) || + (event == MLX4_DEV_EVENT_PORT_DOWN))) { + ew = kmalloc(sizeof(*ew), GFP_ATOMIC); + if (!ew) + return; + INIT_WORK(&ew->work, handle_bonded_port_state_event); + ew->ib_dev = ibdev; + queue_work(wq, &ew->work); + return; + } + + if (event == MLX4_DEV_EVENT_PORT_MGMT_CHANGE) + eqe = (struct mlx4_eqe *)param; + else + p = (int) param; + + switch (event) { + case MLX4_DEV_EVENT_PORT_UP: + if (p > ibdev->num_ports) + return; + if (mlx4_is_master(dev) && + rdma_port_get_link_layer(&ibdev->ib_dev, p) == + IB_LINK_LAYER_INFINIBAND) { + mlx4_ib_invalidate_all_guid_record(ibdev, p); + } + ibev.event = IB_EVENT_PORT_ACTIVE; + break; + + case MLX4_DEV_EVENT_PORT_DOWN: + if (p > ibdev->num_ports) + return; + ibev.event = IB_EVENT_PORT_ERR; + break; + + case MLX4_DEV_EVENT_CATASTROPHIC_ERROR: + ibdev->ib_active = false; + ibev.event = IB_EVENT_DEVICE_FATAL; + mlx4_ib_handle_catas_error(ibdev); + break; + + case MLX4_DEV_EVENT_PORT_MGMT_CHANGE: + ew = kmalloc(sizeof *ew, GFP_ATOMIC); + if (!ew) { + pr_err("failed to allocate memory for events work\n"); + break; + } + + INIT_WORK(&ew->work, handle_port_mgmt_change_event); + memcpy(&ew->ib_eqe, eqe, sizeof *eqe); + ew->ib_dev = ibdev; + /* need to queue only for port owner, which uses GEN_EQE */ + if (mlx4_is_master(dev)) + queue_work(wq, &ew->work); + else + handle_port_mgmt_change_event(&ew->work); + return; + + case MLX4_DEV_EVENT_SLAVE_INIT: + /* here, p is the slave id */ + do_slave_init(ibdev, p, 1); + if (mlx4_is_master(dev)) { + int i; + + for (i = 1; i <= ibdev->num_ports; i++) { + if (rdma_port_get_link_layer(&ibdev->ib_dev, i) + == IB_LINK_LAYER_INFINIBAND) + mlx4_ib_slave_alias_guid_event(ibdev, + p, i, + 1); + } + } + return; + + case MLX4_DEV_EVENT_SLAVE_SHUTDOWN: + if (mlx4_is_master(dev)) { + int i; + + for (i = 1; i <= ibdev->num_ports; i++) { + if (rdma_port_get_link_layer(&ibdev->ib_dev, i) + == IB_LINK_LAYER_INFINIBAND) + mlx4_ib_slave_alias_guid_event(ibdev, + p, i, + 0); + } + } + /* here, p is the slave id */ + do_slave_init(ibdev, p, 0); + return; + + default: + return; + } + + ibev.device = ibdev_ptr; + ibev.element.port_num = mlx4_is_bonded(ibdev->dev) ? 
1 : (u8)p; + + ib_dispatch_event(&ibev); +} + +static struct mlx4_interface mlx4_ib_interface = { + .add = mlx4_ib_add, + .remove = mlx4_ib_remove, + .event = mlx4_ib_event, + .protocol = MLX4_PROT_IB_IPV6, + .flags = MLX4_INTFF_BONDING +}; + +static int __init mlx4_ib_init(void) +{ + int err; + + wq = create_singlethread_workqueue("mlx4_ib"); + if (!wq) + return -ENOMEM; + + err = mlx4_ib_mcg_init(); + if (err) + goto clean_wq; + + err = mlx4_register_interface(&mlx4_ib_interface); + if (err) + goto clean_mcg; + + return 0; + +clean_mcg: + mlx4_ib_mcg_destroy(); + +clean_wq: + destroy_workqueue(wq); + return err; +} + +static void __exit mlx4_ib_cleanup(void) +{ + mlx4_unregister_interface(&mlx4_ib_interface); + mlx4_ib_mcg_destroy(); + destroy_workqueue(wq); +} + +module_init(mlx4_ib_init); +module_exit(mlx4_ib_cleanup); diff --git a/kernel/drivers/infiniband/hw/mlx4/mcg.c b/kernel/drivers/infiniband/hw/mlx4/mcg.c new file mode 100644 index 000000000..ed327e6c8 --- /dev/null +++ b/kernel/drivers/infiniband/hw/mlx4/mcg.c @@ -0,0 +1,1257 @@ +/* + * Copyright (c) 2012 Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include <rdma/ib_mad.h> +#include <rdma/ib_smi.h> +#include <rdma/ib_cache.h> +#include <rdma/ib_sa.h> + +#include <linux/mlx4/cmd.h> +#include <linux/rbtree.h> +#include <linux/delay.h> + +#include "mlx4_ib.h" + +#define MAX_VFS 80 +#define MAX_PEND_REQS_PER_FUNC 4 +#define MAD_TIMEOUT_MS 2000 + +#define mcg_warn(fmt, arg...) pr_warn("MCG WARNING: " fmt, ##arg) +#define mcg_error(fmt, arg...) pr_err(fmt, ##arg) +#define mcg_warn_group(group, format, arg...) \ + pr_warn("%s-%d: %16s (port %d): WARNING: " format, __func__, __LINE__,\ + (group)->name, group->demux->port, ## arg) + +#define mcg_error_group(group, format, arg...) 
\ + pr_err(" %16s: " format, (group)->name, ## arg) + + +static union ib_gid mgid0; + +static struct workqueue_struct *clean_wq; + +enum mcast_state { + MCAST_NOT_MEMBER = 0, + MCAST_MEMBER, +}; + +enum mcast_group_state { + MCAST_IDLE, + MCAST_JOIN_SENT, + MCAST_LEAVE_SENT, + MCAST_RESP_READY +}; + +struct mcast_member { + enum mcast_state state; + uint8_t join_state; + int num_pend_reqs; + struct list_head pending; +}; + +struct ib_sa_mcmember_data { + union ib_gid mgid; + union ib_gid port_gid; + __be32 qkey; + __be16 mlid; + u8 mtusel_mtu; + u8 tclass; + __be16 pkey; + u8 ratesel_rate; + u8 lifetmsel_lifetm; + __be32 sl_flowlabel_hoplimit; + u8 scope_join_state; + u8 proxy_join; + u8 reserved[2]; +}; + +struct mcast_group { + struct ib_sa_mcmember_data rec; + struct rb_node node; + struct list_head mgid0_list; + struct mlx4_ib_demux_ctx *demux; + struct mcast_member func[MAX_VFS]; + struct mutex lock; + struct work_struct work; + struct list_head pending_list; + int members[3]; + enum mcast_group_state state; + enum mcast_group_state prev_state; + struct ib_sa_mad response_sa_mad; + __be64 last_req_tid; + + char name[33]; /* MGID string */ + struct device_attribute dentry; + + /* refcount is the reference count for the following: + 1. Each queued request + 2. Each invocation of the worker thread + 3. Membership of the port at the SA + */ + atomic_t refcount; + + /* delayed work to clean pending SM request */ + struct delayed_work timeout_work; + struct list_head cleanup_list; +}; + +struct mcast_req { + int func; + struct ib_sa_mad sa_mad; + struct list_head group_list; + struct list_head func_list; + struct mcast_group *group; + int clean; +}; + + +#define safe_atomic_dec(ref) \ + do {\ + if (atomic_dec_and_test(ref)) \ + mcg_warn_group(group, "did not expect to reach zero\n"); \ + } while (0) + +static const char *get_state_string(enum mcast_group_state state) +{ + switch (state) { + case MCAST_IDLE: + return "MCAST_IDLE"; + case MCAST_JOIN_SENT: + return "MCAST_JOIN_SENT"; + case MCAST_LEAVE_SENT: + return "MCAST_LEAVE_SENT"; + case MCAST_RESP_READY: + return "MCAST_RESP_READY"; + } + return "Invalid State"; +} + +static struct mcast_group *mcast_find(struct mlx4_ib_demux_ctx *ctx, + union ib_gid *mgid) +{ + struct rb_node *node = ctx->mcg_table.rb_node; + struct mcast_group *group; + int ret; + + while (node) { + group = rb_entry(node, struct mcast_group, node); + ret = memcmp(mgid->raw, group->rec.mgid.raw, sizeof *mgid); + if (!ret) + return group; + + if (ret < 0) + node = node->rb_left; + else + node = node->rb_right; + } + return NULL; +} + +static struct mcast_group *mcast_insert(struct mlx4_ib_demux_ctx *ctx, + struct mcast_group *group) +{ + struct rb_node **link = &ctx->mcg_table.rb_node; + struct rb_node *parent = NULL; + struct mcast_group *cur_group; + int ret; + + while (*link) { + parent = *link; + cur_group = rb_entry(parent, struct mcast_group, node); + + ret = memcmp(group->rec.mgid.raw, cur_group->rec.mgid.raw, + sizeof group->rec.mgid); + if (ret < 0) + link = &(*link)->rb_left; + else if (ret > 0) + link = &(*link)->rb_right; + else + return cur_group; + } + rb_link_node(&group->node, parent, link); + rb_insert_color(&group->node, &ctx->mcg_table); + return NULL; +} + +static int send_mad_to_wire(struct mlx4_ib_demux_ctx *ctx, struct ib_mad *mad) +{ + struct mlx4_ib_dev *dev = ctx->dev; + struct ib_ah_attr ah_attr; + + spin_lock(&dev->sm_lock); + if (!dev->sm_ah[ctx->port - 1]) { + /* port is not yet Active, sm_ah not ready */ + spin_unlock(&dev->sm_lock); + 
return -EAGAIN; + } + mlx4_ib_query_ah(dev->sm_ah[ctx->port - 1], &ah_attr); + spin_unlock(&dev->sm_lock); + return mlx4_ib_send_to_wire(dev, mlx4_master_func_num(dev->dev), + ctx->port, IB_QPT_GSI, 0, 1, IB_QP1_QKEY, + &ah_attr, NULL, mad); +} + +static int send_mad_to_slave(int slave, struct mlx4_ib_demux_ctx *ctx, + struct ib_mad *mad) +{ + struct mlx4_ib_dev *dev = ctx->dev; + struct ib_mad_agent *agent = dev->send_agent[ctx->port - 1][1]; + struct ib_wc wc; + struct ib_ah_attr ah_attr; + + /* Our agent might not yet be registered when mads start to arrive */ + if (!agent) + return -EAGAIN; + + ib_query_ah(dev->sm_ah[ctx->port - 1], &ah_attr); + + if (ib_find_cached_pkey(&dev->ib_dev, ctx->port, IB_DEFAULT_PKEY_FULL, &wc.pkey_index)) + return -EINVAL; + wc.sl = 0; + wc.dlid_path_bits = 0; + wc.port_num = ctx->port; + wc.slid = ah_attr.dlid; /* opensm lid */ + wc.src_qp = 1; + return mlx4_ib_send_to_slave(dev, slave, ctx->port, IB_QPT_GSI, &wc, NULL, mad); +} + +static int send_join_to_wire(struct mcast_group *group, struct ib_sa_mad *sa_mad) +{ + struct ib_sa_mad mad; + struct ib_sa_mcmember_data *sa_mad_data = (struct ib_sa_mcmember_data *)&mad.data; + int ret; + + /* we rely on a mad request as arrived from a VF */ + memcpy(&mad, sa_mad, sizeof mad); + + /* fix port GID to be the real one (slave 0) */ + sa_mad_data->port_gid.global.interface_id = group->demux->guid_cache[0]; + + /* assign our own TID */ + mad.mad_hdr.tid = mlx4_ib_get_new_demux_tid(group->demux); + group->last_req_tid = mad.mad_hdr.tid; /* keep it for later validation */ + + ret = send_mad_to_wire(group->demux, (struct ib_mad *)&mad); + /* set timeout handler */ + if (!ret) { + /* calls mlx4_ib_mcg_timeout_handler */ + queue_delayed_work(group->demux->mcg_wq, &group->timeout_work, + msecs_to_jiffies(MAD_TIMEOUT_MS)); + } + + return ret; +} + +static int send_leave_to_wire(struct mcast_group *group, u8 join_state) +{ + struct ib_sa_mad mad; + struct ib_sa_mcmember_data *sa_data = (struct ib_sa_mcmember_data *)&mad.data; + int ret; + + memset(&mad, 0, sizeof mad); + mad.mad_hdr.base_version = 1; + mad.mad_hdr.mgmt_class = IB_MGMT_CLASS_SUBN_ADM; + mad.mad_hdr.class_version = 2; + mad.mad_hdr.method = IB_SA_METHOD_DELETE; + mad.mad_hdr.status = cpu_to_be16(0); + mad.mad_hdr.class_specific = cpu_to_be16(0); + mad.mad_hdr.tid = mlx4_ib_get_new_demux_tid(group->demux); + group->last_req_tid = mad.mad_hdr.tid; /* keep it for later validation */ + mad.mad_hdr.attr_id = cpu_to_be16(IB_SA_ATTR_MC_MEMBER_REC); + mad.mad_hdr.attr_mod = cpu_to_be32(0); + mad.sa_hdr.sm_key = 0x0; + mad.sa_hdr.attr_offset = cpu_to_be16(7); + mad.sa_hdr.comp_mask = IB_SA_MCMEMBER_REC_MGID | + IB_SA_MCMEMBER_REC_PORT_GID | IB_SA_MCMEMBER_REC_JOIN_STATE; + + *sa_data = group->rec; + sa_data->scope_join_state = join_state; + + ret = send_mad_to_wire(group->demux, (struct ib_mad *)&mad); + if (ret) + group->state = MCAST_IDLE; + + /* set timeout handler */ + if (!ret) { + /* calls mlx4_ib_mcg_timeout_handler */ + queue_delayed_work(group->demux->mcg_wq, &group->timeout_work, + msecs_to_jiffies(MAD_TIMEOUT_MS)); + } + + return ret; +} + +static int send_reply_to_slave(int slave, struct mcast_group *group, + struct ib_sa_mad *req_sa_mad, u16 status) +{ + struct ib_sa_mad mad; + struct ib_sa_mcmember_data *sa_data = (struct ib_sa_mcmember_data *)&mad.data; + struct ib_sa_mcmember_data *req_sa_data = (struct ib_sa_mcmember_data *)&req_sa_mad->data; + int ret; + + memset(&mad, 0, sizeof mad); + mad.mad_hdr.base_version = 1; + mad.mad_hdr.mgmt_class = 
IB_MGMT_CLASS_SUBN_ADM; + mad.mad_hdr.class_version = 2; + mad.mad_hdr.method = IB_MGMT_METHOD_GET_RESP; + mad.mad_hdr.status = cpu_to_be16(status); + mad.mad_hdr.class_specific = cpu_to_be16(0); + mad.mad_hdr.tid = req_sa_mad->mad_hdr.tid; + *(u8 *)&mad.mad_hdr.tid = 0; /* resetting tid to 0 */ + mad.mad_hdr.attr_id = cpu_to_be16(IB_SA_ATTR_MC_MEMBER_REC); + mad.mad_hdr.attr_mod = cpu_to_be32(0); + mad.sa_hdr.sm_key = req_sa_mad->sa_hdr.sm_key; + mad.sa_hdr.attr_offset = cpu_to_be16(7); + mad.sa_hdr.comp_mask = 0; /* ignored on responses, see IBTA spec */ + + *sa_data = group->rec; + + /* reconstruct VF's requested join_state and port_gid */ + sa_data->scope_join_state &= 0xf0; + sa_data->scope_join_state |= (group->func[slave].join_state & 0x0f); + memcpy(&sa_data->port_gid, &req_sa_data->port_gid, sizeof req_sa_data->port_gid); + + ret = send_mad_to_slave(slave, group->demux, (struct ib_mad *)&mad); + return ret; +} + +static int check_selector(ib_sa_comp_mask comp_mask, + ib_sa_comp_mask selector_mask, + ib_sa_comp_mask value_mask, + u8 src_value, u8 dst_value) +{ + int err; + u8 selector = dst_value >> 6; + dst_value &= 0x3f; + src_value &= 0x3f; + + if (!(comp_mask & selector_mask) || !(comp_mask & value_mask)) + return 0; + + switch (selector) { + case IB_SA_GT: + err = (src_value <= dst_value); + break; + case IB_SA_LT: + err = (src_value >= dst_value); + break; + case IB_SA_EQ: + err = (src_value != dst_value); + break; + default: + err = 0; + break; + } + + return err; +} + +static u16 cmp_rec(struct ib_sa_mcmember_data *src, + struct ib_sa_mcmember_data *dst, ib_sa_comp_mask comp_mask) +{ + /* src is group record, dst is request record */ + /* MGID must already match */ + /* Port_GID we always replace to our Port_GID, so it is a match */ + +#define MAD_STATUS_REQ_INVALID 0x0200 + if (comp_mask & IB_SA_MCMEMBER_REC_QKEY && src->qkey != dst->qkey) + return MAD_STATUS_REQ_INVALID; + if (comp_mask & IB_SA_MCMEMBER_REC_MLID && src->mlid != dst->mlid) + return MAD_STATUS_REQ_INVALID; + if (check_selector(comp_mask, IB_SA_MCMEMBER_REC_MTU_SELECTOR, + IB_SA_MCMEMBER_REC_MTU, + src->mtusel_mtu, dst->mtusel_mtu)) + return MAD_STATUS_REQ_INVALID; + if (comp_mask & IB_SA_MCMEMBER_REC_TRAFFIC_CLASS && + src->tclass != dst->tclass) + return MAD_STATUS_REQ_INVALID; + if (comp_mask & IB_SA_MCMEMBER_REC_PKEY && src->pkey != dst->pkey) + return MAD_STATUS_REQ_INVALID; + if (check_selector(comp_mask, IB_SA_MCMEMBER_REC_RATE_SELECTOR, + IB_SA_MCMEMBER_REC_RATE, + src->ratesel_rate, dst->ratesel_rate)) + return MAD_STATUS_REQ_INVALID; + if (check_selector(comp_mask, + IB_SA_MCMEMBER_REC_PACKET_LIFE_TIME_SELECTOR, + IB_SA_MCMEMBER_REC_PACKET_LIFE_TIME, + src->lifetmsel_lifetm, dst->lifetmsel_lifetm)) + return MAD_STATUS_REQ_INVALID; + if (comp_mask & IB_SA_MCMEMBER_REC_SL && + (be32_to_cpu(src->sl_flowlabel_hoplimit) & 0xf0000000) != + (be32_to_cpu(dst->sl_flowlabel_hoplimit) & 0xf0000000)) + return MAD_STATUS_REQ_INVALID; + if (comp_mask & IB_SA_MCMEMBER_REC_FLOW_LABEL && + (be32_to_cpu(src->sl_flowlabel_hoplimit) & 0x0fffff00) != + (be32_to_cpu(dst->sl_flowlabel_hoplimit) & 0x0fffff00)) + return MAD_STATUS_REQ_INVALID; + if (comp_mask & IB_SA_MCMEMBER_REC_HOP_LIMIT && + (be32_to_cpu(src->sl_flowlabel_hoplimit) & 0x000000ff) != + (be32_to_cpu(dst->sl_flowlabel_hoplimit) & 0x000000ff)) + return MAD_STATUS_REQ_INVALID; + if (comp_mask & IB_SA_MCMEMBER_REC_SCOPE && + (src->scope_join_state & 0xf0) != + (dst->scope_join_state & 0xf0)) + return MAD_STATUS_REQ_INVALID; + + /* join_state checked 
separately, proxy_join ignored */ + + return 0; +} + +/* release group, return 1 if this was last release and group is destroyed + * timout work is canceled sync */ +static int release_group(struct mcast_group *group, int from_timeout_handler) +{ + struct mlx4_ib_demux_ctx *ctx = group->demux; + int nzgroup; + + mutex_lock(&ctx->mcg_table_lock); + mutex_lock(&group->lock); + if (atomic_dec_and_test(&group->refcount)) { + if (!from_timeout_handler) { + if (group->state != MCAST_IDLE && + !cancel_delayed_work(&group->timeout_work)) { + atomic_inc(&group->refcount); + mutex_unlock(&group->lock); + mutex_unlock(&ctx->mcg_table_lock); + return 0; + } + } + + nzgroup = memcmp(&group->rec.mgid, &mgid0, sizeof mgid0); + if (nzgroup) + del_sysfs_port_mcg_attr(ctx->dev, ctx->port, &group->dentry.attr); + if (!list_empty(&group->pending_list)) + mcg_warn_group(group, "releasing a group with non empty pending list\n"); + if (nzgroup) + rb_erase(&group->node, &ctx->mcg_table); + list_del_init(&group->mgid0_list); + mutex_unlock(&group->lock); + mutex_unlock(&ctx->mcg_table_lock); + kfree(group); + return 1; + } else { + mutex_unlock(&group->lock); + mutex_unlock(&ctx->mcg_table_lock); + } + return 0; +} + +static void adjust_membership(struct mcast_group *group, u8 join_state, int inc) +{ + int i; + + for (i = 0; i < 3; i++, join_state >>= 1) + if (join_state & 0x1) + group->members[i] += inc; +} + +static u8 get_leave_state(struct mcast_group *group) +{ + u8 leave_state = 0; + int i; + + for (i = 0; i < 3; i++) + if (!group->members[i]) + leave_state |= (1 << i); + + return leave_state & (group->rec.scope_join_state & 7); +} + +static int join_group(struct mcast_group *group, int slave, u8 join_mask) +{ + int ret = 0; + u8 join_state; + + /* remove bits that slave is already member of, and adjust */ + join_state = join_mask & (~group->func[slave].join_state); + adjust_membership(group, join_state, 1); + group->func[slave].join_state |= join_state; + if (group->func[slave].state != MCAST_MEMBER && join_state) { + group->func[slave].state = MCAST_MEMBER; + ret = 1; + } + return ret; +} + +static int leave_group(struct mcast_group *group, int slave, u8 leave_state) +{ + int ret = 0; + + adjust_membership(group, leave_state, -1); + group->func[slave].join_state &= ~leave_state; + if (!group->func[slave].join_state) { + group->func[slave].state = MCAST_NOT_MEMBER; + ret = 1; + } + return ret; +} + +static int check_leave(struct mcast_group *group, int slave, u8 leave_mask) +{ + if (group->func[slave].state != MCAST_MEMBER) + return MAD_STATUS_REQ_INVALID; + + /* make sure we're not deleting unset bits */ + if (~group->func[slave].join_state & leave_mask) + return MAD_STATUS_REQ_INVALID; + + if (!leave_mask) + return MAD_STATUS_REQ_INVALID; + + return 0; +} + +static void mlx4_ib_mcg_timeout_handler(struct work_struct *work) +{ + struct delayed_work *delay = to_delayed_work(work); + struct mcast_group *group; + struct mcast_req *req = NULL; + + group = container_of(delay, typeof(*group), timeout_work); + + mutex_lock(&group->lock); + if (group->state == MCAST_JOIN_SENT) { + if (!list_empty(&group->pending_list)) { + req = list_first_entry(&group->pending_list, struct mcast_req, group_list); + list_del(&req->group_list); + list_del(&req->func_list); + --group->func[req->func].num_pend_reqs; + mutex_unlock(&group->lock); + kfree(req); + if (memcmp(&group->rec.mgid, &mgid0, sizeof mgid0)) { + if (release_group(group, 1)) + return; + } else { + kfree(group); + return; + } + mutex_lock(&group->lock); + } else + 
mcg_warn_group(group, "DRIVER BUG\n"); + } else if (group->state == MCAST_LEAVE_SENT) { + if (group->rec.scope_join_state & 7) + group->rec.scope_join_state &= 0xf8; + group->state = MCAST_IDLE; + mutex_unlock(&group->lock); + if (release_group(group, 1)) + return; + mutex_lock(&group->lock); + } else + mcg_warn_group(group, "invalid state %s\n", get_state_string(group->state)); + group->state = MCAST_IDLE; + atomic_inc(&group->refcount); + if (!queue_work(group->demux->mcg_wq, &group->work)) + safe_atomic_dec(&group->refcount); + + mutex_unlock(&group->lock); +} + +static int handle_leave_req(struct mcast_group *group, u8 leave_mask, + struct mcast_req *req) +{ + u16 status; + + if (req->clean) + leave_mask = group->func[req->func].join_state; + + status = check_leave(group, req->func, leave_mask); + if (!status) + leave_group(group, req->func, leave_mask); + + if (!req->clean) + send_reply_to_slave(req->func, group, &req->sa_mad, status); + --group->func[req->func].num_pend_reqs; + list_del(&req->group_list); + list_del(&req->func_list); + kfree(req); + return 1; +} + +static int handle_join_req(struct mcast_group *group, u8 join_mask, + struct mcast_req *req) +{ + u8 group_join_state = group->rec.scope_join_state & 7; + int ref = 0; + u16 status; + struct ib_sa_mcmember_data *sa_data = (struct ib_sa_mcmember_data *)req->sa_mad.data; + + if (join_mask == (group_join_state & join_mask)) { + /* port's membership need not change */ + status = cmp_rec(&group->rec, sa_data, req->sa_mad.sa_hdr.comp_mask); + if (!status) + join_group(group, req->func, join_mask); + + --group->func[req->func].num_pend_reqs; + send_reply_to_slave(req->func, group, &req->sa_mad, status); + list_del(&req->group_list); + list_del(&req->func_list); + kfree(req); + ++ref; + } else { + /* port's membership needs to be updated */ + group->prev_state = group->state; + if (send_join_to_wire(group, &req->sa_mad)) { + --group->func[req->func].num_pend_reqs; + list_del(&req->group_list); + list_del(&req->func_list); + kfree(req); + ref = 1; + group->state = group->prev_state; + } else + group->state = MCAST_JOIN_SENT; + } + + return ref; +} + +static void mlx4_ib_mcg_work_handler(struct work_struct *work) +{ + struct mcast_group *group; + struct mcast_req *req = NULL; + struct ib_sa_mcmember_data *sa_data; + u8 req_join_state; + int rc = 1; /* release_count - this is for the scheduled work */ + u16 status; + u8 method; + + group = container_of(work, typeof(*group), work); + + mutex_lock(&group->lock); + + /* First, let's see if a response from SM is waiting regarding this group. + * If so, we need to update the group's REC. If this is a bad response, we + * may need to send a bad response to a VF waiting for it. If VF is waiting + * and this is a good response, the VF will be answered later in this func. */ + if (group->state == MCAST_RESP_READY) { + /* cancels mlx4_ib_mcg_timeout_handler */ + cancel_delayed_work(&group->timeout_work); + status = be16_to_cpu(group->response_sa_mad.mad_hdr.status); + method = group->response_sa_mad.mad_hdr.method; + if (group->last_req_tid != group->response_sa_mad.mad_hdr.tid) { + mcg_warn_group(group, "Got MAD response to existing MGID but wrong TID, dropping. 
Resp TID=%llx, group TID=%llx\n", + be64_to_cpu(group->response_sa_mad.mad_hdr.tid), + be64_to_cpu(group->last_req_tid)); + group->state = group->prev_state; + goto process_requests; + } + if (status) { + if (!list_empty(&group->pending_list)) + req = list_first_entry(&group->pending_list, + struct mcast_req, group_list); + if ((method == IB_MGMT_METHOD_GET_RESP)) { + if (req) { + send_reply_to_slave(req->func, group, &req->sa_mad, status); + --group->func[req->func].num_pend_reqs; + list_del(&req->group_list); + list_del(&req->func_list); + kfree(req); + ++rc; + } else + mcg_warn_group(group, "no request for failed join\n"); + } else if (method == IB_SA_METHOD_DELETE_RESP && group->demux->flushing) + ++rc; + } else { + u8 resp_join_state; + u8 cur_join_state; + + resp_join_state = ((struct ib_sa_mcmember_data *) + group->response_sa_mad.data)->scope_join_state & 7; + cur_join_state = group->rec.scope_join_state & 7; + + if (method == IB_MGMT_METHOD_GET_RESP) { + /* successfull join */ + if (!cur_join_state && resp_join_state) + --rc; + } else if (!resp_join_state) + ++rc; + memcpy(&group->rec, group->response_sa_mad.data, sizeof group->rec); + } + group->state = MCAST_IDLE; + } + +process_requests: + /* We should now go over pending join/leave requests, as long as we are idle. */ + while (!list_empty(&group->pending_list) && group->state == MCAST_IDLE) { + req = list_first_entry(&group->pending_list, struct mcast_req, + group_list); + sa_data = (struct ib_sa_mcmember_data *)req->sa_mad.data; + req_join_state = sa_data->scope_join_state & 0x7; + + /* For a leave request, we will immediately answer the VF, and + * update our internal counters. The actual leave will be sent + * to SM later, if at all needed. We dequeue the request now. */ + if (req->sa_mad.mad_hdr.method == IB_SA_METHOD_DELETE) + rc += handle_leave_req(group, req_join_state, req); + else + rc += handle_join_req(group, req_join_state, req); + } + + /* Handle leaves */ + if (group->state == MCAST_IDLE) { + req_join_state = get_leave_state(group); + if (req_join_state) { + group->rec.scope_join_state &= ~req_join_state; + group->prev_state = group->state; + if (send_leave_to_wire(group, req_join_state)) { + group->state = group->prev_state; + ++rc; + } else + group->state = MCAST_LEAVE_SENT; + } + } + + if (!list_empty(&group->pending_list) && group->state == MCAST_IDLE) + goto process_requests; + mutex_unlock(&group->lock); + + while (rc--) + release_group(group, 0); +} + +static struct mcast_group *search_relocate_mgid0_group(struct mlx4_ib_demux_ctx *ctx, + __be64 tid, + union ib_gid *new_mgid) +{ + struct mcast_group *group = NULL, *cur_group; + struct mcast_req *req; + struct list_head *pos; + struct list_head *n; + + mutex_lock(&ctx->mcg_table_lock); + list_for_each_safe(pos, n, &ctx->mcg_mgid0_list) { + group = list_entry(pos, struct mcast_group, mgid0_list); + mutex_lock(&group->lock); + if (group->last_req_tid == tid) { + if (memcmp(new_mgid, &mgid0, sizeof mgid0)) { + group->rec.mgid = *new_mgid; + sprintf(group->name, "%016llx%016llx", + be64_to_cpu(group->rec.mgid.global.subnet_prefix), + be64_to_cpu(group->rec.mgid.global.interface_id)); + list_del_init(&group->mgid0_list); + cur_group = mcast_insert(ctx, group); + if (cur_group) { + /* A race between our code and SM. 
Silently cleaning the new one */ + req = list_first_entry(&group->pending_list, + struct mcast_req, group_list); + --group->func[req->func].num_pend_reqs; + list_del(&req->group_list); + list_del(&req->func_list); + kfree(req); + mutex_unlock(&group->lock); + mutex_unlock(&ctx->mcg_table_lock); + release_group(group, 0); + return NULL; + } + + atomic_inc(&group->refcount); + add_sysfs_port_mcg_attr(ctx->dev, ctx->port, &group->dentry.attr); + mutex_unlock(&group->lock); + mutex_unlock(&ctx->mcg_table_lock); + return group; + } else { + struct mcast_req *tmp1, *tmp2; + + list_del(&group->mgid0_list); + if (!list_empty(&group->pending_list) && group->state != MCAST_IDLE) + cancel_delayed_work_sync(&group->timeout_work); + + list_for_each_entry_safe(tmp1, tmp2, &group->pending_list, group_list) { + list_del(&tmp1->group_list); + kfree(tmp1); + } + mutex_unlock(&group->lock); + mutex_unlock(&ctx->mcg_table_lock); + kfree(group); + return NULL; + } + } + mutex_unlock(&group->lock); + } + mutex_unlock(&ctx->mcg_table_lock); + + return NULL; +} + +static ssize_t sysfs_show_group(struct device *dev, + struct device_attribute *attr, char *buf); + +static struct mcast_group *acquire_group(struct mlx4_ib_demux_ctx *ctx, + union ib_gid *mgid, int create, + gfp_t gfp_mask) +{ + struct mcast_group *group, *cur_group; + int is_mgid0; + int i; + + is_mgid0 = !memcmp(&mgid0, mgid, sizeof mgid0); + if (!is_mgid0) { + group = mcast_find(ctx, mgid); + if (group) + goto found; + } + + if (!create) + return ERR_PTR(-ENOENT); + + group = kzalloc(sizeof *group, gfp_mask); + if (!group) + return ERR_PTR(-ENOMEM); + + group->demux = ctx; + group->rec.mgid = *mgid; + INIT_LIST_HEAD(&group->pending_list); + INIT_LIST_HEAD(&group->mgid0_list); + for (i = 0; i < MAX_VFS; ++i) + INIT_LIST_HEAD(&group->func[i].pending); + INIT_WORK(&group->work, mlx4_ib_mcg_work_handler); + INIT_DELAYED_WORK(&group->timeout_work, mlx4_ib_mcg_timeout_handler); + mutex_init(&group->lock); + sprintf(group->name, "%016llx%016llx", + be64_to_cpu(group->rec.mgid.global.subnet_prefix), + be64_to_cpu(group->rec.mgid.global.interface_id)); + sysfs_attr_init(&group->dentry.attr); + group->dentry.show = sysfs_show_group; + group->dentry.store = NULL; + group->dentry.attr.name = group->name; + group->dentry.attr.mode = 0400; + group->state = MCAST_IDLE; + + if (is_mgid0) { + list_add(&group->mgid0_list, &ctx->mcg_mgid0_list); + goto found; + } + + cur_group = mcast_insert(ctx, group); + if (cur_group) { + mcg_warn("group just showed up %s - confused\n", cur_group->name); + kfree(group); + return ERR_PTR(-EINVAL); + } + + add_sysfs_port_mcg_attr(ctx->dev, ctx->port, &group->dentry.attr); + +found: + atomic_inc(&group->refcount); + return group; +} + +static void queue_req(struct mcast_req *req) +{ + struct mcast_group *group = req->group; + + atomic_inc(&group->refcount); /* for the request */ + atomic_inc(&group->refcount); /* for scheduling the work */ + list_add_tail(&req->group_list, &group->pending_list); + list_add_tail(&req->func_list, &group->func[req->func].pending); + /* calls mlx4_ib_mcg_work_handler */ + if (!queue_work(group->demux->mcg_wq, &group->work)) + safe_atomic_dec(&group->refcount); +} + +int mlx4_ib_mcg_demux_handler(struct ib_device *ibdev, int port, int slave, + struct ib_sa_mad *mad) +{ + struct mlx4_ib_dev *dev = to_mdev(ibdev); + struct ib_sa_mcmember_data *rec = (struct ib_sa_mcmember_data *)mad->data; + struct mlx4_ib_demux_ctx *ctx = &dev->sriov.demux[port - 1]; + struct mcast_group *group; + + switch 
(mad->mad_hdr.method) { + case IB_MGMT_METHOD_GET_RESP: + case IB_SA_METHOD_DELETE_RESP: + mutex_lock(&ctx->mcg_table_lock); + group = acquire_group(ctx, &rec->mgid, 0, GFP_KERNEL); + mutex_unlock(&ctx->mcg_table_lock); + if (IS_ERR(group)) { + if (mad->mad_hdr.method == IB_MGMT_METHOD_GET_RESP) { + __be64 tid = mad->mad_hdr.tid; + *(u8 *)(&tid) = (u8)slave; /* in group we kept the modified TID */ + group = search_relocate_mgid0_group(ctx, tid, &rec->mgid); + } else + group = NULL; + } + + if (!group) + return 1; + + mutex_lock(&group->lock); + group->response_sa_mad = *mad; + group->prev_state = group->state; + group->state = MCAST_RESP_READY; + /* calls mlx4_ib_mcg_work_handler */ + atomic_inc(&group->refcount); + if (!queue_work(ctx->mcg_wq, &group->work)) + safe_atomic_dec(&group->refcount); + mutex_unlock(&group->lock); + release_group(group, 0); + return 1; /* consumed */ + case IB_MGMT_METHOD_SET: + case IB_SA_METHOD_GET_TABLE: + case IB_SA_METHOD_GET_TABLE_RESP: + case IB_SA_METHOD_DELETE: + return 0; /* not consumed, pass-through to guest over tunnel */ + default: + mcg_warn("In demux, port %d: unexpected MCMember method: 0x%x, dropping\n", + port, mad->mad_hdr.method); + return 1; /* consumed */ + } +} + +int mlx4_ib_mcg_multiplex_handler(struct ib_device *ibdev, int port, + int slave, struct ib_sa_mad *sa_mad) +{ + struct mlx4_ib_dev *dev = to_mdev(ibdev); + struct ib_sa_mcmember_data *rec = (struct ib_sa_mcmember_data *)sa_mad->data; + struct mlx4_ib_demux_ctx *ctx = &dev->sriov.demux[port - 1]; + struct mcast_group *group; + struct mcast_req *req; + int may_create = 0; + + if (ctx->flushing) + return -EAGAIN; + + switch (sa_mad->mad_hdr.method) { + case IB_MGMT_METHOD_SET: + may_create = 1; + case IB_SA_METHOD_DELETE: + req = kzalloc(sizeof *req, GFP_KERNEL); + if (!req) + return -ENOMEM; + + req->func = slave; + req->sa_mad = *sa_mad; + + mutex_lock(&ctx->mcg_table_lock); + group = acquire_group(ctx, &rec->mgid, may_create, GFP_KERNEL); + mutex_unlock(&ctx->mcg_table_lock); + if (IS_ERR(group)) { + kfree(req); + return PTR_ERR(group); + } + mutex_lock(&group->lock); + if (group->func[slave].num_pend_reqs > MAX_PEND_REQS_PER_FUNC) { + mutex_unlock(&group->lock); + mcg_warn_group(group, "Port %d, Func %d has too many pending requests (%d), dropping\n", + port, slave, MAX_PEND_REQS_PER_FUNC); + release_group(group, 0); + kfree(req); + return -ENOMEM; + } + ++group->func[slave].num_pend_reqs; + req->group = group; + queue_req(req); + mutex_unlock(&group->lock); + release_group(group, 0); + return 1; /* consumed */ + case IB_SA_METHOD_GET_TABLE: + case IB_MGMT_METHOD_GET_RESP: + case IB_SA_METHOD_GET_TABLE_RESP: + case IB_SA_METHOD_DELETE_RESP: + return 0; /* not consumed, pass-through */ + default: + mcg_warn("In multiplex, port %d, func %d: unexpected MCMember method: 0x%x, dropping\n", + port, slave, sa_mad->mad_hdr.method); + return 1; /* consumed */ + } +} + +static ssize_t sysfs_show_group(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct mcast_group *group = + container_of(attr, struct mcast_group, dentry); + struct mcast_req *req = NULL; + char pending_str[40]; + char state_str[40]; + ssize_t len = 0; + int f; + + if (group->state == MCAST_IDLE) + sprintf(state_str, "%s", get_state_string(group->state)); + else + sprintf(state_str, "%s(TID=0x%llx)", + get_state_string(group->state), + be64_to_cpu(group->last_req_tid)); + if (list_empty(&group->pending_list)) { + sprintf(pending_str, "No"); + } else { + req = list_first_entry(&group->pending_list, 
struct mcast_req, group_list); + sprintf(pending_str, "Yes(TID=0x%llx)", + be64_to_cpu(req->sa_mad.mad_hdr.tid)); + } + len += sprintf(buf + len, "%1d [%02d,%02d,%02d] %4d %4s %5s ", + group->rec.scope_join_state & 0xf, + group->members[2], group->members[1], group->members[0], + atomic_read(&group->refcount), + pending_str, + state_str); + for (f = 0; f < MAX_VFS; ++f) + if (group->func[f].state == MCAST_MEMBER) + len += sprintf(buf + len, "%d[%1x] ", + f, group->func[f].join_state); + + len += sprintf(buf + len, "\t\t(%4hx %4x %2x %2x %2x %2x %2x " + "%4x %4x %2x %2x)\n", + be16_to_cpu(group->rec.pkey), + be32_to_cpu(group->rec.qkey), + (group->rec.mtusel_mtu & 0xc0) >> 6, + group->rec.mtusel_mtu & 0x3f, + group->rec.tclass, + (group->rec.ratesel_rate & 0xc0) >> 6, + group->rec.ratesel_rate & 0x3f, + (be32_to_cpu(group->rec.sl_flowlabel_hoplimit) & 0xf0000000) >> 28, + (be32_to_cpu(group->rec.sl_flowlabel_hoplimit) & 0x0fffff00) >> 8, + be32_to_cpu(group->rec.sl_flowlabel_hoplimit) & 0x000000ff, + group->rec.proxy_join); + + return len; +} + +int mlx4_ib_mcg_port_init(struct mlx4_ib_demux_ctx *ctx) +{ + char name[20]; + + atomic_set(&ctx->tid, 0); + sprintf(name, "mlx4_ib_mcg%d", ctx->port); + ctx->mcg_wq = create_singlethread_workqueue(name); + if (!ctx->mcg_wq) + return -ENOMEM; + + mutex_init(&ctx->mcg_table_lock); + ctx->mcg_table = RB_ROOT; + INIT_LIST_HEAD(&ctx->mcg_mgid0_list); + ctx->flushing = 0; + + return 0; +} + +static void force_clean_group(struct mcast_group *group) +{ + struct mcast_req *req, *tmp + ; + list_for_each_entry_safe(req, tmp, &group->pending_list, group_list) { + list_del(&req->group_list); + kfree(req); + } + del_sysfs_port_mcg_attr(group->demux->dev, group->demux->port, &group->dentry.attr); + rb_erase(&group->node, &group->demux->mcg_table); + kfree(group); +} + +static void _mlx4_ib_mcg_port_cleanup(struct mlx4_ib_demux_ctx *ctx, int destroy_wq) +{ + int i; + struct rb_node *p; + struct mcast_group *group; + unsigned long end; + int count; + + for (i = 0; i < MAX_VFS; ++i) + clean_vf_mcast(ctx, i); + + end = jiffies + msecs_to_jiffies(MAD_TIMEOUT_MS + 3000); + do { + count = 0; + mutex_lock(&ctx->mcg_table_lock); + for (p = rb_first(&ctx->mcg_table); p; p = rb_next(p)) + ++count; + mutex_unlock(&ctx->mcg_table_lock); + if (!count) + break; + + msleep(1); + } while (time_after(end, jiffies)); + + flush_workqueue(ctx->mcg_wq); + if (destroy_wq) + destroy_workqueue(ctx->mcg_wq); + + mutex_lock(&ctx->mcg_table_lock); + while ((p = rb_first(&ctx->mcg_table)) != NULL) { + group = rb_entry(p, struct mcast_group, node); + if (atomic_read(&group->refcount)) + mcg_warn_group(group, "group refcount %d!!! 
(pointer %p)\n", atomic_read(&group->refcount), group); + + force_clean_group(group); + } + mutex_unlock(&ctx->mcg_table_lock); +} + +struct clean_work { + struct work_struct work; + struct mlx4_ib_demux_ctx *ctx; + int destroy_wq; +}; + +static void mcg_clean_task(struct work_struct *work) +{ + struct clean_work *cw = container_of(work, struct clean_work, work); + + _mlx4_ib_mcg_port_cleanup(cw->ctx, cw->destroy_wq); + cw->ctx->flushing = 0; + kfree(cw); +} + +void mlx4_ib_mcg_port_cleanup(struct mlx4_ib_demux_ctx *ctx, int destroy_wq) +{ + struct clean_work *work; + + if (ctx->flushing) + return; + + ctx->flushing = 1; + + if (destroy_wq) { + _mlx4_ib_mcg_port_cleanup(ctx, destroy_wq); + ctx->flushing = 0; + return; + } + + work = kmalloc(sizeof *work, GFP_KERNEL); + if (!work) { + ctx->flushing = 0; + mcg_warn("failed allocating work for cleanup\n"); + return; + } + + work->ctx = ctx; + work->destroy_wq = destroy_wq; + INIT_WORK(&work->work, mcg_clean_task); + queue_work(clean_wq, &work->work); +} + +static void build_leave_mad(struct mcast_req *req) +{ + struct ib_sa_mad *mad = &req->sa_mad; + + mad->mad_hdr.method = IB_SA_METHOD_DELETE; +} + + +static void clear_pending_reqs(struct mcast_group *group, int vf) +{ + struct mcast_req *req, *tmp, *group_first = NULL; + int clear; + int pend = 0; + + if (!list_empty(&group->pending_list)) + group_first = list_first_entry(&group->pending_list, struct mcast_req, group_list); + + list_for_each_entry_safe(req, tmp, &group->func[vf].pending, func_list) { + clear = 1; + if (group_first == req && + (group->state == MCAST_JOIN_SENT || + group->state == MCAST_LEAVE_SENT)) { + clear = cancel_delayed_work(&group->timeout_work); + pend = !clear; + group->state = MCAST_IDLE; + } + if (clear) { + --group->func[vf].num_pend_reqs; + list_del(&req->group_list); + list_del(&req->func_list); + kfree(req); + atomic_dec(&group->refcount); + } + } + + if (!pend && (!list_empty(&group->func[vf].pending) || group->func[vf].num_pend_reqs)) { + mcg_warn_group(group, "DRIVER BUG: list_empty %d, num_pend_reqs %d\n", + list_empty(&group->func[vf].pending), group->func[vf].num_pend_reqs); + } +} + +static int push_deleteing_req(struct mcast_group *group, int slave) +{ + struct mcast_req *req; + struct mcast_req *pend_req; + + if (!group->func[slave].join_state) + return 0; + + req = kzalloc(sizeof *req, GFP_KERNEL); + if (!req) { + mcg_warn_group(group, "failed allocation - may leave stall groups\n"); + return -ENOMEM; + } + + if (!list_empty(&group->func[slave].pending)) { + pend_req = list_entry(group->func[slave].pending.prev, struct mcast_req, group_list); + if (pend_req->clean) { + kfree(req); + return 0; + } + } + + req->clean = 1; + req->func = slave; + req->group = group; + ++group->func[slave].num_pend_reqs; + build_leave_mad(req); + queue_req(req); + return 0; +} + +void clean_vf_mcast(struct mlx4_ib_demux_ctx *ctx, int slave) +{ + struct mcast_group *group; + struct rb_node *p; + + mutex_lock(&ctx->mcg_table_lock); + for (p = rb_first(&ctx->mcg_table); p; p = rb_next(p)) { + group = rb_entry(p, struct mcast_group, node); + mutex_lock(&group->lock); + if (atomic_read(&group->refcount)) { + /* clear pending requests of this VF */ + clear_pending_reqs(group, slave); + push_deleteing_req(group, slave); + } + mutex_unlock(&group->lock); + } + mutex_unlock(&ctx->mcg_table_lock); +} + + +int mlx4_ib_mcg_init(void) +{ + clean_wq = create_singlethread_workqueue("mlx4_ib_mcg"); + if (!clean_wq) + return -ENOMEM; + + return 0; +} + +void mlx4_ib_mcg_destroy(void) +{ + 
destroy_workqueue(clean_wq); +} diff --git a/kernel/drivers/infiniband/hw/mlx4/mlx4_ib.h b/kernel/drivers/infiniband/hw/mlx4/mlx4_ib.h new file mode 100644 index 000000000..fce393437 --- /dev/null +++ b/kernel/drivers/infiniband/hw/mlx4/mlx4_ib.h @@ -0,0 +1,819 @@ +/* + * Copyright (c) 2006, 2007 Cisco Systems. All rights reserved. + * Copyright (c) 2007, 2008 Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef MLX4_IB_H +#define MLX4_IB_H + +#include +#include +#include +#include + +#include +#include +#include +#include + +#include +#include + +#define MLX4_IB_DRV_NAME "mlx4_ib" + +#ifdef pr_fmt +#undef pr_fmt +#endif +#define pr_fmt(fmt) "<" MLX4_IB_DRV_NAME "> %s: " fmt, __func__ + +#define mlx4_ib_warn(ibdev, format, arg...) 
\ + dev_warn((ibdev)->dma_device, MLX4_IB_DRV_NAME ": " format, ## arg) + +enum { + MLX4_IB_SQ_MIN_WQE_SHIFT = 6, + MLX4_IB_MAX_HEADROOM = 2048 +}; + +#define MLX4_IB_SQ_HEADROOM(shift) ((MLX4_IB_MAX_HEADROOM >> (shift)) + 1) +#define MLX4_IB_SQ_MAX_SPARE (MLX4_IB_SQ_HEADROOM(MLX4_IB_SQ_MIN_WQE_SHIFT)) + +/*module param to indicate if SM assigns the alias_GUID*/ +extern int mlx4_ib_sm_guid_assign; + +#define MLX4_IB_UC_STEER_QPN_ALIGN 1 +#define MLX4_IB_UC_MAX_NUM_QPS 256 +struct mlx4_ib_ucontext { + struct ib_ucontext ibucontext; + struct mlx4_uar uar; + struct list_head db_page_list; + struct mutex db_page_mutex; +}; + +struct mlx4_ib_pd { + struct ib_pd ibpd; + u32 pdn; +}; + +struct mlx4_ib_xrcd { + struct ib_xrcd ibxrcd; + u32 xrcdn; + struct ib_pd *pd; + struct ib_cq *cq; +}; + +struct mlx4_ib_cq_buf { + struct mlx4_buf buf; + struct mlx4_mtt mtt; + int entry_size; +}; + +struct mlx4_ib_cq_resize { + struct mlx4_ib_cq_buf buf; + int cqe; +}; + +struct mlx4_ib_cq { + struct ib_cq ibcq; + struct mlx4_cq mcq; + struct mlx4_ib_cq_buf buf; + struct mlx4_ib_cq_resize *resize_buf; + struct mlx4_db db; + spinlock_t lock; + struct mutex resize_mutex; + struct ib_umem *umem; + struct ib_umem *resize_umem; + /* List of qps that it serves.*/ + struct list_head send_qp_list; + struct list_head recv_qp_list; +}; + +struct mlx4_ib_mr { + struct ib_mr ibmr; + struct mlx4_mr mmr; + struct ib_umem *umem; +}; + +struct mlx4_ib_mw { + struct ib_mw ibmw; + struct mlx4_mw mmw; +}; + +struct mlx4_ib_fast_reg_page_list { + struct ib_fast_reg_page_list ibfrpl; + __be64 *mapped_page_list; + dma_addr_t map; +}; + +struct mlx4_ib_fmr { + struct ib_fmr ibfmr; + struct mlx4_fmr mfmr; +}; + +#define MAX_REGS_PER_FLOW 2 + +struct mlx4_flow_reg_id { + u64 id; + u64 mirror; +}; + +struct mlx4_ib_flow { + struct ib_flow ibflow; + /* translating DMFS verbs sniffer rule to FW API requires two reg IDs */ + struct mlx4_flow_reg_id reg_id[MAX_REGS_PER_FLOW]; +}; + +struct mlx4_ib_wq { + u64 *wrid; + spinlock_t lock; + int wqe_cnt; + int max_post; + int max_gs; + int offset; + int wqe_shift; + unsigned head; + unsigned tail; +}; + +enum mlx4_ib_qp_flags { + MLX4_IB_QP_LSO = IB_QP_CREATE_IPOIB_UD_LSO, + MLX4_IB_QP_BLOCK_MULTICAST_LOOPBACK = IB_QP_CREATE_BLOCK_MULTICAST_LOOPBACK, + MLX4_IB_QP_NETIF = IB_QP_CREATE_NETIF_QP, + MLX4_IB_QP_CREATE_USE_GFP_NOIO = IB_QP_CREATE_USE_GFP_NOIO, + MLX4_IB_SRIOV_TUNNEL_QP = 1 << 30, + MLX4_IB_SRIOV_SQP = 1 << 31, +}; + +struct mlx4_ib_gid_entry { + struct list_head list; + union ib_gid gid; + int added; + u8 port; +}; + +enum mlx4_ib_qp_type { + /* + * IB_QPT_SMI and IB_QPT_GSI have to be the first two entries + * here (and in that order) since the MAD layer uses them as + * indices into a 2-entry table. 
+ */ + MLX4_IB_QPT_SMI = IB_QPT_SMI, + MLX4_IB_QPT_GSI = IB_QPT_GSI, + + MLX4_IB_QPT_RC = IB_QPT_RC, + MLX4_IB_QPT_UC = IB_QPT_UC, + MLX4_IB_QPT_UD = IB_QPT_UD, + MLX4_IB_QPT_RAW_IPV6 = IB_QPT_RAW_IPV6, + MLX4_IB_QPT_RAW_ETHERTYPE = IB_QPT_RAW_ETHERTYPE, + MLX4_IB_QPT_RAW_PACKET = IB_QPT_RAW_PACKET, + MLX4_IB_QPT_XRC_INI = IB_QPT_XRC_INI, + MLX4_IB_QPT_XRC_TGT = IB_QPT_XRC_TGT, + + MLX4_IB_QPT_PROXY_SMI_OWNER = 1 << 16, + MLX4_IB_QPT_PROXY_SMI = 1 << 17, + MLX4_IB_QPT_PROXY_GSI = 1 << 18, + MLX4_IB_QPT_TUN_SMI_OWNER = 1 << 19, + MLX4_IB_QPT_TUN_SMI = 1 << 20, + MLX4_IB_QPT_TUN_GSI = 1 << 21, +}; + +#define MLX4_IB_QPT_ANY_SRIOV (MLX4_IB_QPT_PROXY_SMI_OWNER | \ + MLX4_IB_QPT_PROXY_SMI | MLX4_IB_QPT_PROXY_GSI | MLX4_IB_QPT_TUN_SMI_OWNER | \ + MLX4_IB_QPT_TUN_SMI | MLX4_IB_QPT_TUN_GSI) + +enum mlx4_ib_mad_ifc_flags { + MLX4_MAD_IFC_IGNORE_MKEY = 1, + MLX4_MAD_IFC_IGNORE_BKEY = 2, + MLX4_MAD_IFC_IGNORE_KEYS = (MLX4_MAD_IFC_IGNORE_MKEY | + MLX4_MAD_IFC_IGNORE_BKEY), + MLX4_MAD_IFC_NET_VIEW = 4, +}; + +enum { + MLX4_NUM_TUNNEL_BUFS = 256, +}; + +struct mlx4_ib_tunnel_header { + struct mlx4_av av; + __be32 remote_qpn; + __be32 qkey; + __be16 vlan; + u8 mac[6]; + __be16 pkey_index; + u8 reserved[6]; +}; + +struct mlx4_ib_buf { + void *addr; + dma_addr_t map; +}; + +struct mlx4_rcv_tunnel_hdr { + __be32 flags_src_qp; /* flags[6:5] is defined for VLANs: + * 0x0 - no vlan was in the packet + * 0x01 - C-VLAN was in the packet */ + u8 g_ml_path; /* gid bit stands for ipv6/4 header in RoCE */ + u8 reserved; + __be16 pkey_index; + __be16 sl_vid; + __be16 slid_mac_47_32; + __be32 mac_31_0; +}; + +struct mlx4_ib_proxy_sqp_hdr { + struct ib_grh grh; + struct mlx4_rcv_tunnel_hdr tun; +} __packed; + +struct mlx4_roce_smac_vlan_info { + u64 smac; + int smac_index; + int smac_port; + u64 candidate_smac; + int candidate_smac_index; + int candidate_smac_port; + u16 vid; + int vlan_index; + int vlan_port; + u16 candidate_vid; + int candidate_vlan_index; + int candidate_vlan_port; + int update_vid; +}; + +struct mlx4_ib_qp { + struct ib_qp ibqp; + struct mlx4_qp mqp; + struct mlx4_buf buf; + + struct mlx4_db db; + struct mlx4_ib_wq rq; + + u32 doorbell_qpn; + __be32 sq_signal_bits; + unsigned sq_next_wqe; + int sq_max_wqes_per_wr; + int sq_spare_wqes; + struct mlx4_ib_wq sq; + + enum mlx4_ib_qp_type mlx4_ib_qp_type; + struct ib_umem *umem; + struct mlx4_mtt mtt; + int buf_size; + struct mutex mutex; + u16 xrcdn; + u32 flags; + u8 port; + u8 alt_port; + u8 atomic_rd_en; + u8 resp_depth; + u8 sq_no_prefetch; + u8 state; + int mlx_type; + struct list_head gid_list; + struct list_head steering_rules; + struct mlx4_ib_buf *sqp_proxy_rcv; + struct mlx4_roce_smac_vlan_info pri; + struct mlx4_roce_smac_vlan_info alt; + u64 reg_id; + struct list_head qps_list; + struct list_head cq_recv_list; + struct list_head cq_send_list; +}; + +struct mlx4_ib_srq { + struct ib_srq ibsrq; + struct mlx4_srq msrq; + struct mlx4_buf buf; + struct mlx4_db db; + u64 *wrid; + spinlock_t lock; + int head; + int tail; + u16 wqe_ctr; + struct ib_umem *umem; + struct mlx4_mtt mtt; + struct mutex mutex; +}; + +struct mlx4_ib_ah { + struct ib_ah ibah; + union mlx4_ext_av av; +}; + +/****************************************/ +/* alias guid support */ +/****************************************/ +#define NUM_PORT_ALIAS_GUID 2 +#define NUM_ALIAS_GUID_IN_REC 8 +#define NUM_ALIAS_GUID_REC_IN_PORT 16 +#define GUID_REC_SIZE 8 +#define NUM_ALIAS_GUID_PER_PORT 128 +#define MLX4_NOT_SET_GUID (0x00LL) +#define MLX4_GUID_FOR_DELETE_VAL (~(0x00LL)) + +enum 
mlx4_guid_alias_rec_status { + MLX4_GUID_INFO_STATUS_IDLE, + MLX4_GUID_INFO_STATUS_SET, +}; + +#define GUID_STATE_NEED_PORT_INIT 0x01 + +enum mlx4_guid_alias_rec_method { + MLX4_GUID_INFO_RECORD_SET = IB_MGMT_METHOD_SET, + MLX4_GUID_INFO_RECORD_DELETE = IB_SA_METHOD_DELETE, +}; + +struct mlx4_sriov_alias_guid_info_rec_det { + u8 all_recs[GUID_REC_SIZE * NUM_ALIAS_GUID_IN_REC]; + ib_sa_comp_mask guid_indexes; /*indicates what from the 8 records are valid*/ + enum mlx4_guid_alias_rec_status status; /*indicates the administraively status of the record.*/ + unsigned int guids_retry_schedule[NUM_ALIAS_GUID_IN_REC]; + u64 time_to_run; +}; + +struct mlx4_sriov_alias_guid_port_rec_det { + struct mlx4_sriov_alias_guid_info_rec_det all_rec_per_port[NUM_ALIAS_GUID_REC_IN_PORT]; + struct workqueue_struct *wq; + struct delayed_work alias_guid_work; + u8 port; + u32 state_flags; + struct mlx4_sriov_alias_guid *parent; + struct list_head cb_list; +}; + +struct mlx4_sriov_alias_guid { + struct mlx4_sriov_alias_guid_port_rec_det ports_guid[MLX4_MAX_PORTS]; + spinlock_t ag_work_lock; + struct ib_sa_client *sa_client; +}; + +struct mlx4_ib_demux_work { + struct work_struct work; + struct mlx4_ib_dev *dev; + int slave; + int do_init; + u8 port; + +}; + +struct mlx4_ib_tun_tx_buf { + struct mlx4_ib_buf buf; + struct ib_ah *ah; +}; + +struct mlx4_ib_demux_pv_qp { + struct ib_qp *qp; + enum ib_qp_type proxy_qpt; + struct mlx4_ib_buf *ring; + struct mlx4_ib_tun_tx_buf *tx_ring; + spinlock_t tx_lock; + unsigned tx_ix_head; + unsigned tx_ix_tail; +}; + +enum mlx4_ib_demux_pv_state { + DEMUX_PV_STATE_DOWN, + DEMUX_PV_STATE_STARTING, + DEMUX_PV_STATE_ACTIVE, + DEMUX_PV_STATE_DOWNING, +}; + +struct mlx4_ib_demux_pv_ctx { + int port; + int slave; + enum mlx4_ib_demux_pv_state state; + int has_smi; + struct ib_device *ib_dev; + struct ib_cq *cq; + struct ib_pd *pd; + struct ib_mr *mr; + struct work_struct work; + struct workqueue_struct *wq; + struct mlx4_ib_demux_pv_qp qp[2]; +}; + +struct mlx4_ib_demux_ctx { + struct ib_device *ib_dev; + int port; + struct workqueue_struct *wq; + struct workqueue_struct *ud_wq; + spinlock_t ud_lock; + __be64 subnet_prefix; + __be64 guid_cache[128]; + struct mlx4_ib_dev *dev; + /* the following lock protects both mcg_table and mcg_mgid0_list */ + struct mutex mcg_table_lock; + struct rb_root mcg_table; + struct list_head mcg_mgid0_list; + struct workqueue_struct *mcg_wq; + struct mlx4_ib_demux_pv_ctx **tun; + atomic_t tid; + int flushing; /* flushing the work queue */ +}; + +struct mlx4_ib_sriov { + struct mlx4_ib_demux_ctx demux[MLX4_MAX_PORTS]; + struct mlx4_ib_demux_pv_ctx *sqps[MLX4_MAX_PORTS]; + /* when using this spinlock you should use "irq" because + * it may be called from interrupt context.*/ + spinlock_t going_down_lock; + int is_going_down; + + struct mlx4_sriov_alias_guid alias_guid; + + /* CM paravirtualization fields */ + struct list_head cm_list; + spinlock_t id_map_lock; + struct rb_root sl_id_map; + struct idr pv_id_table; +}; + +struct mlx4_ib_iboe { + spinlock_t lock; + struct net_device *netdevs[MLX4_MAX_PORTS]; + struct net_device *masters[MLX4_MAX_PORTS]; + atomic64_t mac[MLX4_MAX_PORTS]; + struct notifier_block nb; + struct notifier_block nb_inet; + struct notifier_block nb_inet6; + union ib_gid gid_table[MLX4_MAX_PORTS][128]; +}; + +struct pkey_mgt { + u8 virt2phys_pkey[MLX4_MFUNC_MAX][MLX4_MAX_PORTS][MLX4_MAX_PORT_PKEYS]; + u16 phys_pkey_cache[MLX4_MAX_PORTS][MLX4_MAX_PORT_PKEYS]; + struct list_head pkey_port_list[MLX4_MFUNC_MAX]; + struct kobject 
*device_parent[MLX4_MFUNC_MAX]; +}; + +struct mlx4_ib_iov_sysfs_attr { + void *ctx; + struct kobject *kobj; + unsigned long data; + u32 entry_num; + char name[15]; + struct device_attribute dentry; + struct device *dev; +}; + +struct mlx4_ib_iov_sysfs_attr_ar { + struct mlx4_ib_iov_sysfs_attr dentries[3 * NUM_ALIAS_GUID_PER_PORT + 1]; +}; + +struct mlx4_ib_iov_port { + char name[100]; + u8 num; + struct mlx4_ib_dev *dev; + struct list_head list; + struct mlx4_ib_iov_sysfs_attr_ar *dentr_ar; + struct ib_port_attr attr; + struct kobject *cur_port; + struct kobject *admin_alias_parent; + struct kobject *gids_parent; + struct kobject *pkeys_parent; + struct kobject *mcgs_parent; + struct mlx4_ib_iov_sysfs_attr mcg_dentry; +}; + +struct mlx4_ib_dev { + struct ib_device ib_dev; + struct mlx4_dev *dev; + int num_ports; + void __iomem *uar_map; + + struct mlx4_uar priv_uar; + u32 priv_pdn; + MLX4_DECLARE_DOORBELL_LOCK(uar_lock); + + struct ib_mad_agent *send_agent[MLX4_MAX_PORTS][2]; + struct ib_ah *sm_ah[MLX4_MAX_PORTS]; + spinlock_t sm_lock; + struct mlx4_ib_sriov sriov; + + struct mutex cap_mask_mutex; + bool ib_active; + struct mlx4_ib_iboe iboe; + int counters[MLX4_MAX_PORTS]; + int *eq_table; + int eq_added; + struct kobject *iov_parent; + struct kobject *ports_parent; + struct kobject *dev_ports_parent[MLX4_MFUNC_MAX]; + struct mlx4_ib_iov_port iov_ports[MLX4_MAX_PORTS]; + struct pkey_mgt pkeys; + unsigned long *ib_uc_qpns_bitmap; + int steer_qpn_count; + int steer_qpn_base; + int steering_support; + struct mlx4_ib_qp *qp1_proxy[MLX4_MAX_PORTS]; + /* lock when destroying qp1_proxy and getting netdev events */ + struct mutex qp1_proxy_lock[MLX4_MAX_PORTS]; + u8 bond_next_port; + /* protect resources needed as part of reset flow */ + spinlock_t reset_flow_resource_lock; + struct list_head qp_list; +}; + +struct ib_event_work { + struct work_struct work; + struct mlx4_ib_dev *ib_dev; + struct mlx4_eqe ib_eqe; +}; + +struct mlx4_ib_qp_tunnel_init_attr { + struct ib_qp_init_attr init_attr; + int slave; + enum ib_qp_type proxy_qp_type; + u8 port; +}; + +static inline struct mlx4_ib_dev *to_mdev(struct ib_device *ibdev) +{ + return container_of(ibdev, struct mlx4_ib_dev, ib_dev); +} + +static inline struct mlx4_ib_ucontext *to_mucontext(struct ib_ucontext *ibucontext) +{ + return container_of(ibucontext, struct mlx4_ib_ucontext, ibucontext); +} + +static inline struct mlx4_ib_pd *to_mpd(struct ib_pd *ibpd) +{ + return container_of(ibpd, struct mlx4_ib_pd, ibpd); +} + +static inline struct mlx4_ib_xrcd *to_mxrcd(struct ib_xrcd *ibxrcd) +{ + return container_of(ibxrcd, struct mlx4_ib_xrcd, ibxrcd); +} + +static inline struct mlx4_ib_cq *to_mcq(struct ib_cq *ibcq) +{ + return container_of(ibcq, struct mlx4_ib_cq, ibcq); +} + +static inline struct mlx4_ib_cq *to_mibcq(struct mlx4_cq *mcq) +{ + return container_of(mcq, struct mlx4_ib_cq, mcq); +} + +static inline struct mlx4_ib_mr *to_mmr(struct ib_mr *ibmr) +{ + return container_of(ibmr, struct mlx4_ib_mr, ibmr); +} + +static inline struct mlx4_ib_mw *to_mmw(struct ib_mw *ibmw) +{ + return container_of(ibmw, struct mlx4_ib_mw, ibmw); +} + +static inline struct mlx4_ib_fast_reg_page_list *to_mfrpl(struct ib_fast_reg_page_list *ibfrpl) +{ + return container_of(ibfrpl, struct mlx4_ib_fast_reg_page_list, ibfrpl); +} + +static inline struct mlx4_ib_fmr *to_mfmr(struct ib_fmr *ibfmr) +{ + return container_of(ibfmr, struct mlx4_ib_fmr, ibfmr); +} + +static inline struct mlx4_ib_flow *to_mflow(struct ib_flow *ibflow) +{ + return container_of(ibflow, struct 
mlx4_ib_flow, ibflow); +} + +static inline struct mlx4_ib_qp *to_mqp(struct ib_qp *ibqp) +{ + return container_of(ibqp, struct mlx4_ib_qp, ibqp); +} + +static inline struct mlx4_ib_qp *to_mibqp(struct mlx4_qp *mqp) +{ + return container_of(mqp, struct mlx4_ib_qp, mqp); +} + +static inline struct mlx4_ib_srq *to_msrq(struct ib_srq *ibsrq) +{ + return container_of(ibsrq, struct mlx4_ib_srq, ibsrq); +} + +static inline struct mlx4_ib_srq *to_mibsrq(struct mlx4_srq *msrq) +{ + return container_of(msrq, struct mlx4_ib_srq, msrq); +} + +static inline struct mlx4_ib_ah *to_mah(struct ib_ah *ibah) +{ + return container_of(ibah, struct mlx4_ib_ah, ibah); +} + +static inline u8 mlx4_ib_bond_next_port(struct mlx4_ib_dev *dev) +{ + dev->bond_next_port = (dev->bond_next_port + 1) % dev->num_ports; + + return dev->bond_next_port + 1; +} + +int mlx4_ib_init_sriov(struct mlx4_ib_dev *dev); +void mlx4_ib_close_sriov(struct mlx4_ib_dev *dev); + +int mlx4_ib_db_map_user(struct mlx4_ib_ucontext *context, unsigned long virt, + struct mlx4_db *db); +void mlx4_ib_db_unmap_user(struct mlx4_ib_ucontext *context, struct mlx4_db *db); + +struct ib_mr *mlx4_ib_get_dma_mr(struct ib_pd *pd, int acc); +int mlx4_ib_umem_write_mtt(struct mlx4_ib_dev *dev, struct mlx4_mtt *mtt, + struct ib_umem *umem); +struct ib_mr *mlx4_ib_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, + u64 virt_addr, int access_flags, + struct ib_udata *udata); +int mlx4_ib_dereg_mr(struct ib_mr *mr); +struct ib_mw *mlx4_ib_alloc_mw(struct ib_pd *pd, enum ib_mw_type type); +int mlx4_ib_bind_mw(struct ib_qp *qp, struct ib_mw *mw, + struct ib_mw_bind *mw_bind); +int mlx4_ib_dealloc_mw(struct ib_mw *mw); +struct ib_mr *mlx4_ib_alloc_fast_reg_mr(struct ib_pd *pd, + int max_page_list_len); +struct ib_fast_reg_page_list *mlx4_ib_alloc_fast_reg_page_list(struct ib_device *ibdev, + int page_list_len); +void mlx4_ib_free_fast_reg_page_list(struct ib_fast_reg_page_list *page_list); + +int mlx4_ib_modify_cq(struct ib_cq *cq, u16 cq_count, u16 cq_period); +int mlx4_ib_resize_cq(struct ib_cq *ibcq, int entries, struct ib_udata *udata); +struct ib_cq *mlx4_ib_create_cq(struct ib_device *ibdev, int entries, int vector, + struct ib_ucontext *context, + struct ib_udata *udata); +int mlx4_ib_destroy_cq(struct ib_cq *cq); +int mlx4_ib_poll_cq(struct ib_cq *ibcq, int num_entries, struct ib_wc *wc); +int mlx4_ib_arm_cq(struct ib_cq *cq, enum ib_cq_notify_flags flags); +void __mlx4_ib_cq_clean(struct mlx4_ib_cq *cq, u32 qpn, struct mlx4_ib_srq *srq); +void mlx4_ib_cq_clean(struct mlx4_ib_cq *cq, u32 qpn, struct mlx4_ib_srq *srq); + +struct ib_ah *mlx4_ib_create_ah(struct ib_pd *pd, struct ib_ah_attr *ah_attr); +int mlx4_ib_query_ah(struct ib_ah *ibah, struct ib_ah_attr *ah_attr); +int mlx4_ib_destroy_ah(struct ib_ah *ah); + +struct ib_srq *mlx4_ib_create_srq(struct ib_pd *pd, + struct ib_srq_init_attr *init_attr, + struct ib_udata *udata); +int mlx4_ib_modify_srq(struct ib_srq *ibsrq, struct ib_srq_attr *attr, + enum ib_srq_attr_mask attr_mask, struct ib_udata *udata); +int mlx4_ib_query_srq(struct ib_srq *srq, struct ib_srq_attr *srq_attr); +int mlx4_ib_destroy_srq(struct ib_srq *srq); +void mlx4_ib_free_srq_wqe(struct mlx4_ib_srq *srq, int wqe_index); +int mlx4_ib_post_srq_recv(struct ib_srq *ibsrq, struct ib_recv_wr *wr, + struct ib_recv_wr **bad_wr); + +struct ib_qp *mlx4_ib_create_qp(struct ib_pd *pd, + struct ib_qp_init_attr *init_attr, + struct ib_udata *udata); +int mlx4_ib_destroy_qp(struct ib_qp *qp); +int mlx4_ib_modify_qp(struct ib_qp *ibqp, struct 
ib_qp_attr *attr, + int attr_mask, struct ib_udata *udata); +int mlx4_ib_query_qp(struct ib_qp *ibqp, struct ib_qp_attr *qp_attr, int qp_attr_mask, + struct ib_qp_init_attr *qp_init_attr); +int mlx4_ib_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr, + struct ib_send_wr **bad_wr); +int mlx4_ib_post_recv(struct ib_qp *ibqp, struct ib_recv_wr *wr, + struct ib_recv_wr **bad_wr); + +int mlx4_MAD_IFC(struct mlx4_ib_dev *dev, int mad_ifc_flags, + int port, struct ib_wc *in_wc, struct ib_grh *in_grh, + void *in_mad, void *response_mad); +int mlx4_ib_process_mad(struct ib_device *ibdev, int mad_flags, u8 port_num, + struct ib_wc *in_wc, struct ib_grh *in_grh, + struct ib_mad *in_mad, struct ib_mad *out_mad); +int mlx4_ib_mad_init(struct mlx4_ib_dev *dev); +void mlx4_ib_mad_cleanup(struct mlx4_ib_dev *dev); + +struct ib_fmr *mlx4_ib_fmr_alloc(struct ib_pd *pd, int mr_access_flags, + struct ib_fmr_attr *fmr_attr); +int mlx4_ib_map_phys_fmr(struct ib_fmr *ibfmr, u64 *page_list, int npages, + u64 iova); +int mlx4_ib_unmap_fmr(struct list_head *fmr_list); +int mlx4_ib_fmr_dealloc(struct ib_fmr *fmr); +int __mlx4_ib_query_port(struct ib_device *ibdev, u8 port, + struct ib_port_attr *props, int netw_view); +int __mlx4_ib_query_pkey(struct ib_device *ibdev, u8 port, u16 index, + u16 *pkey, int netw_view); + +int __mlx4_ib_query_gid(struct ib_device *ibdev, u8 port, int index, + union ib_gid *gid, int netw_view); + +static inline bool mlx4_ib_ah_grh_present(struct mlx4_ib_ah *ah) +{ + u8 port = be32_to_cpu(ah->av.ib.port_pd) >> 24 & 3; + + if (rdma_port_get_link_layer(ah->ibah.device, port) == IB_LINK_LAYER_ETHERNET) + return true; + + return !!(ah->av.ib.g_slid & 0x80); +} + +int mlx4_ib_mcg_port_init(struct mlx4_ib_demux_ctx *ctx); +void mlx4_ib_mcg_port_cleanup(struct mlx4_ib_demux_ctx *ctx, int destroy_wq); +void clean_vf_mcast(struct mlx4_ib_demux_ctx *ctx, int slave); +int mlx4_ib_mcg_init(void); +void mlx4_ib_mcg_destroy(void); + +int mlx4_ib_find_real_gid(struct ib_device *ibdev, u8 port, __be64 guid); + +int mlx4_ib_mcg_multiplex_handler(struct ib_device *ibdev, int port, int slave, + struct ib_sa_mad *sa_mad); +int mlx4_ib_mcg_demux_handler(struct ib_device *ibdev, int port, int slave, + struct ib_sa_mad *mad); + +int mlx4_ib_add_mc(struct mlx4_ib_dev *mdev, struct mlx4_ib_qp *mqp, + union ib_gid *gid); + +void mlx4_ib_dispatch_event(struct mlx4_ib_dev *dev, u8 port_num, + enum ib_event_type type); + +void mlx4_ib_tunnels_update_work(struct work_struct *work); + +int mlx4_ib_send_to_slave(struct mlx4_ib_dev *dev, int slave, u8 port, + enum ib_qp_type qpt, struct ib_wc *wc, + struct ib_grh *grh, struct ib_mad *mad); + +int mlx4_ib_send_to_wire(struct mlx4_ib_dev *dev, int slave, u8 port, + enum ib_qp_type dest_qpt, u16 pkey_index, u32 remote_qpn, + u32 qkey, struct ib_ah_attr *attr, u8 *s_mac, + struct ib_mad *mad); + +__be64 mlx4_ib_get_new_demux_tid(struct mlx4_ib_demux_ctx *ctx); + +int mlx4_ib_demux_cm_handler(struct ib_device *ibdev, int port, int *slave, + struct ib_mad *mad); + +int mlx4_ib_multiplex_cm_handler(struct ib_device *ibdev, int port, int slave_id, + struct ib_mad *mad); + +void mlx4_ib_cm_paravirt_init(struct mlx4_ib_dev *dev); +void mlx4_ib_cm_paravirt_clean(struct mlx4_ib_dev *dev, int slave_id); + +/* alias guid support */ +void mlx4_ib_init_alias_guid_work(struct mlx4_ib_dev *dev, int port); +int mlx4_ib_init_alias_guid_service(struct mlx4_ib_dev *dev); +void mlx4_ib_destroy_alias_guid_service(struct mlx4_ib_dev *dev); +void mlx4_ib_invalidate_all_guid_record(struct 
mlx4_ib_dev *dev, int port); + +void mlx4_ib_notify_slaves_on_guid_change(struct mlx4_ib_dev *dev, + int block_num, + u8 port_num, u8 *p_data); + +void mlx4_ib_update_cache_on_guid_change(struct mlx4_ib_dev *dev, + int block_num, u8 port_num, + u8 *p_data); + +int add_sysfs_port_mcg_attr(struct mlx4_ib_dev *device, int port_num, + struct attribute *attr); +void del_sysfs_port_mcg_attr(struct mlx4_ib_dev *device, int port_num, + struct attribute *attr); +ib_sa_comp_mask mlx4_ib_get_aguid_comp_mask_from_ix(int index); +void mlx4_ib_slave_alias_guid_event(struct mlx4_ib_dev *dev, int slave, + int port, int slave_init); + +int mlx4_ib_device_register_sysfs(struct mlx4_ib_dev *device) ; + +void mlx4_ib_device_unregister_sysfs(struct mlx4_ib_dev *device); + +__be64 mlx4_ib_gen_node_guid(void); + +int mlx4_ib_steer_qp_alloc(struct mlx4_ib_dev *dev, int count, int *qpn); +void mlx4_ib_steer_qp_free(struct mlx4_ib_dev *dev, u32 qpn, int count); +int mlx4_ib_steer_qp_reg(struct mlx4_ib_dev *mdev, struct mlx4_ib_qp *mqp, + int is_attach); +int mlx4_ib_rereg_user_mr(struct ib_mr *mr, int flags, + u64 start, u64 length, u64 virt_addr, + int mr_access_flags, struct ib_pd *pd, + struct ib_udata *udata); + +#endif /* MLX4_IB_H */ diff --git a/kernel/drivers/infiniband/hw/mlx4/mr.c b/kernel/drivers/infiniband/hw/mlx4/mr.c new file mode 100644 index 000000000..e0d271782 --- /dev/null +++ b/kernel/drivers/infiniband/hw/mlx4/mr.c @@ -0,0 +1,525 @@ +/* + * Copyright (c) 2007 Cisco Systems, Inc. All rights reserved. + * Copyright (c) 2007, 2008 Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include + +#include "mlx4_ib.h" + +static u32 convert_access(int acc) +{ + return (acc & IB_ACCESS_REMOTE_ATOMIC ? MLX4_PERM_ATOMIC : 0) | + (acc & IB_ACCESS_REMOTE_WRITE ? MLX4_PERM_REMOTE_WRITE : 0) | + (acc & IB_ACCESS_REMOTE_READ ? MLX4_PERM_REMOTE_READ : 0) | + (acc & IB_ACCESS_LOCAL_WRITE ? MLX4_PERM_LOCAL_WRITE : 0) | + (acc & IB_ACCESS_MW_BIND ? 
MLX4_PERM_BIND_MW : 0) | + MLX4_PERM_LOCAL_READ; +} + +static enum mlx4_mw_type to_mlx4_type(enum ib_mw_type type) +{ + switch (type) { + case IB_MW_TYPE_1: return MLX4_MW_TYPE_1; + case IB_MW_TYPE_2: return MLX4_MW_TYPE_2; + default: return -1; + } +} + +struct ib_mr *mlx4_ib_get_dma_mr(struct ib_pd *pd, int acc) +{ + struct mlx4_ib_mr *mr; + int err; + + mr = kmalloc(sizeof *mr, GFP_KERNEL); + if (!mr) + return ERR_PTR(-ENOMEM); + + err = mlx4_mr_alloc(to_mdev(pd->device)->dev, to_mpd(pd)->pdn, 0, + ~0ull, convert_access(acc), 0, 0, &mr->mmr); + if (err) + goto err_free; + + err = mlx4_mr_enable(to_mdev(pd->device)->dev, &mr->mmr); + if (err) + goto err_mr; + + mr->ibmr.rkey = mr->ibmr.lkey = mr->mmr.key; + mr->umem = NULL; + + return &mr->ibmr; + +err_mr: + (void) mlx4_mr_free(to_mdev(pd->device)->dev, &mr->mmr); + +err_free: + kfree(mr); + + return ERR_PTR(err); +} + +int mlx4_ib_umem_write_mtt(struct mlx4_ib_dev *dev, struct mlx4_mtt *mtt, + struct ib_umem *umem) +{ + u64 *pages; + int i, k, entry; + int n; + int len; + int err = 0; + struct scatterlist *sg; + + pages = (u64 *) __get_free_page(GFP_KERNEL); + if (!pages) + return -ENOMEM; + + i = n = 0; + + for_each_sg(umem->sg_head.sgl, sg, umem->nmap, entry) { + len = sg_dma_len(sg) >> mtt->page_shift; + for (k = 0; k < len; ++k) { + pages[i++] = sg_dma_address(sg) + + umem->page_size * k; + /* + * Be friendly to mlx4_write_mtt() and + * pass it chunks of appropriate size. + */ + if (i == PAGE_SIZE / sizeof (u64)) { + err = mlx4_write_mtt(dev->dev, mtt, n, + i, pages); + if (err) + goto out; + n += i; + i = 0; + } + } + } + + if (i) + err = mlx4_write_mtt(dev->dev, mtt, n, i, pages); + +out: + free_page((unsigned long) pages); + return err; +} + +struct ib_mr *mlx4_ib_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, + u64 virt_addr, int access_flags, + struct ib_udata *udata) +{ + struct mlx4_ib_dev *dev = to_mdev(pd->device); + struct mlx4_ib_mr *mr; + int shift; + int err; + int n; + + mr = kmalloc(sizeof *mr, GFP_KERNEL); + if (!mr) + return ERR_PTR(-ENOMEM); + + /* Force registering the memory as writable. */ + /* Used for memory re-registeration. HCA protects the access */ + mr->umem = ib_umem_get(pd->uobject->context, start, length, + access_flags | IB_ACCESS_LOCAL_WRITE, 0); + if (IS_ERR(mr->umem)) { + err = PTR_ERR(mr->umem); + goto err_free; + } + + n = ib_umem_page_count(mr->umem); + shift = ilog2(mr->umem->page_size); + + err = mlx4_mr_alloc(dev->dev, to_mpd(pd)->pdn, virt_addr, length, + convert_access(access_flags), n, shift, &mr->mmr); + if (err) + goto err_umem; + + err = mlx4_ib_umem_write_mtt(dev, &mr->mmr.mtt, mr->umem); + if (err) + goto err_mr; + + err = mlx4_mr_enable(dev->dev, &mr->mmr); + if (err) + goto err_mr; + + mr->ibmr.rkey = mr->ibmr.lkey = mr->mmr.key; + + return &mr->ibmr; + +err_mr: + (void) mlx4_mr_free(to_mdev(pd->device)->dev, &mr->mmr); + +err_umem: + ib_umem_release(mr->umem); + +err_free: + kfree(mr); + + return ERR_PTR(err); +} + +int mlx4_ib_rereg_user_mr(struct ib_mr *mr, int flags, + u64 start, u64 length, u64 virt_addr, + int mr_access_flags, struct ib_pd *pd, + struct ib_udata *udata) +{ + struct mlx4_ib_dev *dev = to_mdev(mr->device); + struct mlx4_ib_mr *mmr = to_mmr(mr); + struct mlx4_mpt_entry *mpt_entry; + struct mlx4_mpt_entry **pmpt_entry = &mpt_entry; + int err; + + /* Since we synchronize this call and mlx4_ib_dereg_mr via uverbs, + * we assume that the calls can't run concurrently. Otherwise, a + * race exists. 
+ */ + err = mlx4_mr_hw_get_mpt(dev->dev, &mmr->mmr, &pmpt_entry); + + if (err) + return err; + + if (flags & IB_MR_REREG_PD) { + err = mlx4_mr_hw_change_pd(dev->dev, *pmpt_entry, + to_mpd(pd)->pdn); + + if (err) + goto release_mpt_entry; + } + + if (flags & IB_MR_REREG_ACCESS) { + err = mlx4_mr_hw_change_access(dev->dev, *pmpt_entry, + convert_access(mr_access_flags)); + + if (err) + goto release_mpt_entry; + } + + if (flags & IB_MR_REREG_TRANS) { + int shift; + int n; + + mlx4_mr_rereg_mem_cleanup(dev->dev, &mmr->mmr); + ib_umem_release(mmr->umem); + mmr->umem = ib_umem_get(mr->uobject->context, start, length, + mr_access_flags | + IB_ACCESS_LOCAL_WRITE, + 0); + if (IS_ERR(mmr->umem)) { + err = PTR_ERR(mmr->umem); + /* Prevent mlx4_ib_dereg_mr from free'ing invalid pointer */ + mmr->umem = NULL; + goto release_mpt_entry; + } + n = ib_umem_page_count(mmr->umem); + shift = ilog2(mmr->umem->page_size); + + err = mlx4_mr_rereg_mem_write(dev->dev, &mmr->mmr, + virt_addr, length, n, shift, + *pmpt_entry); + if (err) { + ib_umem_release(mmr->umem); + goto release_mpt_entry; + } + mmr->mmr.iova = virt_addr; + mmr->mmr.size = length; + + err = mlx4_ib_umem_write_mtt(dev, &mmr->mmr.mtt, mmr->umem); + if (err) { + mlx4_mr_rereg_mem_cleanup(dev->dev, &mmr->mmr); + ib_umem_release(mmr->umem); + goto release_mpt_entry; + } + } + + /* If we couldn't transfer the MR to the HCA, just remember to + * return a failure. But dereg_mr will free the resources. + */ + err = mlx4_mr_hw_write_mpt(dev->dev, &mmr->mmr, pmpt_entry); + if (!err && flags & IB_MR_REREG_ACCESS) + mmr->mmr.access = mr_access_flags; + +release_mpt_entry: + mlx4_mr_hw_put_mpt(dev->dev, pmpt_entry); + + return err; +} + +int mlx4_ib_dereg_mr(struct ib_mr *ibmr) +{ + struct mlx4_ib_mr *mr = to_mmr(ibmr); + int ret; + + ret = mlx4_mr_free(to_mdev(ibmr->device)->dev, &mr->mmr); + if (ret) + return ret; + if (mr->umem) + ib_umem_release(mr->umem); + kfree(mr); + + return 0; +} + +struct ib_mw *mlx4_ib_alloc_mw(struct ib_pd *pd, enum ib_mw_type type) +{ + struct mlx4_ib_dev *dev = to_mdev(pd->device); + struct mlx4_ib_mw *mw; + int err; + + mw = kmalloc(sizeof(*mw), GFP_KERNEL); + if (!mw) + return ERR_PTR(-ENOMEM); + + err = mlx4_mw_alloc(dev->dev, to_mpd(pd)->pdn, + to_mlx4_type(type), &mw->mmw); + if (err) + goto err_free; + + err = mlx4_mw_enable(dev->dev, &mw->mmw); + if (err) + goto err_mw; + + mw->ibmw.rkey = mw->mmw.key; + + return &mw->ibmw; + +err_mw: + mlx4_mw_free(dev->dev, &mw->mmw); + +err_free: + kfree(mw); + + return ERR_PTR(err); +} + +int mlx4_ib_bind_mw(struct ib_qp *qp, struct ib_mw *mw, + struct ib_mw_bind *mw_bind) +{ + struct ib_send_wr wr; + struct ib_send_wr *bad_wr; + int ret; + + memset(&wr, 0, sizeof(wr)); + wr.opcode = IB_WR_BIND_MW; + wr.wr_id = mw_bind->wr_id; + wr.send_flags = mw_bind->send_flags; + wr.wr.bind_mw.mw = mw; + wr.wr.bind_mw.bind_info = mw_bind->bind_info; + wr.wr.bind_mw.rkey = ib_inc_rkey(mw->rkey); + + ret = mlx4_ib_post_send(qp, &wr, &bad_wr); + if (!ret) + mw->rkey = wr.wr.bind_mw.rkey; + + return ret; +} + +int mlx4_ib_dealloc_mw(struct ib_mw *ibmw) +{ + struct mlx4_ib_mw *mw = to_mmw(ibmw); + + mlx4_mw_free(to_mdev(ibmw->device)->dev, &mw->mmw); + kfree(mw); + + return 0; +} + +struct ib_mr *mlx4_ib_alloc_fast_reg_mr(struct ib_pd *pd, + int max_page_list_len) +{ + struct mlx4_ib_dev *dev = to_mdev(pd->device); + struct mlx4_ib_mr *mr; + int err; + + mr = kmalloc(sizeof *mr, GFP_KERNEL); + if (!mr) + return ERR_PTR(-ENOMEM); + + err = mlx4_mr_alloc(dev->dev, to_mpd(pd)->pdn, 0, 0, 0, + 
max_page_list_len, 0, &mr->mmr); + if (err) + goto err_free; + + err = mlx4_mr_enable(dev->dev, &mr->mmr); + if (err) + goto err_mr; + + mr->ibmr.rkey = mr->ibmr.lkey = mr->mmr.key; + mr->umem = NULL; + + return &mr->ibmr; + +err_mr: + (void) mlx4_mr_free(dev->dev, &mr->mmr); + +err_free: + kfree(mr); + return ERR_PTR(err); +} + +struct ib_fast_reg_page_list *mlx4_ib_alloc_fast_reg_page_list(struct ib_device *ibdev, + int page_list_len) +{ + struct mlx4_ib_dev *dev = to_mdev(ibdev); + struct mlx4_ib_fast_reg_page_list *mfrpl; + int size = page_list_len * sizeof (u64); + + if (page_list_len > MLX4_MAX_FAST_REG_PAGES) + return ERR_PTR(-EINVAL); + + mfrpl = kmalloc(sizeof *mfrpl, GFP_KERNEL); + if (!mfrpl) + return ERR_PTR(-ENOMEM); + + mfrpl->ibfrpl.page_list = kmalloc(size, GFP_KERNEL); + if (!mfrpl->ibfrpl.page_list) + goto err_free; + + mfrpl->mapped_page_list = dma_alloc_coherent(&dev->dev->persist-> + pdev->dev, + size, &mfrpl->map, + GFP_KERNEL); + if (!mfrpl->mapped_page_list) + goto err_free; + + WARN_ON(mfrpl->map & 0x3f); + + return &mfrpl->ibfrpl; + +err_free: + kfree(mfrpl->ibfrpl.page_list); + kfree(mfrpl); + return ERR_PTR(-ENOMEM); +} + +void mlx4_ib_free_fast_reg_page_list(struct ib_fast_reg_page_list *page_list) +{ + struct mlx4_ib_dev *dev = to_mdev(page_list->device); + struct mlx4_ib_fast_reg_page_list *mfrpl = to_mfrpl(page_list); + int size = page_list->max_page_list_len * sizeof (u64); + + dma_free_coherent(&dev->dev->persist->pdev->dev, size, + mfrpl->mapped_page_list, + mfrpl->map); + kfree(mfrpl->ibfrpl.page_list); + kfree(mfrpl); +} + +struct ib_fmr *mlx4_ib_fmr_alloc(struct ib_pd *pd, int acc, + struct ib_fmr_attr *fmr_attr) +{ + struct mlx4_ib_dev *dev = to_mdev(pd->device); + struct mlx4_ib_fmr *fmr; + int err = -ENOMEM; + + fmr = kmalloc(sizeof *fmr, GFP_KERNEL); + if (!fmr) + return ERR_PTR(-ENOMEM); + + err = mlx4_fmr_alloc(dev->dev, to_mpd(pd)->pdn, convert_access(acc), + fmr_attr->max_pages, fmr_attr->max_maps, + fmr_attr->page_shift, &fmr->mfmr); + if (err) + goto err_free; + + err = mlx4_fmr_enable(to_mdev(pd->device)->dev, &fmr->mfmr); + if (err) + goto err_mr; + + fmr->ibfmr.rkey = fmr->ibfmr.lkey = fmr->mfmr.mr.key; + + return &fmr->ibfmr; + +err_mr: + (void) mlx4_mr_free(to_mdev(pd->device)->dev, &fmr->mfmr.mr); + +err_free: + kfree(fmr); + + return ERR_PTR(err); +} + +int mlx4_ib_map_phys_fmr(struct ib_fmr *ibfmr, u64 *page_list, + int npages, u64 iova) +{ + struct mlx4_ib_fmr *ifmr = to_mfmr(ibfmr); + struct mlx4_ib_dev *dev = to_mdev(ifmr->ibfmr.device); + + return mlx4_map_phys_fmr(dev->dev, &ifmr->mfmr, page_list, npages, iova, + &ifmr->ibfmr.lkey, &ifmr->ibfmr.rkey); +} + +int mlx4_ib_unmap_fmr(struct list_head *fmr_list) +{ + struct ib_fmr *ibfmr; + int err; + struct mlx4_dev *mdev = NULL; + + list_for_each_entry(ibfmr, fmr_list, list) { + if (mdev && to_mdev(ibfmr->device)->dev != mdev) + return -EINVAL; + mdev = to_mdev(ibfmr->device)->dev; + } + + if (!mdev) + return 0; + + list_for_each_entry(ibfmr, fmr_list, list) { + struct mlx4_ib_fmr *ifmr = to_mfmr(ibfmr); + + mlx4_fmr_unmap(mdev, &ifmr->mfmr, &ifmr->ibfmr.lkey, &ifmr->ibfmr.rkey); + } + + /* + * Make sure all MPT status updates are visible before issuing + * SYNC_TPT firmware command. 
+ */ + wmb(); + + err = mlx4_SYNC_TPT(mdev); + if (err) + pr_warn("SYNC_TPT error %d when " + "unmapping FMRs\n", err); + + return 0; +} + +int mlx4_ib_fmr_dealloc(struct ib_fmr *ibfmr) +{ + struct mlx4_ib_fmr *ifmr = to_mfmr(ibfmr); + struct mlx4_ib_dev *dev = to_mdev(ibfmr->device); + int err; + + err = mlx4_fmr_free(dev->dev, &ifmr->mfmr); + + if (!err) + kfree(ifmr); + + return err; +} diff --git a/kernel/drivers/infiniband/hw/mlx4/qp.c b/kernel/drivers/infiniband/hw/mlx4/qp.c new file mode 100644 index 000000000..02fc91c68 --- /dev/null +++ b/kernel/drivers/infiniband/hw/mlx4/qp.c @@ -0,0 +1,3217 @@ +/* + * Copyright (c) 2007 Cisco Systems, Inc. All rights reserved. + * Copyright (c) 2007, 2008 Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include +#include +#include + +#include +#include +#include +#include + +#include +#include + +#include "mlx4_ib.h" +#include "user.h" + +static void mlx4_ib_lock_cqs(struct mlx4_ib_cq *send_cq, + struct mlx4_ib_cq *recv_cq); +static void mlx4_ib_unlock_cqs(struct mlx4_ib_cq *send_cq, + struct mlx4_ib_cq *recv_cq); + +enum { + MLX4_IB_ACK_REQ_FREQ = 8, +}; + +enum { + MLX4_IB_DEFAULT_SCHED_QUEUE = 0x83, + MLX4_IB_DEFAULT_QP0_SCHED_QUEUE = 0x3f, + MLX4_IB_LINK_TYPE_IB = 0, + MLX4_IB_LINK_TYPE_ETH = 1 +}; + +enum { + /* + * Largest possible UD header: send with GRH and immediate + * data plus 18 bytes for an Ethernet header with VLAN/802.1Q + * tag. 
(LRH would only use 8 bytes, so Ethernet is the + * biggest case) + */ + MLX4_IB_UD_HEADER_SIZE = 82, + MLX4_IB_LSO_HEADER_SPARE = 128, +}; + +enum { + MLX4_IB_IBOE_ETHERTYPE = 0x8915 +}; + +struct mlx4_ib_sqp { + struct mlx4_ib_qp qp; + int pkey_index; + u32 qkey; + u32 send_psn; + struct ib_ud_header ud_header; + u8 header_buf[MLX4_IB_UD_HEADER_SIZE]; +}; + +enum { + MLX4_IB_MIN_SQ_STRIDE = 6, + MLX4_IB_CACHE_LINE_SIZE = 64, +}; + +enum { + MLX4_RAW_QP_MTU = 7, + MLX4_RAW_QP_MSGMAX = 31, +}; + +#ifndef ETH_ALEN +#define ETH_ALEN 6 +#endif + +static const __be32 mlx4_ib_opcode[] = { + [IB_WR_SEND] = cpu_to_be32(MLX4_OPCODE_SEND), + [IB_WR_LSO] = cpu_to_be32(MLX4_OPCODE_LSO), + [IB_WR_SEND_WITH_IMM] = cpu_to_be32(MLX4_OPCODE_SEND_IMM), + [IB_WR_RDMA_WRITE] = cpu_to_be32(MLX4_OPCODE_RDMA_WRITE), + [IB_WR_RDMA_WRITE_WITH_IMM] = cpu_to_be32(MLX4_OPCODE_RDMA_WRITE_IMM), + [IB_WR_RDMA_READ] = cpu_to_be32(MLX4_OPCODE_RDMA_READ), + [IB_WR_ATOMIC_CMP_AND_SWP] = cpu_to_be32(MLX4_OPCODE_ATOMIC_CS), + [IB_WR_ATOMIC_FETCH_AND_ADD] = cpu_to_be32(MLX4_OPCODE_ATOMIC_FA), + [IB_WR_SEND_WITH_INV] = cpu_to_be32(MLX4_OPCODE_SEND_INVAL), + [IB_WR_LOCAL_INV] = cpu_to_be32(MLX4_OPCODE_LOCAL_INVAL), + [IB_WR_FAST_REG_MR] = cpu_to_be32(MLX4_OPCODE_FMR), + [IB_WR_MASKED_ATOMIC_CMP_AND_SWP] = cpu_to_be32(MLX4_OPCODE_MASKED_ATOMIC_CS), + [IB_WR_MASKED_ATOMIC_FETCH_AND_ADD] = cpu_to_be32(MLX4_OPCODE_MASKED_ATOMIC_FA), + [IB_WR_BIND_MW] = cpu_to_be32(MLX4_OPCODE_BIND_MW), +}; + +static struct mlx4_ib_sqp *to_msqp(struct mlx4_ib_qp *mqp) +{ + return container_of(mqp, struct mlx4_ib_sqp, qp); +} + +static int is_tunnel_qp(struct mlx4_ib_dev *dev, struct mlx4_ib_qp *qp) +{ + if (!mlx4_is_master(dev->dev)) + return 0; + + return qp->mqp.qpn >= dev->dev->phys_caps.base_tunnel_sqpn && + qp->mqp.qpn < dev->dev->phys_caps.base_tunnel_sqpn + + 8 * MLX4_MFUNC_MAX; +} + +static int is_sqp(struct mlx4_ib_dev *dev, struct mlx4_ib_qp *qp) +{ + int proxy_sqp = 0; + int real_sqp = 0; + int i; + /* PPF or Native -- real SQP */ + real_sqp = ((mlx4_is_master(dev->dev) || !mlx4_is_mfunc(dev->dev)) && + qp->mqp.qpn >= dev->dev->phys_caps.base_sqpn && + qp->mqp.qpn <= dev->dev->phys_caps.base_sqpn + 3); + if (real_sqp) + return 1; + /* VF or PF -- proxy SQP */ + if (mlx4_is_mfunc(dev->dev)) { + for (i = 0; i < dev->dev->caps.num_ports; i++) { + if (qp->mqp.qpn == dev->dev->caps.qp0_proxy[i] || + qp->mqp.qpn == dev->dev->caps.qp1_proxy[i]) { + proxy_sqp = 1; + break; + } + } + } + return proxy_sqp; +} + +/* used for INIT/CLOSE port logic */ +static int is_qp0(struct mlx4_ib_dev *dev, struct mlx4_ib_qp *qp) +{ + int proxy_qp0 = 0; + int real_qp0 = 0; + int i; + /* PPF or Native -- real QP0 */ + real_qp0 = ((mlx4_is_master(dev->dev) || !mlx4_is_mfunc(dev->dev)) && + qp->mqp.qpn >= dev->dev->phys_caps.base_sqpn && + qp->mqp.qpn <= dev->dev->phys_caps.base_sqpn + 1); + if (real_qp0) + return 1; + /* VF or PF -- proxy QP0 */ + if (mlx4_is_mfunc(dev->dev)) { + for (i = 0; i < dev->dev->caps.num_ports; i++) { + if (qp->mqp.qpn == dev->dev->caps.qp0_proxy[i]) { + proxy_qp0 = 1; + break; + } + } + } + return proxy_qp0; +} + +static void *get_wqe(struct mlx4_ib_qp *qp, int offset) +{ + return mlx4_buf_offset(&qp->buf, offset); +} + +static void *get_recv_wqe(struct mlx4_ib_qp *qp, int n) +{ + return get_wqe(qp, qp->rq.offset + (n << qp->rq.wqe_shift)); +} + +static void *get_send_wqe(struct mlx4_ib_qp *qp, int n) +{ + return get_wqe(qp, qp->sq.offset + (n << qp->sq.wqe_shift)); +} + +/* + * Stamp a SQ WQE so that it is invalid if prefetched by 
marking the + * first four bytes of every 64 byte chunk with + * 0x7FFFFFF | (invalid_ownership_value << 31). + * + * When the max work request size is less than or equal to the WQE + * basic block size, as an optimization, we can stamp all WQEs with + * 0xffffffff, and skip the very first chunk of each WQE. + */ +static void stamp_send_wqe(struct mlx4_ib_qp *qp, int n, int size) +{ + __be32 *wqe; + int i; + int s; + int ind; + void *buf; + __be32 stamp; + struct mlx4_wqe_ctrl_seg *ctrl; + + if (qp->sq_max_wqes_per_wr > 1) { + s = roundup(size, 1U << qp->sq.wqe_shift); + for (i = 0; i < s; i += 64) { + ind = (i >> qp->sq.wqe_shift) + n; + stamp = ind & qp->sq.wqe_cnt ? cpu_to_be32(0x7fffffff) : + cpu_to_be32(0xffffffff); + buf = get_send_wqe(qp, ind & (qp->sq.wqe_cnt - 1)); + wqe = buf + (i & ((1 << qp->sq.wqe_shift) - 1)); + *wqe = stamp; + } + } else { + ctrl = buf = get_send_wqe(qp, n & (qp->sq.wqe_cnt - 1)); + s = (ctrl->fence_size & 0x3f) << 4; + for (i = 64; i < s; i += 64) { + wqe = buf + i; + *wqe = cpu_to_be32(0xffffffff); + } + } +} + +static void post_nop_wqe(struct mlx4_ib_qp *qp, int n, int size) +{ + struct mlx4_wqe_ctrl_seg *ctrl; + struct mlx4_wqe_inline_seg *inl; + void *wqe; + int s; + + ctrl = wqe = get_send_wqe(qp, n & (qp->sq.wqe_cnt - 1)); + s = sizeof(struct mlx4_wqe_ctrl_seg); + + if (qp->ibqp.qp_type == IB_QPT_UD) { + struct mlx4_wqe_datagram_seg *dgram = wqe + sizeof *ctrl; + struct mlx4_av *av = (struct mlx4_av *)dgram->av; + memset(dgram, 0, sizeof *dgram); + av->port_pd = cpu_to_be32((qp->port << 24) | to_mpd(qp->ibqp.pd)->pdn); + s += sizeof(struct mlx4_wqe_datagram_seg); + } + + /* Pad the remainder of the WQE with an inline data segment. */ + if (size > s) { + inl = wqe + s; + inl->byte_count = cpu_to_be32(1 << 31 | (size - s - sizeof *inl)); + } + ctrl->srcrb_flags = 0; + ctrl->fence_size = size / 16; + /* + * Make sure descriptor is fully written before setting ownership bit + * (because HW can start executing as soon as we do). + */ + wmb(); + + ctrl->owner_opcode = cpu_to_be32(MLX4_OPCODE_NOP | MLX4_WQE_CTRL_NEC) | + (n & qp->sq.wqe_cnt ? 
cpu_to_be32(1 << 31) : 0); + + stamp_send_wqe(qp, n + qp->sq_spare_wqes, size); +} + +/* Post NOP WQE to prevent wrap-around in the middle of WR */ +static inline unsigned pad_wraparound(struct mlx4_ib_qp *qp, int ind) +{ + unsigned s = qp->sq.wqe_cnt - (ind & (qp->sq.wqe_cnt - 1)); + if (unlikely(s < qp->sq_max_wqes_per_wr)) { + post_nop_wqe(qp, ind, s << qp->sq.wqe_shift); + ind += s; + } + return ind; +} + +static void mlx4_ib_qp_event(struct mlx4_qp *qp, enum mlx4_event type) +{ + struct ib_event event; + struct ib_qp *ibqp = &to_mibqp(qp)->ibqp; + + if (type == MLX4_EVENT_TYPE_PATH_MIG) + to_mibqp(qp)->port = to_mibqp(qp)->alt_port; + + if (ibqp->event_handler) { + event.device = ibqp->device; + event.element.qp = ibqp; + switch (type) { + case MLX4_EVENT_TYPE_PATH_MIG: + event.event = IB_EVENT_PATH_MIG; + break; + case MLX4_EVENT_TYPE_COMM_EST: + event.event = IB_EVENT_COMM_EST; + break; + case MLX4_EVENT_TYPE_SQ_DRAINED: + event.event = IB_EVENT_SQ_DRAINED; + break; + case MLX4_EVENT_TYPE_SRQ_QP_LAST_WQE: + event.event = IB_EVENT_QP_LAST_WQE_REACHED; + break; + case MLX4_EVENT_TYPE_WQ_CATAS_ERROR: + event.event = IB_EVENT_QP_FATAL; + break; + case MLX4_EVENT_TYPE_PATH_MIG_FAILED: + event.event = IB_EVENT_PATH_MIG_ERR; + break; + case MLX4_EVENT_TYPE_WQ_INVAL_REQ_ERROR: + event.event = IB_EVENT_QP_REQ_ERR; + break; + case MLX4_EVENT_TYPE_WQ_ACCESS_ERROR: + event.event = IB_EVENT_QP_ACCESS_ERR; + break; + default: + pr_warn("Unexpected event type %d " + "on QP %06x\n", type, qp->qpn); + return; + } + + ibqp->event_handler(&event, ibqp->qp_context); + } +} + +static int send_wqe_overhead(enum mlx4_ib_qp_type type, u32 flags) +{ + /* + * UD WQEs must have a datagram segment. + * RC and UC WQEs might have a remote address segment. + * MLX WQEs need two extra inline data segments (for the UD + * header and space for the ICRC). + */ + switch (type) { + case MLX4_IB_QPT_UD: + return sizeof (struct mlx4_wqe_ctrl_seg) + + sizeof (struct mlx4_wqe_datagram_seg) + + ((flags & MLX4_IB_QP_LSO) ? 
MLX4_IB_LSO_HEADER_SPARE : 0); + case MLX4_IB_QPT_PROXY_SMI_OWNER: + case MLX4_IB_QPT_PROXY_SMI: + case MLX4_IB_QPT_PROXY_GSI: + return sizeof (struct mlx4_wqe_ctrl_seg) + + sizeof (struct mlx4_wqe_datagram_seg) + 64; + case MLX4_IB_QPT_TUN_SMI_OWNER: + case MLX4_IB_QPT_TUN_GSI: + return sizeof (struct mlx4_wqe_ctrl_seg) + + sizeof (struct mlx4_wqe_datagram_seg); + + case MLX4_IB_QPT_UC: + return sizeof (struct mlx4_wqe_ctrl_seg) + + sizeof (struct mlx4_wqe_raddr_seg); + case MLX4_IB_QPT_RC: + return sizeof (struct mlx4_wqe_ctrl_seg) + + sizeof (struct mlx4_wqe_atomic_seg) + + sizeof (struct mlx4_wqe_raddr_seg); + case MLX4_IB_QPT_SMI: + case MLX4_IB_QPT_GSI: + return sizeof (struct mlx4_wqe_ctrl_seg) + + ALIGN(MLX4_IB_UD_HEADER_SIZE + + DIV_ROUND_UP(MLX4_IB_UD_HEADER_SIZE, + MLX4_INLINE_ALIGN) * + sizeof (struct mlx4_wqe_inline_seg), + sizeof (struct mlx4_wqe_data_seg)) + + ALIGN(4 + + sizeof (struct mlx4_wqe_inline_seg), + sizeof (struct mlx4_wqe_data_seg)); + default: + return sizeof (struct mlx4_wqe_ctrl_seg); + } +} + +static int set_rq_size(struct mlx4_ib_dev *dev, struct ib_qp_cap *cap, + int is_user, int has_rq, struct mlx4_ib_qp *qp) +{ + /* Sanity check RQ size before proceeding */ + if (cap->max_recv_wr > dev->dev->caps.max_wqes - MLX4_IB_SQ_MAX_SPARE || + cap->max_recv_sge > min(dev->dev->caps.max_sq_sg, dev->dev->caps.max_rq_sg)) + return -EINVAL; + + if (!has_rq) { + if (cap->max_recv_wr) + return -EINVAL; + + qp->rq.wqe_cnt = qp->rq.max_gs = 0; + } else { + /* HW requires >= 1 RQ entry with >= 1 gather entry */ + if (is_user && (!cap->max_recv_wr || !cap->max_recv_sge)) + return -EINVAL; + + qp->rq.wqe_cnt = roundup_pow_of_two(max(1U, cap->max_recv_wr)); + qp->rq.max_gs = roundup_pow_of_two(max(1U, cap->max_recv_sge)); + qp->rq.wqe_shift = ilog2(qp->rq.max_gs * sizeof (struct mlx4_wqe_data_seg)); + } + + /* leave userspace return values as they were, so as not to break ABI */ + if (is_user) { + cap->max_recv_wr = qp->rq.max_post = qp->rq.wqe_cnt; + cap->max_recv_sge = qp->rq.max_gs; + } else { + cap->max_recv_wr = qp->rq.max_post = + min(dev->dev->caps.max_wqes - MLX4_IB_SQ_MAX_SPARE, qp->rq.wqe_cnt); + cap->max_recv_sge = min(qp->rq.max_gs, + min(dev->dev->caps.max_sq_sg, + dev->dev->caps.max_rq_sg)); + } + + return 0; +} + +static int set_kernel_sq_size(struct mlx4_ib_dev *dev, struct ib_qp_cap *cap, + enum mlx4_ib_qp_type type, struct mlx4_ib_qp *qp) +{ + int s; + + /* Sanity check SQ size before proceeding */ + if (cap->max_send_wr > (dev->dev->caps.max_wqes - MLX4_IB_SQ_MAX_SPARE) || + cap->max_send_sge > min(dev->dev->caps.max_sq_sg, dev->dev->caps.max_rq_sg) || + cap->max_inline_data + send_wqe_overhead(type, qp->flags) + + sizeof (struct mlx4_wqe_inline_seg) > dev->dev->caps.max_sq_desc_sz) + return -EINVAL; + + /* + * For MLX transport we need 2 extra S/G entries: + * one for the header and one for the checksum at the end + */ + if ((type == MLX4_IB_QPT_SMI || type == MLX4_IB_QPT_GSI || + type & (MLX4_IB_QPT_PROXY_SMI_OWNER | MLX4_IB_QPT_TUN_SMI_OWNER)) && + cap->max_send_sge + 2 > dev->dev->caps.max_sq_sg) + return -EINVAL; + + s = max(cap->max_send_sge * sizeof (struct mlx4_wqe_data_seg), + cap->max_inline_data + sizeof (struct mlx4_wqe_inline_seg)) + + send_wqe_overhead(type, qp->flags); + + if (s > dev->dev->caps.max_sq_desc_sz) + return -EINVAL; + + /* + * Hermon supports shrinking WQEs, such that a single work + * request can include multiple units of 1 << wqe_shift. 
This + * way, work requests can differ in size, and do not have to + * be a power of 2 in size, saving memory and speeding up send + * WR posting. Unfortunately, if we do this then the + * wqe_index field in CQEs can't be used to look up the WR ID + * anymore, so we do this only if selective signaling is off. + * + * Further, on 32-bit platforms, we can't use vmap() to make + * the QP buffer virtually contiguous. Thus we have to use + * constant-sized WRs to make sure a WR is always fully within + * a single page-sized chunk. + * + * Finally, we use NOP work requests to pad the end of the + * work queue, to avoid wrap-around in the middle of WR. We + * set NEC bit to avoid getting completions with error for + * these NOP WRs, but since NEC is only supported starting + * with firmware 2.2.232, we use constant-sized WRs for older + * firmware. + * + * And, since MLX QPs only support SEND, we use constant-sized + * WRs in this case. + * + * We look for the smallest value of wqe_shift such that the + * resulting number of wqes does not exceed device + * capabilities. + * + * We set WQE size to at least 64 bytes, this way stamping + * invalidates each WQE. + */ + if (dev->dev->caps.fw_ver >= MLX4_FW_VER_WQE_CTRL_NEC && + qp->sq_signal_bits && BITS_PER_LONG == 64 && + type != MLX4_IB_QPT_SMI && type != MLX4_IB_QPT_GSI && + !(type & (MLX4_IB_QPT_PROXY_SMI_OWNER | MLX4_IB_QPT_PROXY_SMI | + MLX4_IB_QPT_PROXY_GSI | MLX4_IB_QPT_TUN_SMI_OWNER))) + qp->sq.wqe_shift = ilog2(64); + else + qp->sq.wqe_shift = ilog2(roundup_pow_of_two(s)); + + for (;;) { + qp->sq_max_wqes_per_wr = DIV_ROUND_UP(s, 1U << qp->sq.wqe_shift); + + /* + * We need to leave 2 KB + 1 WR of headroom in the SQ to + * allow HW to prefetch. + */ + qp->sq_spare_wqes = (2048 >> qp->sq.wqe_shift) + qp->sq_max_wqes_per_wr; + qp->sq.wqe_cnt = roundup_pow_of_two(cap->max_send_wr * + qp->sq_max_wqes_per_wr + + qp->sq_spare_wqes); + + if (qp->sq.wqe_cnt <= dev->dev->caps.max_wqes) + break; + + if (qp->sq_max_wqes_per_wr <= 1) + return -EINVAL; + + ++qp->sq.wqe_shift; + } + + qp->sq.max_gs = (min(dev->dev->caps.max_sq_desc_sz, + (qp->sq_max_wqes_per_wr << qp->sq.wqe_shift)) - + send_wqe_overhead(type, qp->flags)) / + sizeof (struct mlx4_wqe_data_seg); + + qp->buf_size = (qp->rq.wqe_cnt << qp->rq.wqe_shift) + + (qp->sq.wqe_cnt << qp->sq.wqe_shift); + if (qp->rq.wqe_shift > qp->sq.wqe_shift) { + qp->rq.offset = 0; + qp->sq.offset = qp->rq.wqe_cnt << qp->rq.wqe_shift; + } else { + qp->rq.offset = qp->sq.wqe_cnt << qp->sq.wqe_shift; + qp->sq.offset = 0; + } + + cap->max_send_wr = qp->sq.max_post = + (qp->sq.wqe_cnt - qp->sq_spare_wqes) / qp->sq_max_wqes_per_wr; + cap->max_send_sge = min(qp->sq.max_gs, + min(dev->dev->caps.max_sq_sg, + dev->dev->caps.max_rq_sg)); + /* We don't support inline sends for kernel QPs (yet) */ + cap->max_inline_data = 0; + + return 0; +} + +static int set_user_sq_size(struct mlx4_ib_dev *dev, + struct mlx4_ib_qp *qp, + struct mlx4_ib_create_qp *ucmd) +{ + /* Sanity check SQ size before proceeding */ + if ((1 << ucmd->log_sq_bb_count) > dev->dev->caps.max_wqes || + ucmd->log_sq_stride > + ilog2(roundup_pow_of_two(dev->dev->caps.max_sq_desc_sz)) || + ucmd->log_sq_stride < MLX4_IB_MIN_SQ_STRIDE) + return -EINVAL; + + qp->sq.wqe_cnt = 1 << ucmd->log_sq_bb_count; + qp->sq.wqe_shift = ucmd->log_sq_stride; + + qp->buf_size = (qp->rq.wqe_cnt << qp->rq.wqe_shift) + + (qp->sq.wqe_cnt << qp->sq.wqe_shift); + + return 0; +} + +static int alloc_proxy_bufs(struct ib_device *dev, struct mlx4_ib_qp *qp) +{ + int i; + + qp->sqp_proxy_rcv = 
+ kmalloc(sizeof (struct mlx4_ib_buf) * qp->rq.wqe_cnt, + GFP_KERNEL); + if (!qp->sqp_proxy_rcv) + return -ENOMEM; + for (i = 0; i < qp->rq.wqe_cnt; i++) { + qp->sqp_proxy_rcv[i].addr = + kmalloc(sizeof (struct mlx4_ib_proxy_sqp_hdr), + GFP_KERNEL); + if (!qp->sqp_proxy_rcv[i].addr) + goto err; + qp->sqp_proxy_rcv[i].map = + ib_dma_map_single(dev, qp->sqp_proxy_rcv[i].addr, + sizeof (struct mlx4_ib_proxy_sqp_hdr), + DMA_FROM_DEVICE); + if (ib_dma_mapping_error(dev, qp->sqp_proxy_rcv[i].map)) { + kfree(qp->sqp_proxy_rcv[i].addr); + goto err; + } + } + return 0; + +err: + while (i > 0) { + --i; + ib_dma_unmap_single(dev, qp->sqp_proxy_rcv[i].map, + sizeof (struct mlx4_ib_proxy_sqp_hdr), + DMA_FROM_DEVICE); + kfree(qp->sqp_proxy_rcv[i].addr); + } + kfree(qp->sqp_proxy_rcv); + qp->sqp_proxy_rcv = NULL; + return -ENOMEM; +} + +static void free_proxy_bufs(struct ib_device *dev, struct mlx4_ib_qp *qp) +{ + int i; + + for (i = 0; i < qp->rq.wqe_cnt; i++) { + ib_dma_unmap_single(dev, qp->sqp_proxy_rcv[i].map, + sizeof (struct mlx4_ib_proxy_sqp_hdr), + DMA_FROM_DEVICE); + kfree(qp->sqp_proxy_rcv[i].addr); + } + kfree(qp->sqp_proxy_rcv); +} + +static int qp_has_rq(struct ib_qp_init_attr *attr) +{ + if (attr->qp_type == IB_QPT_XRC_INI || attr->qp_type == IB_QPT_XRC_TGT) + return 0; + + return !attr->srq; +} + +static int qp0_enabled_vf(struct mlx4_dev *dev, int qpn) +{ + int i; + for (i = 0; i < dev->caps.num_ports; i++) { + if (qpn == dev->caps.qp0_proxy[i]) + return !!dev->caps.qp0_qkey[i]; + } + return 0; +} + +static int create_qp_common(struct mlx4_ib_dev *dev, struct ib_pd *pd, + struct ib_qp_init_attr *init_attr, + struct ib_udata *udata, int sqpn, struct mlx4_ib_qp **caller_qp, + gfp_t gfp) +{ + int qpn; + int err; + struct mlx4_ib_sqp *sqp; + struct mlx4_ib_qp *qp; + enum mlx4_ib_qp_type qp_type = (enum mlx4_ib_qp_type) init_attr->qp_type; + struct mlx4_ib_cq *mcq; + unsigned long flags; + + /* When tunneling special qps, we use a plain UD qp */ + if (sqpn) { + if (mlx4_is_mfunc(dev->dev) && + (!mlx4_is_master(dev->dev) || + !(init_attr->create_flags & MLX4_IB_SRIOV_SQP))) { + if (init_attr->qp_type == IB_QPT_GSI) + qp_type = MLX4_IB_QPT_PROXY_GSI; + else { + if (mlx4_is_master(dev->dev) || + qp0_enabled_vf(dev->dev, sqpn)) + qp_type = MLX4_IB_QPT_PROXY_SMI_OWNER; + else + qp_type = MLX4_IB_QPT_PROXY_SMI; + } + } + qpn = sqpn; + /* add extra sg entry for tunneling */ + init_attr->cap.max_recv_sge++; + } else if (init_attr->create_flags & MLX4_IB_SRIOV_TUNNEL_QP) { + struct mlx4_ib_qp_tunnel_init_attr *tnl_init = + container_of(init_attr, + struct mlx4_ib_qp_tunnel_init_attr, init_attr); + if ((tnl_init->proxy_qp_type != IB_QPT_SMI && + tnl_init->proxy_qp_type != IB_QPT_GSI) || + !mlx4_is_master(dev->dev)) + return -EINVAL; + if (tnl_init->proxy_qp_type == IB_QPT_GSI) + qp_type = MLX4_IB_QPT_TUN_GSI; + else if (tnl_init->slave == mlx4_master_func_num(dev->dev) || + mlx4_vf_smi_enabled(dev->dev, tnl_init->slave, + tnl_init->port)) + qp_type = MLX4_IB_QPT_TUN_SMI_OWNER; + else + qp_type = MLX4_IB_QPT_TUN_SMI; + /* we are definitely in the PPF here, since we are creating + * tunnel QPs. base_tunnel_sqpn is therefore valid. 
*/ + qpn = dev->dev->phys_caps.base_tunnel_sqpn + 8 * tnl_init->slave + + tnl_init->proxy_qp_type * 2 + tnl_init->port - 1; + sqpn = qpn; + } + + if (!*caller_qp) { + if (qp_type == MLX4_IB_QPT_SMI || qp_type == MLX4_IB_QPT_GSI || + (qp_type & (MLX4_IB_QPT_PROXY_SMI | MLX4_IB_QPT_PROXY_SMI_OWNER | + MLX4_IB_QPT_PROXY_GSI | MLX4_IB_QPT_TUN_SMI_OWNER))) { + sqp = kzalloc(sizeof (struct mlx4_ib_sqp), gfp); + if (!sqp) + return -ENOMEM; + qp = &sqp->qp; + qp->pri.vid = 0xFFFF; + qp->alt.vid = 0xFFFF; + } else { + qp = kzalloc(sizeof (struct mlx4_ib_qp), gfp); + if (!qp) + return -ENOMEM; + qp->pri.vid = 0xFFFF; + qp->alt.vid = 0xFFFF; + } + } else + qp = *caller_qp; + + qp->mlx4_ib_qp_type = qp_type; + + mutex_init(&qp->mutex); + spin_lock_init(&qp->sq.lock); + spin_lock_init(&qp->rq.lock); + INIT_LIST_HEAD(&qp->gid_list); + INIT_LIST_HEAD(&qp->steering_rules); + + qp->state = IB_QPS_RESET; + if (init_attr->sq_sig_type == IB_SIGNAL_ALL_WR) + qp->sq_signal_bits = cpu_to_be32(MLX4_WQE_CTRL_CQ_UPDATE); + + err = set_rq_size(dev, &init_attr->cap, !!pd->uobject, qp_has_rq(init_attr), qp); + if (err) + goto err; + + if (pd->uobject) { + struct mlx4_ib_create_qp ucmd; + + if (ib_copy_from_udata(&ucmd, udata, sizeof ucmd)) { + err = -EFAULT; + goto err; + } + + qp->sq_no_prefetch = ucmd.sq_no_prefetch; + + err = set_user_sq_size(dev, qp, &ucmd); + if (err) + goto err; + + qp->umem = ib_umem_get(pd->uobject->context, ucmd.buf_addr, + qp->buf_size, 0, 0); + if (IS_ERR(qp->umem)) { + err = PTR_ERR(qp->umem); + goto err; + } + + err = mlx4_mtt_init(dev->dev, ib_umem_page_count(qp->umem), + ilog2(qp->umem->page_size), &qp->mtt); + if (err) + goto err_buf; + + err = mlx4_ib_umem_write_mtt(dev, &qp->mtt, qp->umem); + if (err) + goto err_mtt; + + if (qp_has_rq(init_attr)) { + err = mlx4_ib_db_map_user(to_mucontext(pd->uobject->context), + ucmd.db_addr, &qp->db); + if (err) + goto err_mtt; + } + } else { + qp->sq_no_prefetch = 0; + + if (init_attr->create_flags & IB_QP_CREATE_BLOCK_MULTICAST_LOOPBACK) + qp->flags |= MLX4_IB_QP_BLOCK_MULTICAST_LOOPBACK; + + if (init_attr->create_flags & IB_QP_CREATE_IPOIB_UD_LSO) + qp->flags |= MLX4_IB_QP_LSO; + + if (init_attr->create_flags & IB_QP_CREATE_NETIF_QP) { + if (dev->steering_support == + MLX4_STEERING_MODE_DEVICE_MANAGED) + qp->flags |= MLX4_IB_QP_NETIF; + else + goto err; + } + + err = set_kernel_sq_size(dev, &init_attr->cap, qp_type, qp); + if (err) + goto err; + + if (qp_has_rq(init_attr)) { + err = mlx4_db_alloc(dev->dev, &qp->db, 0, gfp); + if (err) + goto err; + + *qp->db.db = 0; + } + + if (mlx4_buf_alloc(dev->dev, qp->buf_size, PAGE_SIZE * 2, &qp->buf, gfp)) { + err = -ENOMEM; + goto err_db; + } + + err = mlx4_mtt_init(dev->dev, qp->buf.npages, qp->buf.page_shift, + &qp->mtt); + if (err) + goto err_buf; + + err = mlx4_buf_write_mtt(dev->dev, &qp->mtt, &qp->buf, gfp); + if (err) + goto err_mtt; + + qp->sq.wrid = kmalloc(qp->sq.wqe_cnt * sizeof (u64), gfp); + qp->rq.wrid = kmalloc(qp->rq.wqe_cnt * sizeof (u64), gfp); + if (!qp->sq.wrid || !qp->rq.wrid) { + err = -ENOMEM; + goto err_wrid; + } + } + + if (sqpn) { + if (qp->mlx4_ib_qp_type & (MLX4_IB_QPT_PROXY_SMI_OWNER | + MLX4_IB_QPT_PROXY_SMI | MLX4_IB_QPT_PROXY_GSI)) { + if (alloc_proxy_bufs(pd->device, qp)) { + err = -ENOMEM; + goto err_wrid; + } + } + } else { + /* Raw packet QPNs may not have bits 6,7 set in their qp_num; + * otherwise, the WQE BlueFlame setup flow wrongly causes + * VLAN insertion. 
*/ + if (init_attr->qp_type == IB_QPT_RAW_PACKET) + err = mlx4_qp_reserve_range(dev->dev, 1, 1, &qpn, + (init_attr->cap.max_send_wr ? + MLX4_RESERVE_ETH_BF_QP : 0) | + (init_attr->cap.max_recv_wr ? + MLX4_RESERVE_A0_QP : 0)); + else + if (qp->flags & MLX4_IB_QP_NETIF) + err = mlx4_ib_steer_qp_alloc(dev, 1, &qpn); + else + err = mlx4_qp_reserve_range(dev->dev, 1, 1, + &qpn, 0); + if (err) + goto err_proxy; + } + + err = mlx4_qp_alloc(dev->dev, qpn, &qp->mqp, gfp); + if (err) + goto err_qpn; + + if (init_attr->qp_type == IB_QPT_XRC_TGT) + qp->mqp.qpn |= (1 << 23); + + /* + * Hardware wants QPN written in big-endian order (after + * shifting) for send doorbell. Precompute this value to save + * a little bit when posting sends. + */ + qp->doorbell_qpn = swab32(qp->mqp.qpn << 8); + + qp->mqp.event = mlx4_ib_qp_event; + if (!*caller_qp) + *caller_qp = qp; + + spin_lock_irqsave(&dev->reset_flow_resource_lock, flags); + mlx4_ib_lock_cqs(to_mcq(init_attr->send_cq), + to_mcq(init_attr->recv_cq)); + /* Maintain device to QPs access, needed for further handling + * via reset flow + */ + list_add_tail(&qp->qps_list, &dev->qp_list); + /* Maintain CQ to QPs access, needed for further handling + * via reset flow + */ + mcq = to_mcq(init_attr->send_cq); + list_add_tail(&qp->cq_send_list, &mcq->send_qp_list); + mcq = to_mcq(init_attr->recv_cq); + list_add_tail(&qp->cq_recv_list, &mcq->recv_qp_list); + mlx4_ib_unlock_cqs(to_mcq(init_attr->send_cq), + to_mcq(init_attr->recv_cq)); + spin_unlock_irqrestore(&dev->reset_flow_resource_lock, flags); + return 0; + +err_qpn: + if (!sqpn) { + if (qp->flags & MLX4_IB_QP_NETIF) + mlx4_ib_steer_qp_free(dev, qpn, 1); + else + mlx4_qp_release_range(dev->dev, qpn, 1); + } +err_proxy: + if (qp->mlx4_ib_qp_type == MLX4_IB_QPT_PROXY_GSI) + free_proxy_bufs(pd->device, qp); +err_wrid: + if (pd->uobject) { + if (qp_has_rq(init_attr)) + mlx4_ib_db_unmap_user(to_mucontext(pd->uobject->context), &qp->db); + } else { + kfree(qp->sq.wrid); + kfree(qp->rq.wrid); + } + +err_mtt: + mlx4_mtt_cleanup(dev->dev, &qp->mtt); + +err_buf: + if (pd->uobject) + ib_umem_release(qp->umem); + else + mlx4_buf_free(dev->dev, qp->buf_size, &qp->buf); + +err_db: + if (!pd->uobject && qp_has_rq(init_attr)) + mlx4_db_free(dev->dev, &qp->db); + +err: + if (!*caller_qp) + kfree(qp); + return err; +} + +static enum mlx4_qp_state to_mlx4_state(enum ib_qp_state state) +{ + switch (state) { + case IB_QPS_RESET: return MLX4_QP_STATE_RST; + case IB_QPS_INIT: return MLX4_QP_STATE_INIT; + case IB_QPS_RTR: return MLX4_QP_STATE_RTR; + case IB_QPS_RTS: return MLX4_QP_STATE_RTS; + case IB_QPS_SQD: return MLX4_QP_STATE_SQD; + case IB_QPS_SQE: return MLX4_QP_STATE_SQER; + case IB_QPS_ERR: return MLX4_QP_STATE_ERR; + default: return -1; + } +} + +static void mlx4_ib_lock_cqs(struct mlx4_ib_cq *send_cq, struct mlx4_ib_cq *recv_cq) + __acquires(&send_cq->lock) __acquires(&recv_cq->lock) +{ + if (send_cq == recv_cq) { + spin_lock(&send_cq->lock); + __acquire(&recv_cq->lock); + } else if (send_cq->mcq.cqn < recv_cq->mcq.cqn) { + spin_lock(&send_cq->lock); + spin_lock_nested(&recv_cq->lock, SINGLE_DEPTH_NESTING); + } else { + spin_lock(&recv_cq->lock); + spin_lock_nested(&send_cq->lock, SINGLE_DEPTH_NESTING); + } +} + +static void mlx4_ib_unlock_cqs(struct mlx4_ib_cq *send_cq, struct mlx4_ib_cq *recv_cq) + __releases(&send_cq->lock) __releases(&recv_cq->lock) +{ + if (send_cq == recv_cq) { + __release(&recv_cq->lock); + spin_unlock(&send_cq->lock); + } else if (send_cq->mcq.cqn < recv_cq->mcq.cqn) { + 
spin_unlock(&recv_cq->lock); + spin_unlock(&send_cq->lock); + } else { + spin_unlock(&send_cq->lock); + spin_unlock(&recv_cq->lock); + } +} + +static void del_gid_entries(struct mlx4_ib_qp *qp) +{ + struct mlx4_ib_gid_entry *ge, *tmp; + + list_for_each_entry_safe(ge, tmp, &qp->gid_list, list) { + list_del(&ge->list); + kfree(ge); + } +} + +static struct mlx4_ib_pd *get_pd(struct mlx4_ib_qp *qp) +{ + if (qp->ibqp.qp_type == IB_QPT_XRC_TGT) + return to_mpd(to_mxrcd(qp->ibqp.xrcd)->pd); + else + return to_mpd(qp->ibqp.pd); +} + +static void get_cqs(struct mlx4_ib_qp *qp, + struct mlx4_ib_cq **send_cq, struct mlx4_ib_cq **recv_cq) +{ + switch (qp->ibqp.qp_type) { + case IB_QPT_XRC_TGT: + *send_cq = to_mcq(to_mxrcd(qp->ibqp.xrcd)->cq); + *recv_cq = *send_cq; + break; + case IB_QPT_XRC_INI: + *send_cq = to_mcq(qp->ibqp.send_cq); + *recv_cq = *send_cq; + break; + default: + *send_cq = to_mcq(qp->ibqp.send_cq); + *recv_cq = to_mcq(qp->ibqp.recv_cq); + break; + } +} + +static void destroy_qp_common(struct mlx4_ib_dev *dev, struct mlx4_ib_qp *qp, + int is_user) +{ + struct mlx4_ib_cq *send_cq, *recv_cq; + unsigned long flags; + + if (qp->state != IB_QPS_RESET) { + if (mlx4_qp_modify(dev->dev, NULL, to_mlx4_state(qp->state), + MLX4_QP_STATE_RST, NULL, 0, 0, &qp->mqp)) + pr_warn("modify QP %06x to RESET failed.\n", + qp->mqp.qpn); + if (qp->pri.smac || (!qp->pri.smac && qp->pri.smac_port)) { + mlx4_unregister_mac(dev->dev, qp->pri.smac_port, qp->pri.smac); + qp->pri.smac = 0; + qp->pri.smac_port = 0; + } + if (qp->alt.smac) { + mlx4_unregister_mac(dev->dev, qp->alt.smac_port, qp->alt.smac); + qp->alt.smac = 0; + } + if (qp->pri.vid < 0x1000) { + mlx4_unregister_vlan(dev->dev, qp->pri.vlan_port, qp->pri.vid); + qp->pri.vid = 0xFFFF; + qp->pri.candidate_vid = 0xFFFF; + qp->pri.update_vid = 0; + } + if (qp->alt.vid < 0x1000) { + mlx4_unregister_vlan(dev->dev, qp->alt.vlan_port, qp->alt.vid); + qp->alt.vid = 0xFFFF; + qp->alt.candidate_vid = 0xFFFF; + qp->alt.update_vid = 0; + } + } + + get_cqs(qp, &send_cq, &recv_cq); + + spin_lock_irqsave(&dev->reset_flow_resource_lock, flags); + mlx4_ib_lock_cqs(send_cq, recv_cq); + + /* del from lists under both locks above to protect reset flow paths */ + list_del(&qp->qps_list); + list_del(&qp->cq_send_list); + list_del(&qp->cq_recv_list); + if (!is_user) { + __mlx4_ib_cq_clean(recv_cq, qp->mqp.qpn, + qp->ibqp.srq ? 
to_msrq(qp->ibqp.srq): NULL); + if (send_cq != recv_cq) + __mlx4_ib_cq_clean(send_cq, qp->mqp.qpn, NULL); + } + + mlx4_qp_remove(dev->dev, &qp->mqp); + + mlx4_ib_unlock_cqs(send_cq, recv_cq); + spin_unlock_irqrestore(&dev->reset_flow_resource_lock, flags); + + mlx4_qp_free(dev->dev, &qp->mqp); + + if (!is_sqp(dev, qp) && !is_tunnel_qp(dev, qp)) { + if (qp->flags & MLX4_IB_QP_NETIF) + mlx4_ib_steer_qp_free(dev, qp->mqp.qpn, 1); + else + mlx4_qp_release_range(dev->dev, qp->mqp.qpn, 1); + } + + mlx4_mtt_cleanup(dev->dev, &qp->mtt); + + if (is_user) { + if (qp->rq.wqe_cnt) + mlx4_ib_db_unmap_user(to_mucontext(qp->ibqp.uobject->context), + &qp->db); + ib_umem_release(qp->umem); + } else { + kfree(qp->sq.wrid); + kfree(qp->rq.wrid); + if (qp->mlx4_ib_qp_type & (MLX4_IB_QPT_PROXY_SMI_OWNER | + MLX4_IB_QPT_PROXY_SMI | MLX4_IB_QPT_PROXY_GSI)) + free_proxy_bufs(&dev->ib_dev, qp); + mlx4_buf_free(dev->dev, qp->buf_size, &qp->buf); + if (qp->rq.wqe_cnt) + mlx4_db_free(dev->dev, &qp->db); + } + + del_gid_entries(qp); +} + +static u32 get_sqp_num(struct mlx4_ib_dev *dev, struct ib_qp_init_attr *attr) +{ + /* Native or PPF */ + if (!mlx4_is_mfunc(dev->dev) || + (mlx4_is_master(dev->dev) && + attr->create_flags & MLX4_IB_SRIOV_SQP)) { + return dev->dev->phys_caps.base_sqpn + + (attr->qp_type == IB_QPT_SMI ? 0 : 2) + + attr->port_num - 1; + } + /* PF or VF -- creating proxies */ + if (attr->qp_type == IB_QPT_SMI) + return dev->dev->caps.qp0_proxy[attr->port_num - 1]; + else + return dev->dev->caps.qp1_proxy[attr->port_num - 1]; +} + +struct ib_qp *mlx4_ib_create_qp(struct ib_pd *pd, + struct ib_qp_init_attr *init_attr, + struct ib_udata *udata) +{ + struct mlx4_ib_qp *qp = NULL; + int err; + u16 xrcdn = 0; + gfp_t gfp; + + gfp = (init_attr->create_flags & MLX4_IB_QP_CREATE_USE_GFP_NOIO) ? + GFP_NOIO : GFP_KERNEL; + /* + * We only support LSO, vendor flag1, and multicast loopback blocking, + * and only for kernel UD QPs. 
+ */ + if (init_attr->create_flags & ~(MLX4_IB_QP_LSO | + MLX4_IB_QP_BLOCK_MULTICAST_LOOPBACK | + MLX4_IB_SRIOV_TUNNEL_QP | + MLX4_IB_SRIOV_SQP | + MLX4_IB_QP_NETIF | + MLX4_IB_QP_CREATE_USE_GFP_NOIO)) + return ERR_PTR(-EINVAL); + + if (init_attr->create_flags & IB_QP_CREATE_NETIF_QP) { + if (init_attr->qp_type != IB_QPT_UD) + return ERR_PTR(-EINVAL); + } + + if (init_attr->create_flags && + (udata || + ((init_attr->create_flags & ~(MLX4_IB_SRIOV_SQP | MLX4_IB_QP_CREATE_USE_GFP_NOIO)) && + init_attr->qp_type != IB_QPT_UD) || + ((init_attr->create_flags & MLX4_IB_SRIOV_SQP) && + init_attr->qp_type > IB_QPT_GSI))) + return ERR_PTR(-EINVAL); + + switch (init_attr->qp_type) { + case IB_QPT_XRC_TGT: + pd = to_mxrcd(init_attr->xrcd)->pd; + xrcdn = to_mxrcd(init_attr->xrcd)->xrcdn; + init_attr->send_cq = to_mxrcd(init_attr->xrcd)->cq; + /* fall through */ + case IB_QPT_XRC_INI: + if (!(to_mdev(pd->device)->dev->caps.flags & MLX4_DEV_CAP_FLAG_XRC)) + return ERR_PTR(-ENOSYS); + init_attr->recv_cq = init_attr->send_cq; + /* fall through */ + case IB_QPT_RC: + case IB_QPT_UC: + case IB_QPT_RAW_PACKET: + qp = kzalloc(sizeof *qp, gfp); + if (!qp) + return ERR_PTR(-ENOMEM); + qp->pri.vid = 0xFFFF; + qp->alt.vid = 0xFFFF; + /* fall through */ + case IB_QPT_UD: + { + err = create_qp_common(to_mdev(pd->device), pd, init_attr, + udata, 0, &qp, gfp); + if (err) + return ERR_PTR(err); + + qp->ibqp.qp_num = qp->mqp.qpn; + qp->xrcdn = xrcdn; + + break; + } + case IB_QPT_SMI: + case IB_QPT_GSI: + { + /* Userspace is not allowed to create special QPs: */ + if (udata) + return ERR_PTR(-EINVAL); + + err = create_qp_common(to_mdev(pd->device), pd, init_attr, udata, + get_sqp_num(to_mdev(pd->device), init_attr), + &qp, gfp); + if (err) + return ERR_PTR(err); + + qp->port = init_attr->port_num; + qp->ibqp.qp_num = init_attr->qp_type == IB_QPT_SMI ? 0 : 1; + + break; + } + default: + /* Don't support raw QPs */ + return ERR_PTR(-EINVAL); + } + + return &qp->ibqp; +} + +int mlx4_ib_destroy_qp(struct ib_qp *qp) +{ + struct mlx4_ib_dev *dev = to_mdev(qp->device); + struct mlx4_ib_qp *mqp = to_mqp(qp); + struct mlx4_ib_pd *pd; + + if (is_qp0(dev, mqp)) + mlx4_CLOSE_PORT(dev->dev, mqp->port); + + if (dev->qp1_proxy[mqp->port - 1] == mqp) { + mutex_lock(&dev->qp1_proxy_lock[mqp->port - 1]); + dev->qp1_proxy[mqp->port - 1] = NULL; + mutex_unlock(&dev->qp1_proxy_lock[mqp->port - 1]); + } + + pd = get_pd(mqp); + destroy_qp_common(dev, mqp, !!pd->ibpd.uobject); + + if (is_sqp(dev, mqp)) + kfree(to_msqp(mqp)); + else + kfree(mqp); + + return 0; +} + +static int to_mlx4_st(struct mlx4_ib_dev *dev, enum mlx4_ib_qp_type type) +{ + switch (type) { + case MLX4_IB_QPT_RC: return MLX4_QP_ST_RC; + case MLX4_IB_QPT_UC: return MLX4_QP_ST_UC; + case MLX4_IB_QPT_UD: return MLX4_QP_ST_UD; + case MLX4_IB_QPT_XRC_INI: + case MLX4_IB_QPT_XRC_TGT: return MLX4_QP_ST_XRC; + case MLX4_IB_QPT_SMI: + case MLX4_IB_QPT_GSI: + case MLX4_IB_QPT_RAW_PACKET: return MLX4_QP_ST_MLX; + + case MLX4_IB_QPT_PROXY_SMI_OWNER: + case MLX4_IB_QPT_TUN_SMI_OWNER: return (mlx4_is_mfunc(dev->dev) ? + MLX4_QP_ST_MLX : -1); + case MLX4_IB_QPT_PROXY_SMI: + case MLX4_IB_QPT_TUN_SMI: + case MLX4_IB_QPT_PROXY_GSI: + case MLX4_IB_QPT_TUN_GSI: return (mlx4_is_mfunc(dev->dev) ? 
+ MLX4_QP_ST_UD : -1); + default: return -1; + } +} + +static __be32 to_mlx4_access_flags(struct mlx4_ib_qp *qp, const struct ib_qp_attr *attr, + int attr_mask) +{ + u8 dest_rd_atomic; + u32 access_flags; + u32 hw_access_flags = 0; + + if (attr_mask & IB_QP_MAX_DEST_RD_ATOMIC) + dest_rd_atomic = attr->max_dest_rd_atomic; + else + dest_rd_atomic = qp->resp_depth; + + if (attr_mask & IB_QP_ACCESS_FLAGS) + access_flags = attr->qp_access_flags; + else + access_flags = qp->atomic_rd_en; + + if (!dest_rd_atomic) + access_flags &= IB_ACCESS_REMOTE_WRITE; + + if (access_flags & IB_ACCESS_REMOTE_READ) + hw_access_flags |= MLX4_QP_BIT_RRE; + if (access_flags & IB_ACCESS_REMOTE_ATOMIC) + hw_access_flags |= MLX4_QP_BIT_RAE; + if (access_flags & IB_ACCESS_REMOTE_WRITE) + hw_access_flags |= MLX4_QP_BIT_RWE; + + return cpu_to_be32(hw_access_flags); +} + +static void store_sqp_attrs(struct mlx4_ib_sqp *sqp, const struct ib_qp_attr *attr, + int attr_mask) +{ + if (attr_mask & IB_QP_PKEY_INDEX) + sqp->pkey_index = attr->pkey_index; + if (attr_mask & IB_QP_QKEY) + sqp->qkey = attr->qkey; + if (attr_mask & IB_QP_SQ_PSN) + sqp->send_psn = attr->sq_psn; +} + +static void mlx4_set_sched(struct mlx4_qp_path *path, u8 port) +{ + path->sched_queue = (path->sched_queue & 0xbf) | ((port - 1) << 6); +} + +static int _mlx4_set_path(struct mlx4_ib_dev *dev, const struct ib_ah_attr *ah, + u64 smac, u16 vlan_tag, struct mlx4_qp_path *path, + struct mlx4_roce_smac_vlan_info *smac_info, u8 port) +{ + int is_eth = rdma_port_get_link_layer(&dev->ib_dev, port) == + IB_LINK_LAYER_ETHERNET; + int vidx; + int smac_index; + int err; + + + path->grh_mylmc = ah->src_path_bits & 0x7f; + path->rlid = cpu_to_be16(ah->dlid); + if (ah->static_rate) { + path->static_rate = ah->static_rate + MLX4_STAT_RATE_OFFSET; + while (path->static_rate > IB_RATE_2_5_GBPS + MLX4_STAT_RATE_OFFSET && + !(1 << path->static_rate & dev->dev->caps.stat_rate_support)) + --path->static_rate; + } else + path->static_rate = 0; + + if (ah->ah_flags & IB_AH_GRH) { + if (ah->grh.sgid_index >= dev->dev->caps.gid_table_len[port]) { + pr_err("sgid_index (%u) too large. max is %d\n", + ah->grh.sgid_index, dev->dev->caps.gid_table_len[port] - 1); + return -1; + } + + path->grh_mylmc |= 1 << 7; + path->mgid_index = ah->grh.sgid_index; + path->hop_limit = ah->grh.hop_limit; + path->tclass_flowlabel = + cpu_to_be32((ah->grh.traffic_class << 20) | + (ah->grh.flow_label)); + memcpy(path->rgid, ah->grh.dgid.raw, 16); + } + + if (is_eth) { + if (!(ah->ah_flags & IB_AH_GRH)) + return -1; + + path->sched_queue = MLX4_IB_DEFAULT_SCHED_QUEUE | + ((port - 1) << 6) | ((ah->sl & 7) << 3); + + path->feup |= MLX4_FEUP_FORCE_ETH_UP; + if (vlan_tag < 0x1000) { + if (smac_info->vid < 0x1000) { + /* both valid vlan ids */ + if (smac_info->vid != vlan_tag) { + /* different VIDs. 
unreg old and reg new */ + err = mlx4_register_vlan(dev->dev, port, vlan_tag, &vidx); + if (err) + return err; + smac_info->candidate_vid = vlan_tag; + smac_info->candidate_vlan_index = vidx; + smac_info->candidate_vlan_port = port; + smac_info->update_vid = 1; + path->vlan_index = vidx; + } else { + path->vlan_index = smac_info->vlan_index; + } + } else { + /* no current vlan tag in qp */ + err = mlx4_register_vlan(dev->dev, port, vlan_tag, &vidx); + if (err) + return err; + smac_info->candidate_vid = vlan_tag; + smac_info->candidate_vlan_index = vidx; + smac_info->candidate_vlan_port = port; + smac_info->update_vid = 1; + path->vlan_index = vidx; + } + path->feup |= MLX4_FVL_FORCE_ETH_VLAN; + path->fl = 1 << 6; + } else { + /* have current vlan tag. unregister it at modify-qp success */ + if (smac_info->vid < 0x1000) { + smac_info->candidate_vid = 0xFFFF; + smac_info->update_vid = 1; + } + } + + /* get smac_index for RoCE use. + * If no smac was yet assigned, register one. + * If one was already assigned, but the new mac differs, + * unregister the old one and register the new one. + */ + if ((!smac_info->smac && !smac_info->smac_port) || + smac_info->smac != smac) { + /* register candidate now, unreg if needed, after success */ + smac_index = mlx4_register_mac(dev->dev, port, smac); + if (smac_index >= 0) { + smac_info->candidate_smac_index = smac_index; + smac_info->candidate_smac = smac; + smac_info->candidate_smac_port = port; + } else { + return -EINVAL; + } + } else { + smac_index = smac_info->smac_index; + } + + memcpy(path->dmac, ah->dmac, 6); + path->ackto = MLX4_IB_LINK_TYPE_ETH; + /* put MAC table smac index for IBoE */ + path->grh_mylmc = (u8) (smac_index) | 0x80; + } else { + path->sched_queue = MLX4_IB_DEFAULT_SCHED_QUEUE | + ((port - 1) << 6) | ((ah->sl & 0xf) << 2); + } + + return 0; +} + +static int mlx4_set_path(struct mlx4_ib_dev *dev, const struct ib_qp_attr *qp, + enum ib_qp_attr_mask qp_attr_mask, + struct mlx4_ib_qp *mqp, + struct mlx4_qp_path *path, u8 port) +{ + return _mlx4_set_path(dev, &qp->ah_attr, + mlx4_mac_to_u64((u8 *)qp->smac), + (qp_attr_mask & IB_QP_VID) ? qp->vlan_id : 0xffff, + path, &mqp->pri, port); +} + +static int mlx4_set_alt_path(struct mlx4_ib_dev *dev, + const struct ib_qp_attr *qp, + enum ib_qp_attr_mask qp_attr_mask, + struct mlx4_ib_qp *mqp, + struct mlx4_qp_path *path, u8 port) +{ + return _mlx4_set_path(dev, &qp->alt_ah_attr, + mlx4_mac_to_u64((u8 *)qp->alt_smac), + (qp_attr_mask & IB_QP_ALT_VID) ? 
+ qp->alt_vlan_id : 0xffff, + path, &mqp->alt, port); +} + +static void update_mcg_macs(struct mlx4_ib_dev *dev, struct mlx4_ib_qp *qp) +{ + struct mlx4_ib_gid_entry *ge, *tmp; + + list_for_each_entry_safe(ge, tmp, &qp->gid_list, list) { + if (!ge->added && mlx4_ib_add_mc(dev, qp, &ge->gid)) { + ge->added = 1; + ge->port = qp->port; + } + } +} + +static int handle_eth_ud_smac_index(struct mlx4_ib_dev *dev, struct mlx4_ib_qp *qp, u8 *smac, + struct mlx4_qp_context *context) +{ + u64 u64_mac; + int smac_index; + + u64_mac = atomic64_read(&dev->iboe.mac[qp->port - 1]); + + context->pri_path.sched_queue = MLX4_IB_DEFAULT_SCHED_QUEUE | ((qp->port - 1) << 6); + if (!qp->pri.smac && !qp->pri.smac_port) { + smac_index = mlx4_register_mac(dev->dev, qp->port, u64_mac); + if (smac_index >= 0) { + qp->pri.candidate_smac_index = smac_index; + qp->pri.candidate_smac = u64_mac; + qp->pri.candidate_smac_port = qp->port; + context->pri_path.grh_mylmc = 0x80 | (u8) smac_index; + } else { + return -ENOENT; + } + } + return 0; +} + +static int __mlx4_ib_modify_qp(struct ib_qp *ibqp, + const struct ib_qp_attr *attr, int attr_mask, + enum ib_qp_state cur_state, enum ib_qp_state new_state) +{ + struct mlx4_ib_dev *dev = to_mdev(ibqp->device); + struct mlx4_ib_qp *qp = to_mqp(ibqp); + struct mlx4_ib_pd *pd; + struct mlx4_ib_cq *send_cq, *recv_cq; + struct mlx4_qp_context *context; + enum mlx4_qp_optpar optpar = 0; + int sqd_event; + int steer_qp = 0; + int err = -EINVAL; + + /* APM is not supported under RoCE */ + if (attr_mask & IB_QP_ALT_PATH && + rdma_port_get_link_layer(&dev->ib_dev, qp->port) == + IB_LINK_LAYER_ETHERNET) + return -ENOTSUPP; + + context = kzalloc(sizeof *context, GFP_KERNEL); + if (!context) + return -ENOMEM; + + context->flags = cpu_to_be32((to_mlx4_state(new_state) << 28) | + (to_mlx4_st(dev, qp->mlx4_ib_qp_type) << 16)); + + if (!(attr_mask & IB_QP_PATH_MIG_STATE)) + context->flags |= cpu_to_be32(MLX4_QP_PM_MIGRATED << 11); + else { + optpar |= MLX4_QP_OPTPAR_PM_STATE; + switch (attr->path_mig_state) { + case IB_MIG_MIGRATED: + context->flags |= cpu_to_be32(MLX4_QP_PM_MIGRATED << 11); + break; + case IB_MIG_REARM: + context->flags |= cpu_to_be32(MLX4_QP_PM_REARM << 11); + break; + case IB_MIG_ARMED: + context->flags |= cpu_to_be32(MLX4_QP_PM_ARMED << 11); + break; + } + } + + if (ibqp->qp_type == IB_QPT_GSI || ibqp->qp_type == IB_QPT_SMI) + context->mtu_msgmax = (IB_MTU_4096 << 5) | 11; + else if (ibqp->qp_type == IB_QPT_RAW_PACKET) + context->mtu_msgmax = (MLX4_RAW_QP_MTU << 5) | MLX4_RAW_QP_MSGMAX; + else if (ibqp->qp_type == IB_QPT_UD) { + if (qp->flags & MLX4_IB_QP_LSO) + context->mtu_msgmax = (IB_MTU_4096 << 5) | + ilog2(dev->dev->caps.max_gso_sz); + else + context->mtu_msgmax = (IB_MTU_4096 << 5) | 12; + } else if (attr_mask & IB_QP_PATH_MTU) { + if (attr->path_mtu < IB_MTU_256 || attr->path_mtu > IB_MTU_4096) { + pr_err("path MTU (%u) is invalid\n", + attr->path_mtu); + goto out; + } + context->mtu_msgmax = (attr->path_mtu << 5) | + ilog2(dev->dev->caps.max_msg_sz); + } + + if (qp->rq.wqe_cnt) + context->rq_size_stride = ilog2(qp->rq.wqe_cnt) << 3; + context->rq_size_stride |= qp->rq.wqe_shift - 4; + + if (qp->sq.wqe_cnt) + context->sq_size_stride = ilog2(qp->sq.wqe_cnt) << 3; + context->sq_size_stride |= qp->sq.wqe_shift - 4; + + if (cur_state == IB_QPS_RESET && new_state == IB_QPS_INIT) { + context->sq_size_stride |= !!qp->sq_no_prefetch << 7; + context->xrcd = cpu_to_be32((u32) qp->xrcdn); + if (ibqp->qp_type == IB_QPT_RAW_PACKET) + context->param3 |= cpu_to_be32(1 << 30); + } + 
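+ /* usr_page selects the UAR (doorbell page) this QP will use: userspace + * QPs take the UAR index of their owning ucontext, while kernel QPs use + * the driver's private UAR. + */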
+ if (qp->ibqp.uobject) + context->usr_page = cpu_to_be32(to_mucontext(ibqp->uobject->context)->uar.index); + else + context->usr_page = cpu_to_be32(dev->priv_uar.index); + + if (attr_mask & IB_QP_DEST_QPN) + context->remote_qpn = cpu_to_be32(attr->dest_qp_num); + + if (attr_mask & IB_QP_PORT) { + if (cur_state == IB_QPS_SQD && new_state == IB_QPS_SQD && + !(attr_mask & IB_QP_AV)) { + mlx4_set_sched(&context->pri_path, attr->port_num); + optpar |= MLX4_QP_OPTPAR_SCHED_QUEUE; + } + } + + if (cur_state == IB_QPS_INIT && new_state == IB_QPS_RTR) { + if (dev->counters[qp->port - 1] != -1) { + context->pri_path.counter_index = + dev->counters[qp->port - 1]; + optpar |= MLX4_QP_OPTPAR_COUNTER_INDEX; + } else + context->pri_path.counter_index = 0xff; + + if (qp->flags & MLX4_IB_QP_NETIF) { + mlx4_ib_steer_qp_reg(dev, qp, 1); + steer_qp = 1; + } + } + + if (attr_mask & IB_QP_PKEY_INDEX) { + if (qp->mlx4_ib_qp_type & MLX4_IB_QPT_ANY_SRIOV) + context->pri_path.disable_pkey_check = 0x40; + context->pri_path.pkey_index = attr->pkey_index; + optpar |= MLX4_QP_OPTPAR_PKEY_INDEX; + } + + if (attr_mask & IB_QP_AV) { + if (mlx4_set_path(dev, attr, attr_mask, qp, &context->pri_path, + attr_mask & IB_QP_PORT ? + attr->port_num : qp->port)) + goto out; + + optpar |= (MLX4_QP_OPTPAR_PRIMARY_ADDR_PATH | + MLX4_QP_OPTPAR_SCHED_QUEUE); + } + + if (attr_mask & IB_QP_TIMEOUT) { + context->pri_path.ackto |= attr->timeout << 3; + optpar |= MLX4_QP_OPTPAR_ACK_TIMEOUT; + } + + if (attr_mask & IB_QP_ALT_PATH) { + if (attr->alt_port_num == 0 || + attr->alt_port_num > dev->dev->caps.num_ports) + goto out; + + if (attr->alt_pkey_index >= + dev->dev->caps.pkey_table_len[attr->alt_port_num]) + goto out; + + if (mlx4_set_alt_path(dev, attr, attr_mask, qp, + &context->alt_path, + attr->alt_port_num)) + goto out; + + context->alt_path.pkey_index = attr->alt_pkey_index; + context->alt_path.ackto = attr->alt_timeout << 3; + optpar |= MLX4_QP_OPTPAR_ALT_ADDR_PATH; + } + + pd = get_pd(qp); + get_cqs(qp, &send_cq, &recv_cq); + context->pd = cpu_to_be32(pd->pdn); + context->cqn_send = cpu_to_be32(send_cq->mcq.cqn); + context->cqn_recv = cpu_to_be32(recv_cq->mcq.cqn); + context->params1 = cpu_to_be32(MLX4_IB_ACK_REQ_FREQ << 28); + + /* Set "fast registration enabled" for all kernel QPs */ + if (!qp->ibqp.uobject) + context->params1 |= cpu_to_be32(1 << 11); + + if (attr_mask & IB_QP_RNR_RETRY) { + context->params1 |= cpu_to_be32(attr->rnr_retry << 13); + optpar |= MLX4_QP_OPTPAR_RNR_RETRY; + } + + if (attr_mask & IB_QP_RETRY_CNT) { + context->params1 |= cpu_to_be32(attr->retry_cnt << 16); + optpar |= MLX4_QP_OPTPAR_RETRY_COUNT; + } + + if (attr_mask & IB_QP_MAX_QP_RD_ATOMIC) { + if (attr->max_rd_atomic) + context->params1 |= + cpu_to_be32(fls(attr->max_rd_atomic - 1) << 21); + optpar |= MLX4_QP_OPTPAR_SRA_MAX; + } + + if (attr_mask & IB_QP_SQ_PSN) + context->next_send_psn = cpu_to_be32(attr->sq_psn); + + if (attr_mask & IB_QP_MAX_DEST_RD_ATOMIC) { + if (attr->max_dest_rd_atomic) + context->params2 |= + cpu_to_be32(fls(attr->max_dest_rd_atomic - 1) << 21); + optpar |= MLX4_QP_OPTPAR_RRA_MAX; + } + + if (attr_mask & (IB_QP_ACCESS_FLAGS | IB_QP_MAX_DEST_RD_ATOMIC)) { + context->params2 |= to_mlx4_access_flags(qp, attr, attr_mask); + optpar |= MLX4_QP_OPTPAR_RWE | MLX4_QP_OPTPAR_RRE | MLX4_QP_OPTPAR_RAE; + } + + if (ibqp->srq) + context->params2 |= cpu_to_be32(MLX4_QP_BIT_RIC); + + if (attr_mask & IB_QP_MIN_RNR_TIMER) { + context->rnr_nextrecvpsn |= cpu_to_be32(attr->min_rnr_timer << 24); + optpar |= MLX4_QP_OPTPAR_RNR_TIMEOUT; + } + if 
(attr_mask & IB_QP_RQ_PSN) + context->rnr_nextrecvpsn |= cpu_to_be32(attr->rq_psn); + + /* proxy and tunnel qp qkeys will be changed in modify-qp wrappers */ + if (attr_mask & IB_QP_QKEY) { + if (qp->mlx4_ib_qp_type & + (MLX4_IB_QPT_PROXY_SMI_OWNER | MLX4_IB_QPT_TUN_SMI_OWNER)) + context->qkey = cpu_to_be32(IB_QP_SET_QKEY); + else { + if (mlx4_is_mfunc(dev->dev) && + !(qp->mlx4_ib_qp_type & MLX4_IB_QPT_ANY_SRIOV) && + (attr->qkey & MLX4_RESERVED_QKEY_MASK) == + MLX4_RESERVED_QKEY_BASE) { + pr_err("Cannot use reserved QKEY" + " 0x%x (range 0xffff0000..0xffffffff" + " is reserved)\n", attr->qkey); + err = -EINVAL; + goto out; + } + context->qkey = cpu_to_be32(attr->qkey); + } + optpar |= MLX4_QP_OPTPAR_Q_KEY; + } + + if (ibqp->srq) + context->srqn = cpu_to_be32(1 << 24 | to_msrq(ibqp->srq)->msrq.srqn); + + if (qp->rq.wqe_cnt && cur_state == IB_QPS_RESET && new_state == IB_QPS_INIT) + context->db_rec_addr = cpu_to_be64(qp->db.dma); + + if (cur_state == IB_QPS_INIT && + new_state == IB_QPS_RTR && + (ibqp->qp_type == IB_QPT_GSI || ibqp->qp_type == IB_QPT_SMI || + ibqp->qp_type == IB_QPT_UD || + ibqp->qp_type == IB_QPT_RAW_PACKET)) { + context->pri_path.sched_queue = (qp->port - 1) << 6; + if (qp->mlx4_ib_qp_type == MLX4_IB_QPT_SMI || + qp->mlx4_ib_qp_type & + (MLX4_IB_QPT_PROXY_SMI_OWNER | MLX4_IB_QPT_TUN_SMI_OWNER)) { + context->pri_path.sched_queue |= MLX4_IB_DEFAULT_QP0_SCHED_QUEUE; + if (qp->mlx4_ib_qp_type != MLX4_IB_QPT_SMI) + context->pri_path.fl = 0x80; + } else { + if (qp->mlx4_ib_qp_type & MLX4_IB_QPT_ANY_SRIOV) + context->pri_path.fl = 0x80; + context->pri_path.sched_queue |= MLX4_IB_DEFAULT_SCHED_QUEUE; + } + if (rdma_port_get_link_layer(&dev->ib_dev, qp->port) == + IB_LINK_LAYER_ETHERNET) { + if (qp->mlx4_ib_qp_type == MLX4_IB_QPT_TUN_GSI || + qp->mlx4_ib_qp_type == MLX4_IB_QPT_GSI) + context->pri_path.feup = 1 << 7; /* don't fsm */ + /* handle smac_index */ + if (qp->mlx4_ib_qp_type == MLX4_IB_QPT_UD || + qp->mlx4_ib_qp_type == MLX4_IB_QPT_PROXY_GSI || + qp->mlx4_ib_qp_type == MLX4_IB_QPT_TUN_GSI) { + err = handle_eth_ud_smac_index(dev, qp, (u8 *)attr->smac, context); + if (err) { + err = -EINVAL; + goto out; + } + if (qp->mlx4_ib_qp_type == MLX4_IB_QPT_PROXY_GSI) + dev->qp1_proxy[qp->port - 1] = qp; + } + } + } + + if (qp->ibqp.qp_type == IB_QPT_RAW_PACKET) { + context->pri_path.ackto = (context->pri_path.ackto & 0xf8) | + MLX4_IB_LINK_TYPE_ETH; + if (dev->dev->caps.tunnel_offload_mode == MLX4_TUNNEL_OFFLOAD_MODE_VXLAN) { + /* set QP to receive both tunneled & non-tunneled packets */ + if (!(context->flags & cpu_to_be32(1 << MLX4_RSS_QPC_FLAG_OFFSET))) + context->srqn = cpu_to_be32(7 << 28); + } + } + + if (ibqp->qp_type == IB_QPT_UD && (new_state == IB_QPS_RTR)) { + int is_eth = rdma_port_get_link_layer( + &dev->ib_dev, qp->port) == + IB_LINK_LAYER_ETHERNET; + if (is_eth) { + context->pri_path.ackto = MLX4_IB_LINK_TYPE_ETH; + optpar |= MLX4_QP_OPTPAR_PRIMARY_ADDR_PATH; + } + } + + + if (cur_state == IB_QPS_RTS && new_state == IB_QPS_SQD && + attr_mask & IB_QP_EN_SQD_ASYNC_NOTIFY && attr->en_sqd_async_notify) + sqd_event = 1; + else + sqd_event = 0; + + if (!ibqp->uobject && cur_state == IB_QPS_RESET && new_state == IB_QPS_INIT) + context->rlkey |= (1 << 4); + + /* + * Before passing a kernel QP to the HW, make sure that the + * ownership bits of the send queue are set and the SQ + * headroom is stamped so that the hardware doesn't start + * processing stale work requests. 
+ */ + if (!ibqp->uobject && cur_state == IB_QPS_RESET && new_state == IB_QPS_INIT) { + struct mlx4_wqe_ctrl_seg *ctrl; + int i; + + for (i = 0; i < qp->sq.wqe_cnt; ++i) { + ctrl = get_send_wqe(qp, i); + ctrl->owner_opcode = cpu_to_be32(1 << 31); + if (qp->sq_max_wqes_per_wr == 1) + ctrl->fence_size = 1 << (qp->sq.wqe_shift - 4); + + stamp_send_wqe(qp, i, 1 << qp->sq.wqe_shift); + } + } + + err = mlx4_qp_modify(dev->dev, &qp->mtt, to_mlx4_state(cur_state), + to_mlx4_state(new_state), context, optpar, + sqd_event, &qp->mqp); + if (err) + goto out; + + qp->state = new_state; + + if (attr_mask & IB_QP_ACCESS_FLAGS) + qp->atomic_rd_en = attr->qp_access_flags; + if (attr_mask & IB_QP_MAX_DEST_RD_ATOMIC) + qp->resp_depth = attr->max_dest_rd_atomic; + if (attr_mask & IB_QP_PORT) { + qp->port = attr->port_num; + update_mcg_macs(dev, qp); + } + if (attr_mask & IB_QP_ALT_PATH) + qp->alt_port = attr->alt_port_num; + + if (is_sqp(dev, qp)) + store_sqp_attrs(to_msqp(qp), attr, attr_mask); + + /* + * If we moved QP0 to RTR, bring the IB link up; if we moved + * QP0 to RESET or ERROR, bring the link back down. + */ + if (is_qp0(dev, qp)) { + if (cur_state != IB_QPS_RTR && new_state == IB_QPS_RTR) + if (mlx4_INIT_PORT(dev->dev, qp->port)) + pr_warn("INIT_PORT failed for port %d\n", + qp->port); + + if (cur_state != IB_QPS_RESET && cur_state != IB_QPS_ERR && + (new_state == IB_QPS_RESET || new_state == IB_QPS_ERR)) + mlx4_CLOSE_PORT(dev->dev, qp->port); + } + + /* + * If we moved a kernel QP to RESET, clean up all old CQ + * entries and reinitialize the QP. + */ + if (new_state == IB_QPS_RESET) { + if (!ibqp->uobject) { + mlx4_ib_cq_clean(recv_cq, qp->mqp.qpn, + ibqp->srq ? to_msrq(ibqp->srq) : NULL); + if (send_cq != recv_cq) + mlx4_ib_cq_clean(send_cq, qp->mqp.qpn, NULL); + + qp->rq.head = 0; + qp->rq.tail = 0; + qp->sq.head = 0; + qp->sq.tail = 0; + qp->sq_next_wqe = 0; + if (qp->rq.wqe_cnt) + *qp->db.db = 0; + + if (qp->flags & MLX4_IB_QP_NETIF) + mlx4_ib_steer_qp_reg(dev, qp, 0); + } + if (qp->pri.smac || (!qp->pri.smac && qp->pri.smac_port)) { + mlx4_unregister_mac(dev->dev, qp->pri.smac_port, qp->pri.smac); + qp->pri.smac = 0; + qp->pri.smac_port = 0; + } + if (qp->alt.smac) { + mlx4_unregister_mac(dev->dev, qp->alt.smac_port, qp->alt.smac); + qp->alt.smac = 0; + } + if (qp->pri.vid < 0x1000) { + mlx4_unregister_vlan(dev->dev, qp->pri.vlan_port, qp->pri.vid); + qp->pri.vid = 0xFFFF; + qp->pri.candidate_vid = 0xFFFF; + qp->pri.update_vid = 0; + } + + if (qp->alt.vid < 0x1000) { + mlx4_unregister_vlan(dev->dev, qp->alt.vlan_port, qp->alt.vid); + qp->alt.vid = 0xFFFF; + qp->alt.candidate_vid = 0xFFFF; + qp->alt.update_vid = 0; + } + } +out: + if (err && steer_qp) + mlx4_ib_steer_qp_reg(dev, qp, 0); + kfree(context); + if (qp->pri.candidate_smac || + (!qp->pri.candidate_smac && qp->pri.candidate_smac_port)) { + if (err) { + mlx4_unregister_mac(dev->dev, qp->pri.candidate_smac_port, qp->pri.candidate_smac); + } else { + if (qp->pri.smac || (!qp->pri.smac && qp->pri.smac_port)) + mlx4_unregister_mac(dev->dev, qp->pri.smac_port, qp->pri.smac); + qp->pri.smac = qp->pri.candidate_smac; + qp->pri.smac_index = qp->pri.candidate_smac_index; + qp->pri.smac_port = qp->pri.candidate_smac_port; + } + qp->pri.candidate_smac = 0; + qp->pri.candidate_smac_index = 0; + qp->pri.candidate_smac_port = 0; + } + if (qp->alt.candidate_smac) { + if (err) { + mlx4_unregister_mac(dev->dev, qp->alt.candidate_smac_port, qp->alt.candidate_smac); + } else { + if (qp->alt.smac) + mlx4_unregister_mac(dev->dev, qp->alt.smac_port, 
qp->alt.smac); + qp->alt.smac = qp->alt.candidate_smac; + qp->alt.smac_index = qp->alt.candidate_smac_index; + qp->alt.smac_port = qp->alt.candidate_smac_port; + } + qp->alt.candidate_smac = 0; + qp->alt.candidate_smac_index = 0; + qp->alt.candidate_smac_port = 0; + } + + if (qp->pri.update_vid) { + if (err) { + if (qp->pri.candidate_vid < 0x1000) + mlx4_unregister_vlan(dev->dev, qp->pri.candidate_vlan_port, + qp->pri.candidate_vid); + } else { + if (qp->pri.vid < 0x1000) + mlx4_unregister_vlan(dev->dev, qp->pri.vlan_port, + qp->pri.vid); + qp->pri.vid = qp->pri.candidate_vid; + qp->pri.vlan_port = qp->pri.candidate_vlan_port; + qp->pri.vlan_index = qp->pri.candidate_vlan_index; + } + qp->pri.candidate_vid = 0xFFFF; + qp->pri.update_vid = 0; + } + + if (qp->alt.update_vid) { + if (err) { + if (qp->alt.candidate_vid < 0x1000) + mlx4_unregister_vlan(dev->dev, qp->alt.candidate_vlan_port, + qp->alt.candidate_vid); + } else { + if (qp->alt.vid < 0x1000) + mlx4_unregister_vlan(dev->dev, qp->alt.vlan_port, + qp->alt.vid); + qp->alt.vid = qp->alt.candidate_vid; + qp->alt.vlan_port = qp->alt.candidate_vlan_port; + qp->alt.vlan_index = qp->alt.candidate_vlan_index; + } + qp->alt.candidate_vid = 0xFFFF; + qp->alt.update_vid = 0; + } + + return err; +} + +int mlx4_ib_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, + int attr_mask, struct ib_udata *udata) +{ + struct mlx4_ib_dev *dev = to_mdev(ibqp->device); + struct mlx4_ib_qp *qp = to_mqp(ibqp); + enum ib_qp_state cur_state, new_state; + int err = -EINVAL; + int ll; + mutex_lock(&qp->mutex); + + cur_state = attr_mask & IB_QP_CUR_STATE ? attr->cur_qp_state : qp->state; + new_state = attr_mask & IB_QP_STATE ? attr->qp_state : cur_state; + + if (cur_state == new_state && cur_state == IB_QPS_RESET) { + ll = IB_LINK_LAYER_UNSPECIFIED; + } else { + int port = attr_mask & IB_QP_PORT ? attr->port_num : qp->port; + ll = rdma_port_get_link_layer(&dev->ib_dev, port); + } + + if (!ib_modify_qp_is_ok(cur_state, new_state, ibqp->qp_type, + attr_mask, ll)) { + pr_debug("qpn 0x%x: invalid attribute mask specified " + "for transition %d to %d. qp_type %d," + " attr_mask 0x%x\n", + ibqp->qp_num, cur_state, new_state, + ibqp->qp_type, attr_mask); + goto out; + } + + if (mlx4_is_bonded(dev->dev) && (attr_mask & IB_QP_PORT)) { + if ((cur_state == IB_QPS_RESET) && (new_state == IB_QPS_INIT)) { + if ((ibqp->qp_type == IB_QPT_RC) || + (ibqp->qp_type == IB_QPT_UD) || + (ibqp->qp_type == IB_QPT_UC) || + (ibqp->qp_type == IB_QPT_RAW_PACKET) || + (ibqp->qp_type == IB_QPT_XRC_INI)) { + attr->port_num = mlx4_ib_bond_next_port(dev); + } + } else { + /* no sense in changing port_num + * when ports are bonded */ + attr_mask &= ~IB_QP_PORT; + } + } + + if ((attr_mask & IB_QP_PORT) && + (attr->port_num == 0 || attr->port_num > dev->num_ports)) { + pr_debug("qpn 0x%x: invalid port number (%d) specified " + "for transition %d to %d. qp_type %d\n", + ibqp->qp_num, attr->port_num, cur_state, + new_state, ibqp->qp_type); + goto out; + } + + if ((attr_mask & IB_QP_PORT) && (ibqp->qp_type == IB_QPT_RAW_PACKET) && + (rdma_port_get_link_layer(&dev->ib_dev, attr->port_num) != + IB_LINK_LAYER_ETHERNET)) + goto out; + + if (attr_mask & IB_QP_PKEY_INDEX) { + int p = attr_mask & IB_QP_PORT ? attr->port_num : qp->port; + if (attr->pkey_index >= dev->dev->caps.pkey_table_len[p]) { + pr_debug("qpn 0x%x: invalid pkey index (%d) specified " + "for transition %d to %d. 
qp_type %d\n", + ibqp->qp_num, attr->pkey_index, cur_state, + new_state, ibqp->qp_type); + goto out; + } + } + + if (attr_mask & IB_QP_MAX_QP_RD_ATOMIC && + attr->max_rd_atomic > dev->dev->caps.max_qp_init_rdma) { + pr_debug("qpn 0x%x: max_rd_atomic (%d) too large. " + "Transition %d to %d. qp_type %d\n", + ibqp->qp_num, attr->max_rd_atomic, cur_state, + new_state, ibqp->qp_type); + goto out; + } + + if (attr_mask & IB_QP_MAX_DEST_RD_ATOMIC && + attr->max_dest_rd_atomic > dev->dev->caps.max_qp_dest_rdma) { + pr_debug("qpn 0x%x: max_dest_rd_atomic (%d) too large. " + "Transition %d to %d. qp_type %d\n", + ibqp->qp_num, attr->max_dest_rd_atomic, cur_state, + new_state, ibqp->qp_type); + goto out; + } + + if (cur_state == new_state && cur_state == IB_QPS_RESET) { + err = 0; + goto out; + } + + err = __mlx4_ib_modify_qp(ibqp, attr, attr_mask, cur_state, new_state); + + if (mlx4_is_bonded(dev->dev) && (attr_mask & IB_QP_PORT)) + attr->port_num = 1; + +out: + mutex_unlock(&qp->mutex); + return err; +} + +static int vf_get_qp0_qkey(struct mlx4_dev *dev, int qpn, u32 *qkey) +{ + int i; + for (i = 0; i < dev->caps.num_ports; i++) { + if (qpn == dev->caps.qp0_proxy[i] || + qpn == dev->caps.qp0_tunnel[i]) { + *qkey = dev->caps.qp0_qkey[i]; + return 0; + } + } + return -EINVAL; +} + +static int build_sriov_qp0_header(struct mlx4_ib_sqp *sqp, + struct ib_send_wr *wr, + void *wqe, unsigned *mlx_seg_len) +{ + struct mlx4_ib_dev *mdev = to_mdev(sqp->qp.ibqp.device); + struct ib_device *ib_dev = &mdev->ib_dev; + struct mlx4_wqe_mlx_seg *mlx = wqe; + struct mlx4_wqe_inline_seg *inl = wqe + sizeof *mlx; + struct mlx4_ib_ah *ah = to_mah(wr->wr.ud.ah); + u16 pkey; + u32 qkey; + int send_size; + int header_size; + int spc; + int i; + + if (wr->opcode != IB_WR_SEND) + return -EINVAL; + + send_size = 0; + + for (i = 0; i < wr->num_sge; ++i) + send_size += wr->sg_list[i].length; + + /* for proxy-qp0 sends, need to add in size of tunnel header */ + /* for tunnel-qp0 sends, tunnel header is already in s/g list */ + if (sqp->qp.mlx4_ib_qp_type == MLX4_IB_QPT_PROXY_SMI_OWNER) + send_size += sizeof (struct mlx4_ib_tunnel_header); + + ib_ud_header_init(send_size, 1, 0, 0, 0, 0, &sqp->ud_header); + + if (sqp->qp.mlx4_ib_qp_type == MLX4_IB_QPT_PROXY_SMI_OWNER) { + sqp->ud_header.lrh.service_level = + be32_to_cpu(ah->av.ib.sl_tclass_flowlabel) >> 28; + sqp->ud_header.lrh.destination_lid = + cpu_to_be16(ah->av.ib.g_slid & 0x7f); + sqp->ud_header.lrh.source_lid = + cpu_to_be16(ah->av.ib.g_slid & 0x7f); + } + + mlx->flags &= cpu_to_be32(MLX4_WQE_CTRL_CQ_UPDATE); + + /* force loopback */ + mlx->flags |= cpu_to_be32(MLX4_WQE_MLX_VL15 | 0x1 | MLX4_WQE_MLX_SLR); + mlx->rlid = sqp->ud_header.lrh.destination_lid; + + sqp->ud_header.lrh.virtual_lane = 0; + sqp->ud_header.bth.solicited_event = !!(wr->send_flags & IB_SEND_SOLICITED); + ib_get_cached_pkey(ib_dev, sqp->qp.port, 0, &pkey); + sqp->ud_header.bth.pkey = cpu_to_be16(pkey); + if (sqp->qp.mlx4_ib_qp_type == MLX4_IB_QPT_TUN_SMI_OWNER) + sqp->ud_header.bth.destination_qpn = cpu_to_be32(wr->wr.ud.remote_qpn); + else + sqp->ud_header.bth.destination_qpn = + cpu_to_be32(mdev->dev->caps.qp0_tunnel[sqp->qp.port - 1]); + + sqp->ud_header.bth.psn = cpu_to_be32((sqp->send_psn++) & ((1 << 24) - 1)); + if (mlx4_is_master(mdev->dev)) { + if (mlx4_get_parav_qkey(mdev->dev, sqp->qp.mqp.qpn, &qkey)) + return -EINVAL; + } else { + if (vf_get_qp0_qkey(mdev->dev, sqp->qp.mqp.qpn, &qkey)) + return -EINVAL; + } + sqp->ud_header.deth.qkey = cpu_to_be32(qkey); + sqp->ud_header.deth.source_qpn = 
cpu_to_be32(sqp->qp.mqp.qpn); + + sqp->ud_header.bth.opcode = IB_OPCODE_UD_SEND_ONLY; + sqp->ud_header.immediate_present = 0; + + header_size = ib_ud_header_pack(&sqp->ud_header, sqp->header_buf); + + /* + * Inline data segments may not cross a 64 byte boundary. If + * our UD header is bigger than the space available up to the + * next 64 byte boundary in the WQE, use two inline data + * segments to hold the UD header. + */ + spc = MLX4_INLINE_ALIGN - + ((unsigned long) (inl + 1) & (MLX4_INLINE_ALIGN - 1)); + if (header_size <= spc) { + inl->byte_count = cpu_to_be32(1 << 31 | header_size); + memcpy(inl + 1, sqp->header_buf, header_size); + i = 1; + } else { + inl->byte_count = cpu_to_be32(1 << 31 | spc); + memcpy(inl + 1, sqp->header_buf, spc); + + inl = (void *) (inl + 1) + spc; + memcpy(inl + 1, sqp->header_buf + spc, header_size - spc); + /* + * Need a barrier here to make sure all the data is + * visible before the byte_count field is set. + * Otherwise the HCA prefetcher could grab the 64-byte + * chunk with this inline segment and get a valid (!= + * 0xffffffff) byte count but stale data, and end up + * generating a packet with bad headers. + * + * The first inline segment's byte_count field doesn't + * need a barrier, because it comes after a + * control/MLX segment and therefore is at an offset + * of 16 mod 64. + */ + wmb(); + inl->byte_count = cpu_to_be32(1 << 31 | (header_size - spc)); + i = 2; + } + + *mlx_seg_len = + ALIGN(i * sizeof (struct mlx4_wqe_inline_seg) + header_size, 16); + return 0; +} + +static void mlx4_u64_to_smac(u8 *dst_mac, u64 src_mac) +{ + int i; + + for (i = ETH_ALEN; i; i--) { + dst_mac[i - 1] = src_mac & 0xff; + src_mac >>= 8; + } +} + +static int build_mlx_header(struct mlx4_ib_sqp *sqp, struct ib_send_wr *wr, + void *wqe, unsigned *mlx_seg_len) +{ + struct ib_device *ib_dev = sqp->qp.ibqp.device; + struct mlx4_wqe_mlx_seg *mlx = wqe; + struct mlx4_wqe_ctrl_seg *ctrl = wqe; + struct mlx4_wqe_inline_seg *inl = wqe + sizeof *mlx; + struct mlx4_ib_ah *ah = to_mah(wr->wr.ud.ah); + union ib_gid sgid; + u16 pkey; + int send_size; + int header_size; + int spc; + int i; + int err = 0; + u16 vlan = 0xffff; + bool is_eth; + bool is_vlan = false; + bool is_grh; + + send_size = 0; + for (i = 0; i < wr->num_sge; ++i) + send_size += wr->sg_list[i].length; + + is_eth = rdma_port_get_link_layer(sqp->qp.ibqp.device, sqp->qp.port) == IB_LINK_LAYER_ETHERNET; + is_grh = mlx4_ib_ah_grh_present(ah); + if (is_eth) { + if (mlx4_is_mfunc(to_mdev(ib_dev)->dev)) { + /* When multi-function is enabled, the ib_core gid + * indexes don't necessarily match the hw ones, so + * we must use our own cache */ + err = mlx4_get_roce_gid_from_slave(to_mdev(ib_dev)->dev, + be32_to_cpu(ah->av.ib.port_pd) >> 24, + ah->av.ib.gid_index, &sgid.raw[0]); + if (err) + return err; + } else { + err = ib_get_cached_gid(ib_dev, + be32_to_cpu(ah->av.ib.port_pd) >> 24, + ah->av.ib.gid_index, &sgid); + if (err) + return err; + } + + if (ah->av.eth.vlan != cpu_to_be16(0xffff)) { + vlan = be16_to_cpu(ah->av.eth.vlan) & 0x0fff; + is_vlan = 1; + } + } + ib_ud_header_init(send_size, !is_eth, is_eth, is_vlan, is_grh, 0, &sqp->ud_header); + + if (!is_eth) { + sqp->ud_header.lrh.service_level = + be32_to_cpu(ah->av.ib.sl_tclass_flowlabel) >> 28; + sqp->ud_header.lrh.destination_lid = ah->av.ib.dlid; + sqp->ud_header.lrh.source_lid = cpu_to_be16(ah->av.ib.g_slid & 0x7f); + } + + if (is_grh) { + sqp->ud_header.grh.traffic_class = + (be32_to_cpu(ah->av.ib.sl_tclass_flowlabel) >> 20) & 0xff; + 
sqp->ud_header.grh.flow_label = + ah->av.ib.sl_tclass_flowlabel & cpu_to_be32(0xfffff); + sqp->ud_header.grh.hop_limit = ah->av.ib.hop_limit; + if (is_eth) + memcpy(sqp->ud_header.grh.source_gid.raw, sgid.raw, 16); + else { + if (mlx4_is_mfunc(to_mdev(ib_dev)->dev)) { + /* When multi-function is enabled, the ib_core gid + * indexes don't necessarily match the hw ones, so + * we must use our own cache */ + sqp->ud_header.grh.source_gid.global.subnet_prefix = + to_mdev(ib_dev)->sriov.demux[sqp->qp.port - 1]. + subnet_prefix; + sqp->ud_header.grh.source_gid.global.interface_id = + to_mdev(ib_dev)->sriov.demux[sqp->qp.port - 1]. + guid_cache[ah->av.ib.gid_index]; + } else + ib_get_cached_gid(ib_dev, + be32_to_cpu(ah->av.ib.port_pd) >> 24, + ah->av.ib.gid_index, + &sqp->ud_header.grh.source_gid); + } + memcpy(sqp->ud_header.grh.destination_gid.raw, + ah->av.ib.dgid, 16); + } + + mlx->flags &= cpu_to_be32(MLX4_WQE_CTRL_CQ_UPDATE); + + if (!is_eth) { + mlx->flags |= cpu_to_be32((!sqp->qp.ibqp.qp_num ? MLX4_WQE_MLX_VL15 : 0) | + (sqp->ud_header.lrh.destination_lid == + IB_LID_PERMISSIVE ? MLX4_WQE_MLX_SLR : 0) | + (sqp->ud_header.lrh.service_level << 8)); + if (ah->av.ib.port_pd & cpu_to_be32(0x80000000)) + mlx->flags |= cpu_to_be32(0x1); /* force loopback */ + mlx->rlid = sqp->ud_header.lrh.destination_lid; + } + + switch (wr->opcode) { + case IB_WR_SEND: + sqp->ud_header.bth.opcode = IB_OPCODE_UD_SEND_ONLY; + sqp->ud_header.immediate_present = 0; + break; + case IB_WR_SEND_WITH_IMM: + sqp->ud_header.bth.opcode = IB_OPCODE_UD_SEND_ONLY_WITH_IMMEDIATE; + sqp->ud_header.immediate_present = 1; + sqp->ud_header.immediate_data = wr->ex.imm_data; + break; + default: + return -EINVAL; + } + + if (is_eth) { + struct in6_addr in6; + + u16 pcp = (be32_to_cpu(ah->av.ib.sl_tclass_flowlabel) >> 29) << 13; + + mlx->sched_prio = cpu_to_be16(pcp); + + memcpy(sqp->ud_header.eth.dmac_h, ah->av.eth.mac, 6); + /* FIXME: cache smac value? */ + memcpy(&ctrl->srcrb_flags16[0], ah->av.eth.mac, 2); + memcpy(&ctrl->imm, ah->av.eth.mac + 2, 4); + memcpy(&in6, sgid.raw, sizeof(in6)); + + if (!mlx4_is_mfunc(to_mdev(ib_dev)->dev)) { + u64 mac = atomic64_read(&to_mdev(ib_dev)->iboe.mac[sqp->qp.port - 1]); + u8 smac[ETH_ALEN]; + + mlx4_u64_to_smac(smac, mac); + memcpy(sqp->ud_header.eth.smac_h, smac, ETH_ALEN); + } else { + /* use the src mac of the tunnel */ + memcpy(sqp->ud_header.eth.smac_h, ah->av.eth.s_mac, ETH_ALEN); + } + + if (!memcmp(sqp->ud_header.eth.smac_h, sqp->ud_header.eth.dmac_h, 6)) + mlx->flags |= cpu_to_be32(MLX4_WQE_CTRL_FORCE_LOOPBACK); + if (!is_vlan) { + sqp->ud_header.eth.type = cpu_to_be16(MLX4_IB_IBOE_ETHERTYPE); + } else { + sqp->ud_header.vlan.type = cpu_to_be16(MLX4_IB_IBOE_ETHERTYPE); + sqp->ud_header.vlan.tag = cpu_to_be16(vlan | pcp); + } + } else { + sqp->ud_header.lrh.virtual_lane = !sqp->qp.ibqp.qp_num ? 15 : 0; + if (sqp->ud_header.lrh.destination_lid == IB_LID_PERMISSIVE) + sqp->ud_header.lrh.source_lid = IB_LID_PERMISSIVE; + } + sqp->ud_header.bth.solicited_event = !!(wr->send_flags & IB_SEND_SOLICITED); + if (!sqp->qp.ibqp.qp_num) + ib_get_cached_pkey(ib_dev, sqp->qp.port, sqp->pkey_index, &pkey); + else + ib_get_cached_pkey(ib_dev, sqp->qp.port, wr->wr.ud.pkey_index, &pkey); + sqp->ud_header.bth.pkey = cpu_to_be16(pkey); + sqp->ud_header.bth.destination_qpn = cpu_to_be32(wr->wr.ud.remote_qpn); + sqp->ud_header.bth.psn = cpu_to_be32((sqp->send_psn++) & ((1 << 24) - 1)); + sqp->ud_header.deth.qkey = cpu_to_be32(wr->wr.ud.remote_qkey & 0x80000000 ? 
+ sqp->qkey : wr->wr.ud.remote_qkey); + sqp->ud_header.deth.source_qpn = cpu_to_be32(sqp->qp.ibqp.qp_num); + + header_size = ib_ud_header_pack(&sqp->ud_header, sqp->header_buf); + + if (0) { + pr_err("built UD header of size %d:\n", header_size); + for (i = 0; i < header_size / 4; ++i) { + if (i % 8 == 0) + pr_err(" [%02x] ", i * 4); + pr_cont(" %08x", + be32_to_cpu(((__be32 *) sqp->header_buf)[i])); + if ((i + 1) % 8 == 0) + pr_cont("\n"); + } + pr_err("\n"); + } + + /* + * Inline data segments may not cross a 64 byte boundary. If + * our UD header is bigger than the space available up to the + * next 64 byte boundary in the WQE, use two inline data + * segments to hold the UD header. + */ + spc = MLX4_INLINE_ALIGN - + ((unsigned long) (inl + 1) & (MLX4_INLINE_ALIGN - 1)); + if (header_size <= spc) { + inl->byte_count = cpu_to_be32(1 << 31 | header_size); + memcpy(inl + 1, sqp->header_buf, header_size); + i = 1; + } else { + inl->byte_count = cpu_to_be32(1 << 31 | spc); + memcpy(inl + 1, sqp->header_buf, spc); + + inl = (void *) (inl + 1) + spc; + memcpy(inl + 1, sqp->header_buf + spc, header_size - spc); + /* + * Need a barrier here to make sure all the data is + * visible before the byte_count field is set. + * Otherwise the HCA prefetcher could grab the 64-byte + * chunk with this inline segment and get a valid (!= + * 0xffffffff) byte count but stale data, and end up + * generating a packet with bad headers. + * + * The first inline segment's byte_count field doesn't + * need a barrier, because it comes after a + * control/MLX segment and therefore is at an offset + * of 16 mod 64. + */ + wmb(); + inl->byte_count = cpu_to_be32(1 << 31 | (header_size - spc)); + i = 2; + } + + *mlx_seg_len = + ALIGN(i * sizeof (struct mlx4_wqe_inline_seg) + header_size, 16); + return 0; +} + +static int mlx4_wq_overflow(struct mlx4_ib_wq *wq, int nreq, struct ib_cq *ib_cq) +{ + unsigned cur; + struct mlx4_ib_cq *cq; + + cur = wq->head - wq->tail; + if (likely(cur + nreq < wq->max_post)) + return 0; + + cq = to_mcq(ib_cq); + spin_lock(&cq->lock); + cur = wq->head - wq->tail; + spin_unlock(&cq->lock); + + return cur + nreq >= wq->max_post; +} + +static __be32 convert_access(int acc) +{ + return (acc & IB_ACCESS_REMOTE_ATOMIC ? + cpu_to_be32(MLX4_WQE_FMR_AND_BIND_PERM_ATOMIC) : 0) | + (acc & IB_ACCESS_REMOTE_WRITE ? + cpu_to_be32(MLX4_WQE_FMR_AND_BIND_PERM_REMOTE_WRITE) : 0) | + (acc & IB_ACCESS_REMOTE_READ ? + cpu_to_be32(MLX4_WQE_FMR_AND_BIND_PERM_REMOTE_READ) : 0) | + (acc & IB_ACCESS_LOCAL_WRITE ? cpu_to_be32(MLX4_WQE_FMR_PERM_LOCAL_WRITE) : 0) | + cpu_to_be32(MLX4_WQE_FMR_PERM_LOCAL_READ); +} + +static void set_fmr_seg(struct mlx4_wqe_fmr_seg *fseg, struct ib_send_wr *wr) +{ + struct mlx4_ib_fast_reg_page_list *mfrpl = to_mfrpl(wr->wr.fast_reg.page_list); + int i; + + for (i = 0; i < wr->wr.fast_reg.page_list_len; ++i) + mfrpl->mapped_page_list[i] = + cpu_to_be64(wr->wr.fast_reg.page_list->page_list[i] | + MLX4_MTT_FLAG_PRESENT); + + fseg->flags = convert_access(wr->wr.fast_reg.access_flags); + fseg->mem_key = cpu_to_be32(wr->wr.fast_reg.rkey); + fseg->buf_list = cpu_to_be64(mfrpl->map); + fseg->start_addr = cpu_to_be64(wr->wr.fast_reg.iova_start); + fseg->reg_len = cpu_to_be64(wr->wr.fast_reg.length); + fseg->offset = 0; /* XXX -- is this just for ZBVA? 
*/ + fseg->page_size = cpu_to_be32(wr->wr.fast_reg.page_shift); + fseg->reserved[0] = 0; + fseg->reserved[1] = 0; +} + +static void set_bind_seg(struct mlx4_wqe_bind_seg *bseg, struct ib_send_wr *wr) +{ + bseg->flags1 = + convert_access(wr->wr.bind_mw.bind_info.mw_access_flags) & + cpu_to_be32(MLX4_WQE_FMR_AND_BIND_PERM_REMOTE_READ | + MLX4_WQE_FMR_AND_BIND_PERM_REMOTE_WRITE | + MLX4_WQE_FMR_AND_BIND_PERM_ATOMIC); + bseg->flags2 = 0; + if (wr->wr.bind_mw.mw->type == IB_MW_TYPE_2) + bseg->flags2 |= cpu_to_be32(MLX4_WQE_BIND_TYPE_2); + if (wr->wr.bind_mw.bind_info.mw_access_flags & IB_ZERO_BASED) + bseg->flags2 |= cpu_to_be32(MLX4_WQE_BIND_ZERO_BASED); + bseg->new_rkey = cpu_to_be32(wr->wr.bind_mw.rkey); + bseg->lkey = cpu_to_be32(wr->wr.bind_mw.bind_info.mr->lkey); + bseg->addr = cpu_to_be64(wr->wr.bind_mw.bind_info.addr); + bseg->length = cpu_to_be64(wr->wr.bind_mw.bind_info.length); +} + +static void set_local_inv_seg(struct mlx4_wqe_local_inval_seg *iseg, u32 rkey) +{ + memset(iseg, 0, sizeof(*iseg)); + iseg->mem_key = cpu_to_be32(rkey); +} + +static __always_inline void set_raddr_seg(struct mlx4_wqe_raddr_seg *rseg, + u64 remote_addr, u32 rkey) +{ + rseg->raddr = cpu_to_be64(remote_addr); + rseg->rkey = cpu_to_be32(rkey); + rseg->reserved = 0; +} + +static void set_atomic_seg(struct mlx4_wqe_atomic_seg *aseg, struct ib_send_wr *wr) +{ + if (wr->opcode == IB_WR_ATOMIC_CMP_AND_SWP) { + aseg->swap_add = cpu_to_be64(wr->wr.atomic.swap); + aseg->compare = cpu_to_be64(wr->wr.atomic.compare_add); + } else if (wr->opcode == IB_WR_MASKED_ATOMIC_FETCH_AND_ADD) { + aseg->swap_add = cpu_to_be64(wr->wr.atomic.compare_add); + aseg->compare = cpu_to_be64(wr->wr.atomic.compare_add_mask); + } else { + aseg->swap_add = cpu_to_be64(wr->wr.atomic.compare_add); + aseg->compare = 0; + } + +} + +static void set_masked_atomic_seg(struct mlx4_wqe_masked_atomic_seg *aseg, + struct ib_send_wr *wr) +{ + aseg->swap_add = cpu_to_be64(wr->wr.atomic.swap); + aseg->swap_add_mask = cpu_to_be64(wr->wr.atomic.swap_mask); + aseg->compare = cpu_to_be64(wr->wr.atomic.compare_add); + aseg->compare_mask = cpu_to_be64(wr->wr.atomic.compare_add_mask); +} + +static void set_datagram_seg(struct mlx4_wqe_datagram_seg *dseg, + struct ib_send_wr *wr) +{ + memcpy(dseg->av, &to_mah(wr->wr.ud.ah)->av, sizeof (struct mlx4_av)); + dseg->dqpn = cpu_to_be32(wr->wr.ud.remote_qpn); + dseg->qkey = cpu_to_be32(wr->wr.ud.remote_qkey); + dseg->vlan = to_mah(wr->wr.ud.ah)->av.eth.vlan; + memcpy(dseg->mac, to_mah(wr->wr.ud.ah)->av.eth.mac, 6); +} + +static void set_tunnel_datagram_seg(struct mlx4_ib_dev *dev, + struct mlx4_wqe_datagram_seg *dseg, + struct ib_send_wr *wr, + enum mlx4_ib_qp_type qpt) +{ + union mlx4_ext_av *av = &to_mah(wr->wr.ud.ah)->av; + struct mlx4_av sqp_av = {0}; + int port = *((u8 *) &av->ib.port_pd) & 0x3; + + /* force loopback */ + sqp_av.port_pd = av->ib.port_pd | cpu_to_be32(0x80000000); + sqp_av.g_slid = av->ib.g_slid & 0x7f; /* no GRH */ + sqp_av.sl_tclass_flowlabel = av->ib.sl_tclass_flowlabel & + cpu_to_be32(0xf0000000); + + memcpy(dseg->av, &sqp_av, sizeof (struct mlx4_av)); + if (qpt == MLX4_IB_QPT_PROXY_GSI) + dseg->dqpn = cpu_to_be32(dev->dev->caps.qp1_tunnel[port - 1]); + else + dseg->dqpn = cpu_to_be32(dev->dev->caps.qp0_tunnel[port - 1]); + /* Use QKEY from the QP context, which is set by master */ + dseg->qkey = cpu_to_be32(IB_QP_SET_QKEY); +} + +static void build_tunnel_header(struct ib_send_wr *wr, void *wqe, unsigned *mlx_seg_len) +{ + struct mlx4_wqe_inline_seg *inl = wqe; + struct mlx4_ib_tunnel_header 
hdr; + struct mlx4_ib_ah *ah = to_mah(wr->wr.ud.ah); + int spc; + int i; + + memcpy(&hdr.av, &ah->av, sizeof hdr.av); + hdr.remote_qpn = cpu_to_be32(wr->wr.ud.remote_qpn); + hdr.pkey_index = cpu_to_be16(wr->wr.ud.pkey_index); + hdr.qkey = cpu_to_be32(wr->wr.ud.remote_qkey); + memcpy(hdr.mac, ah->av.eth.mac, 6); + hdr.vlan = ah->av.eth.vlan; + + spc = MLX4_INLINE_ALIGN - + ((unsigned long) (inl + 1) & (MLX4_INLINE_ALIGN - 1)); + if (sizeof (hdr) <= spc) { + memcpy(inl + 1, &hdr, sizeof (hdr)); + wmb(); + inl->byte_count = cpu_to_be32(1 << 31 | sizeof (hdr)); + i = 1; + } else { + memcpy(inl + 1, &hdr, spc); + wmb(); + inl->byte_count = cpu_to_be32(1 << 31 | spc); + + inl = (void *) (inl + 1) + spc; + memcpy(inl + 1, (void *) &hdr + spc, sizeof (hdr) - spc); + wmb(); + inl->byte_count = cpu_to_be32(1 << 31 | (sizeof (hdr) - spc)); + i = 2; + } + + *mlx_seg_len = + ALIGN(i * sizeof (struct mlx4_wqe_inline_seg) + sizeof (hdr), 16); +} + +static void set_mlx_icrc_seg(void *dseg) +{ + u32 *t = dseg; + struct mlx4_wqe_inline_seg *iseg = dseg; + + t[1] = 0; + + /* + * Need a barrier here before writing the byte_count field to + * make sure that all the data is visible before the + * byte_count field is set. Otherwise, if the segment begins + * a new cacheline, the HCA prefetcher could grab the 64-byte + * chunk and get a valid (!= * 0xffffffff) byte count but + * stale data, and end up sending the wrong data. + */ + wmb(); + + iseg->byte_count = cpu_to_be32((1 << 31) | 4); +} + +static void set_data_seg(struct mlx4_wqe_data_seg *dseg, struct ib_sge *sg) +{ + dseg->lkey = cpu_to_be32(sg->lkey); + dseg->addr = cpu_to_be64(sg->addr); + + /* + * Need a barrier here before writing the byte_count field to + * make sure that all the data is visible before the + * byte_count field is set. Otherwise, if the segment begins + * a new cacheline, the HCA prefetcher could grab the 64-byte + * chunk and get a valid (!= * 0xffffffff) byte count but + * stale data, and end up sending the wrong data. 
+ */ + wmb(); + + dseg->byte_count = cpu_to_be32(sg->length); +} + +static void __set_data_seg(struct mlx4_wqe_data_seg *dseg, struct ib_sge *sg) +{ + dseg->byte_count = cpu_to_be32(sg->length); + dseg->lkey = cpu_to_be32(sg->lkey); + dseg->addr = cpu_to_be64(sg->addr); +} + +static int build_lso_seg(struct mlx4_wqe_lso_seg *wqe, struct ib_send_wr *wr, + struct mlx4_ib_qp *qp, unsigned *lso_seg_len, + __be32 *lso_hdr_sz, __be32 *blh) +{ + unsigned halign = ALIGN(sizeof *wqe + wr->wr.ud.hlen, 16); + + if (unlikely(halign > MLX4_IB_CACHE_LINE_SIZE)) + *blh = cpu_to_be32(1 << 6); + + if (unlikely(!(qp->flags & MLX4_IB_QP_LSO) && + wr->num_sge > qp->sq.max_gs - (halign >> 4))) + return -EINVAL; + + memcpy(wqe->header, wr->wr.ud.header, wr->wr.ud.hlen); + + *lso_hdr_sz = cpu_to_be32(wr->wr.ud.mss << 16 | wr->wr.ud.hlen); + *lso_seg_len = halign; + return 0; +} + +static __be32 send_ieth(struct ib_send_wr *wr) +{ + switch (wr->opcode) { + case IB_WR_SEND_WITH_IMM: + case IB_WR_RDMA_WRITE_WITH_IMM: + return wr->ex.imm_data; + + case IB_WR_SEND_WITH_INV: + return cpu_to_be32(wr->ex.invalidate_rkey); + + default: + return 0; + } +} + +static void add_zero_len_inline(void *wqe) +{ + struct mlx4_wqe_inline_seg *inl = wqe; + memset(wqe, 0, 16); + inl->byte_count = cpu_to_be32(1 << 31); +} + +int mlx4_ib_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr, + struct ib_send_wr **bad_wr) +{ + struct mlx4_ib_qp *qp = to_mqp(ibqp); + void *wqe; + struct mlx4_wqe_ctrl_seg *ctrl; + struct mlx4_wqe_data_seg *dseg; + unsigned long flags; + int nreq; + int err = 0; + unsigned ind; + int uninitialized_var(stamp); + int uninitialized_var(size); + unsigned uninitialized_var(seglen); + __be32 dummy; + __be32 *lso_wqe; + __be32 uninitialized_var(lso_hdr_sz); + __be32 blh; + int i; + struct mlx4_ib_dev *mdev = to_mdev(ibqp->device); + + spin_lock_irqsave(&qp->sq.lock, flags); + if (mdev->dev->persist->state & MLX4_DEVICE_STATE_INTERNAL_ERROR) { + err = -EIO; + *bad_wr = wr; + nreq = 0; + goto out; + } + + ind = qp->sq_next_wqe; + + for (nreq = 0; wr; ++nreq, wr = wr->next) { + lso_wqe = &dummy; + blh = 0; + + if (mlx4_wq_overflow(&qp->sq, nreq, qp->ibqp.send_cq)) { + err = -ENOMEM; + *bad_wr = wr; + goto out; + } + + if (unlikely(wr->num_sge > qp->sq.max_gs)) { + err = -EINVAL; + *bad_wr = wr; + goto out; + } + + ctrl = wqe = get_send_wqe(qp, ind & (qp->sq.wqe_cnt - 1)); + qp->sq.wrid[(qp->sq.head + nreq) & (qp->sq.wqe_cnt - 1)] = wr->wr_id; + + ctrl->srcrb_flags = + (wr->send_flags & IB_SEND_SIGNALED ? + cpu_to_be32(MLX4_WQE_CTRL_CQ_UPDATE) : 0) | + (wr->send_flags & IB_SEND_SOLICITED ? + cpu_to_be32(MLX4_WQE_CTRL_SOLICITED) : 0) | + ((wr->send_flags & IB_SEND_IP_CSUM) ? 
+ cpu_to_be32(MLX4_WQE_CTRL_IP_CSUM | + MLX4_WQE_CTRL_TCP_UDP_CSUM) : 0) | + qp->sq_signal_bits; + + ctrl->imm = send_ieth(wr); + + wqe += sizeof *ctrl; + size = sizeof *ctrl / 16; + + switch (qp->mlx4_ib_qp_type) { + case MLX4_IB_QPT_RC: + case MLX4_IB_QPT_UC: + switch (wr->opcode) { + case IB_WR_ATOMIC_CMP_AND_SWP: + case IB_WR_ATOMIC_FETCH_AND_ADD: + case IB_WR_MASKED_ATOMIC_FETCH_AND_ADD: + set_raddr_seg(wqe, wr->wr.atomic.remote_addr, + wr->wr.atomic.rkey); + wqe += sizeof (struct mlx4_wqe_raddr_seg); + + set_atomic_seg(wqe, wr); + wqe += sizeof (struct mlx4_wqe_atomic_seg); + + size += (sizeof (struct mlx4_wqe_raddr_seg) + + sizeof (struct mlx4_wqe_atomic_seg)) / 16; + + break; + + case IB_WR_MASKED_ATOMIC_CMP_AND_SWP: + set_raddr_seg(wqe, wr->wr.atomic.remote_addr, + wr->wr.atomic.rkey); + wqe += sizeof (struct mlx4_wqe_raddr_seg); + + set_masked_atomic_seg(wqe, wr); + wqe += sizeof (struct mlx4_wqe_masked_atomic_seg); + + size += (sizeof (struct mlx4_wqe_raddr_seg) + + sizeof (struct mlx4_wqe_masked_atomic_seg)) / 16; + + break; + + case IB_WR_RDMA_READ: + case IB_WR_RDMA_WRITE: + case IB_WR_RDMA_WRITE_WITH_IMM: + set_raddr_seg(wqe, wr->wr.rdma.remote_addr, + wr->wr.rdma.rkey); + wqe += sizeof (struct mlx4_wqe_raddr_seg); + size += sizeof (struct mlx4_wqe_raddr_seg) / 16; + break; + + case IB_WR_LOCAL_INV: + ctrl->srcrb_flags |= + cpu_to_be32(MLX4_WQE_CTRL_STRONG_ORDER); + set_local_inv_seg(wqe, wr->ex.invalidate_rkey); + wqe += sizeof (struct mlx4_wqe_local_inval_seg); + size += sizeof (struct mlx4_wqe_local_inval_seg) / 16; + break; + + case IB_WR_FAST_REG_MR: + ctrl->srcrb_flags |= + cpu_to_be32(MLX4_WQE_CTRL_STRONG_ORDER); + set_fmr_seg(wqe, wr); + wqe += sizeof (struct mlx4_wqe_fmr_seg); + size += sizeof (struct mlx4_wqe_fmr_seg) / 16; + break; + + case IB_WR_BIND_MW: + ctrl->srcrb_flags |= + cpu_to_be32(MLX4_WQE_CTRL_STRONG_ORDER); + set_bind_seg(wqe, wr); + wqe += sizeof(struct mlx4_wqe_bind_seg); + size += sizeof(struct mlx4_wqe_bind_seg) / 16; + break; + default: + /* No extra segments required for sends */ + break; + } + break; + + case MLX4_IB_QPT_TUN_SMI_OWNER: + err = build_sriov_qp0_header(to_msqp(qp), wr, ctrl, &seglen); + if (unlikely(err)) { + *bad_wr = wr; + goto out; + } + wqe += seglen; + size += seglen / 16; + break; + case MLX4_IB_QPT_TUN_SMI: + case MLX4_IB_QPT_TUN_GSI: + /* this is a UD qp used in MAD responses to slaves. */ + set_datagram_seg(wqe, wr); + /* set the forced-loopback bit in the data seg av */ + *(__be32 *) wqe |= cpu_to_be32(0x80000000); + wqe += sizeof (struct mlx4_wqe_datagram_seg); + size += sizeof (struct mlx4_wqe_datagram_seg) / 16; + break; + case MLX4_IB_QPT_UD: + set_datagram_seg(wqe, wr); + wqe += sizeof (struct mlx4_wqe_datagram_seg); + size += sizeof (struct mlx4_wqe_datagram_seg) / 16; + + if (wr->opcode == IB_WR_LSO) { + err = build_lso_seg(wqe, wr, qp, &seglen, &lso_hdr_sz, &blh); + if (unlikely(err)) { + *bad_wr = wr; + goto out; + } + lso_wqe = (__be32 *) wqe; + wqe += seglen; + size += seglen / 16; + } + break; + + case MLX4_IB_QPT_PROXY_SMI_OWNER: + err = build_sriov_qp0_header(to_msqp(qp), wr, ctrl, &seglen); + if (unlikely(err)) { + *bad_wr = wr; + goto out; + } + wqe += seglen; + size += seglen / 16; + /* to start tunnel header on a cache-line boundary */ + add_zero_len_inline(wqe); + wqe += 16; + size++; + build_tunnel_header(wr, wqe, &seglen); + wqe += seglen; + size += seglen / 16; + break; + case MLX4_IB_QPT_PROXY_SMI: + case MLX4_IB_QPT_PROXY_GSI: + /* If we are tunneling special qps, this is a UD qp. 
+ * In this case we first add a UD segment targeting + * the tunnel qp, and then add a header with address + * information */ + set_tunnel_datagram_seg(to_mdev(ibqp->device), wqe, wr, + qp->mlx4_ib_qp_type); + wqe += sizeof (struct mlx4_wqe_datagram_seg); + size += sizeof (struct mlx4_wqe_datagram_seg) / 16; + build_tunnel_header(wr, wqe, &seglen); + wqe += seglen; + size += seglen / 16; + break; + + case MLX4_IB_QPT_SMI: + case MLX4_IB_QPT_GSI: + err = build_mlx_header(to_msqp(qp), wr, ctrl, &seglen); + if (unlikely(err)) { + *bad_wr = wr; + goto out; + } + wqe += seglen; + size += seglen / 16; + break; + + default: + break; + } + + /* + * Write data segments in reverse order, so as to + * overwrite cacheline stamp last within each + * cacheline. This avoids issues with WQE + * prefetching. + */ + + dseg = wqe; + dseg += wr->num_sge - 1; + size += wr->num_sge * (sizeof (struct mlx4_wqe_data_seg) / 16); + + /* Add one more inline data segment for ICRC for MLX sends */ + if (unlikely(qp->mlx4_ib_qp_type == MLX4_IB_QPT_SMI || + qp->mlx4_ib_qp_type == MLX4_IB_QPT_GSI || + qp->mlx4_ib_qp_type & + (MLX4_IB_QPT_PROXY_SMI_OWNER | MLX4_IB_QPT_TUN_SMI_OWNER))) { + set_mlx_icrc_seg(dseg + 1); + size += sizeof (struct mlx4_wqe_data_seg) / 16; + } + + for (i = wr->num_sge - 1; i >= 0; --i, --dseg) + set_data_seg(dseg, wr->sg_list + i); + + /* + * Possibly overwrite stamping in cacheline with LSO + * segment only after making sure all data segments + * are written. + */ + wmb(); + *lso_wqe = lso_hdr_sz; + + ctrl->fence_size = (wr->send_flags & IB_SEND_FENCE ? + MLX4_WQE_CTRL_FENCE : 0) | size; + + /* + * Make sure descriptor is fully written before + * setting ownership bit (because HW can start + * executing as soon as we do). + */ + wmb(); + + if (wr->opcode < 0 || wr->opcode >= ARRAY_SIZE(mlx4_ib_opcode)) { + *bad_wr = wr; + err = -EINVAL; + goto out; + } + + ctrl->owner_opcode = mlx4_ib_opcode[wr->opcode] | + (ind & qp->sq.wqe_cnt ? cpu_to_be32(1 << 31) : 0) | blh; + + stamp = ind + qp->sq_spare_wqes; + ind += DIV_ROUND_UP(size * 16, 1U << qp->sq.wqe_shift); + + /* + * We can improve latency by not stamping the last + * send queue WQE until after ringing the doorbell, so + * only stamp here if there are still more WQEs to post. + * + * Same optimization applies to padding with NOP wqe + * in case of WQE shrinking (used to prevent wrap-around + * in the middle of WR). + */ + if (wr->next) { + stamp_send_wqe(qp, stamp, size * 16); + ind = pad_wraparound(qp, ind); + } + } + +out: + if (likely(nreq)) { + qp->sq.head += nreq; + + /* + * Make sure that descriptors are written before + * doorbell record. + */ + wmb(); + + writel(qp->doorbell_qpn, + to_mdev(ibqp->device)->uar_map + MLX4_SEND_DOORBELL); + + /* + * Make sure doorbells don't leak out of SQ spinlock + * and reach the HCA out of order. 
+ */ + mmiowb(); + + stamp_send_wqe(qp, stamp, size * 16); + + ind = pad_wraparound(qp, ind); + qp->sq_next_wqe = ind; + } + + spin_unlock_irqrestore(&qp->sq.lock, flags); + + return err; +} + +int mlx4_ib_post_recv(struct ib_qp *ibqp, struct ib_recv_wr *wr, + struct ib_recv_wr **bad_wr) +{ + struct mlx4_ib_qp *qp = to_mqp(ibqp); + struct mlx4_wqe_data_seg *scat; + unsigned long flags; + int err = 0; + int nreq; + int ind; + int max_gs; + int i; + struct mlx4_ib_dev *mdev = to_mdev(ibqp->device); + + max_gs = qp->rq.max_gs; + spin_lock_irqsave(&qp->rq.lock, flags); + + if (mdev->dev->persist->state & MLX4_DEVICE_STATE_INTERNAL_ERROR) { + err = -EIO; + *bad_wr = wr; + nreq = 0; + goto out; + } + + ind = qp->rq.head & (qp->rq.wqe_cnt - 1); + + for (nreq = 0; wr; ++nreq, wr = wr->next) { + if (mlx4_wq_overflow(&qp->rq, nreq, qp->ibqp.recv_cq)) { + err = -ENOMEM; + *bad_wr = wr; + goto out; + } + + if (unlikely(wr->num_sge > qp->rq.max_gs)) { + err = -EINVAL; + *bad_wr = wr; + goto out; + } + + scat = get_recv_wqe(qp, ind); + + if (qp->mlx4_ib_qp_type & (MLX4_IB_QPT_PROXY_SMI_OWNER | + MLX4_IB_QPT_PROXY_SMI | MLX4_IB_QPT_PROXY_GSI)) { + ib_dma_sync_single_for_device(ibqp->device, + qp->sqp_proxy_rcv[ind].map, + sizeof (struct mlx4_ib_proxy_sqp_hdr), + DMA_FROM_DEVICE); + scat->byte_count = + cpu_to_be32(sizeof (struct mlx4_ib_proxy_sqp_hdr)); + /* use dma lkey from upper layer entry */ + scat->lkey = cpu_to_be32(wr->sg_list->lkey); + scat->addr = cpu_to_be64(qp->sqp_proxy_rcv[ind].map); + scat++; + max_gs--; + } + + for (i = 0; i < wr->num_sge; ++i) + __set_data_seg(scat + i, wr->sg_list + i); + + if (i < max_gs) { + scat[i].byte_count = 0; + scat[i].lkey = cpu_to_be32(MLX4_INVALID_LKEY); + scat[i].addr = 0; + } + + qp->rq.wrid[ind] = wr->wr_id; + + ind = (ind + 1) & (qp->rq.wqe_cnt - 1); + } + +out: + if (likely(nreq)) { + qp->rq.head += nreq; + + /* + * Make sure that descriptors are written before + * doorbell record. + */ + wmb(); + + *qp->db.db = cpu_to_be32(qp->rq.head & 0xffff); + } + + spin_unlock_irqrestore(&qp->rq.lock, flags); + + return err; +} + +static inline enum ib_qp_state to_ib_qp_state(enum mlx4_qp_state mlx4_state) +{ + switch (mlx4_state) { + case MLX4_QP_STATE_RST: return IB_QPS_RESET; + case MLX4_QP_STATE_INIT: return IB_QPS_INIT; + case MLX4_QP_STATE_RTR: return IB_QPS_RTR; + case MLX4_QP_STATE_RTS: return IB_QPS_RTS; + case MLX4_QP_STATE_SQ_DRAINING: + case MLX4_QP_STATE_SQD: return IB_QPS_SQD; + case MLX4_QP_STATE_SQER: return IB_QPS_SQE; + case MLX4_QP_STATE_ERR: return IB_QPS_ERR; + default: return -1; + } +} + +static inline enum ib_mig_state to_ib_mig_state(int mlx4_mig_state) +{ + switch (mlx4_mig_state) { + case MLX4_QP_PM_ARMED: return IB_MIG_ARMED; + case MLX4_QP_PM_REARM: return IB_MIG_REARM; + case MLX4_QP_PM_MIGRATED: return IB_MIG_MIGRATED; + default: return -1; + } +} + +static int to_ib_qp_access_flags(int mlx4_flags) +{ + int ib_flags = 0; + + if (mlx4_flags & MLX4_QP_BIT_RRE) + ib_flags |= IB_ACCESS_REMOTE_READ; + if (mlx4_flags & MLX4_QP_BIT_RWE) + ib_flags |= IB_ACCESS_REMOTE_WRITE; + if (mlx4_flags & MLX4_QP_BIT_RAE) + ib_flags |= IB_ACCESS_REMOTE_ATOMIC; + + return ib_flags; +} + +static void to_ib_ah_attr(struct mlx4_ib_dev *ibdev, struct ib_ah_attr *ib_ah_attr, + struct mlx4_qp_path *path) +{ + struct mlx4_dev *dev = ibdev->dev; + int is_eth; + + memset(ib_ah_attr, 0, sizeof *ib_ah_attr); + ib_ah_attr->port_num = path->sched_queue & 0x40 ? 
2 : 1; + + if (ib_ah_attr->port_num == 0 || ib_ah_attr->port_num > dev->caps.num_ports) + return; + + is_eth = rdma_port_get_link_layer(&ibdev->ib_dev, ib_ah_attr->port_num) == + IB_LINK_LAYER_ETHERNET; + if (is_eth) + ib_ah_attr->sl = ((path->sched_queue >> 3) & 0x7) | + ((path->sched_queue & 4) << 1); + else + ib_ah_attr->sl = (path->sched_queue >> 2) & 0xf; + + ib_ah_attr->dlid = be16_to_cpu(path->rlid); + ib_ah_attr->src_path_bits = path->grh_mylmc & 0x7f; + ib_ah_attr->static_rate = path->static_rate ? path->static_rate - 5 : 0; + ib_ah_attr->ah_flags = (path->grh_mylmc & (1 << 7)) ? IB_AH_GRH : 0; + if (ib_ah_attr->ah_flags) { + ib_ah_attr->grh.sgid_index = path->mgid_index; + ib_ah_attr->grh.hop_limit = path->hop_limit; + ib_ah_attr->grh.traffic_class = + (be32_to_cpu(path->tclass_flowlabel) >> 20) & 0xff; + ib_ah_attr->grh.flow_label = + be32_to_cpu(path->tclass_flowlabel) & 0xfffff; + memcpy(ib_ah_attr->grh.dgid.raw, + path->rgid, sizeof ib_ah_attr->grh.dgid.raw); + } +} + +int mlx4_ib_query_qp(struct ib_qp *ibqp, struct ib_qp_attr *qp_attr, int qp_attr_mask, + struct ib_qp_init_attr *qp_init_attr) +{ + struct mlx4_ib_dev *dev = to_mdev(ibqp->device); + struct mlx4_ib_qp *qp = to_mqp(ibqp); + struct mlx4_qp_context context; + int mlx4_state; + int err = 0; + + mutex_lock(&qp->mutex); + + if (qp->state == IB_QPS_RESET) { + qp_attr->qp_state = IB_QPS_RESET; + goto done; + } + + err = mlx4_qp_query(dev->dev, &qp->mqp, &context); + if (err) { + err = -EINVAL; + goto out; + } + + mlx4_state = be32_to_cpu(context.flags) >> 28; + + qp->state = to_ib_qp_state(mlx4_state); + qp_attr->qp_state = qp->state; + qp_attr->path_mtu = context.mtu_msgmax >> 5; + qp_attr->path_mig_state = + to_ib_mig_state((be32_to_cpu(context.flags) >> 11) & 0x3); + qp_attr->qkey = be32_to_cpu(context.qkey); + qp_attr->rq_psn = be32_to_cpu(context.rnr_nextrecvpsn) & 0xffffff; + qp_attr->sq_psn = be32_to_cpu(context.next_send_psn) & 0xffffff; + qp_attr->dest_qp_num = be32_to_cpu(context.remote_qpn) & 0xffffff; + qp_attr->qp_access_flags = + to_ib_qp_access_flags(be32_to_cpu(context.params2)); + + if (qp->ibqp.qp_type == IB_QPT_RC || qp->ibqp.qp_type == IB_QPT_UC) { + to_ib_ah_attr(dev, &qp_attr->ah_attr, &context.pri_path); + to_ib_ah_attr(dev, &qp_attr->alt_ah_attr, &context.alt_path); + qp_attr->alt_pkey_index = context.alt_path.pkey_index & 0x7f; + qp_attr->alt_port_num = qp_attr->alt_ah_attr.port_num; + } + + qp_attr->pkey_index = context.pri_path.pkey_index & 0x7f; + if (qp_attr->qp_state == IB_QPS_INIT) + qp_attr->port_num = qp->port; + else + qp_attr->port_num = context.pri_path.sched_queue & 0x40 ? 
2 : 1; + + /* qp_attr->en_sqd_async_notify is only applicable in modify qp */ + qp_attr->sq_draining = mlx4_state == MLX4_QP_STATE_SQ_DRAINING; + + qp_attr->max_rd_atomic = 1 << ((be32_to_cpu(context.params1) >> 21) & 0x7); + + qp_attr->max_dest_rd_atomic = + 1 << ((be32_to_cpu(context.params2) >> 21) & 0x7); + qp_attr->min_rnr_timer = + (be32_to_cpu(context.rnr_nextrecvpsn) >> 24) & 0x1f; + qp_attr->timeout = context.pri_path.ackto >> 3; + qp_attr->retry_cnt = (be32_to_cpu(context.params1) >> 16) & 0x7; + qp_attr->rnr_retry = (be32_to_cpu(context.params1) >> 13) & 0x7; + qp_attr->alt_timeout = context.alt_path.ackto >> 3; + +done: + qp_attr->cur_qp_state = qp_attr->qp_state; + qp_attr->cap.max_recv_wr = qp->rq.wqe_cnt; + qp_attr->cap.max_recv_sge = qp->rq.max_gs; + + if (!ibqp->uobject) { + qp_attr->cap.max_send_wr = qp->sq.wqe_cnt; + qp_attr->cap.max_send_sge = qp->sq.max_gs; + } else { + qp_attr->cap.max_send_wr = 0; + qp_attr->cap.max_send_sge = 0; + } + + /* + * We don't support inline sends for kernel QPs (yet), and we + * don't know what userspace's value should be. + */ + qp_attr->cap.max_inline_data = 0; + + qp_init_attr->cap = qp_attr->cap; + + qp_init_attr->create_flags = 0; + if (qp->flags & MLX4_IB_QP_BLOCK_MULTICAST_LOOPBACK) + qp_init_attr->create_flags |= IB_QP_CREATE_BLOCK_MULTICAST_LOOPBACK; + + if (qp->flags & MLX4_IB_QP_LSO) + qp_init_attr->create_flags |= IB_QP_CREATE_IPOIB_UD_LSO; + + if (qp->flags & MLX4_IB_QP_NETIF) + qp_init_attr->create_flags |= IB_QP_CREATE_NETIF_QP; + + qp_init_attr->sq_sig_type = + qp->sq_signal_bits == cpu_to_be32(MLX4_WQE_CTRL_CQ_UPDATE) ? + IB_SIGNAL_ALL_WR : IB_SIGNAL_REQ_WR; + +out: + mutex_unlock(&qp->mutex); + return err; +} + diff --git a/kernel/drivers/infiniband/hw/mlx4/srq.c b/kernel/drivers/infiniband/hw/mlx4/srq.c new file mode 100644 index 000000000..dce5dfe3a --- /dev/null +++ b/kernel/drivers/infiniband/hw/mlx4/srq.c @@ -0,0 +1,377 @@ +/* + * Copyright (c) 2007 Cisco Systems, Inc. All rights reserved. + * Copyright (c) 2007, 2008 Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include +#include +#include + +#include "mlx4_ib.h" +#include "user.h" + +static void *get_wqe(struct mlx4_ib_srq *srq, int n) +{ + return mlx4_buf_offset(&srq->buf, n << srq->msrq.wqe_shift); +} + +static void mlx4_ib_srq_event(struct mlx4_srq *srq, enum mlx4_event type) +{ + struct ib_event event; + struct ib_srq *ibsrq = &to_mibsrq(srq)->ibsrq; + + if (ibsrq->event_handler) { + event.device = ibsrq->device; + event.element.srq = ibsrq; + switch (type) { + case MLX4_EVENT_TYPE_SRQ_LIMIT: + event.event = IB_EVENT_SRQ_LIMIT_REACHED; + break; + case MLX4_EVENT_TYPE_SRQ_CATAS_ERROR: + event.event = IB_EVENT_SRQ_ERR; + break; + default: + pr_warn("Unexpected event type %d " + "on SRQ %06x\n", type, srq->srqn); + return; + } + + ibsrq->event_handler(&event, ibsrq->srq_context); + } +} + +struct ib_srq *mlx4_ib_create_srq(struct ib_pd *pd, + struct ib_srq_init_attr *init_attr, + struct ib_udata *udata) +{ + struct mlx4_ib_dev *dev = to_mdev(pd->device); + struct mlx4_ib_srq *srq; + struct mlx4_wqe_srq_next_seg *next; + struct mlx4_wqe_data_seg *scatter; + u32 cqn; + u16 xrcdn; + int desc_size; + int buf_size; + int err; + int i; + + /* Sanity check SRQ size before proceeding */ + if (init_attr->attr.max_wr >= dev->dev->caps.max_srq_wqes || + init_attr->attr.max_sge > dev->dev->caps.max_srq_sge) + return ERR_PTR(-EINVAL); + + srq = kmalloc(sizeof *srq, GFP_KERNEL); + if (!srq) + return ERR_PTR(-ENOMEM); + + mutex_init(&srq->mutex); + spin_lock_init(&srq->lock); + srq->msrq.max = roundup_pow_of_two(init_attr->attr.max_wr + 1); + srq->msrq.max_gs = init_attr->attr.max_sge; + + desc_size = max(32UL, + roundup_pow_of_two(sizeof (struct mlx4_wqe_srq_next_seg) + + srq->msrq.max_gs * + sizeof (struct mlx4_wqe_data_seg))); + srq->msrq.wqe_shift = ilog2(desc_size); + + buf_size = srq->msrq.max * desc_size; + + if (pd->uobject) { + struct mlx4_ib_create_srq ucmd; + + if (ib_copy_from_udata(&ucmd, udata, sizeof ucmd)) { + err = -EFAULT; + goto err_srq; + } + + srq->umem = ib_umem_get(pd->uobject->context, ucmd.buf_addr, + buf_size, 0, 0); + if (IS_ERR(srq->umem)) { + err = PTR_ERR(srq->umem); + goto err_srq; + } + + err = mlx4_mtt_init(dev->dev, ib_umem_page_count(srq->umem), + ilog2(srq->umem->page_size), &srq->mtt); + if (err) + goto err_buf; + + err = mlx4_ib_umem_write_mtt(dev, &srq->mtt, srq->umem); + if (err) + goto err_mtt; + + err = mlx4_ib_db_map_user(to_mucontext(pd->uobject->context), + ucmd.db_addr, &srq->db); + if (err) + goto err_mtt; + } else { + err = mlx4_db_alloc(dev->dev, &srq->db, 0, GFP_KERNEL); + if (err) + goto err_srq; + + *srq->db.db = 0; + + if (mlx4_buf_alloc(dev->dev, buf_size, PAGE_SIZE * 2, &srq->buf, + GFP_KERNEL)) { + err = -ENOMEM; + goto err_db; + } + + srq->head = 0; + srq->tail = srq->msrq.max - 1; + srq->wqe_ctr = 0; + + for (i = 0; i < srq->msrq.max; ++i) { + next = get_wqe(srq, i); + next->next_wqe_index = + cpu_to_be16((i + 1) & (srq->msrq.max - 1)); + + for (scatter = (void *) (next + 1); + (void *) scatter < (void *) next + desc_size; + ++scatter) + scatter->lkey = cpu_to_be32(MLX4_INVALID_LKEY); + } + + err = mlx4_mtt_init(dev->dev, srq->buf.npages, srq->buf.page_shift, + &srq->mtt); + if (err) + goto err_buf; + + err = mlx4_buf_write_mtt(dev->dev, &srq->mtt, &srq->buf, GFP_KERNEL); + if (err) + goto err_mtt; + + srq->wrid = kmalloc(srq->msrq.max * sizeof (u64), GFP_KERNEL); + if (!srq->wrid) { + err = -ENOMEM; + goto err_mtt; + } + } + + cqn = (init_attr->srq_type == IB_SRQT_XRC) ? 
+ to_mcq(init_attr->ext.xrc.cq)->mcq.cqn : 0; + xrcdn = (init_attr->srq_type == IB_SRQT_XRC) ? + to_mxrcd(init_attr->ext.xrc.xrcd)->xrcdn : + (u16) dev->dev->caps.reserved_xrcds; + err = mlx4_srq_alloc(dev->dev, to_mpd(pd)->pdn, cqn, xrcdn, &srq->mtt, + srq->db.dma, &srq->msrq); + if (err) + goto err_wrid; + + srq->msrq.event = mlx4_ib_srq_event; + srq->ibsrq.ext.xrc.srq_num = srq->msrq.srqn; + + if (pd->uobject) + if (ib_copy_to_udata(udata, &srq->msrq.srqn, sizeof (__u32))) { + err = -EFAULT; + goto err_wrid; + } + + init_attr->attr.max_wr = srq->msrq.max - 1; + + return &srq->ibsrq; + +err_wrid: + if (pd->uobject) + mlx4_ib_db_unmap_user(to_mucontext(pd->uobject->context), &srq->db); + else + kfree(srq->wrid); + +err_mtt: + mlx4_mtt_cleanup(dev->dev, &srq->mtt); + +err_buf: + if (pd->uobject) + ib_umem_release(srq->umem); + else + mlx4_buf_free(dev->dev, buf_size, &srq->buf); + +err_db: + if (!pd->uobject) + mlx4_db_free(dev->dev, &srq->db); + +err_srq: + kfree(srq); + + return ERR_PTR(err); +} + +int mlx4_ib_modify_srq(struct ib_srq *ibsrq, struct ib_srq_attr *attr, + enum ib_srq_attr_mask attr_mask, struct ib_udata *udata) +{ + struct mlx4_ib_dev *dev = to_mdev(ibsrq->device); + struct mlx4_ib_srq *srq = to_msrq(ibsrq); + int ret; + + /* We don't support resizing SRQs (yet?) */ + if (attr_mask & IB_SRQ_MAX_WR) + return -EINVAL; + + if (attr_mask & IB_SRQ_LIMIT) { + if (attr->srq_limit >= srq->msrq.max) + return -EINVAL; + + mutex_lock(&srq->mutex); + ret = mlx4_srq_arm(dev->dev, &srq->msrq, attr->srq_limit); + mutex_unlock(&srq->mutex); + + if (ret) + return ret; + } + + return 0; +} + +int mlx4_ib_query_srq(struct ib_srq *ibsrq, struct ib_srq_attr *srq_attr) +{ + struct mlx4_ib_dev *dev = to_mdev(ibsrq->device); + struct mlx4_ib_srq *srq = to_msrq(ibsrq); + int ret; + int limit_watermark; + + ret = mlx4_srq_query(dev->dev, &srq->msrq, &limit_watermark); + if (ret) + return ret; + + srq_attr->srq_limit = limit_watermark; + srq_attr->max_wr = srq->msrq.max - 1; + srq_attr->max_sge = srq->msrq.max_gs; + + return 0; +} + +int mlx4_ib_destroy_srq(struct ib_srq *srq) +{ + struct mlx4_ib_dev *dev = to_mdev(srq->device); + struct mlx4_ib_srq *msrq = to_msrq(srq); + + mlx4_srq_free(dev->dev, &msrq->msrq); + mlx4_mtt_cleanup(dev->dev, &msrq->mtt); + + if (srq->uobject) { + mlx4_ib_db_unmap_user(to_mucontext(srq->uobject->context), &msrq->db); + ib_umem_release(msrq->umem); + } else { + kfree(msrq->wrid); + mlx4_buf_free(dev->dev, msrq->msrq.max << msrq->msrq.wqe_shift, + &msrq->buf); + mlx4_db_free(dev->dev, &msrq->db); + } + + kfree(msrq); + + return 0; +} + +void mlx4_ib_free_srq_wqe(struct mlx4_ib_srq *srq, int wqe_index) +{ + struct mlx4_wqe_srq_next_seg *next; + + /* always called with interrupts disabled. 
*/ + spin_lock(&srq->lock); + + next = get_wqe(srq, srq->tail); + next->next_wqe_index = cpu_to_be16(wqe_index); + srq->tail = wqe_index; + + spin_unlock(&srq->lock); +} + +int mlx4_ib_post_srq_recv(struct ib_srq *ibsrq, struct ib_recv_wr *wr, + struct ib_recv_wr **bad_wr) +{ + struct mlx4_ib_srq *srq = to_msrq(ibsrq); + struct mlx4_wqe_srq_next_seg *next; + struct mlx4_wqe_data_seg *scat; + unsigned long flags; + int err = 0; + int nreq; + int i; + struct mlx4_ib_dev *mdev = to_mdev(ibsrq->device); + + spin_lock_irqsave(&srq->lock, flags); + if (mdev->dev->persist->state & MLX4_DEVICE_STATE_INTERNAL_ERROR) { + err = -EIO; + *bad_wr = wr; + nreq = 0; + goto out; + } + + for (nreq = 0; wr; ++nreq, wr = wr->next) { + if (unlikely(wr->num_sge > srq->msrq.max_gs)) { + err = -EINVAL; + *bad_wr = wr; + break; + } + + if (unlikely(srq->head == srq->tail)) { + err = -ENOMEM; + *bad_wr = wr; + break; + } + + srq->wrid[srq->head] = wr->wr_id; + + next = get_wqe(srq, srq->head); + srq->head = be16_to_cpu(next->next_wqe_index); + scat = (struct mlx4_wqe_data_seg *) (next + 1); + + for (i = 0; i < wr->num_sge; ++i) { + scat[i].byte_count = cpu_to_be32(wr->sg_list[i].length); + scat[i].lkey = cpu_to_be32(wr->sg_list[i].lkey); + scat[i].addr = cpu_to_be64(wr->sg_list[i].addr); + } + + if (i < srq->msrq.max_gs) { + scat[i].byte_count = 0; + scat[i].lkey = cpu_to_be32(MLX4_INVALID_LKEY); + scat[i].addr = 0; + } + } + + if (likely(nreq)) { + srq->wqe_ctr += nreq; + + /* + * Make sure that descriptors are written before + * doorbell record. + */ + wmb(); + + *srq->db.db = cpu_to_be32(srq->wqe_ctr); + } +out: + + spin_unlock_irqrestore(&srq->lock, flags); + + return err; +} diff --git a/kernel/drivers/infiniband/hw/mlx4/sysfs.c b/kernel/drivers/infiniband/hw/mlx4/sysfs.c new file mode 100644 index 000000000..6797108ce --- /dev/null +++ b/kernel/drivers/infiniband/hw/mlx4/sysfs.c @@ -0,0 +1,886 @@ +/* + * Copyright (c) 2012 Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +/*#include "core_priv.h"*/ +#include "mlx4_ib.h" +#include +#include +#include + +#include +/*show_admin_alias_guid returns the administratively assigned value of that GUID. 
+ * Values returned in buf parameter string: + * 0 - requests opensm to assign a value. + * ffffffffffffffff - delete this entry. + * other - value assigned by administrator. + */ +static ssize_t show_admin_alias_guid(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct mlx4_ib_iov_sysfs_attr *mlx4_ib_iov_dentry = + container_of(attr, struct mlx4_ib_iov_sysfs_attr, dentry); + struct mlx4_ib_iov_port *port = mlx4_ib_iov_dentry->ctx; + struct mlx4_ib_dev *mdev = port->dev; + __be64 sysadmin_ag_val; + + sysadmin_ag_val = mlx4_get_admin_guid(mdev->dev, + mlx4_ib_iov_dentry->entry_num, + port->num); + + return sprintf(buf, "%llx\n", be64_to_cpu(sysadmin_ag_val)); +} + +/* store_admin_alias_guid stores the (new) administratively assigned value of that GUID. + * Values in buf parameter string: + * 0 - requests opensm to assign a value. + * 0xffffffffffffffff - delete this entry. + * other - guid value assigned by the administrator. + */ +static ssize_t store_admin_alias_guid(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t count) +{ + int record_num;/*0-15*/ + int guid_index_in_rec; /*0 - 7*/ + struct mlx4_ib_iov_sysfs_attr *mlx4_ib_iov_dentry = + container_of(attr, struct mlx4_ib_iov_sysfs_attr, dentry); + struct mlx4_ib_iov_port *port = mlx4_ib_iov_dentry->ctx; + struct mlx4_ib_dev *mdev = port->dev; + u64 sysadmin_ag_val; + unsigned long flags; + + record_num = mlx4_ib_iov_dentry->entry_num / 8; + guid_index_in_rec = mlx4_ib_iov_dentry->entry_num % 8; + if (0 == record_num && 0 == guid_index_in_rec) { + pr_err("GUID 0 block 0 is RO\n"); + return count; + } + spin_lock_irqsave(&mdev->sriov.alias_guid.ag_work_lock, flags); + sscanf(buf, "%llx", &sysadmin_ag_val); + *(__be64 *)&mdev->sriov.alias_guid.ports_guid[port->num - 1]. + all_rec_per_port[record_num]. 
+ all_recs[GUID_REC_SIZE * guid_index_in_rec] = + cpu_to_be64(sysadmin_ag_val); + + /* Change the state to be pending for update */ + mdev->sriov.alias_guid.ports_guid[port->num - 1].all_rec_per_port[record_num].status + = MLX4_GUID_INFO_STATUS_IDLE ; + mlx4_set_admin_guid(mdev->dev, cpu_to_be64(sysadmin_ag_val), + mlx4_ib_iov_dentry->entry_num, + port->num); + + /* set the record index */ + mdev->sriov.alias_guid.ports_guid[port->num - 1].all_rec_per_port[record_num].guid_indexes + |= mlx4_ib_get_aguid_comp_mask_from_ix(guid_index_in_rec); + + spin_unlock_irqrestore(&mdev->sriov.alias_guid.ag_work_lock, flags); + mlx4_ib_init_alias_guid_work(mdev, port->num - 1); + + return count; +} + +static ssize_t show_port_gid(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + struct mlx4_ib_iov_sysfs_attr *mlx4_ib_iov_dentry = + container_of(attr, struct mlx4_ib_iov_sysfs_attr, dentry); + struct mlx4_ib_iov_port *port = mlx4_ib_iov_dentry->ctx; + struct mlx4_ib_dev *mdev = port->dev; + union ib_gid gid; + ssize_t ret; + + ret = __mlx4_ib_query_gid(&mdev->ib_dev, port->num, + mlx4_ib_iov_dentry->entry_num, &gid, 1); + if (ret) + return ret; + ret = sprintf(buf, "%04x:%04x:%04x:%04x:%04x:%04x:%04x:%04x\n", + be16_to_cpu(((__be16 *) gid.raw)[0]), + be16_to_cpu(((__be16 *) gid.raw)[1]), + be16_to_cpu(((__be16 *) gid.raw)[2]), + be16_to_cpu(((__be16 *) gid.raw)[3]), + be16_to_cpu(((__be16 *) gid.raw)[4]), + be16_to_cpu(((__be16 *) gid.raw)[5]), + be16_to_cpu(((__be16 *) gid.raw)[6]), + be16_to_cpu(((__be16 *) gid.raw)[7])); + return ret; +} + +static ssize_t show_phys_port_pkey(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + struct mlx4_ib_iov_sysfs_attr *mlx4_ib_iov_dentry = + container_of(attr, struct mlx4_ib_iov_sysfs_attr, dentry); + struct mlx4_ib_iov_port *port = mlx4_ib_iov_dentry->ctx; + struct mlx4_ib_dev *mdev = port->dev; + u16 pkey; + ssize_t ret; + + ret = __mlx4_ib_query_pkey(&mdev->ib_dev, port->num, + mlx4_ib_iov_dentry->entry_num, &pkey, 1); + if (ret) + return ret; + + return sprintf(buf, "0x%04x\n", pkey); +} + +#define DENTRY_REMOVE(_dentry) \ +do { \ + sysfs_remove_file((_dentry)->kobj, &(_dentry)->dentry.attr); \ +} while (0); + +static int create_sysfs_entry(void *_ctx, struct mlx4_ib_iov_sysfs_attr *_dentry, + char *_name, struct kobject *_kobj, + ssize_t (*show)(struct device *dev, + struct device_attribute *attr, + char *buf), + ssize_t (*store)(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t count) + ) +{ + int ret = 0; + struct mlx4_ib_iov_sysfs_attr *vdentry = _dentry; + + vdentry->ctx = _ctx; + vdentry->dentry.show = show; + vdentry->dentry.store = store; + sysfs_attr_init(&vdentry->dentry.attr); + vdentry->dentry.attr.name = vdentry->name; + vdentry->dentry.attr.mode = 0; + vdentry->kobj = _kobj; + snprintf(vdentry->name, 15, "%s", _name); + + if (vdentry->dentry.store) + vdentry->dentry.attr.mode |= S_IWUSR; + + if (vdentry->dentry.show) + vdentry->dentry.attr.mode |= S_IRUGO; + + ret = sysfs_create_file(vdentry->kobj, &vdentry->dentry.attr); + if (ret) { + pr_err("failed to create %s\n", vdentry->dentry.attr.name); + vdentry->ctx = NULL; + return ret; + } + + return ret; +} + +int add_sysfs_port_mcg_attr(struct mlx4_ib_dev *device, int port_num, + struct attribute *attr) +{ + struct mlx4_ib_iov_port *port = &device->iov_ports[port_num - 1]; + int ret; + + ret = sysfs_create_file(port->mcgs_parent, attr); + if (ret) + pr_err("failed to create %s\n", attr->name); + + return ret; +} + +void 
del_sysfs_port_mcg_attr(struct mlx4_ib_dev *device, int port_num, + struct attribute *attr) +{ + struct mlx4_ib_iov_port *port = &device->iov_ports[port_num - 1]; + + sysfs_remove_file(port->mcgs_parent, attr); +} + +static int add_port_entries(struct mlx4_ib_dev *device, int port_num) +{ + int i; + char buff[10]; + struct mlx4_ib_iov_port *port = NULL; + int ret = 0 ; + struct ib_port_attr attr; + + /* get the physical gid and pkey table sizes.*/ + ret = __mlx4_ib_query_port(&device->ib_dev, port_num, &attr, 1); + if (ret) + goto err; + + port = &device->iov_ports[port_num - 1]; + port->dev = device; + port->num = port_num; + /* Directory structure: + * iov - + * port num - + * admin_guids + * gids (operational) + * mcg_table + */ + port->dentr_ar = kzalloc(sizeof (struct mlx4_ib_iov_sysfs_attr_ar), + GFP_KERNEL); + if (!port->dentr_ar) { + ret = -ENOMEM; + goto err; + } + sprintf(buff, "%d", port_num); + port->cur_port = kobject_create_and_add(buff, + kobject_get(device->ports_parent)); + if (!port->cur_port) { + ret = -ENOMEM; + goto kobj_create_err; + } + /* admin GUIDs */ + port->admin_alias_parent = kobject_create_and_add("admin_guids", + kobject_get(port->cur_port)); + if (!port->admin_alias_parent) { + ret = -ENOMEM; + goto err_admin_guids; + } + for (i = 0 ; i < attr.gid_tbl_len; i++) { + sprintf(buff, "%d", i); + port->dentr_ar->dentries[i].entry_num = i; + ret = create_sysfs_entry(port, &port->dentr_ar->dentries[i], + buff, port->admin_alias_parent, + show_admin_alias_guid, store_admin_alias_guid); + if (ret) + goto err_admin_alias_parent; + } + + /* gids subdirectory (operational gids) */ + port->gids_parent = kobject_create_and_add("gids", + kobject_get(port->cur_port)); + if (!port->gids_parent) { + ret = -ENOMEM; + goto err_gids; + } + + for (i = 0 ; i < attr.gid_tbl_len; i++) { + sprintf(buff, "%d", i); + port->dentr_ar->dentries[attr.gid_tbl_len + i].entry_num = i; + ret = create_sysfs_entry(port, + &port->dentr_ar->dentries[attr.gid_tbl_len + i], + buff, + port->gids_parent, show_port_gid, NULL); + if (ret) + goto err_gids_parent; + } + + /* physical port pkey table */ + port->pkeys_parent = + kobject_create_and_add("pkeys", kobject_get(port->cur_port)); + if (!port->pkeys_parent) { + ret = -ENOMEM; + goto err_pkeys; + } + + for (i = 0 ; i < attr.pkey_tbl_len; i++) { + sprintf(buff, "%d", i); + port->dentr_ar->dentries[2 * attr.gid_tbl_len + i].entry_num = i; + ret = create_sysfs_entry(port, + &port->dentr_ar->dentries[2 * attr.gid_tbl_len + i], + buff, port->pkeys_parent, + show_phys_port_pkey, NULL); + if (ret) + goto err_pkeys_parent; + } + + /* MCGs table */ + port->mcgs_parent = + kobject_create_and_add("mcgs", kobject_get(port->cur_port)); + if (!port->mcgs_parent) { + ret = -ENOMEM; + goto err_mcgs; + } + return 0; + +err_mcgs: + kobject_put(port->cur_port); + +err_pkeys_parent: + kobject_put(port->pkeys_parent); + +err_pkeys: + kobject_put(port->cur_port); + +err_gids_parent: + kobject_put(port->gids_parent); + +err_gids: + kobject_put(port->cur_port); + +err_admin_alias_parent: + kobject_put(port->admin_alias_parent); + +err_admin_guids: + kobject_put(port->cur_port); + kobject_put(port->cur_port); /* once more for create_and_add buff */ + +kobj_create_err: + kobject_put(device->ports_parent); + kfree(port->dentr_ar); + +err: + pr_err("add_port_entries FAILED: for port:%d, error: %d\n", + port_num, ret); + return ret; +} + +static void get_name(struct mlx4_ib_dev *dev, char *name, int i, int max) +{ + char base_name[9]; + + /* pci_name format is: bus:dev:func -> 
xxxx:yy:zz.n */ + strlcpy(name, pci_name(dev->dev->persist->pdev), max); + strncpy(base_name, name, 8); /*till xxxx:yy:*/ + base_name[8] = '\0'; + /* with no ARI only 3 last bits are used so when the fn is higher than 8 + * need to add it to the dev num, so count in the last number will be + * modulo 8 */ + sprintf(name, "%s%.2d.%d", base_name, (i/8), (i%8)); +} + +struct mlx4_port { + struct kobject kobj; + struct mlx4_ib_dev *dev; + struct attribute_group pkey_group; + struct attribute_group gid_group; + struct device_attribute enable_smi_admin; + struct device_attribute smi_enabled; + int slave; + u8 port_num; +}; + + +static void mlx4_port_release(struct kobject *kobj) +{ + struct mlx4_port *p = container_of(kobj, struct mlx4_port, kobj); + struct attribute *a; + int i; + + for (i = 0; (a = p->pkey_group.attrs[i]); ++i) + kfree(a); + kfree(p->pkey_group.attrs); + for (i = 0; (a = p->gid_group.attrs[i]); ++i) + kfree(a); + kfree(p->gid_group.attrs); + kfree(p); +} + +struct port_attribute { + struct attribute attr; + ssize_t (*show)(struct mlx4_port *, struct port_attribute *, char *buf); + ssize_t (*store)(struct mlx4_port *, struct port_attribute *, + const char *buf, size_t count); +}; + +static ssize_t port_attr_show(struct kobject *kobj, + struct attribute *attr, char *buf) +{ + struct port_attribute *port_attr = + container_of(attr, struct port_attribute, attr); + struct mlx4_port *p = container_of(kobj, struct mlx4_port, kobj); + + if (!port_attr->show) + return -EIO; + return port_attr->show(p, port_attr, buf); +} + +static ssize_t port_attr_store(struct kobject *kobj, + struct attribute *attr, + const char *buf, size_t size) +{ + struct port_attribute *port_attr = + container_of(attr, struct port_attribute, attr); + struct mlx4_port *p = container_of(kobj, struct mlx4_port, kobj); + + if (!port_attr->store) + return -EIO; + return port_attr->store(p, port_attr, buf, size); +} + +static const struct sysfs_ops port_sysfs_ops = { + .show = port_attr_show, + .store = port_attr_store, +}; + +static struct kobj_type port_type = { + .release = mlx4_port_release, + .sysfs_ops = &port_sysfs_ops, +}; + +struct port_table_attribute { + struct port_attribute attr; + char name[8]; + int index; +}; + +static ssize_t show_port_pkey(struct mlx4_port *p, struct port_attribute *attr, + char *buf) +{ + struct port_table_attribute *tab_attr = + container_of(attr, struct port_table_attribute, attr); + ssize_t ret = -ENODEV; + + if (p->dev->pkeys.virt2phys_pkey[p->slave][p->port_num - 1][tab_attr->index] >= + (p->dev->dev->caps.pkey_table_len[p->port_num])) + ret = sprintf(buf, "none\n"); + else + ret = sprintf(buf, "%d\n", + p->dev->pkeys.virt2phys_pkey[p->slave] + [p->port_num - 1][tab_attr->index]); + return ret; +} + +static ssize_t store_port_pkey(struct mlx4_port *p, struct port_attribute *attr, + const char *buf, size_t count) +{ + struct port_table_attribute *tab_attr = + container_of(attr, struct port_table_attribute, attr); + int idx; + int err; + + /* do not allow remapping Dom0 virtual pkey table */ + if (p->slave == mlx4_master_func_num(p->dev->dev)) + return -EINVAL; + + if (!strncasecmp(buf, "no", 2)) + idx = p->dev->dev->phys_caps.pkey_phys_table_len[p->port_num] - 1; + else if (sscanf(buf, "%i", &idx) != 1 || + idx >= p->dev->dev->caps.pkey_table_len[p->port_num] || + idx < 0) + return -EINVAL; + + p->dev->pkeys.virt2phys_pkey[p->slave][p->port_num - 1] + [tab_attr->index] = idx; + mlx4_sync_pkey_table(p->dev->dev, p->slave, p->port_num, + tab_attr->index, idx); + err = 
mlx4_gen_pkey_eqe(p->dev->dev, p->slave, p->port_num); + if (err) { + pr_err("mlx4_gen_pkey_eqe failed for slave %d," + " port %d, index %d\n", p->slave, p->port_num, idx); + return err; + } + return count; +} + +static ssize_t show_port_gid_idx(struct mlx4_port *p, + struct port_attribute *attr, char *buf) +{ + return sprintf(buf, "%d\n", p->slave); +} + +static struct attribute ** +alloc_group_attrs(ssize_t (*show)(struct mlx4_port *, + struct port_attribute *, char *buf), + ssize_t (*store)(struct mlx4_port *, struct port_attribute *, + const char *buf, size_t count), + int len) +{ + struct attribute **tab_attr; + struct port_table_attribute *element; + int i; + + tab_attr = kcalloc(1 + len, sizeof (struct attribute *), GFP_KERNEL); + if (!tab_attr) + return NULL; + + for (i = 0; i < len; i++) { + element = kzalloc(sizeof (struct port_table_attribute), + GFP_KERNEL); + if (!element) + goto err; + if (snprintf(element->name, sizeof (element->name), + "%d", i) >= sizeof (element->name)) { + kfree(element); + goto err; + } + sysfs_attr_init(&element->attr.attr); + element->attr.attr.name = element->name; + if (store) { + element->attr.attr.mode = S_IWUSR | S_IRUGO; + element->attr.store = store; + } else + element->attr.attr.mode = S_IRUGO; + + element->attr.show = show; + element->index = i; + tab_attr[i] = &element->attr.attr; + } + return tab_attr; + +err: + while (--i >= 0) + kfree(tab_attr[i]); + kfree(tab_attr); + return NULL; +} + +static ssize_t sysfs_show_smi_enabled(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct mlx4_port *p = + container_of(attr, struct mlx4_port, smi_enabled); + ssize_t len = 0; + + if (mlx4_vf_smi_enabled(p->dev->dev, p->slave, p->port_num)) + len = sprintf(buf, "%d\n", 1); + else + len = sprintf(buf, "%d\n", 0); + + return len; +} + +static ssize_t sysfs_show_enable_smi_admin(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + struct mlx4_port *p = + container_of(attr, struct mlx4_port, enable_smi_admin); + ssize_t len = 0; + + if (mlx4_vf_get_enable_smi_admin(p->dev->dev, p->slave, p->port_num)) + len = sprintf(buf, "%d\n", 1); + else + len = sprintf(buf, "%d\n", 0); + + return len; +} + +static ssize_t sysfs_store_enable_smi_admin(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t count) +{ + struct mlx4_port *p = + container_of(attr, struct mlx4_port, enable_smi_admin); + int enable; + + if (sscanf(buf, "%i", &enable) != 1 || + enable < 0 || enable > 1) + return -EINVAL; + + if (mlx4_vf_set_enable_smi_admin(p->dev->dev, p->slave, p->port_num, enable)) + return -EINVAL; + return count; +} + +static int add_vf_smi_entries(struct mlx4_port *p) +{ + int is_eth = rdma_port_get_link_layer(&p->dev->ib_dev, p->port_num) == + IB_LINK_LAYER_ETHERNET; + int ret; + + /* do not display entries if eth transport, or if master */ + if (is_eth || p->slave == mlx4_master_func_num(p->dev->dev)) + return 0; + + sysfs_attr_init(&p->smi_enabled.attr); + p->smi_enabled.show = sysfs_show_smi_enabled; + p->smi_enabled.store = NULL; + p->smi_enabled.attr.name = "smi_enabled"; + p->smi_enabled.attr.mode = 0444; + ret = sysfs_create_file(&p->kobj, &p->smi_enabled.attr); + if (ret) { + pr_err("failed to create smi_enabled\n"); + return ret; + } + + sysfs_attr_init(&p->enable_smi_admin.attr); + p->enable_smi_admin.show = sysfs_show_enable_smi_admin; + p->enable_smi_admin.store = sysfs_store_enable_smi_admin; + p->enable_smi_admin.attr.name = "enable_smi_admin"; + p->enable_smi_admin.attr.mode = 0644; + ret = 
sysfs_create_file(&p->kobj, &p->enable_smi_admin.attr); + if (ret) { + pr_err("failed to create enable_smi_admin\n"); + sysfs_remove_file(&p->kobj, &p->smi_enabled.attr); + return ret; + } + return 0; +} + +static void remove_vf_smi_entries(struct mlx4_port *p) +{ + int is_eth = rdma_port_get_link_layer(&p->dev->ib_dev, p->port_num) == + IB_LINK_LAYER_ETHERNET; + + if (is_eth || p->slave == mlx4_master_func_num(p->dev->dev)) + return; + + sysfs_remove_file(&p->kobj, &p->smi_enabled.attr); + sysfs_remove_file(&p->kobj, &p->enable_smi_admin.attr); +} + +static int add_port(struct mlx4_ib_dev *dev, int port_num, int slave) +{ + struct mlx4_port *p; + int i; + int ret; + + p = kzalloc(sizeof *p, GFP_KERNEL); + if (!p) + return -ENOMEM; + + p->dev = dev; + p->port_num = port_num; + p->slave = slave; + + ret = kobject_init_and_add(&p->kobj, &port_type, + kobject_get(dev->dev_ports_parent[slave]), + "%d", port_num); + if (ret) + goto err_alloc; + + p->pkey_group.name = "pkey_idx"; + p->pkey_group.attrs = + alloc_group_attrs(show_port_pkey, store_port_pkey, + dev->dev->caps.pkey_table_len[port_num]); + if (!p->pkey_group.attrs) { + ret = -ENOMEM; + goto err_alloc; + } + + ret = sysfs_create_group(&p->kobj, &p->pkey_group); + if (ret) + goto err_free_pkey; + + p->gid_group.name = "gid_idx"; + p->gid_group.attrs = alloc_group_attrs(show_port_gid_idx, NULL, 1); + if (!p->gid_group.attrs) { + ret = -ENOMEM; + goto err_free_pkey; + } + + ret = sysfs_create_group(&p->kobj, &p->gid_group); + if (ret) + goto err_free_gid; + + ret = add_vf_smi_entries(p); + if (ret) + goto err_free_gid; + + list_add_tail(&p->kobj.entry, &dev->pkeys.pkey_port_list[slave]); + return 0; + +err_free_gid: + kfree(p->gid_group.attrs[0]); + kfree(p->gid_group.attrs); + +err_free_pkey: + for (i = 0; i < dev->dev->caps.pkey_table_len[port_num]; ++i) + kfree(p->pkey_group.attrs[i]); + kfree(p->pkey_group.attrs); + +err_alloc: + kobject_put(dev->dev_ports_parent[slave]); + kfree(p); + return ret; +} + +static int register_one_pkey_tree(struct mlx4_ib_dev *dev, int slave) +{ + char name[32]; + int err; + int port; + struct kobject *p, *t; + struct mlx4_port *mport; + struct mlx4_active_ports actv_ports; + + get_name(dev, name, slave, sizeof name); + + dev->pkeys.device_parent[slave] = + kobject_create_and_add(name, kobject_get(dev->iov_parent)); + + if (!dev->pkeys.device_parent[slave]) { + err = -ENOMEM; + goto fail_dev; + } + + INIT_LIST_HEAD(&dev->pkeys.pkey_port_list[slave]); + + dev->dev_ports_parent[slave] = + kobject_create_and_add("ports", + kobject_get(dev->pkeys.device_parent[slave])); + + if (!dev->dev_ports_parent[slave]) { + err = -ENOMEM; + goto err_ports; + } + + actv_ports = mlx4_get_active_ports(dev->dev, slave); + + for (port = 1; port <= dev->dev->caps.num_ports; ++port) { + if (!test_bit(port - 1, actv_ports.ports)) + continue; + err = add_port(dev, port, slave); + if (err) + goto err_add; + } + return 0; + +err_add: + list_for_each_entry_safe(p, t, + &dev->pkeys.pkey_port_list[slave], + entry) { + list_del(&p->entry); + mport = container_of(p, struct mlx4_port, kobj); + sysfs_remove_group(p, &mport->pkey_group); + sysfs_remove_group(p, &mport->gid_group); + remove_vf_smi_entries(mport); + kobject_put(p); + } + kobject_put(dev->dev_ports_parent[slave]); + +err_ports: + kobject_put(dev->pkeys.device_parent[slave]); + /* extra put for the device_parent create_and_add */ + kobject_put(dev->pkeys.device_parent[slave]); + +fail_dev: + kobject_put(dev->iov_parent); + return err; +} + +static int register_pkey_tree(struct 
mlx4_ib_dev *device) +{ + int i; + + if (!mlx4_is_master(device->dev)) + return 0; + + for (i = 0; i <= device->dev->persist->num_vfs; ++i) + register_one_pkey_tree(device, i); + + return 0; +} + +static void unregister_pkey_tree(struct mlx4_ib_dev *device) +{ + int slave; + struct kobject *p, *t; + struct mlx4_port *port; + + if (!mlx4_is_master(device->dev)) + return; + + for (slave = device->dev->persist->num_vfs; slave >= 0; --slave) { + list_for_each_entry_safe(p, t, + &device->pkeys.pkey_port_list[slave], + entry) { + list_del(&p->entry); + port = container_of(p, struct mlx4_port, kobj); + sysfs_remove_group(p, &port->pkey_group); + sysfs_remove_group(p, &port->gid_group); + remove_vf_smi_entries(port); + kobject_put(p); + kobject_put(device->dev_ports_parent[slave]); + } + kobject_put(device->dev_ports_parent[slave]); + kobject_put(device->pkeys.device_parent[slave]); + kobject_put(device->pkeys.device_parent[slave]); + kobject_put(device->iov_parent); + } +} + +int mlx4_ib_device_register_sysfs(struct mlx4_ib_dev *dev) +{ + int i; + int ret = 0; + + if (!mlx4_is_master(dev->dev)) + return 0; + + dev->iov_parent = + kobject_create_and_add("iov", + kobject_get(dev->ib_dev.ports_parent->parent)); + if (!dev->iov_parent) { + ret = -ENOMEM; + goto err; + } + dev->ports_parent = + kobject_create_and_add("ports", + kobject_get(dev->iov_parent)); + if (!dev->ports_parent) { + ret = -ENOMEM; + goto err_ports; + } + + for (i = 1; i <= dev->ib_dev.phys_port_cnt; ++i) { + ret = add_port_entries(dev, i); + if (ret) + goto err_add_entries; + } + + ret = register_pkey_tree(dev); + if (ret) + goto err_add_entries; + return 0; + +err_add_entries: + kobject_put(dev->ports_parent); + +err_ports: + kobject_put(dev->iov_parent); +err: + kobject_put(dev->ib_dev.ports_parent->parent); + pr_err("mlx4_ib_device_register_sysfs error (%d)\n", ret); + return ret; +} + +static void unregister_alias_guid_tree(struct mlx4_ib_dev *device) +{ + struct mlx4_ib_iov_port *p; + int i; + + if (!mlx4_is_master(device->dev)) + return; + + for (i = 0; i < device->dev->caps.num_ports; i++) { + p = &device->iov_ports[i]; + kobject_put(p->admin_alias_parent); + kobject_put(p->gids_parent); + kobject_put(p->pkeys_parent); + kobject_put(p->mcgs_parent); + kobject_put(p->cur_port); + kobject_put(p->cur_port); + kobject_put(p->cur_port); + kobject_put(p->cur_port); + kobject_put(p->cur_port); + kobject_put(p->dev->ports_parent); + kfree(p->dentr_ar); + } +} + +void mlx4_ib_device_unregister_sysfs(struct mlx4_ib_dev *device) +{ + unregister_alias_guid_tree(device); + unregister_pkey_tree(device); + kobject_put(device->ports_parent); + kobject_put(device->iov_parent); + kobject_put(device->iov_parent); + kobject_put(device->ib_dev.ports_parent->parent); +} diff --git a/kernel/drivers/infiniband/hw/mlx4/user.h b/kernel/drivers/infiniband/hw/mlx4/user.h new file mode 100644 index 000000000..07e6769ef --- /dev/null +++ b/kernel/drivers/infiniband/hw/mlx4/user.h @@ -0,0 +1,107 @@ +/* + * Copyright (c) 2007 Cisco Systems, Inc. All rights reserved. + * Copyright (c) 2007, 2008 Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. 
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef MLX4_IB_USER_H +#define MLX4_IB_USER_H + +#include + +/* + * Increment this value if any changes that break userspace ABI + * compatibility are made. + */ + +#define MLX4_IB_UVERBS_NO_DEV_CAPS_ABI_VERSION 3 +#define MLX4_IB_UVERBS_ABI_VERSION 4 + +/* + * Make sure that all structs defined in this file remain laid out so + * that they pack the same way on 32-bit and 64-bit architectures (to + * avoid incompatibility between 32-bit userspace and 64-bit kernels). + * In particular do not use pointer types -- pass pointers in __u64 + * instead. + */ + +struct mlx4_ib_alloc_ucontext_resp_v3 { + __u32 qp_tab_size; + __u16 bf_reg_size; + __u16 bf_regs_per_page; +}; + +struct mlx4_ib_alloc_ucontext_resp { + __u32 dev_caps; + __u32 qp_tab_size; + __u16 bf_reg_size; + __u16 bf_regs_per_page; + __u32 cqe_size; +}; + +struct mlx4_ib_alloc_pd_resp { + __u32 pdn; + __u32 reserved; +}; + +struct mlx4_ib_create_cq { + __u64 buf_addr; + __u64 db_addr; +}; + +struct mlx4_ib_create_cq_resp { + __u32 cqn; + __u32 reserved; +}; + +struct mlx4_ib_resize_cq { + __u64 buf_addr; +}; + +struct mlx4_ib_create_srq { + __u64 buf_addr; + __u64 db_addr; +}; + +struct mlx4_ib_create_srq_resp { + __u32 srqn; + __u32 reserved; +}; + +struct mlx4_ib_create_qp { + __u64 buf_addr; + __u64 db_addr; + __u8 log_sq_bb_count; + __u8 log_sq_stride; + __u8 sq_no_prefetch; + __u8 reserved[5]; +}; + +#endif /* MLX4_IB_USER_H */ diff --git a/kernel/drivers/infiniband/hw/mlx5/Kconfig b/kernel/drivers/infiniband/hw/mlx5/Kconfig new file mode 100644 index 000000000..10df386c6 --- /dev/null +++ b/kernel/drivers/infiniband/hw/mlx5/Kconfig @@ -0,0 +1,10 @@ +config MLX5_INFINIBAND + tristate "Mellanox Connect-IB HCA support" + depends on NETDEVICES && ETHERNET && PCI + select NET_VENDOR_MELLANOX + select MLX5_CORE + ---help--- + This driver provides low-level InfiniBand support for + Mellanox Connect-IB PCI Express host channel adapters (HCAs). + This is required to use InfiniBand protocols such as + IP-over-IB or SRP with these devices. 
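A note on the mlx4 user.h hunk above: its comment requires every uverbs struct to keep the same layout on 32-bit and 64-bit architectures, which is why buffer addresses travel as __u64 fields rather than pointer types. The following sketch is illustrative only and is not part of the imported sources; the stand-in struct and the helper name fill_create_cq are assumptions made for the example, modelled on struct mlx4_ib_create_cq.

#include <stdint.h>

/* Stand-in mirroring struct mlx4_ib_create_cq from the hunk above:
 * two fixed-width 64-bit fields, no pointer types. */
struct create_cq_cmd {
	uint64_t buf_addr;
	uint64_t db_addr;
};

/* Widening pointers through uintptr_t keeps the command layout -- and
 * therefore the kernel ABI -- identical for 32-bit and 64-bit callers. */
static void fill_create_cq(struct create_cq_cmd *cmd, void *buf, void *db)
{
	cmd->buf_addr = (uint64_t)(uintptr_t)buf;
	cmd->db_addr  = (uint64_t)(uintptr_t)db;
}

Packing the command by hand like this is what lets the kernel read the same structure from either flavour of userspace without a compat layer, which is exactly the property the header comment insists on.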
diff --git a/kernel/drivers/infiniband/hw/mlx5/Makefile b/kernel/drivers/infiniband/hw/mlx5/Makefile new file mode 100644 index 000000000..27a70159e --- /dev/null +++ b/kernel/drivers/infiniband/hw/mlx5/Makefile @@ -0,0 +1,4 @@ +obj-$(CONFIG_MLX5_INFINIBAND) += mlx5_ib.o + +mlx5_ib-y := main.o cq.o doorbell.o qp.o mem.o srq.o mr.o ah.o mad.o +mlx5_ib-$(CONFIG_INFINIBAND_ON_DEMAND_PAGING) += odp.o diff --git a/kernel/drivers/infiniband/hw/mlx5/ah.c b/kernel/drivers/infiniband/hw/mlx5/ah.c new file mode 100644 index 000000000..66080580e --- /dev/null +++ b/kernel/drivers/infiniband/hw/mlx5/ah.c @@ -0,0 +1,92 @@ +/* + * Copyright (c) 2013-2015, Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include "mlx5_ib.h" + +struct ib_ah *create_ib_ah(struct ib_ah_attr *ah_attr, + struct mlx5_ib_ah *ah) +{ + if (ah_attr->ah_flags & IB_AH_GRH) { + memcpy(ah->av.rgid, &ah_attr->grh.dgid, 16); + ah->av.grh_gid_fl = cpu_to_be32(ah_attr->grh.flow_label | + (1 << 30) | + ah_attr->grh.sgid_index << 20); + ah->av.hop_limit = ah_attr->grh.hop_limit; + ah->av.tclass = ah_attr->grh.traffic_class; + } + + ah->av.rlid = cpu_to_be16(ah_attr->dlid); + ah->av.fl_mlid = ah_attr->src_path_bits & 0x7f; + ah->av.stat_rate_sl = (ah_attr->static_rate << 4) | (ah_attr->sl & 0xf); + + return &ah->ibah; +} + +struct ib_ah *mlx5_ib_create_ah(struct ib_pd *pd, struct ib_ah_attr *ah_attr) +{ + struct mlx5_ib_ah *ah; + + ah = kzalloc(sizeof(*ah), GFP_ATOMIC); + if (!ah) + return ERR_PTR(-ENOMEM); + + return create_ib_ah(ah_attr, ah); /* never fails */ +} + +int mlx5_ib_query_ah(struct ib_ah *ibah, struct ib_ah_attr *ah_attr) +{ + struct mlx5_ib_ah *ah = to_mah(ibah); + u32 tmp; + + memset(ah_attr, 0, sizeof(*ah_attr)); + + tmp = be32_to_cpu(ah->av.grh_gid_fl); + if (tmp & (1 << 30)) { + ah_attr->ah_flags = IB_AH_GRH; + ah_attr->grh.sgid_index = (tmp >> 20) & 0xff; + ah_attr->grh.flow_label = tmp & 0xfffff; + memcpy(&ah_attr->grh.dgid, ah->av.rgid, 16); + ah_attr->grh.hop_limit = ah->av.hop_limit; + ah_attr->grh.traffic_class = ah->av.tclass; + } + ah_attr->dlid = be16_to_cpu(ah->av.rlid); + ah_attr->static_rate = ah->av.stat_rate_sl >> 4; + ah_attr->sl = ah->av.stat_rate_sl & 0xf; + + return 0; +} + +int mlx5_ib_destroy_ah(struct ib_ah *ah) +{ + kfree(to_mah(ah)); + return 0; +} diff --git a/kernel/drivers/infiniband/hw/mlx5/cq.c b/kernel/drivers/infiniband/hw/mlx5/cq.c new file mode 100644 index 000000000..2ee6b1051 --- /dev/null +++ b/kernel/drivers/infiniband/hw/mlx5/cq.c @@ -0,0 +1,1189 @@ +/* + * Copyright (c) 2013-2015, Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include +#include +#include +#include "mlx5_ib.h" +#include "user.h" + +static void mlx5_ib_cq_comp(struct mlx5_core_cq *cq) +{ + struct ib_cq *ibcq = &to_mibcq(cq)->ibcq; + + ibcq->comp_handler(ibcq, ibcq->cq_context); +} + +static void mlx5_ib_cq_event(struct mlx5_core_cq *mcq, enum mlx5_event type) +{ + struct mlx5_ib_cq *cq = container_of(mcq, struct mlx5_ib_cq, mcq); + struct mlx5_ib_dev *dev = to_mdev(cq->ibcq.device); + struct ib_cq *ibcq = &cq->ibcq; + struct ib_event event; + + if (type != MLX5_EVENT_TYPE_CQ_ERROR) { + mlx5_ib_warn(dev, "Unexpected event type %d on CQ %06x\n", + type, mcq->cqn); + return; + } + + if (ibcq->event_handler) { + event.device = &dev->ib_dev; + event.event = IB_EVENT_CQ_ERR; + event.element.cq = ibcq; + ibcq->event_handler(&event, ibcq->cq_context); + } +} + +static void *get_cqe_from_buf(struct mlx5_ib_cq_buf *buf, int n, int size) +{ + return mlx5_buf_offset(&buf->buf, n * size); +} + +static void *get_cqe(struct mlx5_ib_cq *cq, int n) +{ + return get_cqe_from_buf(&cq->buf, n, cq->mcq.cqe_sz); +} + +static u8 sw_ownership_bit(int n, int nent) +{ + return (n & nent) ? 1 : 0; +} + +static void *get_sw_cqe(struct mlx5_ib_cq *cq, int n) +{ + void *cqe = get_cqe(cq, n & cq->ibcq.cqe); + struct mlx5_cqe64 *cqe64; + + cqe64 = (cq->mcq.cqe_sz == 64) ? cqe : cqe + 64; + + if (likely((cqe64->op_own) >> 4 != MLX5_CQE_INVALID) && + !((cqe64->op_own & MLX5_CQE_OWNER_MASK) ^ !!(n & (cq->ibcq.cqe + 1)))) { + return cqe; + } else { + return NULL; + } +} + +static void *next_cqe_sw(struct mlx5_ib_cq *cq) +{ + return get_sw_cqe(cq, cq->mcq.cons_index); +} + +static enum ib_wc_opcode get_umr_comp(struct mlx5_ib_wq *wq, int idx) +{ + switch (wq->wr_data[idx]) { + case MLX5_IB_WR_UMR: + return 0; + + case IB_WR_LOCAL_INV: + return IB_WC_LOCAL_INV; + + case IB_WR_FAST_REG_MR: + return IB_WC_FAST_REG_MR; + + default: + pr_warn("unknown completion status\n"); + return 0; + } +} + +static void handle_good_req(struct ib_wc *wc, struct mlx5_cqe64 *cqe, + struct mlx5_ib_wq *wq, int idx) +{ + wc->wc_flags = 0; + switch (be32_to_cpu(cqe->sop_drop_qpn) >> 24) { + case MLX5_OPCODE_RDMA_WRITE_IMM: + wc->wc_flags |= IB_WC_WITH_IMM; + case MLX5_OPCODE_RDMA_WRITE: + wc->opcode = IB_WC_RDMA_WRITE; + break; + case MLX5_OPCODE_SEND_IMM: + wc->wc_flags |= IB_WC_WITH_IMM; + case MLX5_OPCODE_SEND: + case MLX5_OPCODE_SEND_INVAL: + wc->opcode = IB_WC_SEND; + break; + case MLX5_OPCODE_RDMA_READ: + wc->opcode = IB_WC_RDMA_READ; + wc->byte_len = be32_to_cpu(cqe->byte_cnt); + break; + case MLX5_OPCODE_ATOMIC_CS: + wc->opcode = IB_WC_COMP_SWAP; + wc->byte_len = 8; + break; + case MLX5_OPCODE_ATOMIC_FA: + wc->opcode = IB_WC_FETCH_ADD; + wc->byte_len = 8; + break; + case MLX5_OPCODE_ATOMIC_MASKED_CS: + wc->opcode = IB_WC_MASKED_COMP_SWAP; + wc->byte_len = 8; + break; + case MLX5_OPCODE_ATOMIC_MASKED_FA: + wc->opcode = IB_WC_MASKED_FETCH_ADD; + wc->byte_len = 8; + break; + case MLX5_OPCODE_BIND_MW: + wc->opcode = IB_WC_BIND_MW; + break; + case MLX5_OPCODE_UMR: + wc->opcode = get_umr_comp(wq, idx); + break; + } +} + +enum { + MLX5_GRH_IN_BUFFER = 1, + MLX5_GRH_IN_CQE = 2, +}; + +static void handle_responder(struct ib_wc *wc, struct mlx5_cqe64 *cqe, + struct mlx5_ib_qp *qp) +{ + struct mlx5_ib_dev *dev = to_mdev(qp->ibqp.device); + struct mlx5_ib_srq *srq; + struct mlx5_ib_wq *wq; + u16 wqe_ctr; + u8 g; + + if (qp->ibqp.srq || qp->ibqp.xrcd) { + struct mlx5_core_srq *msrq = NULL; + + if (qp->ibqp.xrcd) { + msrq = mlx5_core_get_srq(dev->mdev, + be32_to_cpu(cqe->srqn)); + srq = to_mibsrq(msrq); + 
} else { + srq = to_msrq(qp->ibqp.srq); + } + if (srq) { + wqe_ctr = be16_to_cpu(cqe->wqe_counter); + wc->wr_id = srq->wrid[wqe_ctr]; + mlx5_ib_free_srq_wqe(srq, wqe_ctr); + if (msrq && atomic_dec_and_test(&msrq->refcount)) + complete(&msrq->free); + } + } else { + wq = &qp->rq; + wc->wr_id = wq->wrid[wq->tail & (wq->wqe_cnt - 1)]; + ++wq->tail; + } + wc->byte_len = be32_to_cpu(cqe->byte_cnt); + + switch (cqe->op_own >> 4) { + case MLX5_CQE_RESP_WR_IMM: + wc->opcode = IB_WC_RECV_RDMA_WITH_IMM; + wc->wc_flags = IB_WC_WITH_IMM; + wc->ex.imm_data = cqe->imm_inval_pkey; + break; + case MLX5_CQE_RESP_SEND: + wc->opcode = IB_WC_RECV; + wc->wc_flags = 0; + break; + case MLX5_CQE_RESP_SEND_IMM: + wc->opcode = IB_WC_RECV; + wc->wc_flags = IB_WC_WITH_IMM; + wc->ex.imm_data = cqe->imm_inval_pkey; + break; + case MLX5_CQE_RESP_SEND_INV: + wc->opcode = IB_WC_RECV; + wc->wc_flags = IB_WC_WITH_INVALIDATE; + wc->ex.invalidate_rkey = be32_to_cpu(cqe->imm_inval_pkey); + break; + } + wc->slid = be16_to_cpu(cqe->slid); + wc->sl = (be32_to_cpu(cqe->flags_rqpn) >> 24) & 0xf; + wc->src_qp = be32_to_cpu(cqe->flags_rqpn) & 0xffffff; + wc->dlid_path_bits = cqe->ml_path; + g = (be32_to_cpu(cqe->flags_rqpn) >> 28) & 3; + wc->wc_flags |= g ? IB_WC_GRH : 0; + wc->pkey_index = be32_to_cpu(cqe->imm_inval_pkey) & 0xffff; +} + +static void dump_cqe(struct mlx5_ib_dev *dev, struct mlx5_err_cqe *cqe) +{ + __be32 *p = (__be32 *)cqe; + int i; + + mlx5_ib_warn(dev, "dump error cqe\n"); + for (i = 0; i < sizeof(*cqe) / 16; i++, p += 4) + pr_info("%08x %08x %08x %08x\n", be32_to_cpu(p[0]), + be32_to_cpu(p[1]), be32_to_cpu(p[2]), + be32_to_cpu(p[3])); +} + +static void mlx5_handle_error_cqe(struct mlx5_ib_dev *dev, + struct mlx5_err_cqe *cqe, + struct ib_wc *wc) +{ + int dump = 1; + + switch (cqe->syndrome) { + case MLX5_CQE_SYNDROME_LOCAL_LENGTH_ERR: + wc->status = IB_WC_LOC_LEN_ERR; + break; + case MLX5_CQE_SYNDROME_LOCAL_QP_OP_ERR: + wc->status = IB_WC_LOC_QP_OP_ERR; + break; + case MLX5_CQE_SYNDROME_LOCAL_PROT_ERR: + wc->status = IB_WC_LOC_PROT_ERR; + break; + case MLX5_CQE_SYNDROME_WR_FLUSH_ERR: + dump = 0; + wc->status = IB_WC_WR_FLUSH_ERR; + break; + case MLX5_CQE_SYNDROME_MW_BIND_ERR: + wc->status = IB_WC_MW_BIND_ERR; + break; + case MLX5_CQE_SYNDROME_BAD_RESP_ERR: + wc->status = IB_WC_BAD_RESP_ERR; + break; + case MLX5_CQE_SYNDROME_LOCAL_ACCESS_ERR: + wc->status = IB_WC_LOC_ACCESS_ERR; + break; + case MLX5_CQE_SYNDROME_REMOTE_INVAL_REQ_ERR: + wc->status = IB_WC_REM_INV_REQ_ERR; + break; + case MLX5_CQE_SYNDROME_REMOTE_ACCESS_ERR: + wc->status = IB_WC_REM_ACCESS_ERR; + break; + case MLX5_CQE_SYNDROME_REMOTE_OP_ERR: + wc->status = IB_WC_REM_OP_ERR; + break; + case MLX5_CQE_SYNDROME_TRANSPORT_RETRY_EXC_ERR: + wc->status = IB_WC_RETRY_EXC_ERR; + dump = 0; + break; + case MLX5_CQE_SYNDROME_RNR_RETRY_EXC_ERR: + wc->status = IB_WC_RNR_RETRY_EXC_ERR; + dump = 0; + break; + case MLX5_CQE_SYNDROME_REMOTE_ABORTED_ERR: + wc->status = IB_WC_REM_ABORT_ERR; + break; + default: + wc->status = IB_WC_GENERAL_ERR; + break; + } + + wc->vendor_err = cqe->vendor_err_synd; + if (dump) + dump_cqe(dev, cqe); +} + +static int is_atomic_response(struct mlx5_ib_qp *qp, uint16_t idx) +{ + /* TBD: waiting decision + */ + return 0; +} + +static void *mlx5_get_atomic_laddr(struct mlx5_ib_qp *qp, uint16_t idx) +{ + struct mlx5_wqe_data_seg *dpseg; + void *addr; + + dpseg = mlx5_get_send_wqe(qp, idx) + sizeof(struct mlx5_wqe_ctrl_seg) + + sizeof(struct mlx5_wqe_raddr_seg) + + sizeof(struct mlx5_wqe_atomic_seg); + addr = (void *)(unsigned 
long)be64_to_cpu(dpseg->addr); + return addr; +} + +static void handle_atomic(struct mlx5_ib_qp *qp, struct mlx5_cqe64 *cqe64, + uint16_t idx) +{ + void *addr; + int byte_count; + int i; + + if (!is_atomic_response(qp, idx)) + return; + + byte_count = be32_to_cpu(cqe64->byte_cnt); + addr = mlx5_get_atomic_laddr(qp, idx); + + if (byte_count == 4) { + *(uint32_t *)addr = be32_to_cpu(*((__be32 *)addr)); + } else { + for (i = 0; i < byte_count; i += 8) { + *(uint64_t *)addr = be64_to_cpu(*((__be64 *)addr)); + addr += 8; + } + } + + return; +} + +static void handle_atomics(struct mlx5_ib_qp *qp, struct mlx5_cqe64 *cqe64, + u16 tail, u16 head) +{ + u16 idx; + + do { + idx = tail & (qp->sq.wqe_cnt - 1); + handle_atomic(qp, cqe64, idx); + if (idx == head) + break; + + tail = qp->sq.w_list[idx].next; + } while (1); + tail = qp->sq.w_list[idx].next; + qp->sq.last_poll = tail; +} + +static void free_cq_buf(struct mlx5_ib_dev *dev, struct mlx5_ib_cq_buf *buf) +{ + mlx5_buf_free(dev->mdev, &buf->buf); +} + +static void get_sig_err_item(struct mlx5_sig_err_cqe *cqe, + struct ib_sig_err *item) +{ + u16 syndrome = be16_to_cpu(cqe->syndrome); + +#define GUARD_ERR (1 << 13) +#define APPTAG_ERR (1 << 12) +#define REFTAG_ERR (1 << 11) + + if (syndrome & GUARD_ERR) { + item->err_type = IB_SIG_BAD_GUARD; + item->expected = be32_to_cpu(cqe->expected_trans_sig) >> 16; + item->actual = be32_to_cpu(cqe->actual_trans_sig) >> 16; + } else + if (syndrome & REFTAG_ERR) { + item->err_type = IB_SIG_BAD_REFTAG; + item->expected = be32_to_cpu(cqe->expected_reftag); + item->actual = be32_to_cpu(cqe->actual_reftag); + } else + if (syndrome & APPTAG_ERR) { + item->err_type = IB_SIG_BAD_APPTAG; + item->expected = be32_to_cpu(cqe->expected_trans_sig) & 0xffff; + item->actual = be32_to_cpu(cqe->actual_trans_sig) & 0xffff; + } else { + pr_err("Got signature completion error with bad syndrome %04x\n", + syndrome); + } + + item->sig_err_offset = be64_to_cpu(cqe->err_offset); + item->key = be32_to_cpu(cqe->mkey); +} + +static int mlx5_poll_one(struct mlx5_ib_cq *cq, + struct mlx5_ib_qp **cur_qp, + struct ib_wc *wc) +{ + struct mlx5_ib_dev *dev = to_mdev(cq->ibcq.device); + struct mlx5_err_cqe *err_cqe; + struct mlx5_cqe64 *cqe64; + struct mlx5_core_qp *mqp; + struct mlx5_ib_wq *wq; + struct mlx5_sig_err_cqe *sig_err_cqe; + struct mlx5_core_mr *mmr; + struct mlx5_ib_mr *mr; + uint8_t opcode; + uint32_t qpn; + u16 wqe_ctr; + void *cqe; + int idx; + +repoll: + cqe = next_cqe_sw(cq); + if (!cqe) + return -EAGAIN; + + cqe64 = (cq->mcq.cqe_sz == 64) ? cqe : cqe + 64; + + ++cq->mcq.cons_index; + + /* Make sure we read CQ entry contents after we've checked the + * ownership bit. + */ + rmb(); + + opcode = cqe64->op_own >> 4; + if (unlikely(opcode == MLX5_CQE_RESIZE_CQ)) { + if (likely(cq->resize_buf)) { + free_cq_buf(dev, &cq->buf); + cq->buf = *cq->resize_buf; + kfree(cq->resize_buf); + cq->resize_buf = NULL; + goto repoll; + } else { + mlx5_ib_warn(dev, "unexpected resize cqe\n"); + } + } + + qpn = ntohl(cqe64->sop_drop_qpn) & 0xffffff; + if (!*cur_qp || (qpn != (*cur_qp)->ibqp.qp_num)) { + /* We do not have to take the QP table lock here, + * because CQs will be locked while QPs are removed + * from the table. 
+ */ + mqp = __mlx5_qp_lookup(dev->mdev, qpn); + if (unlikely(!mqp)) { + mlx5_ib_warn(dev, "CQE@CQ %06x for unknown QPN %6x\n", + cq->mcq.cqn, qpn); + return -EINVAL; + } + + *cur_qp = to_mibqp(mqp); + } + + wc->qp = &(*cur_qp)->ibqp; + switch (opcode) { + case MLX5_CQE_REQ: + wq = &(*cur_qp)->sq; + wqe_ctr = be16_to_cpu(cqe64->wqe_counter); + idx = wqe_ctr & (wq->wqe_cnt - 1); + handle_good_req(wc, cqe64, wq, idx); + handle_atomics(*cur_qp, cqe64, wq->last_poll, idx); + wc->wr_id = wq->wrid[idx]; + wq->tail = wq->wqe_head[idx] + 1; + wc->status = IB_WC_SUCCESS; + break; + case MLX5_CQE_RESP_WR_IMM: + case MLX5_CQE_RESP_SEND: + case MLX5_CQE_RESP_SEND_IMM: + case MLX5_CQE_RESP_SEND_INV: + handle_responder(wc, cqe64, *cur_qp); + wc->status = IB_WC_SUCCESS; + break; + case MLX5_CQE_RESIZE_CQ: + break; + case MLX5_CQE_REQ_ERR: + case MLX5_CQE_RESP_ERR: + err_cqe = (struct mlx5_err_cqe *)cqe64; + mlx5_handle_error_cqe(dev, err_cqe, wc); + mlx5_ib_dbg(dev, "%s error cqe on cqn 0x%x:\n", + opcode == MLX5_CQE_REQ_ERR ? + "Requestor" : "Responder", cq->mcq.cqn); + mlx5_ib_dbg(dev, "syndrome 0x%x, vendor syndrome 0x%x\n", + err_cqe->syndrome, err_cqe->vendor_err_synd); + if (opcode == MLX5_CQE_REQ_ERR) { + wq = &(*cur_qp)->sq; + wqe_ctr = be16_to_cpu(cqe64->wqe_counter); + idx = wqe_ctr & (wq->wqe_cnt - 1); + wc->wr_id = wq->wrid[idx]; + wq->tail = wq->wqe_head[idx] + 1; + } else { + struct mlx5_ib_srq *srq; + + if ((*cur_qp)->ibqp.srq) { + srq = to_msrq((*cur_qp)->ibqp.srq); + wqe_ctr = be16_to_cpu(cqe64->wqe_counter); + wc->wr_id = srq->wrid[wqe_ctr]; + mlx5_ib_free_srq_wqe(srq, wqe_ctr); + } else { + wq = &(*cur_qp)->rq; + wc->wr_id = wq->wrid[wq->tail & (wq->wqe_cnt - 1)]; + ++wq->tail; + } + } + break; + case MLX5_CQE_SIG_ERR: + sig_err_cqe = (struct mlx5_sig_err_cqe *)cqe64; + + read_lock(&dev->mdev->priv.mr_table.lock); + mmr = __mlx5_mr_lookup(dev->mdev, + mlx5_base_mkey(be32_to_cpu(sig_err_cqe->mkey))); + if (unlikely(!mmr)) { + read_unlock(&dev->mdev->priv.mr_table.lock); + mlx5_ib_warn(dev, "CQE@CQ %06x for unknown MR %6x\n", + cq->mcq.cqn, be32_to_cpu(sig_err_cqe->mkey)); + return -EINVAL; + } + + mr = to_mibmr(mmr); + get_sig_err_item(sig_err_cqe, &mr->sig->err_item); + mr->sig->sig_err_exists = true; + mr->sig->sigerr_count++; + + mlx5_ib_warn(dev, "CQN: 0x%x Got SIGERR on key: 0x%x err_type %x err_offset %llx expected %x actual %x\n", + cq->mcq.cqn, mr->sig->err_item.key, + mr->sig->err_item.err_type, + mr->sig->err_item.sig_err_offset, + mr->sig->err_item.expected, + mr->sig->err_item.actual); + + read_unlock(&dev->mdev->priv.mr_table.lock); + goto repoll; + } + + return 0; +} + +int mlx5_ib_poll_cq(struct ib_cq *ibcq, int num_entries, struct ib_wc *wc) +{ + struct mlx5_ib_cq *cq = to_mcq(ibcq); + struct mlx5_ib_qp *cur_qp = NULL; + unsigned long flags; + int npolled; + int err = 0; + + spin_lock_irqsave(&cq->lock, flags); + + for (npolled = 0; npolled < num_entries; npolled++) { + err = mlx5_poll_one(cq, &cur_qp, wc + npolled); + if (err) + break; + } + + if (npolled) + mlx5_cq_set_ci(&cq->mcq); + + spin_unlock_irqrestore(&cq->lock, flags); + + if (err == 0 || err == -EAGAIN) + return npolled; + else + return err; +} + +int mlx5_ib_arm_cq(struct ib_cq *ibcq, enum ib_cq_notify_flags flags) +{ + struct mlx5_core_dev *mdev = to_mdev(ibcq->device)->mdev; + void __iomem *uar_page = mdev->priv.uuari.uars[0].map; + + mlx5_cq_arm(&to_mcq(ibcq)->mcq, + (flags & IB_CQ_SOLICITED_MASK) == IB_CQ_SOLICITED ? 
+ MLX5_CQ_DB_REQ_NOT_SOL : MLX5_CQ_DB_REQ_NOT, + uar_page, + MLX5_GET_DOORBELL_LOCK(&mdev->priv.cq_uar_lock), + to_mcq(ibcq)->mcq.cons_index); + + return 0; +} + +static int alloc_cq_buf(struct mlx5_ib_dev *dev, struct mlx5_ib_cq_buf *buf, + int nent, int cqe_size) +{ + int err; + + err = mlx5_buf_alloc(dev->mdev, nent * cqe_size, + PAGE_SIZE * 2, &buf->buf); + if (err) + return err; + + buf->cqe_size = cqe_size; + buf->nent = nent; + + return 0; +} + +static int create_cq_user(struct mlx5_ib_dev *dev, struct ib_udata *udata, + struct ib_ucontext *context, struct mlx5_ib_cq *cq, + int entries, struct mlx5_create_cq_mbox_in **cqb, + int *cqe_size, int *index, int *inlen) +{ + struct mlx5_ib_create_cq ucmd; + size_t ucmdlen; + int page_shift; + int npages; + int ncont; + int err; + + ucmdlen = + (udata->inlen - sizeof(struct ib_uverbs_cmd_hdr) < + sizeof(ucmd)) ? (sizeof(ucmd) - + sizeof(ucmd.reserved)) : sizeof(ucmd); + + if (ib_copy_from_udata(&ucmd, udata, ucmdlen)) + return -EFAULT; + + if (ucmdlen == sizeof(ucmd) && + ucmd.reserved != 0) + return -EINVAL; + + if (ucmd.cqe_size != 64 && ucmd.cqe_size != 128) + return -EINVAL; + + *cqe_size = ucmd.cqe_size; + + cq->buf.umem = ib_umem_get(context, ucmd.buf_addr, + entries * ucmd.cqe_size, + IB_ACCESS_LOCAL_WRITE, 1); + if (IS_ERR(cq->buf.umem)) { + err = PTR_ERR(cq->buf.umem); + return err; + } + + err = mlx5_ib_db_map_user(to_mucontext(context), ucmd.db_addr, + &cq->db); + if (err) + goto err_umem; + + mlx5_ib_cont_pages(cq->buf.umem, ucmd.buf_addr, &npages, &page_shift, + &ncont, NULL); + mlx5_ib_dbg(dev, "addr 0x%llx, size %u, npages %d, page_shift %d, ncont %d\n", + ucmd.buf_addr, entries * ucmd.cqe_size, npages, page_shift, ncont); + + *inlen = sizeof(**cqb) + sizeof(*(*cqb)->pas) * ncont; + *cqb = mlx5_vzalloc(*inlen); + if (!*cqb) { + err = -ENOMEM; + goto err_db; + } + mlx5_ib_populate_pas(dev, cq->buf.umem, page_shift, (*cqb)->pas, 0); + (*cqb)->ctx.log_pg_sz = page_shift - MLX5_ADAPTER_PAGE_SHIFT; + + *index = to_mucontext(context)->uuari.uars[0].index; + + return 0; + +err_db: + mlx5_ib_db_unmap_user(to_mucontext(context), &cq->db); + +err_umem: + ib_umem_release(cq->buf.umem); + return err; +} + +static void destroy_cq_user(struct mlx5_ib_cq *cq, struct ib_ucontext *context) +{ + mlx5_ib_db_unmap_user(to_mucontext(context), &cq->db); + ib_umem_release(cq->buf.umem); +} + +static void init_cq_buf(struct mlx5_ib_cq *cq, struct mlx5_ib_cq_buf *buf) +{ + int i; + void *cqe; + struct mlx5_cqe64 *cqe64; + + for (i = 0; i < buf->nent; i++) { + cqe = get_cqe_from_buf(buf, i, buf->cqe_size); + cqe64 = buf->cqe_size == 64 ? 
cqe : cqe + 64; + cqe64->op_own = MLX5_CQE_INVALID << 4; + } +} + +static int create_cq_kernel(struct mlx5_ib_dev *dev, struct mlx5_ib_cq *cq, + int entries, int cqe_size, + struct mlx5_create_cq_mbox_in **cqb, + int *index, int *inlen) +{ + int err; + + err = mlx5_db_alloc(dev->mdev, &cq->db); + if (err) + return err; + + cq->mcq.set_ci_db = cq->db.db; + cq->mcq.arm_db = cq->db.db + 1; + cq->mcq.cqe_sz = cqe_size; + + err = alloc_cq_buf(dev, &cq->buf, entries, cqe_size); + if (err) + goto err_db; + + init_cq_buf(cq, &cq->buf); + + *inlen = sizeof(**cqb) + sizeof(*(*cqb)->pas) * cq->buf.buf.npages; + *cqb = mlx5_vzalloc(*inlen); + if (!*cqb) { + err = -ENOMEM; + goto err_buf; + } + mlx5_fill_page_array(&cq->buf.buf, (*cqb)->pas); + + (*cqb)->ctx.log_pg_sz = cq->buf.buf.page_shift - MLX5_ADAPTER_PAGE_SHIFT; + *index = dev->mdev->priv.uuari.uars[0].index; + + return 0; + +err_buf: + free_cq_buf(dev, &cq->buf); + +err_db: + mlx5_db_free(dev->mdev, &cq->db); + return err; +} + +static void destroy_cq_kernel(struct mlx5_ib_dev *dev, struct mlx5_ib_cq *cq) +{ + free_cq_buf(dev, &cq->buf); + mlx5_db_free(dev->mdev, &cq->db); +} + +struct ib_cq *mlx5_ib_create_cq(struct ib_device *ibdev, int entries, + int vector, struct ib_ucontext *context, + struct ib_udata *udata) +{ + struct mlx5_create_cq_mbox_in *cqb = NULL; + struct mlx5_ib_dev *dev = to_mdev(ibdev); + struct mlx5_ib_cq *cq; + int uninitialized_var(index); + int uninitialized_var(inlen); + int cqe_size; + int irqn; + int eqn; + int err; + + if (entries < 0) + return ERR_PTR(-EINVAL); + + entries = roundup_pow_of_two(entries + 1); + if (entries > dev->mdev->caps.gen.max_cqes) + return ERR_PTR(-EINVAL); + + cq = kzalloc(sizeof(*cq), GFP_KERNEL); + if (!cq) + return ERR_PTR(-ENOMEM); + + cq->ibcq.cqe = entries - 1; + mutex_init(&cq->resize_mutex); + spin_lock_init(&cq->lock); + cq->resize_buf = NULL; + cq->resize_umem = NULL; + + if (context) { + err = create_cq_user(dev, udata, context, cq, entries, + &cqb, &cqe_size, &index, &inlen); + if (err) + goto err_create; + } else { + /* for now choose 64 bytes till we have a proper interface */ + cqe_size = 64; + err = create_cq_kernel(dev, cq, entries, cqe_size, &cqb, + &index, &inlen); + if (err) + goto err_create; + } + + cq->cqe_size = cqe_size; + cqb->ctx.cqe_sz_flags = cqe_sz_to_mlx_sz(cqe_size) << 5; + cqb->ctx.log_sz_usr_page = cpu_to_be32((ilog2(entries) << 24) | index); + err = mlx5_vector2eqn(dev->mdev, vector, &eqn, &irqn); + if (err) + goto err_cqb; + + cqb->ctx.c_eqn = cpu_to_be16(eqn); + cqb->ctx.db_record_addr = cpu_to_be64(cq->db.dma); + + err = mlx5_core_create_cq(dev->mdev, &cq->mcq, cqb, inlen); + if (err) + goto err_cqb; + + mlx5_ib_dbg(dev, "cqn 0x%x\n", cq->mcq.cqn); + cq->mcq.irqn = irqn; + cq->mcq.comp = mlx5_ib_cq_comp; + cq->mcq.event = mlx5_ib_cq_event; + + if (context) + if (ib_copy_to_udata(udata, &cq->mcq.cqn, sizeof(__u32))) { + err = -EFAULT; + goto err_cmd; + } + + + kvfree(cqb); + return &cq->ibcq; + +err_cmd: + mlx5_core_destroy_cq(dev->mdev, &cq->mcq); + +err_cqb: + kvfree(cqb); + if (context) + destroy_cq_user(cq, context); + else + destroy_cq_kernel(dev, cq); + +err_create: + kfree(cq); + + return ERR_PTR(err); +} + + +int mlx5_ib_destroy_cq(struct ib_cq *cq) +{ + struct mlx5_ib_dev *dev = to_mdev(cq->device); + struct mlx5_ib_cq *mcq = to_mcq(cq); + struct ib_ucontext *context = NULL; + + if (cq->uobject) + context = cq->uobject->context; + + mlx5_core_destroy_cq(dev->mdev, &mcq->mcq); + if (context) + destroy_cq_user(mcq, context); + else + 
destroy_cq_kernel(dev, mcq); + + kfree(mcq); + + return 0; +} + +static int is_equal_rsn(struct mlx5_cqe64 *cqe64, u32 rsn) +{ + return rsn == (ntohl(cqe64->sop_drop_qpn) & 0xffffff); +} + +void __mlx5_ib_cq_clean(struct mlx5_ib_cq *cq, u32 rsn, struct mlx5_ib_srq *srq) +{ + struct mlx5_cqe64 *cqe64, *dest64; + void *cqe, *dest; + u32 prod_index; + int nfreed = 0; + u8 owner_bit; + + if (!cq) + return; + + /* First we need to find the current producer index, so we + * know where to start cleaning from. It doesn't matter if HW + * adds new entries after this loop -- the QP we're worried + * about is already in RESET, so the new entries won't come + * from our QP and therefore don't need to be checked. + */ + for (prod_index = cq->mcq.cons_index; get_sw_cqe(cq, prod_index); prod_index++) + if (prod_index == cq->mcq.cons_index + cq->ibcq.cqe) + break; + + /* Now sweep backwards through the CQ, removing CQ entries + * that match our QP by copying older entries on top of them. + */ + while ((int) --prod_index - (int) cq->mcq.cons_index >= 0) { + cqe = get_cqe(cq, prod_index & cq->ibcq.cqe); + cqe64 = (cq->mcq.cqe_sz == 64) ? cqe : cqe + 64; + if (is_equal_rsn(cqe64, rsn)) { + if (srq && (ntohl(cqe64->srqn) & 0xffffff)) + mlx5_ib_free_srq_wqe(srq, be16_to_cpu(cqe64->wqe_counter)); + ++nfreed; + } else if (nfreed) { + dest = get_cqe(cq, (prod_index + nfreed) & cq->ibcq.cqe); + dest64 = (cq->mcq.cqe_sz == 64) ? dest : dest + 64; + owner_bit = dest64->op_own & MLX5_CQE_OWNER_MASK; + memcpy(dest, cqe, cq->mcq.cqe_sz); + dest64->op_own = owner_bit | + (dest64->op_own & ~MLX5_CQE_OWNER_MASK); + } + } + + if (nfreed) { + cq->mcq.cons_index += nfreed; + /* Make sure update of buffer contents is done before + * updating consumer index. + */ + wmb(); + mlx5_cq_set_ci(&cq->mcq); + } +} + +void mlx5_ib_cq_clean(struct mlx5_ib_cq *cq, u32 qpn, struct mlx5_ib_srq *srq) +{ + if (!cq) + return; + + spin_lock_irq(&cq->lock); + __mlx5_ib_cq_clean(cq, qpn, srq); + spin_unlock_irq(&cq->lock); +} + +int mlx5_ib_modify_cq(struct ib_cq *cq, u16 cq_count, u16 cq_period) +{ + struct mlx5_modify_cq_mbox_in *in; + struct mlx5_ib_dev *dev = to_mdev(cq->device); + struct mlx5_ib_cq *mcq = to_mcq(cq); + int err; + u32 fsel; + + if (!(dev->mdev->caps.gen.flags & MLX5_DEV_CAP_FLAG_CQ_MODER)) + return -ENOSYS; + + in = kzalloc(sizeof(*in), GFP_KERNEL); + if (!in) + return -ENOMEM; + + in->cqn = cpu_to_be32(mcq->mcq.cqn); + fsel = (MLX5_CQ_MODIFY_PERIOD | MLX5_CQ_MODIFY_COUNT); + in->ctx.cq_period = cpu_to_be16(cq_period); + in->ctx.cq_max_count = cpu_to_be16(cq_count); + in->field_select = cpu_to_be32(fsel); + err = mlx5_core_modify_cq(dev->mdev, &mcq->mcq, in, sizeof(*in)); + kfree(in); + + if (err) + mlx5_ib_warn(dev, "modify cq 0x%x failed\n", mcq->mcq.cqn); + + return err; +} + +static int resize_user(struct mlx5_ib_dev *dev, struct mlx5_ib_cq *cq, + int entries, struct ib_udata *udata, int *npas, + int *page_shift, int *cqe_size) +{ + struct mlx5_ib_resize_cq ucmd; + struct ib_umem *umem; + int err; + int npages; + struct ib_ucontext *context = cq->buf.umem->context; + + err = ib_copy_from_udata(&ucmd, udata, sizeof(ucmd)); + if (err) + return err; + + if (ucmd.reserved0 || ucmd.reserved1) + return -EINVAL; + + umem = ib_umem_get(context, ucmd.buf_addr, entries * ucmd.cqe_size, + IB_ACCESS_LOCAL_WRITE, 1); + if (IS_ERR(umem)) { + err = PTR_ERR(umem); + return err; + } + + mlx5_ib_cont_pages(umem, ucmd.buf_addr, &npages, page_shift, + npas, NULL); + + cq->resize_umem = umem; + *cqe_size = ucmd.cqe_size; + + return 0; +} + 
+static void un_resize_user(struct mlx5_ib_cq *cq) +{ + ib_umem_release(cq->resize_umem); +} + +static int resize_kernel(struct mlx5_ib_dev *dev, struct mlx5_ib_cq *cq, + int entries, int cqe_size) +{ + int err; + + cq->resize_buf = kzalloc(sizeof(*cq->resize_buf), GFP_KERNEL); + if (!cq->resize_buf) + return -ENOMEM; + + err = alloc_cq_buf(dev, cq->resize_buf, entries, cqe_size); + if (err) + goto ex; + + init_cq_buf(cq, cq->resize_buf); + + return 0; + +ex: + kfree(cq->resize_buf); + return err; +} + +static void un_resize_kernel(struct mlx5_ib_dev *dev, struct mlx5_ib_cq *cq) +{ + free_cq_buf(dev, cq->resize_buf); + cq->resize_buf = NULL; +} + +static int copy_resize_cqes(struct mlx5_ib_cq *cq) +{ + struct mlx5_ib_dev *dev = to_mdev(cq->ibcq.device); + struct mlx5_cqe64 *scqe64; + struct mlx5_cqe64 *dcqe64; + void *start_cqe; + void *scqe; + void *dcqe; + int ssize; + int dsize; + int i; + u8 sw_own; + + ssize = cq->buf.cqe_size; + dsize = cq->resize_buf->cqe_size; + if (ssize != dsize) { + mlx5_ib_warn(dev, "resize from different cqe size is not supported\n"); + return -EINVAL; + } + + i = cq->mcq.cons_index; + scqe = get_sw_cqe(cq, i); + scqe64 = ssize == 64 ? scqe : scqe + 64; + start_cqe = scqe; + if (!scqe) { + mlx5_ib_warn(dev, "expected cqe in sw ownership\n"); + return -EINVAL; + } + + while ((scqe64->op_own >> 4) != MLX5_CQE_RESIZE_CQ) { + dcqe = get_cqe_from_buf(cq->resize_buf, + (i + 1) & (cq->resize_buf->nent), + dsize); + dcqe64 = dsize == 64 ? dcqe : dcqe + 64; + sw_own = sw_ownership_bit(i + 1, cq->resize_buf->nent); + memcpy(dcqe, scqe, dsize); + dcqe64->op_own = (dcqe64->op_own & ~MLX5_CQE_OWNER_MASK) | sw_own; + + ++i; + scqe = get_sw_cqe(cq, i); + scqe64 = ssize == 64 ? scqe : scqe + 64; + if (!scqe) { + mlx5_ib_warn(dev, "expected cqe in sw ownership\n"); + return -EINVAL; + } + + if (scqe == start_cqe) { + pr_warn("resize CQ failed to get resize CQE, CQN 0x%x\n", + cq->mcq.cqn); + return -ENOMEM; + } + } + ++cq->mcq.cons_index; + return 0; +} + +int mlx5_ib_resize_cq(struct ib_cq *ibcq, int entries, struct ib_udata *udata) +{ + struct mlx5_ib_dev *dev = to_mdev(ibcq->device); + struct mlx5_ib_cq *cq = to_mcq(ibcq); + struct mlx5_modify_cq_mbox_in *in; + int err; + int npas; + int page_shift; + int inlen; + int uninitialized_var(cqe_size); + unsigned long flags; + + if (!(dev->mdev->caps.gen.flags & MLX5_DEV_CAP_FLAG_RESIZE_CQ)) { + pr_info("Firmware does not support resize CQ\n"); + return -ENOSYS; + } + + if (entries < 1) + return -EINVAL; + + entries = roundup_pow_of_two(entries + 1); + if (entries > dev->mdev->caps.gen.max_cqes + 1) + return -EINVAL; + + if (entries == ibcq->cqe + 1) + return 0; + + mutex_lock(&cq->resize_mutex); + if (udata) { + err = resize_user(dev, cq, entries, udata, &npas, &page_shift, + &cqe_size); + } else { + cqe_size = 64; + err = resize_kernel(dev, cq, entries, cqe_size); + if (!err) { + npas = cq->resize_buf->buf.npages; + page_shift = cq->resize_buf->buf.page_shift; + } + } + + if (err) + goto ex; + + inlen = sizeof(*in) + npas * sizeof(in->pas[0]); + in = mlx5_vzalloc(inlen); + if (!in) { + err = -ENOMEM; + goto ex_resize; + } + + if (udata) + mlx5_ib_populate_pas(dev, cq->resize_umem, page_shift, + in->pas, 0); + else + mlx5_fill_page_array(&cq->resize_buf->buf, in->pas); + + in->field_select = cpu_to_be32(MLX5_MODIFY_CQ_MASK_LOG_SIZE | + MLX5_MODIFY_CQ_MASK_PG_OFFSET | + MLX5_MODIFY_CQ_MASK_PG_SIZE); + in->ctx.log_pg_sz = page_shift - MLX5_ADAPTER_PAGE_SHIFT; + in->ctx.cqe_sz_flags = cqe_sz_to_mlx_sz(cqe_size) << 5; + 
in->ctx.page_offset = 0; + in->ctx.log_sz_usr_page = cpu_to_be32(ilog2(entries) << 24); + in->hdr.opmod = cpu_to_be16(MLX5_CQ_OPMOD_RESIZE); + in->cqn = cpu_to_be32(cq->mcq.cqn); + + err = mlx5_core_modify_cq(dev->mdev, &cq->mcq, in, inlen); + if (err) + goto ex_alloc; + + if (udata) { + cq->ibcq.cqe = entries - 1; + ib_umem_release(cq->buf.umem); + cq->buf.umem = cq->resize_umem; + cq->resize_umem = NULL; + } else { + struct mlx5_ib_cq_buf tbuf; + int resized = 0; + + spin_lock_irqsave(&cq->lock, flags); + if (cq->resize_buf) { + err = copy_resize_cqes(cq); + if (!err) { + tbuf = cq->buf; + cq->buf = *cq->resize_buf; + kfree(cq->resize_buf); + cq->resize_buf = NULL; + resized = 1; + } + } + cq->ibcq.cqe = entries - 1; + spin_unlock_irqrestore(&cq->lock, flags); + if (resized) + free_cq_buf(dev, &tbuf); + } + mutex_unlock(&cq->resize_mutex); + + kvfree(in); + return 0; + +ex_alloc: + kvfree(in); + +ex_resize: + if (udata) + un_resize_user(cq); + else + un_resize_kernel(dev, cq); +ex: + mutex_unlock(&cq->resize_mutex); + return err; +} + +int mlx5_ib_get_cqe_size(struct mlx5_ib_dev *dev, struct ib_cq *ibcq) +{ + struct mlx5_ib_cq *cq; + + if (!ibcq) + return 128; + + cq = to_mcq(ibcq); + return cq->cqe_size; +} diff --git a/kernel/drivers/infiniband/hw/mlx5/doorbell.c b/kernel/drivers/infiniband/hw/mlx5/doorbell.c new file mode 100644 index 000000000..a0e4e6ddb --- /dev/null +++ b/kernel/drivers/infiniband/hw/mlx5/doorbell.c @@ -0,0 +1,98 @@ +/* + * Copyright (c) 2013-2015, Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include +#include +#include + +#include "mlx5_ib.h" + +struct mlx5_ib_user_db_page { + struct list_head list; + struct ib_umem *umem; + unsigned long user_virt; + int refcnt; +}; + +int mlx5_ib_db_map_user(struct mlx5_ib_ucontext *context, unsigned long virt, + struct mlx5_db *db) +{ + struct mlx5_ib_user_db_page *page; + int err = 0; + + mutex_lock(&context->db_page_mutex); + + list_for_each_entry(page, &context->db_page_list, list) + if (page->user_virt == (virt & PAGE_MASK)) + goto found; + + page = kmalloc(sizeof(*page), GFP_KERNEL); + if (!page) { + err = -ENOMEM; + goto out; + } + + page->user_virt = (virt & PAGE_MASK); + page->refcnt = 0; + page->umem = ib_umem_get(&context->ibucontext, virt & PAGE_MASK, + PAGE_SIZE, 0, 0); + if (IS_ERR(page->umem)) { + err = PTR_ERR(page->umem); + kfree(page); + goto out; + } + + list_add(&page->list, &context->db_page_list); + +found: + db->dma = sg_dma_address(page->umem->sg_head.sgl) + (virt & ~PAGE_MASK); + db->u.user_page = page; + ++page->refcnt; + +out: + mutex_unlock(&context->db_page_mutex); + + return err; +} + +void mlx5_ib_db_unmap_user(struct mlx5_ib_ucontext *context, struct mlx5_db *db) +{ + mutex_lock(&context->db_page_mutex); + + if (!--db->u.user_page->refcnt) { + list_del(&db->u.user_page->list); + ib_umem_release(db->u.user_page->umem); + kfree(db->u.user_page); + } + + mutex_unlock(&context->db_page_mutex); +} diff --git a/kernel/drivers/infiniband/hw/mlx5/mad.c b/kernel/drivers/infiniband/hw/mlx5/mad.c new file mode 100644 index 000000000..9cf9a37bb --- /dev/null +++ b/kernel/drivers/infiniband/hw/mlx5/mad.c @@ -0,0 +1,139 @@ +/* + * Copyright (c) 2013-2015, Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include +#include +#include +#include "mlx5_ib.h" + +enum { + MLX5_IB_VENDOR_CLASS1 = 0x9, + MLX5_IB_VENDOR_CLASS2 = 0xa +}; + +int mlx5_MAD_IFC(struct mlx5_ib_dev *dev, int ignore_mkey, int ignore_bkey, + u8 port, struct ib_wc *in_wc, struct ib_grh *in_grh, + void *in_mad, void *response_mad) +{ + u8 op_modifier = 0; + + /* Key check traps can't be generated unless we have in_wc to + * tell us where to send the trap. 
+ */ + if (ignore_mkey || !in_wc) + op_modifier |= 0x1; + if (ignore_bkey || !in_wc) + op_modifier |= 0x2; + + return mlx5_core_mad_ifc(dev->mdev, in_mad, response_mad, op_modifier, port); +} + +int mlx5_ib_process_mad(struct ib_device *ibdev, int mad_flags, u8 port_num, + struct ib_wc *in_wc, struct ib_grh *in_grh, + struct ib_mad *in_mad, struct ib_mad *out_mad) +{ + u16 slid; + int err; + + slid = in_wc ? in_wc->slid : be16_to_cpu(IB_LID_PERMISSIVE); + + if (in_mad->mad_hdr.method == IB_MGMT_METHOD_TRAP && slid == 0) + return IB_MAD_RESULT_SUCCESS | IB_MAD_RESULT_CONSUMED; + + if (in_mad->mad_hdr.mgmt_class == IB_MGMT_CLASS_SUBN_LID_ROUTED || + in_mad->mad_hdr.mgmt_class == IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE) { + if (in_mad->mad_hdr.method != IB_MGMT_METHOD_GET && + in_mad->mad_hdr.method != IB_MGMT_METHOD_SET && + in_mad->mad_hdr.method != IB_MGMT_METHOD_TRAP_REPRESS) + return IB_MAD_RESULT_SUCCESS; + + /* Don't process SMInfo queries -- the SMA can't handle them. + */ + if (in_mad->mad_hdr.attr_id == IB_SMP_ATTR_SM_INFO) + return IB_MAD_RESULT_SUCCESS; + } else if (in_mad->mad_hdr.mgmt_class == IB_MGMT_CLASS_PERF_MGMT || + in_mad->mad_hdr.mgmt_class == MLX5_IB_VENDOR_CLASS1 || + in_mad->mad_hdr.mgmt_class == MLX5_IB_VENDOR_CLASS2 || + in_mad->mad_hdr.mgmt_class == IB_MGMT_CLASS_CONG_MGMT) { + if (in_mad->mad_hdr.method != IB_MGMT_METHOD_GET && + in_mad->mad_hdr.method != IB_MGMT_METHOD_SET) + return IB_MAD_RESULT_SUCCESS; + } else { + return IB_MAD_RESULT_SUCCESS; + } + + err = mlx5_MAD_IFC(to_mdev(ibdev), + mad_flags & IB_MAD_IGNORE_MKEY, + mad_flags & IB_MAD_IGNORE_BKEY, + port_num, in_wc, in_grh, in_mad, out_mad); + if (err) + return IB_MAD_RESULT_FAILURE; + + /* set return bit in status of directed route responses */ + if (in_mad->mad_hdr.mgmt_class == IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE) + out_mad->mad_hdr.status |= cpu_to_be16(1 << 15); + + if (in_mad->mad_hdr.method == IB_MGMT_METHOD_TRAP_REPRESS) + /* no response for trap repress */ + return IB_MAD_RESULT_SUCCESS | IB_MAD_RESULT_CONSUMED; + + return IB_MAD_RESULT_SUCCESS | IB_MAD_RESULT_REPLY; +} + +int mlx5_query_ext_port_caps(struct mlx5_ib_dev *dev, u8 port) +{ + struct ib_smp *in_mad = NULL; + struct ib_smp *out_mad = NULL; + int err = -ENOMEM; + u16 packet_error; + + in_mad = kzalloc(sizeof(*in_mad), GFP_KERNEL); + out_mad = kmalloc(sizeof(*out_mad), GFP_KERNEL); + if (!in_mad || !out_mad) + goto out; + + init_query_mad(in_mad); + in_mad->attr_id = MLX5_ATTR_EXTENDED_PORT_INFO; + in_mad->attr_mod = cpu_to_be32(port); + + err = mlx5_MAD_IFC(dev, 1, 1, 1, NULL, NULL, in_mad, out_mad); + + packet_error = be16_to_cpu(out_mad->status); + + dev->mdev->caps.gen.ext_port_cap[port - 1] = (!err && !packet_error) ? + MLX_EXT_PORT_CAP_FLAG_EXTENDED_PORT_INFO : 0; + +out: + kfree(in_mad); + kfree(out_mad); + return err; +} diff --git a/kernel/drivers/infiniband/hw/mlx5/main.c b/kernel/drivers/infiniband/hw/mlx5/main.c new file mode 100644 index 000000000..57c9809e8 --- /dev/null +++ b/kernel/drivers/infiniband/hw/mlx5/main.c @@ -0,0 +1,1397 @@ +/* + * Copyright (c) 2013-2015, Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. 
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "user.h" +#include "mlx5_ib.h" + +#define DRIVER_NAME "mlx5_ib" +#define DRIVER_VERSION "2.2-1" +#define DRIVER_RELDATE "Feb 2014" + +MODULE_AUTHOR("Eli Cohen "); +MODULE_DESCRIPTION("Mellanox Connect-IB HCA IB driver"); +MODULE_LICENSE("Dual BSD/GPL"); +MODULE_VERSION(DRIVER_VERSION); + +static int deprecated_prof_sel = 2; +module_param_named(prof_sel, deprecated_prof_sel, int, 0444); +MODULE_PARM_DESC(prof_sel, "profile selector. Deprecated here. 
Moved to module mlx5_core"); + +static char mlx5_version[] = + DRIVER_NAME ": Mellanox Connect-IB Infiniband driver v" + DRIVER_VERSION " (" DRIVER_RELDATE ")\n"; + +static int mlx5_ib_query_device(struct ib_device *ibdev, + struct ib_device_attr *props) +{ + struct mlx5_ib_dev *dev = to_mdev(ibdev); + struct ib_smp *in_mad = NULL; + struct ib_smp *out_mad = NULL; + struct mlx5_general_caps *gen; + int err = -ENOMEM; + int max_rq_sg; + int max_sq_sg; + u64 flags; + + gen = &dev->mdev->caps.gen; + in_mad = kzalloc(sizeof(*in_mad), GFP_KERNEL); + out_mad = kmalloc(sizeof(*out_mad), GFP_KERNEL); + if (!in_mad || !out_mad) + goto out; + + init_query_mad(in_mad); + in_mad->attr_id = IB_SMP_ATTR_NODE_INFO; + + err = mlx5_MAD_IFC(to_mdev(ibdev), 1, 1, 1, NULL, NULL, in_mad, out_mad); + if (err) + goto out; + + memset(props, 0, sizeof(*props)); + + props->fw_ver = ((u64)fw_rev_maj(dev->mdev) << 32) | + (fw_rev_min(dev->mdev) << 16) | + fw_rev_sub(dev->mdev); + props->device_cap_flags = IB_DEVICE_CHANGE_PHY_PORT | + IB_DEVICE_PORT_ACTIVE_EVENT | + IB_DEVICE_SYS_IMAGE_GUID | + IB_DEVICE_RC_RNR_NAK_GEN; + flags = gen->flags; + if (flags & MLX5_DEV_CAP_FLAG_BAD_PKEY_CNTR) + props->device_cap_flags |= IB_DEVICE_BAD_PKEY_CNTR; + if (flags & MLX5_DEV_CAP_FLAG_BAD_QKEY_CNTR) + props->device_cap_flags |= IB_DEVICE_BAD_QKEY_CNTR; + if (flags & MLX5_DEV_CAP_FLAG_APM) + props->device_cap_flags |= IB_DEVICE_AUTO_PATH_MIG; + props->device_cap_flags |= IB_DEVICE_LOCAL_DMA_LKEY; + if (flags & MLX5_DEV_CAP_FLAG_XRC) + props->device_cap_flags |= IB_DEVICE_XRC; + props->device_cap_flags |= IB_DEVICE_MEM_MGT_EXTENSIONS; + if (flags & MLX5_DEV_CAP_FLAG_SIG_HAND_OVER) { + props->device_cap_flags |= IB_DEVICE_SIGNATURE_HANDOVER; + /* At this stage no support for signature handover */ + props->sig_prot_cap = IB_PROT_T10DIF_TYPE_1 | + IB_PROT_T10DIF_TYPE_2 | + IB_PROT_T10DIF_TYPE_3; + props->sig_guard_cap = IB_GUARD_T10DIF_CRC | + IB_GUARD_T10DIF_CSUM; + } + if (flags & MLX5_DEV_CAP_FLAG_BLOCK_MCAST) + props->device_cap_flags |= IB_DEVICE_BLOCK_MULTICAST_LOOPBACK; + + props->vendor_id = be32_to_cpup((__be32 *)(out_mad->data + 36)) & + 0xffffff; + props->vendor_part_id = be16_to_cpup((__be16 *)(out_mad->data + 30)); + props->hw_ver = be32_to_cpup((__be32 *)(out_mad->data + 32)); + memcpy(&props->sys_image_guid, out_mad->data + 4, 8); + + props->max_mr_size = ~0ull; + props->page_size_cap = gen->min_page_sz; + props->max_qp = 1 << gen->log_max_qp; + props->max_qp_wr = gen->max_wqes; + max_rq_sg = gen->max_rq_desc_sz / sizeof(struct mlx5_wqe_data_seg); + max_sq_sg = (gen->max_sq_desc_sz - sizeof(struct mlx5_wqe_ctrl_seg)) / + sizeof(struct mlx5_wqe_data_seg); + props->max_sge = min(max_rq_sg, max_sq_sg); + props->max_cq = 1 << gen->log_max_cq; + props->max_cqe = gen->max_cqes - 1; + props->max_mr = 1 << gen->log_max_mkey; + props->max_pd = 1 << gen->log_max_pd; + props->max_qp_rd_atom = 1 << gen->log_max_ra_req_qp; + props->max_qp_init_rd_atom = 1 << gen->log_max_ra_res_qp; + props->max_srq = 1 << gen->log_max_srq; + props->max_srq_wr = gen->max_srq_wqes - 1; + props->local_ca_ack_delay = gen->local_ca_ack_delay; + props->max_res_rd_atom = props->max_qp_rd_atom * props->max_qp; + props->max_srq_sge = max_rq_sg - 1; + props->max_fast_reg_page_list_len = (unsigned int)-1; + props->local_ca_ack_delay = gen->local_ca_ack_delay; + props->atomic_cap = IB_ATOMIC_NONE; + props->masked_atomic_cap = IB_ATOMIC_NONE; + props->max_pkeys = be16_to_cpup((__be16 *)(out_mad->data + 28)); + props->max_mcast_grp = 1 << gen->log_max_mcg; + 
props->max_mcast_qp_attach = gen->max_qp_mcg; + props->max_total_mcast_qp_attach = props->max_mcast_qp_attach * + props->max_mcast_grp; + props->max_map_per_fmr = INT_MAX; /* no limit in ConnectIB */ + +#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING + if (dev->mdev->caps.gen.flags & MLX5_DEV_CAP_FLAG_ON_DMND_PG) + props->device_cap_flags |= IB_DEVICE_ON_DEMAND_PAGING; + props->odp_caps = dev->odp_caps; +#endif + +out: + kfree(in_mad); + kfree(out_mad); + + return err; +} + +int mlx5_ib_query_port(struct ib_device *ibdev, u8 port, + struct ib_port_attr *props) +{ + struct mlx5_ib_dev *dev = to_mdev(ibdev); + struct ib_smp *in_mad = NULL; + struct ib_smp *out_mad = NULL; + struct mlx5_general_caps *gen; + int ext_active_speed; + int err = -ENOMEM; + + gen = &dev->mdev->caps.gen; + if (port < 1 || port > gen->num_ports) { + mlx5_ib_warn(dev, "invalid port number %d\n", port); + return -EINVAL; + } + + in_mad = kzalloc(sizeof(*in_mad), GFP_KERNEL); + out_mad = kmalloc(sizeof(*out_mad), GFP_KERNEL); + if (!in_mad || !out_mad) + goto out; + + memset(props, 0, sizeof(*props)); + + init_query_mad(in_mad); + in_mad->attr_id = IB_SMP_ATTR_PORT_INFO; + in_mad->attr_mod = cpu_to_be32(port); + + err = mlx5_MAD_IFC(dev, 1, 1, port, NULL, NULL, in_mad, out_mad); + if (err) { + mlx5_ib_warn(dev, "err %d\n", err); + goto out; + } + + + props->lid = be16_to_cpup((__be16 *)(out_mad->data + 16)); + props->lmc = out_mad->data[34] & 0x7; + props->sm_lid = be16_to_cpup((__be16 *)(out_mad->data + 18)); + props->sm_sl = out_mad->data[36] & 0xf; + props->state = out_mad->data[32] & 0xf; + props->phys_state = out_mad->data[33] >> 4; + props->port_cap_flags = be32_to_cpup((__be32 *)(out_mad->data + 20)); + props->gid_tbl_len = out_mad->data[50]; + props->max_msg_sz = 1 << gen->log_max_msg; + props->pkey_tbl_len = gen->port[port - 1].pkey_table_len; + props->bad_pkey_cntr = be16_to_cpup((__be16 *)(out_mad->data + 46)); + props->qkey_viol_cntr = be16_to_cpup((__be16 *)(out_mad->data + 48)); + props->active_width = out_mad->data[31] & 0xf; + props->active_speed = out_mad->data[35] >> 4; + props->max_mtu = out_mad->data[41] & 0xf; + props->active_mtu = out_mad->data[36] >> 4; + props->subnet_timeout = out_mad->data[51] & 0x1f; + props->max_vl_num = out_mad->data[37] >> 4; + props->init_type_reply = out_mad->data[41] >> 4; + + /* Check if extended speeds (EDR/FDR/...) 
are supported */ + if (props->port_cap_flags & IB_PORT_EXTENDED_SPEEDS_SUP) { + ext_active_speed = out_mad->data[62] >> 4; + + switch (ext_active_speed) { + case 1: + props->active_speed = 16; /* FDR */ + break; + case 2: + props->active_speed = 32; /* EDR */ + break; + } + } + + /* If reported active speed is QDR, check if is FDR-10 */ + if (props->active_speed == 4) { + if (gen->ext_port_cap[port - 1] & + MLX_EXT_PORT_CAP_FLAG_EXTENDED_PORT_INFO) { + init_query_mad(in_mad); + in_mad->attr_id = MLX5_ATTR_EXTENDED_PORT_INFO; + in_mad->attr_mod = cpu_to_be32(port); + + err = mlx5_MAD_IFC(dev, 1, 1, port, + NULL, NULL, in_mad, out_mad); + if (err) + goto out; + + /* Checking LinkSpeedActive for FDR-10 */ + if (out_mad->data[15] & 0x1) + props->active_speed = 8; + } + } + +out: + kfree(in_mad); + kfree(out_mad); + + return err; +} + +static int mlx5_ib_query_gid(struct ib_device *ibdev, u8 port, int index, + union ib_gid *gid) +{ + struct ib_smp *in_mad = NULL; + struct ib_smp *out_mad = NULL; + int err = -ENOMEM; + + in_mad = kzalloc(sizeof(*in_mad), GFP_KERNEL); + out_mad = kmalloc(sizeof(*out_mad), GFP_KERNEL); + if (!in_mad || !out_mad) + goto out; + + init_query_mad(in_mad); + in_mad->attr_id = IB_SMP_ATTR_PORT_INFO; + in_mad->attr_mod = cpu_to_be32(port); + + err = mlx5_MAD_IFC(to_mdev(ibdev), 1, 1, port, NULL, NULL, in_mad, out_mad); + if (err) + goto out; + + memcpy(gid->raw, out_mad->data + 8, 8); + + init_query_mad(in_mad); + in_mad->attr_id = IB_SMP_ATTR_GUID_INFO; + in_mad->attr_mod = cpu_to_be32(index / 8); + + err = mlx5_MAD_IFC(to_mdev(ibdev), 1, 1, port, NULL, NULL, in_mad, out_mad); + if (err) + goto out; + + memcpy(gid->raw + 8, out_mad->data + (index % 8) * 8, 8); + +out: + kfree(in_mad); + kfree(out_mad); + return err; +} + +static int mlx5_ib_query_pkey(struct ib_device *ibdev, u8 port, u16 index, + u16 *pkey) +{ + struct ib_smp *in_mad = NULL; + struct ib_smp *out_mad = NULL; + int err = -ENOMEM; + + in_mad = kzalloc(sizeof(*in_mad), GFP_KERNEL); + out_mad = kmalloc(sizeof(*out_mad), GFP_KERNEL); + if (!in_mad || !out_mad) + goto out; + + init_query_mad(in_mad); + in_mad->attr_id = IB_SMP_ATTR_PKEY_TABLE; + in_mad->attr_mod = cpu_to_be32(index / 32); + + err = mlx5_MAD_IFC(to_mdev(ibdev), 1, 1, port, NULL, NULL, in_mad, out_mad); + if (err) + goto out; + + *pkey = be16_to_cpu(((__be16 *)out_mad->data)[index % 32]); + +out: + kfree(in_mad); + kfree(out_mad); + return err; +} + +struct mlx5_reg_node_desc { + u8 desc[64]; +}; + +static int mlx5_ib_modify_device(struct ib_device *ibdev, int mask, + struct ib_device_modify *props) +{ + struct mlx5_ib_dev *dev = to_mdev(ibdev); + struct mlx5_reg_node_desc in; + struct mlx5_reg_node_desc out; + int err; + + if (mask & ~IB_DEVICE_MODIFY_NODE_DESC) + return -EOPNOTSUPP; + + if (!(mask & IB_DEVICE_MODIFY_NODE_DESC)) + return 0; + + /* + * If possible, pass node desc to FW, so it can generate + * a 144 trap. If cmd fails, just ignore. 
+ */ + memcpy(&in, props->node_desc, 64); + err = mlx5_core_access_reg(dev->mdev, &in, sizeof(in), &out, + sizeof(out), MLX5_REG_NODE_DESC, 0, 1); + if (err) + return err; + + memcpy(ibdev->node_desc, props->node_desc, 64); + + return err; +} + +static int mlx5_ib_modify_port(struct ib_device *ibdev, u8 port, int mask, + struct ib_port_modify *props) +{ + struct mlx5_ib_dev *dev = to_mdev(ibdev); + struct ib_port_attr attr; + u32 tmp; + int err; + + mutex_lock(&dev->cap_mask_mutex); + + err = mlx5_ib_query_port(ibdev, port, &attr); + if (err) + goto out; + + tmp = (attr.port_cap_flags | props->set_port_cap_mask) & + ~props->clr_port_cap_mask; + + err = mlx5_set_port_caps(dev->mdev, port, tmp); + +out: + mutex_unlock(&dev->cap_mask_mutex); + return err; +} + +static struct ib_ucontext *mlx5_ib_alloc_ucontext(struct ib_device *ibdev, + struct ib_udata *udata) +{ + struct mlx5_ib_dev *dev = to_mdev(ibdev); + struct mlx5_ib_alloc_ucontext_req_v2 req; + struct mlx5_ib_alloc_ucontext_resp resp; + struct mlx5_ib_ucontext *context; + struct mlx5_general_caps *gen; + struct mlx5_uuar_info *uuari; + struct mlx5_uar *uars; + int gross_uuars; + int num_uars; + int ver; + int uuarn; + int err; + int i; + size_t reqlen; + + gen = &dev->mdev->caps.gen; + if (!dev->ib_active) + return ERR_PTR(-EAGAIN); + + memset(&req, 0, sizeof(req)); + reqlen = udata->inlen - sizeof(struct ib_uverbs_cmd_hdr); + if (reqlen == sizeof(struct mlx5_ib_alloc_ucontext_req)) + ver = 0; + else if (reqlen == sizeof(struct mlx5_ib_alloc_ucontext_req_v2)) + ver = 2; + else + return ERR_PTR(-EINVAL); + + err = ib_copy_from_udata(&req, udata, reqlen); + if (err) + return ERR_PTR(err); + + if (req.flags || req.reserved) + return ERR_PTR(-EINVAL); + + if (req.total_num_uuars > MLX5_MAX_UUARS) + return ERR_PTR(-ENOMEM); + + if (req.total_num_uuars == 0) + return ERR_PTR(-EINVAL); + + req.total_num_uuars = ALIGN(req.total_num_uuars, + MLX5_NON_FP_BF_REGS_PER_PAGE); + if (req.num_low_latency_uuars > req.total_num_uuars - 1) + return ERR_PTR(-EINVAL); + + num_uars = req.total_num_uuars / MLX5_NON_FP_BF_REGS_PER_PAGE; + gross_uuars = num_uars * MLX5_BF_REGS_PER_PAGE; + resp.qp_tab_size = 1 << gen->log_max_qp; + resp.bf_reg_size = gen->bf_reg_size; + resp.cache_line_size = L1_CACHE_BYTES; + resp.max_sq_desc_sz = gen->max_sq_desc_sz; + resp.max_rq_desc_sz = gen->max_rq_desc_sz; + resp.max_send_wqebb = gen->max_wqes; + resp.max_recv_wr = gen->max_wqes; + resp.max_srq_recv_wr = gen->max_srq_wqes; + + context = kzalloc(sizeof(*context), GFP_KERNEL); + if (!context) + return ERR_PTR(-ENOMEM); + + uuari = &context->uuari; + mutex_init(&uuari->lock); + uars = kcalloc(num_uars, sizeof(*uars), GFP_KERNEL); + if (!uars) { + err = -ENOMEM; + goto out_ctx; + } + + uuari->bitmap = kcalloc(BITS_TO_LONGS(gross_uuars), + sizeof(*uuari->bitmap), + GFP_KERNEL); + if (!uuari->bitmap) { + err = -ENOMEM; + goto out_uar_ctx; + } + /* + * clear all fast path uuars + */ + for (i = 0; i < gross_uuars; i++) { + uuarn = i & 3; + if (uuarn == 2 || uuarn == 3) + set_bit(i, uuari->bitmap); + } + + uuari->count = kcalloc(gross_uuars, sizeof(*uuari->count), GFP_KERNEL); + if (!uuari->count) { + err = -ENOMEM; + goto out_bitmap; + } + + for (i = 0; i < num_uars; i++) { + err = mlx5_cmd_alloc_uar(dev->mdev, &uars[i].index); + if (err) + goto out_count; + } + +#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING + context->ibucontext.invalidate_range = &mlx5_ib_invalidate_range; +#endif + + INIT_LIST_HEAD(&context->db_page_list); + mutex_init(&context->db_page_mutex); + + resp.tot_uuars 
= req.total_num_uuars; + resp.num_ports = gen->num_ports; + err = ib_copy_to_udata(udata, &resp, + sizeof(resp) - sizeof(resp.reserved)); + if (err) + goto out_uars; + + uuari->ver = ver; + uuari->num_low_latency_uuars = req.num_low_latency_uuars; + uuari->uars = uars; + uuari->num_uars = num_uars; + return &context->ibucontext; + +out_uars: + for (i--; i >= 0; i--) + mlx5_cmd_free_uar(dev->mdev, uars[i].index); +out_count: + kfree(uuari->count); + +out_bitmap: + kfree(uuari->bitmap); + +out_uar_ctx: + kfree(uars); + +out_ctx: + kfree(context); + return ERR_PTR(err); +} + +static int mlx5_ib_dealloc_ucontext(struct ib_ucontext *ibcontext) +{ + struct mlx5_ib_ucontext *context = to_mucontext(ibcontext); + struct mlx5_ib_dev *dev = to_mdev(ibcontext->device); + struct mlx5_uuar_info *uuari = &context->uuari; + int i; + + for (i = 0; i < uuari->num_uars; i++) { + if (mlx5_cmd_free_uar(dev->mdev, uuari->uars[i].index)) + mlx5_ib_warn(dev, "failed to free UAR 0x%x\n", uuari->uars[i].index); + } + + kfree(uuari->count); + kfree(uuari->bitmap); + kfree(uuari->uars); + kfree(context); + + return 0; +} + +static phys_addr_t uar_index2pfn(struct mlx5_ib_dev *dev, int index) +{ + return (pci_resource_start(dev->mdev->pdev, 0) >> PAGE_SHIFT) + index; +} + +static int get_command(unsigned long offset) +{ + return (offset >> MLX5_IB_MMAP_CMD_SHIFT) & MLX5_IB_MMAP_CMD_MASK; +} + +static int get_arg(unsigned long offset) +{ + return offset & ((1 << MLX5_IB_MMAP_CMD_SHIFT) - 1); +} + +static int get_index(unsigned long offset) +{ + return get_arg(offset); +} + +static int mlx5_ib_mmap(struct ib_ucontext *ibcontext, struct vm_area_struct *vma) +{ + struct mlx5_ib_ucontext *context = to_mucontext(ibcontext); + struct mlx5_ib_dev *dev = to_mdev(ibcontext->device); + struct mlx5_uuar_info *uuari = &context->uuari; + unsigned long command; + unsigned long idx; + phys_addr_t pfn; + + command = get_command(vma->vm_pgoff); + switch (command) { + case MLX5_IB_MMAP_REGULAR_PAGE: + if (vma->vm_end - vma->vm_start != PAGE_SIZE) + return -EINVAL; + + idx = get_index(vma->vm_pgoff); + if (idx >= uuari->num_uars) + return -EINVAL; + + pfn = uar_index2pfn(dev, uuari->uars[idx].index); + mlx5_ib_dbg(dev, "uar idx 0x%lx, pfn 0x%llx\n", idx, + (unsigned long long)pfn); + + vma->vm_page_prot = pgprot_writecombine(vma->vm_page_prot); + if (io_remap_pfn_range(vma, vma->vm_start, pfn, + PAGE_SIZE, vma->vm_page_prot)) + return -EAGAIN; + + mlx5_ib_dbg(dev, "mapped WC at 0x%lx, PA 0x%llx\n", + vma->vm_start, + (unsigned long long)pfn << PAGE_SHIFT); + break; + + case MLX5_IB_MMAP_GET_CONTIGUOUS_PAGES: + return -ENOSYS; + + default: + return -EINVAL; + } + + return 0; +} + +static int alloc_pa_mkey(struct mlx5_ib_dev *dev, u32 *key, u32 pdn) +{ + struct mlx5_create_mkey_mbox_in *in; + struct mlx5_mkey_seg *seg; + struct mlx5_core_mr mr; + int err; + + in = kzalloc(sizeof(*in), GFP_KERNEL); + if (!in) + return -ENOMEM; + + seg = &in->seg; + seg->flags = MLX5_PERM_LOCAL_READ | MLX5_ACCESS_MODE_PA; + seg->flags_pd = cpu_to_be32(pdn | MLX5_MKEY_LEN64); + seg->qpn_mkey7_0 = cpu_to_be32(0xffffff << 8); + seg->start_addr = 0; + + err = mlx5_core_create_mkey(dev->mdev, &mr, in, sizeof(*in), + NULL, NULL, NULL); + if (err) { + mlx5_ib_warn(dev, "failed to create mkey, %d\n", err); + goto err_in; + } + + kfree(in); + *key = mr.key; + + return 0; + +err_in: + kfree(in); + + return err; +} + +static void free_pa_mkey(struct mlx5_ib_dev *dev, u32 key) +{ + struct mlx5_core_mr mr; + int err; + + memset(&mr, 0, sizeof(mr)); + mr.key = key; + err 
= mlx5_core_destroy_mkey(dev->mdev, &mr); + if (err) + mlx5_ib_warn(dev, "failed to destroy mkey 0x%x\n", key); +} + +static struct ib_pd *mlx5_ib_alloc_pd(struct ib_device *ibdev, + struct ib_ucontext *context, + struct ib_udata *udata) +{ + struct mlx5_ib_alloc_pd_resp resp; + struct mlx5_ib_pd *pd; + int err; + + pd = kmalloc(sizeof(*pd), GFP_KERNEL); + if (!pd) + return ERR_PTR(-ENOMEM); + + err = mlx5_core_alloc_pd(to_mdev(ibdev)->mdev, &pd->pdn); + if (err) { + kfree(pd); + return ERR_PTR(err); + } + + if (context) { + resp.pdn = pd->pdn; + if (ib_copy_to_udata(udata, &resp, sizeof(resp))) { + mlx5_core_dealloc_pd(to_mdev(ibdev)->mdev, pd->pdn); + kfree(pd); + return ERR_PTR(-EFAULT); + } + } else { + err = alloc_pa_mkey(to_mdev(ibdev), &pd->pa_lkey, pd->pdn); + if (err) { + mlx5_core_dealloc_pd(to_mdev(ibdev)->mdev, pd->pdn); + kfree(pd); + return ERR_PTR(err); + } + } + + return &pd->ibpd; +} + +static int mlx5_ib_dealloc_pd(struct ib_pd *pd) +{ + struct mlx5_ib_dev *mdev = to_mdev(pd->device); + struct mlx5_ib_pd *mpd = to_mpd(pd); + + if (!pd->uobject) + free_pa_mkey(mdev, mpd->pa_lkey); + + mlx5_core_dealloc_pd(mdev->mdev, mpd->pdn); + kfree(mpd); + + return 0; +} + +static int mlx5_ib_mcg_attach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid) +{ + struct mlx5_ib_dev *dev = to_mdev(ibqp->device); + int err; + + err = mlx5_core_attach_mcg(dev->mdev, gid, ibqp->qp_num); + if (err) + mlx5_ib_warn(dev, "failed attaching QPN 0x%x, MGID %pI6\n", + ibqp->qp_num, gid->raw); + + return err; +} + +static int mlx5_ib_mcg_detach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid) +{ + struct mlx5_ib_dev *dev = to_mdev(ibqp->device); + int err; + + err = mlx5_core_detach_mcg(dev->mdev, gid, ibqp->qp_num); + if (err) + mlx5_ib_warn(dev, "failed detaching QPN 0x%x, MGID %pI6\n", + ibqp->qp_num, gid->raw); + + return err; +} + +static int init_node_data(struct mlx5_ib_dev *dev) +{ + struct ib_smp *in_mad = NULL; + struct ib_smp *out_mad = NULL; + int err = -ENOMEM; + + in_mad = kzalloc(sizeof(*in_mad), GFP_KERNEL); + out_mad = kmalloc(sizeof(*out_mad), GFP_KERNEL); + if (!in_mad || !out_mad) + goto out; + + init_query_mad(in_mad); + in_mad->attr_id = IB_SMP_ATTR_NODE_DESC; + + err = mlx5_MAD_IFC(dev, 1, 1, 1, NULL, NULL, in_mad, out_mad); + if (err) + goto out; + + memcpy(dev->ib_dev.node_desc, out_mad->data, 64); + + in_mad->attr_id = IB_SMP_ATTR_NODE_INFO; + + err = mlx5_MAD_IFC(dev, 1, 1, 1, NULL, NULL, in_mad, out_mad); + if (err) + goto out; + + dev->mdev->rev_id = be32_to_cpup((__be32 *)(out_mad->data + 32)); + memcpy(&dev->ib_dev.node_guid, out_mad->data + 12, 8); + +out: + kfree(in_mad); + kfree(out_mad); + return err; +} + +static ssize_t show_fw_pages(struct device *device, struct device_attribute *attr, + char *buf) +{ + struct mlx5_ib_dev *dev = + container_of(device, struct mlx5_ib_dev, ib_dev.dev); + + return sprintf(buf, "%d\n", dev->mdev->priv.fw_pages); +} + +static ssize_t show_reg_pages(struct device *device, + struct device_attribute *attr, char *buf) +{ + struct mlx5_ib_dev *dev = + container_of(device, struct mlx5_ib_dev, ib_dev.dev); + + return sprintf(buf, "%d\n", atomic_read(&dev->mdev->priv.reg_pages)); +} + +static ssize_t show_hca(struct device *device, struct device_attribute *attr, + char *buf) +{ + struct mlx5_ib_dev *dev = + container_of(device, struct mlx5_ib_dev, ib_dev.dev); + return sprintf(buf, "MT%d\n", dev->mdev->pdev->device); +} + +static ssize_t show_fw_ver(struct device *device, struct device_attribute *attr, + char *buf) +{ + struct mlx5_ib_dev *dev = + 
container_of(device, struct mlx5_ib_dev, ib_dev.dev); + return sprintf(buf, "%d.%d.%d\n", fw_rev_maj(dev->mdev), + fw_rev_min(dev->mdev), fw_rev_sub(dev->mdev)); +} + +static ssize_t show_rev(struct device *device, struct device_attribute *attr, + char *buf) +{ + struct mlx5_ib_dev *dev = + container_of(device, struct mlx5_ib_dev, ib_dev.dev); + return sprintf(buf, "%x\n", dev->mdev->rev_id); +} + +static ssize_t show_board(struct device *device, struct device_attribute *attr, + char *buf) +{ + struct mlx5_ib_dev *dev = + container_of(device, struct mlx5_ib_dev, ib_dev.dev); + return sprintf(buf, "%.*s\n", MLX5_BOARD_ID_LEN, + dev->mdev->board_id); +} + +static DEVICE_ATTR(hw_rev, S_IRUGO, show_rev, NULL); +static DEVICE_ATTR(fw_ver, S_IRUGO, show_fw_ver, NULL); +static DEVICE_ATTR(hca_type, S_IRUGO, show_hca, NULL); +static DEVICE_ATTR(board_id, S_IRUGO, show_board, NULL); +static DEVICE_ATTR(fw_pages, S_IRUGO, show_fw_pages, NULL); +static DEVICE_ATTR(reg_pages, S_IRUGO, show_reg_pages, NULL); + +static struct device_attribute *mlx5_class_attributes[] = { + &dev_attr_hw_rev, + &dev_attr_fw_ver, + &dev_attr_hca_type, + &dev_attr_board_id, + &dev_attr_fw_pages, + &dev_attr_reg_pages, +}; + +static void mlx5_ib_event(struct mlx5_core_dev *dev, void *context, + enum mlx5_dev_event event, unsigned long param) +{ + struct mlx5_ib_dev *ibdev = (struct mlx5_ib_dev *)context; + struct ib_event ibev; + + u8 port = 0; + + switch (event) { + case MLX5_DEV_EVENT_SYS_ERROR: + ibdev->ib_active = false; + ibev.event = IB_EVENT_DEVICE_FATAL; + break; + + case MLX5_DEV_EVENT_PORT_UP: + ibev.event = IB_EVENT_PORT_ACTIVE; + port = (u8)param; + break; + + case MLX5_DEV_EVENT_PORT_DOWN: + ibev.event = IB_EVENT_PORT_ERR; + port = (u8)param; + break; + + case MLX5_DEV_EVENT_PORT_INITIALIZED: + /* not used by ULPs */ + return; + + case MLX5_DEV_EVENT_LID_CHANGE: + ibev.event = IB_EVENT_LID_CHANGE; + port = (u8)param; + break; + + case MLX5_DEV_EVENT_PKEY_CHANGE: + ibev.event = IB_EVENT_PKEY_CHANGE; + port = (u8)param; + break; + + case MLX5_DEV_EVENT_GUID_CHANGE: + ibev.event = IB_EVENT_GID_CHANGE; + port = (u8)param; + break; + + case MLX5_DEV_EVENT_CLIENT_REREG: + ibev.event = IB_EVENT_CLIENT_REREGISTER; + port = (u8)param; + break; + } + + ibev.device = &ibdev->ib_dev; + ibev.element.port_num = port; + + if (port < 1 || port > ibdev->num_ports) { + mlx5_ib_warn(ibdev, "warning: event on port %d\n", port); + return; + } + + if (ibdev->ib_active) + ib_dispatch_event(&ibev); +} + +static void get_ext_port_caps(struct mlx5_ib_dev *dev) +{ + struct mlx5_general_caps *gen; + int port; + + gen = &dev->mdev->caps.gen; + for (port = 1; port <= gen->num_ports; port++) + mlx5_query_ext_port_caps(dev, port); +} + +static int get_port_caps(struct mlx5_ib_dev *dev) +{ + struct ib_device_attr *dprops = NULL; + struct ib_port_attr *pprops = NULL; + struct mlx5_general_caps *gen; + int err = -ENOMEM; + int port; + + gen = &dev->mdev->caps.gen; + pprops = kmalloc(sizeof(*pprops), GFP_KERNEL); + if (!pprops) + goto out; + + dprops = kmalloc(sizeof(*dprops), GFP_KERNEL); + if (!dprops) + goto out; + + err = mlx5_ib_query_device(&dev->ib_dev, dprops); + if (err) { + mlx5_ib_warn(dev, "query_device failed %d\n", err); + goto out; + } + + for (port = 1; port <= gen->num_ports; port++) { + err = mlx5_ib_query_port(&dev->ib_dev, port, pprops); + if (err) { + mlx5_ib_warn(dev, "query_port %d failed %d\n", port, err); + break; + } + gen->port[port - 1].pkey_table_len = dprops->max_pkeys; + gen->port[port - 1].gid_table_len = 
pprops->gid_tbl_len; + mlx5_ib_dbg(dev, "pkey_table_len %d, gid_table_len %d\n", + dprops->max_pkeys, pprops->gid_tbl_len); + } + +out: + kfree(pprops); + kfree(dprops); + + return err; +} + +static void destroy_umrc_res(struct mlx5_ib_dev *dev) +{ + int err; + + err = mlx5_mr_cache_cleanup(dev); + if (err) + mlx5_ib_warn(dev, "mr cache cleanup failed\n"); + + mlx5_ib_destroy_qp(dev->umrc.qp); + ib_destroy_cq(dev->umrc.cq); + ib_dereg_mr(dev->umrc.mr); + ib_dealloc_pd(dev->umrc.pd); +} + +enum { + MAX_UMR_WR = 128, +}; + +static int create_umr_res(struct mlx5_ib_dev *dev) +{ + struct ib_qp_init_attr *init_attr = NULL; + struct ib_qp_attr *attr = NULL; + struct ib_pd *pd; + struct ib_cq *cq; + struct ib_qp *qp; + struct ib_mr *mr; + int ret; + + attr = kzalloc(sizeof(*attr), GFP_KERNEL); + init_attr = kzalloc(sizeof(*init_attr), GFP_KERNEL); + if (!attr || !init_attr) { + ret = -ENOMEM; + goto error_0; + } + + pd = ib_alloc_pd(&dev->ib_dev); + if (IS_ERR(pd)) { + mlx5_ib_dbg(dev, "Couldn't create PD for sync UMR QP\n"); + ret = PTR_ERR(pd); + goto error_0; + } + + mr = ib_get_dma_mr(pd, IB_ACCESS_LOCAL_WRITE); + if (IS_ERR(mr)) { + mlx5_ib_dbg(dev, "Couldn't create DMA MR for sync UMR QP\n"); + ret = PTR_ERR(mr); + goto error_1; + } + + cq = ib_create_cq(&dev->ib_dev, mlx5_umr_cq_handler, NULL, NULL, 128, + 0); + if (IS_ERR(cq)) { + mlx5_ib_dbg(dev, "Couldn't create CQ for sync UMR QP\n"); + ret = PTR_ERR(cq); + goto error_2; + } + ib_req_notify_cq(cq, IB_CQ_NEXT_COMP); + + init_attr->send_cq = cq; + init_attr->recv_cq = cq; + init_attr->sq_sig_type = IB_SIGNAL_ALL_WR; + init_attr->cap.max_send_wr = MAX_UMR_WR; + init_attr->cap.max_send_sge = 1; + init_attr->qp_type = MLX5_IB_QPT_REG_UMR; + init_attr->port_num = 1; + qp = mlx5_ib_create_qp(pd, init_attr, NULL); + if (IS_ERR(qp)) { + mlx5_ib_dbg(dev, "Couldn't create sync UMR QP\n"); + ret = PTR_ERR(qp); + goto error_3; + } + qp->device = &dev->ib_dev; + qp->real_qp = qp; + qp->uobject = NULL; + qp->qp_type = MLX5_IB_QPT_REG_UMR; + + attr->qp_state = IB_QPS_INIT; + attr->port_num = 1; + ret = mlx5_ib_modify_qp(qp, attr, IB_QP_STATE | IB_QP_PKEY_INDEX | + IB_QP_PORT, NULL); + if (ret) { + mlx5_ib_dbg(dev, "Couldn't modify UMR QP\n"); + goto error_4; + } + + memset(attr, 0, sizeof(*attr)); + attr->qp_state = IB_QPS_RTR; + attr->path_mtu = IB_MTU_256; + + ret = mlx5_ib_modify_qp(qp, attr, IB_QP_STATE, NULL); + if (ret) { + mlx5_ib_dbg(dev, "Couldn't modify umr QP to rtr\n"); + goto error_4; + } + + memset(attr, 0, sizeof(*attr)); + attr->qp_state = IB_QPS_RTS; + ret = mlx5_ib_modify_qp(qp, attr, IB_QP_STATE, NULL); + if (ret) { + mlx5_ib_dbg(dev, "Couldn't modify umr QP to rts\n"); + goto error_4; + } + + dev->umrc.qp = qp; + dev->umrc.cq = cq; + dev->umrc.mr = mr; + dev->umrc.pd = pd; + + sema_init(&dev->umrc.sem, MAX_UMR_WR); + ret = mlx5_mr_cache_init(dev); + if (ret) { + mlx5_ib_warn(dev, "mr cache init failed %d\n", ret); + goto error_4; + } + + kfree(attr); + kfree(init_attr); + + return 0; + +error_4: + mlx5_ib_destroy_qp(qp); + +error_3: + ib_destroy_cq(cq); + +error_2: + ib_dereg_mr(mr); + +error_1: + ib_dealloc_pd(pd); + +error_0: + kfree(attr); + kfree(init_attr); + return ret; +} + +static int create_dev_resources(struct mlx5_ib_resources *devr) +{ + struct ib_srq_init_attr attr; + struct mlx5_ib_dev *dev; + int ret = 0; + + dev = container_of(devr, struct mlx5_ib_dev, devr); + + devr->p0 = mlx5_ib_alloc_pd(&dev->ib_dev, NULL, NULL); + if (IS_ERR(devr->p0)) { + ret = PTR_ERR(devr->p0); + goto error0; + } + devr->p0->device = 
&dev->ib_dev; + devr->p0->uobject = NULL; + atomic_set(&devr->p0->usecnt, 0); + + devr->c0 = mlx5_ib_create_cq(&dev->ib_dev, 1, 0, NULL, NULL); + if (IS_ERR(devr->c0)) { + ret = PTR_ERR(devr->c0); + goto error1; + } + devr->c0->device = &dev->ib_dev; + devr->c0->uobject = NULL; + devr->c0->comp_handler = NULL; + devr->c0->event_handler = NULL; + devr->c0->cq_context = NULL; + atomic_set(&devr->c0->usecnt, 0); + + devr->x0 = mlx5_ib_alloc_xrcd(&dev->ib_dev, NULL, NULL); + if (IS_ERR(devr->x0)) { + ret = PTR_ERR(devr->x0); + goto error2; + } + devr->x0->device = &dev->ib_dev; + devr->x0->inode = NULL; + atomic_set(&devr->x0->usecnt, 0); + mutex_init(&devr->x0->tgt_qp_mutex); + INIT_LIST_HEAD(&devr->x0->tgt_qp_list); + + devr->x1 = mlx5_ib_alloc_xrcd(&dev->ib_dev, NULL, NULL); + if (IS_ERR(devr->x1)) { + ret = PTR_ERR(devr->x1); + goto error3; + } + devr->x1->device = &dev->ib_dev; + devr->x1->inode = NULL; + atomic_set(&devr->x1->usecnt, 0); + mutex_init(&devr->x1->tgt_qp_mutex); + INIT_LIST_HEAD(&devr->x1->tgt_qp_list); + + memset(&attr, 0, sizeof(attr)); + attr.attr.max_sge = 1; + attr.attr.max_wr = 1; + attr.srq_type = IB_SRQT_XRC; + attr.ext.xrc.cq = devr->c0; + attr.ext.xrc.xrcd = devr->x0; + + devr->s0 = mlx5_ib_create_srq(devr->p0, &attr, NULL); + if (IS_ERR(devr->s0)) { + ret = PTR_ERR(devr->s0); + goto error4; + } + devr->s0->device = &dev->ib_dev; + devr->s0->pd = devr->p0; + devr->s0->uobject = NULL; + devr->s0->event_handler = NULL; + devr->s0->srq_context = NULL; + devr->s0->srq_type = IB_SRQT_XRC; + devr->s0->ext.xrc.xrcd = devr->x0; + devr->s0->ext.xrc.cq = devr->c0; + atomic_inc(&devr->s0->ext.xrc.xrcd->usecnt); + atomic_inc(&devr->s0->ext.xrc.cq->usecnt); + atomic_inc(&devr->p0->usecnt); + atomic_set(&devr->s0->usecnt, 0); + + return 0; + +error4: + mlx5_ib_dealloc_xrcd(devr->x1); +error3: + mlx5_ib_dealloc_xrcd(devr->x0); +error2: + mlx5_ib_destroy_cq(devr->c0); +error1: + mlx5_ib_dealloc_pd(devr->p0); +error0: + return ret; +} + +static void destroy_dev_resources(struct mlx5_ib_resources *devr) +{ + mlx5_ib_destroy_srq(devr->s0); + mlx5_ib_dealloc_xrcd(devr->x0); + mlx5_ib_dealloc_xrcd(devr->x1); + mlx5_ib_destroy_cq(devr->c0); + mlx5_ib_dealloc_pd(devr->p0); +} + +static void *mlx5_ib_add(struct mlx5_core_dev *mdev) +{ + struct mlx5_ib_dev *dev; + int err; + int i; + + printk_once(KERN_INFO "%s", mlx5_version); + + dev = (struct mlx5_ib_dev *)ib_alloc_device(sizeof(*dev)); + if (!dev) + return NULL; + + dev->mdev = mdev; + + err = get_port_caps(dev); + if (err) + goto err_dealloc; + + get_ext_port_caps(dev); + + MLX5_INIT_DOORBELL_LOCK(&dev->uar_lock); + + strlcpy(dev->ib_dev.name, "mlx5_%d", IB_DEVICE_NAME_MAX); + dev->ib_dev.owner = THIS_MODULE; + dev->ib_dev.node_type = RDMA_NODE_IB_CA; + dev->ib_dev.local_dma_lkey = mdev->caps.gen.reserved_lkey; + dev->num_ports = mdev->caps.gen.num_ports; + dev->ib_dev.phys_port_cnt = dev->num_ports; + dev->ib_dev.num_comp_vectors = + dev->mdev->priv.eq_table.num_comp_vectors; + dev->ib_dev.dma_device = &mdev->pdev->dev; + + dev->ib_dev.uverbs_abi_ver = MLX5_IB_UVERBS_ABI_VERSION; + dev->ib_dev.uverbs_cmd_mask = + (1ull << IB_USER_VERBS_CMD_GET_CONTEXT) | + (1ull << IB_USER_VERBS_CMD_QUERY_DEVICE) | + (1ull << IB_USER_VERBS_CMD_QUERY_PORT) | + (1ull << IB_USER_VERBS_CMD_ALLOC_PD) | + (1ull << IB_USER_VERBS_CMD_DEALLOC_PD) | + (1ull << IB_USER_VERBS_CMD_REG_MR) | + (1ull << IB_USER_VERBS_CMD_DEREG_MR) | + (1ull << IB_USER_VERBS_CMD_CREATE_COMP_CHANNEL) | + (1ull << IB_USER_VERBS_CMD_CREATE_CQ) | + (1ull << IB_USER_VERBS_CMD_RESIZE_CQ) 
| + (1ull << IB_USER_VERBS_CMD_DESTROY_CQ) | + (1ull << IB_USER_VERBS_CMD_CREATE_QP) | + (1ull << IB_USER_VERBS_CMD_MODIFY_QP) | + (1ull << IB_USER_VERBS_CMD_QUERY_QP) | + (1ull << IB_USER_VERBS_CMD_DESTROY_QP) | + (1ull << IB_USER_VERBS_CMD_ATTACH_MCAST) | + (1ull << IB_USER_VERBS_CMD_DETACH_MCAST) | + (1ull << IB_USER_VERBS_CMD_CREATE_SRQ) | + (1ull << IB_USER_VERBS_CMD_MODIFY_SRQ) | + (1ull << IB_USER_VERBS_CMD_QUERY_SRQ) | + (1ull << IB_USER_VERBS_CMD_DESTROY_SRQ) | + (1ull << IB_USER_VERBS_CMD_CREATE_XSRQ) | + (1ull << IB_USER_VERBS_CMD_OPEN_QP); + dev->ib_dev.uverbs_ex_cmd_mask = + (1ull << IB_USER_VERBS_EX_CMD_QUERY_DEVICE); + + dev->ib_dev.query_device = mlx5_ib_query_device; + dev->ib_dev.query_port = mlx5_ib_query_port; + dev->ib_dev.query_gid = mlx5_ib_query_gid; + dev->ib_dev.query_pkey = mlx5_ib_query_pkey; + dev->ib_dev.modify_device = mlx5_ib_modify_device; + dev->ib_dev.modify_port = mlx5_ib_modify_port; + dev->ib_dev.alloc_ucontext = mlx5_ib_alloc_ucontext; + dev->ib_dev.dealloc_ucontext = mlx5_ib_dealloc_ucontext; + dev->ib_dev.mmap = mlx5_ib_mmap; + dev->ib_dev.alloc_pd = mlx5_ib_alloc_pd; + dev->ib_dev.dealloc_pd = mlx5_ib_dealloc_pd; + dev->ib_dev.create_ah = mlx5_ib_create_ah; + dev->ib_dev.query_ah = mlx5_ib_query_ah; + dev->ib_dev.destroy_ah = mlx5_ib_destroy_ah; + dev->ib_dev.create_srq = mlx5_ib_create_srq; + dev->ib_dev.modify_srq = mlx5_ib_modify_srq; + dev->ib_dev.query_srq = mlx5_ib_query_srq; + dev->ib_dev.destroy_srq = mlx5_ib_destroy_srq; + dev->ib_dev.post_srq_recv = mlx5_ib_post_srq_recv; + dev->ib_dev.create_qp = mlx5_ib_create_qp; + dev->ib_dev.modify_qp = mlx5_ib_modify_qp; + dev->ib_dev.query_qp = mlx5_ib_query_qp; + dev->ib_dev.destroy_qp = mlx5_ib_destroy_qp; + dev->ib_dev.post_send = mlx5_ib_post_send; + dev->ib_dev.post_recv = mlx5_ib_post_recv; + dev->ib_dev.create_cq = mlx5_ib_create_cq; + dev->ib_dev.modify_cq = mlx5_ib_modify_cq; + dev->ib_dev.resize_cq = mlx5_ib_resize_cq; + dev->ib_dev.destroy_cq = mlx5_ib_destroy_cq; + dev->ib_dev.poll_cq = mlx5_ib_poll_cq; + dev->ib_dev.req_notify_cq = mlx5_ib_arm_cq; + dev->ib_dev.get_dma_mr = mlx5_ib_get_dma_mr; + dev->ib_dev.reg_user_mr = mlx5_ib_reg_user_mr; + dev->ib_dev.dereg_mr = mlx5_ib_dereg_mr; + dev->ib_dev.destroy_mr = mlx5_ib_destroy_mr; + dev->ib_dev.attach_mcast = mlx5_ib_mcg_attach; + dev->ib_dev.detach_mcast = mlx5_ib_mcg_detach; + dev->ib_dev.process_mad = mlx5_ib_process_mad; + dev->ib_dev.create_mr = mlx5_ib_create_mr; + dev->ib_dev.alloc_fast_reg_mr = mlx5_ib_alloc_fast_reg_mr; + dev->ib_dev.alloc_fast_reg_page_list = mlx5_ib_alloc_fast_reg_page_list; + dev->ib_dev.free_fast_reg_page_list = mlx5_ib_free_fast_reg_page_list; + dev->ib_dev.check_mr_status = mlx5_ib_check_mr_status; + + mlx5_ib_internal_query_odp_caps(dev); + + if (mdev->caps.gen.flags & MLX5_DEV_CAP_FLAG_XRC) { + dev->ib_dev.alloc_xrcd = mlx5_ib_alloc_xrcd; + dev->ib_dev.dealloc_xrcd = mlx5_ib_dealloc_xrcd; + dev->ib_dev.uverbs_cmd_mask |= + (1ull << IB_USER_VERBS_CMD_OPEN_XRCD) | + (1ull << IB_USER_VERBS_CMD_CLOSE_XRCD); + } + + err = init_node_data(dev); + if (err) + goto err_dealloc; + + mutex_init(&dev->cap_mask_mutex); + + err = create_dev_resources(&dev->devr); + if (err) + goto err_dealloc; + + err = mlx5_ib_odp_init_one(dev); + if (err) + goto err_rsrc; + + err = ib_register_device(&dev->ib_dev, NULL); + if (err) + goto err_odp; + + err = create_umr_res(dev); + if (err) + goto err_dev; + + for (i = 0; i < ARRAY_SIZE(mlx5_class_attributes); i++) { + err = device_create_file(&dev->ib_dev.dev, + 
mlx5_class_attributes[i]); + if (err) + goto err_umrc; + } + + dev->ib_active = true; + + return dev; + +err_umrc: + destroy_umrc_res(dev); + +err_dev: + ib_unregister_device(&dev->ib_dev); + +err_odp: + mlx5_ib_odp_remove_one(dev); + +err_rsrc: + destroy_dev_resources(&dev->devr); + +err_dealloc: + ib_dealloc_device((struct ib_device *)dev); + + return NULL; +} + +static void mlx5_ib_remove(struct mlx5_core_dev *mdev, void *context) +{ + struct mlx5_ib_dev *dev = context; + + ib_unregister_device(&dev->ib_dev); + destroy_umrc_res(dev); + mlx5_ib_odp_remove_one(dev); + destroy_dev_resources(&dev->devr); + ib_dealloc_device(&dev->ib_dev); +} + +static struct mlx5_interface mlx5_ib_interface = { + .add = mlx5_ib_add, + .remove = mlx5_ib_remove, + .event = mlx5_ib_event, + .protocol = MLX5_INTERFACE_PROTOCOL_IB, +}; + +static int __init mlx5_ib_init(void) +{ + int err; + + if (deprecated_prof_sel != 2) + pr_warn("prof_sel is deprecated for mlx5_ib, set it for mlx5_core\n"); + + err = mlx5_ib_odp_init(); + if (err) + return err; + + err = mlx5_register_interface(&mlx5_ib_interface); + if (err) + goto clean_odp; + + return err; + +clean_odp: + mlx5_ib_odp_cleanup(); + return err; +} + +static void __exit mlx5_ib_cleanup(void) +{ + mlx5_unregister_interface(&mlx5_ib_interface); + mlx5_ib_odp_cleanup(); +} + +module_init(mlx5_ib_init); +module_exit(mlx5_ib_cleanup); diff --git a/kernel/drivers/infiniband/hw/mlx5/mem.c b/kernel/drivers/infiniband/hw/mlx5/mem.c new file mode 100644 index 000000000..40df2cca0 --- /dev/null +++ b/kernel/drivers/infiniband/hw/mlx5/mem.c @@ -0,0 +1,225 @@ +/* + * Copyright (c) 2013-2015, Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include +#include +#include +#include "mlx5_ib.h" + +/* @umem: umem object to scan + * @addr: ib virtual address requested by the user + * @count: number of PAGE_SIZE pages covered by umem + * @shift: page shift for the compound pages found in the region + * @ncont: number of compund pages + * @order: log2 of the number of compound pages + */ +void mlx5_ib_cont_pages(struct ib_umem *umem, u64 addr, int *count, int *shift, + int *ncont, int *order) +{ + unsigned long tmp; + unsigned long m; + int i, k; + u64 base = 0; + int p = 0; + int skip; + int mask; + u64 len; + u64 pfn; + struct scatterlist *sg; + int entry; + unsigned long page_shift = ilog2(umem->page_size); + + /* With ODP we must always match OS page size. */ + if (umem->odp_data) { + *count = ib_umem_page_count(umem); + *shift = PAGE_SHIFT; + *ncont = *count; + if (order) + *order = ilog2(roundup_pow_of_two(*count)); + + return; + } + + addr = addr >> page_shift; + tmp = (unsigned long)addr; + m = find_first_bit(&tmp, sizeof(tmp)); + skip = 1 << m; + mask = skip - 1; + i = 0; + for_each_sg(umem->sg_head.sgl, sg, umem->nmap, entry) { + len = sg_dma_len(sg) >> page_shift; + pfn = sg_dma_address(sg) >> page_shift; + for (k = 0; k < len; k++) { + if (!(i & mask)) { + tmp = (unsigned long)pfn; + m = min_t(unsigned long, m, find_first_bit(&tmp, sizeof(tmp))); + skip = 1 << m; + mask = skip - 1; + base = pfn; + p = 0; + } else { + if (base + p != pfn) { + tmp = (unsigned long)p; + m = find_first_bit(&tmp, sizeof(tmp)); + skip = 1 << m; + mask = skip - 1; + base = pfn; + p = 0; + } + } + p++; + i++; + } + } + + if (i) { + m = min_t(unsigned long, ilog2(roundup_pow_of_two(i)), m); + + if (order) + *order = ilog2(roundup_pow_of_two(i) >> m); + + *ncont = DIV_ROUND_UP(i, (1 << m)); + } else { + m = 0; + + if (order) + *order = 0; + + *ncont = 0; + } + *shift = page_shift + m; + *count = i; +} + +#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING +static u64 umem_dma_to_mtt(dma_addr_t umem_dma) +{ + u64 mtt_entry = umem_dma & ODP_DMA_ADDR_MASK; + + if (umem_dma & ODP_READ_ALLOWED_BIT) + mtt_entry |= MLX5_IB_MTT_READ; + if (umem_dma & ODP_WRITE_ALLOWED_BIT) + mtt_entry |= MLX5_IB_MTT_WRITE; + + return mtt_entry; +} +#endif + +/* + * Populate the given array with bus addresses from the umem. + * + * dev - mlx5_ib device + * umem - umem to use to fill the pages + * page_shift - determines the page size used in the resulting array + * offset - offset into the umem to start from, + * only implemented for ODP umems + * num_pages - total number of pages to fill + * pas - bus addresses array to fill + * access_flags - access flags to set on all present pages. + use enum mlx5_ib_mtt_access_flags for this. 
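+ *
+ * Illustrative call (hypothetical buffers, marking every MTT entry present),
+ * equivalent to going through the mlx5_ib_populate_pas() wrapper below:
+ *
+ *   __mlx5_ib_populate_pas(dev, umem, page_shift, 0,
+ *                          ib_umem_num_pages(umem), pas,
+ *                          MLX5_IB_MTT_PRESENT);
+ *
+ * Each entry written to pas[] is the page bus address with the access flags
+ * ORed into its low-order bits, stored as big-endian.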
+ */ +void __mlx5_ib_populate_pas(struct mlx5_ib_dev *dev, struct ib_umem *umem, + int page_shift, size_t offset, size_t num_pages, + __be64 *pas, int access_flags) +{ + unsigned long umem_page_shift = ilog2(umem->page_size); + int shift = page_shift - umem_page_shift; + int mask = (1 << shift) - 1; + int i, k; + u64 cur = 0; + u64 base; + int len; + struct scatterlist *sg; + int entry; +#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING + const bool odp = umem->odp_data != NULL; + + if (odp) { + WARN_ON(shift != 0); + WARN_ON(access_flags != (MLX5_IB_MTT_READ | MLX5_IB_MTT_WRITE)); + + for (i = 0; i < num_pages; ++i) { + dma_addr_t pa = umem->odp_data->dma_list[offset + i]; + + pas[i] = cpu_to_be64(umem_dma_to_mtt(pa)); + } + return; + } +#endif + + i = 0; + for_each_sg(umem->sg_head.sgl, sg, umem->nmap, entry) { + len = sg_dma_len(sg) >> umem_page_shift; + base = sg_dma_address(sg); + for (k = 0; k < len; k++) { + if (!(i & mask)) { + cur = base + (k << umem_page_shift); + cur |= access_flags; + + pas[i >> shift] = cpu_to_be64(cur); + mlx5_ib_dbg(dev, "pas[%d] 0x%llx\n", + i >> shift, be64_to_cpu(pas[i >> shift])); + } else + mlx5_ib_dbg(dev, "=====> 0x%llx\n", + base + (k << umem_page_shift)); + i++; + } + } +} + +void mlx5_ib_populate_pas(struct mlx5_ib_dev *dev, struct ib_umem *umem, + int page_shift, __be64 *pas, int access_flags) +{ + return __mlx5_ib_populate_pas(dev, umem, page_shift, 0, + ib_umem_num_pages(umem), pas, + access_flags); +} +int mlx5_ib_get_buf_offset(u64 addr, int page_shift, u32 *offset) +{ + u64 page_size; + u64 page_mask; + u64 off_size; + u64 off_mask; + u64 buf_off; + + page_size = (u64)1 << page_shift; + page_mask = page_size - 1; + buf_off = addr & page_mask; + off_size = page_size >> 6; + off_mask = off_size - 1; + + if (buf_off & off_mask) + return -EINVAL; + + *offset = buf_off >> ilog2(off_size); + return 0; +} diff --git a/kernel/drivers/infiniband/hw/mlx5/mlx5_ib.h b/kernel/drivers/infiniband/hw/mlx5/mlx5_ib.h new file mode 100644 index 000000000..dff1cfcdf --- /dev/null +++ b/kernel/drivers/infiniband/hw/mlx5/mlx5_ib.h @@ -0,0 +1,669 @@ +/* + * Copyright (c) 2013-2015, Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#ifndef MLX5_IB_H +#define MLX5_IB_H + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define mlx5_ib_dbg(dev, format, arg...) \ +pr_debug("%s:%s:%d:(pid %d): " format, (dev)->ib_dev.name, __func__, \ + __LINE__, current->pid, ##arg) + +#define mlx5_ib_err(dev, format, arg...) \ +pr_err("%s:%s:%d:(pid %d): " format, (dev)->ib_dev.name, __func__, \ + __LINE__, current->pid, ##arg) + +#define mlx5_ib_warn(dev, format, arg...) \ +pr_warn("%s:%s:%d:(pid %d): " format, (dev)->ib_dev.name, __func__, \ + __LINE__, current->pid, ##arg) + +enum { + MLX5_IB_MMAP_CMD_SHIFT = 8, + MLX5_IB_MMAP_CMD_MASK = 0xff, +}; + +enum mlx5_ib_mmap_cmd { + MLX5_IB_MMAP_REGULAR_PAGE = 0, + MLX5_IB_MMAP_GET_CONTIGUOUS_PAGES = 1, /* always last */ +}; + +enum { + MLX5_RES_SCAT_DATA32_CQE = 0x1, + MLX5_RES_SCAT_DATA64_CQE = 0x2, + MLX5_REQ_SCAT_DATA32_CQE = 0x11, + MLX5_REQ_SCAT_DATA64_CQE = 0x22, +}; + +enum mlx5_ib_latency_class { + MLX5_IB_LATENCY_CLASS_LOW, + MLX5_IB_LATENCY_CLASS_MEDIUM, + MLX5_IB_LATENCY_CLASS_HIGH, + MLX5_IB_LATENCY_CLASS_FAST_PATH +}; + +enum mlx5_ib_mad_ifc_flags { + MLX5_MAD_IFC_IGNORE_MKEY = 1, + MLX5_MAD_IFC_IGNORE_BKEY = 2, + MLX5_MAD_IFC_NET_VIEW = 4, +}; + +struct mlx5_ib_ucontext { + struct ib_ucontext ibucontext; + struct list_head db_page_list; + + /* protect doorbell record alloc/free + */ + struct mutex db_page_mutex; + struct mlx5_uuar_info uuari; +}; + +static inline struct mlx5_ib_ucontext *to_mucontext(struct ib_ucontext *ibucontext) +{ + return container_of(ibucontext, struct mlx5_ib_ucontext, ibucontext); +} + +struct mlx5_ib_pd { + struct ib_pd ibpd; + u32 pdn; + u32 pa_lkey; +}; + +/* Use macros here so that don't have to duplicate + * enum ib_send_flags and enum ib_qp_type for low-level driver + */ + +#define MLX5_IB_SEND_UMR_UNREG IB_SEND_RESERVED_START +#define MLX5_IB_SEND_UMR_FAIL_IF_FREE (IB_SEND_RESERVED_START << 1) +#define MLX5_IB_SEND_UMR_UPDATE_MTT (IB_SEND_RESERVED_START << 2) +#define MLX5_IB_QPT_REG_UMR IB_QPT_RESERVED1 +#define MLX5_IB_WR_UMR IB_WR_RESERVED1 + +struct wr_list { + u16 opcode; + u16 next; +}; + +struct mlx5_ib_wq { + u64 *wrid; + u32 *wr_data; + struct wr_list *w_list; + unsigned *wqe_head; + u16 unsig_count; + + /* serialize post to the work queue + */ + spinlock_t lock; + int wqe_cnt; + int max_post; + int max_gs; + int offset; + int wqe_shift; + unsigned head; + unsigned tail; + u16 cur_post; + u16 last_poll; + void *qend; +}; + +enum { + MLX5_QP_USER, + MLX5_QP_KERNEL, + MLX5_QP_EMPTY +}; + +/* + * Connect-IB can trigger up to four concurrent pagefaults + * per-QP. 
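+ * The four contexts are the requestor/responder and read/write combinations
+ * enumerated below; mlx5_ib_get_pagefault_context() maps a pagefault's
+ * MLX5_PFAULT_REQUESTOR and MLX5_PFAULT_WRITE flags to one of them.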
+ */ +enum mlx5_ib_pagefault_context { + MLX5_IB_PAGEFAULT_RESPONDER_READ, + MLX5_IB_PAGEFAULT_REQUESTOR_READ, + MLX5_IB_PAGEFAULT_RESPONDER_WRITE, + MLX5_IB_PAGEFAULT_REQUESTOR_WRITE, + MLX5_IB_PAGEFAULT_CONTEXTS +}; + +static inline enum mlx5_ib_pagefault_context + mlx5_ib_get_pagefault_context(struct mlx5_pagefault *pagefault) +{ + return pagefault->flags & (MLX5_PFAULT_REQUESTOR | MLX5_PFAULT_WRITE); +} + +struct mlx5_ib_pfault { + struct work_struct work; + struct mlx5_pagefault mpfault; +}; + +struct mlx5_ib_qp { + struct ib_qp ibqp; + struct mlx5_core_qp mqp; + struct mlx5_buf buf; + + struct mlx5_db db; + struct mlx5_ib_wq rq; + + u32 doorbell_qpn; + u8 sq_signal_bits; + u8 fm_cache; + int sq_max_wqes_per_wr; + int sq_spare_wqes; + struct mlx5_ib_wq sq; + + struct ib_umem *umem; + int buf_size; + + /* serialize qp state modifications + */ + struct mutex mutex; + u16 xrcdn; + u32 flags; + u8 port; + u8 alt_port; + u8 atomic_rd_en; + u8 resp_depth; + u8 state; + int mlx_type; + int wq_sig; + int scat_cqe; + int max_inline_data; + struct mlx5_bf *bf; + int has_rq; + + /* only for user space QPs. For kernel + * we have it from the bf object + */ + int uuarn; + + int create_type; + u32 pa_lkey; + + /* Store signature errors */ + bool signature_en; + +#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING + /* + * A flag that is true for QP's that are in a state that doesn't + * allow page faults, and shouldn't schedule any more faults. + */ + int disable_page_faults; + /* + * The disable_page_faults_lock protects a QP's disable_page_faults + * field, allowing for a thread to atomically check whether the QP + * allows page faults, and if so schedule a page fault. + */ + spinlock_t disable_page_faults_lock; + struct mlx5_ib_pfault pagefaults[MLX5_IB_PAGEFAULT_CONTEXTS]; +#endif +}; + +struct mlx5_ib_cq_buf { + struct mlx5_buf buf; + struct ib_umem *umem; + int cqe_size; + int nent; +}; + +enum mlx5_ib_qp_flags { + MLX5_IB_QP_BLOCK_MULTICAST_LOOPBACK = 1 << 0, + MLX5_IB_QP_SIGNATURE_HANDLING = 1 << 1, +}; + +struct mlx5_umr_wr { + union { + u64 virt_addr; + u64 offset; + } target; + struct ib_pd *pd; + unsigned int page_shift; + unsigned int npages; + u32 length; + int access_flags; + u32 mkey; +}; + +struct mlx5_shared_mr_info { + int mr_id; + struct ib_umem *umem; +}; + +struct mlx5_ib_cq { + struct ib_cq ibcq; + struct mlx5_core_cq mcq; + struct mlx5_ib_cq_buf buf; + struct mlx5_db db; + + /* serialize access to the CQ + */ + spinlock_t lock; + + /* protect resize cq + */ + struct mutex resize_mutex; + struct mlx5_ib_cq_buf *resize_buf; + struct ib_umem *resize_umem; + int cqe_size; +}; + +struct mlx5_ib_srq { + struct ib_srq ibsrq; + struct mlx5_core_srq msrq; + struct mlx5_buf buf; + struct mlx5_db db; + u64 *wrid; + /* protect SRQ hanlding + */ + spinlock_t lock; + int head; + int tail; + u16 wqe_ctr; + struct ib_umem *umem; + /* serialize arming a SRQ + */ + struct mutex mutex; + int wq_sig; +}; + +struct mlx5_ib_xrcd { + struct ib_xrcd ibxrcd; + u32 xrcdn; +}; + +enum mlx5_ib_mtt_access_flags { + MLX5_IB_MTT_READ = (1 << 0), + MLX5_IB_MTT_WRITE = (1 << 1), +}; + +#define MLX5_IB_MTT_PRESENT (MLX5_IB_MTT_READ | MLX5_IB_MTT_WRITE) + +struct mlx5_ib_mr { + struct ib_mr ibmr; + struct mlx5_core_mr mmr; + struct ib_umem *umem; + struct mlx5_shared_mr_info *smr_info; + struct list_head list; + int order; + int umred; + int npages; + struct mlx5_ib_dev *dev; + struct mlx5_create_mkey_mbox_out out; + struct mlx5_core_sig_ctx *sig; + int live; +}; + +struct mlx5_ib_fast_reg_page_list { + struct 
ib_fast_reg_page_list ibfrpl; + __be64 *mapped_page_list; + dma_addr_t map; +}; + +struct mlx5_ib_umr_context { + enum ib_wc_status status; + struct completion done; +}; + +static inline void mlx5_ib_init_umr_context(struct mlx5_ib_umr_context *context) +{ + context->status = -1; + init_completion(&context->done); +} + +struct umr_common { + struct ib_pd *pd; + struct ib_cq *cq; + struct ib_qp *qp; + struct ib_mr *mr; + /* control access to UMR QP + */ + struct semaphore sem; +}; + +enum { + MLX5_FMR_INVALID, + MLX5_FMR_VALID, + MLX5_FMR_BUSY, +}; + +struct mlx5_ib_fmr { + struct ib_fmr ibfmr; + struct mlx5_core_mr mr; + int access_flags; + int state; + /* protect fmr state + */ + spinlock_t lock; + u64 wrid; + struct ib_send_wr wr[2]; + u8 page_shift; + struct ib_fast_reg_page_list page_list; +}; + +struct mlx5_cache_ent { + struct list_head head; + /* sync access to the cahce entry + */ + spinlock_t lock; + + + struct dentry *dir; + char name[4]; + u32 order; + u32 size; + u32 cur; + u32 miss; + u32 limit; + + struct dentry *fsize; + struct dentry *fcur; + struct dentry *fmiss; + struct dentry *flimit; + + struct mlx5_ib_dev *dev; + struct work_struct work; + struct delayed_work dwork; + int pending; +}; + +struct mlx5_mr_cache { + struct workqueue_struct *wq; + struct mlx5_cache_ent ent[MAX_MR_CACHE_ENTRIES]; + int stopped; + struct dentry *root; + unsigned long last_add; +}; + +struct mlx5_ib_resources { + struct ib_cq *c0; + struct ib_xrcd *x0; + struct ib_xrcd *x1; + struct ib_pd *p0; + struct ib_srq *s0; +}; + +struct mlx5_ib_dev { + struct ib_device ib_dev; + struct mlx5_core_dev *mdev; + MLX5_DECLARE_DOORBELL_LOCK(uar_lock); + int num_ports; + /* serialize update of capability mask + */ + struct mutex cap_mask_mutex; + bool ib_active; + struct umr_common umrc; + /* sync used page count stats + */ + struct mlx5_ib_resources devr; + struct mlx5_mr_cache cache; + struct timer_list delay_timer; + int fill_delay; +#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING + struct ib_odp_caps odp_caps; + /* + * Sleepable RCU that prevents destruction of MRs while they are still + * being used by a page fault handler. 
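+ * destroy_mkey() in mr.c calls synchronize_srcu() on this SRCU after
+ * destroying an mkey, waiting for any in-flight page-fault handlers to drop
+ * their read-side references before the MR is torn down.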
+ */ + struct srcu_struct mr_srcu; +#endif +}; + +static inline struct mlx5_ib_cq *to_mibcq(struct mlx5_core_cq *mcq) +{ + return container_of(mcq, struct mlx5_ib_cq, mcq); +} + +static inline struct mlx5_ib_xrcd *to_mxrcd(struct ib_xrcd *ibxrcd) +{ + return container_of(ibxrcd, struct mlx5_ib_xrcd, ibxrcd); +} + +static inline struct mlx5_ib_dev *to_mdev(struct ib_device *ibdev) +{ + return container_of(ibdev, struct mlx5_ib_dev, ib_dev); +} + +static inline struct mlx5_ib_fmr *to_mfmr(struct ib_fmr *ibfmr) +{ + return container_of(ibfmr, struct mlx5_ib_fmr, ibfmr); +} + +static inline struct mlx5_ib_cq *to_mcq(struct ib_cq *ibcq) +{ + return container_of(ibcq, struct mlx5_ib_cq, ibcq); +} + +static inline struct mlx5_ib_qp *to_mibqp(struct mlx5_core_qp *mqp) +{ + return container_of(mqp, struct mlx5_ib_qp, mqp); +} + +static inline struct mlx5_ib_mr *to_mibmr(struct mlx5_core_mr *mmr) +{ + return container_of(mmr, struct mlx5_ib_mr, mmr); +} + +static inline struct mlx5_ib_pd *to_mpd(struct ib_pd *ibpd) +{ + return container_of(ibpd, struct mlx5_ib_pd, ibpd); +} + +static inline struct mlx5_ib_srq *to_msrq(struct ib_srq *ibsrq) +{ + return container_of(ibsrq, struct mlx5_ib_srq, ibsrq); +} + +static inline struct mlx5_ib_qp *to_mqp(struct ib_qp *ibqp) +{ + return container_of(ibqp, struct mlx5_ib_qp, ibqp); +} + +static inline struct mlx5_ib_srq *to_mibsrq(struct mlx5_core_srq *msrq) +{ + return container_of(msrq, struct mlx5_ib_srq, msrq); +} + +static inline struct mlx5_ib_mr *to_mmr(struct ib_mr *ibmr) +{ + return container_of(ibmr, struct mlx5_ib_mr, ibmr); +} + +static inline struct mlx5_ib_fast_reg_page_list *to_mfrpl(struct ib_fast_reg_page_list *ibfrpl) +{ + return container_of(ibfrpl, struct mlx5_ib_fast_reg_page_list, ibfrpl); +} + +struct mlx5_ib_ah { + struct ib_ah ibah; + struct mlx5_av av; +}; + +static inline struct mlx5_ib_ah *to_mah(struct ib_ah *ibah) +{ + return container_of(ibah, struct mlx5_ib_ah, ibah); +} + +int mlx5_ib_db_map_user(struct mlx5_ib_ucontext *context, unsigned long virt, + struct mlx5_db *db); +void mlx5_ib_db_unmap_user(struct mlx5_ib_ucontext *context, struct mlx5_db *db); +void __mlx5_ib_cq_clean(struct mlx5_ib_cq *cq, u32 qpn, struct mlx5_ib_srq *srq); +void mlx5_ib_cq_clean(struct mlx5_ib_cq *cq, u32 qpn, struct mlx5_ib_srq *srq); +void mlx5_ib_free_srq_wqe(struct mlx5_ib_srq *srq, int wqe_index); +int mlx5_MAD_IFC(struct mlx5_ib_dev *dev, int ignore_mkey, int ignore_bkey, + u8 port, struct ib_wc *in_wc, struct ib_grh *in_grh, + void *in_mad, void *response_mad); +struct ib_ah *create_ib_ah(struct ib_ah_attr *ah_attr, + struct mlx5_ib_ah *ah); +struct ib_ah *mlx5_ib_create_ah(struct ib_pd *pd, struct ib_ah_attr *ah_attr); +int mlx5_ib_query_ah(struct ib_ah *ibah, struct ib_ah_attr *ah_attr); +int mlx5_ib_destroy_ah(struct ib_ah *ah); +struct ib_srq *mlx5_ib_create_srq(struct ib_pd *pd, + struct ib_srq_init_attr *init_attr, + struct ib_udata *udata); +int mlx5_ib_modify_srq(struct ib_srq *ibsrq, struct ib_srq_attr *attr, + enum ib_srq_attr_mask attr_mask, struct ib_udata *udata); +int mlx5_ib_query_srq(struct ib_srq *ibsrq, struct ib_srq_attr *srq_attr); +int mlx5_ib_destroy_srq(struct ib_srq *srq); +int mlx5_ib_post_srq_recv(struct ib_srq *ibsrq, struct ib_recv_wr *wr, + struct ib_recv_wr **bad_wr); +struct ib_qp *mlx5_ib_create_qp(struct ib_pd *pd, + struct ib_qp_init_attr *init_attr, + struct ib_udata *udata); +int mlx5_ib_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, + int attr_mask, struct ib_udata *udata); +int 
mlx5_ib_query_qp(struct ib_qp *ibqp, struct ib_qp_attr *qp_attr, int qp_attr_mask, + struct ib_qp_init_attr *qp_init_attr); +int mlx5_ib_destroy_qp(struct ib_qp *qp); +int mlx5_ib_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr, + struct ib_send_wr **bad_wr); +int mlx5_ib_post_recv(struct ib_qp *ibqp, struct ib_recv_wr *wr, + struct ib_recv_wr **bad_wr); +void *mlx5_get_send_wqe(struct mlx5_ib_qp *qp, int n); +int mlx5_ib_read_user_wqe(struct mlx5_ib_qp *qp, int send, int wqe_index, + void *buffer, u32 length); +struct ib_cq *mlx5_ib_create_cq(struct ib_device *ibdev, int entries, + int vector, struct ib_ucontext *context, + struct ib_udata *udata); +int mlx5_ib_destroy_cq(struct ib_cq *cq); +int mlx5_ib_poll_cq(struct ib_cq *ibcq, int num_entries, struct ib_wc *wc); +int mlx5_ib_arm_cq(struct ib_cq *ibcq, enum ib_cq_notify_flags flags); +int mlx5_ib_modify_cq(struct ib_cq *cq, u16 cq_count, u16 cq_period); +int mlx5_ib_resize_cq(struct ib_cq *ibcq, int entries, struct ib_udata *udata); +struct ib_mr *mlx5_ib_get_dma_mr(struct ib_pd *pd, int acc); +struct ib_mr *mlx5_ib_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, + u64 virt_addr, int access_flags, + struct ib_udata *udata); +int mlx5_ib_update_mtt(struct mlx5_ib_mr *mr, u64 start_page_index, + int npages, int zap); +int mlx5_ib_dereg_mr(struct ib_mr *ibmr); +int mlx5_ib_destroy_mr(struct ib_mr *ibmr); +struct ib_mr *mlx5_ib_create_mr(struct ib_pd *pd, + struct ib_mr_init_attr *mr_init_attr); +struct ib_mr *mlx5_ib_alloc_fast_reg_mr(struct ib_pd *pd, + int max_page_list_len); +struct ib_fast_reg_page_list *mlx5_ib_alloc_fast_reg_page_list(struct ib_device *ibdev, + int page_list_len); +void mlx5_ib_free_fast_reg_page_list(struct ib_fast_reg_page_list *page_list); +struct ib_fmr *mlx5_ib_fmr_alloc(struct ib_pd *pd, int acc, + struct ib_fmr_attr *fmr_attr); +int mlx5_ib_map_phys_fmr(struct ib_fmr *ibfmr, u64 *page_list, + int npages, u64 iova); +int mlx5_ib_unmap_fmr(struct list_head *fmr_list); +int mlx5_ib_fmr_dealloc(struct ib_fmr *ibfmr); +int mlx5_ib_process_mad(struct ib_device *ibdev, int mad_flags, u8 port_num, + struct ib_wc *in_wc, struct ib_grh *in_grh, + struct ib_mad *in_mad, struct ib_mad *out_mad); +struct ib_xrcd *mlx5_ib_alloc_xrcd(struct ib_device *ibdev, + struct ib_ucontext *context, + struct ib_udata *udata); +int mlx5_ib_dealloc_xrcd(struct ib_xrcd *xrcd); +int mlx5_ib_get_buf_offset(u64 addr, int page_shift, u32 *offset); +int mlx5_query_ext_port_caps(struct mlx5_ib_dev *dev, u8 port); +int mlx5_ib_query_port(struct ib_device *ibdev, u8 port, + struct ib_port_attr *props); +int mlx5_ib_init_fmr(struct mlx5_ib_dev *dev); +void mlx5_ib_cleanup_fmr(struct mlx5_ib_dev *dev); +void mlx5_ib_cont_pages(struct ib_umem *umem, u64 addr, int *count, int *shift, + int *ncont, int *order); +void __mlx5_ib_populate_pas(struct mlx5_ib_dev *dev, struct ib_umem *umem, + int page_shift, size_t offset, size_t num_pages, + __be64 *pas, int access_flags); +void mlx5_ib_populate_pas(struct mlx5_ib_dev *dev, struct ib_umem *umem, + int page_shift, __be64 *pas, int access_flags); +void mlx5_ib_copy_pas(u64 *old, u64 *new, int step, int num); +int mlx5_ib_get_cqe_size(struct mlx5_ib_dev *dev, struct ib_cq *ibcq); +int mlx5_mr_cache_init(struct mlx5_ib_dev *dev); +int mlx5_mr_cache_cleanup(struct mlx5_ib_dev *dev); +int mlx5_mr_ib_cont_pages(struct ib_umem *umem, u64 addr, int *count, int *shift); +void mlx5_umr_cq_handler(struct ib_cq *cq, void *cq_context); +int mlx5_ib_check_mr_status(struct ib_mr *ibmr, u32 check_mask, + struct 
ib_mr_status *mr_status); + +#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING +extern struct workqueue_struct *mlx5_ib_page_fault_wq; + +int mlx5_ib_internal_query_odp_caps(struct mlx5_ib_dev *dev); +void mlx5_ib_mr_pfault_handler(struct mlx5_ib_qp *qp, + struct mlx5_ib_pfault *pfault); +void mlx5_ib_odp_create_qp(struct mlx5_ib_qp *qp); +int mlx5_ib_odp_init_one(struct mlx5_ib_dev *ibdev); +void mlx5_ib_odp_remove_one(struct mlx5_ib_dev *ibdev); +int __init mlx5_ib_odp_init(void); +void mlx5_ib_odp_cleanup(void); +void mlx5_ib_qp_disable_pagefaults(struct mlx5_ib_qp *qp); +void mlx5_ib_qp_enable_pagefaults(struct mlx5_ib_qp *qp); +void mlx5_ib_invalidate_range(struct ib_umem *umem, unsigned long start, + unsigned long end); + +#else /* CONFIG_INFINIBAND_ON_DEMAND_PAGING */ +static inline int mlx5_ib_internal_query_odp_caps(struct mlx5_ib_dev *dev) +{ + return 0; +} + +static inline void mlx5_ib_odp_create_qp(struct mlx5_ib_qp *qp) {} +static inline int mlx5_ib_odp_init_one(struct mlx5_ib_dev *ibdev) { return 0; } +static inline void mlx5_ib_odp_remove_one(struct mlx5_ib_dev *ibdev) {} +static inline int mlx5_ib_odp_init(void) { return 0; } +static inline void mlx5_ib_odp_cleanup(void) {} +static inline void mlx5_ib_qp_disable_pagefaults(struct mlx5_ib_qp *qp) {} +static inline void mlx5_ib_qp_enable_pagefaults(struct mlx5_ib_qp *qp) {} + +#endif /* CONFIG_INFINIBAND_ON_DEMAND_PAGING */ + +static inline void init_query_mad(struct ib_smp *mad) +{ + mad->base_version = 1; + mad->mgmt_class = IB_MGMT_CLASS_SUBN_LID_ROUTED; + mad->class_version = 1; + mad->method = IB_MGMT_METHOD_GET; +} + +static inline u8 convert_access(int acc) +{ + return (acc & IB_ACCESS_REMOTE_ATOMIC ? MLX5_PERM_ATOMIC : 0) | + (acc & IB_ACCESS_REMOTE_WRITE ? MLX5_PERM_REMOTE_WRITE : 0) | + (acc & IB_ACCESS_REMOTE_READ ? MLX5_PERM_REMOTE_READ : 0) | + (acc & IB_ACCESS_LOCAL_WRITE ? MLX5_PERM_LOCAL_WRITE : 0) | + MLX5_PERM_LOCAL_READ; +} + +#define MLX5_MAX_UMR_SHIFT 16 +#define MLX5_MAX_UMR_PAGES (1 << MLX5_MAX_UMR_SHIFT) + +#endif /* MLX5_IB_H */ diff --git a/kernel/drivers/infiniband/hw/mlx5/mr.c b/kernel/drivers/infiniband/hw/mlx5/mr.c new file mode 100644 index 000000000..71c593583 --- /dev/null +++ b/kernel/drivers/infiniband/hw/mlx5/mr.c @@ -0,0 +1,1479 @@ +/* + * Copyright (c) 2013-2015, Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + + +#include +#include +#include +#include +#include +#include +#include +#include +#include "mlx5_ib.h" + +enum { + MAX_PENDING_REG_MR = 8, +}; + +#define MLX5_UMR_ALIGN 2048 +#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING +static __be64 mlx5_ib_update_mtt_emergency_buffer[ + MLX5_UMR_MTT_MIN_CHUNK_SIZE/sizeof(__be64)] + __aligned(MLX5_UMR_ALIGN); +static DEFINE_MUTEX(mlx5_ib_update_mtt_emergency_buffer_mutex); +#endif + +static int clean_mr(struct mlx5_ib_mr *mr); + +static int destroy_mkey(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr) +{ + int err = mlx5_core_destroy_mkey(dev->mdev, &mr->mmr); + +#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING + /* Wait until all page fault handlers using the mr complete. */ + synchronize_srcu(&dev->mr_srcu); +#endif + + return err; +} + +static int order2idx(struct mlx5_ib_dev *dev, int order) +{ + struct mlx5_mr_cache *cache = &dev->cache; + + if (order < cache->ent[0].order) + return 0; + else + return order - cache->ent[0].order; +} + +static void reg_mr_callback(int status, void *context) +{ + struct mlx5_ib_mr *mr = context; + struct mlx5_ib_dev *dev = mr->dev; + struct mlx5_mr_cache *cache = &dev->cache; + int c = order2idx(dev, mr->order); + struct mlx5_cache_ent *ent = &cache->ent[c]; + u8 key; + unsigned long flags; + struct mlx5_mr_table *table = &dev->mdev->priv.mr_table; + int err; + + spin_lock_irqsave(&ent->lock, flags); + ent->pending--; + spin_unlock_irqrestore(&ent->lock, flags); + if (status) { + mlx5_ib_warn(dev, "async reg mr failed. status %d\n", status); + kfree(mr); + dev->fill_delay = 1; + mod_timer(&dev->delay_timer, jiffies + HZ); + return; + } + + if (mr->out.hdr.status) { + mlx5_ib_warn(dev, "failed - status %d, syndorme 0x%x\n", + mr->out.hdr.status, + be32_to_cpu(mr->out.hdr.syndrome)); + kfree(mr); + dev->fill_delay = 1; + mod_timer(&dev->delay_timer, jiffies + HZ); + return; + } + + spin_lock_irqsave(&dev->mdev->priv.mkey_lock, flags); + key = dev->mdev->priv.mkey_key++; + spin_unlock_irqrestore(&dev->mdev->priv.mkey_lock, flags); + mr->mmr.key = mlx5_idx_to_mkey(be32_to_cpu(mr->out.mkey) & 0xffffff) | key; + + cache->last_add = jiffies; + + spin_lock_irqsave(&ent->lock, flags); + list_add_tail(&mr->list, &ent->head); + ent->cur++; + ent->size++; + spin_unlock_irqrestore(&ent->lock, flags); + + write_lock_irqsave(&table->lock, flags); + err = radix_tree_insert(&table->tree, mlx5_base_mkey(mr->mmr.key), + &mr->mmr); + if (err) + pr_err("Error inserting to mr tree. 
0x%x\n", -err); + write_unlock_irqrestore(&table->lock, flags); +} + +static int add_keys(struct mlx5_ib_dev *dev, int c, int num) +{ + struct mlx5_mr_cache *cache = &dev->cache; + struct mlx5_cache_ent *ent = &cache->ent[c]; + struct mlx5_create_mkey_mbox_in *in; + struct mlx5_ib_mr *mr; + int npages = 1 << ent->order; + int err = 0; + int i; + + in = kzalloc(sizeof(*in), GFP_KERNEL); + if (!in) + return -ENOMEM; + + for (i = 0; i < num; i++) { + if (ent->pending >= MAX_PENDING_REG_MR) { + err = -EAGAIN; + break; + } + + mr = kzalloc(sizeof(*mr), GFP_KERNEL); + if (!mr) { + err = -ENOMEM; + break; + } + mr->order = ent->order; + mr->umred = 1; + mr->dev = dev; + in->seg.status = MLX5_MKEY_STATUS_FREE; + in->seg.xlt_oct_size = cpu_to_be32((npages + 1) / 2); + in->seg.qpn_mkey7_0 = cpu_to_be32(0xffffff << 8); + in->seg.flags = MLX5_ACCESS_MODE_MTT | MLX5_PERM_UMR_EN; + in->seg.log2_page_size = 12; + + spin_lock_irq(&ent->lock); + ent->pending++; + spin_unlock_irq(&ent->lock); + err = mlx5_core_create_mkey(dev->mdev, &mr->mmr, in, + sizeof(*in), reg_mr_callback, + mr, &mr->out); + if (err) { + spin_lock_irq(&ent->lock); + ent->pending--; + spin_unlock_irq(&ent->lock); + mlx5_ib_warn(dev, "create mkey failed %d\n", err); + kfree(mr); + break; + } + } + + kfree(in); + return err; +} + +static void remove_keys(struct mlx5_ib_dev *dev, int c, int num) +{ + struct mlx5_mr_cache *cache = &dev->cache; + struct mlx5_cache_ent *ent = &cache->ent[c]; + struct mlx5_ib_mr *mr; + int err; + int i; + + for (i = 0; i < num; i++) { + spin_lock_irq(&ent->lock); + if (list_empty(&ent->head)) { + spin_unlock_irq(&ent->lock); + return; + } + mr = list_first_entry(&ent->head, struct mlx5_ib_mr, list); + list_del(&mr->list); + ent->cur--; + ent->size--; + spin_unlock_irq(&ent->lock); + err = destroy_mkey(dev, mr); + if (err) + mlx5_ib_warn(dev, "failed destroy mkey\n"); + else + kfree(mr); + } +} + +static ssize_t size_write(struct file *filp, const char __user *buf, + size_t count, loff_t *pos) +{ + struct mlx5_cache_ent *ent = filp->private_data; + struct mlx5_ib_dev *dev = ent->dev; + char lbuf[20]; + u32 var; + int err; + int c; + + if (copy_from_user(lbuf, buf, sizeof(lbuf))) + return -EFAULT; + + c = order2idx(dev, ent->order); + lbuf[sizeof(lbuf) - 1] = 0; + + if (sscanf(lbuf, "%u", &var) != 1) + return -EINVAL; + + if (var < ent->limit) + return -EINVAL; + + if (var > ent->size) { + do { + err = add_keys(dev, c, var - ent->size); + if (err && err != -EAGAIN) + return err; + + usleep_range(3000, 5000); + } while (err); + } else if (var < ent->size) { + remove_keys(dev, c, ent->size - var); + } + + return count; +} + +static ssize_t size_read(struct file *filp, char __user *buf, size_t count, + loff_t *pos) +{ + struct mlx5_cache_ent *ent = filp->private_data; + char lbuf[20]; + int err; + + if (*pos) + return 0; + + err = snprintf(lbuf, sizeof(lbuf), "%d\n", ent->size); + if (err < 0) + return err; + + if (copy_to_user(buf, lbuf, err)) + return -EFAULT; + + *pos += err; + + return err; +} + +static const struct file_operations size_fops = { + .owner = THIS_MODULE, + .open = simple_open, + .write = size_write, + .read = size_read, +}; + +static ssize_t limit_write(struct file *filp, const char __user *buf, + size_t count, loff_t *pos) +{ + struct mlx5_cache_ent *ent = filp->private_data; + struct mlx5_ib_dev *dev = ent->dev; + char lbuf[20]; + u32 var; + int err; + int c; + + if (copy_from_user(lbuf, buf, sizeof(lbuf))) + return -EFAULT; + + c = order2idx(dev, ent->order); + lbuf[sizeof(lbuf) - 1] = 0; + + 
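+ /* Parse the requested limit. It may not exceed the entry's current size;
+  * if the pool currently holds fewer MRs than the new limit, add_keys()
+  * below replenishes it toward twice the new limit.
+  */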
if (sscanf(lbuf, "%u", &var) != 1) + return -EINVAL; + + if (var > ent->size) + return -EINVAL; + + ent->limit = var; + + if (ent->cur < ent->limit) { + err = add_keys(dev, c, 2 * ent->limit - ent->cur); + if (err) + return err; + } + + return count; +} + +static ssize_t limit_read(struct file *filp, char __user *buf, size_t count, + loff_t *pos) +{ + struct mlx5_cache_ent *ent = filp->private_data; + char lbuf[20]; + int err; + + if (*pos) + return 0; + + err = snprintf(lbuf, sizeof(lbuf), "%d\n", ent->limit); + if (err < 0) + return err; + + if (copy_to_user(buf, lbuf, err)) + return -EFAULT; + + *pos += err; + + return err; +} + +static const struct file_operations limit_fops = { + .owner = THIS_MODULE, + .open = simple_open, + .write = limit_write, + .read = limit_read, +}; + +static int someone_adding(struct mlx5_mr_cache *cache) +{ + int i; + + for (i = 0; i < MAX_MR_CACHE_ENTRIES; i++) { + if (cache->ent[i].cur < cache->ent[i].limit) + return 1; + } + + return 0; +} + +static void __cache_work_func(struct mlx5_cache_ent *ent) +{ + struct mlx5_ib_dev *dev = ent->dev; + struct mlx5_mr_cache *cache = &dev->cache; + int i = order2idx(dev, ent->order); + int err; + + if (cache->stopped) + return; + + ent = &dev->cache.ent[i]; + if (ent->cur < 2 * ent->limit && !dev->fill_delay) { + err = add_keys(dev, i, 1); + if (ent->cur < 2 * ent->limit) { + if (err == -EAGAIN) { + mlx5_ib_dbg(dev, "returned eagain, order %d\n", + i + 2); + queue_delayed_work(cache->wq, &ent->dwork, + msecs_to_jiffies(3)); + } else if (err) { + mlx5_ib_warn(dev, "command failed order %d, err %d\n", + i + 2, err); + queue_delayed_work(cache->wq, &ent->dwork, + msecs_to_jiffies(1000)); + } else { + queue_work(cache->wq, &ent->work); + } + } + } else if (ent->cur > 2 * ent->limit) { + if (!someone_adding(cache) && + time_after(jiffies, cache->last_add + 300 * HZ)) { + remove_keys(dev, i, 1); + if (ent->cur > ent->limit) + queue_work(cache->wq, &ent->work); + } else { + queue_delayed_work(cache->wq, &ent->dwork, 300 * HZ); + } + } +} + +static void delayed_cache_work_func(struct work_struct *work) +{ + struct mlx5_cache_ent *ent; + + ent = container_of(work, struct mlx5_cache_ent, dwork.work); + __cache_work_func(ent); +} + +static void cache_work_func(struct work_struct *work) +{ + struct mlx5_cache_ent *ent; + + ent = container_of(work, struct mlx5_cache_ent, work); + __cache_work_func(ent); +} + +static struct mlx5_ib_mr *alloc_cached_mr(struct mlx5_ib_dev *dev, int order) +{ + struct mlx5_mr_cache *cache = &dev->cache; + struct mlx5_ib_mr *mr = NULL; + struct mlx5_cache_ent *ent; + int c; + int i; + + c = order2idx(dev, order); + if (c < 0 || c >= MAX_MR_CACHE_ENTRIES) { + mlx5_ib_warn(dev, "order %d, cache index %d\n", order, c); + return NULL; + } + + for (i = c; i < MAX_MR_CACHE_ENTRIES; i++) { + ent = &cache->ent[i]; + + mlx5_ib_dbg(dev, "order %d, cache index %d\n", ent->order, i); + + spin_lock_irq(&ent->lock); + if (!list_empty(&ent->head)) { + mr = list_first_entry(&ent->head, struct mlx5_ib_mr, + list); + list_del(&mr->list); + ent->cur--; + spin_unlock_irq(&ent->lock); + if (ent->cur < ent->limit) + queue_work(cache->wq, &ent->work); + break; + } + spin_unlock_irq(&ent->lock); + + queue_work(cache->wq, &ent->work); + + if (mr) + break; + } + + if (!mr) + cache->ent[c].miss++; + + return mr; +} + +static void free_cached_mr(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr) +{ + struct mlx5_mr_cache *cache = &dev->cache; + struct mlx5_cache_ent *ent; + int shrink = 0; + int c; + + c = order2idx(dev, mr->order); 
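+ /* Return the MR to its cache bucket and, if the bucket has grown past
+  * twice its limit, queue the cache work to shrink it.
+  */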
+ if (c < 0 || c >= MAX_MR_CACHE_ENTRIES) { + mlx5_ib_warn(dev, "order %d, cache index %d\n", mr->order, c); + return; + } + ent = &cache->ent[c]; + spin_lock_irq(&ent->lock); + list_add_tail(&mr->list, &ent->head); + ent->cur++; + if (ent->cur > 2 * ent->limit) + shrink = 1; + spin_unlock_irq(&ent->lock); + + if (shrink) + queue_work(cache->wq, &ent->work); +} + +static void clean_keys(struct mlx5_ib_dev *dev, int c) +{ + struct mlx5_mr_cache *cache = &dev->cache; + struct mlx5_cache_ent *ent = &cache->ent[c]; + struct mlx5_ib_mr *mr; + int err; + + cancel_delayed_work(&ent->dwork); + while (1) { + spin_lock_irq(&ent->lock); + if (list_empty(&ent->head)) { + spin_unlock_irq(&ent->lock); + return; + } + mr = list_first_entry(&ent->head, struct mlx5_ib_mr, list); + list_del(&mr->list); + ent->cur--; + ent->size--; + spin_unlock_irq(&ent->lock); + err = destroy_mkey(dev, mr); + if (err) + mlx5_ib_warn(dev, "failed destroy mkey\n"); + else + kfree(mr); + } +} + +static int mlx5_mr_cache_debugfs_init(struct mlx5_ib_dev *dev) +{ + struct mlx5_mr_cache *cache = &dev->cache; + struct mlx5_cache_ent *ent; + int i; + + if (!mlx5_debugfs_root) + return 0; + + cache->root = debugfs_create_dir("mr_cache", dev->mdev->priv.dbg_root); + if (!cache->root) + return -ENOMEM; + + for (i = 0; i < MAX_MR_CACHE_ENTRIES; i++) { + ent = &cache->ent[i]; + sprintf(ent->name, "%d", ent->order); + ent->dir = debugfs_create_dir(ent->name, cache->root); + if (!ent->dir) + return -ENOMEM; + + ent->fsize = debugfs_create_file("size", 0600, ent->dir, ent, + &size_fops); + if (!ent->fsize) + return -ENOMEM; + + ent->flimit = debugfs_create_file("limit", 0600, ent->dir, ent, + &limit_fops); + if (!ent->flimit) + return -ENOMEM; + + ent->fcur = debugfs_create_u32("cur", 0400, ent->dir, + &ent->cur); + if (!ent->fcur) + return -ENOMEM; + + ent->fmiss = debugfs_create_u32("miss", 0600, ent->dir, + &ent->miss); + if (!ent->fmiss) + return -ENOMEM; + } + + return 0; +} + +static void mlx5_mr_cache_debugfs_cleanup(struct mlx5_ib_dev *dev) +{ + if (!mlx5_debugfs_root) + return; + + debugfs_remove_recursive(dev->cache.root); +} + +static void delay_time_func(unsigned long ctx) +{ + struct mlx5_ib_dev *dev = (struct mlx5_ib_dev *)ctx; + + dev->fill_delay = 0; +} + +int mlx5_mr_cache_init(struct mlx5_ib_dev *dev) +{ + struct mlx5_mr_cache *cache = &dev->cache; + struct mlx5_cache_ent *ent; + int limit; + int err; + int i; + + cache->wq = create_singlethread_workqueue("mkey_cache"); + if (!cache->wq) { + mlx5_ib_warn(dev, "failed to create work queue\n"); + return -ENOMEM; + } + + setup_timer(&dev->delay_timer, delay_time_func, (unsigned long)dev); + for (i = 0; i < MAX_MR_CACHE_ENTRIES; i++) { + INIT_LIST_HEAD(&cache->ent[i].head); + spin_lock_init(&cache->ent[i].lock); + + ent = &cache->ent[i]; + INIT_LIST_HEAD(&ent->head); + spin_lock_init(&ent->lock); + ent->order = i + 2; + ent->dev = dev; + + if (dev->mdev->profile->mask & MLX5_PROF_MASK_MR_CACHE) + limit = dev->mdev->profile->mr_cache[i].limit; + else + limit = 0; + + INIT_WORK(&ent->work, cache_work_func); + INIT_DELAYED_WORK(&ent->dwork, delayed_cache_work_func); + ent->limit = limit; + queue_work(cache->wq, &ent->work); + } + + err = mlx5_mr_cache_debugfs_init(dev); + if (err) + mlx5_ib_warn(dev, "cache debugfs failure\n"); + + return 0; +} + +int mlx5_mr_cache_cleanup(struct mlx5_ib_dev *dev) +{ + int i; + + dev->cache.stopped = 1; + flush_workqueue(dev->cache.wq); + + mlx5_mr_cache_debugfs_cleanup(dev); + + for (i = 0; i < MAX_MR_CACHE_ENTRIES; i++) + clean_keys(dev, i); + 
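+ /* Every bucket has been drained; it is now safe to tear down the
+  * workqueue and the fill-delay timer.
+  */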
+ destroy_workqueue(dev->cache.wq); + del_timer_sync(&dev->delay_timer); + + return 0; +} + +struct ib_mr *mlx5_ib_get_dma_mr(struct ib_pd *pd, int acc) +{ + struct mlx5_ib_dev *dev = to_mdev(pd->device); + struct mlx5_core_dev *mdev = dev->mdev; + struct mlx5_create_mkey_mbox_in *in; + struct mlx5_mkey_seg *seg; + struct mlx5_ib_mr *mr; + int err; + + mr = kzalloc(sizeof(*mr), GFP_KERNEL); + if (!mr) + return ERR_PTR(-ENOMEM); + + in = kzalloc(sizeof(*in), GFP_KERNEL); + if (!in) { + err = -ENOMEM; + goto err_free; + } + + seg = &in->seg; + seg->flags = convert_access(acc) | MLX5_ACCESS_MODE_PA; + seg->flags_pd = cpu_to_be32(to_mpd(pd)->pdn | MLX5_MKEY_LEN64); + seg->qpn_mkey7_0 = cpu_to_be32(0xffffff << 8); + seg->start_addr = 0; + + err = mlx5_core_create_mkey(mdev, &mr->mmr, in, sizeof(*in), NULL, NULL, + NULL); + if (err) + goto err_in; + + kfree(in); + mr->ibmr.lkey = mr->mmr.key; + mr->ibmr.rkey = mr->mmr.key; + mr->umem = NULL; + + return &mr->ibmr; + +err_in: + kfree(in); + +err_free: + kfree(mr); + + return ERR_PTR(err); +} + +static int get_octo_len(u64 addr, u64 len, int page_size) +{ + u64 offset; + int npages; + + offset = addr & (page_size - 1); + npages = ALIGN(len + offset, page_size) >> ilog2(page_size); + return (npages + 1) / 2; +} + +static int use_umr(int order) +{ + return order <= MLX5_MAX_UMR_SHIFT; +} + +static void prep_umr_reg_wqe(struct ib_pd *pd, struct ib_send_wr *wr, + struct ib_sge *sg, u64 dma, int n, u32 key, + int page_shift, u64 virt_addr, u64 len, + int access_flags) +{ + struct mlx5_ib_dev *dev = to_mdev(pd->device); + struct ib_mr *mr = dev->umrc.mr; + struct mlx5_umr_wr *umrwr = (struct mlx5_umr_wr *)&wr->wr.fast_reg; + + sg->addr = dma; + sg->length = ALIGN(sizeof(u64) * n, 64); + sg->lkey = mr->lkey; + + wr->next = NULL; + wr->send_flags = 0; + wr->sg_list = sg; + if (n) + wr->num_sge = 1; + else + wr->num_sge = 0; + + wr->opcode = MLX5_IB_WR_UMR; + + umrwr->npages = n; + umrwr->page_shift = page_shift; + umrwr->mkey = key; + umrwr->target.virt_addr = virt_addr; + umrwr->length = len; + umrwr->access_flags = access_flags; + umrwr->pd = pd; +} + +static void prep_umr_unreg_wqe(struct mlx5_ib_dev *dev, + struct ib_send_wr *wr, u32 key) +{ + struct mlx5_umr_wr *umrwr = (struct mlx5_umr_wr *)&wr->wr.fast_reg; + + wr->send_flags = MLX5_IB_SEND_UMR_UNREG | MLX5_IB_SEND_UMR_FAIL_IF_FREE; + wr->opcode = MLX5_IB_WR_UMR; + umrwr->mkey = key; +} + +void mlx5_umr_cq_handler(struct ib_cq *cq, void *cq_context) +{ + struct mlx5_ib_umr_context *context; + struct ib_wc wc; + int err; + + while (1) { + err = ib_poll_cq(cq, 1, &wc); + if (err < 0) { + pr_warn("poll cq error %d\n", err); + return; + } + if (err == 0) + break; + + context = (struct mlx5_ib_umr_context *) (unsigned long) wc.wr_id; + context->status = wc.status; + complete(&context->done); + } + ib_req_notify_cq(cq, IB_CQ_NEXT_COMP); +} + +static struct mlx5_ib_mr *reg_umr(struct ib_pd *pd, struct ib_umem *umem, + u64 virt_addr, u64 len, int npages, + int page_shift, int order, int access_flags) +{ + struct mlx5_ib_dev *dev = to_mdev(pd->device); + struct device *ddev = dev->ib_dev.dma_device; + struct umr_common *umrc = &dev->umrc; + struct mlx5_ib_umr_context umr_context; + struct ib_send_wr wr, *bad; + struct mlx5_ib_mr *mr; + struct ib_sge sg; + int size; + __be64 *mr_pas; + __be64 *pas; + dma_addr_t dma; + int err = 0; + int i; + + for (i = 0; i < 1; i++) { + mr = alloc_cached_mr(dev, order); + if (mr) + break; + + err = add_keys(dev, order2idx(dev, order), 1); + if (err && err != -EAGAIN) { + 
mlx5_ib_warn(dev, "add_keys failed, err %d\n", err); + break; + } + } + + if (!mr) + return ERR_PTR(-EAGAIN); + + /* UMR copies MTTs in units of MLX5_UMR_MTT_ALIGNMENT bytes. + * To avoid copying garbage after the pas array, we allocate + * a little more. */ + size = ALIGN(sizeof(u64) * npages, MLX5_UMR_MTT_ALIGNMENT); + mr_pas = kmalloc(size + MLX5_UMR_ALIGN - 1, GFP_KERNEL); + if (!mr_pas) { + err = -ENOMEM; + goto free_mr; + } + + pas = PTR_ALIGN(mr_pas, MLX5_UMR_ALIGN); + mlx5_ib_populate_pas(dev, umem, page_shift, pas, MLX5_IB_MTT_PRESENT); + /* Clear padding after the actual pages. */ + memset(pas + npages, 0, size - npages * sizeof(u64)); + + dma = dma_map_single(ddev, pas, size, DMA_TO_DEVICE); + if (dma_mapping_error(ddev, dma)) { + err = -ENOMEM; + goto free_pas; + } + + memset(&wr, 0, sizeof(wr)); + wr.wr_id = (u64)(unsigned long)&umr_context; + prep_umr_reg_wqe(pd, &wr, &sg, dma, npages, mr->mmr.key, page_shift, + virt_addr, len, access_flags); + + mlx5_ib_init_umr_context(&umr_context); + down(&umrc->sem); + err = ib_post_send(umrc->qp, &wr, &bad); + if (err) { + mlx5_ib_warn(dev, "post send failed, err %d\n", err); + goto unmap_dma; + } else { + wait_for_completion(&umr_context.done); + if (umr_context.status != IB_WC_SUCCESS) { + mlx5_ib_warn(dev, "reg umr failed\n"); + err = -EFAULT; + } + } + + mr->mmr.iova = virt_addr; + mr->mmr.size = len; + mr->mmr.pd = to_mpd(pd)->pdn; + + mr->live = 1; + +unmap_dma: + up(&umrc->sem); + dma_unmap_single(ddev, dma, size, DMA_TO_DEVICE); + +free_pas: + kfree(mr_pas); + +free_mr: + if (err) { + free_cached_mr(dev, mr); + return ERR_PTR(err); + } + + return mr; +} + +#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING +int mlx5_ib_update_mtt(struct mlx5_ib_mr *mr, u64 start_page_index, int npages, + int zap) +{ + struct mlx5_ib_dev *dev = mr->dev; + struct device *ddev = dev->ib_dev.dma_device; + struct umr_common *umrc = &dev->umrc; + struct mlx5_ib_umr_context umr_context; + struct ib_umem *umem = mr->umem; + int size; + __be64 *pas; + dma_addr_t dma; + struct ib_send_wr wr, *bad; + struct mlx5_umr_wr *umrwr = (struct mlx5_umr_wr *)&wr.wr.fast_reg; + struct ib_sge sg; + int err = 0; + const int page_index_alignment = MLX5_UMR_MTT_ALIGNMENT / sizeof(u64); + const int page_index_mask = page_index_alignment - 1; + size_t pages_mapped = 0; + size_t pages_to_map = 0; + size_t pages_iter = 0; + int use_emergency_buf = 0; + + /* UMR copies MTTs in units of MLX5_UMR_MTT_ALIGNMENT bytes, + * so we need to align the offset and length accordingly */ + if (start_page_index & page_index_mask) { + npages += start_page_index & page_index_mask; + start_page_index &= ~page_index_mask; + } + + pages_to_map = ALIGN(npages, page_index_alignment); + + if (start_page_index + pages_to_map > MLX5_MAX_UMR_PAGES) + return -EINVAL; + + size = sizeof(u64) * pages_to_map; + size = min_t(int, PAGE_SIZE, size); + /* We allocate with GFP_ATOMIC to avoid recursion into page-reclaim + * code, when we are called from an invalidation. The pas buffer must + * be 2k-aligned for Connect-IB. 
*/ + pas = (__be64 *)get_zeroed_page(GFP_ATOMIC); + if (!pas) { + mlx5_ib_warn(dev, "unable to allocate memory during MTT update, falling back to slower chunked mechanism.\n"); + pas = mlx5_ib_update_mtt_emergency_buffer; + size = MLX5_UMR_MTT_MIN_CHUNK_SIZE; + use_emergency_buf = 1; + mutex_lock(&mlx5_ib_update_mtt_emergency_buffer_mutex); + memset(pas, 0, size); + } + pages_iter = size / sizeof(u64); + dma = dma_map_single(ddev, pas, size, DMA_TO_DEVICE); + if (dma_mapping_error(ddev, dma)) { + mlx5_ib_err(dev, "unable to map DMA during MTT update.\n"); + err = -ENOMEM; + goto free_pas; + } + + for (pages_mapped = 0; + pages_mapped < pages_to_map && !err; + pages_mapped += pages_iter, start_page_index += pages_iter) { + dma_sync_single_for_cpu(ddev, dma, size, DMA_TO_DEVICE); + + npages = min_t(size_t, + pages_iter, + ib_umem_num_pages(umem) - start_page_index); + + if (!zap) { + __mlx5_ib_populate_pas(dev, umem, PAGE_SHIFT, + start_page_index, npages, pas, + MLX5_IB_MTT_PRESENT); + /* Clear padding after the pages brought from the + * umem. */ + memset(pas + npages, 0, size - npages * sizeof(u64)); + } + + dma_sync_single_for_device(ddev, dma, size, DMA_TO_DEVICE); + + memset(&wr, 0, sizeof(wr)); + wr.wr_id = (u64)(unsigned long)&umr_context; + + sg.addr = dma; + sg.length = ALIGN(npages * sizeof(u64), + MLX5_UMR_MTT_ALIGNMENT); + sg.lkey = dev->umrc.mr->lkey; + + wr.send_flags = MLX5_IB_SEND_UMR_FAIL_IF_FREE | + MLX5_IB_SEND_UMR_UPDATE_MTT; + wr.sg_list = &sg; + wr.num_sge = 1; + wr.opcode = MLX5_IB_WR_UMR; + umrwr->npages = sg.length / sizeof(u64); + umrwr->page_shift = PAGE_SHIFT; + umrwr->mkey = mr->mmr.key; + umrwr->target.offset = start_page_index; + + mlx5_ib_init_umr_context(&umr_context); + down(&umrc->sem); + err = ib_post_send(umrc->qp, &wr, &bad); + if (err) { + mlx5_ib_err(dev, "UMR post send failed, err %d\n", err); + } else { + wait_for_completion(&umr_context.done); + if (umr_context.status != IB_WC_SUCCESS) { + mlx5_ib_err(dev, "UMR completion failed, code %d\n", + umr_context.status); + err = -EFAULT; + } + } + up(&umrc->sem); + } + dma_unmap_single(ddev, dma, size, DMA_TO_DEVICE); + +free_pas: + if (!use_emergency_buf) + free_page((unsigned long)pas); + else + mutex_unlock(&mlx5_ib_update_mtt_emergency_buffer_mutex); + + return err; +} +#endif + +static struct mlx5_ib_mr *reg_create(struct ib_pd *pd, u64 virt_addr, + u64 length, struct ib_umem *umem, + int npages, int page_shift, + int access_flags) +{ + struct mlx5_ib_dev *dev = to_mdev(pd->device); + struct mlx5_create_mkey_mbox_in *in; + struct mlx5_ib_mr *mr; + int inlen; + int err; + bool pg_cap = !!(dev->mdev->caps.gen.flags & + MLX5_DEV_CAP_FLAG_ON_DMND_PG); + + mr = kzalloc(sizeof(*mr), GFP_KERNEL); + if (!mr) + return ERR_PTR(-ENOMEM); + + inlen = sizeof(*in) + sizeof(*in->pas) * ((npages + 1) / 2) * 2; + in = mlx5_vzalloc(inlen); + if (!in) { + err = -ENOMEM; + goto err_1; + } + mlx5_ib_populate_pas(dev, umem, page_shift, in->pas, + pg_cap ? MLX5_IB_MTT_PRESENT : 0); + + /* The MLX5_MKEY_INBOX_PG_ACCESS bit allows setting the access flags + * in the page list submitted with the command. */ + in->flags = pg_cap ? 
cpu_to_be32(MLX5_MKEY_INBOX_PG_ACCESS) : 0; + in->seg.flags = convert_access(access_flags) | + MLX5_ACCESS_MODE_MTT; + in->seg.flags_pd = cpu_to_be32(to_mpd(pd)->pdn); + in->seg.start_addr = cpu_to_be64(virt_addr); + in->seg.len = cpu_to_be64(length); + in->seg.bsfs_octo_size = 0; + in->seg.xlt_oct_size = cpu_to_be32(get_octo_len(virt_addr, length, 1 << page_shift)); + in->seg.log2_page_size = page_shift; + in->seg.qpn_mkey7_0 = cpu_to_be32(0xffffff << 8); + in->xlat_oct_act_size = cpu_to_be32(get_octo_len(virt_addr, length, + 1 << page_shift)); + err = mlx5_core_create_mkey(dev->mdev, &mr->mmr, in, inlen, NULL, + NULL, NULL); + if (err) { + mlx5_ib_warn(dev, "create mkey failed\n"); + goto err_2; + } + mr->umem = umem; + mr->dev = dev; + mr->live = 1; + kvfree(in); + + mlx5_ib_dbg(dev, "mkey = 0x%x\n", mr->mmr.key); + + return mr; + +err_2: + kvfree(in); + +err_1: + kfree(mr); + + return ERR_PTR(err); +} + +struct ib_mr *mlx5_ib_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, + u64 virt_addr, int access_flags, + struct ib_udata *udata) +{ + struct mlx5_ib_dev *dev = to_mdev(pd->device); + struct mlx5_ib_mr *mr = NULL; + struct ib_umem *umem; + int page_shift; + int npages; + int ncont; + int order; + int err; + + mlx5_ib_dbg(dev, "start 0x%llx, virt_addr 0x%llx, length 0x%llx, access_flags 0x%x\n", + start, virt_addr, length, access_flags); + umem = ib_umem_get(pd->uobject->context, start, length, access_flags, + 0); + if (IS_ERR(umem)) { + mlx5_ib_dbg(dev, "umem get failed (%ld)\n", PTR_ERR(umem)); + return (void *)umem; + } + + mlx5_ib_cont_pages(umem, start, &npages, &page_shift, &ncont, &order); + if (!npages) { + mlx5_ib_warn(dev, "avoid zero region\n"); + err = -EINVAL; + goto error; + } + + mlx5_ib_dbg(dev, "npages %d, ncont %d, order %d, page_shift %d\n", + npages, ncont, order, page_shift); + + if (use_umr(order)) { + mr = reg_umr(pd, umem, virt_addr, length, ncont, page_shift, + order, access_flags); + if (PTR_ERR(mr) == -EAGAIN) { + mlx5_ib_dbg(dev, "cache empty for order %d", order); + mr = NULL; + } + } else if (access_flags & IB_ACCESS_ON_DEMAND) { + err = -EINVAL; + pr_err("Got MR registration for ODP MR > 512MB, not supported for Connect-IB"); + goto error; + } + + if (!mr) + mr = reg_create(pd, virt_addr, length, umem, ncont, page_shift, + access_flags); + + if (IS_ERR(mr)) { + err = PTR_ERR(mr); + goto error; + } + + mlx5_ib_dbg(dev, "mkey 0x%x\n", mr->mmr.key); + + mr->umem = umem; + mr->npages = npages; + atomic_add(npages, &dev->mdev->priv.reg_pages); + mr->ibmr.lkey = mr->mmr.key; + mr->ibmr.rkey = mr->mmr.key; + +#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING + if (umem->odp_data) { + /* + * This barrier prevents the compiler from moving the + * setting of umem->odp_data->private to point to our + * MR, before reg_umr finished, to ensure that the MR + * initialization have finished before starting to + * handle invalidations. + */ + smp_wmb(); + mr->umem->odp_data->private = mr; + /* + * Make sure we will see the new + * umem->odp_data->private value in the invalidation + * routines, before we can get page faults on the + * MR. Page faults can happen once we put the MR in + * the tree, below this line. Without the barrier, + * there can be a fault handling and an invalidation + * before umem->odp_data->private == mr is visible to + * the invalidation handler. + */ + smp_wmb(); + } +#endif + + return &mr->ibmr; + +error: + /* + * Destroy the umem *before* destroying the MR, to ensure we + * will not have any in-flight notifiers when destroying the + * MR. 
+ * + * As the MR is completely invalid to begin with, and this + * error path is only taken if we can't push the mr entry into + * the pagefault tree, this is safe. + */ + + ib_umem_release(umem); + /* Kill the MR, and return an error code. */ + clean_mr(mr); + return ERR_PTR(err); +} + +static int unreg_umr(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr) +{ + struct umr_common *umrc = &dev->umrc; + struct mlx5_ib_umr_context umr_context; + struct ib_send_wr wr, *bad; + int err; + + memset(&wr, 0, sizeof(wr)); + wr.wr_id = (u64)(unsigned long)&umr_context; + prep_umr_unreg_wqe(dev, &wr, mr->mmr.key); + + mlx5_ib_init_umr_context(&umr_context); + down(&umrc->sem); + err = ib_post_send(umrc->qp, &wr, &bad); + if (err) { + up(&umrc->sem); + mlx5_ib_dbg(dev, "err %d\n", err); + goto error; + } else { + wait_for_completion(&umr_context.done); + up(&umrc->sem); + } + if (umr_context.status != IB_WC_SUCCESS) { + mlx5_ib_warn(dev, "unreg umr failed\n"); + err = -EFAULT; + goto error; + } + return 0; + +error: + return err; +} + +static int clean_mr(struct mlx5_ib_mr *mr) +{ + struct mlx5_ib_dev *dev = to_mdev(mr->ibmr.device); + int umred = mr->umred; + int err; + + if (!umred) { + err = destroy_mkey(dev, mr); + if (err) { + mlx5_ib_warn(dev, "failed to destroy mkey 0x%x (%d)\n", + mr->mmr.key, err); + return err; + } + } else { + err = unreg_umr(dev, mr); + if (err) { + mlx5_ib_warn(dev, "failed unregister\n"); + return err; + } + free_cached_mr(dev, mr); + } + + if (!umred) + kfree(mr); + + return 0; +} + +int mlx5_ib_dereg_mr(struct ib_mr *ibmr) +{ + struct mlx5_ib_dev *dev = to_mdev(ibmr->device); + struct mlx5_ib_mr *mr = to_mmr(ibmr); + int npages = mr->npages; + struct ib_umem *umem = mr->umem; + +#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING + if (umem && umem->odp_data) { + /* Prevent new page faults from succeeding */ + mr->live = 0; + /* Wait for all running page-fault handlers to finish. */ + synchronize_srcu(&dev->mr_srcu); + /* Destroy all page mappings */ + mlx5_ib_invalidate_range(umem, ib_umem_start(umem), + ib_umem_end(umem)); + /* + * We kill the umem before the MR for ODP, + * so that there will not be any invalidations in + * flight, looking at the *mr struct. + */ + ib_umem_release(umem); + atomic_sub(npages, &dev->mdev->priv.reg_pages); + + /* Avoid double-freeing the umem. 
*/ + umem = NULL; + } +#endif + + clean_mr(mr); + + if (umem) { + ib_umem_release(umem); + atomic_sub(npages, &dev->mdev->priv.reg_pages); + } + + return 0; +} + +struct ib_mr *mlx5_ib_create_mr(struct ib_pd *pd, + struct ib_mr_init_attr *mr_init_attr) +{ + struct mlx5_ib_dev *dev = to_mdev(pd->device); + struct mlx5_create_mkey_mbox_in *in; + struct mlx5_ib_mr *mr; + int access_mode, err; + int ndescs = roundup(mr_init_attr->max_reg_descriptors, 4); + + mr = kzalloc(sizeof(*mr), GFP_KERNEL); + if (!mr) + return ERR_PTR(-ENOMEM); + + in = kzalloc(sizeof(*in), GFP_KERNEL); + if (!in) { + err = -ENOMEM; + goto err_free; + } + + in->seg.status = MLX5_MKEY_STATUS_FREE; + in->seg.xlt_oct_size = cpu_to_be32(ndescs); + in->seg.qpn_mkey7_0 = cpu_to_be32(0xffffff << 8); + in->seg.flags_pd = cpu_to_be32(to_mpd(pd)->pdn); + access_mode = MLX5_ACCESS_MODE_MTT; + + if (mr_init_attr->flags & IB_MR_SIGNATURE_EN) { + u32 psv_index[2]; + + in->seg.flags_pd = cpu_to_be32(be32_to_cpu(in->seg.flags_pd) | + MLX5_MKEY_BSF_EN); + in->seg.bsfs_octo_size = cpu_to_be32(MLX5_MKEY_BSF_OCTO_SIZE); + mr->sig = kzalloc(sizeof(*mr->sig), GFP_KERNEL); + if (!mr->sig) { + err = -ENOMEM; + goto err_free_in; + } + + /* create mem & wire PSVs */ + err = mlx5_core_create_psv(dev->mdev, to_mpd(pd)->pdn, + 2, psv_index); + if (err) + goto err_free_sig; + + access_mode = MLX5_ACCESS_MODE_KLM; + mr->sig->psv_memory.psv_idx = psv_index[0]; + mr->sig->psv_wire.psv_idx = psv_index[1]; + + mr->sig->sig_status_checked = true; + mr->sig->sig_err_exists = false; + /* Next UMR, Arm SIGERR */ + ++mr->sig->sigerr_count; + } + + in->seg.flags = MLX5_PERM_UMR_EN | access_mode; + err = mlx5_core_create_mkey(dev->mdev, &mr->mmr, in, sizeof(*in), + NULL, NULL, NULL); + if (err) + goto err_destroy_psv; + + mr->ibmr.lkey = mr->mmr.key; + mr->ibmr.rkey = mr->mmr.key; + mr->umem = NULL; + kfree(in); + + return &mr->ibmr; + +err_destroy_psv: + if (mr->sig) { + if (mlx5_core_destroy_psv(dev->mdev, + mr->sig->psv_memory.psv_idx)) + mlx5_ib_warn(dev, "failed to destroy mem psv %d\n", + mr->sig->psv_memory.psv_idx); + if (mlx5_core_destroy_psv(dev->mdev, + mr->sig->psv_wire.psv_idx)) + mlx5_ib_warn(dev, "failed to destroy wire psv %d\n", + mr->sig->psv_wire.psv_idx); + } +err_free_sig: + kfree(mr->sig); +err_free_in: + kfree(in); +err_free: + kfree(mr); + return ERR_PTR(err); +} + +int mlx5_ib_destroy_mr(struct ib_mr *ibmr) +{ + struct mlx5_ib_dev *dev = to_mdev(ibmr->device); + struct mlx5_ib_mr *mr = to_mmr(ibmr); + int err; + + if (mr->sig) { + if (mlx5_core_destroy_psv(dev->mdev, + mr->sig->psv_memory.psv_idx)) + mlx5_ib_warn(dev, "failed to destroy mem psv %d\n", + mr->sig->psv_memory.psv_idx); + if (mlx5_core_destroy_psv(dev->mdev, + mr->sig->psv_wire.psv_idx)) + mlx5_ib_warn(dev, "failed to destroy wire psv %d\n", + mr->sig->psv_wire.psv_idx); + kfree(mr->sig); + } + + err = destroy_mkey(dev, mr); + if (err) { + mlx5_ib_warn(dev, "failed to destroy mkey 0x%x (%d)\n", + mr->mmr.key, err); + return err; + } + + kfree(mr); + + return err; +} + +struct ib_mr *mlx5_ib_alloc_fast_reg_mr(struct ib_pd *pd, + int max_page_list_len) +{ + struct mlx5_ib_dev *dev = to_mdev(pd->device); + struct mlx5_create_mkey_mbox_in *in; + struct mlx5_ib_mr *mr; + int err; + + mr = kzalloc(sizeof(*mr), GFP_KERNEL); + if (!mr) + return ERR_PTR(-ENOMEM); + + in = kzalloc(sizeof(*in), GFP_KERNEL); + if (!in) { + err = -ENOMEM; + goto err_free; + } + + in->seg.status = MLX5_MKEY_STATUS_FREE; + in->seg.xlt_oct_size = cpu_to_be32((max_page_list_len + 1) / 2); + 
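Both get_octo_len earlier and the fast-reg allocation just above size the translation table as (npages + 1) / 2, i.e. two 8-byte page entries packed per 16-byte octoword. The stand-alone re-derivation below uses that same arithmetic with illustrative names only; it is not the driver's helper.

#include <stdint.h>
#include <stdio.h>

/* Number of 16-byte translation units needed for a region: count the
 * pages touched (including the offset into the first page), then pack
 * two 8-byte entries per unit, rounding up. */
static int octo_len(uint64_t addr, uint64_t len, uint64_t page_size)
{
	uint64_t offset = addr & (page_size - 1);
	uint64_t npages = (len + offset + page_size - 1) / page_size;

	return (int)((npages + 1) / 2);
}

int main(void)
{
	/* 1 MiB starting 512 bytes into a 4 KiB page spans 257 pages,
	 * which packs into (257 + 1) / 2 = 129 octowords. */
	printf("octowords needed: %d\n", octo_len(0x200, 1ULL << 20, 4096));
	return 0;
}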
in->seg.qpn_mkey7_0 = cpu_to_be32(0xffffff << 8); + in->seg.flags = MLX5_PERM_UMR_EN | MLX5_ACCESS_MODE_MTT; + in->seg.flags_pd = cpu_to_be32(to_mpd(pd)->pdn); + /* + * TBD not needed - issue 197292 */ + in->seg.log2_page_size = PAGE_SHIFT; + + err = mlx5_core_create_mkey(dev->mdev, &mr->mmr, in, sizeof(*in), NULL, + NULL, NULL); + kfree(in); + if (err) + goto err_free; + + mr->ibmr.lkey = mr->mmr.key; + mr->ibmr.rkey = mr->mmr.key; + mr->umem = NULL; + + return &mr->ibmr; + +err_free: + kfree(mr); + return ERR_PTR(err); +} + +struct ib_fast_reg_page_list *mlx5_ib_alloc_fast_reg_page_list(struct ib_device *ibdev, + int page_list_len) +{ + struct mlx5_ib_fast_reg_page_list *mfrpl; + int size = page_list_len * sizeof(u64); + + mfrpl = kmalloc(sizeof(*mfrpl), GFP_KERNEL); + if (!mfrpl) + return ERR_PTR(-ENOMEM); + + mfrpl->ibfrpl.page_list = kmalloc(size, GFP_KERNEL); + if (!mfrpl->ibfrpl.page_list) + goto err_free; + + mfrpl->mapped_page_list = dma_alloc_coherent(ibdev->dma_device, + size, &mfrpl->map, + GFP_KERNEL); + if (!mfrpl->mapped_page_list) + goto err_free; + + WARN_ON(mfrpl->map & 0x3f); + + return &mfrpl->ibfrpl; + +err_free: + kfree(mfrpl->ibfrpl.page_list); + kfree(mfrpl); + return ERR_PTR(-ENOMEM); +} + +void mlx5_ib_free_fast_reg_page_list(struct ib_fast_reg_page_list *page_list) +{ + struct mlx5_ib_fast_reg_page_list *mfrpl = to_mfrpl(page_list); + struct mlx5_ib_dev *dev = to_mdev(page_list->device); + int size = page_list->max_page_list_len * sizeof(u64); + + dma_free_coherent(&dev->mdev->pdev->dev, size, mfrpl->mapped_page_list, + mfrpl->map); + kfree(mfrpl->ibfrpl.page_list); + kfree(mfrpl); +} + +int mlx5_ib_check_mr_status(struct ib_mr *ibmr, u32 check_mask, + struct ib_mr_status *mr_status) +{ + struct mlx5_ib_mr *mmr = to_mmr(ibmr); + int ret = 0; + + if (check_mask & ~IB_MR_CHECK_SIG_STATUS) { + pr_err("Invalid status check mask\n"); + ret = -EINVAL; + goto done; + } + + mr_status->fail_status = 0; + if (check_mask & IB_MR_CHECK_SIG_STATUS) { + if (!mmr->sig) { + ret = -EINVAL; + pr_err("signature status check requested on a non-signature enabled MR\n"); + goto done; + } + + mmr->sig->sig_status_checked = true; + if (!mmr->sig->sig_err_exists) + goto done; + + if (ibmr->lkey == mmr->sig->err_item.key) + memcpy(&mr_status->sig_err, &mmr->sig->err_item, + sizeof(mr_status->sig_err)); + else { + mr_status->sig_err.err_type = IB_SIG_BAD_GUARD; + mr_status->sig_err.sig_err_offset = 0; + mr_status->sig_err.key = mmr->sig->err_item.key; + } + + mmr->sig->sig_err_exists = false; + mr_status->fail_status |= IB_MR_CHECK_SIG_STATUS; + } + +done: + return ret; +} diff --git a/kernel/drivers/infiniband/hw/mlx5/odp.c b/kernel/drivers/infiniband/hw/mlx5/odp.c new file mode 100644 index 000000000..5099db08a --- /dev/null +++ b/kernel/drivers/infiniband/hw/mlx5/odp.c @@ -0,0 +1,798 @@ +/* + * Copyright (c) 2013-2015, Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. 
+ * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include +#include + +#include "mlx5_ib.h" + +#define MAX_PREFETCH_LEN (4*1024*1024U) + +/* Timeout in ms to wait for an active mmu notifier to complete when handling + * a pagefault. */ +#define MMU_NOTIFIER_TIMEOUT 1000 + +struct workqueue_struct *mlx5_ib_page_fault_wq; + +void mlx5_ib_invalidate_range(struct ib_umem *umem, unsigned long start, + unsigned long end) +{ + struct mlx5_ib_mr *mr; + const u64 umr_block_mask = (MLX5_UMR_MTT_ALIGNMENT / sizeof(u64)) - 1; + u64 idx = 0, blk_start_idx = 0; + int in_block = 0; + u64 addr; + + if (!umem || !umem->odp_data) { + pr_err("invalidation called on NULL umem or non-ODP umem\n"); + return; + } + + mr = umem->odp_data->private; + + if (!mr || !mr->ibmr.pd) + return; + + start = max_t(u64, ib_umem_start(umem), start); + end = min_t(u64, ib_umem_end(umem), end); + + /* + * Iteration one - zap the HW's MTTs. The notifiers_count ensures that + * while we are doing the invalidation, no page fault will attempt to + * overwrite the same MTTs. Concurent invalidations might race us, + * but they will write 0s as well, so no difference in the end result. + */ + + for (addr = start; addr < end; addr += (u64)umem->page_size) { + idx = (addr - ib_umem_start(umem)) / PAGE_SIZE; + /* + * Strive to write the MTTs in chunks, but avoid overwriting + * non-existing MTTs. The huristic here can be improved to + * estimate the cost of another UMR vs. the cost of bigger + * UMR. + */ + if (umem->odp_data->dma_list[idx] & + (ODP_READ_ALLOWED_BIT | ODP_WRITE_ALLOWED_BIT)) { + if (!in_block) { + blk_start_idx = idx; + in_block = 1; + } + } else { + u64 umr_offset = idx & umr_block_mask; + + if (in_block && umr_offset == 0) { + mlx5_ib_update_mtt(mr, blk_start_idx, + idx - blk_start_idx, 1); + in_block = 0; + } + } + } + if (in_block) + mlx5_ib_update_mtt(mr, blk_start_idx, idx - blk_start_idx + 1, + 1); + + /* + * We are now sure that the device will not access the + * memory. We can safely unmap it, and mark it as dirty if + * needed. 
+ */ + + ib_umem_odp_unmap_dma_pages(umem, start, end); +} + +#define COPY_ODP_BIT_MLX_TO_IB(reg, ib_caps, field_name, bit_name) do { \ + if (be32_to_cpu(reg.field_name) & MLX5_ODP_SUPPORT_##bit_name) \ + ib_caps->field_name |= IB_ODP_SUPPORT_##bit_name; \ +} while (0) + +int mlx5_ib_internal_query_odp_caps(struct mlx5_ib_dev *dev) +{ + int err; + struct mlx5_odp_caps hw_caps; + struct ib_odp_caps *caps = &dev->odp_caps; + + memset(caps, 0, sizeof(*caps)); + + if (!(dev->mdev->caps.gen.flags & MLX5_DEV_CAP_FLAG_ON_DMND_PG)) + return 0; + + err = mlx5_query_odp_caps(dev->mdev, &hw_caps); + if (err) + goto out; + + caps->general_caps = IB_ODP_SUPPORT; + COPY_ODP_BIT_MLX_TO_IB(hw_caps, caps, per_transport_caps.ud_odp_caps, + SEND); + COPY_ODP_BIT_MLX_TO_IB(hw_caps, caps, per_transport_caps.rc_odp_caps, + SEND); + COPY_ODP_BIT_MLX_TO_IB(hw_caps, caps, per_transport_caps.rc_odp_caps, + RECV); + COPY_ODP_BIT_MLX_TO_IB(hw_caps, caps, per_transport_caps.rc_odp_caps, + WRITE); + COPY_ODP_BIT_MLX_TO_IB(hw_caps, caps, per_transport_caps.rc_odp_caps, + READ); + +out: + return err; +} + +static struct mlx5_ib_mr *mlx5_ib_odp_find_mr_lkey(struct mlx5_ib_dev *dev, + u32 key) +{ + u32 base_key = mlx5_base_mkey(key); + struct mlx5_core_mr *mmr = __mlx5_mr_lookup(dev->mdev, base_key); + struct mlx5_ib_mr *mr = container_of(mmr, struct mlx5_ib_mr, mmr); + + if (!mmr || mmr->key != key || !mr->live) + return NULL; + + return container_of(mmr, struct mlx5_ib_mr, mmr); +} + +static void mlx5_ib_page_fault_resume(struct mlx5_ib_qp *qp, + struct mlx5_ib_pfault *pfault, + int error) { + struct mlx5_ib_dev *dev = to_mdev(qp->ibqp.pd->device); + int ret = mlx5_core_page_fault_resume(dev->mdev, qp->mqp.qpn, + pfault->mpfault.flags, + error); + if (ret) + pr_err("Failed to resolve the page fault on QP 0x%x\n", + qp->mqp.qpn); +} + +/* + * Handle a single data segment in a page-fault WQE. + * + * Returns number of pages retrieved on success. The caller will continue to + * the next data segment. + * Can return the following error codes: + * -EAGAIN to designate a temporary error. The caller will abort handling the + * page fault and resolve it. + * -EFAULT when there's an error mapping the requested pages. The caller will + * abort the page fault handling and possibly move the QP to an error state. + * On other errors the QP should also be closed with an error. + */ +static int pagefault_single_data_segment(struct mlx5_ib_qp *qp, + struct mlx5_ib_pfault *pfault, + u32 key, u64 io_virt, size_t bcnt, + u32 *bytes_mapped) +{ + struct mlx5_ib_dev *mib_dev = to_mdev(qp->ibqp.pd->device); + int srcu_key; + unsigned int current_seq; + u64 start_idx; + int npages = 0, ret = 0; + struct mlx5_ib_mr *mr; + u64 access_mask = ODP_READ_ALLOWED_BIT; + + srcu_key = srcu_read_lock(&mib_dev->mr_srcu); + mr = mlx5_ib_odp_find_mr_lkey(mib_dev, key); + /* + * If we didn't find the MR, it means the MR was closed while we were + * handling the ODP event. In this case we return -EFAULT so that the + * QP will be closed. 
+ */ + if (!mr || !mr->ibmr.pd) { + pr_err("Failed to find relevant mr for lkey=0x%06x, probably the MR was destroyed\n", + key); + ret = -EFAULT; + goto srcu_unlock; + } + if (!mr->umem->odp_data) { + pr_debug("skipping non ODP MR (lkey=0x%06x) in page fault handler.\n", + key); + if (bytes_mapped) + *bytes_mapped += + (bcnt - pfault->mpfault.bytes_committed); + goto srcu_unlock; + } + if (mr->ibmr.pd != qp->ibqp.pd) { + pr_err("Page-fault with different PDs for QP and MR.\n"); + ret = -EFAULT; + goto srcu_unlock; + } + + current_seq = ACCESS_ONCE(mr->umem->odp_data->notifiers_seq); + /* + * Ensure the sequence number is valid for some time before we call + * gup. + */ + smp_rmb(); + + /* + * Avoid branches - this code will perform correctly + * in all iterations (in iteration 2 and above, + * bytes_committed == 0). + */ + io_virt += pfault->mpfault.bytes_committed; + bcnt -= pfault->mpfault.bytes_committed; + + start_idx = (io_virt - (mr->mmr.iova & PAGE_MASK)) >> PAGE_SHIFT; + + if (mr->umem->writable) + access_mask |= ODP_WRITE_ALLOWED_BIT; + npages = ib_umem_odp_map_dma_pages(mr->umem, io_virt, bcnt, + access_mask, current_seq); + if (npages < 0) { + ret = npages; + goto srcu_unlock; + } + + if (npages > 0) { + mutex_lock(&mr->umem->odp_data->umem_mutex); + if (!ib_umem_mmu_notifier_retry(mr->umem, current_seq)) { + /* + * No need to check whether the MTTs really belong to + * this MR, since ib_umem_odp_map_dma_pages already + * checks this. + */ + ret = mlx5_ib_update_mtt(mr, start_idx, npages, 0); + } else { + ret = -EAGAIN; + } + mutex_unlock(&mr->umem->odp_data->umem_mutex); + if (ret < 0) { + if (ret != -EAGAIN) + pr_err("Failed to update mkey page tables\n"); + goto srcu_unlock; + } + + if (bytes_mapped) { + u32 new_mappings = npages * PAGE_SIZE - + (io_virt - round_down(io_virt, PAGE_SIZE)); + *bytes_mapped += min_t(u32, new_mappings, bcnt); + } + } + +srcu_unlock: + if (ret == -EAGAIN) { + if (!mr->umem->odp_data->dying) { + struct ib_umem_odp *odp_data = mr->umem->odp_data; + unsigned long timeout = + msecs_to_jiffies(MMU_NOTIFIER_TIMEOUT); + + if (!wait_for_completion_timeout( + &odp_data->notifier_completion, + timeout)) { + pr_warn("timeout waiting for mmu notifier completion\n"); + } + } else { + /* The MR is being killed, kill the QP as well. */ + ret = -EFAULT; + } + } + srcu_read_unlock(&mib_dev->mr_srcu, srcu_key); + pfault->mpfault.bytes_committed = 0; + return ret ? ret : npages; +} + +/** + * Parse a series of data segments for page fault handling. + * + * @qp the QP on which the fault occurred. + * @pfault contains page fault information. + * @wqe points at the first data segment in the WQE. + * @wqe_end points after the end of the WQE. + * @bytes_mapped receives the number of bytes that the function was able to + * map. This allows the caller to decide intelligently whether + * enough memory was mapped to resolve the page fault + * successfully (e.g. enough for the next MTU, or the entire + * WQE). + * @total_wqe_bytes receives the total data size of this WQE in bytes (minus + * the committed bytes). + * + * Returns the number of pages loaded if positive, zero for an empty WQE, or a + * negative error code. + */ +static int pagefault_data_segments(struct mlx5_ib_qp *qp, + struct mlx5_ib_pfault *pfault, void *wqe, + void *wqe_end, u32 *bytes_mapped, + u32 *total_wqe_bytes, int receive_queue) +{ + int ret = 0, npages = 0; + u64 io_virt; + u32 key; + u32 byte_count; + size_t bcnt; + int inline_segment; + + /* Skip SRQ next-WQE segment. 
*/ + if (receive_queue && qp->ibqp.srq) + wqe += sizeof(struct mlx5_wqe_srq_next_seg); + + if (bytes_mapped) + *bytes_mapped = 0; + if (total_wqe_bytes) + *total_wqe_bytes = 0; + + while (wqe < wqe_end) { + struct mlx5_wqe_data_seg *dseg = wqe; + + io_virt = be64_to_cpu(dseg->addr); + key = be32_to_cpu(dseg->lkey); + byte_count = be32_to_cpu(dseg->byte_count); + inline_segment = !!(byte_count & MLX5_INLINE_SEG); + bcnt = byte_count & ~MLX5_INLINE_SEG; + + if (inline_segment) { + bcnt = bcnt & MLX5_WQE_INLINE_SEG_BYTE_COUNT_MASK; + wqe += ALIGN(sizeof(struct mlx5_wqe_inline_seg) + bcnt, + 16); + } else { + wqe += sizeof(*dseg); + } + + /* receive WQE end of sg list. */ + if (receive_queue && bcnt == 0 && key == MLX5_INVALID_LKEY && + io_virt == 0) + break; + + if (!inline_segment && total_wqe_bytes) { + *total_wqe_bytes += bcnt - min_t(size_t, bcnt, + pfault->mpfault.bytes_committed); + } + + /* A zero length data segment designates a length of 2GB. */ + if (bcnt == 0) + bcnt = 1U << 31; + + if (inline_segment || bcnt <= pfault->mpfault.bytes_committed) { + pfault->mpfault.bytes_committed -= + min_t(size_t, bcnt, + pfault->mpfault.bytes_committed); + continue; + } + + ret = pagefault_single_data_segment(qp, pfault, key, io_virt, + bcnt, bytes_mapped); + if (ret < 0) + break; + npages += ret; + } + + return ret < 0 ? ret : npages; +} + +/* + * Parse initiator WQE. Advances the wqe pointer to point at the + * scatter-gather list, and set wqe_end to the end of the WQE. + */ +static int mlx5_ib_mr_initiator_pfault_handler( + struct mlx5_ib_qp *qp, struct mlx5_ib_pfault *pfault, + void **wqe, void **wqe_end, int wqe_length) +{ + struct mlx5_ib_dev *dev = to_mdev(qp->ibqp.pd->device); + struct mlx5_wqe_ctrl_seg *ctrl = *wqe; + u16 wqe_index = pfault->mpfault.wqe.wqe_index; + unsigned ds, opcode; +#if defined(DEBUG) + u32 ctrl_wqe_index, ctrl_qpn; +#endif + + ds = be32_to_cpu(ctrl->qpn_ds) & MLX5_WQE_CTRL_DS_MASK; + if (ds * MLX5_WQE_DS_UNITS > wqe_length) { + mlx5_ib_err(dev, "Unable to read the complete WQE. ds = 0x%x, ret = 0x%x\n", + ds, wqe_length); + return -EFAULT; + } + + if (ds == 0) { + mlx5_ib_err(dev, "Got WQE with zero DS. wqe_index=%x, qpn=%x\n", + wqe_index, qp->mqp.qpn); + return -EFAULT; + } + +#if defined(DEBUG) + ctrl_wqe_index = (be32_to_cpu(ctrl->opmod_idx_opcode) & + MLX5_WQE_CTRL_WQE_INDEX_MASK) >> + MLX5_WQE_CTRL_WQE_INDEX_SHIFT; + if (wqe_index != ctrl_wqe_index) { + mlx5_ib_err(dev, "Got WQE with invalid wqe_index. wqe_index=0x%x, qpn=0x%x ctrl->wqe_index=0x%x\n", + wqe_index, qp->mqp.qpn, + ctrl_wqe_index); + return -EFAULT; + } + + ctrl_qpn = (be32_to_cpu(ctrl->qpn_ds) & MLX5_WQE_CTRL_QPN_MASK) >> + MLX5_WQE_CTRL_QPN_SHIFT; + if (qp->mqp.qpn != ctrl_qpn) { + mlx5_ib_err(dev, "Got WQE with incorrect QP number. 
wqe_index=0x%x, qpn=0x%x ctrl->qpn=0x%x\n", + wqe_index, qp->mqp.qpn, + ctrl_qpn); + return -EFAULT; + } +#endif /* DEBUG */ + + *wqe_end = *wqe + ds * MLX5_WQE_DS_UNITS; + *wqe += sizeof(*ctrl); + + opcode = be32_to_cpu(ctrl->opmod_idx_opcode) & + MLX5_WQE_CTRL_OPCODE_MASK; + switch (qp->ibqp.qp_type) { + case IB_QPT_RC: + switch (opcode) { + case MLX5_OPCODE_SEND: + case MLX5_OPCODE_SEND_IMM: + case MLX5_OPCODE_SEND_INVAL: + if (!(dev->odp_caps.per_transport_caps.rc_odp_caps & + IB_ODP_SUPPORT_SEND)) + goto invalid_transport_or_opcode; + break; + case MLX5_OPCODE_RDMA_WRITE: + case MLX5_OPCODE_RDMA_WRITE_IMM: + if (!(dev->odp_caps.per_transport_caps.rc_odp_caps & + IB_ODP_SUPPORT_WRITE)) + goto invalid_transport_or_opcode; + *wqe += sizeof(struct mlx5_wqe_raddr_seg); + break; + case MLX5_OPCODE_RDMA_READ: + if (!(dev->odp_caps.per_transport_caps.rc_odp_caps & + IB_ODP_SUPPORT_READ)) + goto invalid_transport_or_opcode; + *wqe += sizeof(struct mlx5_wqe_raddr_seg); + break; + default: + goto invalid_transport_or_opcode; + } + break; + case IB_QPT_UD: + switch (opcode) { + case MLX5_OPCODE_SEND: + case MLX5_OPCODE_SEND_IMM: + if (!(dev->odp_caps.per_transport_caps.ud_odp_caps & + IB_ODP_SUPPORT_SEND)) + goto invalid_transport_or_opcode; + *wqe += sizeof(struct mlx5_wqe_datagram_seg); + break; + default: + goto invalid_transport_or_opcode; + } + break; + default: +invalid_transport_or_opcode: + mlx5_ib_err(dev, "ODP fault on QP of an unsupported opcode or transport. transport: 0x%x opcode: 0x%x.\n", + qp->ibqp.qp_type, opcode); + return -EFAULT; + } + + return 0; +} + +/* + * Parse responder WQE. Advances the wqe pointer to point at the + * scatter-gather list, and set wqe_end to the end of the WQE. + */ +static int mlx5_ib_mr_responder_pfault_handler( + struct mlx5_ib_qp *qp, struct mlx5_ib_pfault *pfault, + void **wqe, void **wqe_end, int wqe_length) +{ + struct mlx5_ib_dev *dev = to_mdev(qp->ibqp.pd->device); + struct mlx5_ib_wq *wq = &qp->rq; + int wqe_size = 1 << wq->wqe_shift; + + if (qp->ibqp.srq) { + mlx5_ib_err(dev, "ODP fault on SRQ is not supported\n"); + return -EFAULT; + } + + if (qp->wq_sig) { + mlx5_ib_err(dev, "ODP fault with WQE signatures is not supported\n"); + return -EFAULT; + } + + if (wqe_size > wqe_length) { + mlx5_ib_err(dev, "Couldn't read all of the receive WQE's content\n"); + return -EFAULT; + } + + switch (qp->ibqp.qp_type) { + case IB_QPT_RC: + if (!(dev->odp_caps.per_transport_caps.rc_odp_caps & + IB_ODP_SUPPORT_RECV)) + goto invalid_transport_or_opcode; + break; + default: +invalid_transport_or_opcode: + mlx5_ib_err(dev, "ODP fault on QP of an unsupported transport. 
transport: 0x%x\n", + qp->ibqp.qp_type); + return -EFAULT; + } + + *wqe_end = *wqe + wqe_size; + + return 0; +} + +static void mlx5_ib_mr_wqe_pfault_handler(struct mlx5_ib_qp *qp, + struct mlx5_ib_pfault *pfault) +{ + struct mlx5_ib_dev *dev = to_mdev(qp->ibqp.pd->device); + int ret; + void *wqe, *wqe_end; + u32 bytes_mapped, total_wqe_bytes; + char *buffer = NULL; + int resume_with_error = 0; + u16 wqe_index = pfault->mpfault.wqe.wqe_index; + int requestor = pfault->mpfault.flags & MLX5_PFAULT_REQUESTOR; + + buffer = (char *)__get_free_page(GFP_KERNEL); + if (!buffer) { + mlx5_ib_err(dev, "Error allocating memory for IO page fault handling.\n"); + resume_with_error = 1; + goto resolve_page_fault; + } + + ret = mlx5_ib_read_user_wqe(qp, requestor, wqe_index, buffer, + PAGE_SIZE); + if (ret < 0) { + mlx5_ib_err(dev, "Failed reading a WQE following page fault, error=%x, wqe_index=%x, qpn=%x\n", + -ret, wqe_index, qp->mqp.qpn); + resume_with_error = 1; + goto resolve_page_fault; + } + + wqe = buffer; + if (requestor) + ret = mlx5_ib_mr_initiator_pfault_handler(qp, pfault, &wqe, + &wqe_end, ret); + else + ret = mlx5_ib_mr_responder_pfault_handler(qp, pfault, &wqe, + &wqe_end, ret); + if (ret < 0) { + resume_with_error = 1; + goto resolve_page_fault; + } + + if (wqe >= wqe_end) { + mlx5_ib_err(dev, "ODP fault on invalid WQE.\n"); + resume_with_error = 1; + goto resolve_page_fault; + } + + ret = pagefault_data_segments(qp, pfault, wqe, wqe_end, &bytes_mapped, + &total_wqe_bytes, !requestor); + if (ret == -EAGAIN) { + goto resolve_page_fault; + } else if (ret < 0 || total_wqe_bytes > bytes_mapped) { + mlx5_ib_err(dev, "Error getting user pages for page fault. Error: 0x%x\n", + -ret); + resume_with_error = 1; + goto resolve_page_fault; + } + +resolve_page_fault: + mlx5_ib_page_fault_resume(qp, pfault, resume_with_error); + mlx5_ib_dbg(dev, "PAGE FAULT completed. QP 0x%x resume_with_error=%d, flags: 0x%x\n", + qp->mqp.qpn, resume_with_error, pfault->mpfault.flags); + + free_page((unsigned long)buffer); +} + +static int pages_in_range(u64 address, u32 length) +{ + return (ALIGN(address + length, PAGE_SIZE) - + (address & PAGE_MASK)) >> PAGE_SHIFT; +} + +static void mlx5_ib_mr_rdma_pfault_handler(struct mlx5_ib_qp *qp, + struct mlx5_ib_pfault *pfault) +{ + struct mlx5_pagefault *mpfault = &pfault->mpfault; + u64 address; + u32 length; + u32 prefetch_len = mpfault->bytes_committed; + int prefetch_activated = 0; + u32 rkey = mpfault->rdma.r_key; + int ret; + + /* The RDMA responder handler handles the page fault in two parts. + * First it brings the necessary pages for the current packet + * (and uses the pfault context), and then (after resuming the QP) + * prefetches more pages. The second operation cannot use the pfault + * context and therefore uses the dummy_pfault context allocated on + * the stack */ + struct mlx5_ib_pfault dummy_pfault = {}; + + dummy_pfault.mpfault.bytes_committed = 0; + + mpfault->rdma.rdma_va += mpfault->bytes_committed; + mpfault->rdma.rdma_op_len -= min(mpfault->bytes_committed, + mpfault->rdma.rdma_op_len); + mpfault->bytes_committed = 0; + + address = mpfault->rdma.rdma_va; + length = mpfault->rdma.rdma_op_len; + + /* For some operations, the hardware cannot tell the exact message + * length, and in those cases it reports zero. Use prefetch + * logic. 
*/ + if (length == 0) { + prefetch_activated = 1; + length = mpfault->rdma.packet_size; + prefetch_len = min(MAX_PREFETCH_LEN, prefetch_len); + } + + ret = pagefault_single_data_segment(qp, pfault, rkey, address, length, + NULL); + if (ret == -EAGAIN) { + /* We're racing with an invalidation, don't prefetch */ + prefetch_activated = 0; + } else if (ret < 0 || pages_in_range(address, length) > ret) { + mlx5_ib_page_fault_resume(qp, pfault, 1); + return; + } + + mlx5_ib_page_fault_resume(qp, pfault, 0); + + /* At this point, there might be a new pagefault already arriving in + * the eq, switch to the dummy pagefault for the rest of the + * processing. We're still OK with the objects being alive as the + * work-queue is being fenced. */ + + if (prefetch_activated) { + ret = pagefault_single_data_segment(qp, &dummy_pfault, rkey, + address, + prefetch_len, + NULL); + if (ret < 0) { + pr_warn("Prefetch failed (ret = %d, prefetch_activated = %d) for QPN %d, address: 0x%.16llx, length = 0x%.16x\n", + ret, prefetch_activated, + qp->ibqp.qp_num, address, prefetch_len); + } + } +} + +void mlx5_ib_mr_pfault_handler(struct mlx5_ib_qp *qp, + struct mlx5_ib_pfault *pfault) +{ + u8 event_subtype = pfault->mpfault.event_subtype; + + switch (event_subtype) { + case MLX5_PFAULT_SUBTYPE_WQE: + mlx5_ib_mr_wqe_pfault_handler(qp, pfault); + break; + case MLX5_PFAULT_SUBTYPE_RDMA: + mlx5_ib_mr_rdma_pfault_handler(qp, pfault); + break; + default: + pr_warn("Invalid page fault event subtype: 0x%x\n", + event_subtype); + mlx5_ib_page_fault_resume(qp, pfault, 1); + break; + } +} + +static void mlx5_ib_qp_pfault_action(struct work_struct *work) +{ + struct mlx5_ib_pfault *pfault = container_of(work, + struct mlx5_ib_pfault, + work); + enum mlx5_ib_pagefault_context context = + mlx5_ib_get_pagefault_context(&pfault->mpfault); + struct mlx5_ib_qp *qp = container_of(pfault, struct mlx5_ib_qp, + pagefaults[context]); + mlx5_ib_mr_pfault_handler(qp, pfault); +} + +void mlx5_ib_qp_disable_pagefaults(struct mlx5_ib_qp *qp) +{ + unsigned long flags; + + spin_lock_irqsave(&qp->disable_page_faults_lock, flags); + qp->disable_page_faults = 1; + spin_unlock_irqrestore(&qp->disable_page_faults_lock, flags); + + /* + * Note that at this point, we are guarenteed that no more + * work queue elements will be posted to the work queue with + * the QP we are closing. + */ + flush_workqueue(mlx5_ib_page_fault_wq); +} + +void mlx5_ib_qp_enable_pagefaults(struct mlx5_ib_qp *qp) +{ + unsigned long flags; + + spin_lock_irqsave(&qp->disable_page_faults_lock, flags); + qp->disable_page_faults = 0; + spin_unlock_irqrestore(&qp->disable_page_faults_lock, flags); +} + +static void mlx5_ib_pfault_handler(struct mlx5_core_qp *qp, + struct mlx5_pagefault *pfault) +{ + /* + * Note that we will only get one fault event per QP per context + * (responder/initiator, read/write), until we resolve the page fault + * with the mlx5_ib_page_fault_resume command. Since this function is + * called from within the work element, there is no risk of missing + * events. 
+ */ + struct mlx5_ib_qp *mibqp = to_mibqp(qp); + enum mlx5_ib_pagefault_context context = + mlx5_ib_get_pagefault_context(pfault); + struct mlx5_ib_pfault *qp_pfault = &mibqp->pagefaults[context]; + + qp_pfault->mpfault = *pfault; + + /* No need to stop interrupts here since we are in an interrupt */ + spin_lock(&mibqp->disable_page_faults_lock); + if (!mibqp->disable_page_faults) + queue_work(mlx5_ib_page_fault_wq, &qp_pfault->work); + spin_unlock(&mibqp->disable_page_faults_lock); +} + +void mlx5_ib_odp_create_qp(struct mlx5_ib_qp *qp) +{ + int i; + + qp->disable_page_faults = 1; + spin_lock_init(&qp->disable_page_faults_lock); + + qp->mqp.pfault_handler = mlx5_ib_pfault_handler; + + for (i = 0; i < MLX5_IB_PAGEFAULT_CONTEXTS; ++i) + INIT_WORK(&qp->pagefaults[i].work, mlx5_ib_qp_pfault_action); +} + +int mlx5_ib_odp_init_one(struct mlx5_ib_dev *ibdev) +{ + int ret; + + ret = init_srcu_struct(&ibdev->mr_srcu); + if (ret) + return ret; + + return 0; +} + +void mlx5_ib_odp_remove_one(struct mlx5_ib_dev *ibdev) +{ + cleanup_srcu_struct(&ibdev->mr_srcu); +} + +int __init mlx5_ib_odp_init(void) +{ + mlx5_ib_page_fault_wq = + create_singlethread_workqueue("mlx5_ib_page_faults"); + if (!mlx5_ib_page_fault_wq) + return -ENOMEM; + + return 0; +} + +void mlx5_ib_odp_cleanup(void) +{ + destroy_workqueue(mlx5_ib_page_fault_wq); +} diff --git a/kernel/drivers/infiniband/hw/mlx5/qp.c b/kernel/drivers/infiniband/hw/mlx5/qp.c new file mode 100644 index 000000000..d35f62d4f --- /dev/null +++ b/kernel/drivers/infiniband/hw/mlx5/qp.c @@ -0,0 +1,3174 @@ +/* + * Copyright (c) 2013-2015, Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include +#include +#include "mlx5_ib.h" +#include "user.h" + +/* not supported currently */ +static int wq_signature; + +enum { + MLX5_IB_ACK_REQ_FREQ = 8, +}; + +enum { + MLX5_IB_DEFAULT_SCHED_QUEUE = 0x83, + MLX5_IB_DEFAULT_QP0_SCHED_QUEUE = 0x3f, + MLX5_IB_LINK_TYPE_IB = 0, + MLX5_IB_LINK_TYPE_ETH = 1 +}; + +enum { + MLX5_IB_SQ_STRIDE = 6, + MLX5_IB_CACHE_LINE_SIZE = 64, +}; + +static const u32 mlx5_ib_opcode[] = { + [IB_WR_SEND] = MLX5_OPCODE_SEND, + [IB_WR_SEND_WITH_IMM] = MLX5_OPCODE_SEND_IMM, + [IB_WR_RDMA_WRITE] = MLX5_OPCODE_RDMA_WRITE, + [IB_WR_RDMA_WRITE_WITH_IMM] = MLX5_OPCODE_RDMA_WRITE_IMM, + [IB_WR_RDMA_READ] = MLX5_OPCODE_RDMA_READ, + [IB_WR_ATOMIC_CMP_AND_SWP] = MLX5_OPCODE_ATOMIC_CS, + [IB_WR_ATOMIC_FETCH_AND_ADD] = MLX5_OPCODE_ATOMIC_FA, + [IB_WR_SEND_WITH_INV] = MLX5_OPCODE_SEND_INVAL, + [IB_WR_LOCAL_INV] = MLX5_OPCODE_UMR, + [IB_WR_FAST_REG_MR] = MLX5_OPCODE_UMR, + [IB_WR_MASKED_ATOMIC_CMP_AND_SWP] = MLX5_OPCODE_ATOMIC_MASKED_CS, + [IB_WR_MASKED_ATOMIC_FETCH_AND_ADD] = MLX5_OPCODE_ATOMIC_MASKED_FA, + [MLX5_IB_WR_UMR] = MLX5_OPCODE_UMR, +}; + + +static int is_qp0(enum ib_qp_type qp_type) +{ + return qp_type == IB_QPT_SMI; +} + +static int is_qp1(enum ib_qp_type qp_type) +{ + return qp_type == IB_QPT_GSI; +} + +static int is_sqp(enum ib_qp_type qp_type) +{ + return is_qp0(qp_type) || is_qp1(qp_type); +} + +static void *get_wqe(struct mlx5_ib_qp *qp, int offset) +{ + return mlx5_buf_offset(&qp->buf, offset); +} + +static void *get_recv_wqe(struct mlx5_ib_qp *qp, int n) +{ + return get_wqe(qp, qp->rq.offset + (n << qp->rq.wqe_shift)); +} + +void *mlx5_get_send_wqe(struct mlx5_ib_qp *qp, int n) +{ + return get_wqe(qp, qp->sq.offset + (n << MLX5_IB_SQ_STRIDE)); +} + +/** + * mlx5_ib_read_user_wqe() - Copy a user-space WQE to kernel space. + * + * @qp: QP to copy from. + * @send: copy from the send queue when non-zero, use the receive queue + * otherwise. + * @wqe_index: index to start copying from. For send work queues, the + * wqe_index is in units of MLX5_SEND_WQE_BB. + * For receive work queue, it is the number of work queue + * element in the queue. + * @buffer: destination buffer. + * @length: maximum number of bytes to copy. + * + * Copies at least a single WQE, but may copy more data. + * + * Return: the number of bytes copied, or an error code. + */ +int mlx5_ib_read_user_wqe(struct mlx5_ib_qp *qp, int send, int wqe_index, + void *buffer, u32 length) +{ + struct ib_device *ibdev = qp->ibqp.device; + struct mlx5_ib_dev *dev = to_mdev(ibdev); + struct mlx5_ib_wq *wq = send ? &qp->sq : &qp->rq; + size_t offset; + size_t wq_end; + struct ib_umem *umem = qp->umem; + u32 first_copy_length; + int wqe_length; + int ret; + + if (wq->wqe_cnt == 0) { + mlx5_ib_dbg(dev, "mlx5_ib_read_user_wqe for a QP with wqe_cnt == 0. 
qp_type: 0x%x\n", + qp->ibqp.qp_type); + return -EINVAL; + } + + offset = wq->offset + ((wqe_index % wq->wqe_cnt) << wq->wqe_shift); + wq_end = wq->offset + (wq->wqe_cnt << wq->wqe_shift); + + if (send && length < sizeof(struct mlx5_wqe_ctrl_seg)) + return -EINVAL; + + if (offset > umem->length || + (send && offset + sizeof(struct mlx5_wqe_ctrl_seg) > umem->length)) + return -EINVAL; + + first_copy_length = min_t(u32, offset + length, wq_end) - offset; + ret = ib_umem_copy_from(buffer, umem, offset, first_copy_length); + if (ret) + return ret; + + if (send) { + struct mlx5_wqe_ctrl_seg *ctrl = buffer; + int ds = be32_to_cpu(ctrl->qpn_ds) & MLX5_WQE_CTRL_DS_MASK; + + wqe_length = ds * MLX5_WQE_DS_UNITS; + } else { + wqe_length = 1 << wq->wqe_shift; + } + + if (wqe_length <= first_copy_length) + return first_copy_length; + + ret = ib_umem_copy_from(buffer + first_copy_length, umem, wq->offset, + wqe_length - first_copy_length); + if (ret) + return ret; + + return wqe_length; +} + +static void mlx5_ib_qp_event(struct mlx5_core_qp *qp, int type) +{ + struct ib_qp *ibqp = &to_mibqp(qp)->ibqp; + struct ib_event event; + + if (type == MLX5_EVENT_TYPE_PATH_MIG) + to_mibqp(qp)->port = to_mibqp(qp)->alt_port; + + if (ibqp->event_handler) { + event.device = ibqp->device; + event.element.qp = ibqp; + switch (type) { + case MLX5_EVENT_TYPE_PATH_MIG: + event.event = IB_EVENT_PATH_MIG; + break; + case MLX5_EVENT_TYPE_COMM_EST: + event.event = IB_EVENT_COMM_EST; + break; + case MLX5_EVENT_TYPE_SQ_DRAINED: + event.event = IB_EVENT_SQ_DRAINED; + break; + case MLX5_EVENT_TYPE_SRQ_LAST_WQE: + event.event = IB_EVENT_QP_LAST_WQE_REACHED; + break; + case MLX5_EVENT_TYPE_WQ_CATAS_ERROR: + event.event = IB_EVENT_QP_FATAL; + break; + case MLX5_EVENT_TYPE_PATH_MIG_FAILED: + event.event = IB_EVENT_PATH_MIG_ERR; + break; + case MLX5_EVENT_TYPE_WQ_INVAL_REQ_ERROR: + event.event = IB_EVENT_QP_REQ_ERR; + break; + case MLX5_EVENT_TYPE_WQ_ACCESS_ERROR: + event.event = IB_EVENT_QP_ACCESS_ERR; + break; + default: + pr_warn("mlx5_ib: Unexpected event type %d on QP %06x\n", type, qp->qpn); + return; + } + + ibqp->event_handler(&event, ibqp->qp_context); + } +} + +static int set_rq_size(struct mlx5_ib_dev *dev, struct ib_qp_cap *cap, + int has_rq, struct mlx5_ib_qp *qp, struct mlx5_ib_create_qp *ucmd) +{ + struct mlx5_general_caps *gen; + int wqe_size; + int wq_size; + + gen = &dev->mdev->caps.gen; + /* Sanity check RQ size before proceeding */ + if (cap->max_recv_wr > gen->max_wqes) + return -EINVAL; + + if (!has_rq) { + qp->rq.max_gs = 0; + qp->rq.wqe_cnt = 0; + qp->rq.wqe_shift = 0; + } else { + if (ucmd) { + qp->rq.wqe_cnt = ucmd->rq_wqe_count; + qp->rq.wqe_shift = ucmd->rq_wqe_shift; + qp->rq.max_gs = (1 << qp->rq.wqe_shift) / sizeof(struct mlx5_wqe_data_seg) - qp->wq_sig; + qp->rq.max_post = qp->rq.wqe_cnt; + } else { + wqe_size = qp->wq_sig ? 
sizeof(struct mlx5_wqe_signature_seg) : 0; + wqe_size += cap->max_recv_sge * sizeof(struct mlx5_wqe_data_seg); + wqe_size = roundup_pow_of_two(wqe_size); + wq_size = roundup_pow_of_two(cap->max_recv_wr) * wqe_size; + wq_size = max_t(int, wq_size, MLX5_SEND_WQE_BB); + qp->rq.wqe_cnt = wq_size / wqe_size; + if (wqe_size > gen->max_rq_desc_sz) { + mlx5_ib_dbg(dev, "wqe_size %d, max %d\n", + wqe_size, + gen->max_rq_desc_sz); + return -EINVAL; + } + qp->rq.wqe_shift = ilog2(wqe_size); + qp->rq.max_gs = (1 << qp->rq.wqe_shift) / sizeof(struct mlx5_wqe_data_seg) - qp->wq_sig; + qp->rq.max_post = qp->rq.wqe_cnt; + } + } + + return 0; +} + +static int sq_overhead(enum ib_qp_type qp_type) +{ + int size = 0; + + switch (qp_type) { + case IB_QPT_XRC_INI: + size += sizeof(struct mlx5_wqe_xrc_seg); + /* fall through */ + case IB_QPT_RC: + size += sizeof(struct mlx5_wqe_ctrl_seg) + + sizeof(struct mlx5_wqe_atomic_seg) + + sizeof(struct mlx5_wqe_raddr_seg); + break; + + case IB_QPT_XRC_TGT: + return 0; + + case IB_QPT_UC: + size += sizeof(struct mlx5_wqe_ctrl_seg) + + sizeof(struct mlx5_wqe_raddr_seg) + + sizeof(struct mlx5_wqe_umr_ctrl_seg) + + sizeof(struct mlx5_mkey_seg); + break; + + case IB_QPT_UD: + case IB_QPT_SMI: + case IB_QPT_GSI: + size += sizeof(struct mlx5_wqe_ctrl_seg) + + sizeof(struct mlx5_wqe_datagram_seg); + break; + + case MLX5_IB_QPT_REG_UMR: + size += sizeof(struct mlx5_wqe_ctrl_seg) + + sizeof(struct mlx5_wqe_umr_ctrl_seg) + + sizeof(struct mlx5_mkey_seg); + break; + + default: + return -EINVAL; + } + + return size; +} + +static int calc_send_wqe(struct ib_qp_init_attr *attr) +{ + int inl_size = 0; + int size; + + size = sq_overhead(attr->qp_type); + if (size < 0) + return size; + + if (attr->cap.max_inline_data) { + inl_size = size + sizeof(struct mlx5_wqe_inline_seg) + + attr->cap.max_inline_data; + } + + size += attr->cap.max_send_sge * sizeof(struct mlx5_wqe_data_seg); + if (attr->create_flags & IB_QP_CREATE_SIGNATURE_EN && + ALIGN(max_t(int, inl_size, size), MLX5_SEND_WQE_BB) < MLX5_SIG_WQE_SIZE) + return MLX5_SIG_WQE_SIZE; + else + return ALIGN(max_t(int, inl_size, size), MLX5_SEND_WQE_BB); +} + +static int calc_sq_size(struct mlx5_ib_dev *dev, struct ib_qp_init_attr *attr, + struct mlx5_ib_qp *qp) +{ + struct mlx5_general_caps *gen; + int wqe_size; + int wq_size; + + gen = &dev->mdev->caps.gen; + if (!attr->cap.max_send_wr) + return 0; + + wqe_size = calc_send_wqe(attr); + mlx5_ib_dbg(dev, "wqe_size %d\n", wqe_size); + if (wqe_size < 0) + return wqe_size; + + if (wqe_size > gen->max_sq_desc_sz) { + mlx5_ib_dbg(dev, "wqe_size(%d) > max_sq_desc_sz(%d)\n", + wqe_size, gen->max_sq_desc_sz); + return -EINVAL; + } + + qp->max_inline_data = wqe_size - sq_overhead(attr->qp_type) - + sizeof(struct mlx5_wqe_inline_seg); + attr->cap.max_inline_data = qp->max_inline_data; + + if (attr->create_flags & IB_QP_CREATE_SIGNATURE_EN) + qp->signature_en = true; + + wq_size = roundup_pow_of_two(attr->cap.max_send_wr * wqe_size); + qp->sq.wqe_cnt = wq_size / MLX5_SEND_WQE_BB; + if (qp->sq.wqe_cnt > gen->max_wqes) { + mlx5_ib_dbg(dev, "wqe count(%d) exceeds limits(%d)\n", + qp->sq.wqe_cnt, gen->max_wqes); + return -ENOMEM; + } + qp->sq.wqe_shift = ilog2(MLX5_SEND_WQE_BB); + qp->sq.max_gs = attr->cap.max_send_sge; + qp->sq.max_post = wq_size / wqe_size; + attr->cap.max_send_wr = qp->sq.max_post; + + return wq_size; +} + +static int set_user_buf_size(struct mlx5_ib_dev *dev, + struct mlx5_ib_qp *qp, + struct mlx5_ib_create_qp *ucmd) +{ + struct mlx5_general_caps *gen; + int desc_sz = 1 << 
qp->sq.wqe_shift; + + gen = &dev->mdev->caps.gen; + if (desc_sz > gen->max_sq_desc_sz) { + mlx5_ib_warn(dev, "desc_sz %d, max_sq_desc_sz %d\n", + desc_sz, gen->max_sq_desc_sz); + return -EINVAL; + } + + if (ucmd->sq_wqe_count && ((1 << ilog2(ucmd->sq_wqe_count)) != ucmd->sq_wqe_count)) { + mlx5_ib_warn(dev, "sq_wqe_count %d, sq_wqe_count %d\n", + ucmd->sq_wqe_count, ucmd->sq_wqe_count); + return -EINVAL; + } + + qp->sq.wqe_cnt = ucmd->sq_wqe_count; + + if (qp->sq.wqe_cnt > gen->max_wqes) { + mlx5_ib_warn(dev, "wqe_cnt %d, max_wqes %d\n", + qp->sq.wqe_cnt, gen->max_wqes); + return -EINVAL; + } + + qp->buf_size = (qp->rq.wqe_cnt << qp->rq.wqe_shift) + + (qp->sq.wqe_cnt << 6); + + return 0; +} + +static int qp_has_rq(struct ib_qp_init_attr *attr) +{ + if (attr->qp_type == IB_QPT_XRC_INI || + attr->qp_type == IB_QPT_XRC_TGT || attr->srq || + attr->qp_type == MLX5_IB_QPT_REG_UMR || + !attr->cap.max_recv_wr) + return 0; + + return 1; +} + +static int first_med_uuar(void) +{ + return 1; +} + +static int next_uuar(int n) +{ + n++; + + while (((n % 4) & 2)) + n++; + + return n; +} + +static int num_med_uuar(struct mlx5_uuar_info *uuari) +{ + int n; + + n = uuari->num_uars * MLX5_NON_FP_BF_REGS_PER_PAGE - + uuari->num_low_latency_uuars - 1; + + return n >= 0 ? n : 0; +} + +static int max_uuari(struct mlx5_uuar_info *uuari) +{ + return uuari->num_uars * 4; +} + +static int first_hi_uuar(struct mlx5_uuar_info *uuari) +{ + int med; + int i; + int t; + + med = num_med_uuar(uuari); + for (t = 0, i = first_med_uuar();; i = next_uuar(i)) { + t++; + if (t == med) + return next_uuar(i); + } + + return 0; +} + +static int alloc_high_class_uuar(struct mlx5_uuar_info *uuari) +{ + int i; + + for (i = first_hi_uuar(uuari); i < max_uuari(uuari); i = next_uuar(i)) { + if (!test_bit(i, uuari->bitmap)) { + set_bit(i, uuari->bitmap); + uuari->count[i]++; + return i; + } + } + + return -ENOMEM; +} + +static int alloc_med_class_uuar(struct mlx5_uuar_info *uuari) +{ + int minidx = first_med_uuar(); + int i; + + for (i = first_med_uuar(); i < first_hi_uuar(uuari); i = next_uuar(i)) { + if (uuari->count[i] < uuari->count[minidx]) + minidx = i; + } + + uuari->count[minidx]++; + return minidx; +} + +static int alloc_uuar(struct mlx5_uuar_info *uuari, + enum mlx5_ib_latency_class lat) +{ + int uuarn = -EINVAL; + + mutex_lock(&uuari->lock); + switch (lat) { + case MLX5_IB_LATENCY_CLASS_LOW: + uuarn = 0; + uuari->count[uuarn]++; + break; + + case MLX5_IB_LATENCY_CLASS_MEDIUM: + if (uuari->ver < 2) + uuarn = -ENOMEM; + else + uuarn = alloc_med_class_uuar(uuari); + break; + + case MLX5_IB_LATENCY_CLASS_HIGH: + if (uuari->ver < 2) + uuarn = -ENOMEM; + else + uuarn = alloc_high_class_uuar(uuari); + break; + + case MLX5_IB_LATENCY_CLASS_FAST_PATH: + uuarn = 2; + break; + } + mutex_unlock(&uuari->lock); + + return uuarn; +} + +static void free_med_class_uuar(struct mlx5_uuar_info *uuari, int uuarn) +{ + clear_bit(uuarn, uuari->bitmap); + --uuari->count[uuarn]; +} + +static void free_high_class_uuar(struct mlx5_uuar_info *uuari, int uuarn) +{ + clear_bit(uuarn, uuari->bitmap); + --uuari->count[uuarn]; +} + +static void free_uuar(struct mlx5_uuar_info *uuari, int uuarn) +{ + int nuuars = uuari->num_uars * MLX5_BF_REGS_PER_PAGE; + int high_uuar = nuuars - uuari->num_low_latency_uuars; + + mutex_lock(&uuari->lock); + if (uuarn == 0) { + --uuari->count[uuarn]; + goto out; + } + + if (uuarn < high_uuar) { + free_med_class_uuar(uuari, uuarn); + goto out; + } + + free_high_class_uuar(uuari, uuarn); + +out: + mutex_unlock(&uuari->lock); +} + 
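The latency-class allocator above hard-codes slot 0 for the low-latency class and slot 2 for the fast path, and next_uuar() makes the medium/high classes skip positions 2 and 3 of every four-slot UAR page. The loop below is a stand-alone sketch with made-up names that merely reproduces that index walk to make the reachable slots visible.

#include <stdio.h>

/* Same stepping rule as next_uuar(): advance, then skip any index whose
 * position within its four-slot page is 2 or 3. */
static int next_slot(int n)
{
	n++;
	while ((n % 4) & 2)
		n++;
	return n;
}

int main(void)
{
	int n = 1;			/* first medium-class slot */
	int i;

	for (i = 0; i < 8; i++) {
		printf("%d ", n);
		n = next_slot(n);
	}
	printf("\n");			/* prints: 1 4 5 8 9 12 13 16 */
	return 0;
}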
+static enum mlx5_qp_state to_mlx5_state(enum ib_qp_state state) +{ + switch (state) { + case IB_QPS_RESET: return MLX5_QP_STATE_RST; + case IB_QPS_INIT: return MLX5_QP_STATE_INIT; + case IB_QPS_RTR: return MLX5_QP_STATE_RTR; + case IB_QPS_RTS: return MLX5_QP_STATE_RTS; + case IB_QPS_SQD: return MLX5_QP_STATE_SQD; + case IB_QPS_SQE: return MLX5_QP_STATE_SQER; + case IB_QPS_ERR: return MLX5_QP_STATE_ERR; + default: return -1; + } +} + +static int to_mlx5_st(enum ib_qp_type type) +{ + switch (type) { + case IB_QPT_RC: return MLX5_QP_ST_RC; + case IB_QPT_UC: return MLX5_QP_ST_UC; + case IB_QPT_UD: return MLX5_QP_ST_UD; + case MLX5_IB_QPT_REG_UMR: return MLX5_QP_ST_REG_UMR; + case IB_QPT_XRC_INI: + case IB_QPT_XRC_TGT: return MLX5_QP_ST_XRC; + case IB_QPT_SMI: return MLX5_QP_ST_QP0; + case IB_QPT_GSI: return MLX5_QP_ST_QP1; + case IB_QPT_RAW_IPV6: return MLX5_QP_ST_RAW_IPV6; + case IB_QPT_RAW_ETHERTYPE: return MLX5_QP_ST_RAW_ETHERTYPE; + case IB_QPT_RAW_PACKET: + case IB_QPT_MAX: + default: return -EINVAL; + } +} + +static int uuarn_to_uar_index(struct mlx5_uuar_info *uuari, int uuarn) +{ + return uuari->uars[uuarn / MLX5_BF_REGS_PER_PAGE].index; +} + +static int create_user_qp(struct mlx5_ib_dev *dev, struct ib_pd *pd, + struct mlx5_ib_qp *qp, struct ib_udata *udata, + struct mlx5_create_qp_mbox_in **in, + struct mlx5_ib_create_qp_resp *resp, int *inlen) +{ + struct mlx5_ib_ucontext *context; + struct mlx5_ib_create_qp ucmd; + int page_shift = 0; + int uar_index; + int npages; + u32 offset = 0; + int uuarn; + int ncont = 0; + int err; + + err = ib_copy_from_udata(&ucmd, udata, sizeof(ucmd)); + if (err) { + mlx5_ib_dbg(dev, "copy failed\n"); + return err; + } + + context = to_mucontext(pd->uobject->context); + /* + * TBD: should come from the verbs when we have the API + */ + uuarn = alloc_uuar(&context->uuari, MLX5_IB_LATENCY_CLASS_HIGH); + if (uuarn < 0) { + mlx5_ib_dbg(dev, "failed to allocate low latency UUAR\n"); + mlx5_ib_dbg(dev, "reverting to medium latency\n"); + uuarn = alloc_uuar(&context->uuari, MLX5_IB_LATENCY_CLASS_MEDIUM); + if (uuarn < 0) { + mlx5_ib_dbg(dev, "failed to allocate medium latency UUAR\n"); + mlx5_ib_dbg(dev, "reverting to high latency\n"); + uuarn = alloc_uuar(&context->uuari, MLX5_IB_LATENCY_CLASS_LOW); + if (uuarn < 0) { + mlx5_ib_warn(dev, "uuar allocation failed\n"); + return uuarn; + } + } + } + + uar_index = uuarn_to_uar_index(&context->uuari, uuarn); + mlx5_ib_dbg(dev, "uuarn 0x%x, uar_index 0x%x\n", uuarn, uar_index); + + qp->rq.offset = 0; + qp->sq.wqe_shift = ilog2(MLX5_SEND_WQE_BB); + qp->sq.offset = qp->rq.wqe_cnt << qp->rq.wqe_shift; + + err = set_user_buf_size(dev, qp, &ucmd); + if (err) + goto err_uuar; + + if (ucmd.buf_addr && qp->buf_size) { + qp->umem = ib_umem_get(pd->uobject->context, ucmd.buf_addr, + qp->buf_size, 0, 0); + if (IS_ERR(qp->umem)) { + mlx5_ib_dbg(dev, "umem_get failed\n"); + err = PTR_ERR(qp->umem); + goto err_uuar; + } + } else { + qp->umem = NULL; + } + + if (qp->umem) { + mlx5_ib_cont_pages(qp->umem, ucmd.buf_addr, &npages, &page_shift, + &ncont, NULL); + err = mlx5_ib_get_buf_offset(ucmd.buf_addr, page_shift, &offset); + if (err) { + mlx5_ib_warn(dev, "bad offset\n"); + goto err_umem; + } + mlx5_ib_dbg(dev, "addr 0x%llx, size %d, npages %d, page_shift %d, ncont %d, offset %d\n", + ucmd.buf_addr, qp->buf_size, npages, page_shift, ncont, offset); + } + + *inlen = sizeof(**in) + sizeof(*(*in)->pas) * ncont; + *in = mlx5_vzalloc(*inlen); + if (!*in) { + err = -ENOMEM; + goto err_umem; + } + if (qp->umem) + 
mlx5_ib_populate_pas(dev, qp->umem, page_shift, (*in)->pas, 0); + (*in)->ctx.log_pg_sz_remote_qpn = + cpu_to_be32((page_shift - MLX5_ADAPTER_PAGE_SHIFT) << 24); + (*in)->ctx.params2 = cpu_to_be32(offset << 6); + + (*in)->ctx.qp_counter_set_usr_page = cpu_to_be32(uar_index); + resp->uuar_index = uuarn; + qp->uuarn = uuarn; + + err = mlx5_ib_db_map_user(context, ucmd.db_addr, &qp->db); + if (err) { + mlx5_ib_dbg(dev, "map failed\n"); + goto err_free; + } + + err = ib_copy_to_udata(udata, resp, sizeof(*resp)); + if (err) { + mlx5_ib_dbg(dev, "copy failed\n"); + goto err_unmap; + } + qp->create_type = MLX5_QP_USER; + + return 0; + +err_unmap: + mlx5_ib_db_unmap_user(context, &qp->db); + +err_free: + kvfree(*in); + +err_umem: + if (qp->umem) + ib_umem_release(qp->umem); + +err_uuar: + free_uuar(&context->uuari, uuarn); + return err; +} + +static void destroy_qp_user(struct ib_pd *pd, struct mlx5_ib_qp *qp) +{ + struct mlx5_ib_ucontext *context; + + context = to_mucontext(pd->uobject->context); + mlx5_ib_db_unmap_user(context, &qp->db); + if (qp->umem) + ib_umem_release(qp->umem); + free_uuar(&context->uuari, qp->uuarn); +} + +static int create_kernel_qp(struct mlx5_ib_dev *dev, + struct ib_qp_init_attr *init_attr, + struct mlx5_ib_qp *qp, + struct mlx5_create_qp_mbox_in **in, int *inlen) +{ + enum mlx5_ib_latency_class lc = MLX5_IB_LATENCY_CLASS_LOW; + struct mlx5_uuar_info *uuari; + int uar_index; + int uuarn; + int err; + + uuari = &dev->mdev->priv.uuari; + if (init_attr->create_flags & ~(IB_QP_CREATE_SIGNATURE_EN | IB_QP_CREATE_BLOCK_MULTICAST_LOOPBACK)) + return -EINVAL; + + if (init_attr->qp_type == MLX5_IB_QPT_REG_UMR) + lc = MLX5_IB_LATENCY_CLASS_FAST_PATH; + + uuarn = alloc_uuar(uuari, lc); + if (uuarn < 0) { + mlx5_ib_dbg(dev, "\n"); + return -ENOMEM; + } + + qp->bf = &uuari->bfs[uuarn]; + uar_index = qp->bf->uar->index; + + err = calc_sq_size(dev, init_attr, qp); + if (err < 0) { + mlx5_ib_dbg(dev, "err %d\n", err); + goto err_uuar; + } + + qp->rq.offset = 0; + qp->sq.offset = qp->rq.wqe_cnt << qp->rq.wqe_shift; + qp->buf_size = err + (qp->rq.wqe_cnt << qp->rq.wqe_shift); + + err = mlx5_buf_alloc(dev->mdev, qp->buf_size, PAGE_SIZE * 2, &qp->buf); + if (err) { + mlx5_ib_dbg(dev, "err %d\n", err); + goto err_uuar; + } + + qp->sq.qend = mlx5_get_send_wqe(qp, qp->sq.wqe_cnt); + *inlen = sizeof(**in) + sizeof(*(*in)->pas) * qp->buf.npages; + *in = mlx5_vzalloc(*inlen); + if (!*in) { + err = -ENOMEM; + goto err_buf; + } + (*in)->ctx.qp_counter_set_usr_page = cpu_to_be32(uar_index); + (*in)->ctx.log_pg_sz_remote_qpn = + cpu_to_be32((qp->buf.page_shift - MLX5_ADAPTER_PAGE_SHIFT) << 24); + /* Set "fast registration enabled" for all kernel QPs */ + (*in)->ctx.params1 |= cpu_to_be32(1 << 11); + (*in)->ctx.sq_crq_size |= cpu_to_be16(1 << 4); + + mlx5_fill_page_array(&qp->buf, (*in)->pas); + + err = mlx5_db_alloc(dev->mdev, &qp->db); + if (err) { + mlx5_ib_dbg(dev, "err %d\n", err); + goto err_free; + } + + qp->sq.wrid = kmalloc(qp->sq.wqe_cnt * sizeof(*qp->sq.wrid), GFP_KERNEL); + qp->sq.wr_data = kmalloc(qp->sq.wqe_cnt * sizeof(*qp->sq.wr_data), GFP_KERNEL); + qp->rq.wrid = kmalloc(qp->rq.wqe_cnt * sizeof(*qp->rq.wrid), GFP_KERNEL); + qp->sq.w_list = kmalloc(qp->sq.wqe_cnt * sizeof(*qp->sq.w_list), GFP_KERNEL); + qp->sq.wqe_head = kmalloc(qp->sq.wqe_cnt * sizeof(*qp->sq.wqe_head), GFP_KERNEL); + + if (!qp->sq.wrid || !qp->sq.wr_data || !qp->rq.wrid || + !qp->sq.w_list || !qp->sq.wqe_head) { + err = -ENOMEM; + goto err_wrid; + } + qp->create_type = MLX5_QP_KERNEL; + + return 0; + +err_wrid: + 
mlx5_db_free(dev->mdev, &qp->db); + kfree(qp->sq.wqe_head); + kfree(qp->sq.w_list); + kfree(qp->sq.wrid); + kfree(qp->sq.wr_data); + kfree(qp->rq.wrid); + +err_free: + kvfree(*in); + +err_buf: + mlx5_buf_free(dev->mdev, &qp->buf); + +err_uuar: + free_uuar(&dev->mdev->priv.uuari, uuarn); + return err; +} + +static void destroy_qp_kernel(struct mlx5_ib_dev *dev, struct mlx5_ib_qp *qp) +{ + mlx5_db_free(dev->mdev, &qp->db); + kfree(qp->sq.wqe_head); + kfree(qp->sq.w_list); + kfree(qp->sq.wrid); + kfree(qp->sq.wr_data); + kfree(qp->rq.wrid); + mlx5_buf_free(dev->mdev, &qp->buf); + free_uuar(&dev->mdev->priv.uuari, qp->bf->uuarn); +} + +static __be32 get_rx_type(struct mlx5_ib_qp *qp, struct ib_qp_init_attr *attr) +{ + if (attr->srq || (attr->qp_type == IB_QPT_XRC_TGT) || + (attr->qp_type == IB_QPT_XRC_INI)) + return cpu_to_be32(MLX5_SRQ_RQ); + else if (!qp->has_rq) + return cpu_to_be32(MLX5_ZERO_LEN_RQ); + else + return cpu_to_be32(MLX5_NON_ZERO_RQ); +} + +static int is_connected(enum ib_qp_type qp_type) +{ + if (qp_type == IB_QPT_RC || qp_type == IB_QPT_UC) + return 1; + + return 0; +} + +static int create_qp_common(struct mlx5_ib_dev *dev, struct ib_pd *pd, + struct ib_qp_init_attr *init_attr, + struct ib_udata *udata, struct mlx5_ib_qp *qp) +{ + struct mlx5_ib_resources *devr = &dev->devr; + struct mlx5_ib_create_qp_resp resp; + struct mlx5_create_qp_mbox_in *in; + struct mlx5_general_caps *gen; + struct mlx5_ib_create_qp ucmd; + int inlen = sizeof(*in); + int err; + + mlx5_ib_odp_create_qp(qp); + + gen = &dev->mdev->caps.gen; + mutex_init(&qp->mutex); + spin_lock_init(&qp->sq.lock); + spin_lock_init(&qp->rq.lock); + + if (init_attr->create_flags & IB_QP_CREATE_BLOCK_MULTICAST_LOOPBACK) { + if (!(gen->flags & MLX5_DEV_CAP_FLAG_BLOCK_MCAST)) { + mlx5_ib_dbg(dev, "block multicast loopback isn't supported\n"); + return -EINVAL; + } else { + qp->flags |= MLX5_IB_QP_BLOCK_MULTICAST_LOOPBACK; + } + } + + if (init_attr->sq_sig_type == IB_SIGNAL_ALL_WR) + qp->sq_signal_bits = MLX5_WQE_CTRL_CQ_UPDATE; + + if (pd && pd->uobject) { + if (ib_copy_from_udata(&ucmd, udata, sizeof(ucmd))) { + mlx5_ib_dbg(dev, "copy failed\n"); + return -EFAULT; + } + + qp->wq_sig = !!(ucmd.flags & MLX5_QP_FLAG_SIGNATURE); + qp->scat_cqe = !!(ucmd.flags & MLX5_QP_FLAG_SCATTER_CQE); + } else { + qp->wq_sig = !!wq_signature; + } + + qp->has_rq = qp_has_rq(init_attr); + err = set_rq_size(dev, &init_attr->cap, qp->has_rq, + qp, (pd && pd->uobject) ? 
&ucmd : NULL); + if (err) { + mlx5_ib_dbg(dev, "err %d\n", err); + return err; + } + + if (pd) { + if (pd->uobject) { + mlx5_ib_dbg(dev, "requested sq_wqe_count (%d)\n", ucmd.sq_wqe_count); + if (ucmd.rq_wqe_shift != qp->rq.wqe_shift || + ucmd.rq_wqe_count != qp->rq.wqe_cnt) { + mlx5_ib_dbg(dev, "invalid rq params\n"); + return -EINVAL; + } + if (ucmd.sq_wqe_count > gen->max_wqes) { + mlx5_ib_dbg(dev, "requested sq_wqe_count (%d) > max allowed (%d)\n", + ucmd.sq_wqe_count, gen->max_wqes); + return -EINVAL; + } + err = create_user_qp(dev, pd, qp, udata, &in, &resp, &inlen); + if (err) + mlx5_ib_dbg(dev, "err %d\n", err); + } else { + err = create_kernel_qp(dev, init_attr, qp, &in, &inlen); + if (err) + mlx5_ib_dbg(dev, "err %d\n", err); + else + qp->pa_lkey = to_mpd(pd)->pa_lkey; + } + + if (err) + return err; + } else { + in = mlx5_vzalloc(sizeof(*in)); + if (!in) + return -ENOMEM; + + qp->create_type = MLX5_QP_EMPTY; + } + + if (is_sqp(init_attr->qp_type)) + qp->port = init_attr->port_num; + + in->ctx.flags = cpu_to_be32(to_mlx5_st(init_attr->qp_type) << 16 | + MLX5_QP_PM_MIGRATED << 11); + + if (init_attr->qp_type != MLX5_IB_QPT_REG_UMR) + in->ctx.flags_pd = cpu_to_be32(to_mpd(pd ? pd : devr->p0)->pdn); + else + in->ctx.flags_pd = cpu_to_be32(MLX5_QP_LAT_SENSITIVE); + + if (qp->wq_sig) + in->ctx.flags_pd |= cpu_to_be32(MLX5_QP_ENABLE_SIG); + + if (qp->flags & MLX5_IB_QP_BLOCK_MULTICAST_LOOPBACK) + in->ctx.flags_pd |= cpu_to_be32(MLX5_QP_BLOCK_MCAST); + + if (qp->scat_cqe && is_connected(init_attr->qp_type)) { + int rcqe_sz; + int scqe_sz; + + rcqe_sz = mlx5_ib_get_cqe_size(dev, init_attr->recv_cq); + scqe_sz = mlx5_ib_get_cqe_size(dev, init_attr->send_cq); + + if (rcqe_sz == 128) + in->ctx.cs_res = MLX5_RES_SCAT_DATA64_CQE; + else + in->ctx.cs_res = MLX5_RES_SCAT_DATA32_CQE; + + if (init_attr->sq_sig_type == IB_SIGNAL_ALL_WR) { + if (scqe_sz == 128) + in->ctx.cs_req = MLX5_REQ_SCAT_DATA64_CQE; + else + in->ctx.cs_req = MLX5_REQ_SCAT_DATA32_CQE; + } + } + + if (qp->rq.wqe_cnt) { + in->ctx.rq_size_stride = (qp->rq.wqe_shift - 4); + in->ctx.rq_size_stride |= ilog2(qp->rq.wqe_cnt) << 3; + } + + in->ctx.rq_type_srqn = get_rx_type(qp, init_attr); + + if (qp->sq.wqe_cnt) + in->ctx.sq_crq_size |= cpu_to_be16(ilog2(qp->sq.wqe_cnt) << 11); + else + in->ctx.sq_crq_size |= cpu_to_be16(0x8000); + + /* Set default resources */ + switch (init_attr->qp_type) { + case IB_QPT_XRC_TGT: + in->ctx.cqn_recv = cpu_to_be32(to_mcq(devr->c0)->mcq.cqn); + in->ctx.cqn_send = cpu_to_be32(to_mcq(devr->c0)->mcq.cqn); + in->ctx.rq_type_srqn |= cpu_to_be32(to_msrq(devr->s0)->msrq.srqn); + in->ctx.xrcd = cpu_to_be32(to_mxrcd(init_attr->xrcd)->xrcdn); + break; + case IB_QPT_XRC_INI: + in->ctx.cqn_recv = cpu_to_be32(to_mcq(devr->c0)->mcq.cqn); + in->ctx.xrcd = cpu_to_be32(to_mxrcd(devr->x1)->xrcdn); + in->ctx.rq_type_srqn |= cpu_to_be32(to_msrq(devr->s0)->msrq.srqn); + break; + default: + if (init_attr->srq) { + in->ctx.xrcd = cpu_to_be32(to_mxrcd(devr->x0)->xrcdn); + in->ctx.rq_type_srqn |= cpu_to_be32(to_msrq(init_attr->srq)->msrq.srqn); + } else { + in->ctx.xrcd = cpu_to_be32(to_mxrcd(devr->x1)->xrcdn); + in->ctx.rq_type_srqn |= cpu_to_be32(to_msrq(devr->s0)->msrq.srqn); + } + } + + if (init_attr->send_cq) + in->ctx.cqn_send = cpu_to_be32(to_mcq(init_attr->send_cq)->mcq.cqn); + + if (init_attr->recv_cq) + in->ctx.cqn_recv = cpu_to_be32(to_mcq(init_attr->recv_cq)->mcq.cqn); + + in->ctx.db_rec_addr = cpu_to_be64(qp->db.dma); + + err = mlx5_core_create_qp(dev->mdev, &qp->mqp, in, inlen); + if (err) { + mlx5_ib_dbg(dev, 
"create qp failed\n"); + goto err_create; + } + + kvfree(in); + /* Hardware wants QPN written in big-endian order (after + * shifting) for send doorbell. Precompute this value to save + * a little bit when posting sends. + */ + qp->doorbell_qpn = swab32(qp->mqp.qpn << 8); + + qp->mqp.event = mlx5_ib_qp_event; + + return 0; + +err_create: + if (qp->create_type == MLX5_QP_USER) + destroy_qp_user(pd, qp); + else if (qp->create_type == MLX5_QP_KERNEL) + destroy_qp_kernel(dev, qp); + + kvfree(in); + return err; +} + +static void mlx5_ib_lock_cqs(struct mlx5_ib_cq *send_cq, struct mlx5_ib_cq *recv_cq) + __acquires(&send_cq->lock) __acquires(&recv_cq->lock) +{ + if (send_cq) { + if (recv_cq) { + if (send_cq->mcq.cqn < recv_cq->mcq.cqn) { + spin_lock_irq(&send_cq->lock); + spin_lock_nested(&recv_cq->lock, + SINGLE_DEPTH_NESTING); + } else if (send_cq->mcq.cqn == recv_cq->mcq.cqn) { + spin_lock_irq(&send_cq->lock); + __acquire(&recv_cq->lock); + } else { + spin_lock_irq(&recv_cq->lock); + spin_lock_nested(&send_cq->lock, + SINGLE_DEPTH_NESTING); + } + } else { + spin_lock_irq(&send_cq->lock); + __acquire(&recv_cq->lock); + } + } else if (recv_cq) { + spin_lock_irq(&recv_cq->lock); + __acquire(&send_cq->lock); + } else { + __acquire(&send_cq->lock); + __acquire(&recv_cq->lock); + } +} + +static void mlx5_ib_unlock_cqs(struct mlx5_ib_cq *send_cq, struct mlx5_ib_cq *recv_cq) + __releases(&send_cq->lock) __releases(&recv_cq->lock) +{ + if (send_cq) { + if (recv_cq) { + if (send_cq->mcq.cqn < recv_cq->mcq.cqn) { + spin_unlock(&recv_cq->lock); + spin_unlock_irq(&send_cq->lock); + } else if (send_cq->mcq.cqn == recv_cq->mcq.cqn) { + __release(&recv_cq->lock); + spin_unlock_irq(&send_cq->lock); + } else { + spin_unlock(&send_cq->lock); + spin_unlock_irq(&recv_cq->lock); + } + } else { + __release(&recv_cq->lock); + spin_unlock_irq(&send_cq->lock); + } + } else if (recv_cq) { + __release(&send_cq->lock); + spin_unlock_irq(&recv_cq->lock); + } else { + __release(&recv_cq->lock); + __release(&send_cq->lock); + } +} + +static struct mlx5_ib_pd *get_pd(struct mlx5_ib_qp *qp) +{ + return to_mpd(qp->ibqp.pd); +} + +static void get_cqs(struct mlx5_ib_qp *qp, + struct mlx5_ib_cq **send_cq, struct mlx5_ib_cq **recv_cq) +{ + switch (qp->ibqp.qp_type) { + case IB_QPT_XRC_TGT: + *send_cq = NULL; + *recv_cq = NULL; + break; + case MLX5_IB_QPT_REG_UMR: + case IB_QPT_XRC_INI: + *send_cq = to_mcq(qp->ibqp.send_cq); + *recv_cq = NULL; + break; + + case IB_QPT_SMI: + case IB_QPT_GSI: + case IB_QPT_RC: + case IB_QPT_UC: + case IB_QPT_UD: + case IB_QPT_RAW_IPV6: + case IB_QPT_RAW_ETHERTYPE: + *send_cq = to_mcq(qp->ibqp.send_cq); + *recv_cq = to_mcq(qp->ibqp.recv_cq); + break; + + case IB_QPT_RAW_PACKET: + case IB_QPT_MAX: + default: + *send_cq = NULL; + *recv_cq = NULL; + break; + } +} + +static void destroy_qp_common(struct mlx5_ib_dev *dev, struct mlx5_ib_qp *qp) +{ + struct mlx5_ib_cq *send_cq, *recv_cq; + struct mlx5_modify_qp_mbox_in *in; + int err; + + in = kzalloc(sizeof(*in), GFP_KERNEL); + if (!in) + return; + + if (qp->state != IB_QPS_RESET) { + mlx5_ib_qp_disable_pagefaults(qp); + if (mlx5_core_qp_modify(dev->mdev, to_mlx5_state(qp->state), + MLX5_QP_STATE_RST, in, 0, &qp->mqp)) + mlx5_ib_warn(dev, "mlx5_ib: modify QP %06x to RESET failed\n", + qp->mqp.qpn); + } + + get_cqs(qp, &send_cq, &recv_cq); + + if (qp->create_type == MLX5_QP_KERNEL) { + mlx5_ib_lock_cqs(send_cq, recv_cq); + __mlx5_ib_cq_clean(recv_cq, qp->mqp.qpn, + qp->ibqp.srq ? 
to_msrq(qp->ibqp.srq) : NULL); + if (send_cq != recv_cq) + __mlx5_ib_cq_clean(send_cq, qp->mqp.qpn, NULL); + mlx5_ib_unlock_cqs(send_cq, recv_cq); + } + + err = mlx5_core_destroy_qp(dev->mdev, &qp->mqp); + if (err) + mlx5_ib_warn(dev, "failed to destroy QP 0x%x\n", qp->mqp.qpn); + kfree(in); + + + if (qp->create_type == MLX5_QP_KERNEL) + destroy_qp_kernel(dev, qp); + else if (qp->create_type == MLX5_QP_USER) + destroy_qp_user(&get_pd(qp)->ibpd, qp); +} + +static const char *ib_qp_type_str(enum ib_qp_type type) +{ + switch (type) { + case IB_QPT_SMI: + return "IB_QPT_SMI"; + case IB_QPT_GSI: + return "IB_QPT_GSI"; + case IB_QPT_RC: + return "IB_QPT_RC"; + case IB_QPT_UC: + return "IB_QPT_UC"; + case IB_QPT_UD: + return "IB_QPT_UD"; + case IB_QPT_RAW_IPV6: + return "IB_QPT_RAW_IPV6"; + case IB_QPT_RAW_ETHERTYPE: + return "IB_QPT_RAW_ETHERTYPE"; + case IB_QPT_XRC_INI: + return "IB_QPT_XRC_INI"; + case IB_QPT_XRC_TGT: + return "IB_QPT_XRC_TGT"; + case IB_QPT_RAW_PACKET: + return "IB_QPT_RAW_PACKET"; + case MLX5_IB_QPT_REG_UMR: + return "MLX5_IB_QPT_REG_UMR"; + case IB_QPT_MAX: + default: + return "Invalid QP type"; + } +} + +struct ib_qp *mlx5_ib_create_qp(struct ib_pd *pd, + struct ib_qp_init_attr *init_attr, + struct ib_udata *udata) +{ + struct mlx5_general_caps *gen; + struct mlx5_ib_dev *dev; + struct mlx5_ib_qp *qp; + u16 xrcdn = 0; + int err; + + if (pd) { + dev = to_mdev(pd->device); + } else { + /* being cautious here */ + if (init_attr->qp_type != IB_QPT_XRC_TGT && + init_attr->qp_type != MLX5_IB_QPT_REG_UMR) { + pr_warn("%s: no PD for transport %s\n", __func__, + ib_qp_type_str(init_attr->qp_type)); + return ERR_PTR(-EINVAL); + } + dev = to_mdev(to_mxrcd(init_attr->xrcd)->ibxrcd.device); + } + gen = &dev->mdev->caps.gen; + + switch (init_attr->qp_type) { + case IB_QPT_XRC_TGT: + case IB_QPT_XRC_INI: + if (!(gen->flags & MLX5_DEV_CAP_FLAG_XRC)) { + mlx5_ib_dbg(dev, "XRC not supported\n"); + return ERR_PTR(-ENOSYS); + } + init_attr->recv_cq = NULL; + if (init_attr->qp_type == IB_QPT_XRC_TGT) { + xrcdn = to_mxrcd(init_attr->xrcd)->xrcdn; + init_attr->send_cq = NULL; + } + + /* fall through */ + case IB_QPT_RC: + case IB_QPT_UC: + case IB_QPT_UD: + case IB_QPT_SMI: + case IB_QPT_GSI: + case MLX5_IB_QPT_REG_UMR: + qp = kzalloc(sizeof(*qp), GFP_KERNEL); + if (!qp) + return ERR_PTR(-ENOMEM); + + err = create_qp_common(dev, pd, init_attr, udata, qp); + if (err) { + mlx5_ib_dbg(dev, "create_qp_common failed\n"); + kfree(qp); + return ERR_PTR(err); + } + + if (is_qp0(init_attr->qp_type)) + qp->ibqp.qp_num = 0; + else if (is_qp1(init_attr->qp_type)) + qp->ibqp.qp_num = 1; + else + qp->ibqp.qp_num = qp->mqp.qpn; + + mlx5_ib_dbg(dev, "ib qpnum 0x%x, mlx qpn 0x%x, rcqn 0x%x, scqn 0x%x\n", + qp->ibqp.qp_num, qp->mqp.qpn, to_mcq(init_attr->recv_cq)->mcq.cqn, + to_mcq(init_attr->send_cq)->mcq.cqn); + + qp->xrcdn = xrcdn; + + break; + + case IB_QPT_RAW_IPV6: + case IB_QPT_RAW_ETHERTYPE: + case IB_QPT_RAW_PACKET: + case IB_QPT_MAX: + default: + mlx5_ib_dbg(dev, "unsupported qp type %d\n", + init_attr->qp_type); + /* Don't support raw QPs */ + return ERR_PTR(-EINVAL); + } + + return &qp->ibqp; +} + +int mlx5_ib_destroy_qp(struct ib_qp *qp) +{ + struct mlx5_ib_dev *dev = to_mdev(qp->device); + struct mlx5_ib_qp *mqp = to_mqp(qp); + + destroy_qp_common(dev, mqp); + + kfree(mqp); + + return 0; +} + +static __be32 to_mlx5_access_flags(struct mlx5_ib_qp *qp, const struct ib_qp_attr *attr, + int attr_mask) +{ + u32 hw_access_flags = 0; + u8 dest_rd_atomic; + u32 access_flags; + + if (attr_mask & 
IB_QP_MAX_DEST_RD_ATOMIC) + dest_rd_atomic = attr->max_dest_rd_atomic; + else + dest_rd_atomic = qp->resp_depth; + + if (attr_mask & IB_QP_ACCESS_FLAGS) + access_flags = attr->qp_access_flags; + else + access_flags = qp->atomic_rd_en; + + if (!dest_rd_atomic) + access_flags &= IB_ACCESS_REMOTE_WRITE; + + if (access_flags & IB_ACCESS_REMOTE_READ) + hw_access_flags |= MLX5_QP_BIT_RRE; + if (access_flags & IB_ACCESS_REMOTE_ATOMIC) + hw_access_flags |= (MLX5_QP_BIT_RAE | MLX5_ATOMIC_MODE_CX); + if (access_flags & IB_ACCESS_REMOTE_WRITE) + hw_access_flags |= MLX5_QP_BIT_RWE; + + return cpu_to_be32(hw_access_flags); +} + +enum { + MLX5_PATH_FLAG_FL = 1 << 0, + MLX5_PATH_FLAG_FREE_AR = 1 << 1, + MLX5_PATH_FLAG_COUNTER = 1 << 2, +}; + +static int ib_rate_to_mlx5(struct mlx5_ib_dev *dev, u8 rate) +{ + struct mlx5_general_caps *gen; + + gen = &dev->mdev->caps.gen; + if (rate == IB_RATE_PORT_CURRENT) { + return 0; + } else if (rate < IB_RATE_2_5_GBPS || rate > IB_RATE_300_GBPS) { + return -EINVAL; + } else { + while (rate != IB_RATE_2_5_GBPS && + !(1 << (rate + MLX5_STAT_RATE_OFFSET) & + gen->stat_rate_support)) + --rate; + } + + return rate + MLX5_STAT_RATE_OFFSET; +} + +static int mlx5_set_path(struct mlx5_ib_dev *dev, const struct ib_ah_attr *ah, + struct mlx5_qp_path *path, u8 port, int attr_mask, + u32 path_flags, const struct ib_qp_attr *attr) +{ + struct mlx5_general_caps *gen; + int err; + + gen = &dev->mdev->caps.gen; + path->fl = (path_flags & MLX5_PATH_FLAG_FL) ? 0x80 : 0; + path->free_ar = (path_flags & MLX5_PATH_FLAG_FREE_AR) ? 0x80 : 0; + + if (attr_mask & IB_QP_PKEY_INDEX) + path->pkey_index = attr->pkey_index; + + path->grh_mlid = ah->src_path_bits & 0x7f; + path->rlid = cpu_to_be16(ah->dlid); + + if (ah->ah_flags & IB_AH_GRH) { + if (ah->grh.sgid_index >= gen->port[port - 1].gid_table_len) { + pr_err("sgid_index (%u) too large. 
max is %d\n", + ah->grh.sgid_index, gen->port[port - 1].gid_table_len); + return -EINVAL; + } + path->grh_mlid |= 1 << 7; + path->mgid_index = ah->grh.sgid_index; + path->hop_limit = ah->grh.hop_limit; + path->tclass_flowlabel = + cpu_to_be32((ah->grh.traffic_class << 20) | + (ah->grh.flow_label)); + memcpy(path->rgid, ah->grh.dgid.raw, 16); + } + + err = ib_rate_to_mlx5(dev, ah->static_rate); + if (err < 0) + return err; + path->static_rate = err; + path->port = port; + + if (attr_mask & IB_QP_TIMEOUT) + path->ackto_lt = attr->timeout << 3; + + path->sl = ah->sl & 0xf; + + return 0; +} + +static enum mlx5_qp_optpar opt_mask[MLX5_QP_NUM_STATE][MLX5_QP_NUM_STATE][MLX5_QP_ST_MAX] = { + [MLX5_QP_STATE_INIT] = { + [MLX5_QP_STATE_INIT] = { + [MLX5_QP_ST_RC] = MLX5_QP_OPTPAR_RRE | + MLX5_QP_OPTPAR_RAE | + MLX5_QP_OPTPAR_RWE | + MLX5_QP_OPTPAR_PKEY_INDEX | + MLX5_QP_OPTPAR_PRI_PORT, + [MLX5_QP_ST_UC] = MLX5_QP_OPTPAR_RWE | + MLX5_QP_OPTPAR_PKEY_INDEX | + MLX5_QP_OPTPAR_PRI_PORT, + [MLX5_QP_ST_UD] = MLX5_QP_OPTPAR_PKEY_INDEX | + MLX5_QP_OPTPAR_Q_KEY | + MLX5_QP_OPTPAR_PRI_PORT, + }, + [MLX5_QP_STATE_RTR] = { + [MLX5_QP_ST_RC] = MLX5_QP_OPTPAR_ALT_ADDR_PATH | + MLX5_QP_OPTPAR_RRE | + MLX5_QP_OPTPAR_RAE | + MLX5_QP_OPTPAR_RWE | + MLX5_QP_OPTPAR_PKEY_INDEX, + [MLX5_QP_ST_UC] = MLX5_QP_OPTPAR_ALT_ADDR_PATH | + MLX5_QP_OPTPAR_RWE | + MLX5_QP_OPTPAR_PKEY_INDEX, + [MLX5_QP_ST_UD] = MLX5_QP_OPTPAR_PKEY_INDEX | + MLX5_QP_OPTPAR_Q_KEY, + [MLX5_QP_ST_MLX] = MLX5_QP_OPTPAR_PKEY_INDEX | + MLX5_QP_OPTPAR_Q_KEY, + [MLX5_QP_ST_XRC] = MLX5_QP_OPTPAR_ALT_ADDR_PATH | + MLX5_QP_OPTPAR_RRE | + MLX5_QP_OPTPAR_RAE | + MLX5_QP_OPTPAR_RWE | + MLX5_QP_OPTPAR_PKEY_INDEX, + }, + }, + [MLX5_QP_STATE_RTR] = { + [MLX5_QP_STATE_RTS] = { + [MLX5_QP_ST_RC] = MLX5_QP_OPTPAR_ALT_ADDR_PATH | + MLX5_QP_OPTPAR_RRE | + MLX5_QP_OPTPAR_RAE | + MLX5_QP_OPTPAR_RWE | + MLX5_QP_OPTPAR_PM_STATE | + MLX5_QP_OPTPAR_RNR_TIMEOUT, + [MLX5_QP_ST_UC] = MLX5_QP_OPTPAR_ALT_ADDR_PATH | + MLX5_QP_OPTPAR_RWE | + MLX5_QP_OPTPAR_PM_STATE, + [MLX5_QP_ST_UD] = MLX5_QP_OPTPAR_Q_KEY, + }, + }, + [MLX5_QP_STATE_RTS] = { + [MLX5_QP_STATE_RTS] = { + [MLX5_QP_ST_RC] = MLX5_QP_OPTPAR_RRE | + MLX5_QP_OPTPAR_RAE | + MLX5_QP_OPTPAR_RWE | + MLX5_QP_OPTPAR_RNR_TIMEOUT | + MLX5_QP_OPTPAR_PM_STATE | + MLX5_QP_OPTPAR_ALT_ADDR_PATH, + [MLX5_QP_ST_UC] = MLX5_QP_OPTPAR_RWE | + MLX5_QP_OPTPAR_PM_STATE | + MLX5_QP_OPTPAR_ALT_ADDR_PATH, + [MLX5_QP_ST_UD] = MLX5_QP_OPTPAR_Q_KEY | + MLX5_QP_OPTPAR_SRQN | + MLX5_QP_OPTPAR_CQN_RCV, + }, + }, + [MLX5_QP_STATE_SQER] = { + [MLX5_QP_STATE_RTS] = { + [MLX5_QP_ST_UD] = MLX5_QP_OPTPAR_Q_KEY, + [MLX5_QP_ST_MLX] = MLX5_QP_OPTPAR_Q_KEY, + [MLX5_QP_ST_UC] = MLX5_QP_OPTPAR_RWE, + [MLX5_QP_ST_RC] = MLX5_QP_OPTPAR_RNR_TIMEOUT | + MLX5_QP_OPTPAR_RWE | + MLX5_QP_OPTPAR_RAE | + MLX5_QP_OPTPAR_RRE, + }, + }, +}; + +static int ib_nr_to_mlx5_nr(int ib_mask) +{ + switch (ib_mask) { + case IB_QP_STATE: + return 0; + case IB_QP_CUR_STATE: + return 0; + case IB_QP_EN_SQD_ASYNC_NOTIFY: + return 0; + case IB_QP_ACCESS_FLAGS: + return MLX5_QP_OPTPAR_RWE | MLX5_QP_OPTPAR_RRE | + MLX5_QP_OPTPAR_RAE; + case IB_QP_PKEY_INDEX: + return MLX5_QP_OPTPAR_PKEY_INDEX; + case IB_QP_PORT: + return MLX5_QP_OPTPAR_PRI_PORT; + case IB_QP_QKEY: + return MLX5_QP_OPTPAR_Q_KEY; + case IB_QP_AV: + return MLX5_QP_OPTPAR_PRIMARY_ADDR_PATH | + MLX5_QP_OPTPAR_PRI_PORT; + case IB_QP_PATH_MTU: + return 0; + case IB_QP_TIMEOUT: + return MLX5_QP_OPTPAR_ACK_TIMEOUT; + case IB_QP_RETRY_CNT: + return MLX5_QP_OPTPAR_RETRY_COUNT; + case IB_QP_RNR_RETRY: + return MLX5_QP_OPTPAR_RNR_RETRY; + 
case IB_QP_RQ_PSN: + return 0; + case IB_QP_MAX_QP_RD_ATOMIC: + return MLX5_QP_OPTPAR_SRA_MAX; + case IB_QP_ALT_PATH: + return MLX5_QP_OPTPAR_ALT_ADDR_PATH; + case IB_QP_MIN_RNR_TIMER: + return MLX5_QP_OPTPAR_RNR_TIMEOUT; + case IB_QP_SQ_PSN: + return 0; + case IB_QP_MAX_DEST_RD_ATOMIC: + return MLX5_QP_OPTPAR_RRA_MAX | MLX5_QP_OPTPAR_RWE | + MLX5_QP_OPTPAR_RRE | MLX5_QP_OPTPAR_RAE; + case IB_QP_PATH_MIG_STATE: + return MLX5_QP_OPTPAR_PM_STATE; + case IB_QP_CAP: + return 0; + case IB_QP_DEST_QPN: + return 0; + } + return 0; +} + +static int ib_mask_to_mlx5_opt(int ib_mask) +{ + int result = 0; + int i; + + for (i = 0; i < 8 * sizeof(int); i++) { + if ((1 << i) & ib_mask) + result |= ib_nr_to_mlx5_nr(1 << i); + } + + return result; +} + +static int __mlx5_ib_modify_qp(struct ib_qp *ibqp, + const struct ib_qp_attr *attr, int attr_mask, + enum ib_qp_state cur_state, enum ib_qp_state new_state) +{ + struct mlx5_ib_dev *dev = to_mdev(ibqp->device); + struct mlx5_ib_qp *qp = to_mqp(ibqp); + struct mlx5_ib_cq *send_cq, *recv_cq; + struct mlx5_qp_context *context; + struct mlx5_general_caps *gen; + struct mlx5_modify_qp_mbox_in *in; + struct mlx5_ib_pd *pd; + enum mlx5_qp_state mlx5_cur, mlx5_new; + enum mlx5_qp_optpar optpar; + int sqd_event; + int mlx5_st; + int err; + + gen = &dev->mdev->caps.gen; + in = kzalloc(sizeof(*in), GFP_KERNEL); + if (!in) + return -ENOMEM; + + context = &in->ctx; + err = to_mlx5_st(ibqp->qp_type); + if (err < 0) + goto out; + + context->flags = cpu_to_be32(err << 16); + + if (!(attr_mask & IB_QP_PATH_MIG_STATE)) { + context->flags |= cpu_to_be32(MLX5_QP_PM_MIGRATED << 11); + } else { + switch (attr->path_mig_state) { + case IB_MIG_MIGRATED: + context->flags |= cpu_to_be32(MLX5_QP_PM_MIGRATED << 11); + break; + case IB_MIG_REARM: + context->flags |= cpu_to_be32(MLX5_QP_PM_REARM << 11); + break; + case IB_MIG_ARMED: + context->flags |= cpu_to_be32(MLX5_QP_PM_ARMED << 11); + break; + } + } + + if (ibqp->qp_type == IB_QPT_GSI || ibqp->qp_type == IB_QPT_SMI) { + context->mtu_msgmax = (IB_MTU_256 << 5) | 8; + } else if (ibqp->qp_type == IB_QPT_UD || + ibqp->qp_type == MLX5_IB_QPT_REG_UMR) { + context->mtu_msgmax = (IB_MTU_4096 << 5) | 12; + } else if (attr_mask & IB_QP_PATH_MTU) { + if (attr->path_mtu < IB_MTU_256 || + attr->path_mtu > IB_MTU_4096) { + mlx5_ib_warn(dev, "invalid mtu %d\n", attr->path_mtu); + err = -EINVAL; + goto out; + } + context->mtu_msgmax = (attr->path_mtu << 5) | gen->log_max_msg; + } + + if (attr_mask & IB_QP_DEST_QPN) + context->log_pg_sz_remote_qpn = cpu_to_be32(attr->dest_qp_num); + + if (attr_mask & IB_QP_PKEY_INDEX) + context->pri_path.pkey_index = attr->pkey_index; + + /* todo implement counter_index functionality */ + + if (is_sqp(ibqp->qp_type)) + context->pri_path.port = qp->port; + + if (attr_mask & IB_QP_PORT) + context->pri_path.port = attr->port_num; + + if (attr_mask & IB_QP_AV) { + err = mlx5_set_path(dev, &attr->ah_attr, &context->pri_path, + attr_mask & IB_QP_PORT ? attr->port_num : qp->port, + attr_mask, 0, attr); + if (err) + goto out; + } + + if (attr_mask & IB_QP_TIMEOUT) + context->pri_path.ackto_lt |= attr->timeout << 3; + + if (attr_mask & IB_QP_ALT_PATH) { + err = mlx5_set_path(dev, &attr->alt_ah_attr, &context->alt_path, + attr->alt_port_num, attr_mask, 0, attr); + if (err) + goto out; + } + + pd = get_pd(qp); + get_cqs(qp, &send_cq, &recv_cq); + + context->flags_pd = cpu_to_be32(pd ? pd->pdn : to_mpd(dev->devr.p0)->pdn); + context->cqn_send = send_cq ? cpu_to_be32(send_cq->mcq.cqn) : 0; + context->cqn_recv = recv_cq ? 
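ib_mask_to_mlx5_opt() above folds an IB attr_mask into hardware optional-parameter bits by testing every bit and OR-ing its translation from ib_nr_to_mlx5_nr(); the caller then ANDs the result with the opt_mask[][][] transition table so only parameters legal for that state change survive. The sketch below reproduces just the folding loop, with invented EX_QP_* bits and translation values, to make the shape of the pattern explicit.

/* Minimal sketch of the "fold each set attr_mask bit through a translation"
 * pattern used by ib_mask_to_mlx5_opt(); bits and values are invented. */
#include <stdio.h>

#define EX_QP_ACCESS_FLAGS (1u << 3)
#define EX_QP_PKEY_INDEX   (1u << 4)

static unsigned int translate_one(unsigned int bit)
{
	switch (bit) {
	case EX_QP_ACCESS_FLAGS: return 0x007;  /* say, RWE | RRE | RAE       */
	case EX_QP_PKEY_INDEX:   return 0x010;  /* say, an OPTPAR_PKEY bit    */
	default:                 return 0;      /* attribute has no optpar    */
	}
}

static unsigned int fold_mask(unsigned int ib_mask)
{
	unsigned int result = 0;
	unsigned int i;

	for (i = 0; i < 8 * sizeof(ib_mask); i++)
		if ((1u << i) & ib_mask)
			result |= translate_one(1u << i);
	return result;
}

int main(void)
{
	unsigned int attr_mask = EX_QP_ACCESS_FLAGS | EX_QP_PKEY_INDEX;

	/* __mlx5_ib_modify_qp() then does: optpar &= opt_mask[cur][new][st]  */
	printf("folded optpar = 0x%x\n", fold_mask(attr_mask));
	return 0;
}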
cpu_to_be32(recv_cq->mcq.cqn) : 0; + context->params1 = cpu_to_be32(MLX5_IB_ACK_REQ_FREQ << 28); + + if (attr_mask & IB_QP_RNR_RETRY) + context->params1 |= cpu_to_be32(attr->rnr_retry << 13); + + if (attr_mask & IB_QP_RETRY_CNT) + context->params1 |= cpu_to_be32(attr->retry_cnt << 16); + + if (attr_mask & IB_QP_MAX_QP_RD_ATOMIC) { + if (attr->max_rd_atomic) + context->params1 |= + cpu_to_be32(fls(attr->max_rd_atomic - 1) << 21); + } + + if (attr_mask & IB_QP_SQ_PSN) + context->next_send_psn = cpu_to_be32(attr->sq_psn); + + if (attr_mask & IB_QP_MAX_DEST_RD_ATOMIC) { + if (attr->max_dest_rd_atomic) + context->params2 |= + cpu_to_be32(fls(attr->max_dest_rd_atomic - 1) << 21); + } + + if (attr_mask & (IB_QP_ACCESS_FLAGS | IB_QP_MAX_DEST_RD_ATOMIC)) + context->params2 |= to_mlx5_access_flags(qp, attr, attr_mask); + + if (attr_mask & IB_QP_MIN_RNR_TIMER) + context->rnr_nextrecvpsn |= cpu_to_be32(attr->min_rnr_timer << 24); + + if (attr_mask & IB_QP_RQ_PSN) + context->rnr_nextrecvpsn |= cpu_to_be32(attr->rq_psn); + + if (attr_mask & IB_QP_QKEY) + context->qkey = cpu_to_be32(attr->qkey); + + if (qp->rq.wqe_cnt && cur_state == IB_QPS_RESET && new_state == IB_QPS_INIT) + context->db_rec_addr = cpu_to_be64(qp->db.dma); + + if (cur_state == IB_QPS_RTS && new_state == IB_QPS_SQD && + attr_mask & IB_QP_EN_SQD_ASYNC_NOTIFY && attr->en_sqd_async_notify) + sqd_event = 1; + else + sqd_event = 0; + + if (!ibqp->uobject && cur_state == IB_QPS_RESET && new_state == IB_QPS_INIT) + context->sq_crq_size |= cpu_to_be16(1 << 4); + + + mlx5_cur = to_mlx5_state(cur_state); + mlx5_new = to_mlx5_state(new_state); + mlx5_st = to_mlx5_st(ibqp->qp_type); + if (mlx5_st < 0) + goto out; + + /* If moving to a reset or error state, we must disable page faults on + * this QP and flush all current page faults. Otherwise a stale page + * fault may attempt to work on this QP after it is reset and moved + * again to RTS, and may cause the driver and the device to get out of + * sync. */ + if (cur_state != IB_QPS_RESET && cur_state != IB_QPS_ERR && + (new_state == IB_QPS_RESET || new_state == IB_QPS_ERR)) + mlx5_ib_qp_disable_pagefaults(qp); + + optpar = ib_mask_to_mlx5_opt(attr_mask); + optpar &= opt_mask[mlx5_cur][mlx5_new][mlx5_st]; + in->optparam = cpu_to_be32(optpar); + err = mlx5_core_qp_modify(dev->mdev, to_mlx5_state(cur_state), + to_mlx5_state(new_state), in, sqd_event, + &qp->mqp); + if (err) + goto out; + + if (cur_state == IB_QPS_RESET && new_state == IB_QPS_INIT) + mlx5_ib_qp_enable_pagefaults(qp); + + qp->state = new_state; + + if (attr_mask & IB_QP_ACCESS_FLAGS) + qp->atomic_rd_en = attr->qp_access_flags; + if (attr_mask & IB_QP_MAX_DEST_RD_ATOMIC) + qp->resp_depth = attr->max_dest_rd_atomic; + if (attr_mask & IB_QP_PORT) + qp->port = attr->port_num; + if (attr_mask & IB_QP_ALT_PATH) + qp->alt_port = attr->alt_port_num; + + /* + * If we moved a kernel QP to RESET, clean up all old CQ + * entries and reinitialize the QP. + */ + if (new_state == IB_QPS_RESET && !ibqp->uobject) { + mlx5_ib_cq_clean(recv_cq, qp->mqp.qpn, + ibqp->srq ? 
to_msrq(ibqp->srq) : NULL); + if (send_cq != recv_cq) + mlx5_ib_cq_clean(send_cq, qp->mqp.qpn, NULL); + + qp->rq.head = 0; + qp->rq.tail = 0; + qp->sq.head = 0; + qp->sq.tail = 0; + qp->sq.cur_post = 0; + qp->sq.last_poll = 0; + qp->db.db[MLX5_RCV_DBR] = 0; + qp->db.db[MLX5_SND_DBR] = 0; + } + +out: + kfree(in); + return err; +} + +int mlx5_ib_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, + int attr_mask, struct ib_udata *udata) +{ + struct mlx5_ib_dev *dev = to_mdev(ibqp->device); + struct mlx5_ib_qp *qp = to_mqp(ibqp); + enum ib_qp_state cur_state, new_state; + struct mlx5_general_caps *gen; + int err = -EINVAL; + int port; + + gen = &dev->mdev->caps.gen; + mutex_lock(&qp->mutex); + + cur_state = attr_mask & IB_QP_CUR_STATE ? attr->cur_qp_state : qp->state; + new_state = attr_mask & IB_QP_STATE ? attr->qp_state : cur_state; + + if (ibqp->qp_type != MLX5_IB_QPT_REG_UMR && + !ib_modify_qp_is_ok(cur_state, new_state, ibqp->qp_type, attr_mask, + IB_LINK_LAYER_UNSPECIFIED)) + goto out; + + if ((attr_mask & IB_QP_PORT) && + (attr->port_num == 0 || attr->port_num > gen->num_ports)) + goto out; + + if (attr_mask & IB_QP_PKEY_INDEX) { + port = attr_mask & IB_QP_PORT ? attr->port_num : qp->port; + if (attr->pkey_index >= gen->port[port - 1].pkey_table_len) + goto out; + } + + if (attr_mask & IB_QP_MAX_QP_RD_ATOMIC && + attr->max_rd_atomic > (1 << gen->log_max_ra_res_qp)) + goto out; + + if (attr_mask & IB_QP_MAX_DEST_RD_ATOMIC && + attr->max_dest_rd_atomic > (1 << gen->log_max_ra_req_qp)) + goto out; + + if (cur_state == new_state && cur_state == IB_QPS_RESET) { + err = 0; + goto out; + } + + err = __mlx5_ib_modify_qp(ibqp, attr, attr_mask, cur_state, new_state); + +out: + mutex_unlock(&qp->mutex); + return err; +} + +static int mlx5_wq_overflow(struct mlx5_ib_wq *wq, int nreq, struct ib_cq *ib_cq) +{ + struct mlx5_ib_cq *cq; + unsigned cur; + + cur = wq->head - wq->tail; + if (likely(cur + nreq < wq->max_post)) + return 0; + + cq = to_mcq(ib_cq); + spin_lock(&cq->lock); + cur = wq->head - wq->tail; + spin_unlock(&cq->lock); + + return cur + nreq >= wq->max_post; +} + +static __always_inline void set_raddr_seg(struct mlx5_wqe_raddr_seg *rseg, + u64 remote_addr, u32 rkey) +{ + rseg->raddr = cpu_to_be64(remote_addr); + rseg->rkey = cpu_to_be32(rkey); + rseg->reserved = 0; +} + +static void set_datagram_seg(struct mlx5_wqe_datagram_seg *dseg, + struct ib_send_wr *wr) +{ + memcpy(&dseg->av, &to_mah(wr->wr.ud.ah)->av, sizeof(struct mlx5_av)); + dseg->av.dqp_dct = cpu_to_be32(wr->wr.ud.remote_qpn | MLX5_EXTENDED_UD_AV); + dseg->av.key.qkey.qkey = cpu_to_be32(wr->wr.ud.remote_qkey); +} + +static void set_data_ptr_seg(struct mlx5_wqe_data_seg *dseg, struct ib_sge *sg) +{ + dseg->byte_count = cpu_to_be32(sg->length); + dseg->lkey = cpu_to_be32(sg->lkey); + dseg->addr = cpu_to_be64(sg->addr); +} + +static __be16 get_klm_octo(int npages) +{ + return cpu_to_be16(ALIGN(npages, 8) / 2); +} + +static __be64 frwr_mkey_mask(void) +{ + u64 result; + + result = MLX5_MKEY_MASK_LEN | + MLX5_MKEY_MASK_PAGE_SIZE | + MLX5_MKEY_MASK_START_ADDR | + MLX5_MKEY_MASK_EN_RINVAL | + MLX5_MKEY_MASK_KEY | + MLX5_MKEY_MASK_LR | + MLX5_MKEY_MASK_LW | + MLX5_MKEY_MASK_RR | + MLX5_MKEY_MASK_RW | + MLX5_MKEY_MASK_A | + MLX5_MKEY_MASK_SMALL_FENCE | + MLX5_MKEY_MASK_FREE; + + return cpu_to_be64(result); +} + +static __be64 sig_mkey_mask(void) +{ + u64 result; + + result = MLX5_MKEY_MASK_LEN | + MLX5_MKEY_MASK_PAGE_SIZE | + MLX5_MKEY_MASK_START_ADDR | + MLX5_MKEY_MASK_EN_SIGERR | + MLX5_MKEY_MASK_EN_RINVAL | + MLX5_MKEY_MASK_KEY 
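mlx5_wq_overflow() above measures queue occupancy as head - tail in unsigned arithmetic, so the count stays correct even after both counters wrap, and it only re-reads the occupancy under the completion queue's lock before declaring overflow. The wraparound property is illustrated below with arbitrary example values.

/* Shows why "head - tail" in mlx5_wq_overflow() keeps working after the
 * unsigned counters wrap around; the numbers are arbitrary examples. */
#include <stdio.h>

int main(void)
{
	unsigned int head = 5;           /* producer has wrapped past UINT_MAX */
	unsigned int tail = 0xfffffffbu; /* consumer is still just below it    */
	unsigned int max_post = 64;
	unsigned int nreq = 3;
	unsigned int cur = head - tail;  /* modular difference: 10 in flight   */

	printf("in-flight WQEs: %u\n", cur);
	printf("room for %u more: %s\n", nreq,
	       cur + nreq < max_post ? "yes" : "no");
	return 0;
}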
| + MLX5_MKEY_MASK_LR | + MLX5_MKEY_MASK_LW | + MLX5_MKEY_MASK_RR | + MLX5_MKEY_MASK_RW | + MLX5_MKEY_MASK_SMALL_FENCE | + MLX5_MKEY_MASK_FREE | + MLX5_MKEY_MASK_BSF_EN; + + return cpu_to_be64(result); +} + +static void set_frwr_umr_segment(struct mlx5_wqe_umr_ctrl_seg *umr, + struct ib_send_wr *wr, int li) +{ + memset(umr, 0, sizeof(*umr)); + + if (li) { + umr->mkey_mask = cpu_to_be64(MLX5_MKEY_MASK_FREE); + umr->flags = 1 << 7; + return; + } + + umr->flags = (1 << 5); /* fail if not free */ + umr->klm_octowords = get_klm_octo(wr->wr.fast_reg.page_list_len); + umr->mkey_mask = frwr_mkey_mask(); +} + +static __be64 get_umr_reg_mr_mask(void) +{ + u64 result; + + result = MLX5_MKEY_MASK_LEN | + MLX5_MKEY_MASK_PAGE_SIZE | + MLX5_MKEY_MASK_START_ADDR | + MLX5_MKEY_MASK_PD | + MLX5_MKEY_MASK_LR | + MLX5_MKEY_MASK_LW | + MLX5_MKEY_MASK_KEY | + MLX5_MKEY_MASK_RR | + MLX5_MKEY_MASK_RW | + MLX5_MKEY_MASK_A | + MLX5_MKEY_MASK_FREE; + + return cpu_to_be64(result); +} + +static __be64 get_umr_unreg_mr_mask(void) +{ + u64 result; + + result = MLX5_MKEY_MASK_FREE; + + return cpu_to_be64(result); +} + +static __be64 get_umr_update_mtt_mask(void) +{ + u64 result; + + result = MLX5_MKEY_MASK_FREE; + + return cpu_to_be64(result); +} + +static void set_reg_umr_segment(struct mlx5_wqe_umr_ctrl_seg *umr, + struct ib_send_wr *wr) +{ + struct mlx5_umr_wr *umrwr = (struct mlx5_umr_wr *)&wr->wr.fast_reg; + + memset(umr, 0, sizeof(*umr)); + + if (wr->send_flags & MLX5_IB_SEND_UMR_FAIL_IF_FREE) + umr->flags = MLX5_UMR_CHECK_FREE; /* fail if free */ + else + umr->flags = MLX5_UMR_CHECK_NOT_FREE; /* fail if not free */ + + if (!(wr->send_flags & MLX5_IB_SEND_UMR_UNREG)) { + umr->klm_octowords = get_klm_octo(umrwr->npages); + if (wr->send_flags & MLX5_IB_SEND_UMR_UPDATE_MTT) { + umr->mkey_mask = get_umr_update_mtt_mask(); + umr->bsf_octowords = get_klm_octo(umrwr->target.offset); + umr->flags |= MLX5_UMR_TRANSLATION_OFFSET_EN; + } else { + umr->mkey_mask = get_umr_reg_mr_mask(); + } + } else { + umr->mkey_mask = get_umr_unreg_mr_mask(); + } + + if (!wr->num_sge) + umr->flags |= MLX5_UMR_INLINE; +} + +static u8 get_umr_flags(int acc) +{ + return (acc & IB_ACCESS_REMOTE_ATOMIC ? MLX5_PERM_ATOMIC : 0) | + (acc & IB_ACCESS_REMOTE_WRITE ? MLX5_PERM_REMOTE_WRITE : 0) | + (acc & IB_ACCESS_REMOTE_READ ? MLX5_PERM_REMOTE_READ : 0) | + (acc & IB_ACCESS_LOCAL_WRITE ? 
MLX5_PERM_LOCAL_WRITE : 0) | + MLX5_PERM_LOCAL_READ | MLX5_PERM_UMR_EN; +} + +static void set_mkey_segment(struct mlx5_mkey_seg *seg, struct ib_send_wr *wr, + int li, int *writ) +{ + memset(seg, 0, sizeof(*seg)); + if (li) { + seg->status = MLX5_MKEY_STATUS_FREE; + return; + } + + seg->flags = get_umr_flags(wr->wr.fast_reg.access_flags) | + MLX5_ACCESS_MODE_MTT; + *writ = seg->flags & (MLX5_PERM_LOCAL_WRITE | IB_ACCESS_REMOTE_WRITE); + seg->qpn_mkey7_0 = cpu_to_be32((wr->wr.fast_reg.rkey & 0xff) | 0xffffff00); + seg->flags_pd = cpu_to_be32(MLX5_MKEY_REMOTE_INVAL); + seg->start_addr = cpu_to_be64(wr->wr.fast_reg.iova_start); + seg->len = cpu_to_be64(wr->wr.fast_reg.length); + seg->xlt_oct_size = cpu_to_be32((wr->wr.fast_reg.page_list_len + 1) / 2); + seg->log2_page_size = wr->wr.fast_reg.page_shift; +} + +static void set_reg_mkey_segment(struct mlx5_mkey_seg *seg, struct ib_send_wr *wr) +{ + struct mlx5_umr_wr *umrwr = (struct mlx5_umr_wr *)&wr->wr.fast_reg; + + memset(seg, 0, sizeof(*seg)); + if (wr->send_flags & MLX5_IB_SEND_UMR_UNREG) { + seg->status = MLX5_MKEY_STATUS_FREE; + return; + } + + seg->flags = convert_access(umrwr->access_flags); + if (!(wr->send_flags & MLX5_IB_SEND_UMR_UPDATE_MTT)) { + seg->flags_pd = cpu_to_be32(to_mpd(umrwr->pd)->pdn); + seg->start_addr = cpu_to_be64(umrwr->target.virt_addr); + } + seg->len = cpu_to_be64(umrwr->length); + seg->log2_page_size = umrwr->page_shift; + seg->qpn_mkey7_0 = cpu_to_be32(0xffffff00 | + mlx5_mkey_variant(umrwr->mkey)); +} + +static void set_frwr_pages(struct mlx5_wqe_data_seg *dseg, + struct ib_send_wr *wr, + struct mlx5_core_dev *mdev, + struct mlx5_ib_pd *pd, + int writ) +{ + struct mlx5_ib_fast_reg_page_list *mfrpl = to_mfrpl(wr->wr.fast_reg.page_list); + u64 *page_list = wr->wr.fast_reg.page_list->page_list; + u64 perm = MLX5_EN_RD | (writ ? 
MLX5_EN_WR : 0); + int i; + + for (i = 0; i < wr->wr.fast_reg.page_list_len; i++) + mfrpl->mapped_page_list[i] = cpu_to_be64(page_list[i] | perm); + dseg->addr = cpu_to_be64(mfrpl->map); + dseg->byte_count = cpu_to_be32(ALIGN(sizeof(u64) * wr->wr.fast_reg.page_list_len, 64)); + dseg->lkey = cpu_to_be32(pd->pa_lkey); +} + +static __be32 send_ieth(struct ib_send_wr *wr) +{ + switch (wr->opcode) { + case IB_WR_SEND_WITH_IMM: + case IB_WR_RDMA_WRITE_WITH_IMM: + return wr->ex.imm_data; + + case IB_WR_SEND_WITH_INV: + return cpu_to_be32(wr->ex.invalidate_rkey); + + default: + return 0; + } +} + +static u8 calc_sig(void *wqe, int size) +{ + u8 *p = wqe; + u8 res = 0; + int i; + + for (i = 0; i < size; i++) + res ^= p[i]; + + return ~res; +} + +static u8 wq_sig(void *wqe) +{ + return calc_sig(wqe, (*((u8 *)wqe + 8) & 0x3f) << 4); +} + +static int set_data_inl_seg(struct mlx5_ib_qp *qp, struct ib_send_wr *wr, + void *wqe, int *sz) +{ + struct mlx5_wqe_inline_seg *seg; + void *qend = qp->sq.qend; + void *addr; + int inl = 0; + int copy; + int len; + int i; + + seg = wqe; + wqe += sizeof(*seg); + for (i = 0; i < wr->num_sge; i++) { + addr = (void *)(unsigned long)(wr->sg_list[i].addr); + len = wr->sg_list[i].length; + inl += len; + + if (unlikely(inl > qp->max_inline_data)) + return -ENOMEM; + + if (unlikely(wqe + len > qend)) { + copy = qend - wqe; + memcpy(wqe, addr, copy); + addr += copy; + len -= copy; + wqe = mlx5_get_send_wqe(qp, 0); + } + memcpy(wqe, addr, len); + wqe += len; + } + + seg->byte_count = cpu_to_be32(inl | MLX5_INLINE_SEG); + + *sz = ALIGN(inl + sizeof(seg->byte_count), 16) / 16; + + return 0; +} + +static u16 prot_field_size(enum ib_signature_type type) +{ + switch (type) { + case IB_SIG_TYPE_T10_DIF: + return MLX5_DIF_SIZE; + default: + return 0; + } +} + +static u8 bs_selector(int block_size) +{ + switch (block_size) { + case 512: return 0x1; + case 520: return 0x2; + case 4096: return 0x3; + case 4160: return 0x4; + case 1073741824: return 0x5; + default: return 0; + } +} + +static void mlx5_fill_inl_bsf(struct ib_sig_domain *domain, + struct mlx5_bsf_inl *inl) +{ + /* Valid inline section and allow BSF refresh */ + inl->vld_refresh = cpu_to_be16(MLX5_BSF_INL_VALID | + MLX5_BSF_REFRESH_DIF); + inl->dif_apptag = cpu_to_be16(domain->sig.dif.app_tag); + inl->dif_reftag = cpu_to_be32(domain->sig.dif.ref_tag); + /* repeating block */ + inl->rp_inv_seed = MLX5_BSF_REPEAT_BLOCK; + inl->sig_type = domain->sig.dif.bg_type == IB_T10DIF_CRC ? 
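calc_sig() above is a one-byte XOR checksum over the WQE, returned complemented. The standalone check below demonstrates the invariant this gives: XOR-ing the covered bytes together with the stored signature always yields 0xff, whatever the data, which is what makes the value cheap to verify. The buffer contents are arbitrary example data.

/* Demonstrates the invariant behind calc_sig(): for sig = ~XOR(bytes),
 * XOR(bytes) ^ sig == 0xff regardless of the data.  Example data only. */
#include <stdio.h>
#include <stdint.h>

static uint8_t calc_sig(const void *wqe, int size)
{
	const uint8_t *p = wqe;
	uint8_t res = 0;
	int i;

	for (i = 0; i < size; i++)
		res ^= p[i];
	return (uint8_t)~res;
}

int main(void)
{
	uint8_t wqe[64];
	uint8_t x = 0, sig;
	int i;

	for (i = 0; i < 64; i++)          /* arbitrary stand-in WQE contents */
		wqe[i] = (uint8_t)(i * 7 + 3);

	sig = calc_sig(wqe, 64);
	for (i = 0; i < 64; i++)
		x ^= wqe[i];

	printf("signature = 0x%02x, xor(data) ^ signature = 0x%02x\n",
	       sig, (uint8_t)(x ^ sig));  /* second value is always 0xff */
	return 0;
}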
+ MLX5_DIF_CRC : MLX5_DIF_IPCS; + + if (domain->sig.dif.ref_remap) + inl->dif_inc_ref_guard_check |= MLX5_BSF_INC_REFTAG; + + if (domain->sig.dif.app_escape) { + if (domain->sig.dif.ref_escape) + inl->dif_inc_ref_guard_check |= MLX5_BSF_APPREF_ESCAPE; + else + inl->dif_inc_ref_guard_check |= MLX5_BSF_APPTAG_ESCAPE; + } + + inl->dif_app_bitmask_check = + cpu_to_be16(domain->sig.dif.apptag_check_mask); +} + +static int mlx5_set_bsf(struct ib_mr *sig_mr, + struct ib_sig_attrs *sig_attrs, + struct mlx5_bsf *bsf, u32 data_size) +{ + struct mlx5_core_sig_ctx *msig = to_mmr(sig_mr)->sig; + struct mlx5_bsf_basic *basic = &bsf->basic; + struct ib_sig_domain *mem = &sig_attrs->mem; + struct ib_sig_domain *wire = &sig_attrs->wire; + + memset(bsf, 0, sizeof(*bsf)); + + /* Basic + Extended + Inline */ + basic->bsf_size_sbs = 1 << 7; + /* Input domain check byte mask */ + basic->check_byte_mask = sig_attrs->check_mask; + basic->raw_data_size = cpu_to_be32(data_size); + + /* Memory domain */ + switch (sig_attrs->mem.sig_type) { + case IB_SIG_TYPE_NONE: + break; + case IB_SIG_TYPE_T10_DIF: + basic->mem.bs_selector = bs_selector(mem->sig.dif.pi_interval); + basic->m_bfs_psv = cpu_to_be32(msig->psv_memory.psv_idx); + mlx5_fill_inl_bsf(mem, &bsf->m_inl); + break; + default: + return -EINVAL; + } + + /* Wire domain */ + switch (sig_attrs->wire.sig_type) { + case IB_SIG_TYPE_NONE: + break; + case IB_SIG_TYPE_T10_DIF: + if (mem->sig.dif.pi_interval == wire->sig.dif.pi_interval && + mem->sig_type == wire->sig_type) { + /* Same block structure */ + basic->bsf_size_sbs |= 1 << 4; + if (mem->sig.dif.bg_type == wire->sig.dif.bg_type) + basic->wire.copy_byte_mask |= MLX5_CPY_GRD_MASK; + if (mem->sig.dif.app_tag == wire->sig.dif.app_tag) + basic->wire.copy_byte_mask |= MLX5_CPY_APP_MASK; + if (mem->sig.dif.ref_tag == wire->sig.dif.ref_tag) + basic->wire.copy_byte_mask |= MLX5_CPY_REF_MASK; + } else + basic->wire.bs_selector = bs_selector(wire->sig.dif.pi_interval); + + basic->w_bfs_psv = cpu_to_be32(msig->psv_wire.psv_idx); + mlx5_fill_inl_bsf(wire, &bsf->w_inl); + break; + default: + return -EINVAL; + } + + return 0; +} + +static int set_sig_data_segment(struct ib_send_wr *wr, struct mlx5_ib_qp *qp, + void **seg, int *size) +{ + struct ib_sig_attrs *sig_attrs = wr->wr.sig_handover.sig_attrs; + struct ib_mr *sig_mr = wr->wr.sig_handover.sig_mr; + struct mlx5_bsf *bsf; + u32 data_len = wr->sg_list->length; + u32 data_key = wr->sg_list->lkey; + u64 data_va = wr->sg_list->addr; + int ret; + int wqe_size; + + if (!wr->wr.sig_handover.prot || + (data_key == wr->wr.sig_handover.prot->lkey && + data_va == wr->wr.sig_handover.prot->addr && + data_len == wr->wr.sig_handover.prot->length)) { + /** + * Source domain doesn't contain signature information + * or data and protection are interleaved in memory. 
+ * So need construct: + * ------------------ + * | data_klm | + * ------------------ + * | BSF | + * ------------------ + **/ + struct mlx5_klm *data_klm = *seg; + + data_klm->bcount = cpu_to_be32(data_len); + data_klm->key = cpu_to_be32(data_key); + data_klm->va = cpu_to_be64(data_va); + wqe_size = ALIGN(sizeof(*data_klm), 64); + } else { + /** + * Source domain contains signature information + * So need construct a strided block format: + * --------------------------- + * | stride_block_ctrl | + * --------------------------- + * | data_klm | + * --------------------------- + * | prot_klm | + * --------------------------- + * | BSF | + * --------------------------- + **/ + struct mlx5_stride_block_ctrl_seg *sblock_ctrl; + struct mlx5_stride_block_entry *data_sentry; + struct mlx5_stride_block_entry *prot_sentry; + u32 prot_key = wr->wr.sig_handover.prot->lkey; + u64 prot_va = wr->wr.sig_handover.prot->addr; + u16 block_size = sig_attrs->mem.sig.dif.pi_interval; + int prot_size; + + sblock_ctrl = *seg; + data_sentry = (void *)sblock_ctrl + sizeof(*sblock_ctrl); + prot_sentry = (void *)data_sentry + sizeof(*data_sentry); + + prot_size = prot_field_size(sig_attrs->mem.sig_type); + if (!prot_size) { + pr_err("Bad block size given: %u\n", block_size); + return -EINVAL; + } + sblock_ctrl->bcount_per_cycle = cpu_to_be32(block_size + + prot_size); + sblock_ctrl->op = cpu_to_be32(MLX5_STRIDE_BLOCK_OP); + sblock_ctrl->repeat_count = cpu_to_be32(data_len / block_size); + sblock_ctrl->num_entries = cpu_to_be16(2); + + data_sentry->bcount = cpu_to_be16(block_size); + data_sentry->key = cpu_to_be32(data_key); + data_sentry->va = cpu_to_be64(data_va); + data_sentry->stride = cpu_to_be16(block_size); + + prot_sentry->bcount = cpu_to_be16(prot_size); + prot_sentry->key = cpu_to_be32(prot_key); + prot_sentry->va = cpu_to_be64(prot_va); + prot_sentry->stride = cpu_to_be16(prot_size); + + wqe_size = ALIGN(sizeof(*sblock_ctrl) + sizeof(*data_sentry) + + sizeof(*prot_sentry), 64); + } + + *seg += wqe_size; + *size += wqe_size / 16; + if (unlikely((*seg == qp->sq.qend))) + *seg = mlx5_get_send_wqe(qp, 0); + + bsf = *seg; + ret = mlx5_set_bsf(sig_mr, sig_attrs, bsf, data_len); + if (ret) + return -EINVAL; + + *seg += sizeof(*bsf); + *size += sizeof(*bsf) / 16; + if (unlikely((*seg == qp->sq.qend))) + *seg = mlx5_get_send_wqe(qp, 0); + + return 0; +} + +static void set_sig_mkey_segment(struct mlx5_mkey_seg *seg, + struct ib_send_wr *wr, u32 nelements, + u32 length, u32 pdn) +{ + struct ib_mr *sig_mr = wr->wr.sig_handover.sig_mr; + u32 sig_key = sig_mr->rkey; + u8 sigerr = to_mmr(sig_mr)->sig->sigerr_count & 1; + + memset(seg, 0, sizeof(*seg)); + + seg->flags = get_umr_flags(wr->wr.sig_handover.access_flags) | + MLX5_ACCESS_MODE_KLM; + seg->qpn_mkey7_0 = cpu_to_be32((sig_key & 0xff) | 0xffffff00); + seg->flags_pd = cpu_to_be32(MLX5_MKEY_REMOTE_INVAL | sigerr << 26 | + MLX5_MKEY_BSF_EN | pdn); + seg->len = cpu_to_be64(length); + seg->xlt_oct_size = cpu_to_be32(be16_to_cpu(get_klm_octo(nelements))); + seg->bsfs_octo_size = cpu_to_be32(MLX5_MKEY_BSF_OCTO_SIZE); +} + +static void set_sig_umr_segment(struct mlx5_wqe_umr_ctrl_seg *umr, + struct ib_send_wr *wr, u32 nelements) +{ + memset(umr, 0, sizeof(*umr)); + + umr->flags = MLX5_FLAGS_INLINE | MLX5_FLAGS_CHECK_FREE; + umr->klm_octowords = get_klm_octo(nelements); + umr->bsf_octowords = cpu_to_be16(MLX5_MKEY_BSF_OCTO_SIZE); + umr->mkey_mask = sig_mkey_mask(); +} + + +static int set_sig_umr_wr(struct ib_send_wr *wr, struct mlx5_ib_qp *qp, + void **seg, int *size) +{ + 
struct mlx5_ib_mr *sig_mr = to_mmr(wr->wr.sig_handover.sig_mr); + u32 pdn = get_pd(qp)->pdn; + u32 klm_oct_size; + int region_len, ret; + + if (unlikely(wr->num_sge != 1) || + unlikely(wr->wr.sig_handover.access_flags & + IB_ACCESS_REMOTE_ATOMIC) || + unlikely(!sig_mr->sig) || unlikely(!qp->signature_en) || + unlikely(!sig_mr->sig->sig_status_checked)) + return -EINVAL; + + /* length of the protected region, data + protection */ + region_len = wr->sg_list->length; + if (wr->wr.sig_handover.prot && + (wr->wr.sig_handover.prot->lkey != wr->sg_list->lkey || + wr->wr.sig_handover.prot->addr != wr->sg_list->addr || + wr->wr.sig_handover.prot->length != wr->sg_list->length)) + region_len += wr->wr.sig_handover.prot->length; + + /** + * KLM octoword size - if protection was provided + * then we use strided block format (3 octowords), + * else we use single KLM (1 octoword) + **/ + klm_oct_size = wr->wr.sig_handover.prot ? 3 : 1; + + set_sig_umr_segment(*seg, wr, klm_oct_size); + *seg += sizeof(struct mlx5_wqe_umr_ctrl_seg); + *size += sizeof(struct mlx5_wqe_umr_ctrl_seg) / 16; + if (unlikely((*seg == qp->sq.qend))) + *seg = mlx5_get_send_wqe(qp, 0); + + set_sig_mkey_segment(*seg, wr, klm_oct_size, region_len, pdn); + *seg += sizeof(struct mlx5_mkey_seg); + *size += sizeof(struct mlx5_mkey_seg) / 16; + if (unlikely((*seg == qp->sq.qend))) + *seg = mlx5_get_send_wqe(qp, 0); + + ret = set_sig_data_segment(wr, qp, seg, size); + if (ret) + return ret; + + sig_mr->sig->sig_status_checked = false; + return 0; +} + +static int set_psv_wr(struct ib_sig_domain *domain, + u32 psv_idx, void **seg, int *size) +{ + struct mlx5_seg_set_psv *psv_seg = *seg; + + memset(psv_seg, 0, sizeof(*psv_seg)); + psv_seg->psv_num = cpu_to_be32(psv_idx); + switch (domain->sig_type) { + case IB_SIG_TYPE_NONE: + break; + case IB_SIG_TYPE_T10_DIF: + psv_seg->transient_sig = cpu_to_be32(domain->sig.dif.bg << 16 | + domain->sig.dif.app_tag); + psv_seg->ref_tag = cpu_to_be32(domain->sig.dif.ref_tag); + break; + default: + pr_err("Bad signature type given.\n"); + return 1; + } + + *seg += sizeof(*psv_seg); + *size += sizeof(*psv_seg) / 16; + + return 0; +} + +static int set_frwr_li_wr(void **seg, struct ib_send_wr *wr, int *size, + struct mlx5_core_dev *mdev, struct mlx5_ib_pd *pd, struct mlx5_ib_qp *qp) +{ + int writ = 0; + int li; + + li = wr->opcode == IB_WR_LOCAL_INV ? 
1 : 0; + if (unlikely(wr->send_flags & IB_SEND_INLINE)) + return -EINVAL; + + set_frwr_umr_segment(*seg, wr, li); + *seg += sizeof(struct mlx5_wqe_umr_ctrl_seg); + *size += sizeof(struct mlx5_wqe_umr_ctrl_seg) / 16; + if (unlikely((*seg == qp->sq.qend))) + *seg = mlx5_get_send_wqe(qp, 0); + set_mkey_segment(*seg, wr, li, &writ); + *seg += sizeof(struct mlx5_mkey_seg); + *size += sizeof(struct mlx5_mkey_seg) / 16; + if (unlikely((*seg == qp->sq.qend))) + *seg = mlx5_get_send_wqe(qp, 0); + if (!li) { + if (unlikely(wr->wr.fast_reg.page_list_len > + wr->wr.fast_reg.page_list->max_page_list_len)) + return -ENOMEM; + + set_frwr_pages(*seg, wr, mdev, pd, writ); + *seg += sizeof(struct mlx5_wqe_data_seg); + *size += (sizeof(struct mlx5_wqe_data_seg) / 16); + } + return 0; +} + +static void dump_wqe(struct mlx5_ib_qp *qp, int idx, int size_16) +{ + __be32 *p = NULL; + int tidx = idx; + int i, j; + + pr_debug("dump wqe at %p\n", mlx5_get_send_wqe(qp, tidx)); + for (i = 0, j = 0; i < size_16 * 4; i += 4, j += 4) { + if ((i & 0xf) == 0) { + void *buf = mlx5_get_send_wqe(qp, tidx); + tidx = (tidx + 1) & (qp->sq.wqe_cnt - 1); + p = buf; + j = 0; + } + pr_debug("%08x %08x %08x %08x\n", be32_to_cpu(p[j]), + be32_to_cpu(p[j + 1]), be32_to_cpu(p[j + 2]), + be32_to_cpu(p[j + 3])); + } +} + +static void mlx5_bf_copy(u64 __iomem *dst, u64 *src, + unsigned bytecnt, struct mlx5_ib_qp *qp) +{ + while (bytecnt > 0) { + __iowrite64_copy(dst++, src++, 8); + __iowrite64_copy(dst++, src++, 8); + __iowrite64_copy(dst++, src++, 8); + __iowrite64_copy(dst++, src++, 8); + __iowrite64_copy(dst++, src++, 8); + __iowrite64_copy(dst++, src++, 8); + __iowrite64_copy(dst++, src++, 8); + __iowrite64_copy(dst++, src++, 8); + bytecnt -= 64; + if (unlikely(src == qp->sq.qend)) + src = mlx5_get_send_wqe(qp, 0); + } +} + +static u8 get_fence(u8 fence, struct ib_send_wr *wr) +{ + if (unlikely(wr->opcode == IB_WR_LOCAL_INV && + wr->send_flags & IB_SEND_FENCE)) + return MLX5_FENCE_MODE_STRONG_ORDERING; + + if (unlikely(fence)) { + if (wr->send_flags & IB_SEND_FENCE) + return MLX5_FENCE_MODE_SMALL_AND_FENCE; + else + return fence; + + } else { + return 0; + } +} + +static int begin_wqe(struct mlx5_ib_qp *qp, void **seg, + struct mlx5_wqe_ctrl_seg **ctrl, + struct ib_send_wr *wr, unsigned *idx, + int *size, int nreq) +{ + int err = 0; + + if (unlikely(mlx5_wq_overflow(&qp->sq, nreq, qp->ibqp.send_cq))) { + err = -ENOMEM; + return err; + } + + *idx = qp->sq.cur_post & (qp->sq.wqe_cnt - 1); + *seg = mlx5_get_send_wqe(qp, *idx); + *ctrl = *seg; + *(uint32_t *)(*seg + 8) = 0; + (*ctrl)->imm = send_ieth(wr); + (*ctrl)->fm_ce_se = qp->sq_signal_bits | + (wr->send_flags & IB_SEND_SIGNALED ? + MLX5_WQE_CTRL_CQ_UPDATE : 0) | + (wr->send_flags & IB_SEND_SOLICITED ? 
+ MLX5_WQE_CTRL_SOLICITED : 0); + + *seg += sizeof(**ctrl); + *size = sizeof(**ctrl) / 16; + + return err; +} + +static void finish_wqe(struct mlx5_ib_qp *qp, + struct mlx5_wqe_ctrl_seg *ctrl, + u8 size, unsigned idx, u64 wr_id, + int nreq, u8 fence, u8 next_fence, + u32 mlx5_opcode) +{ + u8 opmod = 0; + + ctrl->opmod_idx_opcode = cpu_to_be32(((u32)(qp->sq.cur_post) << 8) | + mlx5_opcode | ((u32)opmod << 24)); + ctrl->qpn_ds = cpu_to_be32(size | (qp->mqp.qpn << 8)); + ctrl->fm_ce_se |= fence; + qp->fm_cache = next_fence; + if (unlikely(qp->wq_sig)) + ctrl->signature = wq_sig(ctrl); + + qp->sq.wrid[idx] = wr_id; + qp->sq.w_list[idx].opcode = mlx5_opcode; + qp->sq.wqe_head[idx] = qp->sq.head + nreq; + qp->sq.cur_post += DIV_ROUND_UP(size * 16, MLX5_SEND_WQE_BB); + qp->sq.w_list[idx].next = qp->sq.cur_post; +} + + +int mlx5_ib_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr, + struct ib_send_wr **bad_wr) +{ + struct mlx5_wqe_ctrl_seg *ctrl = NULL; /* compiler warning */ + struct mlx5_ib_dev *dev = to_mdev(ibqp->device); + struct mlx5_core_dev *mdev = dev->mdev; + struct mlx5_ib_qp *qp = to_mqp(ibqp); + struct mlx5_ib_mr *mr; + struct mlx5_wqe_data_seg *dpseg; + struct mlx5_wqe_xrc_seg *xrc; + struct mlx5_bf *bf = qp->bf; + int uninitialized_var(size); + void *qend = qp->sq.qend; + unsigned long flags; + unsigned idx; + int err = 0; + int inl = 0; + int num_sge; + void *seg; + int nreq; + int i; + u8 next_fence = 0; + u8 fence; + + spin_lock_irqsave(&qp->sq.lock, flags); + + for (nreq = 0; wr; nreq++, wr = wr->next) { + if (unlikely(wr->opcode >= ARRAY_SIZE(mlx5_ib_opcode))) { + mlx5_ib_warn(dev, "\n"); + err = -EINVAL; + *bad_wr = wr; + goto out; + } + + fence = qp->fm_cache; + num_sge = wr->num_sge; + if (unlikely(num_sge > qp->sq.max_gs)) { + mlx5_ib_warn(dev, "\n"); + err = -ENOMEM; + *bad_wr = wr; + goto out; + } + + err = begin_wqe(qp, &seg, &ctrl, wr, &idx, &size, nreq); + if (err) { + mlx5_ib_warn(dev, "\n"); + err = -ENOMEM; + *bad_wr = wr; + goto out; + } + + switch (ibqp->qp_type) { + case IB_QPT_XRC_INI: + xrc = seg; + xrc->xrc_srqn = htonl(wr->xrc_remote_srq_num); + seg += sizeof(*xrc); + size += sizeof(*xrc) / 16; + /* fall through */ + case IB_QPT_RC: + switch (wr->opcode) { + case IB_WR_RDMA_READ: + case IB_WR_RDMA_WRITE: + case IB_WR_RDMA_WRITE_WITH_IMM: + set_raddr_seg(seg, wr->wr.rdma.remote_addr, + wr->wr.rdma.rkey); + seg += sizeof(struct mlx5_wqe_raddr_seg); + size += sizeof(struct mlx5_wqe_raddr_seg) / 16; + break; + + case IB_WR_ATOMIC_CMP_AND_SWP: + case IB_WR_ATOMIC_FETCH_AND_ADD: + case IB_WR_MASKED_ATOMIC_CMP_AND_SWP: + mlx5_ib_warn(dev, "Atomic operations are not supported yet\n"); + err = -ENOSYS; + *bad_wr = wr; + goto out; + + case IB_WR_LOCAL_INV: + next_fence = MLX5_FENCE_MODE_INITIATOR_SMALL; + qp->sq.wr_data[idx] = IB_WR_LOCAL_INV; + ctrl->imm = cpu_to_be32(wr->ex.invalidate_rkey); + err = set_frwr_li_wr(&seg, wr, &size, mdev, to_mpd(ibqp->pd), qp); + if (err) { + mlx5_ib_warn(dev, "\n"); + *bad_wr = wr; + goto out; + } + num_sge = 0; + break; + + case IB_WR_FAST_REG_MR: + next_fence = MLX5_FENCE_MODE_INITIATOR_SMALL; + qp->sq.wr_data[idx] = IB_WR_FAST_REG_MR; + ctrl->imm = cpu_to_be32(wr->wr.fast_reg.rkey); + err = set_frwr_li_wr(&seg, wr, &size, mdev, to_mpd(ibqp->pd), qp); + if (err) { + mlx5_ib_warn(dev, "\n"); + *bad_wr = wr; + goto out; + } + num_sge = 0; + break; + + case IB_WR_REG_SIG_MR: + qp->sq.wr_data[idx] = IB_WR_REG_SIG_MR; + mr = to_mmr(wr->wr.sig_handover.sig_mr); + + ctrl->imm = cpu_to_be32(mr->ibmr.rkey); + err = set_sig_umr_wr(wr, qp, 
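begin_wqe() and finish_wqe() above carry the send-queue producer index in cur_post: the WQE slot is cur_post masked by the power-of-two queue size, and after building a WQE of `size` 16-byte units the index advances by the number of 64-byte basic blocks it occupied. The sketch below replays that arithmetic with example sizes; it assumes MLX5_SEND_WQE_BB == 64, as in the headers this patch imports, and is not driver code.

/* Sketch of the producer-index arithmetic in begin_wqe()/finish_wqe().
 * Assumes a 64-byte basic block; the WQE sizes below are just examples. */
#include <stdio.h>

#define SEND_WQE_BB 64                       /* assumed bytes per basic block */
#define DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d))

int main(void)
{
	unsigned int wqe_cnt = 8;                /* power of two, as enforced above */
	unsigned int cur_post = 0;
	unsigned int sizes_16b[] = { 4, 3, 7 };  /* example WQE sizes, 16-byte units */
	unsigned int i;

	for (i = 0; i < 3; i++) {
		unsigned int idx = cur_post & (wqe_cnt - 1);  /* slot for this WQE */
		unsigned int bbs = DIV_ROUND_UP(sizes_16b[i] * 16, SEND_WQE_BB);

		printf("WQE of %u bytes posted at slot %u, advances cur_post by %u\n",
		       sizes_16b[i] * 16, idx, bbs);
		cur_post += bbs;                 /* same update as finish_wqe() */
	}
	return 0;
}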
&seg, &size); + if (err) { + mlx5_ib_warn(dev, "\n"); + *bad_wr = wr; + goto out; + } + + finish_wqe(qp, ctrl, size, idx, wr->wr_id, + nreq, get_fence(fence, wr), + next_fence, MLX5_OPCODE_UMR); + /* + * SET_PSV WQEs are not signaled and solicited + * on error + */ + wr->send_flags &= ~IB_SEND_SIGNALED; + wr->send_flags |= IB_SEND_SOLICITED; + err = begin_wqe(qp, &seg, &ctrl, wr, + &idx, &size, nreq); + if (err) { + mlx5_ib_warn(dev, "\n"); + err = -ENOMEM; + *bad_wr = wr; + goto out; + } + + err = set_psv_wr(&wr->wr.sig_handover.sig_attrs->mem, + mr->sig->psv_memory.psv_idx, &seg, + &size); + if (err) { + mlx5_ib_warn(dev, "\n"); + *bad_wr = wr; + goto out; + } + + finish_wqe(qp, ctrl, size, idx, wr->wr_id, + nreq, get_fence(fence, wr), + next_fence, MLX5_OPCODE_SET_PSV); + err = begin_wqe(qp, &seg, &ctrl, wr, + &idx, &size, nreq); + if (err) { + mlx5_ib_warn(dev, "\n"); + err = -ENOMEM; + *bad_wr = wr; + goto out; + } + + next_fence = MLX5_FENCE_MODE_INITIATOR_SMALL; + err = set_psv_wr(&wr->wr.sig_handover.sig_attrs->wire, + mr->sig->psv_wire.psv_idx, &seg, + &size); + if (err) { + mlx5_ib_warn(dev, "\n"); + *bad_wr = wr; + goto out; + } + + finish_wqe(qp, ctrl, size, idx, wr->wr_id, + nreq, get_fence(fence, wr), + next_fence, MLX5_OPCODE_SET_PSV); + num_sge = 0; + goto skip_psv; + + default: + break; + } + break; + + case IB_QPT_UC: + switch (wr->opcode) { + case IB_WR_RDMA_WRITE: + case IB_WR_RDMA_WRITE_WITH_IMM: + set_raddr_seg(seg, wr->wr.rdma.remote_addr, + wr->wr.rdma.rkey); + seg += sizeof(struct mlx5_wqe_raddr_seg); + size += sizeof(struct mlx5_wqe_raddr_seg) / 16; + break; + + default: + break; + } + break; + + case IB_QPT_UD: + case IB_QPT_SMI: + case IB_QPT_GSI: + set_datagram_seg(seg, wr); + seg += sizeof(struct mlx5_wqe_datagram_seg); + size += sizeof(struct mlx5_wqe_datagram_seg) / 16; + if (unlikely((seg == qend))) + seg = mlx5_get_send_wqe(qp, 0); + break; + + case MLX5_IB_QPT_REG_UMR: + if (wr->opcode != MLX5_IB_WR_UMR) { + err = -EINVAL; + mlx5_ib_warn(dev, "bad opcode\n"); + goto out; + } + qp->sq.wr_data[idx] = MLX5_IB_WR_UMR; + ctrl->imm = cpu_to_be32(wr->wr.fast_reg.rkey); + set_reg_umr_segment(seg, wr); + seg += sizeof(struct mlx5_wqe_umr_ctrl_seg); + size += sizeof(struct mlx5_wqe_umr_ctrl_seg) / 16; + if (unlikely((seg == qend))) + seg = mlx5_get_send_wqe(qp, 0); + set_reg_mkey_segment(seg, wr); + seg += sizeof(struct mlx5_mkey_seg); + size += sizeof(struct mlx5_mkey_seg) / 16; + if (unlikely((seg == qend))) + seg = mlx5_get_send_wqe(qp, 0); + break; + + default: + break; + } + + if (wr->send_flags & IB_SEND_INLINE && num_sge) { + int uninitialized_var(sz); + + err = set_data_inl_seg(qp, wr, seg, &sz); + if (unlikely(err)) { + mlx5_ib_warn(dev, "\n"); + *bad_wr = wr; + goto out; + } + inl = 1; + size += sz; + } else { + dpseg = seg; + for (i = 0; i < num_sge; i++) { + if (unlikely(dpseg == qend)) { + seg = mlx5_get_send_wqe(qp, 0); + dpseg = seg; + } + if (likely(wr->sg_list[i].length)) { + set_data_ptr_seg(dpseg, wr->sg_list + i); + size += sizeof(struct mlx5_wqe_data_seg) / 16; + dpseg++; + } + } + } + + finish_wqe(qp, ctrl, size, idx, wr->wr_id, nreq, + get_fence(fence, wr), next_fence, + mlx5_ib_opcode[wr->opcode]); +skip_psv: + if (0) + dump_wqe(qp, idx, size); + } + +out: + if (likely(nreq)) { + qp->sq.head += nreq; + + /* Make sure that descriptors are written before + * updating doorbell record and ringing the doorbell + */ + wmb(); + + qp->db.db[MLX5_SND_DBR] = cpu_to_be32(qp->sq.cur_post); + + /* Make sure doorbell record is visible to the HCA before + 
* we hit doorbell */ + wmb(); + + if (bf->need_lock) + spin_lock(&bf->lock); + else + __acquire(&bf->lock); + + /* TBD enable WC */ + if (0 && nreq == 1 && bf->uuarn && inl && size > 1 && size <= bf->buf_size / 16) { + mlx5_bf_copy(bf->reg + bf->offset, (u64 *)ctrl, ALIGN(size * 16, 64), qp); + /* wc_wmb(); */ + } else { + mlx5_write64((__be32 *)ctrl, bf->regreg + bf->offset, + MLX5_GET_DOORBELL_LOCK(&bf->lock32)); + /* Make sure doorbells don't leak out of SQ spinlock + * and reach the HCA out of order. + */ + mmiowb(); + } + bf->offset ^= bf->buf_size; + if (bf->need_lock) + spin_unlock(&bf->lock); + else + __release(&bf->lock); + } + + spin_unlock_irqrestore(&qp->sq.lock, flags); + + return err; +} + +static void set_sig_seg(struct mlx5_rwqe_sig *sig, int size) +{ + sig->signature = calc_sig(sig, size); +} + +int mlx5_ib_post_recv(struct ib_qp *ibqp, struct ib_recv_wr *wr, + struct ib_recv_wr **bad_wr) +{ + struct mlx5_ib_qp *qp = to_mqp(ibqp); + struct mlx5_wqe_data_seg *scat; + struct mlx5_rwqe_sig *sig; + unsigned long flags; + int err = 0; + int nreq; + int ind; + int i; + + spin_lock_irqsave(&qp->rq.lock, flags); + + ind = qp->rq.head & (qp->rq.wqe_cnt - 1); + + for (nreq = 0; wr; nreq++, wr = wr->next) { + if (mlx5_wq_overflow(&qp->rq, nreq, qp->ibqp.recv_cq)) { + err = -ENOMEM; + *bad_wr = wr; + goto out; + } + + if (unlikely(wr->num_sge > qp->rq.max_gs)) { + err = -EINVAL; + *bad_wr = wr; + goto out; + } + + scat = get_recv_wqe(qp, ind); + if (qp->wq_sig) + scat++; + + for (i = 0; i < wr->num_sge; i++) + set_data_ptr_seg(scat + i, wr->sg_list + i); + + if (i < qp->rq.max_gs) { + scat[i].byte_count = 0; + scat[i].lkey = cpu_to_be32(MLX5_INVALID_LKEY); + scat[i].addr = 0; + } + + if (qp->wq_sig) { + sig = (struct mlx5_rwqe_sig *)scat; + set_sig_seg(sig, (qp->rq.max_gs + 1) << 2); + } + + qp->rq.wrid[ind] = wr->wr_id; + + ind = (ind + 1) & (qp->rq.wqe_cnt - 1); + } + +out: + if (likely(nreq)) { + qp->rq.head += nreq; + + /* Make sure that descriptors are written before + * doorbell record. 
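+ *
+ * A minimal sketch of the ordering requirement (illustrative only, with
+ * generic names -- not code taken from this driver):
+ *
+ *     build_wqes();           // plain stores into the RQ buffer
+ *     wmb();                  // order WQE stores before the record update
+ *     *db_record = new_head;  // producer index the HCA reads via DMA
+ *
+ * Without the barrier the device could observe the new head while the
+ * scatter entries it points at are still sitting in a store buffer.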
+ */ + wmb(); + + *qp->db.db = cpu_to_be32(qp->rq.head & 0xffff); + } + + spin_unlock_irqrestore(&qp->rq.lock, flags); + + return err; +} + +static inline enum ib_qp_state to_ib_qp_state(enum mlx5_qp_state mlx5_state) +{ + switch (mlx5_state) { + case MLX5_QP_STATE_RST: return IB_QPS_RESET; + case MLX5_QP_STATE_INIT: return IB_QPS_INIT; + case MLX5_QP_STATE_RTR: return IB_QPS_RTR; + case MLX5_QP_STATE_RTS: return IB_QPS_RTS; + case MLX5_QP_STATE_SQ_DRAINING: + case MLX5_QP_STATE_SQD: return IB_QPS_SQD; + case MLX5_QP_STATE_SQER: return IB_QPS_SQE; + case MLX5_QP_STATE_ERR: return IB_QPS_ERR; + default: return -1; + } +} + +static inline enum ib_mig_state to_ib_mig_state(int mlx5_mig_state) +{ + switch (mlx5_mig_state) { + case MLX5_QP_PM_ARMED: return IB_MIG_ARMED; + case MLX5_QP_PM_REARM: return IB_MIG_REARM; + case MLX5_QP_PM_MIGRATED: return IB_MIG_MIGRATED; + default: return -1; + } +} + +static int to_ib_qp_access_flags(int mlx5_flags) +{ + int ib_flags = 0; + + if (mlx5_flags & MLX5_QP_BIT_RRE) + ib_flags |= IB_ACCESS_REMOTE_READ; + if (mlx5_flags & MLX5_QP_BIT_RWE) + ib_flags |= IB_ACCESS_REMOTE_WRITE; + if (mlx5_flags & MLX5_QP_BIT_RAE) + ib_flags |= IB_ACCESS_REMOTE_ATOMIC; + + return ib_flags; +} + +static void to_ib_ah_attr(struct mlx5_ib_dev *ibdev, struct ib_ah_attr *ib_ah_attr, + struct mlx5_qp_path *path) +{ + struct mlx5_core_dev *dev = ibdev->mdev; + + memset(ib_ah_attr, 0, sizeof(*ib_ah_attr)); + ib_ah_attr->port_num = path->port; + + if (ib_ah_attr->port_num == 0 || + ib_ah_attr->port_num > dev->caps.gen.num_ports) + return; + + ib_ah_attr->sl = path->sl & 0xf; + + ib_ah_attr->dlid = be16_to_cpu(path->rlid); + ib_ah_attr->src_path_bits = path->grh_mlid & 0x7f; + ib_ah_attr->static_rate = path->static_rate ? path->static_rate - 5 : 0; + ib_ah_attr->ah_flags = (path->grh_mlid & (1 << 7)) ? IB_AH_GRH : 0; + if (ib_ah_attr->ah_flags) { + ib_ah_attr->grh.sgid_index = path->mgid_index; + ib_ah_attr->grh.hop_limit = path->hop_limit; + ib_ah_attr->grh.traffic_class = + (be32_to_cpu(path->tclass_flowlabel) >> 20) & 0xff; + ib_ah_attr->grh.flow_label = + be32_to_cpu(path->tclass_flowlabel) & 0xfffff; + memcpy(ib_ah_attr->grh.dgid.raw, + path->rgid, sizeof(ib_ah_attr->grh.dgid.raw)); + } +} + +int mlx5_ib_query_qp(struct ib_qp *ibqp, struct ib_qp_attr *qp_attr, int qp_attr_mask, + struct ib_qp_init_attr *qp_init_attr) +{ + struct mlx5_ib_dev *dev = to_mdev(ibqp->device); + struct mlx5_ib_qp *qp = to_mqp(ibqp); + struct mlx5_query_qp_mbox_out *outb; + struct mlx5_qp_context *context; + int mlx5_state; + int err = 0; + +#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING + /* + * Wait for any outstanding page faults, in case the user frees memory + * based upon this query's result. 
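+ *
+ * flush_workqueue() below drains every page-fault handler already queued
+ * on mlx5_ib_page_fault_wq, so by the time the caller sees the QP
+ * attributes no in-flight ODP fault is still resolving pages for buffers
+ * the application might now release.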
+ */ + flush_workqueue(mlx5_ib_page_fault_wq); +#endif + + mutex_lock(&qp->mutex); + outb = kzalloc(sizeof(*outb), GFP_KERNEL); + if (!outb) { + err = -ENOMEM; + goto out; + } + context = &outb->ctx; + err = mlx5_core_qp_query(dev->mdev, &qp->mqp, outb, sizeof(*outb)); + if (err) + goto out_free; + + mlx5_state = be32_to_cpu(context->flags) >> 28; + + qp->state = to_ib_qp_state(mlx5_state); + qp_attr->qp_state = qp->state; + qp_attr->path_mtu = context->mtu_msgmax >> 5; + qp_attr->path_mig_state = + to_ib_mig_state((be32_to_cpu(context->flags) >> 11) & 0x3); + qp_attr->qkey = be32_to_cpu(context->qkey); + qp_attr->rq_psn = be32_to_cpu(context->rnr_nextrecvpsn) & 0xffffff; + qp_attr->sq_psn = be32_to_cpu(context->next_send_psn) & 0xffffff; + qp_attr->dest_qp_num = be32_to_cpu(context->log_pg_sz_remote_qpn) & 0xffffff; + qp_attr->qp_access_flags = + to_ib_qp_access_flags(be32_to_cpu(context->params2)); + + if (qp->ibqp.qp_type == IB_QPT_RC || qp->ibqp.qp_type == IB_QPT_UC) { + to_ib_ah_attr(dev, &qp_attr->ah_attr, &context->pri_path); + to_ib_ah_attr(dev, &qp_attr->alt_ah_attr, &context->alt_path); + qp_attr->alt_pkey_index = context->alt_path.pkey_index & 0x7f; + qp_attr->alt_port_num = qp_attr->alt_ah_attr.port_num; + } + + qp_attr->pkey_index = context->pri_path.pkey_index & 0x7f; + qp_attr->port_num = context->pri_path.port; + + /* qp_attr->en_sqd_async_notify is only applicable in modify qp */ + qp_attr->sq_draining = mlx5_state == MLX5_QP_STATE_SQ_DRAINING; + + qp_attr->max_rd_atomic = 1 << ((be32_to_cpu(context->params1) >> 21) & 0x7); + + qp_attr->max_dest_rd_atomic = + 1 << ((be32_to_cpu(context->params2) >> 21) & 0x7); + qp_attr->min_rnr_timer = + (be32_to_cpu(context->rnr_nextrecvpsn) >> 24) & 0x1f; + qp_attr->timeout = context->pri_path.ackto_lt >> 3; + qp_attr->retry_cnt = (be32_to_cpu(context->params1) >> 16) & 0x7; + qp_attr->rnr_retry = (be32_to_cpu(context->params1) >> 13) & 0x7; + qp_attr->alt_timeout = context->alt_path.ackto_lt >> 3; + qp_attr->cur_qp_state = qp_attr->qp_state; + qp_attr->cap.max_recv_wr = qp->rq.wqe_cnt; + qp_attr->cap.max_recv_sge = qp->rq.max_gs; + + if (!ibqp->uobject) { + qp_attr->cap.max_send_wr = qp->sq.wqe_cnt; + qp_attr->cap.max_send_sge = qp->sq.max_gs; + } else { + qp_attr->cap.max_send_wr = 0; + qp_attr->cap.max_send_sge = 0; + } + + /* We don't support inline sends for kernel QPs (yet), and we + * don't know what userspace's value should be. + */ + qp_attr->cap.max_inline_data = 0; + + qp_init_attr->cap = qp_attr->cap; + + qp_init_attr->create_flags = 0; + if (qp->flags & MLX5_IB_QP_BLOCK_MULTICAST_LOOPBACK) + qp_init_attr->create_flags |= IB_QP_CREATE_BLOCK_MULTICAST_LOOPBACK; + + qp_init_attr->sq_sig_type = qp->sq_signal_bits & MLX5_WQE_CTRL_CQ_UPDATE ? 
+ IB_SIGNAL_ALL_WR : IB_SIGNAL_REQ_WR; + +out_free: + kfree(outb); + +out: + mutex_unlock(&qp->mutex); + return err; +} + +struct ib_xrcd *mlx5_ib_alloc_xrcd(struct ib_device *ibdev, + struct ib_ucontext *context, + struct ib_udata *udata) +{ + struct mlx5_ib_dev *dev = to_mdev(ibdev); + struct mlx5_general_caps *gen; + struct mlx5_ib_xrcd *xrcd; + int err; + + gen = &dev->mdev->caps.gen; + if (!(gen->flags & MLX5_DEV_CAP_FLAG_XRC)) + return ERR_PTR(-ENOSYS); + + xrcd = kmalloc(sizeof(*xrcd), GFP_KERNEL); + if (!xrcd) + return ERR_PTR(-ENOMEM); + + err = mlx5_core_xrcd_alloc(dev->mdev, &xrcd->xrcdn); + if (err) { + kfree(xrcd); + return ERR_PTR(-ENOMEM); + } + + return &xrcd->ibxrcd; +} + +int mlx5_ib_dealloc_xrcd(struct ib_xrcd *xrcd) +{ + struct mlx5_ib_dev *dev = to_mdev(xrcd->device); + u32 xrcdn = to_mxrcd(xrcd)->xrcdn; + int err; + + err = mlx5_core_xrcd_dealloc(dev->mdev, xrcdn); + if (err) { + mlx5_ib_warn(dev, "failed to dealloc xrcdn 0x%x\n", xrcdn); + return err; + } + + kfree(xrcd); + + return 0; +} diff --git a/kernel/drivers/infiniband/hw/mlx5/srq.c b/kernel/drivers/infiniband/hw/mlx5/srq.c new file mode 100644 index 000000000..02d77a297 --- /dev/null +++ b/kernel/drivers/infiniband/hw/mlx5/srq.c @@ -0,0 +1,485 @@ +/* + * Copyright (c) 2013-2015, Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include +#include +#include +#include +#include +#include + +#include "mlx5_ib.h" +#include "user.h" + +/* not supported currently */ +static int srq_signature; + +static void *get_wqe(struct mlx5_ib_srq *srq, int n) +{ + return mlx5_buf_offset(&srq->buf, n << srq->msrq.wqe_shift); +} + +static void mlx5_ib_srq_event(struct mlx5_core_srq *srq, enum mlx5_event type) +{ + struct ib_event event; + struct ib_srq *ibsrq = &to_mibsrq(srq)->ibsrq; + + if (ibsrq->event_handler) { + event.device = ibsrq->device; + event.element.srq = ibsrq; + switch (type) { + case MLX5_EVENT_TYPE_SRQ_RQ_LIMIT: + event.event = IB_EVENT_SRQ_LIMIT_REACHED; + break; + case MLX5_EVENT_TYPE_SRQ_CATAS_ERROR: + event.event = IB_EVENT_SRQ_ERR; + break; + default: + pr_warn("mlx5_ib: Unexpected event type %d on SRQ %06x\n", + type, srq->srqn); + return; + } + + ibsrq->event_handler(&event, ibsrq->srq_context); + } +} + +static int create_srq_user(struct ib_pd *pd, struct mlx5_ib_srq *srq, + struct mlx5_create_srq_mbox_in **in, + struct ib_udata *udata, int buf_size, int *inlen) +{ + struct mlx5_ib_dev *dev = to_mdev(pd->device); + struct mlx5_ib_create_srq ucmd; + size_t ucmdlen; + int err; + int npages; + int page_shift; + int ncont; + u32 offset; + + ucmdlen = + (udata->inlen - sizeof(struct ib_uverbs_cmd_hdr) < + sizeof(ucmd)) ? (sizeof(ucmd) - + sizeof(ucmd.reserved)) : sizeof(ucmd); + + if (ib_copy_from_udata(&ucmd, udata, ucmdlen)) { + mlx5_ib_dbg(dev, "failed copy udata\n"); + return -EFAULT; + } + + if (ucmdlen == sizeof(ucmd) && + ucmd.reserved != 0) + return -EINVAL; + + srq->wq_sig = !!(ucmd.flags & MLX5_SRQ_FLAG_SIGNATURE); + + srq->umem = ib_umem_get(pd->uobject->context, ucmd.buf_addr, buf_size, + 0, 0); + if (IS_ERR(srq->umem)) { + mlx5_ib_dbg(dev, "failed umem get, size %d\n", buf_size); + err = PTR_ERR(srq->umem); + return err; + } + + mlx5_ib_cont_pages(srq->umem, ucmd.buf_addr, &npages, + &page_shift, &ncont, NULL); + err = mlx5_ib_get_buf_offset(ucmd.buf_addr, page_shift, + &offset); + if (err) { + mlx5_ib_warn(dev, "bad offset\n"); + goto err_umem; + } + + *inlen = sizeof(**in) + sizeof(*(*in)->pas) * ncont; + *in = mlx5_vzalloc(*inlen); + if (!(*in)) { + err = -ENOMEM; + goto err_umem; + } + + mlx5_ib_populate_pas(dev, srq->umem, page_shift, (*in)->pas, 0); + + err = mlx5_ib_db_map_user(to_mucontext(pd->uobject->context), + ucmd.db_addr, &srq->db); + if (err) { + mlx5_ib_dbg(dev, "map doorbell failed\n"); + goto err_in; + } + + (*in)->ctx.log_pg_sz = page_shift - MLX5_ADAPTER_PAGE_SHIFT; + (*in)->ctx.pgoff_cqn = cpu_to_be32(offset << 26); + + return 0; + +err_in: + kvfree(*in); + +err_umem: + ib_umem_release(srq->umem); + + return err; +} + +static int create_srq_kernel(struct mlx5_ib_dev *dev, struct mlx5_ib_srq *srq, + struct mlx5_create_srq_mbox_in **in, int buf_size, + int *inlen) +{ + int err; + int i; + struct mlx5_wqe_srq_next_seg *next; + int page_shift; + int npages; + + err = mlx5_db_alloc(dev->mdev, &srq->db); + if (err) { + mlx5_ib_warn(dev, "alloc dbell rec failed\n"); + return err; + } + + if (mlx5_buf_alloc(dev->mdev, buf_size, PAGE_SIZE * 2, &srq->buf)) { + mlx5_ib_dbg(dev, "buf alloc failed\n"); + err = -ENOMEM; + goto err_db; + } + page_shift = srq->buf.page_shift; + + srq->head = 0; + srq->tail = srq->msrq.max - 1; + srq->wqe_ctr = 0; + + for (i = 0; i < srq->msrq.max; i++) { + next = get_wqe(srq, i); + next->next_wqe_index = + cpu_to_be16((i + 1) & (srq->msrq.max - 1)); + } + + npages = DIV_ROUND_UP(srq->buf.npages, 1 << (page_shift - PAGE_SHIFT)); + mlx5_ib_dbg(dev, 
"buf_size %d, page_shift %d, npages %d, calc npages %d\n", + buf_size, page_shift, srq->buf.npages, npages); + *inlen = sizeof(**in) + sizeof(*(*in)->pas) * npages; + *in = mlx5_vzalloc(*inlen); + if (!*in) { + err = -ENOMEM; + goto err_buf; + } + mlx5_fill_page_array(&srq->buf, (*in)->pas); + + srq->wrid = kmalloc(srq->msrq.max * sizeof(u64), GFP_KERNEL); + if (!srq->wrid) { + mlx5_ib_dbg(dev, "kmalloc failed %lu\n", + (unsigned long)(srq->msrq.max * sizeof(u64))); + err = -ENOMEM; + goto err_in; + } + srq->wq_sig = !!srq_signature; + + (*in)->ctx.log_pg_sz = page_shift - MLX5_ADAPTER_PAGE_SHIFT; + + return 0; + +err_in: + kvfree(*in); + +err_buf: + mlx5_buf_free(dev->mdev, &srq->buf); + +err_db: + mlx5_db_free(dev->mdev, &srq->db); + return err; +} + +static void destroy_srq_user(struct ib_pd *pd, struct mlx5_ib_srq *srq) +{ + mlx5_ib_db_unmap_user(to_mucontext(pd->uobject->context), &srq->db); + ib_umem_release(srq->umem); +} + + +static void destroy_srq_kernel(struct mlx5_ib_dev *dev, struct mlx5_ib_srq *srq) +{ + kfree(srq->wrid); + mlx5_buf_free(dev->mdev, &srq->buf); + mlx5_db_free(dev->mdev, &srq->db); +} + +struct ib_srq *mlx5_ib_create_srq(struct ib_pd *pd, + struct ib_srq_init_attr *init_attr, + struct ib_udata *udata) +{ + struct mlx5_ib_dev *dev = to_mdev(pd->device); + struct mlx5_general_caps *gen; + struct mlx5_ib_srq *srq; + int desc_size; + int buf_size; + int err; + struct mlx5_create_srq_mbox_in *uninitialized_var(in); + int uninitialized_var(inlen); + int is_xrc; + u32 flgs, xrcdn; + + gen = &dev->mdev->caps.gen; + /* Sanity check SRQ size before proceeding */ + if (init_attr->attr.max_wr >= gen->max_srq_wqes) { + mlx5_ib_dbg(dev, "max_wr %d, cap %d\n", + init_attr->attr.max_wr, + gen->max_srq_wqes); + return ERR_PTR(-EINVAL); + } + + srq = kmalloc(sizeof(*srq), GFP_KERNEL); + if (!srq) + return ERR_PTR(-ENOMEM); + + mutex_init(&srq->mutex); + spin_lock_init(&srq->lock); + srq->msrq.max = roundup_pow_of_two(init_attr->attr.max_wr + 1); + srq->msrq.max_gs = init_attr->attr.max_sge; + + desc_size = sizeof(struct mlx5_wqe_srq_next_seg) + + srq->msrq.max_gs * sizeof(struct mlx5_wqe_data_seg); + desc_size = roundup_pow_of_two(desc_size); + desc_size = max_t(int, 32, desc_size); + srq->msrq.max_avail_gather = (desc_size - sizeof(struct mlx5_wqe_srq_next_seg)) / + sizeof(struct mlx5_wqe_data_seg); + srq->msrq.wqe_shift = ilog2(desc_size); + buf_size = srq->msrq.max * desc_size; + mlx5_ib_dbg(dev, "desc_size 0x%x, req wr 0x%x, srq size 0x%x, max_gs 0x%x, max_avail_gather 0x%x\n", + desc_size, init_attr->attr.max_wr, srq->msrq.max, srq->msrq.max_gs, + srq->msrq.max_avail_gather); + + if (pd->uobject) + err = create_srq_user(pd, srq, &in, udata, buf_size, &inlen); + else + err = create_srq_kernel(dev, srq, &in, buf_size, &inlen); + + if (err) { + mlx5_ib_warn(dev, "create srq %s failed, err %d\n", + pd->uobject ? 
"user" : "kernel", err); + goto err_srq; + } + + is_xrc = (init_attr->srq_type == IB_SRQT_XRC); + in->ctx.state_log_sz = ilog2(srq->msrq.max); + flgs = ((srq->msrq.wqe_shift - 4) | (is_xrc << 5) | (srq->wq_sig << 7)) << 24; + xrcdn = 0; + if (is_xrc) { + xrcdn = to_mxrcd(init_attr->ext.xrc.xrcd)->xrcdn; + in->ctx.pgoff_cqn |= cpu_to_be32(to_mcq(init_attr->ext.xrc.cq)->mcq.cqn); + } else if (init_attr->srq_type == IB_SRQT_BASIC) { + xrcdn = to_mxrcd(dev->devr.x0)->xrcdn; + in->ctx.pgoff_cqn |= cpu_to_be32(to_mcq(dev->devr.c0)->mcq.cqn); + } + + in->ctx.flags_xrcd = cpu_to_be32((flgs & 0xFF000000) | (xrcdn & 0xFFFFFF)); + + in->ctx.pd = cpu_to_be32(to_mpd(pd)->pdn); + in->ctx.db_record = cpu_to_be64(srq->db.dma); + err = mlx5_core_create_srq(dev->mdev, &srq->msrq, in, inlen); + kvfree(in); + if (err) { + mlx5_ib_dbg(dev, "create SRQ failed, err %d\n", err); + goto err_usr_kern_srq; + } + + mlx5_ib_dbg(dev, "create SRQ with srqn 0x%x\n", srq->msrq.srqn); + + srq->msrq.event = mlx5_ib_srq_event; + srq->ibsrq.ext.xrc.srq_num = srq->msrq.srqn; + + if (pd->uobject) + if (ib_copy_to_udata(udata, &srq->msrq.srqn, sizeof(__u32))) { + mlx5_ib_dbg(dev, "copy to user failed\n"); + err = -EFAULT; + goto err_core; + } + + init_attr->attr.max_wr = srq->msrq.max - 1; + + return &srq->ibsrq; + +err_core: + mlx5_core_destroy_srq(dev->mdev, &srq->msrq); + +err_usr_kern_srq: + if (pd->uobject) + destroy_srq_user(pd, srq); + else + destroy_srq_kernel(dev, srq); + +err_srq: + kfree(srq); + + return ERR_PTR(err); +} + +int mlx5_ib_modify_srq(struct ib_srq *ibsrq, struct ib_srq_attr *attr, + enum ib_srq_attr_mask attr_mask, struct ib_udata *udata) +{ + struct mlx5_ib_dev *dev = to_mdev(ibsrq->device); + struct mlx5_ib_srq *srq = to_msrq(ibsrq); + int ret; + + /* We don't support resizing SRQs yet */ + if (attr_mask & IB_SRQ_MAX_WR) + return -EINVAL; + + if (attr_mask & IB_SRQ_LIMIT) { + if (attr->srq_limit >= srq->msrq.max) + return -EINVAL; + + mutex_lock(&srq->mutex); + ret = mlx5_core_arm_srq(dev->mdev, &srq->msrq, attr->srq_limit, 1); + mutex_unlock(&srq->mutex); + + if (ret) + return ret; + } + + return 0; +} + +int mlx5_ib_query_srq(struct ib_srq *ibsrq, struct ib_srq_attr *srq_attr) +{ + struct mlx5_ib_dev *dev = to_mdev(ibsrq->device); + struct mlx5_ib_srq *srq = to_msrq(ibsrq); + int ret; + struct mlx5_query_srq_mbox_out *out; + + out = kzalloc(sizeof(*out), GFP_KERNEL); + if (!out) + return -ENOMEM; + + ret = mlx5_core_query_srq(dev->mdev, &srq->msrq, out); + if (ret) + goto out_box; + + srq_attr->srq_limit = be16_to_cpu(out->ctx.lwm); + srq_attr->max_wr = srq->msrq.max - 1; + srq_attr->max_sge = srq->msrq.max_gs; + +out_box: + kfree(out); + return ret; +} + +int mlx5_ib_destroy_srq(struct ib_srq *srq) +{ + struct mlx5_ib_dev *dev = to_mdev(srq->device); + struct mlx5_ib_srq *msrq = to_msrq(srq); + + mlx5_core_destroy_srq(dev->mdev, &msrq->msrq); + + if (srq->uobject) { + mlx5_ib_db_unmap_user(to_mucontext(srq->uobject->context), &msrq->db); + ib_umem_release(msrq->umem); + } else { + destroy_srq_kernel(dev, msrq); + } + + kfree(srq); + return 0; +} + +void mlx5_ib_free_srq_wqe(struct mlx5_ib_srq *srq, int wqe_index) +{ + struct mlx5_wqe_srq_next_seg *next; + + /* always called with interrupts disabled. 
*/ + spin_lock(&srq->lock); + + next = get_wqe(srq, srq->tail); + next->next_wqe_index = cpu_to_be16(wqe_index); + srq->tail = wqe_index; + + spin_unlock(&srq->lock); +} + +int mlx5_ib_post_srq_recv(struct ib_srq *ibsrq, struct ib_recv_wr *wr, + struct ib_recv_wr **bad_wr) +{ + struct mlx5_ib_srq *srq = to_msrq(ibsrq); + struct mlx5_wqe_srq_next_seg *next; + struct mlx5_wqe_data_seg *scat; + unsigned long flags; + int err = 0; + int nreq; + int i; + + spin_lock_irqsave(&srq->lock, flags); + + for (nreq = 0; wr; nreq++, wr = wr->next) { + if (unlikely(wr->num_sge > srq->msrq.max_gs)) { + err = -EINVAL; + *bad_wr = wr; + break; + } + + if (unlikely(srq->head == srq->tail)) { + err = -ENOMEM; + *bad_wr = wr; + break; + } + + srq->wrid[srq->head] = wr->wr_id; + + next = get_wqe(srq, srq->head); + srq->head = be16_to_cpu(next->next_wqe_index); + scat = (struct mlx5_wqe_data_seg *)(next + 1); + + for (i = 0; i < wr->num_sge; i++) { + scat[i].byte_count = cpu_to_be32(wr->sg_list[i].length); + scat[i].lkey = cpu_to_be32(wr->sg_list[i].lkey); + scat[i].addr = cpu_to_be64(wr->sg_list[i].addr); + } + + if (i < srq->msrq.max_avail_gather) { + scat[i].byte_count = 0; + scat[i].lkey = cpu_to_be32(MLX5_INVALID_LKEY); + scat[i].addr = 0; + } + } + + if (likely(nreq)) { + srq->wqe_ctr += nreq; + + /* Make sure that descriptors are written before + * doorbell record. + */ + wmb(); + + *srq->db.db = cpu_to_be32(srq->wqe_ctr); + } + + spin_unlock_irqrestore(&srq->lock, flags); + + return err; +} diff --git a/kernel/drivers/infiniband/hw/mlx5/user.h b/kernel/drivers/infiniband/hw/mlx5/user.h new file mode 100644 index 000000000..76fb7b927 --- /dev/null +++ b/kernel/drivers/infiniband/hw/mlx5/user.h @@ -0,0 +1,133 @@ +/* + * Copyright (c) 2013-2015, Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef MLX5_IB_USER_H +#define MLX5_IB_USER_H + +#include + +enum { + MLX5_QP_FLAG_SIGNATURE = 1 << 0, + MLX5_QP_FLAG_SCATTER_CQE = 1 << 1, +}; + +enum { + MLX5_SRQ_FLAG_SIGNATURE = 1 << 0, +}; + + +/* Increment this value if any changes that break userspace ABI + * compatibility are made. 
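+ * "Break" here means any layout change an existing userspace provider
+ * could misparse -- e.g. resizing, reordering or removing a field in one
+ * of the structs below.  Purely additive extensions that userspace opts
+ * into via the udata length checks generally do not need a bump (an
+ * assumption about the usual uverbs convention, not something this
+ * header itself states).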
+ */ +#define MLX5_IB_UVERBS_ABI_VERSION 1 + +/* Make sure that all structs defined in this file remain laid out so + * that they pack the same way on 32-bit and 64-bit architectures (to + * avoid incompatibility between 32-bit userspace and 64-bit kernels). + * In particular do not use pointer types -- pass pointers in __u64 + * instead. + */ + +struct mlx5_ib_alloc_ucontext_req { + __u32 total_num_uuars; + __u32 num_low_latency_uuars; +}; + +struct mlx5_ib_alloc_ucontext_req_v2 { + __u32 total_num_uuars; + __u32 num_low_latency_uuars; + __u32 flags; + __u32 reserved; +}; + +struct mlx5_ib_alloc_ucontext_resp { + __u32 qp_tab_size; + __u32 bf_reg_size; + __u32 tot_uuars; + __u32 cache_line_size; + __u16 max_sq_desc_sz; + __u16 max_rq_desc_sz; + __u32 max_send_wqebb; + __u32 max_recv_wr; + __u32 max_srq_recv_wr; + __u16 num_ports; + __u16 reserved; +}; + +struct mlx5_ib_alloc_pd_resp { + __u32 pdn; +}; + +struct mlx5_ib_create_cq { + __u64 buf_addr; + __u64 db_addr; + __u32 cqe_size; + __u32 reserved; /* explicit padding (optional on i386) */ +}; + +struct mlx5_ib_create_cq_resp { + __u32 cqn; + __u32 reserved; +}; + +struct mlx5_ib_resize_cq { + __u64 buf_addr; + __u16 cqe_size; + __u16 reserved0; + __u32 reserved1; +}; + +struct mlx5_ib_create_srq { + __u64 buf_addr; + __u64 db_addr; + __u32 flags; + __u32 reserved; /* explicit padding (optional on i386) */ +}; + +struct mlx5_ib_create_srq_resp { + __u32 srqn; + __u32 reserved; +}; + +struct mlx5_ib_create_qp { + __u64 buf_addr; + __u64 db_addr; + __u32 sq_wqe_count; + __u32 rq_wqe_count; + __u32 rq_wqe_shift; + __u32 flags; +}; + +struct mlx5_ib_create_qp_resp { + __u32 uuar_index; +}; +#endif /* MLX5_IB_USER_H */ diff --git a/kernel/drivers/infiniband/hw/mthca/Kconfig b/kernel/drivers/infiniband/hw/mthca/Kconfig new file mode 100644 index 000000000..da314c3fe --- /dev/null +++ b/kernel/drivers/infiniband/hw/mthca/Kconfig @@ -0,0 +1,17 @@ +config INFINIBAND_MTHCA + tristate "Mellanox HCA support" + depends on PCI + ---help--- + This is a low-level driver for Mellanox InfiniHost host + channel adapters (HCAs), including the MT23108 PCI-X HCA + ("Tavor") and the MT25208 PCI Express HCA ("Arbel"). + +config INFINIBAND_MTHCA_DEBUG + bool "Verbose debugging output" if EXPERT + depends on INFINIBAND_MTHCA + default y + ---help--- + This option causes debugging code to be compiled into the + mthca driver. The output can be turned on via the + debug_level module parameter (which can also be set after + the driver is loaded through sysfs). diff --git a/kernel/drivers/infiniband/hw/mthca/Makefile b/kernel/drivers/infiniband/hw/mthca/Makefile new file mode 100644 index 000000000..e388d95d0 --- /dev/null +++ b/kernel/drivers/infiniband/hw/mthca/Makefile @@ -0,0 +1,7 @@ +obj-$(CONFIG_INFINIBAND_MTHCA) += ib_mthca.o + +ib_mthca-y := mthca_main.o mthca_cmd.o mthca_profile.o mthca_reset.o \ + mthca_allocator.o mthca_eq.o mthca_pd.o mthca_cq.o \ + mthca_mr.o mthca_qp.o mthca_av.o mthca_mcg.o mthca_mad.o \ + mthca_provider.o mthca_memfree.o mthca_uar.o mthca_srq.o \ + mthca_catas.o diff --git a/kernel/drivers/infiniband/hw/mthca/mthca_allocator.c b/kernel/drivers/infiniband/hw/mthca/mthca_allocator.c new file mode 100644 index 000000000..b4e0cf4e9 --- /dev/null +++ b/kernel/drivers/infiniband/hw/mthca/mthca_allocator.c @@ -0,0 +1,301 @@ +/* + * Copyright (c) 2004 Topspin Communications. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. 
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include +#include +#include + +#include "mthca_dev.h" + +/* Trivial bitmap-based allocator */ +u32 mthca_alloc(struct mthca_alloc *alloc) +{ + unsigned long flags; + u32 obj; + + spin_lock_irqsave(&alloc->lock, flags); + + obj = find_next_zero_bit(alloc->table, alloc->max, alloc->last); + if (obj >= alloc->max) { + alloc->top = (alloc->top + alloc->max) & alloc->mask; + obj = find_first_zero_bit(alloc->table, alloc->max); + } + + if (obj < alloc->max) { + set_bit(obj, alloc->table); + obj |= alloc->top; + } else + obj = -1; + + spin_unlock_irqrestore(&alloc->lock, flags); + + return obj; +} + +void mthca_free(struct mthca_alloc *alloc, u32 obj) +{ + unsigned long flags; + + obj &= alloc->max - 1; + + spin_lock_irqsave(&alloc->lock, flags); + + clear_bit(obj, alloc->table); + alloc->last = min(alloc->last, obj); + alloc->top = (alloc->top + alloc->max) & alloc->mask; + + spin_unlock_irqrestore(&alloc->lock, flags); +} + +int mthca_alloc_init(struct mthca_alloc *alloc, u32 num, u32 mask, + u32 reserved) +{ + int i; + + /* num must be a power of 2 */ + if (num != 1 << (ffs(num) - 1)) + return -EINVAL; + + alloc->last = 0; + alloc->top = 0; + alloc->max = num; + alloc->mask = mask; + spin_lock_init(&alloc->lock); + alloc->table = kmalloc(BITS_TO_LONGS(num) * sizeof (long), + GFP_KERNEL); + if (!alloc->table) + return -ENOMEM; + + bitmap_zero(alloc->table, num); + for (i = 0; i < reserved; ++i) + set_bit(i, alloc->table); + + return 0; +} + +void mthca_alloc_cleanup(struct mthca_alloc *alloc) +{ + kfree(alloc->table); +} + +/* + * Array of pointers with lazy allocation of leaf pages. Callers of + * _get, _set and _clear methods must use a lock or otherwise + * serialize access to the array. + */ + +#define MTHCA_ARRAY_MASK (PAGE_SIZE / sizeof (void *) - 1) + +void *mthca_array_get(struct mthca_array *array, int index) +{ + int p = (index * sizeof (void *)) >> PAGE_SHIFT; + + if (array->page_list[p].page) + return array->page_list[p].page[index & MTHCA_ARRAY_MASK]; + else + return NULL; +} + +int mthca_array_set(struct mthca_array *array, int index, void *value) +{ + int p = (index * sizeof (void *)) >> PAGE_SHIFT; + + /* Allocate with GFP_ATOMIC because we'll be called with locks held. 
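+ * The leaf page backing this index is allocated lazily on first use and
+ * freed again in mthca_array_clear() once its use count drops to zero.
+ * GFP_ATOMIC cannot sleep, so under memory pressure the allocation may
+ * fail and the caller simply sees -ENOMEM.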
*/ + if (!array->page_list[p].page) + array->page_list[p].page = (void **) get_zeroed_page(GFP_ATOMIC); + + if (!array->page_list[p].page) + return -ENOMEM; + + array->page_list[p].page[index & MTHCA_ARRAY_MASK] = value; + ++array->page_list[p].used; + + return 0; +} + +void mthca_array_clear(struct mthca_array *array, int index) +{ + int p = (index * sizeof (void *)) >> PAGE_SHIFT; + + if (--array->page_list[p].used == 0) { + free_page((unsigned long) array->page_list[p].page); + array->page_list[p].page = NULL; + } else + array->page_list[p].page[index & MTHCA_ARRAY_MASK] = NULL; + + if (array->page_list[p].used < 0) + pr_debug("Array %p index %d page %d with ref count %d < 0\n", + array, index, p, array->page_list[p].used); +} + +int mthca_array_init(struct mthca_array *array, int nent) +{ + int npage = (nent * sizeof (void *) + PAGE_SIZE - 1) / PAGE_SIZE; + int i; + + array->page_list = kmalloc(npage * sizeof *array->page_list, GFP_KERNEL); + if (!array->page_list) + return -ENOMEM; + + for (i = 0; i < npage; ++i) { + array->page_list[i].page = NULL; + array->page_list[i].used = 0; + } + + return 0; +} + +void mthca_array_cleanup(struct mthca_array *array, int nent) +{ + int i; + + for (i = 0; i < (nent * sizeof (void *) + PAGE_SIZE - 1) / PAGE_SIZE; ++i) + free_page((unsigned long) array->page_list[i].page); + + kfree(array->page_list); +} + +/* + * Handling for queue buffers -- we allocate a bunch of memory and + * register it in a memory region at HCA virtual address 0. If the + * requested size is > max_direct, we split the allocation into + * multiple pages, so we don't require too much contiguous memory. + */ + +int mthca_buf_alloc(struct mthca_dev *dev, int size, int max_direct, + union mthca_buf *buf, int *is_direct, struct mthca_pd *pd, + int hca_write, struct mthca_mr *mr) +{ + int err = -ENOMEM; + int npages, shift; + u64 *dma_list = NULL; + dma_addr_t t; + int i; + + if (size <= max_direct) { + *is_direct = 1; + npages = 1; + shift = get_order(size) + PAGE_SHIFT; + + buf->direct.buf = dma_alloc_coherent(&dev->pdev->dev, + size, &t, GFP_KERNEL); + if (!buf->direct.buf) + return -ENOMEM; + + dma_unmap_addr_set(&buf->direct, mapping, t); + + memset(buf->direct.buf, 0, size); + + while (t & ((1 << shift) - 1)) { + --shift; + npages *= 2; + } + + dma_list = kmalloc(npages * sizeof *dma_list, GFP_KERNEL); + if (!dma_list) + goto err_free; + + for (i = 0; i < npages; ++i) + dma_list[i] = t + i * (1 << shift); + } else { + *is_direct = 0; + npages = (size + PAGE_SIZE - 1) / PAGE_SIZE; + shift = PAGE_SHIFT; + + dma_list = kmalloc(npages * sizeof *dma_list, GFP_KERNEL); + if (!dma_list) + return -ENOMEM; + + buf->page_list = kmalloc(npages * sizeof *buf->page_list, + GFP_KERNEL); + if (!buf->page_list) + goto err_out; + + for (i = 0; i < npages; ++i) + buf->page_list[i].buf = NULL; + + for (i = 0; i < npages; ++i) { + buf->page_list[i].buf = + dma_alloc_coherent(&dev->pdev->dev, PAGE_SIZE, + &t, GFP_KERNEL); + if (!buf->page_list[i].buf) + goto err_free; + + dma_list[i] = t; + dma_unmap_addr_set(&buf->page_list[i], mapping, t); + + clear_page(buf->page_list[i].buf); + } + } + + err = mthca_mr_alloc_phys(dev, pd->pd_num, + dma_list, shift, npages, + 0, size, + MTHCA_MPT_FLAG_LOCAL_READ | + (hca_write ? 
MTHCA_MPT_FLAG_LOCAL_WRITE : 0), + mr); + if (err) + goto err_free; + + kfree(dma_list); + + return 0; + +err_free: + mthca_buf_free(dev, size, buf, *is_direct, NULL); + +err_out: + kfree(dma_list); + + return err; +} + +void mthca_buf_free(struct mthca_dev *dev, int size, union mthca_buf *buf, + int is_direct, struct mthca_mr *mr) +{ + int i; + + if (mr) + mthca_free_mr(dev, mr); + + if (is_direct) + dma_free_coherent(&dev->pdev->dev, size, buf->direct.buf, + dma_unmap_addr(&buf->direct, mapping)); + else { + for (i = 0; i < (size + PAGE_SIZE - 1) / PAGE_SIZE; ++i) + dma_free_coherent(&dev->pdev->dev, PAGE_SIZE, + buf->page_list[i].buf, + dma_unmap_addr(&buf->page_list[i], + mapping)); + kfree(buf->page_list); + } +} diff --git a/kernel/drivers/infiniband/hw/mthca/mthca_av.c b/kernel/drivers/infiniband/hw/mthca/mthca_av.c new file mode 100644 index 000000000..32f6c6315 --- /dev/null +++ b/kernel/drivers/infiniband/hw/mthca/mthca_av.c @@ -0,0 +1,374 @@ +/* + * Copyright (c) 2004 Topspin Communications. All rights reserved. + * Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include +#include + +#include +#include + +#include "mthca_dev.h" + +enum { + MTHCA_RATE_TAVOR_FULL = 0, + MTHCA_RATE_TAVOR_1X = 1, + MTHCA_RATE_TAVOR_4X = 2, + MTHCA_RATE_TAVOR_1X_DDR = 3 +}; + +enum { + MTHCA_RATE_MEMFREE_FULL = 0, + MTHCA_RATE_MEMFREE_QUARTER = 1, + MTHCA_RATE_MEMFREE_EIGHTH = 2, + MTHCA_RATE_MEMFREE_HALF = 3 +}; + +struct mthca_av { + __be32 port_pd; + u8 reserved1; + u8 g_slid; + __be16 dlid; + u8 reserved2; + u8 gid_index; + u8 msg_sr; + u8 hop_limit; + __be32 sl_tclass_flowlabel; + __be32 dgid[4]; +}; + +static enum ib_rate memfree_rate_to_ib(u8 mthca_rate, u8 port_rate) +{ + switch (mthca_rate) { + case MTHCA_RATE_MEMFREE_EIGHTH: + return mult_to_ib_rate(port_rate >> 3); + case MTHCA_RATE_MEMFREE_QUARTER: + return mult_to_ib_rate(port_rate >> 2); + case MTHCA_RATE_MEMFREE_HALF: + return mult_to_ib_rate(port_rate >> 1); + case MTHCA_RATE_MEMFREE_FULL: + default: + return mult_to_ib_rate(port_rate); + } +} + +static enum ib_rate tavor_rate_to_ib(u8 mthca_rate, u8 port_rate) +{ + switch (mthca_rate) { + case MTHCA_RATE_TAVOR_1X: return IB_RATE_2_5_GBPS; + case MTHCA_RATE_TAVOR_1X_DDR: return IB_RATE_5_GBPS; + case MTHCA_RATE_TAVOR_4X: return IB_RATE_10_GBPS; + default: return mult_to_ib_rate(port_rate); + } +} + +enum ib_rate mthca_rate_to_ib(struct mthca_dev *dev, u8 mthca_rate, u8 port) +{ + if (mthca_is_memfree(dev)) { + /* Handle old Arbel FW */ + if (dev->limits.stat_rate_support == 0x3 && mthca_rate) + return IB_RATE_2_5_GBPS; + + return memfree_rate_to_ib(mthca_rate, dev->rate[port - 1]); + } else + return tavor_rate_to_ib(mthca_rate, dev->rate[port - 1]); +} + +static u8 ib_rate_to_memfree(u8 req_rate, u8 cur_rate) +{ + if (cur_rate <= req_rate) + return 0; + + /* + * Inter-packet delay (IPD) to get from rate X down to a rate + * no more than Y is (X - 1) / Y. 
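+ *
+ * For example, throttling a 4X port (cur_rate = 4) down to a requested
+ * 1X rate (req_rate = 1) gives an IPD of (4 - 1) / 1 = 3, which the
+ * switch below maps to MTHCA_RATE_MEMFREE_QUARTER: one packet every
+ * four packet times, i.e. a quarter of the port's full rate.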
+ */ + switch ((cur_rate - 1) / req_rate) { + case 0: return MTHCA_RATE_MEMFREE_FULL; + case 1: return MTHCA_RATE_MEMFREE_HALF; + case 2: /* fall through */ + case 3: return MTHCA_RATE_MEMFREE_QUARTER; + default: return MTHCA_RATE_MEMFREE_EIGHTH; + } +} + +static u8 ib_rate_to_tavor(u8 static_rate) +{ + switch (static_rate) { + case IB_RATE_2_5_GBPS: return MTHCA_RATE_TAVOR_1X; + case IB_RATE_5_GBPS: return MTHCA_RATE_TAVOR_1X_DDR; + case IB_RATE_10_GBPS: return MTHCA_RATE_TAVOR_4X; + default: return MTHCA_RATE_TAVOR_FULL; + } +} + +u8 mthca_get_rate(struct mthca_dev *dev, int static_rate, u8 port) +{ + u8 rate; + + if (!static_rate || ib_rate_to_mult(static_rate) >= dev->rate[port - 1]) + return 0; + + if (mthca_is_memfree(dev)) + rate = ib_rate_to_memfree(ib_rate_to_mult(static_rate), + dev->rate[port - 1]); + else + rate = ib_rate_to_tavor(static_rate); + + if (!(dev->limits.stat_rate_support & (1 << rate))) + rate = 1; + + return rate; +} + +int mthca_create_ah(struct mthca_dev *dev, + struct mthca_pd *pd, + struct ib_ah_attr *ah_attr, + struct mthca_ah *ah) +{ + u32 index = -1; + struct mthca_av *av = NULL; + + ah->type = MTHCA_AH_PCI_POOL; + + if (mthca_is_memfree(dev)) { + ah->av = kmalloc(sizeof *ah->av, GFP_ATOMIC); + if (!ah->av) + return -ENOMEM; + + ah->type = MTHCA_AH_KMALLOC; + av = ah->av; + } else if (!atomic_read(&pd->sqp_count) && + !(dev->mthca_flags & MTHCA_FLAG_DDR_HIDDEN)) { + index = mthca_alloc(&dev->av_table.alloc); + + /* fall back to allocate in host memory */ + if (index == -1) + goto on_hca_fail; + + av = kmalloc(sizeof *av, GFP_ATOMIC); + if (!av) + goto on_hca_fail; + + ah->type = MTHCA_AH_ON_HCA; + ah->avdma = dev->av_table.ddr_av_base + + index * MTHCA_AV_SIZE; + } + +on_hca_fail: + if (ah->type == MTHCA_AH_PCI_POOL) { + ah->av = pci_pool_alloc(dev->av_table.pool, + GFP_ATOMIC, &ah->avdma); + if (!ah->av) + return -ENOMEM; + + av = ah->av; + } + + ah->key = pd->ntmr.ibmr.lkey; + + memset(av, 0, MTHCA_AV_SIZE); + + av->port_pd = cpu_to_be32(pd->pd_num | (ah_attr->port_num << 24)); + av->g_slid = ah_attr->src_path_bits; + av->dlid = cpu_to_be16(ah_attr->dlid); + av->msg_sr = (3 << 4) | /* 2K message */ + mthca_get_rate(dev, ah_attr->static_rate, ah_attr->port_num); + av->sl_tclass_flowlabel = cpu_to_be32(ah_attr->sl << 28); + if (ah_attr->ah_flags & IB_AH_GRH) { + av->g_slid |= 0x80; + av->gid_index = (ah_attr->port_num - 1) * dev->limits.gid_table_len + + ah_attr->grh.sgid_index; + av->hop_limit = ah_attr->grh.hop_limit; + av->sl_tclass_flowlabel |= + cpu_to_be32((ah_attr->grh.traffic_class << 20) | + ah_attr->grh.flow_label); + memcpy(av->dgid, ah_attr->grh.dgid.raw, 16); + } else { + /* Arbel workaround -- low byte of GID must be 2 */ + av->dgid[3] = cpu_to_be32(2); + } + + if (0) { + int j; + + mthca_dbg(dev, "Created UDAV at %p/%08lx:\n", + av, (unsigned long) ah->avdma); + for (j = 0; j < 8; ++j) + printk(KERN_DEBUG " [%2x] %08x\n", + j * 4, be32_to_cpu(((__be32 *) av)[j])); + } + + if (ah->type == MTHCA_AH_ON_HCA) { + memcpy_toio(dev->av_table.av_map + index * MTHCA_AV_SIZE, + av, MTHCA_AV_SIZE); + kfree(av); + } + + return 0; +} + +int mthca_destroy_ah(struct mthca_dev *dev, struct mthca_ah *ah) +{ + switch (ah->type) { + case MTHCA_AH_ON_HCA: + mthca_free(&dev->av_table.alloc, + (ah->avdma - dev->av_table.ddr_av_base) / + MTHCA_AV_SIZE); + break; + + case MTHCA_AH_PCI_POOL: + pci_pool_free(dev->av_table.pool, ah->av, ah->avdma); + break; + + case MTHCA_AH_KMALLOC: + kfree(ah->av); + break; + } + + return 0; +} + +int mthca_ah_grh_present(struct 
mthca_ah *ah) +{ + return !!(ah->av->g_slid & 0x80); +} + +int mthca_read_ah(struct mthca_dev *dev, struct mthca_ah *ah, + struct ib_ud_header *header) +{ + if (ah->type == MTHCA_AH_ON_HCA) + return -EINVAL; + + header->lrh.service_level = be32_to_cpu(ah->av->sl_tclass_flowlabel) >> 28; + header->lrh.destination_lid = ah->av->dlid; + header->lrh.source_lid = cpu_to_be16(ah->av->g_slid & 0x7f); + if (mthca_ah_grh_present(ah)) { + header->grh.traffic_class = + (be32_to_cpu(ah->av->sl_tclass_flowlabel) >> 20) & 0xff; + header->grh.flow_label = + ah->av->sl_tclass_flowlabel & cpu_to_be32(0xfffff); + header->grh.hop_limit = ah->av->hop_limit; + ib_get_cached_gid(&dev->ib_dev, + be32_to_cpu(ah->av->port_pd) >> 24, + ah->av->gid_index % dev->limits.gid_table_len, + &header->grh.source_gid); + memcpy(header->grh.destination_gid.raw, + ah->av->dgid, 16); + } + + return 0; +} + +int mthca_ah_query(struct ib_ah *ibah, struct ib_ah_attr *attr) +{ + struct mthca_ah *ah = to_mah(ibah); + struct mthca_dev *dev = to_mdev(ibah->device); + + /* Only implement for MAD and memfree ah for now. */ + if (ah->type == MTHCA_AH_ON_HCA) + return -ENOSYS; + + memset(attr, 0, sizeof *attr); + attr->dlid = be16_to_cpu(ah->av->dlid); + attr->sl = be32_to_cpu(ah->av->sl_tclass_flowlabel) >> 28; + attr->port_num = be32_to_cpu(ah->av->port_pd) >> 24; + attr->static_rate = mthca_rate_to_ib(dev, ah->av->msg_sr & 0x7, + attr->port_num); + attr->src_path_bits = ah->av->g_slid & 0x7F; + attr->ah_flags = mthca_ah_grh_present(ah) ? IB_AH_GRH : 0; + + if (attr->ah_flags) { + attr->grh.traffic_class = + be32_to_cpu(ah->av->sl_tclass_flowlabel) >> 20; + attr->grh.flow_label = + be32_to_cpu(ah->av->sl_tclass_flowlabel) & 0xfffff; + attr->grh.hop_limit = ah->av->hop_limit; + attr->grh.sgid_index = ah->av->gid_index & + (dev->limits.gid_table_len - 1); + memcpy(attr->grh.dgid.raw, ah->av->dgid, 16); + } + + return 0; +} + +int mthca_init_av_table(struct mthca_dev *dev) +{ + int err; + + if (mthca_is_memfree(dev)) + return 0; + + err = mthca_alloc_init(&dev->av_table.alloc, + dev->av_table.num_ddr_avs, + dev->av_table.num_ddr_avs - 1, + 0); + if (err) + return err; + + dev->av_table.pool = pci_pool_create("mthca_av", dev->pdev, + MTHCA_AV_SIZE, + MTHCA_AV_SIZE, 0); + if (!dev->av_table.pool) + goto out_free_alloc; + + if (!(dev->mthca_flags & MTHCA_FLAG_DDR_HIDDEN)) { + dev->av_table.av_map = ioremap(pci_resource_start(dev->pdev, 4) + + dev->av_table.ddr_av_base - + dev->ddr_start, + dev->av_table.num_ddr_avs * + MTHCA_AV_SIZE); + if (!dev->av_table.av_map) + goto out_free_pool; + } else + dev->av_table.av_map = NULL; + + return 0; + + out_free_pool: + pci_pool_destroy(dev->av_table.pool); + + out_free_alloc: + mthca_alloc_cleanup(&dev->av_table.alloc); + return -ENOMEM; +} + +void mthca_cleanup_av_table(struct mthca_dev *dev) +{ + if (mthca_is_memfree(dev)) + return; + + if (dev->av_table.av_map) + iounmap(dev->av_table.av_map); + pci_pool_destroy(dev->av_table.pool); + mthca_alloc_cleanup(&dev->av_table.alloc); +} diff --git a/kernel/drivers/infiniband/hw/mthca/mthca_catas.c b/kernel/drivers/infiniband/hw/mthca/mthca_catas.c new file mode 100644 index 000000000..712d2a30f --- /dev/null +++ b/kernel/drivers/infiniband/hw/mthca/mthca_catas.c @@ -0,0 +1,200 @@ +/* + * Copyright (c) 2005 Cisco Systems. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. 
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include +#include +#include +#include + +#include "mthca_dev.h" + +enum { + MTHCA_CATAS_POLL_INTERVAL = 5 * HZ, + + MTHCA_CATAS_TYPE_INTERNAL = 0, + MTHCA_CATAS_TYPE_UPLINK = 3, + MTHCA_CATAS_TYPE_DDR = 4, + MTHCA_CATAS_TYPE_PARITY = 5, +}; + +static DEFINE_SPINLOCK(catas_lock); + +static LIST_HEAD(catas_list); +static struct workqueue_struct *catas_wq; +static struct work_struct catas_work; + +static int catas_reset_disable; +module_param_named(catas_reset_disable, catas_reset_disable, int, 0644); +MODULE_PARM_DESC(catas_reset_disable, "disable reset on catastrophic event if nonzero"); + +static void catas_reset(struct work_struct *work) +{ + struct mthca_dev *dev, *tmpdev; + LIST_HEAD(tlist); + int ret; + + mutex_lock(&mthca_device_mutex); + + spin_lock_irq(&catas_lock); + list_splice_init(&catas_list, &tlist); + spin_unlock_irq(&catas_lock); + + list_for_each_entry_safe(dev, tmpdev, &tlist, catas_err.list) { + struct pci_dev *pdev = dev->pdev; + ret = __mthca_restart_one(dev->pdev); + /* 'dev' now is not valid */ + if (ret) + printk(KERN_ERR "mthca %s: Reset failed (%d)\n", + pci_name(pdev), ret); + else { + struct mthca_dev *d = pci_get_drvdata(pdev); + mthca_dbg(d, "Reset succeeded\n"); + } + } + + mutex_unlock(&mthca_device_mutex); +} + +static void handle_catas(struct mthca_dev *dev) +{ + struct ib_event event; + unsigned long flags; + const char *type; + int i; + + event.device = &dev->ib_dev; + event.event = IB_EVENT_DEVICE_FATAL; + event.element.port_num = 0; + dev->active = false; + + ib_dispatch_event(&event); + + switch (swab32(readl(dev->catas_err.map)) >> 24) { + case MTHCA_CATAS_TYPE_INTERNAL: + type = "internal error"; + break; + case MTHCA_CATAS_TYPE_UPLINK: + type = "uplink bus error"; + break; + case MTHCA_CATAS_TYPE_DDR: + type = "DDR data error"; + break; + case MTHCA_CATAS_TYPE_PARITY: + type = "internal parity error"; + break; + default: + type = "unknown error"; + break; + } + + mthca_err(dev, "Catastrophic error detected: %s\n", type); + for (i = 0; i < dev->catas_err.size; ++i) + mthca_err(dev, " buf[%02x]: %08x\n", + i, swab32(readl(dev->catas_err.map + i))); + + if (catas_reset_disable) + return; + + spin_lock_irqsave(&catas_lock, flags); + list_add(&dev->catas_err.list, &catas_list); + queue_work(catas_wq, &catas_work); + 
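+	/* The queued catas_reset() work takes mthca_device_mutex and
+	 * restarts the device, which can sleep; the reset cannot be done
+	 * here, where we run from the polling timer with catas_lock held
+	 * and interrupts off.
+	 */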
spin_unlock_irqrestore(&catas_lock, flags); +} + +static void poll_catas(unsigned long dev_ptr) +{ + struct mthca_dev *dev = (struct mthca_dev *) dev_ptr; + int i; + + for (i = 0; i < dev->catas_err.size; ++i) + if (readl(dev->catas_err.map + i)) { + handle_catas(dev); + return; + } + + mod_timer(&dev->catas_err.timer, + round_jiffies(jiffies + MTHCA_CATAS_POLL_INTERVAL)); +} + +void mthca_start_catas_poll(struct mthca_dev *dev) +{ + phys_addr_t addr; + + init_timer(&dev->catas_err.timer); + dev->catas_err.map = NULL; + + addr = pci_resource_start(dev->pdev, 0) + + ((pci_resource_len(dev->pdev, 0) - 1) & + dev->catas_err.addr); + + dev->catas_err.map = ioremap(addr, dev->catas_err.size * 4); + if (!dev->catas_err.map) { + mthca_warn(dev, "couldn't map catastrophic error region " + "at 0x%llx/0x%x\n", (unsigned long long) addr, + dev->catas_err.size * 4); + return; + } + + dev->catas_err.timer.data = (unsigned long) dev; + dev->catas_err.timer.function = poll_catas; + dev->catas_err.timer.expires = jiffies + MTHCA_CATAS_POLL_INTERVAL; + INIT_LIST_HEAD(&dev->catas_err.list); + add_timer(&dev->catas_err.timer); +} + +void mthca_stop_catas_poll(struct mthca_dev *dev) +{ + del_timer_sync(&dev->catas_err.timer); + + if (dev->catas_err.map) + iounmap(dev->catas_err.map); + + spin_lock_irq(&catas_lock); + list_del(&dev->catas_err.list); + spin_unlock_irq(&catas_lock); +} + +int __init mthca_catas_init(void) +{ + INIT_WORK(&catas_work, catas_reset); + + catas_wq = create_singlethread_workqueue("mthca_catas"); + if (!catas_wq) + return -ENOMEM; + + return 0; +} + +void mthca_catas_cleanup(void) +{ + destroy_workqueue(catas_wq); +} diff --git a/kernel/drivers/infiniband/hw/mthca/mthca_cmd.c b/kernel/drivers/infiniband/hw/mthca/mthca_cmd.c new file mode 100644 index 000000000..9d3e5c1ac --- /dev/null +++ b/kernel/drivers/infiniband/hw/mthca/mthca_cmd.c @@ -0,0 +1,1969 @@ +/* + * Copyright (c) 2004, 2005 Topspin Communications. All rights reserved. + * Copyright (c) 2005 Mellanox Technologies. All rights reserved. + * Copyright (c) 2005, 2006 Cisco Systems. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "mthca_dev.h" +#include "mthca_config_reg.h" +#include "mthca_cmd.h" +#include "mthca_memfree.h" + +#define CMD_POLL_TOKEN 0xffff + +enum { + HCR_IN_PARAM_OFFSET = 0x00, + HCR_IN_MODIFIER_OFFSET = 0x08, + HCR_OUT_PARAM_OFFSET = 0x0c, + HCR_TOKEN_OFFSET = 0x14, + HCR_STATUS_OFFSET = 0x18, + + HCR_OPMOD_SHIFT = 12, + HCA_E_BIT = 22, + HCR_GO_BIT = 23 +}; + +enum { + /* initialization and general commands */ + CMD_SYS_EN = 0x1, + CMD_SYS_DIS = 0x2, + CMD_MAP_FA = 0xfff, + CMD_UNMAP_FA = 0xffe, + CMD_RUN_FW = 0xff6, + CMD_MOD_STAT_CFG = 0x34, + CMD_QUERY_DEV_LIM = 0x3, + CMD_QUERY_FW = 0x4, + CMD_ENABLE_LAM = 0xff8, + CMD_DISABLE_LAM = 0xff7, + CMD_QUERY_DDR = 0x5, + CMD_QUERY_ADAPTER = 0x6, + CMD_INIT_HCA = 0x7, + CMD_CLOSE_HCA = 0x8, + CMD_INIT_IB = 0x9, + CMD_CLOSE_IB = 0xa, + CMD_QUERY_HCA = 0xb, + CMD_SET_IB = 0xc, + CMD_ACCESS_DDR = 0x2e, + CMD_MAP_ICM = 0xffa, + CMD_UNMAP_ICM = 0xff9, + CMD_MAP_ICM_AUX = 0xffc, + CMD_UNMAP_ICM_AUX = 0xffb, + CMD_SET_ICM_SIZE = 0xffd, + + /* TPT commands */ + CMD_SW2HW_MPT = 0xd, + CMD_QUERY_MPT = 0xe, + CMD_HW2SW_MPT = 0xf, + CMD_READ_MTT = 0x10, + CMD_WRITE_MTT = 0x11, + CMD_SYNC_TPT = 0x2f, + + /* EQ commands */ + CMD_MAP_EQ = 0x12, + CMD_SW2HW_EQ = 0x13, + CMD_HW2SW_EQ = 0x14, + CMD_QUERY_EQ = 0x15, + + /* CQ commands */ + CMD_SW2HW_CQ = 0x16, + CMD_HW2SW_CQ = 0x17, + CMD_QUERY_CQ = 0x18, + CMD_RESIZE_CQ = 0x2c, + + /* SRQ commands */ + CMD_SW2HW_SRQ = 0x35, + CMD_HW2SW_SRQ = 0x36, + CMD_QUERY_SRQ = 0x37, + CMD_ARM_SRQ = 0x40, + + /* QP/EE commands */ + CMD_RST2INIT_QPEE = 0x19, + CMD_INIT2RTR_QPEE = 0x1a, + CMD_RTR2RTS_QPEE = 0x1b, + CMD_RTS2RTS_QPEE = 0x1c, + CMD_SQERR2RTS_QPEE = 0x1d, + CMD_2ERR_QPEE = 0x1e, + CMD_RTS2SQD_QPEE = 0x1f, + CMD_SQD2SQD_QPEE = 0x38, + CMD_SQD2RTS_QPEE = 0x20, + CMD_ERR2RST_QPEE = 0x21, + CMD_QUERY_QPEE = 0x22, + CMD_INIT2INIT_QPEE = 0x2d, + CMD_SUSPEND_QPEE = 0x32, + CMD_UNSUSPEND_QPEE = 0x33, + /* special QPs and management commands */ + CMD_CONF_SPECIAL_QP = 0x23, + CMD_MAD_IFC = 0x24, + + /* multicast commands */ + CMD_READ_MGM = 0x25, + CMD_WRITE_MGM = 0x26, + CMD_MGID_HASH = 0x27, + + /* miscellaneous commands */ + CMD_DIAG_RPRT = 0x30, + CMD_NOP = 0x31, + + /* debug commands */ + CMD_QUERY_DEBUG_MSG = 0x2a, + CMD_SET_DEBUG_MSG = 0x2b, +}; + +/* + * According to Mellanox code, FW may be starved and never complete + * commands. So we can't use strict timeouts described in PRM -- we + * just arbitrarily select 60 seconds for now. 
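+ *
+ * The #if 0 block below keeps the PRM-style per-class timeouts around
+ * for reference; the live #else branch pins all four command classes to
+ * 60 * HZ, i.e. one minute regardless of the kernel's HZ setting.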
+ */ +#if 0 +/* + * Round up and add 1 to make sure we get the full wait time (since we + * will be starting in the middle of a jiffy) + */ +enum { + CMD_TIME_CLASS_A = (HZ + 999) / 1000 + 1, + CMD_TIME_CLASS_B = (HZ + 99) / 100 + 1, + CMD_TIME_CLASS_C = (HZ + 9) / 10 + 1, + CMD_TIME_CLASS_D = 60 * HZ +}; +#else +enum { + CMD_TIME_CLASS_A = 60 * HZ, + CMD_TIME_CLASS_B = 60 * HZ, + CMD_TIME_CLASS_C = 60 * HZ, + CMD_TIME_CLASS_D = 60 * HZ +}; +#endif + +enum { + GO_BIT_TIMEOUT = HZ * 10 +}; + +struct mthca_cmd_context { + struct completion done; + int result; + int next; + u64 out_param; + u16 token; + u8 status; +}; + +static int fw_cmd_doorbell = 0; +module_param(fw_cmd_doorbell, int, 0644); +MODULE_PARM_DESC(fw_cmd_doorbell, "post FW commands through doorbell page if nonzero " + "(and supported by FW)"); + +static inline int go_bit(struct mthca_dev *dev) +{ + return readl(dev->hcr + HCR_STATUS_OFFSET) & + swab32(1 << HCR_GO_BIT); +} + +static void mthca_cmd_post_dbell(struct mthca_dev *dev, + u64 in_param, + u64 out_param, + u32 in_modifier, + u8 op_modifier, + u16 op, + u16 token) +{ + void __iomem *ptr = dev->cmd.dbell_map; + u16 *offs = dev->cmd.dbell_offsets; + + __raw_writel((__force u32) cpu_to_be32(in_param >> 32), ptr + offs[0]); + wmb(); + __raw_writel((__force u32) cpu_to_be32(in_param & 0xfffffffful), ptr + offs[1]); + wmb(); + __raw_writel((__force u32) cpu_to_be32(in_modifier), ptr + offs[2]); + wmb(); + __raw_writel((__force u32) cpu_to_be32(out_param >> 32), ptr + offs[3]); + wmb(); + __raw_writel((__force u32) cpu_to_be32(out_param & 0xfffffffful), ptr + offs[4]); + wmb(); + __raw_writel((__force u32) cpu_to_be32(token << 16), ptr + offs[5]); + wmb(); + __raw_writel((__force u32) cpu_to_be32((1 << HCR_GO_BIT) | + (1 << HCA_E_BIT) | + (op_modifier << HCR_OPMOD_SHIFT) | + op), ptr + offs[6]); + wmb(); + __raw_writel((__force u32) 0, ptr + offs[7]); + wmb(); +} + +static int mthca_cmd_post_hcr(struct mthca_dev *dev, + u64 in_param, + u64 out_param, + u32 in_modifier, + u8 op_modifier, + u16 op, + u16 token, + int event) +{ + if (event) { + unsigned long end = jiffies + GO_BIT_TIMEOUT; + + while (go_bit(dev) && time_before(jiffies, end)) { + set_current_state(TASK_RUNNING); + schedule(); + } + } + + if (go_bit(dev)) + return -EAGAIN; + + /* + * We use writel (instead of something like memcpy_toio) + * because writes of less than 32 bits to the HCR don't work + * (and some architectures such as ia64 implement memcpy_toio + * in terms of writeb). + */ + __raw_writel((__force u32) cpu_to_be32(in_param >> 32), dev->hcr + 0 * 4); + __raw_writel((__force u32) cpu_to_be32(in_param & 0xfffffffful), dev->hcr + 1 * 4); + __raw_writel((__force u32) cpu_to_be32(in_modifier), dev->hcr + 2 * 4); + __raw_writel((__force u32) cpu_to_be32(out_param >> 32), dev->hcr + 3 * 4); + __raw_writel((__force u32) cpu_to_be32(out_param & 0xfffffffful), dev->hcr + 4 * 4); + __raw_writel((__force u32) cpu_to_be32(token << 16), dev->hcr + 5 * 4); + + /* __raw_writel may not order writes. */ + wmb(); + + __raw_writel((__force u32) cpu_to_be32((1 << HCR_GO_BIT) | + (event ? 
(1 << HCA_E_BIT) : 0) | + (op_modifier << HCR_OPMOD_SHIFT) | + op), dev->hcr + 6 * 4); + + return 0; +} + +static int mthca_cmd_post(struct mthca_dev *dev, + u64 in_param, + u64 out_param, + u32 in_modifier, + u8 op_modifier, + u16 op, + u16 token, + int event) +{ + int err = 0; + + mutex_lock(&dev->cmd.hcr_mutex); + + if (event && dev->cmd.flags & MTHCA_CMD_POST_DOORBELLS && fw_cmd_doorbell) + mthca_cmd_post_dbell(dev, in_param, out_param, in_modifier, + op_modifier, op, token); + else + err = mthca_cmd_post_hcr(dev, in_param, out_param, in_modifier, + op_modifier, op, token, event); + + /* + * Make sure that our HCR writes don't get mixed in with + * writes from another CPU starting a FW command. + */ + mmiowb(); + + mutex_unlock(&dev->cmd.hcr_mutex); + return err; +} + + +static int mthca_status_to_errno(u8 status) +{ + static const int trans_table[] = { + [MTHCA_CMD_STAT_INTERNAL_ERR] = -EIO, + [MTHCA_CMD_STAT_BAD_OP] = -EPERM, + [MTHCA_CMD_STAT_BAD_PARAM] = -EINVAL, + [MTHCA_CMD_STAT_BAD_SYS_STATE] = -ENXIO, + [MTHCA_CMD_STAT_BAD_RESOURCE] = -EBADF, + [MTHCA_CMD_STAT_RESOURCE_BUSY] = -EBUSY, + [MTHCA_CMD_STAT_DDR_MEM_ERR] = -ENOMEM, + [MTHCA_CMD_STAT_EXCEED_LIM] = -ENOMEM, + [MTHCA_CMD_STAT_BAD_RES_STATE] = -EBADF, + [MTHCA_CMD_STAT_BAD_INDEX] = -EBADF, + [MTHCA_CMD_STAT_BAD_NVMEM] = -EFAULT, + [MTHCA_CMD_STAT_BAD_QPEE_STATE] = -EINVAL, + [MTHCA_CMD_STAT_BAD_SEG_PARAM] = -EFAULT, + [MTHCA_CMD_STAT_REG_BOUND] = -EBUSY, + [MTHCA_CMD_STAT_LAM_NOT_PRE] = -EAGAIN, + [MTHCA_CMD_STAT_BAD_PKT] = -EBADMSG, + [MTHCA_CMD_STAT_BAD_SIZE] = -ENOMEM, + }; + + if (status >= ARRAY_SIZE(trans_table) || + (status != MTHCA_CMD_STAT_OK + && trans_table[status] == 0)) + return -EINVAL; + + return trans_table[status]; +} + + +static int mthca_cmd_poll(struct mthca_dev *dev, + u64 in_param, + u64 *out_param, + int out_is_imm, + u32 in_modifier, + u8 op_modifier, + u16 op, + unsigned long timeout) +{ + int err = 0; + unsigned long end; + u8 status; + + down(&dev->cmd.poll_sem); + + err = mthca_cmd_post(dev, in_param, + out_param ? 
*out_param : 0, + in_modifier, op_modifier, + op, CMD_POLL_TOKEN, 0); + if (err) + goto out; + + end = timeout + jiffies; + while (go_bit(dev) && time_before(jiffies, end)) { + set_current_state(TASK_RUNNING); + schedule(); + } + + if (go_bit(dev)) { + err = -EBUSY; + goto out; + } + + if (out_is_imm) + *out_param = + (u64) be32_to_cpu((__force __be32) + __raw_readl(dev->hcr + HCR_OUT_PARAM_OFFSET)) << 32 | + (u64) be32_to_cpu((__force __be32) + __raw_readl(dev->hcr + HCR_OUT_PARAM_OFFSET + 4)); + + status = be32_to_cpu((__force __be32) __raw_readl(dev->hcr + HCR_STATUS_OFFSET)) >> 24; + if (status) { + mthca_dbg(dev, "Command %02x completed with status %02x\n", + op, status); + err = mthca_status_to_errno(status); + } + +out: + up(&dev->cmd.poll_sem); + return err; +} + +void mthca_cmd_event(struct mthca_dev *dev, + u16 token, + u8 status, + u64 out_param) +{ + struct mthca_cmd_context *context = + &dev->cmd.context[token & dev->cmd.token_mask]; + + /* previously timed out command completing at long last */ + if (token != context->token) + return; + + context->result = 0; + context->status = status; + context->out_param = out_param; + + complete(&context->done); +} + +static int mthca_cmd_wait(struct mthca_dev *dev, + u64 in_param, + u64 *out_param, + int out_is_imm, + u32 in_modifier, + u8 op_modifier, + u16 op, + unsigned long timeout) +{ + int err = 0; + struct mthca_cmd_context *context; + + down(&dev->cmd.event_sem); + + spin_lock(&dev->cmd.context_lock); + BUG_ON(dev->cmd.free_head < 0); + context = &dev->cmd.context[dev->cmd.free_head]; + context->token += dev->cmd.token_mask + 1; + dev->cmd.free_head = context->next; + spin_unlock(&dev->cmd.context_lock); + + init_completion(&context->done); + + err = mthca_cmd_post(dev, in_param, + out_param ? *out_param : 0, + in_modifier, op_modifier, + op, context->token, 1); + if (err) + goto out; + + if (!wait_for_completion_timeout(&context->done, timeout)) { + err = -EBUSY; + goto out; + } + + err = context->result; + if (err) + goto out; + + if (context->status) { + mthca_dbg(dev, "Command %02x completed with status %02x\n", + op, context->status); + err = mthca_status_to_errno(context->status); + } + + if (out_is_imm) + *out_param = context->out_param; + +out: + spin_lock(&dev->cmd.context_lock); + context->next = dev->cmd.free_head; + dev->cmd.free_head = context - dev->cmd.context; + spin_unlock(&dev->cmd.context_lock); + + up(&dev->cmd.event_sem); + return err; +} + +/* Invoke a command with an output mailbox */ +static int mthca_cmd_box(struct mthca_dev *dev, + u64 in_param, + u64 out_param, + u32 in_modifier, + u8 op_modifier, + u16 op, + unsigned long timeout) +{ + if (dev->cmd.flags & MTHCA_CMD_USE_EVENTS) + return mthca_cmd_wait(dev, in_param, &out_param, 0, + in_modifier, op_modifier, op, + timeout); + else + return mthca_cmd_poll(dev, in_param, &out_param, 0, + in_modifier, op_modifier, op, + timeout); +} + +/* Invoke a command with no output parameter */ +static int mthca_cmd(struct mthca_dev *dev, + u64 in_param, + u32 in_modifier, + u8 op_modifier, + u16 op, + unsigned long timeout) +{ + return mthca_cmd_box(dev, in_param, 0, in_modifier, + op_modifier, op, timeout); +} + +/* + * Invoke a command with an immediate output parameter (and copy the + * output into the caller's out_param pointer after the command + * executes). 
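+ * Like mthca_cmd_box() above, this dispatches through either the event
+ * path or the polling path depending on whether MTHCA_CMD_USE_EVENTS is set.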
+ */ +static int mthca_cmd_imm(struct mthca_dev *dev, + u64 in_param, + u64 *out_param, + u32 in_modifier, + u8 op_modifier, + u16 op, + unsigned long timeout) +{ + if (dev->cmd.flags & MTHCA_CMD_USE_EVENTS) + return mthca_cmd_wait(dev, in_param, out_param, 1, + in_modifier, op_modifier, op, + timeout); + else + return mthca_cmd_poll(dev, in_param, out_param, 1, + in_modifier, op_modifier, op, + timeout); +} + +int mthca_cmd_init(struct mthca_dev *dev) +{ + mutex_init(&dev->cmd.hcr_mutex); + sema_init(&dev->cmd.poll_sem, 1); + dev->cmd.flags = 0; + + dev->hcr = ioremap(pci_resource_start(dev->pdev, 0) + MTHCA_HCR_BASE, + MTHCA_HCR_SIZE); + if (!dev->hcr) { + mthca_err(dev, "Couldn't map command register."); + return -ENOMEM; + } + + dev->cmd.pool = pci_pool_create("mthca_cmd", dev->pdev, + MTHCA_MAILBOX_SIZE, + MTHCA_MAILBOX_SIZE, 0); + if (!dev->cmd.pool) { + iounmap(dev->hcr); + return -ENOMEM; + } + + return 0; +} + +void mthca_cmd_cleanup(struct mthca_dev *dev) +{ + pci_pool_destroy(dev->cmd.pool); + iounmap(dev->hcr); + if (dev->cmd.flags & MTHCA_CMD_POST_DOORBELLS) + iounmap(dev->cmd.dbell_map); +} + +/* + * Switch to using events to issue FW commands (should be called after + * event queue to command events has been initialized). + */ +int mthca_cmd_use_events(struct mthca_dev *dev) +{ + int i; + + dev->cmd.context = kmalloc(dev->cmd.max_cmds * + sizeof (struct mthca_cmd_context), + GFP_KERNEL); + if (!dev->cmd.context) + return -ENOMEM; + + for (i = 0; i < dev->cmd.max_cmds; ++i) { + dev->cmd.context[i].token = i; + dev->cmd.context[i].next = i + 1; + } + + dev->cmd.context[dev->cmd.max_cmds - 1].next = -1; + dev->cmd.free_head = 0; + + sema_init(&dev->cmd.event_sem, dev->cmd.max_cmds); + spin_lock_init(&dev->cmd.context_lock); + + for (dev->cmd.token_mask = 1; + dev->cmd.token_mask < dev->cmd.max_cmds; + dev->cmd.token_mask <<= 1) + ; /* nothing */ + --dev->cmd.token_mask; + + dev->cmd.flags |= MTHCA_CMD_USE_EVENTS; + + down(&dev->cmd.poll_sem); + + return 0; +} + +/* + * Switch back to polling (used when shutting down the device) + */ +void mthca_cmd_use_polling(struct mthca_dev *dev) +{ + int i; + + dev->cmd.flags &= ~MTHCA_CMD_USE_EVENTS; + + for (i = 0; i < dev->cmd.max_cmds; ++i) + down(&dev->cmd.event_sem); + + kfree(dev->cmd.context); + + up(&dev->cmd.poll_sem); +} + +struct mthca_mailbox *mthca_alloc_mailbox(struct mthca_dev *dev, + gfp_t gfp_mask) +{ + struct mthca_mailbox *mailbox; + + mailbox = kmalloc(sizeof *mailbox, gfp_mask); + if (!mailbox) + return ERR_PTR(-ENOMEM); + + mailbox->buf = pci_pool_alloc(dev->cmd.pool, gfp_mask, &mailbox->dma); + if (!mailbox->buf) { + kfree(mailbox); + return ERR_PTR(-ENOMEM); + } + + return mailbox; +} + +void mthca_free_mailbox(struct mthca_dev *dev, struct mthca_mailbox *mailbox) +{ + if (!mailbox) + return; + + pci_pool_free(dev->cmd.pool, mailbox->buf, mailbox->dma); + kfree(mailbox); +} + +int mthca_SYS_EN(struct mthca_dev *dev) +{ + u64 out; + int ret; + + ret = mthca_cmd_imm(dev, 0, &out, 0, 0, CMD_SYS_EN, CMD_TIME_CLASS_D); + + if (ret == -ENOMEM) + mthca_warn(dev, "SYS_EN DDR error: syn=%x, sock=%d, " + "sladdr=%d, SPD source=%s\n", + (int) (out >> 6) & 0xf, (int) (out >> 4) & 3, + (int) (out >> 1) & 7, (int) out & 1 ? 
"NVMEM" : "DIMM"); + + return ret; +} + +int mthca_SYS_DIS(struct mthca_dev *dev) +{ + return mthca_cmd(dev, 0, 0, 0, CMD_SYS_DIS, CMD_TIME_CLASS_C); +} + +static int mthca_map_cmd(struct mthca_dev *dev, u16 op, struct mthca_icm *icm, + u64 virt) +{ + struct mthca_mailbox *mailbox; + struct mthca_icm_iter iter; + __be64 *pages; + int lg; + int nent = 0; + int i; + int err = 0; + int ts = 0, tc = 0; + + mailbox = mthca_alloc_mailbox(dev, GFP_KERNEL); + if (IS_ERR(mailbox)) + return PTR_ERR(mailbox); + memset(mailbox->buf, 0, MTHCA_MAILBOX_SIZE); + pages = mailbox->buf; + + for (mthca_icm_first(icm, &iter); + !mthca_icm_last(&iter); + mthca_icm_next(&iter)) { + /* + * We have to pass pages that are aligned to their + * size, so find the least significant 1 in the + * address or size and use that as our log2 size. + */ + lg = ffs(mthca_icm_addr(&iter) | mthca_icm_size(&iter)) - 1; + if (lg < MTHCA_ICM_PAGE_SHIFT) { + mthca_warn(dev, "Got FW area not aligned to %d (%llx/%lx).\n", + MTHCA_ICM_PAGE_SIZE, + (unsigned long long) mthca_icm_addr(&iter), + mthca_icm_size(&iter)); + err = -EINVAL; + goto out; + } + for (i = 0; i < mthca_icm_size(&iter) >> lg; ++i) { + if (virt != -1) { + pages[nent * 2] = cpu_to_be64(virt); + virt += 1 << lg; + } + + pages[nent * 2 + 1] = + cpu_to_be64((mthca_icm_addr(&iter) + (i << lg)) | + (lg - MTHCA_ICM_PAGE_SHIFT)); + ts += 1 << (lg - 10); + ++tc; + + if (++nent == MTHCA_MAILBOX_SIZE / 16) { + err = mthca_cmd(dev, mailbox->dma, nent, 0, op, + CMD_TIME_CLASS_B); + if (err) + goto out; + nent = 0; + } + } + } + + if (nent) + err = mthca_cmd(dev, mailbox->dma, nent, 0, op, + CMD_TIME_CLASS_B); + + switch (op) { + case CMD_MAP_FA: + mthca_dbg(dev, "Mapped %d chunks/%d KB for FW.\n", tc, ts); + break; + case CMD_MAP_ICM_AUX: + mthca_dbg(dev, "Mapped %d chunks/%d KB for ICM aux.\n", tc, ts); + break; + case CMD_MAP_ICM: + mthca_dbg(dev, "Mapped %d chunks/%d KB at %llx for ICM.\n", + tc, ts, (unsigned long long) virt - (ts << 10)); + break; + } + +out: + mthca_free_mailbox(dev, mailbox); + return err; +} + +int mthca_MAP_FA(struct mthca_dev *dev, struct mthca_icm *icm) +{ + return mthca_map_cmd(dev, CMD_MAP_FA, icm, -1); +} + +int mthca_UNMAP_FA(struct mthca_dev *dev) +{ + return mthca_cmd(dev, 0, 0, 0, CMD_UNMAP_FA, CMD_TIME_CLASS_B); +} + +int mthca_RUN_FW(struct mthca_dev *dev) +{ + return mthca_cmd(dev, 0, 0, 0, CMD_RUN_FW, CMD_TIME_CLASS_A); +} + +static void mthca_setup_cmd_doorbells(struct mthca_dev *dev, u64 base) +{ + phys_addr_t addr; + u16 max_off = 0; + int i; + + for (i = 0; i < 8; ++i) + max_off = max(max_off, dev->cmd.dbell_offsets[i]); + + if ((base & PAGE_MASK) != ((base + max_off) & PAGE_MASK)) { + mthca_warn(dev, "Firmware doorbell region at 0x%016llx, " + "length 0x%x crosses a page boundary\n", + (unsigned long long) base, max_off); + return; + } + + addr = pci_resource_start(dev->pdev, 2) + + ((pci_resource_len(dev->pdev, 2) - 1) & base); + dev->cmd.dbell_map = ioremap(addr, max_off + sizeof(u32)); + if (!dev->cmd.dbell_map) + return; + + dev->cmd.flags |= MTHCA_CMD_POST_DOORBELLS; + mthca_dbg(dev, "Mapped doorbell page for posting FW commands\n"); +} + +int mthca_QUERY_FW(struct mthca_dev *dev) +{ + struct mthca_mailbox *mailbox; + u32 *outbox; + u64 base; + u32 tmp; + int err = 0; + u8 lg; + int i; + +#define QUERY_FW_OUT_SIZE 0x100 +#define QUERY_FW_VER_OFFSET 0x00 +#define QUERY_FW_MAX_CMD_OFFSET 0x0f +#define QUERY_FW_ERR_START_OFFSET 0x30 +#define QUERY_FW_ERR_SIZE_OFFSET 0x38 + +#define QUERY_FW_CMD_DB_EN_OFFSET 0x10 +#define 
QUERY_FW_CMD_DB_OFFSET 0x50 +#define QUERY_FW_CMD_DB_BASE 0x60 + +#define QUERY_FW_START_OFFSET 0x20 +#define QUERY_FW_END_OFFSET 0x28 + +#define QUERY_FW_SIZE_OFFSET 0x00 +#define QUERY_FW_CLR_INT_BASE_OFFSET 0x20 +#define QUERY_FW_EQ_ARM_BASE_OFFSET 0x40 +#define QUERY_FW_EQ_SET_CI_BASE_OFFSET 0x48 + + mailbox = mthca_alloc_mailbox(dev, GFP_KERNEL); + if (IS_ERR(mailbox)) + return PTR_ERR(mailbox); + outbox = mailbox->buf; + + err = mthca_cmd_box(dev, 0, mailbox->dma, 0, 0, CMD_QUERY_FW, + CMD_TIME_CLASS_A); + + if (err) + goto out; + + MTHCA_GET(dev->fw_ver, outbox, QUERY_FW_VER_OFFSET); + /* + * FW subminor version is at more significant bits than minor + * version, so swap here. + */ + dev->fw_ver = (dev->fw_ver & 0xffff00000000ull) | + ((dev->fw_ver & 0xffff0000ull) >> 16) | + ((dev->fw_ver & 0x0000ffffull) << 16); + + MTHCA_GET(lg, outbox, QUERY_FW_MAX_CMD_OFFSET); + dev->cmd.max_cmds = 1 << lg; + + mthca_dbg(dev, "FW version %012llx, max commands %d\n", + (unsigned long long) dev->fw_ver, dev->cmd.max_cmds); + + MTHCA_GET(dev->catas_err.addr, outbox, QUERY_FW_ERR_START_OFFSET); + MTHCA_GET(dev->catas_err.size, outbox, QUERY_FW_ERR_SIZE_OFFSET); + + mthca_dbg(dev, "Catastrophic error buffer at 0x%llx, size 0x%x\n", + (unsigned long long) dev->catas_err.addr, dev->catas_err.size); + + MTHCA_GET(tmp, outbox, QUERY_FW_CMD_DB_EN_OFFSET); + if (tmp & 0x1) { + mthca_dbg(dev, "FW supports commands through doorbells\n"); + + MTHCA_GET(base, outbox, QUERY_FW_CMD_DB_BASE); + for (i = 0; i < MTHCA_CMD_NUM_DBELL_DWORDS; ++i) + MTHCA_GET(dev->cmd.dbell_offsets[i], outbox, + QUERY_FW_CMD_DB_OFFSET + (i << 1)); + + mthca_setup_cmd_doorbells(dev, base); + } + + if (mthca_is_memfree(dev)) { + MTHCA_GET(dev->fw.arbel.fw_pages, outbox, QUERY_FW_SIZE_OFFSET); + MTHCA_GET(dev->fw.arbel.clr_int_base, outbox, QUERY_FW_CLR_INT_BASE_OFFSET); + MTHCA_GET(dev->fw.arbel.eq_arm_base, outbox, QUERY_FW_EQ_ARM_BASE_OFFSET); + MTHCA_GET(dev->fw.arbel.eq_set_ci_base, outbox, QUERY_FW_EQ_SET_CI_BASE_OFFSET); + mthca_dbg(dev, "FW size %d KB\n", dev->fw.arbel.fw_pages << 2); + + /* + * Round up number of system pages needed in case + * MTHCA_ICM_PAGE_SIZE < PAGE_SIZE. 
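+ * (fw_pages is reported by the firmware in ICM pages and is converted
+ * here to units of the kernel's PAGE_SIZE.)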
+ */ + dev->fw.arbel.fw_pages = + ALIGN(dev->fw.arbel.fw_pages, PAGE_SIZE / MTHCA_ICM_PAGE_SIZE) >> + (PAGE_SHIFT - MTHCA_ICM_PAGE_SHIFT); + + mthca_dbg(dev, "Clear int @ %llx, EQ arm @ %llx, EQ set CI @ %llx\n", + (unsigned long long) dev->fw.arbel.clr_int_base, + (unsigned long long) dev->fw.arbel.eq_arm_base, + (unsigned long long) dev->fw.arbel.eq_set_ci_base); + } else { + MTHCA_GET(dev->fw.tavor.fw_start, outbox, QUERY_FW_START_OFFSET); + MTHCA_GET(dev->fw.tavor.fw_end, outbox, QUERY_FW_END_OFFSET); + + mthca_dbg(dev, "FW size %d KB (start %llx, end %llx)\n", + (int) ((dev->fw.tavor.fw_end - dev->fw.tavor.fw_start) >> 10), + (unsigned long long) dev->fw.tavor.fw_start, + (unsigned long long) dev->fw.tavor.fw_end); + } + +out: + mthca_free_mailbox(dev, mailbox); + return err; +} + +int mthca_ENABLE_LAM(struct mthca_dev *dev) +{ + struct mthca_mailbox *mailbox; + u8 info; + u32 *outbox; + int err = 0; + +#define ENABLE_LAM_OUT_SIZE 0x100 +#define ENABLE_LAM_START_OFFSET 0x00 +#define ENABLE_LAM_END_OFFSET 0x08 +#define ENABLE_LAM_INFO_OFFSET 0x13 + +#define ENABLE_LAM_INFO_HIDDEN_FLAG (1 << 4) +#define ENABLE_LAM_INFO_ECC_MASK 0x3 + + mailbox = mthca_alloc_mailbox(dev, GFP_KERNEL); + if (IS_ERR(mailbox)) + return PTR_ERR(mailbox); + outbox = mailbox->buf; + + err = mthca_cmd_box(dev, 0, mailbox->dma, 0, 0, CMD_ENABLE_LAM, + CMD_TIME_CLASS_C); + + if (err) + goto out; + + MTHCA_GET(dev->ddr_start, outbox, ENABLE_LAM_START_OFFSET); + MTHCA_GET(dev->ddr_end, outbox, ENABLE_LAM_END_OFFSET); + MTHCA_GET(info, outbox, ENABLE_LAM_INFO_OFFSET); + + if (!!(info & ENABLE_LAM_INFO_HIDDEN_FLAG) != + !!(dev->mthca_flags & MTHCA_FLAG_DDR_HIDDEN)) { + mthca_info(dev, "FW reports that HCA-attached memory " + "is %s hidden; does not match PCI config\n", + (info & ENABLE_LAM_INFO_HIDDEN_FLAG) ? + "" : "not"); + } + if (info & ENABLE_LAM_INFO_HIDDEN_FLAG) + mthca_dbg(dev, "HCA-attached memory is hidden.\n"); + + mthca_dbg(dev, "HCA memory size %d KB (start %llx, end %llx)\n", + (int) ((dev->ddr_end - dev->ddr_start) >> 10), + (unsigned long long) dev->ddr_start, + (unsigned long long) dev->ddr_end); + +out: + mthca_free_mailbox(dev, mailbox); + return err; +} + +int mthca_DISABLE_LAM(struct mthca_dev *dev) +{ + return mthca_cmd(dev, 0, 0, 0, CMD_SYS_DIS, CMD_TIME_CLASS_C); +} + +int mthca_QUERY_DDR(struct mthca_dev *dev) +{ + struct mthca_mailbox *mailbox; + u8 info; + u32 *outbox; + int err = 0; + +#define QUERY_DDR_OUT_SIZE 0x100 +#define QUERY_DDR_START_OFFSET 0x00 +#define QUERY_DDR_END_OFFSET 0x08 +#define QUERY_DDR_INFO_OFFSET 0x13 + +#define QUERY_DDR_INFO_HIDDEN_FLAG (1 << 4) +#define QUERY_DDR_INFO_ECC_MASK 0x3 + + mailbox = mthca_alloc_mailbox(dev, GFP_KERNEL); + if (IS_ERR(mailbox)) + return PTR_ERR(mailbox); + outbox = mailbox->buf; + + err = mthca_cmd_box(dev, 0, mailbox->dma, 0, 0, CMD_QUERY_DDR, + CMD_TIME_CLASS_A); + + if (err) + goto out; + + MTHCA_GET(dev->ddr_start, outbox, QUERY_DDR_START_OFFSET); + MTHCA_GET(dev->ddr_end, outbox, QUERY_DDR_END_OFFSET); + MTHCA_GET(info, outbox, QUERY_DDR_INFO_OFFSET); + + if (!!(info & QUERY_DDR_INFO_HIDDEN_FLAG) != + !!(dev->mthca_flags & MTHCA_FLAG_DDR_HIDDEN)) { + mthca_info(dev, "FW reports that HCA-attached memory " + "is %s hidden; does not match PCI config\n", + (info & QUERY_DDR_INFO_HIDDEN_FLAG) ? 
+ "" : "not"); + } + if (info & QUERY_DDR_INFO_HIDDEN_FLAG) + mthca_dbg(dev, "HCA-attached memory is hidden.\n"); + + mthca_dbg(dev, "HCA memory size %d KB (start %llx, end %llx)\n", + (int) ((dev->ddr_end - dev->ddr_start) >> 10), + (unsigned long long) dev->ddr_start, + (unsigned long long) dev->ddr_end); + +out: + mthca_free_mailbox(dev, mailbox); + return err; +} + +int mthca_QUERY_DEV_LIM(struct mthca_dev *dev, + struct mthca_dev_lim *dev_lim) +{ + struct mthca_mailbox *mailbox; + u32 *outbox; + u8 field; + u16 size; + u16 stat_rate; + int err; + +#define QUERY_DEV_LIM_OUT_SIZE 0x100 +#define QUERY_DEV_LIM_MAX_SRQ_SZ_OFFSET 0x10 +#define QUERY_DEV_LIM_MAX_QP_SZ_OFFSET 0x11 +#define QUERY_DEV_LIM_RSVD_QP_OFFSET 0x12 +#define QUERY_DEV_LIM_MAX_QP_OFFSET 0x13 +#define QUERY_DEV_LIM_RSVD_SRQ_OFFSET 0x14 +#define QUERY_DEV_LIM_MAX_SRQ_OFFSET 0x15 +#define QUERY_DEV_LIM_RSVD_EEC_OFFSET 0x16 +#define QUERY_DEV_LIM_MAX_EEC_OFFSET 0x17 +#define QUERY_DEV_LIM_MAX_CQ_SZ_OFFSET 0x19 +#define QUERY_DEV_LIM_RSVD_CQ_OFFSET 0x1a +#define QUERY_DEV_LIM_MAX_CQ_OFFSET 0x1b +#define QUERY_DEV_LIM_MAX_MPT_OFFSET 0x1d +#define QUERY_DEV_LIM_RSVD_EQ_OFFSET 0x1e +#define QUERY_DEV_LIM_MAX_EQ_OFFSET 0x1f +#define QUERY_DEV_LIM_RSVD_MTT_OFFSET 0x20 +#define QUERY_DEV_LIM_MAX_MRW_SZ_OFFSET 0x21 +#define QUERY_DEV_LIM_RSVD_MRW_OFFSET 0x22 +#define QUERY_DEV_LIM_MAX_MTT_SEG_OFFSET 0x23 +#define QUERY_DEV_LIM_MAX_AV_OFFSET 0x27 +#define QUERY_DEV_LIM_MAX_REQ_QP_OFFSET 0x29 +#define QUERY_DEV_LIM_MAX_RES_QP_OFFSET 0x2b +#define QUERY_DEV_LIM_MAX_RDMA_OFFSET 0x2f +#define QUERY_DEV_LIM_RSZ_SRQ_OFFSET 0x33 +#define QUERY_DEV_LIM_ACK_DELAY_OFFSET 0x35 +#define QUERY_DEV_LIM_MTU_WIDTH_OFFSET 0x36 +#define QUERY_DEV_LIM_VL_PORT_OFFSET 0x37 +#define QUERY_DEV_LIM_MAX_GID_OFFSET 0x3b +#define QUERY_DEV_LIM_RATE_SUPPORT_OFFSET 0x3c +#define QUERY_DEV_LIM_MAX_PKEY_OFFSET 0x3f +#define QUERY_DEV_LIM_FLAGS_OFFSET 0x44 +#define QUERY_DEV_LIM_RSVD_UAR_OFFSET 0x48 +#define QUERY_DEV_LIM_UAR_SZ_OFFSET 0x49 +#define QUERY_DEV_LIM_PAGE_SZ_OFFSET 0x4b +#define QUERY_DEV_LIM_MAX_SG_OFFSET 0x51 +#define QUERY_DEV_LIM_MAX_DESC_SZ_OFFSET 0x52 +#define QUERY_DEV_LIM_MAX_SG_RQ_OFFSET 0x55 +#define QUERY_DEV_LIM_MAX_DESC_SZ_RQ_OFFSET 0x56 +#define QUERY_DEV_LIM_MAX_QP_MCG_OFFSET 0x61 +#define QUERY_DEV_LIM_RSVD_MCG_OFFSET 0x62 +#define QUERY_DEV_LIM_MAX_MCG_OFFSET 0x63 +#define QUERY_DEV_LIM_RSVD_PD_OFFSET 0x64 +#define QUERY_DEV_LIM_MAX_PD_OFFSET 0x65 +#define QUERY_DEV_LIM_RSVD_RDD_OFFSET 0x66 +#define QUERY_DEV_LIM_MAX_RDD_OFFSET 0x67 +#define QUERY_DEV_LIM_EEC_ENTRY_SZ_OFFSET 0x80 +#define QUERY_DEV_LIM_QPC_ENTRY_SZ_OFFSET 0x82 +#define QUERY_DEV_LIM_EEEC_ENTRY_SZ_OFFSET 0x84 +#define QUERY_DEV_LIM_EQPC_ENTRY_SZ_OFFSET 0x86 +#define QUERY_DEV_LIM_EQC_ENTRY_SZ_OFFSET 0x88 +#define QUERY_DEV_LIM_CQC_ENTRY_SZ_OFFSET 0x8a +#define QUERY_DEV_LIM_SRQ_ENTRY_SZ_OFFSET 0x8c +#define QUERY_DEV_LIM_UAR_ENTRY_SZ_OFFSET 0x8e +#define QUERY_DEV_LIM_MTT_ENTRY_SZ_OFFSET 0x90 +#define QUERY_DEV_LIM_MPT_ENTRY_SZ_OFFSET 0x92 +#define QUERY_DEV_LIM_PBL_SZ_OFFSET 0x96 +#define QUERY_DEV_LIM_BMME_FLAGS_OFFSET 0x97 +#define QUERY_DEV_LIM_RSVD_LKEY_OFFSET 0x98 +#define QUERY_DEV_LIM_LAMR_OFFSET 0x9f +#define QUERY_DEV_LIM_MAX_ICM_SZ_OFFSET 0xa0 + + mailbox = mthca_alloc_mailbox(dev, GFP_KERNEL); + if (IS_ERR(mailbox)) + return PTR_ERR(mailbox); + outbox = mailbox->buf; + + err = mthca_cmd_box(dev, 0, mailbox->dma, 0, 0, CMD_QUERY_DEV_LIM, + CMD_TIME_CLASS_A); + + if (err) + goto out; + + MTHCA_GET(field, outbox, QUERY_DEV_LIM_RSVD_QP_OFFSET); + 
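+ /* Most limits below are returned by the firmware as log2 values and are expanded to linear counts here. */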
dev_lim->reserved_qps = 1 << (field & 0xf); + MTHCA_GET(field, outbox, QUERY_DEV_LIM_MAX_QP_OFFSET); + dev_lim->max_qps = 1 << (field & 0x1f); + MTHCA_GET(field, outbox, QUERY_DEV_LIM_RSVD_SRQ_OFFSET); + dev_lim->reserved_srqs = 1 << (field >> 4); + MTHCA_GET(field, outbox, QUERY_DEV_LIM_MAX_SRQ_OFFSET); + dev_lim->max_srqs = 1 << (field & 0x1f); + MTHCA_GET(field, outbox, QUERY_DEV_LIM_RSVD_EEC_OFFSET); + dev_lim->reserved_eecs = 1 << (field & 0xf); + MTHCA_GET(field, outbox, QUERY_DEV_LIM_MAX_EEC_OFFSET); + dev_lim->max_eecs = 1 << (field & 0x1f); + MTHCA_GET(field, outbox, QUERY_DEV_LIM_MAX_CQ_SZ_OFFSET); + dev_lim->max_cq_sz = 1 << field; + MTHCA_GET(field, outbox, QUERY_DEV_LIM_RSVD_CQ_OFFSET); + dev_lim->reserved_cqs = 1 << (field & 0xf); + MTHCA_GET(field, outbox, QUERY_DEV_LIM_MAX_CQ_OFFSET); + dev_lim->max_cqs = 1 << (field & 0x1f); + MTHCA_GET(field, outbox, QUERY_DEV_LIM_MAX_MPT_OFFSET); + dev_lim->max_mpts = 1 << (field & 0x3f); + MTHCA_GET(field, outbox, QUERY_DEV_LIM_RSVD_EQ_OFFSET); + dev_lim->reserved_eqs = 1 << (field & 0xf); + MTHCA_GET(field, outbox, QUERY_DEV_LIM_MAX_EQ_OFFSET); + dev_lim->max_eqs = 1 << (field & 0x7); + MTHCA_GET(field, outbox, QUERY_DEV_LIM_RSVD_MTT_OFFSET); + if (mthca_is_memfree(dev)) + dev_lim->reserved_mtts = ALIGN((1 << (field >> 4)) * sizeof(u64), + dev->limits.mtt_seg_size) / dev->limits.mtt_seg_size; + else + dev_lim->reserved_mtts = 1 << (field >> 4); + MTHCA_GET(field, outbox, QUERY_DEV_LIM_MAX_MRW_SZ_OFFSET); + dev_lim->max_mrw_sz = 1 << field; + MTHCA_GET(field, outbox, QUERY_DEV_LIM_RSVD_MRW_OFFSET); + dev_lim->reserved_mrws = 1 << (field & 0xf); + MTHCA_GET(field, outbox, QUERY_DEV_LIM_MAX_MTT_SEG_OFFSET); + dev_lim->max_mtt_seg = 1 << (field & 0x3f); + MTHCA_GET(field, outbox, QUERY_DEV_LIM_MAX_REQ_QP_OFFSET); + dev_lim->max_requester_per_qp = 1 << (field & 0x3f); + MTHCA_GET(field, outbox, QUERY_DEV_LIM_MAX_RES_QP_OFFSET); + dev_lim->max_responder_per_qp = 1 << (field & 0x3f); + MTHCA_GET(field, outbox, QUERY_DEV_LIM_MAX_RDMA_OFFSET); + dev_lim->max_rdma_global = 1 << (field & 0x3f); + MTHCA_GET(field, outbox, QUERY_DEV_LIM_ACK_DELAY_OFFSET); + dev_lim->local_ca_ack_delay = field & 0x1f; + MTHCA_GET(field, outbox, QUERY_DEV_LIM_MTU_WIDTH_OFFSET); + dev_lim->max_mtu = field >> 4; + dev_lim->max_port_width = field & 0xf; + MTHCA_GET(field, outbox, QUERY_DEV_LIM_VL_PORT_OFFSET); + dev_lim->max_vl = field >> 4; + dev_lim->num_ports = field & 0xf; + MTHCA_GET(field, outbox, QUERY_DEV_LIM_MAX_GID_OFFSET); + dev_lim->max_gids = 1 << (field & 0xf); + MTHCA_GET(stat_rate, outbox, QUERY_DEV_LIM_RATE_SUPPORT_OFFSET); + dev_lim->stat_rate_support = stat_rate; + MTHCA_GET(field, outbox, QUERY_DEV_LIM_MAX_PKEY_OFFSET); + dev_lim->max_pkeys = 1 << (field & 0xf); + MTHCA_GET(dev_lim->flags, outbox, QUERY_DEV_LIM_FLAGS_OFFSET); + MTHCA_GET(field, outbox, QUERY_DEV_LIM_RSVD_UAR_OFFSET); + dev_lim->reserved_uars = field >> 4; + MTHCA_GET(field, outbox, QUERY_DEV_LIM_UAR_SZ_OFFSET); + dev_lim->uar_size = 1 << ((field & 0x3f) + 20); + MTHCA_GET(field, outbox, QUERY_DEV_LIM_PAGE_SZ_OFFSET); + dev_lim->min_page_sz = 1 << field; + MTHCA_GET(field, outbox, QUERY_DEV_LIM_MAX_SG_OFFSET); + dev_lim->max_sg = field; + + MTHCA_GET(size, outbox, QUERY_DEV_LIM_MAX_DESC_SZ_OFFSET); + dev_lim->max_desc_sz = size; + + MTHCA_GET(field, outbox, QUERY_DEV_LIM_MAX_QP_MCG_OFFSET); + dev_lim->max_qp_per_mcg = 1 << field; + MTHCA_GET(field, outbox, QUERY_DEV_LIM_RSVD_MCG_OFFSET); + dev_lim->reserved_mgms = field & 0xf; + MTHCA_GET(field, outbox, QUERY_DEV_LIM_MAX_MCG_OFFSET); 
+ dev_lim->max_mcgs = 1 << field; + MTHCA_GET(field, outbox, QUERY_DEV_LIM_RSVD_PD_OFFSET); + dev_lim->reserved_pds = field >> 4; + MTHCA_GET(field, outbox, QUERY_DEV_LIM_MAX_PD_OFFSET); + dev_lim->max_pds = 1 << (field & 0x3f); + MTHCA_GET(field, outbox, QUERY_DEV_LIM_RSVD_RDD_OFFSET); + dev_lim->reserved_rdds = field >> 4; + MTHCA_GET(field, outbox, QUERY_DEV_LIM_MAX_RDD_OFFSET); + dev_lim->max_rdds = 1 << (field & 0x3f); + + MTHCA_GET(size, outbox, QUERY_DEV_LIM_EEC_ENTRY_SZ_OFFSET); + dev_lim->eec_entry_sz = size; + MTHCA_GET(size, outbox, QUERY_DEV_LIM_QPC_ENTRY_SZ_OFFSET); + dev_lim->qpc_entry_sz = size; + MTHCA_GET(size, outbox, QUERY_DEV_LIM_EEEC_ENTRY_SZ_OFFSET); + dev_lim->eeec_entry_sz = size; + MTHCA_GET(size, outbox, QUERY_DEV_LIM_EQPC_ENTRY_SZ_OFFSET); + dev_lim->eqpc_entry_sz = size; + MTHCA_GET(size, outbox, QUERY_DEV_LIM_EQC_ENTRY_SZ_OFFSET); + dev_lim->eqc_entry_sz = size; + MTHCA_GET(size, outbox, QUERY_DEV_LIM_CQC_ENTRY_SZ_OFFSET); + dev_lim->cqc_entry_sz = size; + MTHCA_GET(size, outbox, QUERY_DEV_LIM_SRQ_ENTRY_SZ_OFFSET); + dev_lim->srq_entry_sz = size; + MTHCA_GET(size, outbox, QUERY_DEV_LIM_UAR_ENTRY_SZ_OFFSET); + dev_lim->uar_scratch_entry_sz = size; + + if (mthca_is_memfree(dev)) { + MTHCA_GET(field, outbox, QUERY_DEV_LIM_MAX_SRQ_SZ_OFFSET); + dev_lim->max_srq_sz = 1 << field; + MTHCA_GET(field, outbox, QUERY_DEV_LIM_MAX_QP_SZ_OFFSET); + dev_lim->max_qp_sz = 1 << field; + MTHCA_GET(field, outbox, QUERY_DEV_LIM_RSZ_SRQ_OFFSET); + dev_lim->hca.arbel.resize_srq = field & 1; + MTHCA_GET(field, outbox, QUERY_DEV_LIM_MAX_SG_RQ_OFFSET); + dev_lim->max_sg = min_t(int, field, dev_lim->max_sg); + MTHCA_GET(size, outbox, QUERY_DEV_LIM_MAX_DESC_SZ_RQ_OFFSET); + dev_lim->max_desc_sz = min_t(int, size, dev_lim->max_desc_sz); + MTHCA_GET(size, outbox, QUERY_DEV_LIM_MPT_ENTRY_SZ_OFFSET); + dev_lim->mpt_entry_sz = size; + MTHCA_GET(field, outbox, QUERY_DEV_LIM_PBL_SZ_OFFSET); + dev_lim->hca.arbel.max_pbl_sz = 1 << (field & 0x3f); + MTHCA_GET(dev_lim->hca.arbel.bmme_flags, outbox, + QUERY_DEV_LIM_BMME_FLAGS_OFFSET); + MTHCA_GET(dev_lim->hca.arbel.reserved_lkey, outbox, + QUERY_DEV_LIM_RSVD_LKEY_OFFSET); + MTHCA_GET(field, outbox, QUERY_DEV_LIM_LAMR_OFFSET); + dev_lim->hca.arbel.lam_required = field & 1; + MTHCA_GET(dev_lim->hca.arbel.max_icm_sz, outbox, + QUERY_DEV_LIM_MAX_ICM_SZ_OFFSET); + + if (dev_lim->hca.arbel.bmme_flags & 1) + mthca_dbg(dev, "Base MM extensions: yes " + "(flags %d, max PBL %d, rsvd L_Key %08x)\n", + dev_lim->hca.arbel.bmme_flags, + dev_lim->hca.arbel.max_pbl_sz, + dev_lim->hca.arbel.reserved_lkey); + else + mthca_dbg(dev, "Base MM extensions: no\n"); + + mthca_dbg(dev, "Max ICM size %lld MB\n", + (unsigned long long) dev_lim->hca.arbel.max_icm_sz >> 20); + } else { + MTHCA_GET(field, outbox, QUERY_DEV_LIM_MAX_SRQ_SZ_OFFSET); + dev_lim->max_srq_sz = (1 << field) - 1; + MTHCA_GET(field, outbox, QUERY_DEV_LIM_MAX_QP_SZ_OFFSET); + dev_lim->max_qp_sz = (1 << field) - 1; + MTHCA_GET(field, outbox, QUERY_DEV_LIM_MAX_AV_OFFSET); + dev_lim->hca.tavor.max_avs = 1 << (field & 0x3f); + dev_lim->mpt_entry_sz = MTHCA_MPT_ENTRY_SIZE; + } + + mthca_dbg(dev, "Max QPs: %d, reserved QPs: %d, entry size: %d\n", + dev_lim->max_qps, dev_lim->reserved_qps, dev_lim->qpc_entry_sz); + mthca_dbg(dev, "Max SRQs: %d, reserved SRQs: %d, entry size: %d\n", + dev_lim->max_srqs, dev_lim->reserved_srqs, dev_lim->srq_entry_sz); + mthca_dbg(dev, "Max CQs: %d, reserved CQs: %d, entry size: %d\n", + dev_lim->max_cqs, dev_lim->reserved_cqs, dev_lim->cqc_entry_sz); + mthca_dbg(dev, "Max EQs: %d, 
reserved EQs: %d, entry size: %d\n", + dev_lim->max_eqs, dev_lim->reserved_eqs, dev_lim->eqc_entry_sz); + mthca_dbg(dev, "reserved MPTs: %d, reserved MTTs: %d\n", + dev_lim->reserved_mrws, dev_lim->reserved_mtts); + mthca_dbg(dev, "Max PDs: %d, reserved PDs: %d, reserved UARs: %d\n", + dev_lim->max_pds, dev_lim->reserved_pds, dev_lim->reserved_uars); + mthca_dbg(dev, "Max QP/MCG: %d, reserved MGMs: %d\n", + dev_lim->max_pds, dev_lim->reserved_mgms); + mthca_dbg(dev, "Max CQEs: %d, max WQEs: %d, max SRQ WQEs: %d\n", + dev_lim->max_cq_sz, dev_lim->max_qp_sz, dev_lim->max_srq_sz); + + mthca_dbg(dev, "Flags: %08x\n", dev_lim->flags); + +out: + mthca_free_mailbox(dev, mailbox); + return err; +} + +static void get_board_id(void *vsd, char *board_id) +{ + int i; + +#define VSD_OFFSET_SIG1 0x00 +#define VSD_OFFSET_SIG2 0xde +#define VSD_OFFSET_MLX_BOARD_ID 0xd0 +#define VSD_OFFSET_TS_BOARD_ID 0x20 + +#define VSD_SIGNATURE_TOPSPIN 0x5ad + + memset(board_id, 0, MTHCA_BOARD_ID_LEN); + + if (be16_to_cpup(vsd + VSD_OFFSET_SIG1) == VSD_SIGNATURE_TOPSPIN && + be16_to_cpup(vsd + VSD_OFFSET_SIG2) == VSD_SIGNATURE_TOPSPIN) { + strlcpy(board_id, vsd + VSD_OFFSET_TS_BOARD_ID, MTHCA_BOARD_ID_LEN); + } else { + /* + * The board ID is a string but the firmware byte + * swaps each 4-byte word before passing it back to + * us. Therefore we need to swab it before printing. + */ + for (i = 0; i < 4; ++i) + ((u32 *) board_id)[i] = + swab32(*(u32 *) (vsd + VSD_OFFSET_MLX_BOARD_ID + i * 4)); + } +} + +int mthca_QUERY_ADAPTER(struct mthca_dev *dev, + struct mthca_adapter *adapter) +{ + struct mthca_mailbox *mailbox; + u32 *outbox; + int err; + +#define QUERY_ADAPTER_OUT_SIZE 0x100 +#define QUERY_ADAPTER_VENDOR_ID_OFFSET 0x00 +#define QUERY_ADAPTER_DEVICE_ID_OFFSET 0x04 +#define QUERY_ADAPTER_REVISION_ID_OFFSET 0x08 +#define QUERY_ADAPTER_INTA_PIN_OFFSET 0x10 +#define QUERY_ADAPTER_VSD_OFFSET 0x20 + + mailbox = mthca_alloc_mailbox(dev, GFP_KERNEL); + if (IS_ERR(mailbox)) + return PTR_ERR(mailbox); + outbox = mailbox->buf; + + err = mthca_cmd_box(dev, 0, mailbox->dma, 0, 0, CMD_QUERY_ADAPTER, + CMD_TIME_CLASS_A); + + if (err) + goto out; + + if (!mthca_is_memfree(dev)) { + MTHCA_GET(adapter->vendor_id, outbox, + QUERY_ADAPTER_VENDOR_ID_OFFSET); + MTHCA_GET(adapter->device_id, outbox, + QUERY_ADAPTER_DEVICE_ID_OFFSET); + MTHCA_GET(adapter->revision_id, outbox, + QUERY_ADAPTER_REVISION_ID_OFFSET); + } + MTHCA_GET(adapter->inta_pin, outbox, QUERY_ADAPTER_INTA_PIN_OFFSET); + + get_board_id(outbox + QUERY_ADAPTER_VSD_OFFSET / 4, + adapter->board_id); + +out: + mthca_free_mailbox(dev, mailbox); + return err; +} + +int mthca_INIT_HCA(struct mthca_dev *dev, + struct mthca_init_hca_param *param) +{ + struct mthca_mailbox *mailbox; + __be32 *inbox; + int err; + +#define INIT_HCA_IN_SIZE 0x200 +#define INIT_HCA_FLAGS1_OFFSET 0x00c +#define INIT_HCA_FLAGS2_OFFSET 0x014 +#define INIT_HCA_QPC_OFFSET 0x020 +#define INIT_HCA_QPC_BASE_OFFSET (INIT_HCA_QPC_OFFSET + 0x10) +#define INIT_HCA_LOG_QP_OFFSET (INIT_HCA_QPC_OFFSET + 0x17) +#define INIT_HCA_EEC_BASE_OFFSET (INIT_HCA_QPC_OFFSET + 0x20) +#define INIT_HCA_LOG_EEC_OFFSET (INIT_HCA_QPC_OFFSET + 0x27) +#define INIT_HCA_SRQC_BASE_OFFSET (INIT_HCA_QPC_OFFSET + 0x28) +#define INIT_HCA_LOG_SRQ_OFFSET (INIT_HCA_QPC_OFFSET + 0x2f) +#define INIT_HCA_CQC_BASE_OFFSET (INIT_HCA_QPC_OFFSET + 0x30) +#define INIT_HCA_LOG_CQ_OFFSET (INIT_HCA_QPC_OFFSET + 0x37) +#define INIT_HCA_EQPC_BASE_OFFSET (INIT_HCA_QPC_OFFSET + 0x40) +#define INIT_HCA_EEEC_BASE_OFFSET (INIT_HCA_QPC_OFFSET + 0x50) +#define 
INIT_HCA_EQC_BASE_OFFSET (INIT_HCA_QPC_OFFSET + 0x60) +#define INIT_HCA_LOG_EQ_OFFSET (INIT_HCA_QPC_OFFSET + 0x67) +#define INIT_HCA_RDB_BASE_OFFSET (INIT_HCA_QPC_OFFSET + 0x70) +#define INIT_HCA_UDAV_OFFSET 0x0b0 +#define INIT_HCA_UDAV_LKEY_OFFSET (INIT_HCA_UDAV_OFFSET + 0x0) +#define INIT_HCA_UDAV_PD_OFFSET (INIT_HCA_UDAV_OFFSET + 0x4) +#define INIT_HCA_MCAST_OFFSET 0x0c0 +#define INIT_HCA_MC_BASE_OFFSET (INIT_HCA_MCAST_OFFSET + 0x00) +#define INIT_HCA_LOG_MC_ENTRY_SZ_OFFSET (INIT_HCA_MCAST_OFFSET + 0x12) +#define INIT_HCA_MC_HASH_SZ_OFFSET (INIT_HCA_MCAST_OFFSET + 0x16) +#define INIT_HCA_LOG_MC_TABLE_SZ_OFFSET (INIT_HCA_MCAST_OFFSET + 0x1b) +#define INIT_HCA_TPT_OFFSET 0x0f0 +#define INIT_HCA_MPT_BASE_OFFSET (INIT_HCA_TPT_OFFSET + 0x00) +#define INIT_HCA_MTT_SEG_SZ_OFFSET (INIT_HCA_TPT_OFFSET + 0x09) +#define INIT_HCA_LOG_MPT_SZ_OFFSET (INIT_HCA_TPT_OFFSET + 0x0b) +#define INIT_HCA_MTT_BASE_OFFSET (INIT_HCA_TPT_OFFSET + 0x10) +#define INIT_HCA_UAR_OFFSET 0x120 +#define INIT_HCA_UAR_BASE_OFFSET (INIT_HCA_UAR_OFFSET + 0x00) +#define INIT_HCA_UARC_SZ_OFFSET (INIT_HCA_UAR_OFFSET + 0x09) +#define INIT_HCA_LOG_UAR_SZ_OFFSET (INIT_HCA_UAR_OFFSET + 0x0a) +#define INIT_HCA_UAR_PAGE_SZ_OFFSET (INIT_HCA_UAR_OFFSET + 0x0b) +#define INIT_HCA_UAR_SCATCH_BASE_OFFSET (INIT_HCA_UAR_OFFSET + 0x10) +#define INIT_HCA_UAR_CTX_BASE_OFFSET (INIT_HCA_UAR_OFFSET + 0x18) + + mailbox = mthca_alloc_mailbox(dev, GFP_KERNEL); + if (IS_ERR(mailbox)) + return PTR_ERR(mailbox); + inbox = mailbox->buf; + + memset(inbox, 0, INIT_HCA_IN_SIZE); + + if (dev->mthca_flags & MTHCA_FLAG_SINAI_OPT) + MTHCA_PUT(inbox, 0x1, INIT_HCA_FLAGS1_OFFSET); + +#if defined(__LITTLE_ENDIAN) + *(inbox + INIT_HCA_FLAGS2_OFFSET / 4) &= ~cpu_to_be32(1 << 1); +#elif defined(__BIG_ENDIAN) + *(inbox + INIT_HCA_FLAGS2_OFFSET / 4) |= cpu_to_be32(1 << 1); +#else +#error Host endianness not defined +#endif + /* Check port for UD address vector: */ + *(inbox + INIT_HCA_FLAGS2_OFFSET / 4) |= cpu_to_be32(1); + + /* Enable IPoIB checksumming if we can: */ + if (dev->device_cap_flags & IB_DEVICE_UD_IP_CSUM) + *(inbox + INIT_HCA_FLAGS2_OFFSET / 4) |= cpu_to_be32(7 << 3); + + /* We leave wqe_quota, responder_exu, etc as 0 (default) */ + + /* QPC/EEC/CQC/EQC/RDB attributes */ + + MTHCA_PUT(inbox, param->qpc_base, INIT_HCA_QPC_BASE_OFFSET); + MTHCA_PUT(inbox, param->log_num_qps, INIT_HCA_LOG_QP_OFFSET); + MTHCA_PUT(inbox, param->eec_base, INIT_HCA_EEC_BASE_OFFSET); + MTHCA_PUT(inbox, param->log_num_eecs, INIT_HCA_LOG_EEC_OFFSET); + MTHCA_PUT(inbox, param->srqc_base, INIT_HCA_SRQC_BASE_OFFSET); + MTHCA_PUT(inbox, param->log_num_srqs, INIT_HCA_LOG_SRQ_OFFSET); + MTHCA_PUT(inbox, param->cqc_base, INIT_HCA_CQC_BASE_OFFSET); + MTHCA_PUT(inbox, param->log_num_cqs, INIT_HCA_LOG_CQ_OFFSET); + MTHCA_PUT(inbox, param->eqpc_base, INIT_HCA_EQPC_BASE_OFFSET); + MTHCA_PUT(inbox, param->eeec_base, INIT_HCA_EEEC_BASE_OFFSET); + MTHCA_PUT(inbox, param->eqc_base, INIT_HCA_EQC_BASE_OFFSET); + MTHCA_PUT(inbox, param->log_num_eqs, INIT_HCA_LOG_EQ_OFFSET); + MTHCA_PUT(inbox, param->rdb_base, INIT_HCA_RDB_BASE_OFFSET); + + /* UD AV attributes */ + + /* multicast attributes */ + + MTHCA_PUT(inbox, param->mc_base, INIT_HCA_MC_BASE_OFFSET); + MTHCA_PUT(inbox, param->log_mc_entry_sz, INIT_HCA_LOG_MC_ENTRY_SZ_OFFSET); + MTHCA_PUT(inbox, param->mc_hash_sz, INIT_HCA_MC_HASH_SZ_OFFSET); + MTHCA_PUT(inbox, param->log_mc_table_sz, INIT_HCA_LOG_MC_TABLE_SZ_OFFSET); + + /* TPT attributes */ + + MTHCA_PUT(inbox, param->mpt_base, INIT_HCA_MPT_BASE_OFFSET); + if (!mthca_is_memfree(dev)) + 
MTHCA_PUT(inbox, param->mtt_seg_sz, INIT_HCA_MTT_SEG_SZ_OFFSET); + MTHCA_PUT(inbox, param->log_mpt_sz, INIT_HCA_LOG_MPT_SZ_OFFSET); + MTHCA_PUT(inbox, param->mtt_base, INIT_HCA_MTT_BASE_OFFSET); + + /* UAR attributes */ + { + u8 uar_page_sz = PAGE_SHIFT - 12; + MTHCA_PUT(inbox, uar_page_sz, INIT_HCA_UAR_PAGE_SZ_OFFSET); + } + + MTHCA_PUT(inbox, param->uar_scratch_base, INIT_HCA_UAR_SCATCH_BASE_OFFSET); + + if (mthca_is_memfree(dev)) { + MTHCA_PUT(inbox, param->log_uarc_sz, INIT_HCA_UARC_SZ_OFFSET); + MTHCA_PUT(inbox, param->log_uar_sz, INIT_HCA_LOG_UAR_SZ_OFFSET); + MTHCA_PUT(inbox, param->uarc_base, INIT_HCA_UAR_CTX_BASE_OFFSET); + } + + err = mthca_cmd(dev, mailbox->dma, 0, 0, + CMD_INIT_HCA, CMD_TIME_CLASS_D); + + mthca_free_mailbox(dev, mailbox); + return err; +} + +int mthca_INIT_IB(struct mthca_dev *dev, + struct mthca_init_ib_param *param, + int port) +{ + struct mthca_mailbox *mailbox; + u32 *inbox; + int err; + u32 flags; + +#define INIT_IB_IN_SIZE 56 +#define INIT_IB_FLAGS_OFFSET 0x00 +#define INIT_IB_FLAG_SIG (1 << 18) +#define INIT_IB_FLAG_NG (1 << 17) +#define INIT_IB_FLAG_G0 (1 << 16) +#define INIT_IB_VL_SHIFT 4 +#define INIT_IB_PORT_WIDTH_SHIFT 8 +#define INIT_IB_MTU_SHIFT 12 +#define INIT_IB_MAX_GID_OFFSET 0x06 +#define INIT_IB_MAX_PKEY_OFFSET 0x0a +#define INIT_IB_GUID0_OFFSET 0x10 +#define INIT_IB_NODE_GUID_OFFSET 0x18 +#define INIT_IB_SI_GUID_OFFSET 0x20 + + mailbox = mthca_alloc_mailbox(dev, GFP_KERNEL); + if (IS_ERR(mailbox)) + return PTR_ERR(mailbox); + inbox = mailbox->buf; + + memset(inbox, 0, INIT_IB_IN_SIZE); + + flags = 0; + flags |= param->set_guid0 ? INIT_IB_FLAG_G0 : 0; + flags |= param->set_node_guid ? INIT_IB_FLAG_NG : 0; + flags |= param->set_si_guid ? INIT_IB_FLAG_SIG : 0; + flags |= param->vl_cap << INIT_IB_VL_SHIFT; + flags |= param->port_width << INIT_IB_PORT_WIDTH_SHIFT; + flags |= param->mtu_cap << INIT_IB_MTU_SHIFT; + MTHCA_PUT(inbox, flags, INIT_IB_FLAGS_OFFSET); + + MTHCA_PUT(inbox, param->gid_cap, INIT_IB_MAX_GID_OFFSET); + MTHCA_PUT(inbox, param->pkey_cap, INIT_IB_MAX_PKEY_OFFSET); + MTHCA_PUT(inbox, param->guid0, INIT_IB_GUID0_OFFSET); + MTHCA_PUT(inbox, param->node_guid, INIT_IB_NODE_GUID_OFFSET); + MTHCA_PUT(inbox, param->si_guid, INIT_IB_SI_GUID_OFFSET); + + err = mthca_cmd(dev, mailbox->dma, port, 0, CMD_INIT_IB, + CMD_TIME_CLASS_A); + + mthca_free_mailbox(dev, mailbox); + return err; +} + +int mthca_CLOSE_IB(struct mthca_dev *dev, int port) +{ + return mthca_cmd(dev, 0, port, 0, CMD_CLOSE_IB, CMD_TIME_CLASS_A); +} + +int mthca_CLOSE_HCA(struct mthca_dev *dev, int panic) +{ + return mthca_cmd(dev, 0, 0, panic, CMD_CLOSE_HCA, CMD_TIME_CLASS_C); +} + +int mthca_SET_IB(struct mthca_dev *dev, struct mthca_set_ib_param *param, + int port) +{ + struct mthca_mailbox *mailbox; + u32 *inbox; + int err; + u32 flags = 0; + +#define SET_IB_IN_SIZE 0x40 +#define SET_IB_FLAGS_OFFSET 0x00 +#define SET_IB_FLAG_SIG (1 << 18) +#define SET_IB_FLAG_RQK (1 << 0) +#define SET_IB_CAP_MASK_OFFSET 0x04 +#define SET_IB_SI_GUID_OFFSET 0x08 + + mailbox = mthca_alloc_mailbox(dev, GFP_KERNEL); + if (IS_ERR(mailbox)) + return PTR_ERR(mailbox); + inbox = mailbox->buf; + + memset(inbox, 0, SET_IB_IN_SIZE); + + flags |= param->set_si_guid ? SET_IB_FLAG_SIG : 0; + flags |= param->reset_qkey_viol ? 
SET_IB_FLAG_RQK : 0; + MTHCA_PUT(inbox, flags, SET_IB_FLAGS_OFFSET); + + MTHCA_PUT(inbox, param->cap_mask, SET_IB_CAP_MASK_OFFSET); + MTHCA_PUT(inbox, param->si_guid, SET_IB_SI_GUID_OFFSET); + + err = mthca_cmd(dev, mailbox->dma, port, 0, CMD_SET_IB, + CMD_TIME_CLASS_B); + + mthca_free_mailbox(dev, mailbox); + return err; +} + +int mthca_MAP_ICM(struct mthca_dev *dev, struct mthca_icm *icm, u64 virt) +{ + return mthca_map_cmd(dev, CMD_MAP_ICM, icm, virt); +} + +int mthca_MAP_ICM_page(struct mthca_dev *dev, u64 dma_addr, u64 virt) +{ + struct mthca_mailbox *mailbox; + __be64 *inbox; + int err; + + mailbox = mthca_alloc_mailbox(dev, GFP_KERNEL); + if (IS_ERR(mailbox)) + return PTR_ERR(mailbox); + inbox = mailbox->buf; + + inbox[0] = cpu_to_be64(virt); + inbox[1] = cpu_to_be64(dma_addr); + + err = mthca_cmd(dev, mailbox->dma, 1, 0, CMD_MAP_ICM, + CMD_TIME_CLASS_B); + + mthca_free_mailbox(dev, mailbox); + + if (!err) + mthca_dbg(dev, "Mapped page at %llx to %llx for ICM.\n", + (unsigned long long) dma_addr, (unsigned long long) virt); + + return err; +} + +int mthca_UNMAP_ICM(struct mthca_dev *dev, u64 virt, u32 page_count) +{ + mthca_dbg(dev, "Unmapping %d pages at %llx from ICM.\n", + page_count, (unsigned long long) virt); + + return mthca_cmd(dev, virt, page_count, 0, + CMD_UNMAP_ICM, CMD_TIME_CLASS_B); +} + +int mthca_MAP_ICM_AUX(struct mthca_dev *dev, struct mthca_icm *icm) +{ + return mthca_map_cmd(dev, CMD_MAP_ICM_AUX, icm, -1); +} + +int mthca_UNMAP_ICM_AUX(struct mthca_dev *dev) +{ + return mthca_cmd(dev, 0, 0, 0, CMD_UNMAP_ICM_AUX, CMD_TIME_CLASS_B); +} + +int mthca_SET_ICM_SIZE(struct mthca_dev *dev, u64 icm_size, u64 *aux_pages) +{ + int ret = mthca_cmd_imm(dev, icm_size, aux_pages, 0, + 0, CMD_SET_ICM_SIZE, CMD_TIME_CLASS_A); + + if (ret) + return ret; + + /* + * Round up number of system pages needed in case + * MTHCA_ICM_PAGE_SIZE < PAGE_SIZE. + */ + *aux_pages = ALIGN(*aux_pages, PAGE_SIZE / MTHCA_ICM_PAGE_SIZE) >> + (PAGE_SHIFT - MTHCA_ICM_PAGE_SHIFT); + + return 0; +} + +int mthca_SW2HW_MPT(struct mthca_dev *dev, struct mthca_mailbox *mailbox, + int mpt_index) +{ + return mthca_cmd(dev, mailbox->dma, mpt_index, 0, CMD_SW2HW_MPT, + CMD_TIME_CLASS_B); +} + +int mthca_HW2SW_MPT(struct mthca_dev *dev, struct mthca_mailbox *mailbox, + int mpt_index) +{ + return mthca_cmd_box(dev, 0, mailbox ? mailbox->dma : 0, mpt_index, + !mailbox, CMD_HW2SW_MPT, + CMD_TIME_CLASS_B); +} + +int mthca_WRITE_MTT(struct mthca_dev *dev, struct mthca_mailbox *mailbox, + int num_mtt) +{ + return mthca_cmd(dev, mailbox->dma, num_mtt, 0, CMD_WRITE_MTT, + CMD_TIME_CLASS_B); +} + +int mthca_SYNC_TPT(struct mthca_dev *dev) +{ + return mthca_cmd(dev, 0, 0, 0, CMD_SYNC_TPT, CMD_TIME_CLASS_B); +} + +int mthca_MAP_EQ(struct mthca_dev *dev, u64 event_mask, int unmap, + int eq_num) +{ + mthca_dbg(dev, "%s mask %016llx for eqn %d\n", + unmap ? 
"Clearing" : "Setting", + (unsigned long long) event_mask, eq_num); + return mthca_cmd(dev, event_mask, (unmap << 31) | eq_num, + 0, CMD_MAP_EQ, CMD_TIME_CLASS_B); +} + +int mthca_SW2HW_EQ(struct mthca_dev *dev, struct mthca_mailbox *mailbox, + int eq_num) +{ + return mthca_cmd(dev, mailbox->dma, eq_num, 0, CMD_SW2HW_EQ, + CMD_TIME_CLASS_A); +} + +int mthca_HW2SW_EQ(struct mthca_dev *dev, struct mthca_mailbox *mailbox, + int eq_num) +{ + return mthca_cmd_box(dev, 0, mailbox->dma, eq_num, 0, + CMD_HW2SW_EQ, + CMD_TIME_CLASS_A); +} + +int mthca_SW2HW_CQ(struct mthca_dev *dev, struct mthca_mailbox *mailbox, + int cq_num) +{ + return mthca_cmd(dev, mailbox->dma, cq_num, 0, CMD_SW2HW_CQ, + CMD_TIME_CLASS_A); +} + +int mthca_HW2SW_CQ(struct mthca_dev *dev, struct mthca_mailbox *mailbox, + int cq_num) +{ + return mthca_cmd_box(dev, 0, mailbox->dma, cq_num, 0, + CMD_HW2SW_CQ, + CMD_TIME_CLASS_A); +} + +int mthca_RESIZE_CQ(struct mthca_dev *dev, int cq_num, u32 lkey, u8 log_size) +{ + struct mthca_mailbox *mailbox; + __be32 *inbox; + int err; + +#define RESIZE_CQ_IN_SIZE 0x40 +#define RESIZE_CQ_LOG_SIZE_OFFSET 0x0c +#define RESIZE_CQ_LKEY_OFFSET 0x1c + + mailbox = mthca_alloc_mailbox(dev, GFP_KERNEL); + if (IS_ERR(mailbox)) + return PTR_ERR(mailbox); + inbox = mailbox->buf; + + memset(inbox, 0, RESIZE_CQ_IN_SIZE); + /* + * Leave start address fields zeroed out -- mthca assumes that + * MRs for CQs always start at virtual address 0. + */ + MTHCA_PUT(inbox, log_size, RESIZE_CQ_LOG_SIZE_OFFSET); + MTHCA_PUT(inbox, lkey, RESIZE_CQ_LKEY_OFFSET); + + err = mthca_cmd(dev, mailbox->dma, cq_num, 1, CMD_RESIZE_CQ, + CMD_TIME_CLASS_B); + + mthca_free_mailbox(dev, mailbox); + return err; +} + +int mthca_SW2HW_SRQ(struct mthca_dev *dev, struct mthca_mailbox *mailbox, + int srq_num) +{ + return mthca_cmd(dev, mailbox->dma, srq_num, 0, CMD_SW2HW_SRQ, + CMD_TIME_CLASS_A); +} + +int mthca_HW2SW_SRQ(struct mthca_dev *dev, struct mthca_mailbox *mailbox, + int srq_num) +{ + return mthca_cmd_box(dev, 0, mailbox->dma, srq_num, 0, + CMD_HW2SW_SRQ, + CMD_TIME_CLASS_A); +} + +int mthca_QUERY_SRQ(struct mthca_dev *dev, u32 num, + struct mthca_mailbox *mailbox) +{ + return mthca_cmd_box(dev, 0, mailbox->dma, num, 0, + CMD_QUERY_SRQ, CMD_TIME_CLASS_A); +} + +int mthca_ARM_SRQ(struct mthca_dev *dev, int srq_num, int limit) +{ + return mthca_cmd(dev, limit, srq_num, 0, CMD_ARM_SRQ, + CMD_TIME_CLASS_B); +} + +int mthca_MODIFY_QP(struct mthca_dev *dev, enum ib_qp_state cur, + enum ib_qp_state next, u32 num, int is_ee, + struct mthca_mailbox *mailbox, u32 optmask) +{ + static const u16 op[IB_QPS_ERR + 1][IB_QPS_ERR + 1] = { + [IB_QPS_RESET] = { + [IB_QPS_RESET] = CMD_ERR2RST_QPEE, + [IB_QPS_ERR] = CMD_2ERR_QPEE, + [IB_QPS_INIT] = CMD_RST2INIT_QPEE, + }, + [IB_QPS_INIT] = { + [IB_QPS_RESET] = CMD_ERR2RST_QPEE, + [IB_QPS_ERR] = CMD_2ERR_QPEE, + [IB_QPS_INIT] = CMD_INIT2INIT_QPEE, + [IB_QPS_RTR] = CMD_INIT2RTR_QPEE, + }, + [IB_QPS_RTR] = { + [IB_QPS_RESET] = CMD_ERR2RST_QPEE, + [IB_QPS_ERR] = CMD_2ERR_QPEE, + [IB_QPS_RTS] = CMD_RTR2RTS_QPEE, + }, + [IB_QPS_RTS] = { + [IB_QPS_RESET] = CMD_ERR2RST_QPEE, + [IB_QPS_ERR] = CMD_2ERR_QPEE, + [IB_QPS_RTS] = CMD_RTS2RTS_QPEE, + [IB_QPS_SQD] = CMD_RTS2SQD_QPEE, + }, + [IB_QPS_SQD] = { + [IB_QPS_RESET] = CMD_ERR2RST_QPEE, + [IB_QPS_ERR] = CMD_2ERR_QPEE, + [IB_QPS_RTS] = CMD_SQD2RTS_QPEE, + [IB_QPS_SQD] = CMD_SQD2SQD_QPEE, + }, + [IB_QPS_SQE] = { + [IB_QPS_RESET] = CMD_ERR2RST_QPEE, + [IB_QPS_ERR] = CMD_2ERR_QPEE, + [IB_QPS_RTS] = CMD_SQERR2RTS_QPEE, + }, + [IB_QPS_ERR] = { + [IB_QPS_RESET] = 
CMD_ERR2RST_QPEE, + [IB_QPS_ERR] = CMD_2ERR_QPEE, + } + }; + + u8 op_mod = 0; + int my_mailbox = 0; + int err; + + if (op[cur][next] == CMD_ERR2RST_QPEE) { + op_mod = 3; /* don't write outbox, any->reset */ + + /* For debugging */ + if (!mailbox) { + mailbox = mthca_alloc_mailbox(dev, GFP_KERNEL); + if (!IS_ERR(mailbox)) { + my_mailbox = 1; + op_mod = 2; /* write outbox, any->reset */ + } else + mailbox = NULL; + } + + err = mthca_cmd_box(dev, 0, mailbox ? mailbox->dma : 0, + (!!is_ee << 24) | num, op_mod, + op[cur][next], CMD_TIME_CLASS_C); + + if (0 && mailbox) { + int i; + mthca_dbg(dev, "Dumping QP context:\n"); + printk(" %08x\n", be32_to_cpup(mailbox->buf)); + for (i = 0; i < 0x100 / 4; ++i) { + if (i % 8 == 0) + printk("[%02x] ", i * 4); + printk(" %08x", + be32_to_cpu(((__be32 *) mailbox->buf)[i + 2])); + if ((i + 1) % 8 == 0) + printk("\n"); + } + } + + if (my_mailbox) + mthca_free_mailbox(dev, mailbox); + } else { + if (0) { + int i; + mthca_dbg(dev, "Dumping QP context:\n"); + printk(" opt param mask: %08x\n", be32_to_cpup(mailbox->buf)); + for (i = 0; i < 0x100 / 4; ++i) { + if (i % 8 == 0) + printk(" [%02x] ", i * 4); + printk(" %08x", + be32_to_cpu(((__be32 *) mailbox->buf)[i + 2])); + if ((i + 1) % 8 == 0) + printk("\n"); + } + } + + err = mthca_cmd(dev, mailbox->dma, optmask | (!!is_ee << 24) | num, + op_mod, op[cur][next], CMD_TIME_CLASS_C); + } + + return err; +} + +int mthca_QUERY_QP(struct mthca_dev *dev, u32 num, int is_ee, + struct mthca_mailbox *mailbox) +{ + return mthca_cmd_box(dev, 0, mailbox->dma, (!!is_ee << 24) | num, 0, + CMD_QUERY_QPEE, CMD_TIME_CLASS_A); +} + +int mthca_CONF_SPECIAL_QP(struct mthca_dev *dev, int type, u32 qpn) +{ + u8 op_mod; + + switch (type) { + case IB_QPT_SMI: + op_mod = 0; + break; + case IB_QPT_GSI: + op_mod = 1; + break; + case IB_QPT_RAW_IPV6: + op_mod = 2; + break; + case IB_QPT_RAW_ETHERTYPE: + op_mod = 3; + break; + default: + return -EINVAL; + } + + return mthca_cmd(dev, 0, qpn, op_mod, CMD_CONF_SPECIAL_QP, + CMD_TIME_CLASS_B); +} + +int mthca_MAD_IFC(struct mthca_dev *dev, int ignore_mkey, int ignore_bkey, + int port, struct ib_wc *in_wc, struct ib_grh *in_grh, + void *in_mad, void *response_mad) +{ + struct mthca_mailbox *inmailbox, *outmailbox; + void *inbox; + int err; + u32 in_modifier = port; + u8 op_modifier = 0; + +#define MAD_IFC_BOX_SIZE 0x400 +#define MAD_IFC_MY_QPN_OFFSET 0x100 +#define MAD_IFC_RQPN_OFFSET 0x108 +#define MAD_IFC_SL_OFFSET 0x10c +#define MAD_IFC_G_PATH_OFFSET 0x10d +#define MAD_IFC_RLID_OFFSET 0x10e +#define MAD_IFC_PKEY_OFFSET 0x112 +#define MAD_IFC_GRH_OFFSET 0x140 + + inmailbox = mthca_alloc_mailbox(dev, GFP_KERNEL); + if (IS_ERR(inmailbox)) + return PTR_ERR(inmailbox); + inbox = inmailbox->buf; + + outmailbox = mthca_alloc_mailbox(dev, GFP_KERNEL); + if (IS_ERR(outmailbox)) { + mthca_free_mailbox(dev, inmailbox); + return PTR_ERR(outmailbox); + } + + memcpy(inbox, in_mad, 256); + + /* + * Key check traps can't be generated unless we have in_wc to + * tell us where to send the trap. + */ + if (ignore_mkey || !in_wc) + op_modifier |= 0x1; + if (ignore_bkey || !in_wc) + op_modifier |= 0x2; + + if (in_wc) { + u8 val; + + memset(inbox + 256, 0, 256); + + MTHCA_PUT(inbox, in_wc->qp->qp_num, MAD_IFC_MY_QPN_OFFSET); + MTHCA_PUT(inbox, in_wc->src_qp, MAD_IFC_RQPN_OFFSET); + + val = in_wc->sl << 4; + MTHCA_PUT(inbox, val, MAD_IFC_SL_OFFSET); + + val = in_wc->dlid_path_bits | + (in_wc->wc_flags & IB_WC_GRH ? 
0x80 : 0); + MTHCA_PUT(inbox, val, MAD_IFC_G_PATH_OFFSET); + + MTHCA_PUT(inbox, in_wc->slid, MAD_IFC_RLID_OFFSET); + MTHCA_PUT(inbox, in_wc->pkey_index, MAD_IFC_PKEY_OFFSET); + + if (in_grh) + memcpy(inbox + MAD_IFC_GRH_OFFSET, in_grh, 40); + + op_modifier |= 0x4; + + in_modifier |= in_wc->slid << 16; + } + + err = mthca_cmd_box(dev, inmailbox->dma, outmailbox->dma, + in_modifier, op_modifier, + CMD_MAD_IFC, CMD_TIME_CLASS_C); + + if (!err) + memcpy(response_mad, outmailbox->buf, 256); + + mthca_free_mailbox(dev, inmailbox); + mthca_free_mailbox(dev, outmailbox); + return err; +} + +int mthca_READ_MGM(struct mthca_dev *dev, int index, + struct mthca_mailbox *mailbox) +{ + return mthca_cmd_box(dev, 0, mailbox->dma, index, 0, + CMD_READ_MGM, CMD_TIME_CLASS_A); +} + +int mthca_WRITE_MGM(struct mthca_dev *dev, int index, + struct mthca_mailbox *mailbox) +{ + return mthca_cmd(dev, mailbox->dma, index, 0, CMD_WRITE_MGM, + CMD_TIME_CLASS_A); +} + +int mthca_MGID_HASH(struct mthca_dev *dev, struct mthca_mailbox *mailbox, + u16 *hash) +{ + u64 imm; + int err; + + err = mthca_cmd_imm(dev, mailbox->dma, &imm, 0, 0, CMD_MGID_HASH, + CMD_TIME_CLASS_A); + + *hash = imm; + return err; +} + +int mthca_NOP(struct mthca_dev *dev) +{ + return mthca_cmd(dev, 0, 0x1f, 0, CMD_NOP, msecs_to_jiffies(100)); +} diff --git a/kernel/drivers/infiniband/hw/mthca/mthca_cmd.h b/kernel/drivers/infiniband/hw/mthca/mthca_cmd.h new file mode 100644 index 000000000..f952244c5 --- /dev/null +++ b/kernel/drivers/infiniband/hw/mthca/mthca_cmd.h @@ -0,0 +1,325 @@ +/* + * Copyright (c) 2004, 2005 Topspin Communications. All rights reserved. + * Copyright (c) 2005 Mellanox Technologies. All rights reserved. + * Copyright (c) 2006 Cisco Systems. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#ifndef MTHCA_CMD_H +#define MTHCA_CMD_H + +#include + +#define MTHCA_MAILBOX_SIZE 4096 + +enum { + /* command completed successfully: */ + MTHCA_CMD_STAT_OK = 0x00, + /* Internal error (such as a bus error) occurred while processing command: */ + MTHCA_CMD_STAT_INTERNAL_ERR = 0x01, + /* Operation/command not supported or opcode modifier not supported: */ + MTHCA_CMD_STAT_BAD_OP = 0x02, + /* Parameter not supported or parameter out of range: */ + MTHCA_CMD_STAT_BAD_PARAM = 0x03, + /* System not enabled or bad system state: */ + MTHCA_CMD_STAT_BAD_SYS_STATE = 0x04, + /* Attempt to access reserved or unallocaterd resource: */ + MTHCA_CMD_STAT_BAD_RESOURCE = 0x05, + /* Requested resource is currently executing a command, or is otherwise busy: */ + MTHCA_CMD_STAT_RESOURCE_BUSY = 0x06, + /* memory error: */ + MTHCA_CMD_STAT_DDR_MEM_ERR = 0x07, + /* Required capability exceeds device limits: */ + MTHCA_CMD_STAT_EXCEED_LIM = 0x08, + /* Resource is not in the appropriate state or ownership: */ + MTHCA_CMD_STAT_BAD_RES_STATE = 0x09, + /* Index out of range: */ + MTHCA_CMD_STAT_BAD_INDEX = 0x0a, + /* FW image corrupted: */ + MTHCA_CMD_STAT_BAD_NVMEM = 0x0b, + /* Attempt to modify a QP/EE which is not in the presumed state: */ + MTHCA_CMD_STAT_BAD_QPEE_STATE = 0x10, + /* Bad segment parameters (Address/Size): */ + MTHCA_CMD_STAT_BAD_SEG_PARAM = 0x20, + /* Memory Region has Memory Windows bound to: */ + MTHCA_CMD_STAT_REG_BOUND = 0x21, + /* HCA local attached memory not present: */ + MTHCA_CMD_STAT_LAM_NOT_PRE = 0x22, + /* Bad management packet (silently discarded): */ + MTHCA_CMD_STAT_BAD_PKT = 0x30, + /* More outstanding CQEs in CQ than new CQ size: */ + MTHCA_CMD_STAT_BAD_SIZE = 0x40 +}; + +enum { + MTHCA_TRANS_INVALID = 0, + MTHCA_TRANS_RST2INIT, + MTHCA_TRANS_INIT2INIT, + MTHCA_TRANS_INIT2RTR, + MTHCA_TRANS_RTR2RTS, + MTHCA_TRANS_RTS2RTS, + MTHCA_TRANS_SQERR2RTS, + MTHCA_TRANS_ANY2ERR, + MTHCA_TRANS_RTS2SQD, + MTHCA_TRANS_SQD2SQD, + MTHCA_TRANS_SQD2RTS, + MTHCA_TRANS_ANY2RST, +}; + +enum { + DEV_LIM_FLAG_RC = 1 << 0, + DEV_LIM_FLAG_UC = 1 << 1, + DEV_LIM_FLAG_UD = 1 << 2, + DEV_LIM_FLAG_RD = 1 << 3, + DEV_LIM_FLAG_RAW_IPV6 = 1 << 4, + DEV_LIM_FLAG_RAW_ETHER = 1 << 5, + DEV_LIM_FLAG_SRQ = 1 << 6, + DEV_LIM_FLAG_IPOIB_CSUM = 1 << 7, + DEV_LIM_FLAG_BAD_PKEY_CNTR = 1 << 8, + DEV_LIM_FLAG_BAD_QKEY_CNTR = 1 << 9, + DEV_LIM_FLAG_MW = 1 << 16, + DEV_LIM_FLAG_AUTO_PATH_MIG = 1 << 17, + DEV_LIM_FLAG_ATOMIC = 1 << 18, + DEV_LIM_FLAG_RAW_MULTI = 1 << 19, + DEV_LIM_FLAG_UD_AV_PORT_ENFORCE = 1 << 20, + DEV_LIM_FLAG_UD_MULTI = 1 << 21, +}; + +struct mthca_mailbox { + dma_addr_t dma; + void *buf; +}; + +struct mthca_dev_lim { + int max_srq_sz; + int max_qp_sz; + int reserved_qps; + int max_qps; + int reserved_srqs; + int max_srqs; + int reserved_eecs; + int max_eecs; + int max_cq_sz; + int reserved_cqs; + int max_cqs; + int max_mpts; + int reserved_eqs; + int max_eqs; + int reserved_mtts; + int max_mrw_sz; + int reserved_mrws; + int max_mtt_seg; + int max_requester_per_qp; + int max_responder_per_qp; + int max_rdma_global; + int local_ca_ack_delay; + int max_mtu; + int max_port_width; + int max_vl; + int num_ports; + int max_gids; + u16 stat_rate_support; + int max_pkeys; + u32 flags; + int reserved_uars; + int uar_size; + int min_page_sz; + int max_sg; + int max_desc_sz; + int max_qp_per_mcg; + int reserved_mgms; + int max_mcgs; + int reserved_pds; + int max_pds; + int reserved_rdds; + int max_rdds; + int eec_entry_sz; + int qpc_entry_sz; + int eeec_entry_sz; + int eqpc_entry_sz; + int eqc_entry_sz; + 
int cqc_entry_sz; + int srq_entry_sz; + int uar_scratch_entry_sz; + int mpt_entry_sz; + union { + struct { + int max_avs; + } tavor; + struct { + int resize_srq; + int max_pbl_sz; + u8 bmme_flags; + u32 reserved_lkey; + int lam_required; + u64 max_icm_sz; + } arbel; + } hca; +}; + +struct mthca_adapter { + u32 vendor_id; + u32 device_id; + u32 revision_id; + char board_id[MTHCA_BOARD_ID_LEN]; + u8 inta_pin; +}; + +struct mthca_init_hca_param { + u64 qpc_base; + u64 eec_base; + u64 srqc_base; + u64 cqc_base; + u64 eqpc_base; + u64 eeec_base; + u64 eqc_base; + u64 rdb_base; + u64 mc_base; + u64 mpt_base; + u64 mtt_base; + u64 uar_scratch_base; + u64 uarc_base; + u16 log_mc_entry_sz; + u16 mc_hash_sz; + u8 log_num_qps; + u8 log_num_eecs; + u8 log_num_srqs; + u8 log_num_cqs; + u8 log_num_eqs; + u8 log_mc_table_sz; + u8 mtt_seg_sz; + u8 log_mpt_sz; + u8 log_uar_sz; + u8 log_uarc_sz; +}; + +struct mthca_init_ib_param { + int port_width; + int vl_cap; + int mtu_cap; + u16 gid_cap; + u16 pkey_cap; + int set_guid0; + u64 guid0; + int set_node_guid; + u64 node_guid; + int set_si_guid; + u64 si_guid; +}; + +struct mthca_set_ib_param { + int set_si_guid; + int reset_qkey_viol; + u64 si_guid; + u32 cap_mask; +}; + +int mthca_cmd_init(struct mthca_dev *dev); +void mthca_cmd_cleanup(struct mthca_dev *dev); +int mthca_cmd_use_events(struct mthca_dev *dev); +void mthca_cmd_use_polling(struct mthca_dev *dev); +void mthca_cmd_event(struct mthca_dev *dev, u16 token, + u8 status, u64 out_param); + +struct mthca_mailbox *mthca_alloc_mailbox(struct mthca_dev *dev, + gfp_t gfp_mask); +void mthca_free_mailbox(struct mthca_dev *dev, struct mthca_mailbox *mailbox); + +int mthca_SYS_EN(struct mthca_dev *dev); +int mthca_SYS_DIS(struct mthca_dev *dev); +int mthca_MAP_FA(struct mthca_dev *dev, struct mthca_icm *icm); +int mthca_UNMAP_FA(struct mthca_dev *dev); +int mthca_RUN_FW(struct mthca_dev *dev); +int mthca_QUERY_FW(struct mthca_dev *dev); +int mthca_ENABLE_LAM(struct mthca_dev *dev); +int mthca_DISABLE_LAM(struct mthca_dev *dev); +int mthca_QUERY_DDR(struct mthca_dev *dev); +int mthca_QUERY_DEV_LIM(struct mthca_dev *dev, + struct mthca_dev_lim *dev_lim); +int mthca_QUERY_ADAPTER(struct mthca_dev *dev, + struct mthca_adapter *adapter); +int mthca_INIT_HCA(struct mthca_dev *dev, + struct mthca_init_hca_param *param); +int mthca_INIT_IB(struct mthca_dev *dev, + struct mthca_init_ib_param *param, + int port); +int mthca_CLOSE_IB(struct mthca_dev *dev, int port); +int mthca_CLOSE_HCA(struct mthca_dev *dev, int panic); +int mthca_SET_IB(struct mthca_dev *dev, struct mthca_set_ib_param *param, + int port); +int mthca_MAP_ICM(struct mthca_dev *dev, struct mthca_icm *icm, u64 virt); +int mthca_MAP_ICM_page(struct mthca_dev *dev, u64 dma_addr, u64 virt); +int mthca_UNMAP_ICM(struct mthca_dev *dev, u64 virt, u32 page_count); +int mthca_MAP_ICM_AUX(struct mthca_dev *dev, struct mthca_icm *icm); +int mthca_UNMAP_ICM_AUX(struct mthca_dev *dev); +int mthca_SET_ICM_SIZE(struct mthca_dev *dev, u64 icm_size, u64 *aux_pages); +int mthca_SW2HW_MPT(struct mthca_dev *dev, struct mthca_mailbox *mailbox, + int mpt_index); +int mthca_HW2SW_MPT(struct mthca_dev *dev, struct mthca_mailbox *mailbox, + int mpt_index); +int mthca_WRITE_MTT(struct mthca_dev *dev, struct mthca_mailbox *mailbox, + int num_mtt); +int mthca_SYNC_TPT(struct mthca_dev *dev); +int mthca_MAP_EQ(struct mthca_dev *dev, u64 event_mask, int unmap, + int eq_num); +int mthca_SW2HW_EQ(struct mthca_dev *dev, struct mthca_mailbox *mailbox, + int eq_num); +int 
mthca_HW2SW_EQ(struct mthca_dev *dev, struct mthca_mailbox *mailbox, + int eq_num); +int mthca_SW2HW_CQ(struct mthca_dev *dev, struct mthca_mailbox *mailbox, + int cq_num); +int mthca_HW2SW_CQ(struct mthca_dev *dev, struct mthca_mailbox *mailbox, + int cq_num); +int mthca_RESIZE_CQ(struct mthca_dev *dev, int cq_num, u32 lkey, u8 log_size); +int mthca_SW2HW_SRQ(struct mthca_dev *dev, struct mthca_mailbox *mailbox, + int srq_num); +int mthca_HW2SW_SRQ(struct mthca_dev *dev, struct mthca_mailbox *mailbox, + int srq_num); +int mthca_QUERY_SRQ(struct mthca_dev *dev, u32 num, + struct mthca_mailbox *mailbox); +int mthca_ARM_SRQ(struct mthca_dev *dev, int srq_num, int limit); +int mthca_MODIFY_QP(struct mthca_dev *dev, enum ib_qp_state cur, + enum ib_qp_state next, u32 num, int is_ee, + struct mthca_mailbox *mailbox, u32 optmask); +int mthca_QUERY_QP(struct mthca_dev *dev, u32 num, int is_ee, + struct mthca_mailbox *mailbox); +int mthca_CONF_SPECIAL_QP(struct mthca_dev *dev, int type, u32 qpn); +int mthca_MAD_IFC(struct mthca_dev *dev, int ignore_mkey, int ignore_bkey, + int port, struct ib_wc *in_wc, struct ib_grh *in_grh, + void *in_mad, void *response_mad); +int mthca_READ_MGM(struct mthca_dev *dev, int index, + struct mthca_mailbox *mailbox); +int mthca_WRITE_MGM(struct mthca_dev *dev, int index, + struct mthca_mailbox *mailbox); +int mthca_MGID_HASH(struct mthca_dev *dev, struct mthca_mailbox *mailbox, + u16 *hash); +int mthca_NOP(struct mthca_dev *dev); + +#endif /* MTHCA_CMD_H */ diff --git a/kernel/drivers/infiniband/hw/mthca/mthca_config_reg.h b/kernel/drivers/infiniband/hw/mthca/mthca_config_reg.h new file mode 100644 index 000000000..155bc6639 --- /dev/null +++ b/kernel/drivers/infiniband/hw/mthca/mthca_config_reg.h @@ -0,0 +1,48 @@ +/* + * Copyright (c) 2004 Topspin Communications. All rights reserved. + * Copyright (c) 2005 Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#ifndef MTHCA_CONFIG_REG_H +#define MTHCA_CONFIG_REG_H + +#define MTHCA_HCR_BASE 0x80680 +#define MTHCA_HCR_SIZE 0x0001c +#define MTHCA_ECR_BASE 0x80700 +#define MTHCA_ECR_SIZE 0x00008 +#define MTHCA_ECR_CLR_BASE 0x80708 +#define MTHCA_ECR_CLR_SIZE 0x00008 +#define MTHCA_MAP_ECR_SIZE (MTHCA_ECR_SIZE + MTHCA_ECR_CLR_SIZE) +#define MTHCA_CLR_INT_BASE 0xf00d8 +#define MTHCA_CLR_INT_SIZE 0x00008 +#define MTHCA_EQ_SET_CI_SIZE (8 * 32) + +#endif /* MTHCA_CONFIG_REG_H */ diff --git a/kernel/drivers/infiniband/hw/mthca/mthca_cq.c b/kernel/drivers/infiniband/hw/mthca/mthca_cq.c new file mode 100644 index 000000000..40ba83338 --- /dev/null +++ b/kernel/drivers/infiniband/hw/mthca/mthca_cq.c @@ -0,0 +1,984 @@ +/* + * Copyright (c) 2004, 2005 Topspin Communications. All rights reserved. + * Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved. + * Copyright (c) 2005, 2006 Cisco Systems, Inc. All rights reserved. + * Copyright (c) 2005 Mellanox Technologies. All rights reserved. + * Copyright (c) 2004 Voltaire, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include +#include +#include + +#include + +#include + +#include "mthca_dev.h" +#include "mthca_cmd.h" +#include "mthca_memfree.h" + +enum { + MTHCA_MAX_DIRECT_CQ_SIZE = 4 * PAGE_SIZE +}; + +enum { + MTHCA_CQ_ENTRY_SIZE = 0x20 +}; + +enum { + MTHCA_ATOMIC_BYTE_LEN = 8 +}; + +/* + * Must be packed because start is 64 bits but only aligned to 32 bits. 
+ */ +struct mthca_cq_context { + __be32 flags; + __be64 start; + __be32 logsize_usrpage; + __be32 error_eqn; /* Tavor only */ + __be32 comp_eqn; + __be32 pd; + __be32 lkey; + __be32 last_notified_index; + __be32 solicit_producer_index; + __be32 consumer_index; + __be32 producer_index; + __be32 cqn; + __be32 ci_db; /* Arbel only */ + __be32 state_db; /* Arbel only */ + u32 reserved; +} __attribute__((packed)); + +#define MTHCA_CQ_STATUS_OK ( 0 << 28) +#define MTHCA_CQ_STATUS_OVERFLOW ( 9 << 28) +#define MTHCA_CQ_STATUS_WRITE_FAIL (10 << 28) +#define MTHCA_CQ_FLAG_TR ( 1 << 18) +#define MTHCA_CQ_FLAG_OI ( 1 << 17) +#define MTHCA_CQ_STATE_DISARMED ( 0 << 8) +#define MTHCA_CQ_STATE_ARMED ( 1 << 8) +#define MTHCA_CQ_STATE_ARMED_SOL ( 4 << 8) +#define MTHCA_EQ_STATE_FIRED (10 << 8) + +enum { + MTHCA_ERROR_CQE_OPCODE_MASK = 0xfe +}; + +enum { + SYNDROME_LOCAL_LENGTH_ERR = 0x01, + SYNDROME_LOCAL_QP_OP_ERR = 0x02, + SYNDROME_LOCAL_EEC_OP_ERR = 0x03, + SYNDROME_LOCAL_PROT_ERR = 0x04, + SYNDROME_WR_FLUSH_ERR = 0x05, + SYNDROME_MW_BIND_ERR = 0x06, + SYNDROME_BAD_RESP_ERR = 0x10, + SYNDROME_LOCAL_ACCESS_ERR = 0x11, + SYNDROME_REMOTE_INVAL_REQ_ERR = 0x12, + SYNDROME_REMOTE_ACCESS_ERR = 0x13, + SYNDROME_REMOTE_OP_ERR = 0x14, + SYNDROME_RETRY_EXC_ERR = 0x15, + SYNDROME_RNR_RETRY_EXC_ERR = 0x16, + SYNDROME_LOCAL_RDD_VIOL_ERR = 0x20, + SYNDROME_REMOTE_INVAL_RD_REQ_ERR = 0x21, + SYNDROME_REMOTE_ABORTED_ERR = 0x22, + SYNDROME_INVAL_EECN_ERR = 0x23, + SYNDROME_INVAL_EEC_STATE_ERR = 0x24 +}; + +struct mthca_cqe { + __be32 my_qpn; + __be32 my_ee; + __be32 rqpn; + u8 sl_ipok; + u8 g_mlpath; + __be16 rlid; + __be32 imm_etype_pkey_eec; + __be32 byte_cnt; + __be32 wqe; + u8 opcode; + u8 is_send; + u8 reserved; + u8 owner; +}; + +struct mthca_err_cqe { + __be32 my_qpn; + u32 reserved1[3]; + u8 syndrome; + u8 vendor_err; + __be16 db_cnt; + u32 reserved2; + __be32 wqe; + u8 opcode; + u8 reserved3[2]; + u8 owner; +}; + +#define MTHCA_CQ_ENTRY_OWNER_SW (0 << 7) +#define MTHCA_CQ_ENTRY_OWNER_HW (1 << 7) + +#define MTHCA_TAVOR_CQ_DB_INC_CI (1 << 24) +#define MTHCA_TAVOR_CQ_DB_REQ_NOT (2 << 24) +#define MTHCA_TAVOR_CQ_DB_REQ_NOT_SOL (3 << 24) +#define MTHCA_TAVOR_CQ_DB_SET_CI (4 << 24) +#define MTHCA_TAVOR_CQ_DB_REQ_NOT_MULT (5 << 24) + +#define MTHCA_ARBEL_CQ_DB_REQ_NOT_SOL (1 << 24) +#define MTHCA_ARBEL_CQ_DB_REQ_NOT (2 << 24) +#define MTHCA_ARBEL_CQ_DB_REQ_NOT_MULT (3 << 24) + +static inline struct mthca_cqe *get_cqe_from_buf(struct mthca_cq_buf *buf, + int entry) +{ + if (buf->is_direct) + return buf->queue.direct.buf + (entry * MTHCA_CQ_ENTRY_SIZE); + else + return buf->queue.page_list[entry * MTHCA_CQ_ENTRY_SIZE / PAGE_SIZE].buf + + (entry * MTHCA_CQ_ENTRY_SIZE) % PAGE_SIZE; +} + +static inline struct mthca_cqe *get_cqe(struct mthca_cq *cq, int entry) +{ + return get_cqe_from_buf(&cq->buf, entry); +} + +static inline struct mthca_cqe *cqe_sw(struct mthca_cqe *cqe) +{ + return MTHCA_CQ_ENTRY_OWNER_HW & cqe->owner ? NULL : cqe; +} + +static inline struct mthca_cqe *next_cqe_sw(struct mthca_cq *cq) +{ + return cqe_sw(get_cqe(cq, cq->cons_index & cq->ibcq.cqe)); +} + +static inline void set_cqe_hw(struct mthca_cqe *cqe) +{ + cqe->owner = MTHCA_CQ_ENTRY_OWNER_HW; +} + +static void dump_cqe(struct mthca_dev *dev, void *cqe_ptr) +{ + __be32 *cqe = cqe_ptr; + + (void) cqe; /* avoid warning if mthca_dbg compiled away... 
*/ + mthca_dbg(dev, "CQE contents %08x %08x %08x %08x %08x %08x %08x %08x\n", + be32_to_cpu(cqe[0]), be32_to_cpu(cqe[1]), be32_to_cpu(cqe[2]), + be32_to_cpu(cqe[3]), be32_to_cpu(cqe[4]), be32_to_cpu(cqe[5]), + be32_to_cpu(cqe[6]), be32_to_cpu(cqe[7])); +} + +/* + * incr is ignored in native Arbel (mem-free) mode, so cq->cons_index + * should be correct before calling update_cons_index(). + */ +static inline void update_cons_index(struct mthca_dev *dev, struct mthca_cq *cq, + int incr) +{ + if (mthca_is_memfree(dev)) { + *cq->set_ci_db = cpu_to_be32(cq->cons_index); + wmb(); + } else { + mthca_write64(MTHCA_TAVOR_CQ_DB_INC_CI | cq->cqn, incr - 1, + dev->kar + MTHCA_CQ_DOORBELL, + MTHCA_GET_DOORBELL_LOCK(&dev->doorbell_lock)); + /* + * Make sure doorbells don't leak out of CQ spinlock + * and reach the HCA out of order: + */ + mmiowb(); + } +} + +void mthca_cq_completion(struct mthca_dev *dev, u32 cqn) +{ + struct mthca_cq *cq; + + cq = mthca_array_get(&dev->cq_table.cq, cqn & (dev->limits.num_cqs - 1)); + + if (!cq) { + mthca_warn(dev, "Completion event for bogus CQ %08x\n", cqn); + return; + } + + ++cq->arm_sn; + + cq->ibcq.comp_handler(&cq->ibcq, cq->ibcq.cq_context); +} + +void mthca_cq_event(struct mthca_dev *dev, u32 cqn, + enum ib_event_type event_type) +{ + struct mthca_cq *cq; + struct ib_event event; + + spin_lock(&dev->cq_table.lock); + + cq = mthca_array_get(&dev->cq_table.cq, cqn & (dev->limits.num_cqs - 1)); + if (cq) + ++cq->refcount; + + spin_unlock(&dev->cq_table.lock); + + if (!cq) { + mthca_warn(dev, "Async event for bogus CQ %08x\n", cqn); + return; + } + + event.device = &dev->ib_dev; + event.event = event_type; + event.element.cq = &cq->ibcq; + if (cq->ibcq.event_handler) + cq->ibcq.event_handler(&event, cq->ibcq.cq_context); + + spin_lock(&dev->cq_table.lock); + if (!--cq->refcount) + wake_up(&cq->wait); + spin_unlock(&dev->cq_table.lock); +} + +static inline int is_recv_cqe(struct mthca_cqe *cqe) +{ + if ((cqe->opcode & MTHCA_ERROR_CQE_OPCODE_MASK) == + MTHCA_ERROR_CQE_OPCODE_MASK) + return !(cqe->opcode & 0x01); + else + return !(cqe->is_send & 0x80); +} + +void mthca_cq_clean(struct mthca_dev *dev, struct mthca_cq *cq, u32 qpn, + struct mthca_srq *srq) +{ + struct mthca_cqe *cqe; + u32 prod_index; + int i, nfreed = 0; + + spin_lock_irq(&cq->lock); + + /* + * First we need to find the current producer index, so we + * know where to start cleaning from. It doesn't matter if HW + * adds new entries after this loop -- the QP we're worried + * about is already in RESET, so the new entries won't come + * from our QP and therefore don't need to be checked. + */ + for (prod_index = cq->cons_index; + cqe_sw(get_cqe(cq, prod_index & cq->ibcq.cqe)); + ++prod_index) + if (prod_index == cq->cons_index + cq->ibcq.cqe) + break; + + if (0) + mthca_dbg(dev, "Cleaning QPN %06x from CQN %06x; ci %d, pi %d\n", + qpn, cq->cqn, cq->cons_index, prod_index); + + /* + * Now sweep backwards through the CQ, removing CQ entries + * that match our QP by copying older entries on top of them. 
+ */ + while ((int) --prod_index - (int) cq->cons_index >= 0) { + cqe = get_cqe(cq, prod_index & cq->ibcq.cqe); + if (cqe->my_qpn == cpu_to_be32(qpn)) { + if (srq && is_recv_cqe(cqe)) + mthca_free_srq_wqe(srq, be32_to_cpu(cqe->wqe)); + ++nfreed; + } else if (nfreed) + memcpy(get_cqe(cq, (prod_index + nfreed) & cq->ibcq.cqe), + cqe, MTHCA_CQ_ENTRY_SIZE); + } + + if (nfreed) { + for (i = 0; i < nfreed; ++i) + set_cqe_hw(get_cqe(cq, (cq->cons_index + i) & cq->ibcq.cqe)); + wmb(); + cq->cons_index += nfreed; + update_cons_index(dev, cq, nfreed); + } + + spin_unlock_irq(&cq->lock); +} + +void mthca_cq_resize_copy_cqes(struct mthca_cq *cq) +{ + int i; + + /* + * In Tavor mode, the hardware keeps the consumer and producer + * indices mod the CQ size. Since we might be making the CQ + * bigger, we need to deal with the case where the producer + * index wrapped around before the CQ was resized. + */ + if (!mthca_is_memfree(to_mdev(cq->ibcq.device)) && + cq->ibcq.cqe < cq->resize_buf->cqe) { + cq->cons_index &= cq->ibcq.cqe; + if (cqe_sw(get_cqe(cq, cq->ibcq.cqe))) + cq->cons_index -= cq->ibcq.cqe + 1; + } + + for (i = cq->cons_index; cqe_sw(get_cqe(cq, i & cq->ibcq.cqe)); ++i) + memcpy(get_cqe_from_buf(&cq->resize_buf->buf, + i & cq->resize_buf->cqe), + get_cqe(cq, i & cq->ibcq.cqe), MTHCA_CQ_ENTRY_SIZE); +} + +int mthca_alloc_cq_buf(struct mthca_dev *dev, struct mthca_cq_buf *buf, int nent) +{ + int ret; + int i; + + ret = mthca_buf_alloc(dev, nent * MTHCA_CQ_ENTRY_SIZE, + MTHCA_MAX_DIRECT_CQ_SIZE, + &buf->queue, &buf->is_direct, + &dev->driver_pd, 1, &buf->mr); + if (ret) + return ret; + + for (i = 0; i < nent; ++i) + set_cqe_hw(get_cqe_from_buf(buf, i)); + + return 0; +} + +void mthca_free_cq_buf(struct mthca_dev *dev, struct mthca_cq_buf *buf, int cqe) +{ + mthca_buf_free(dev, (cqe + 1) * MTHCA_CQ_ENTRY_SIZE, &buf->queue, + buf->is_direct, &buf->mr); +} + +static void handle_error_cqe(struct mthca_dev *dev, struct mthca_cq *cq, + struct mthca_qp *qp, int wqe_index, int is_send, + struct mthca_err_cqe *cqe, + struct ib_wc *entry, int *free_cqe) +{ + int dbd; + __be32 new_wqe; + + if (cqe->syndrome == SYNDROME_LOCAL_QP_OP_ERR) { + mthca_dbg(dev, "local QP operation err " + "(QPN %06x, WQE @ %08x, CQN %06x, index %d)\n", + be32_to_cpu(cqe->my_qpn), be32_to_cpu(cqe->wqe), + cq->cqn, cq->cons_index); + dump_cqe(dev, cqe); + } + + /* + * For completions in error, only work request ID, status, vendor error + * (and freed resource count for RD) have to be set. 
+ */ + switch (cqe->syndrome) { + case SYNDROME_LOCAL_LENGTH_ERR: + entry->status = IB_WC_LOC_LEN_ERR; + break; + case SYNDROME_LOCAL_QP_OP_ERR: + entry->status = IB_WC_LOC_QP_OP_ERR; + break; + case SYNDROME_LOCAL_EEC_OP_ERR: + entry->status = IB_WC_LOC_EEC_OP_ERR; + break; + case SYNDROME_LOCAL_PROT_ERR: + entry->status = IB_WC_LOC_PROT_ERR; + break; + case SYNDROME_WR_FLUSH_ERR: + entry->status = IB_WC_WR_FLUSH_ERR; + break; + case SYNDROME_MW_BIND_ERR: + entry->status = IB_WC_MW_BIND_ERR; + break; + case SYNDROME_BAD_RESP_ERR: + entry->status = IB_WC_BAD_RESP_ERR; + break; + case SYNDROME_LOCAL_ACCESS_ERR: + entry->status = IB_WC_LOC_ACCESS_ERR; + break; + case SYNDROME_REMOTE_INVAL_REQ_ERR: + entry->status = IB_WC_REM_INV_REQ_ERR; + break; + case SYNDROME_REMOTE_ACCESS_ERR: + entry->status = IB_WC_REM_ACCESS_ERR; + break; + case SYNDROME_REMOTE_OP_ERR: + entry->status = IB_WC_REM_OP_ERR; + break; + case SYNDROME_RETRY_EXC_ERR: + entry->status = IB_WC_RETRY_EXC_ERR; + break; + case SYNDROME_RNR_RETRY_EXC_ERR: + entry->status = IB_WC_RNR_RETRY_EXC_ERR; + break; + case SYNDROME_LOCAL_RDD_VIOL_ERR: + entry->status = IB_WC_LOC_RDD_VIOL_ERR; + break; + case SYNDROME_REMOTE_INVAL_RD_REQ_ERR: + entry->status = IB_WC_REM_INV_RD_REQ_ERR; + break; + case SYNDROME_REMOTE_ABORTED_ERR: + entry->status = IB_WC_REM_ABORT_ERR; + break; + case SYNDROME_INVAL_EECN_ERR: + entry->status = IB_WC_INV_EECN_ERR; + break; + case SYNDROME_INVAL_EEC_STATE_ERR: + entry->status = IB_WC_INV_EEC_STATE_ERR; + break; + default: + entry->status = IB_WC_GENERAL_ERR; + break; + } + + entry->vendor_err = cqe->vendor_err; + + /* + * Mem-free HCAs always generate one CQE per WQE, even in the + * error case, so we don't have to check the doorbell count, etc. + */ + if (mthca_is_memfree(dev)) + return; + + mthca_free_err_wqe(dev, qp, is_send, wqe_index, &dbd, &new_wqe); + + /* + * If we're at the end of the WQE chain, or we've used up our + * doorbell count, free the CQE. Otherwise just update it for + * the next poll operation. + */ + if (!(new_wqe & cpu_to_be32(0x3f)) || (!cqe->db_cnt && dbd)) + return; + + be16_add_cpu(&cqe->db_cnt, -dbd); + cqe->wqe = new_wqe; + cqe->syndrome = SYNDROME_WR_FLUSH_ERR; + + *free_cqe = 0; +} + +static inline int mthca_poll_one(struct mthca_dev *dev, + struct mthca_cq *cq, + struct mthca_qp **cur_qp, + int *freed, + struct ib_wc *entry) +{ + struct mthca_wq *wq; + struct mthca_cqe *cqe; + int wqe_index; + int is_error; + int is_send; + int free_cqe = 1; + int err = 0; + u16 checksum; + + cqe = next_cqe_sw(cq); + if (!cqe) + return -EAGAIN; + + /* + * Make sure we read CQ entry contents after we've checked the + * ownership bit. + */ + rmb(); + + if (0) { + mthca_dbg(dev, "%x/%d: CQE -> QPN %06x, WQE @ %08x\n", + cq->cqn, cq->cons_index, be32_to_cpu(cqe->my_qpn), + be32_to_cpu(cqe->wqe)); + dump_cqe(dev, cqe); + } + + is_error = (cqe->opcode & MTHCA_ERROR_CQE_OPCODE_MASK) == + MTHCA_ERROR_CQE_OPCODE_MASK; + is_send = is_error ? cqe->opcode & 0x01 : cqe->is_send & 0x80; + + if (!*cur_qp || be32_to_cpu(cqe->my_qpn) != (*cur_qp)->qpn) { + /* + * We do not have to take the QP table lock here, + * because CQs will be locked while QPs are removed + * from the table. 
+ */ + *cur_qp = mthca_array_get(&dev->qp_table.qp, + be32_to_cpu(cqe->my_qpn) & + (dev->limits.num_qps - 1)); + if (!*cur_qp) { + mthca_warn(dev, "CQ entry for unknown QP %06x\n", + be32_to_cpu(cqe->my_qpn) & 0xffffff); + err = -EINVAL; + goto out; + } + } + + entry->qp = &(*cur_qp)->ibqp; + + if (is_send) { + wq = &(*cur_qp)->sq; + wqe_index = ((be32_to_cpu(cqe->wqe) - (*cur_qp)->send_wqe_offset) + >> wq->wqe_shift); + entry->wr_id = (*cur_qp)->wrid[wqe_index + + (*cur_qp)->rq.max]; + } else if ((*cur_qp)->ibqp.srq) { + struct mthca_srq *srq = to_msrq((*cur_qp)->ibqp.srq); + u32 wqe = be32_to_cpu(cqe->wqe); + wq = NULL; + wqe_index = wqe >> srq->wqe_shift; + entry->wr_id = srq->wrid[wqe_index]; + mthca_free_srq_wqe(srq, wqe); + } else { + s32 wqe; + wq = &(*cur_qp)->rq; + wqe = be32_to_cpu(cqe->wqe); + wqe_index = wqe >> wq->wqe_shift; + /* + * WQE addr == base - 1 might be reported in receive completion + * with error instead of (rq size - 1) by Sinai FW 1.0.800 and + * Arbel FW 5.1.400. This bug should be fixed in later FW revs. + */ + if (unlikely(wqe_index < 0)) + wqe_index = wq->max - 1; + entry->wr_id = (*cur_qp)->wrid[wqe_index]; + } + + if (wq) { + if (wq->last_comp < wqe_index) + wq->tail += wqe_index - wq->last_comp; + else + wq->tail += wqe_index + wq->max - wq->last_comp; + + wq->last_comp = wqe_index; + } + + if (is_error) { + handle_error_cqe(dev, cq, *cur_qp, wqe_index, is_send, + (struct mthca_err_cqe *) cqe, + entry, &free_cqe); + goto out; + } + + if (is_send) { + entry->wc_flags = 0; + switch (cqe->opcode) { + case MTHCA_OPCODE_RDMA_WRITE: + entry->opcode = IB_WC_RDMA_WRITE; + break; + case MTHCA_OPCODE_RDMA_WRITE_IMM: + entry->opcode = IB_WC_RDMA_WRITE; + entry->wc_flags |= IB_WC_WITH_IMM; + break; + case MTHCA_OPCODE_SEND: + entry->opcode = IB_WC_SEND; + break; + case MTHCA_OPCODE_SEND_IMM: + entry->opcode = IB_WC_SEND; + entry->wc_flags |= IB_WC_WITH_IMM; + break; + case MTHCA_OPCODE_RDMA_READ: + entry->opcode = IB_WC_RDMA_READ; + entry->byte_len = be32_to_cpu(cqe->byte_cnt); + break; + case MTHCA_OPCODE_ATOMIC_CS: + entry->opcode = IB_WC_COMP_SWAP; + entry->byte_len = MTHCA_ATOMIC_BYTE_LEN; + break; + case MTHCA_OPCODE_ATOMIC_FA: + entry->opcode = IB_WC_FETCH_ADD; + entry->byte_len = MTHCA_ATOMIC_BYTE_LEN; + break; + case MTHCA_OPCODE_BIND_MW: + entry->opcode = IB_WC_BIND_MW; + break; + default: + entry->opcode = MTHCA_OPCODE_INVALID; + break; + } + } else { + entry->byte_len = be32_to_cpu(cqe->byte_cnt); + switch (cqe->opcode & 0x1f) { + case IB_OPCODE_SEND_LAST_WITH_IMMEDIATE: + case IB_OPCODE_SEND_ONLY_WITH_IMMEDIATE: + entry->wc_flags = IB_WC_WITH_IMM; + entry->ex.imm_data = cqe->imm_etype_pkey_eec; + entry->opcode = IB_WC_RECV; + break; + case IB_OPCODE_RDMA_WRITE_LAST_WITH_IMMEDIATE: + case IB_OPCODE_RDMA_WRITE_ONLY_WITH_IMMEDIATE: + entry->wc_flags = IB_WC_WITH_IMM; + entry->ex.imm_data = cqe->imm_etype_pkey_eec; + entry->opcode = IB_WC_RECV_RDMA_WITH_IMM; + break; + default: + entry->wc_flags = 0; + entry->opcode = IB_WC_RECV; + break; + } + entry->slid = be16_to_cpu(cqe->rlid); + entry->sl = cqe->sl_ipok >> 4; + entry->src_qp = be32_to_cpu(cqe->rqpn) & 0xffffff; + entry->dlid_path_bits = cqe->g_mlpath & 0x7f; + entry->pkey_index = be32_to_cpu(cqe->imm_etype_pkey_eec) >> 16; + entry->wc_flags |= cqe->g_mlpath & 0x80 ? IB_WC_GRH : 0; + checksum = (be32_to_cpu(cqe->rqpn) >> 24) | + ((be32_to_cpu(cqe->my_ee) >> 16) & 0xff00); + entry->wc_flags |= (cqe->sl_ipok & 1 && checksum == 0xffff) ? 
+ IB_WC_IP_CSUM_OK : 0; + } + + entry->status = IB_WC_SUCCESS; + + out: + if (likely(free_cqe)) { + set_cqe_hw(cqe); + ++(*freed); + ++cq->cons_index; + } + + return err; +} + +int mthca_poll_cq(struct ib_cq *ibcq, int num_entries, + struct ib_wc *entry) +{ + struct mthca_dev *dev = to_mdev(ibcq->device); + struct mthca_cq *cq = to_mcq(ibcq); + struct mthca_qp *qp = NULL; + unsigned long flags; + int err = 0; + int freed = 0; + int npolled; + + spin_lock_irqsave(&cq->lock, flags); + + npolled = 0; +repoll: + while (npolled < num_entries) { + err = mthca_poll_one(dev, cq, &qp, + &freed, entry + npolled); + if (err) + break; + ++npolled; + } + + if (freed) { + wmb(); + update_cons_index(dev, cq, freed); + } + + /* + * If a CQ resize is in progress and we discovered that the + * old buffer is empty, then peek in the new buffer, and if + * it's not empty, switch to the new buffer and continue + * polling there. + */ + if (unlikely(err == -EAGAIN && cq->resize_buf && + cq->resize_buf->state == CQ_RESIZE_READY)) { + /* + * In Tavor mode, the hardware keeps the producer + * index modulo the CQ size. Since we might be making + * the CQ bigger, we need to mask our consumer index + * using the size of the old CQ buffer before looking + * in the new CQ buffer. + */ + if (!mthca_is_memfree(dev)) + cq->cons_index &= cq->ibcq.cqe; + + if (cqe_sw(get_cqe_from_buf(&cq->resize_buf->buf, + cq->cons_index & cq->resize_buf->cqe))) { + struct mthca_cq_buf tbuf; + int tcqe; + + tbuf = cq->buf; + tcqe = cq->ibcq.cqe; + cq->buf = cq->resize_buf->buf; + cq->ibcq.cqe = cq->resize_buf->cqe; + + cq->resize_buf->buf = tbuf; + cq->resize_buf->cqe = tcqe; + cq->resize_buf->state = CQ_RESIZE_SWAPPED; + + goto repoll; + } + } + + spin_unlock_irqrestore(&cq->lock, flags); + + return err == 0 || err == -EAGAIN ? npolled : err; +} + +int mthca_tavor_arm_cq(struct ib_cq *cq, enum ib_cq_notify_flags flags) +{ + u32 dbhi = ((flags & IB_CQ_SOLICITED_MASK) == IB_CQ_SOLICITED ? + MTHCA_TAVOR_CQ_DB_REQ_NOT_SOL : + MTHCA_TAVOR_CQ_DB_REQ_NOT) | + to_mcq(cq)->cqn; + + mthca_write64(dbhi, 0xffffffff, to_mdev(cq->device)->kar + MTHCA_CQ_DOORBELL, + MTHCA_GET_DOORBELL_LOCK(&to_mdev(cq->device)->doorbell_lock)); + + return 0; +} + +int mthca_arbel_arm_cq(struct ib_cq *ibcq, enum ib_cq_notify_flags flags) +{ + struct mthca_cq *cq = to_mcq(ibcq); + __be32 db_rec[2]; + u32 dbhi; + u32 sn = cq->arm_sn & 3; + + db_rec[0] = cpu_to_be32(cq->cons_index); + db_rec[1] = cpu_to_be32((cq->cqn << 8) | (2 << 5) | (sn << 3) | + ((flags & IB_CQ_SOLICITED_MASK) == + IB_CQ_SOLICITED ? 1 : 2)); + + mthca_write_db_rec(db_rec, cq->arm_db); + + /* + * Make sure that the doorbell record in host memory is + * written before ringing the doorbell via PCI MMIO. + */ + wmb(); + + dbhi = (sn << 28) | + ((flags & IB_CQ_SOLICITED_MASK) == IB_CQ_SOLICITED ? 
+ MTHCA_ARBEL_CQ_DB_REQ_NOT_SOL : + MTHCA_ARBEL_CQ_DB_REQ_NOT) | cq->cqn; + + mthca_write64(dbhi, cq->cons_index, + to_mdev(ibcq->device)->kar + MTHCA_CQ_DOORBELL, + MTHCA_GET_DOORBELL_LOCK(&to_mdev(ibcq->device)->doorbell_lock)); + + return 0; +} + +int mthca_init_cq(struct mthca_dev *dev, int nent, + struct mthca_ucontext *ctx, u32 pdn, + struct mthca_cq *cq) +{ + struct mthca_mailbox *mailbox; + struct mthca_cq_context *cq_context; + int err = -ENOMEM; + + cq->ibcq.cqe = nent - 1; + cq->is_kernel = !ctx; + + cq->cqn = mthca_alloc(&dev->cq_table.alloc); + if (cq->cqn == -1) + return -ENOMEM; + + if (mthca_is_memfree(dev)) { + err = mthca_table_get(dev, dev->cq_table.table, cq->cqn); + if (err) + goto err_out; + + if (cq->is_kernel) { + cq->arm_sn = 1; + + err = -ENOMEM; + + cq->set_ci_db_index = mthca_alloc_db(dev, MTHCA_DB_TYPE_CQ_SET_CI, + cq->cqn, &cq->set_ci_db); + if (cq->set_ci_db_index < 0) + goto err_out_icm; + + cq->arm_db_index = mthca_alloc_db(dev, MTHCA_DB_TYPE_CQ_ARM, + cq->cqn, &cq->arm_db); + if (cq->arm_db_index < 0) + goto err_out_ci; + } + } + + mailbox = mthca_alloc_mailbox(dev, GFP_KERNEL); + if (IS_ERR(mailbox)) + goto err_out_arm; + + cq_context = mailbox->buf; + + if (cq->is_kernel) { + err = mthca_alloc_cq_buf(dev, &cq->buf, nent); + if (err) + goto err_out_mailbox; + } + + spin_lock_init(&cq->lock); + cq->refcount = 1; + init_waitqueue_head(&cq->wait); + mutex_init(&cq->mutex); + + memset(cq_context, 0, sizeof *cq_context); + cq_context->flags = cpu_to_be32(MTHCA_CQ_STATUS_OK | + MTHCA_CQ_STATE_DISARMED | + MTHCA_CQ_FLAG_TR); + cq_context->logsize_usrpage = cpu_to_be32((ffs(nent) - 1) << 24); + if (ctx) + cq_context->logsize_usrpage |= cpu_to_be32(ctx->uar.index); + else + cq_context->logsize_usrpage |= cpu_to_be32(dev->driver_uar.index); + cq_context->error_eqn = cpu_to_be32(dev->eq_table.eq[MTHCA_EQ_ASYNC].eqn); + cq_context->comp_eqn = cpu_to_be32(dev->eq_table.eq[MTHCA_EQ_COMP].eqn); + cq_context->pd = cpu_to_be32(pdn); + cq_context->lkey = cpu_to_be32(cq->buf.mr.ibmr.lkey); + cq_context->cqn = cpu_to_be32(cq->cqn); + + if (mthca_is_memfree(dev)) { + cq_context->ci_db = cpu_to_be32(cq->set_ci_db_index); + cq_context->state_db = cpu_to_be32(cq->arm_db_index); + } + + err = mthca_SW2HW_CQ(dev, mailbox, cq->cqn); + if (err) { + mthca_warn(dev, "SW2HW_CQ failed (%d)\n", err); + goto err_out_free_mr; + } + + spin_lock_irq(&dev->cq_table.lock); + if (mthca_array_set(&dev->cq_table.cq, + cq->cqn & (dev->limits.num_cqs - 1), + cq)) { + spin_unlock_irq(&dev->cq_table.lock); + goto err_out_free_mr; + } + spin_unlock_irq(&dev->cq_table.lock); + + cq->cons_index = 0; + + mthca_free_mailbox(dev, mailbox); + + return 0; + +err_out_free_mr: + if (cq->is_kernel) + mthca_free_cq_buf(dev, &cq->buf, cq->ibcq.cqe); + +err_out_mailbox: + mthca_free_mailbox(dev, mailbox); + +err_out_arm: + if (cq->is_kernel && mthca_is_memfree(dev)) + mthca_free_db(dev, MTHCA_DB_TYPE_CQ_ARM, cq->arm_db_index); + +err_out_ci: + if (cq->is_kernel && mthca_is_memfree(dev)) + mthca_free_db(dev, MTHCA_DB_TYPE_CQ_SET_CI, cq->set_ci_db_index); + +err_out_icm: + mthca_table_put(dev, dev->cq_table.table, cq->cqn); + +err_out: + mthca_free(&dev->cq_table.alloc, cq->cqn); + + return err; +} + +static inline int get_cq_refcount(struct mthca_dev *dev, struct mthca_cq *cq) +{ + int c; + + spin_lock_irq(&dev->cq_table.lock); + c = cq->refcount; + spin_unlock_irq(&dev->cq_table.lock); + + return c; +} + +void mthca_free_cq(struct mthca_dev *dev, + struct mthca_cq *cq) +{ + struct mthca_mailbox *mailbox; + int 
err; + + mailbox = mthca_alloc_mailbox(dev, GFP_KERNEL); + if (IS_ERR(mailbox)) { + mthca_warn(dev, "No memory for mailbox to free CQ.\n"); + return; + } + + err = mthca_HW2SW_CQ(dev, mailbox, cq->cqn); + if (err) + mthca_warn(dev, "HW2SW_CQ failed (%d)\n", err); + + if (0) { + __be32 *ctx = mailbox->buf; + int j; + + printk(KERN_ERR "context for CQN %x (cons index %x, next sw %d)\n", + cq->cqn, cq->cons_index, + cq->is_kernel ? !!next_cqe_sw(cq) : 0); + for (j = 0; j < 16; ++j) + printk(KERN_ERR "[%2x] %08x\n", j * 4, be32_to_cpu(ctx[j])); + } + + spin_lock_irq(&dev->cq_table.lock); + mthca_array_clear(&dev->cq_table.cq, + cq->cqn & (dev->limits.num_cqs - 1)); + --cq->refcount; + spin_unlock_irq(&dev->cq_table.lock); + + if (dev->mthca_flags & MTHCA_FLAG_MSI_X) + synchronize_irq(dev->eq_table.eq[MTHCA_EQ_COMP].msi_x_vector); + else + synchronize_irq(dev->pdev->irq); + + wait_event(cq->wait, !get_cq_refcount(dev, cq)); + + if (cq->is_kernel) { + mthca_free_cq_buf(dev, &cq->buf, cq->ibcq.cqe); + if (mthca_is_memfree(dev)) { + mthca_free_db(dev, MTHCA_DB_TYPE_CQ_ARM, cq->arm_db_index); + mthca_free_db(dev, MTHCA_DB_TYPE_CQ_SET_CI, cq->set_ci_db_index); + } + } + + mthca_table_put(dev, dev->cq_table.table, cq->cqn); + mthca_free(&dev->cq_table.alloc, cq->cqn); + mthca_free_mailbox(dev, mailbox); +} + +int mthca_init_cq_table(struct mthca_dev *dev) +{ + int err; + + spin_lock_init(&dev->cq_table.lock); + + err = mthca_alloc_init(&dev->cq_table.alloc, + dev->limits.num_cqs, + (1 << 24) - 1, + dev->limits.reserved_cqs); + if (err) + return err; + + err = mthca_array_init(&dev->cq_table.cq, + dev->limits.num_cqs); + if (err) + mthca_alloc_cleanup(&dev->cq_table.alloc); + + return err; +} + +void mthca_cleanup_cq_table(struct mthca_dev *dev) +{ + mthca_array_cleanup(&dev->cq_table.cq, dev->limits.num_cqs); + mthca_alloc_cleanup(&dev->cq_table.alloc); +} diff --git a/kernel/drivers/infiniband/hw/mthca/mthca_dev.h b/kernel/drivers/infiniband/hw/mthca/mthca_dev.h new file mode 100644 index 000000000..7e6a6d64a --- /dev/null +++ b/kernel/drivers/infiniband/hw/mthca/mthca_dev.h @@ -0,0 +1,596 @@ +/* + * Copyright (c) 2004, 2005 Topspin Communications. All rights reserved. + * Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved. + * Copyright (c) 2005, 2006 Cisco Systems. All rights reserved. + * Copyright (c) 2005 Mellanox Technologies. All rights reserved. + * Copyright (c) 2004 Voltaire, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef MTHCA_DEV_H +#define MTHCA_DEV_H + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "mthca_provider.h" +#include "mthca_doorbell.h" + +#define DRV_NAME "ib_mthca" +#define PFX DRV_NAME ": " +#define DRV_VERSION "1.0" +#define DRV_RELDATE "April 4, 2008" + +enum { + MTHCA_FLAG_DDR_HIDDEN = 1 << 1, + MTHCA_FLAG_SRQ = 1 << 2, + MTHCA_FLAG_MSI_X = 1 << 3, + MTHCA_FLAG_NO_LAM = 1 << 4, + MTHCA_FLAG_FMR = 1 << 5, + MTHCA_FLAG_MEMFREE = 1 << 6, + MTHCA_FLAG_PCIE = 1 << 7, + MTHCA_FLAG_SINAI_OPT = 1 << 8 +}; + +enum { + MTHCA_MAX_PORTS = 2 +}; + +enum { + MTHCA_BOARD_ID_LEN = 64 +}; + +enum { + MTHCA_EQ_CONTEXT_SIZE = 0x40, + MTHCA_CQ_CONTEXT_SIZE = 0x40, + MTHCA_QP_CONTEXT_SIZE = 0x200, + MTHCA_RDB_ENTRY_SIZE = 0x20, + MTHCA_AV_SIZE = 0x20, + MTHCA_MGM_ENTRY_SIZE = 0x100, + + /* Arbel FW gives us these, but we need them for Tavor */ + MTHCA_MPT_ENTRY_SIZE = 0x40, + MTHCA_MTT_SEG_SIZE = 0x40, + + MTHCA_QP_PER_MGM = 4 * (MTHCA_MGM_ENTRY_SIZE / 16 - 2) +}; + +enum { + MTHCA_EQ_CMD, + MTHCA_EQ_ASYNC, + MTHCA_EQ_COMP, + MTHCA_NUM_EQ +}; + +enum { + MTHCA_OPCODE_NOP = 0x00, + MTHCA_OPCODE_RDMA_WRITE = 0x08, + MTHCA_OPCODE_RDMA_WRITE_IMM = 0x09, + MTHCA_OPCODE_SEND = 0x0a, + MTHCA_OPCODE_SEND_IMM = 0x0b, + MTHCA_OPCODE_RDMA_READ = 0x10, + MTHCA_OPCODE_ATOMIC_CS = 0x11, + MTHCA_OPCODE_ATOMIC_FA = 0x12, + MTHCA_OPCODE_BIND_MW = 0x18, + MTHCA_OPCODE_INVALID = 0xff +}; + +enum { + MTHCA_CMD_USE_EVENTS = 1 << 0, + MTHCA_CMD_POST_DOORBELLS = 1 << 1 +}; + +enum { + MTHCA_CMD_NUM_DBELL_DWORDS = 8 +}; + +struct mthca_cmd { + struct pci_pool *pool; + struct mutex hcr_mutex; + struct semaphore poll_sem; + struct semaphore event_sem; + int max_cmds; + spinlock_t context_lock; + int free_head; + struct mthca_cmd_context *context; + u16 token_mask; + u32 flags; + void __iomem *dbell_map; + u16 dbell_offsets[MTHCA_CMD_NUM_DBELL_DWORDS]; +}; + +struct mthca_limits { + int num_ports; + int vl_cap; + int mtu_cap; + int gid_table_len; + int pkey_table_len; + int local_ca_ack_delay; + int num_uars; + int max_sg; + int num_qps; + int max_wqes; + int max_desc_sz; + int max_qp_init_rdma; + int reserved_qps; + int num_srqs; + int max_srq_wqes; + int max_srq_sge; + int reserved_srqs; + int num_eecs; + int reserved_eecs; + int num_cqs; + int max_cqes; + int reserved_cqs; + int num_eqs; + int reserved_eqs; + int num_mpts; + int num_mtt_segs; + int mtt_seg_size; + int fmr_reserved_mtts; + int reserved_mtts; + int reserved_mrws; + int reserved_uars; + int num_mgms; + int num_amgms; + int reserved_mcgs; + int num_pds; + int reserved_pds; + u32 page_size_cap; + u32 flags; + u16 stat_rate_support; + u8 port_width_cap; +}; + +struct mthca_alloc { + u32 last; + u32 top; + u32 max; + u32 mask; + spinlock_t lock; + unsigned long *table; +}; + +struct mthca_array { + struct { + void **page; + int used; + } *page_list; +}; + +struct mthca_uar_table { + struct mthca_alloc alloc; + u64 uarc_base; + int uarc_size; +}; + +struct mthca_pd_table { + struct mthca_alloc alloc; +}; + +struct mthca_buddy { + unsigned long **bits; + int *num_free; + int max_order; + spinlock_t lock; +}; + +struct mthca_mr_table { + struct mthca_alloc mpt_alloc; + struct mthca_buddy mtt_buddy; + struct mthca_buddy *fmr_mtt_buddy; + u64 
mtt_base; + u64 mpt_base; + struct mthca_icm_table *mtt_table; + struct mthca_icm_table *mpt_table; + struct { + void __iomem *mpt_base; + void __iomem *mtt_base; + struct mthca_buddy mtt_buddy; + } tavor_fmr; +}; + +struct mthca_eq_table { + struct mthca_alloc alloc; + void __iomem *clr_int; + u32 clr_mask; + u32 arm_mask; + struct mthca_eq eq[MTHCA_NUM_EQ]; + u64 icm_virt; + struct page *icm_page; + dma_addr_t icm_dma; + int have_irq; + u8 inta_pin; +}; + +struct mthca_cq_table { + struct mthca_alloc alloc; + spinlock_t lock; + struct mthca_array cq; + struct mthca_icm_table *table; +}; + +struct mthca_srq_table { + struct mthca_alloc alloc; + spinlock_t lock; + struct mthca_array srq; + struct mthca_icm_table *table; +}; + +struct mthca_qp_table { + struct mthca_alloc alloc; + u32 rdb_base; + int rdb_shift; + int sqp_start; + spinlock_t lock; + struct mthca_array qp; + struct mthca_icm_table *qp_table; + struct mthca_icm_table *eqp_table; + struct mthca_icm_table *rdb_table; +}; + +struct mthca_av_table { + struct pci_pool *pool; + int num_ddr_avs; + u64 ddr_av_base; + void __iomem *av_map; + struct mthca_alloc alloc; +}; + +struct mthca_mcg_table { + struct mutex mutex; + struct mthca_alloc alloc; + struct mthca_icm_table *table; +}; + +struct mthca_catas_err { + u64 addr; + u32 __iomem *map; + u32 size; + struct timer_list timer; + struct list_head list; +}; + +extern struct mutex mthca_device_mutex; + +struct mthca_dev { + struct ib_device ib_dev; + struct pci_dev *pdev; + + int hca_type; + unsigned long mthca_flags; + unsigned long device_cap_flags; + + u32 rev_id; + char board_id[MTHCA_BOARD_ID_LEN]; + + /* firmware info */ + u64 fw_ver; + union { + struct { + u64 fw_start; + u64 fw_end; + } tavor; + struct { + u64 clr_int_base; + u64 eq_arm_base; + u64 eq_set_ci_base; + struct mthca_icm *fw_icm; + struct mthca_icm *aux_icm; + u16 fw_pages; + } arbel; + } fw; + + u64 ddr_start; + u64 ddr_end; + + MTHCA_DECLARE_DOORBELL_LOCK(doorbell_lock) + struct mutex cap_mask_mutex; + + void __iomem *hcr; + void __iomem *kar; + void __iomem *clr_base; + union { + struct { + void __iomem *ecr_base; + } tavor; + struct { + void __iomem *eq_arm; + void __iomem *eq_set_ci_base; + } arbel; + } eq_regs; + + struct mthca_cmd cmd; + struct mthca_limits limits; + + struct mthca_uar_table uar_table; + struct mthca_pd_table pd_table; + struct mthca_mr_table mr_table; + struct mthca_eq_table eq_table; + struct mthca_cq_table cq_table; + struct mthca_srq_table srq_table; + struct mthca_qp_table qp_table; + struct mthca_av_table av_table; + struct mthca_mcg_table mcg_table; + + struct mthca_catas_err catas_err; + + struct mthca_uar driver_uar; + struct mthca_db_table *db_tab; + struct mthca_pd driver_pd; + struct mthca_mr driver_mr; + + struct ib_mad_agent *send_agent[MTHCA_MAX_PORTS][2]; + struct ib_ah *sm_ah[MTHCA_MAX_PORTS]; + spinlock_t sm_lock; + u8 rate[MTHCA_MAX_PORTS]; + bool active; +}; + +#ifdef CONFIG_INFINIBAND_MTHCA_DEBUG +extern int mthca_debug_level; + +#define mthca_dbg(mdev, format, arg...) \ + do { \ + if (mthca_debug_level) \ + dev_printk(KERN_DEBUG, &mdev->pdev->dev, format, ## arg); \ + } while (0) + +#else /* CONFIG_INFINIBAND_MTHCA_DEBUG */ + +#define mthca_dbg(mdev, format, arg...) do { (void) mdev; } while (0) + +#endif /* CONFIG_INFINIBAND_MTHCA_DEBUG */ + +#define mthca_err(mdev, format, arg...) \ + dev_err(&mdev->pdev->dev, format, ## arg) +#define mthca_info(mdev, format, arg...) \ + dev_info(&mdev->pdev->dev, format, ## arg) +#define mthca_warn(mdev, format, arg...) 
\ + dev_warn(&mdev->pdev->dev, format, ## arg) + +extern void __buggy_use_of_MTHCA_GET(void); +extern void __buggy_use_of_MTHCA_PUT(void); + +#define MTHCA_GET(dest, source, offset) \ + do { \ + void *__p = (char *) (source) + (offset); \ + switch (sizeof (dest)) { \ + case 1: (dest) = *(u8 *) __p; break; \ + case 2: (dest) = be16_to_cpup(__p); break; \ + case 4: (dest) = be32_to_cpup(__p); break; \ + case 8: (dest) = be64_to_cpup(__p); break; \ + default: __buggy_use_of_MTHCA_GET(); \ + } \ + } while (0) + +#define MTHCA_PUT(dest, source, offset) \ + do { \ + void *__d = ((char *) (dest) + (offset)); \ + switch (sizeof(source)) { \ + case 1: *(u8 *) __d = (source); break; \ + case 2: *(__be16 *) __d = cpu_to_be16(source); break; \ + case 4: *(__be32 *) __d = cpu_to_be32(source); break; \ + case 8: *(__be64 *) __d = cpu_to_be64(source); break; \ + default: __buggy_use_of_MTHCA_PUT(); \ + } \ + } while (0) + +int mthca_reset(struct mthca_dev *mdev); + +u32 mthca_alloc(struct mthca_alloc *alloc); +void mthca_free(struct mthca_alloc *alloc, u32 obj); +int mthca_alloc_init(struct mthca_alloc *alloc, u32 num, u32 mask, + u32 reserved); +void mthca_alloc_cleanup(struct mthca_alloc *alloc); +void *mthca_array_get(struct mthca_array *array, int index); +int mthca_array_set(struct mthca_array *array, int index, void *value); +void mthca_array_clear(struct mthca_array *array, int index); +int mthca_array_init(struct mthca_array *array, int nent); +void mthca_array_cleanup(struct mthca_array *array, int nent); +int mthca_buf_alloc(struct mthca_dev *dev, int size, int max_direct, + union mthca_buf *buf, int *is_direct, struct mthca_pd *pd, + int hca_write, struct mthca_mr *mr); +void mthca_buf_free(struct mthca_dev *dev, int size, union mthca_buf *buf, + int is_direct, struct mthca_mr *mr); + +int mthca_init_uar_table(struct mthca_dev *dev); +int mthca_init_pd_table(struct mthca_dev *dev); +int mthca_init_mr_table(struct mthca_dev *dev); +int mthca_init_eq_table(struct mthca_dev *dev); +int mthca_init_cq_table(struct mthca_dev *dev); +int mthca_init_srq_table(struct mthca_dev *dev); +int mthca_init_qp_table(struct mthca_dev *dev); +int mthca_init_av_table(struct mthca_dev *dev); +int mthca_init_mcg_table(struct mthca_dev *dev); + +void mthca_cleanup_uar_table(struct mthca_dev *dev); +void mthca_cleanup_pd_table(struct mthca_dev *dev); +void mthca_cleanup_mr_table(struct mthca_dev *dev); +void mthca_cleanup_eq_table(struct mthca_dev *dev); +void mthca_cleanup_cq_table(struct mthca_dev *dev); +void mthca_cleanup_srq_table(struct mthca_dev *dev); +void mthca_cleanup_qp_table(struct mthca_dev *dev); +void mthca_cleanup_av_table(struct mthca_dev *dev); +void mthca_cleanup_mcg_table(struct mthca_dev *dev); + +int mthca_register_device(struct mthca_dev *dev); +void mthca_unregister_device(struct mthca_dev *dev); + +void mthca_start_catas_poll(struct mthca_dev *dev); +void mthca_stop_catas_poll(struct mthca_dev *dev); +int __mthca_restart_one(struct pci_dev *pdev); +int mthca_catas_init(void); +void mthca_catas_cleanup(void); + +int mthca_uar_alloc(struct mthca_dev *dev, struct mthca_uar *uar); +void mthca_uar_free(struct mthca_dev *dev, struct mthca_uar *uar); + +int mthca_pd_alloc(struct mthca_dev *dev, int privileged, struct mthca_pd *pd); +void mthca_pd_free(struct mthca_dev *dev, struct mthca_pd *pd); + +int mthca_write_mtt_size(struct mthca_dev *dev); + +struct mthca_mtt *mthca_alloc_mtt(struct mthca_dev *dev, int size); +void mthca_free_mtt(struct mthca_dev *dev, struct mthca_mtt *mtt); +int 
mthca_write_mtt(struct mthca_dev *dev, struct mthca_mtt *mtt, + int start_index, u64 *buffer_list, int list_len); +int mthca_mr_alloc(struct mthca_dev *dev, u32 pd, int buffer_size_shift, + u64 iova, u64 total_size, u32 access, struct mthca_mr *mr); +int mthca_mr_alloc_notrans(struct mthca_dev *dev, u32 pd, + u32 access, struct mthca_mr *mr); +int mthca_mr_alloc_phys(struct mthca_dev *dev, u32 pd, + u64 *buffer_list, int buffer_size_shift, + int list_len, u64 iova, u64 total_size, + u32 access, struct mthca_mr *mr); +void mthca_free_mr(struct mthca_dev *dev, struct mthca_mr *mr); + +int mthca_fmr_alloc(struct mthca_dev *dev, u32 pd, + u32 access, struct mthca_fmr *fmr); +int mthca_tavor_map_phys_fmr(struct ib_fmr *ibfmr, u64 *page_list, + int list_len, u64 iova); +void mthca_tavor_fmr_unmap(struct mthca_dev *dev, struct mthca_fmr *fmr); +int mthca_arbel_map_phys_fmr(struct ib_fmr *ibfmr, u64 *page_list, + int list_len, u64 iova); +void mthca_arbel_fmr_unmap(struct mthca_dev *dev, struct mthca_fmr *fmr); +int mthca_free_fmr(struct mthca_dev *dev, struct mthca_fmr *fmr); + +int mthca_map_eq_icm(struct mthca_dev *dev, u64 icm_virt); +void mthca_unmap_eq_icm(struct mthca_dev *dev); + +int mthca_poll_cq(struct ib_cq *ibcq, int num_entries, + struct ib_wc *entry); +int mthca_tavor_arm_cq(struct ib_cq *cq, enum ib_cq_notify_flags flags); +int mthca_arbel_arm_cq(struct ib_cq *cq, enum ib_cq_notify_flags flags); +int mthca_init_cq(struct mthca_dev *dev, int nent, + struct mthca_ucontext *ctx, u32 pdn, + struct mthca_cq *cq); +void mthca_free_cq(struct mthca_dev *dev, + struct mthca_cq *cq); +void mthca_cq_completion(struct mthca_dev *dev, u32 cqn); +void mthca_cq_event(struct mthca_dev *dev, u32 cqn, + enum ib_event_type event_type); +void mthca_cq_clean(struct mthca_dev *dev, struct mthca_cq *cq, u32 qpn, + struct mthca_srq *srq); +void mthca_cq_resize_copy_cqes(struct mthca_cq *cq); +int mthca_alloc_cq_buf(struct mthca_dev *dev, struct mthca_cq_buf *buf, int nent); +void mthca_free_cq_buf(struct mthca_dev *dev, struct mthca_cq_buf *buf, int cqe); + +int mthca_alloc_srq(struct mthca_dev *dev, struct mthca_pd *pd, + struct ib_srq_attr *attr, struct mthca_srq *srq); +void mthca_free_srq(struct mthca_dev *dev, struct mthca_srq *srq); +int mthca_modify_srq(struct ib_srq *ibsrq, struct ib_srq_attr *attr, + enum ib_srq_attr_mask attr_mask, struct ib_udata *udata); +int mthca_query_srq(struct ib_srq *srq, struct ib_srq_attr *srq_attr); +int mthca_max_srq_sge(struct mthca_dev *dev); +void mthca_srq_event(struct mthca_dev *dev, u32 srqn, + enum ib_event_type event_type); +void mthca_free_srq_wqe(struct mthca_srq *srq, u32 wqe_addr); +int mthca_tavor_post_srq_recv(struct ib_srq *srq, struct ib_recv_wr *wr, + struct ib_recv_wr **bad_wr); +int mthca_arbel_post_srq_recv(struct ib_srq *srq, struct ib_recv_wr *wr, + struct ib_recv_wr **bad_wr); + +void mthca_qp_event(struct mthca_dev *dev, u32 qpn, + enum ib_event_type event_type); +int mthca_query_qp(struct ib_qp *ibqp, struct ib_qp_attr *qp_attr, int qp_attr_mask, + struct ib_qp_init_attr *qp_init_attr); +int mthca_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, int attr_mask, + struct ib_udata *udata); +int mthca_tavor_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr, + struct ib_send_wr **bad_wr); +int mthca_tavor_post_receive(struct ib_qp *ibqp, struct ib_recv_wr *wr, + struct ib_recv_wr **bad_wr); +int mthca_arbel_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr, + struct ib_send_wr **bad_wr); +int mthca_arbel_post_receive(struct ib_qp *ibqp, 
struct ib_recv_wr *wr, + struct ib_recv_wr **bad_wr); +void mthca_free_err_wqe(struct mthca_dev *dev, struct mthca_qp *qp, int is_send, + int index, int *dbd, __be32 *new_wqe); +int mthca_alloc_qp(struct mthca_dev *dev, + struct mthca_pd *pd, + struct mthca_cq *send_cq, + struct mthca_cq *recv_cq, + enum ib_qp_type type, + enum ib_sig_type send_policy, + struct ib_qp_cap *cap, + struct mthca_qp *qp); +int mthca_alloc_sqp(struct mthca_dev *dev, + struct mthca_pd *pd, + struct mthca_cq *send_cq, + struct mthca_cq *recv_cq, + enum ib_sig_type send_policy, + struct ib_qp_cap *cap, + int qpn, + int port, + struct mthca_sqp *sqp); +void mthca_free_qp(struct mthca_dev *dev, struct mthca_qp *qp); +int mthca_create_ah(struct mthca_dev *dev, + struct mthca_pd *pd, + struct ib_ah_attr *ah_attr, + struct mthca_ah *ah); +int mthca_destroy_ah(struct mthca_dev *dev, struct mthca_ah *ah); +int mthca_read_ah(struct mthca_dev *dev, struct mthca_ah *ah, + struct ib_ud_header *header); +int mthca_ah_query(struct ib_ah *ibah, struct ib_ah_attr *attr); +int mthca_ah_grh_present(struct mthca_ah *ah); +u8 mthca_get_rate(struct mthca_dev *dev, int static_rate, u8 port); +enum ib_rate mthca_rate_to_ib(struct mthca_dev *dev, u8 mthca_rate, u8 port); + +int mthca_multicast_attach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid); +int mthca_multicast_detach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid); + +int mthca_process_mad(struct ib_device *ibdev, + int mad_flags, + u8 port_num, + struct ib_wc *in_wc, + struct ib_grh *in_grh, + struct ib_mad *in_mad, + struct ib_mad *out_mad); +int mthca_create_agents(struct mthca_dev *dev); +void mthca_free_agents(struct mthca_dev *dev); + +static inline struct mthca_dev *to_mdev(struct ib_device *ibdev) +{ + return container_of(ibdev, struct mthca_dev, ib_dev); +} + +static inline int mthca_is_memfree(struct mthca_dev *dev) +{ + return dev->mthca_flags & MTHCA_FLAG_MEMFREE; +} + +#endif /* MTHCA_DEV_H */ diff --git a/kernel/drivers/infiniband/hw/mthca/mthca_doorbell.h b/kernel/drivers/infiniband/hw/mthca/mthca_doorbell.h new file mode 100644 index 000000000..14f51ef97 --- /dev/null +++ b/kernel/drivers/infiniband/hw/mthca/mthca_doorbell.h @@ -0,0 +1,109 @@ +/* + * Copyright (c) 2004 Topspin Communications. All rights reserved. + * Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved. + * Copyright (c) 2005 Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include + +#define MTHCA_RD_DOORBELL 0x00 +#define MTHCA_SEND_DOORBELL 0x10 +#define MTHCA_RECEIVE_DOORBELL 0x18 +#define MTHCA_CQ_DOORBELL 0x20 +#define MTHCA_EQ_DOORBELL 0x28 + +#if BITS_PER_LONG == 64 +/* + * Assume that we can just write a 64-bit doorbell atomically. s390 + * actually doesn't have writeq() but S/390 systems don't even have + * PCI so we won't worry about it. + */ + +#define MTHCA_DECLARE_DOORBELL_LOCK(name) +#define MTHCA_INIT_DOORBELL_LOCK(ptr) do { } while (0) +#define MTHCA_GET_DOORBELL_LOCK(ptr) (NULL) + +static inline void mthca_write64_raw(__be64 val, void __iomem *dest) +{ + __raw_writeq((__force u64) val, dest); +} + +static inline void mthca_write64(u32 hi, u32 lo, void __iomem *dest, + spinlock_t *doorbell_lock) +{ + __raw_writeq((__force u64) cpu_to_be64((u64) hi << 32 | lo), dest); +} + +static inline void mthca_write_db_rec(__be32 val[2], __be32 *db) +{ + *(u64 *) db = *(u64 *) val; +} + +#else + +/* + * Just fall back to a spinlock to protect the doorbell if + * BITS_PER_LONG is 32 -- there's no portable way to do atomic 64-bit + * MMIO writes. + */ + +#define MTHCA_DECLARE_DOORBELL_LOCK(name) spinlock_t name; +#define MTHCA_INIT_DOORBELL_LOCK(ptr) spin_lock_init(ptr) +#define MTHCA_GET_DOORBELL_LOCK(ptr) (ptr) + +static inline void mthca_write64_raw(__be64 val, void __iomem *dest) +{ + __raw_writel(((__force u32 *) &val)[0], dest); + __raw_writel(((__force u32 *) &val)[1], dest + 4); +} + +static inline void mthca_write64(u32 hi, u32 lo, void __iomem *dest, + spinlock_t *doorbell_lock) +{ + unsigned long flags; + + hi = (__force u32) cpu_to_be32(hi); + lo = (__force u32) cpu_to_be32(lo); + + spin_lock_irqsave(doorbell_lock, flags); + __raw_writel(hi, dest); + __raw_writel(lo, dest + 4); + spin_unlock_irqrestore(doorbell_lock, flags); +} + +static inline void mthca_write_db_rec(__be32 val[2], __be32 *db) +{ + db[0] = val[0]; + wmb(); + db[1] = val[1]; +} + +#endif diff --git a/kernel/drivers/infiniband/hw/mthca/mthca_eq.c b/kernel/drivers/infiniband/hw/mthca/mthca_eq.c new file mode 100644 index 000000000..690201738 --- /dev/null +++ b/kernel/drivers/infiniband/hw/mthca/mthca_eq.c @@ -0,0 +1,905 @@ +/* + * Copyright (c) 2004, 2005 Topspin Communications. All rights reserved. + * Copyright (c) 2005 Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include +#include +#include +#include + +#include "mthca_dev.h" +#include "mthca_cmd.h" +#include "mthca_config_reg.h" + +enum { + MTHCA_NUM_ASYNC_EQE = 0x80, + MTHCA_NUM_CMD_EQE = 0x80, + MTHCA_NUM_SPARE_EQE = 0x80, + MTHCA_EQ_ENTRY_SIZE = 0x20 +}; + +/* + * Must be packed because start is 64 bits but only aligned to 32 bits. + */ +struct mthca_eq_context { + __be32 flags; + __be64 start; + __be32 logsize_usrpage; + __be32 tavor_pd; /* reserved for Arbel */ + u8 reserved1[3]; + u8 intr; + __be32 arbel_pd; /* lost_count for Tavor */ + __be32 lkey; + u32 reserved2[2]; + __be32 consumer_index; + __be32 producer_index; + u32 reserved3[4]; +} __attribute__((packed)); + +#define MTHCA_EQ_STATUS_OK ( 0 << 28) +#define MTHCA_EQ_STATUS_OVERFLOW ( 9 << 28) +#define MTHCA_EQ_STATUS_WRITE_FAIL (10 << 28) +#define MTHCA_EQ_OWNER_SW ( 0 << 24) +#define MTHCA_EQ_OWNER_HW ( 1 << 24) +#define MTHCA_EQ_FLAG_TR ( 1 << 18) +#define MTHCA_EQ_FLAG_OI ( 1 << 17) +#define MTHCA_EQ_STATE_ARMED ( 1 << 8) +#define MTHCA_EQ_STATE_FIRED ( 2 << 8) +#define MTHCA_EQ_STATE_ALWAYS_ARMED ( 3 << 8) +#define MTHCA_EQ_STATE_ARBEL ( 8 << 8) + +enum { + MTHCA_EVENT_TYPE_COMP = 0x00, + MTHCA_EVENT_TYPE_PATH_MIG = 0x01, + MTHCA_EVENT_TYPE_COMM_EST = 0x02, + MTHCA_EVENT_TYPE_SQ_DRAINED = 0x03, + MTHCA_EVENT_TYPE_SRQ_QP_LAST_WQE = 0x13, + MTHCA_EVENT_TYPE_SRQ_LIMIT = 0x14, + MTHCA_EVENT_TYPE_CQ_ERROR = 0x04, + MTHCA_EVENT_TYPE_WQ_CATAS_ERROR = 0x05, + MTHCA_EVENT_TYPE_EEC_CATAS_ERROR = 0x06, + MTHCA_EVENT_TYPE_PATH_MIG_FAILED = 0x07, + MTHCA_EVENT_TYPE_WQ_INVAL_REQ_ERROR = 0x10, + MTHCA_EVENT_TYPE_WQ_ACCESS_ERROR = 0x11, + MTHCA_EVENT_TYPE_SRQ_CATAS_ERROR = 0x12, + MTHCA_EVENT_TYPE_LOCAL_CATAS_ERROR = 0x08, + MTHCA_EVENT_TYPE_PORT_CHANGE = 0x09, + MTHCA_EVENT_TYPE_EQ_OVERFLOW = 0x0f, + MTHCA_EVENT_TYPE_ECC_DETECT = 0x0e, + MTHCA_EVENT_TYPE_CMD = 0x0a +}; + +#define MTHCA_ASYNC_EVENT_MASK ((1ULL << MTHCA_EVENT_TYPE_PATH_MIG) | \ + (1ULL << MTHCA_EVENT_TYPE_COMM_EST) | \ + (1ULL << MTHCA_EVENT_TYPE_SQ_DRAINED) | \ + (1ULL << MTHCA_EVENT_TYPE_CQ_ERROR) | \ + (1ULL << MTHCA_EVENT_TYPE_WQ_CATAS_ERROR) | \ + (1ULL << MTHCA_EVENT_TYPE_EEC_CATAS_ERROR) | \ + (1ULL << MTHCA_EVENT_TYPE_PATH_MIG_FAILED) | \ + (1ULL << MTHCA_EVENT_TYPE_WQ_INVAL_REQ_ERROR) | \ + (1ULL << MTHCA_EVENT_TYPE_WQ_ACCESS_ERROR) | \ + (1ULL << MTHCA_EVENT_TYPE_LOCAL_CATAS_ERROR) | \ + (1ULL << MTHCA_EVENT_TYPE_PORT_CHANGE) | \ + (1ULL << MTHCA_EVENT_TYPE_ECC_DETECT)) +#define MTHCA_SRQ_EVENT_MASK ((1ULL << MTHCA_EVENT_TYPE_SRQ_CATAS_ERROR) | \ + (1ULL << MTHCA_EVENT_TYPE_SRQ_QP_LAST_WQE) | \ + (1ULL << MTHCA_EVENT_TYPE_SRQ_LIMIT)) +#define MTHCA_CMD_EVENT_MASK (1ULL << MTHCA_EVENT_TYPE_CMD) + +#define MTHCA_EQ_DB_INC_CI (1 << 24) +#define MTHCA_EQ_DB_REQ_NOT (2 << 24) +#define MTHCA_EQ_DB_DISARM_CQ (3 << 24) +#define MTHCA_EQ_DB_SET_CI (4 << 24) +#define MTHCA_EQ_DB_ALWAYS_ARM (5 << 24) + +struct mthca_eqe { + u8 reserved1; + u8 type; + u8 reserved2; + u8 subtype; + union { + u32 raw[6]; + struct { + __be32 cqn; + } __attribute__((packed)) comp; + struct { + u16 
reserved1; + __be16 token; + u32 reserved2; + u8 reserved3[3]; + u8 status; + __be64 out_param; + } __attribute__((packed)) cmd; + struct { + __be32 qpn; + } __attribute__((packed)) qp; + struct { + __be32 srqn; + } __attribute__((packed)) srq; + struct { + __be32 cqn; + u32 reserved1; + u8 reserved2[3]; + u8 syndrome; + } __attribute__((packed)) cq_err; + struct { + u32 reserved1[2]; + __be32 port; + } __attribute__((packed)) port_change; + } event; + u8 reserved3[3]; + u8 owner; +} __attribute__((packed)); + +#define MTHCA_EQ_ENTRY_OWNER_SW (0 << 7) +#define MTHCA_EQ_ENTRY_OWNER_HW (1 << 7) + +static inline u64 async_mask(struct mthca_dev *dev) +{ + return dev->mthca_flags & MTHCA_FLAG_SRQ ? + MTHCA_ASYNC_EVENT_MASK | MTHCA_SRQ_EVENT_MASK : + MTHCA_ASYNC_EVENT_MASK; +} + +static inline void tavor_set_eq_ci(struct mthca_dev *dev, struct mthca_eq *eq, u32 ci) +{ + /* + * This barrier makes sure that all updates to ownership bits + * done by set_eqe_hw() hit memory before the consumer index + * is updated. set_eq_ci() allows the HCA to possibly write + * more EQ entries, and we want to avoid the exceedingly + * unlikely possibility of the HCA writing an entry and then + * having set_eqe_hw() overwrite the owner field. + */ + wmb(); + mthca_write64(MTHCA_EQ_DB_SET_CI | eq->eqn, ci & (eq->nent - 1), + dev->kar + MTHCA_EQ_DOORBELL, + MTHCA_GET_DOORBELL_LOCK(&dev->doorbell_lock)); +} + +static inline void arbel_set_eq_ci(struct mthca_dev *dev, struct mthca_eq *eq, u32 ci) +{ + /* See comment in tavor_set_eq_ci() above. */ + wmb(); + __raw_writel((__force u32) cpu_to_be32(ci), + dev->eq_regs.arbel.eq_set_ci_base + eq->eqn * 8); + /* We still want ordering, just not swabbing, so add a barrier */ + mb(); +} + +static inline void set_eq_ci(struct mthca_dev *dev, struct mthca_eq *eq, u32 ci) +{ + if (mthca_is_memfree(dev)) + arbel_set_eq_ci(dev, eq, ci); + else + tavor_set_eq_ci(dev, eq, ci); +} + +static inline void tavor_eq_req_not(struct mthca_dev *dev, int eqn) +{ + mthca_write64(MTHCA_EQ_DB_REQ_NOT | eqn, 0, + dev->kar + MTHCA_EQ_DOORBELL, + MTHCA_GET_DOORBELL_LOCK(&dev->doorbell_lock)); +} + +static inline void arbel_eq_req_not(struct mthca_dev *dev, u32 eqn_mask) +{ + writel(eqn_mask, dev->eq_regs.arbel.eq_arm); +} + +static inline void disarm_cq(struct mthca_dev *dev, int eqn, int cqn) +{ + if (!mthca_is_memfree(dev)) { + mthca_write64(MTHCA_EQ_DB_DISARM_CQ | eqn, cqn, + dev->kar + MTHCA_EQ_DOORBELL, + MTHCA_GET_DOORBELL_LOCK(&dev->doorbell_lock)); + } +} + +static inline struct mthca_eqe *get_eqe(struct mthca_eq *eq, u32 entry) +{ + unsigned long off = (entry & (eq->nent - 1)) * MTHCA_EQ_ENTRY_SIZE; + return eq->page_list[off / PAGE_SIZE].buf + off % PAGE_SIZE; +} + +static inline struct mthca_eqe *next_eqe_sw(struct mthca_eq *eq) +{ + struct mthca_eqe *eqe; + eqe = get_eqe(eq, eq->cons_index); + return (MTHCA_EQ_ENTRY_OWNER_HW & eqe->owner) ? NULL : eqe; +} + +static inline void set_eqe_hw(struct mthca_eqe *eqe) +{ + eqe->owner = MTHCA_EQ_ENTRY_OWNER_HW; +} + +static void port_change(struct mthca_dev *dev, int port, int active) +{ + struct ib_event record; + + mthca_dbg(dev, "Port change to %s for port %d\n", + active ? "active" : "down", port); + + record.device = &dev->ib_dev; + record.event = active ? 
IB_EVENT_PORT_ACTIVE : IB_EVENT_PORT_ERR; + record.element.port_num = port; + + ib_dispatch_event(&record); +} + +static int mthca_eq_int(struct mthca_dev *dev, struct mthca_eq *eq) +{ + struct mthca_eqe *eqe; + int disarm_cqn; + int eqes_found = 0; + int set_ci = 0; + + while ((eqe = next_eqe_sw(eq))) { + /* + * Make sure we read EQ entry contents after we've + * checked the ownership bit. + */ + rmb(); + + switch (eqe->type) { + case MTHCA_EVENT_TYPE_COMP: + disarm_cqn = be32_to_cpu(eqe->event.comp.cqn) & 0xffffff; + disarm_cq(dev, eq->eqn, disarm_cqn); + mthca_cq_completion(dev, disarm_cqn); + break; + + case MTHCA_EVENT_TYPE_PATH_MIG: + mthca_qp_event(dev, be32_to_cpu(eqe->event.qp.qpn) & 0xffffff, + IB_EVENT_PATH_MIG); + break; + + case MTHCA_EVENT_TYPE_COMM_EST: + mthca_qp_event(dev, be32_to_cpu(eqe->event.qp.qpn) & 0xffffff, + IB_EVENT_COMM_EST); + break; + + case MTHCA_EVENT_TYPE_SQ_DRAINED: + mthca_qp_event(dev, be32_to_cpu(eqe->event.qp.qpn) & 0xffffff, + IB_EVENT_SQ_DRAINED); + break; + + case MTHCA_EVENT_TYPE_SRQ_QP_LAST_WQE: + mthca_qp_event(dev, be32_to_cpu(eqe->event.qp.qpn) & 0xffffff, + IB_EVENT_QP_LAST_WQE_REACHED); + break; + + case MTHCA_EVENT_TYPE_SRQ_LIMIT: + mthca_srq_event(dev, be32_to_cpu(eqe->event.srq.srqn) & 0xffffff, + IB_EVENT_SRQ_LIMIT_REACHED); + break; + + case MTHCA_EVENT_TYPE_WQ_CATAS_ERROR: + mthca_qp_event(dev, be32_to_cpu(eqe->event.qp.qpn) & 0xffffff, + IB_EVENT_QP_FATAL); + break; + + case MTHCA_EVENT_TYPE_PATH_MIG_FAILED: + mthca_qp_event(dev, be32_to_cpu(eqe->event.qp.qpn) & 0xffffff, + IB_EVENT_PATH_MIG_ERR); + break; + + case MTHCA_EVENT_TYPE_WQ_INVAL_REQ_ERROR: + mthca_qp_event(dev, be32_to_cpu(eqe->event.qp.qpn) & 0xffffff, + IB_EVENT_QP_REQ_ERR); + break; + + case MTHCA_EVENT_TYPE_WQ_ACCESS_ERROR: + mthca_qp_event(dev, be32_to_cpu(eqe->event.qp.qpn) & 0xffffff, + IB_EVENT_QP_ACCESS_ERR); + break; + + case MTHCA_EVENT_TYPE_CMD: + mthca_cmd_event(dev, + be16_to_cpu(eqe->event.cmd.token), + eqe->event.cmd.status, + be64_to_cpu(eqe->event.cmd.out_param)); + break; + + case MTHCA_EVENT_TYPE_PORT_CHANGE: + port_change(dev, + (be32_to_cpu(eqe->event.port_change.port) >> 28) & 3, + eqe->subtype == 0x4); + break; + + case MTHCA_EVENT_TYPE_CQ_ERROR: + mthca_warn(dev, "CQ %s on CQN %06x\n", + eqe->event.cq_err.syndrome == 1 ? + "overrun" : "access violation", + be32_to_cpu(eqe->event.cq_err.cqn) & 0xffffff); + mthca_cq_event(dev, be32_to_cpu(eqe->event.cq_err.cqn), + IB_EVENT_CQ_ERR); + break; + + case MTHCA_EVENT_TYPE_EQ_OVERFLOW: + mthca_warn(dev, "EQ overrun on EQN %d\n", eq->eqn); + break; + + case MTHCA_EVENT_TYPE_EEC_CATAS_ERROR: + case MTHCA_EVENT_TYPE_SRQ_CATAS_ERROR: + case MTHCA_EVENT_TYPE_LOCAL_CATAS_ERROR: + case MTHCA_EVENT_TYPE_ECC_DETECT: + default: + mthca_warn(dev, "Unhandled event %02x(%02x) on EQ %d\n", + eqe->type, eqe->subtype, eq->eqn); + break; + } + + set_eqe_hw(eqe); + ++eq->cons_index; + eqes_found = 1; + ++set_ci; + + /* + * The HCA will think the queue has overflowed if we + * don't tell it we've been processing events. We + * create our EQs with MTHCA_NUM_SPARE_EQE extra + * entries, so we must update our consumer index at + * least that often. + */ + if (unlikely(set_ci >= MTHCA_NUM_SPARE_EQE)) { + /* + * Conditional on hca_type is OK here because + * this is a rare case, not the fast path. + */ + set_eq_ci(dev, eq, eq->cons_index); + set_ci = 0; + } + } + + /* + * Rely on caller to set consumer index so that we don't have + * to test hca_type in our interrupt handling fast path. 
+ */ + return eqes_found; +} + +static irqreturn_t mthca_tavor_interrupt(int irq, void *dev_ptr) +{ + struct mthca_dev *dev = dev_ptr; + u32 ecr; + int i; + + if (dev->eq_table.clr_mask) + writel(dev->eq_table.clr_mask, dev->eq_table.clr_int); + + ecr = readl(dev->eq_regs.tavor.ecr_base + 4); + if (!ecr) + return IRQ_NONE; + + writel(ecr, dev->eq_regs.tavor.ecr_base + + MTHCA_ECR_CLR_BASE - MTHCA_ECR_BASE + 4); + + for (i = 0; i < MTHCA_NUM_EQ; ++i) + if (ecr & dev->eq_table.eq[i].eqn_mask) { + if (mthca_eq_int(dev, &dev->eq_table.eq[i])) + tavor_set_eq_ci(dev, &dev->eq_table.eq[i], + dev->eq_table.eq[i].cons_index); + tavor_eq_req_not(dev, dev->eq_table.eq[i].eqn); + } + + return IRQ_HANDLED; +} + +static irqreturn_t mthca_tavor_msi_x_interrupt(int irq, void *eq_ptr) +{ + struct mthca_eq *eq = eq_ptr; + struct mthca_dev *dev = eq->dev; + + mthca_eq_int(dev, eq); + tavor_set_eq_ci(dev, eq, eq->cons_index); + tavor_eq_req_not(dev, eq->eqn); + + /* MSI-X vectors always belong to us */ + return IRQ_HANDLED; +} + +static irqreturn_t mthca_arbel_interrupt(int irq, void *dev_ptr) +{ + struct mthca_dev *dev = dev_ptr; + int work = 0; + int i; + + if (dev->eq_table.clr_mask) + writel(dev->eq_table.clr_mask, dev->eq_table.clr_int); + + for (i = 0; i < MTHCA_NUM_EQ; ++i) + if (mthca_eq_int(dev, &dev->eq_table.eq[i])) { + work = 1; + arbel_set_eq_ci(dev, &dev->eq_table.eq[i], + dev->eq_table.eq[i].cons_index); + } + + arbel_eq_req_not(dev, dev->eq_table.arm_mask); + + return IRQ_RETVAL(work); +} + +static irqreturn_t mthca_arbel_msi_x_interrupt(int irq, void *eq_ptr) +{ + struct mthca_eq *eq = eq_ptr; + struct mthca_dev *dev = eq->dev; + + mthca_eq_int(dev, eq); + arbel_set_eq_ci(dev, eq, eq->cons_index); + arbel_eq_req_not(dev, eq->eqn_mask); + + /* MSI-X vectors always belong to us */ + return IRQ_HANDLED; +} + +static int mthca_create_eq(struct mthca_dev *dev, + int nent, + u8 intr, + struct mthca_eq *eq) +{ + int npages; + u64 *dma_list = NULL; + dma_addr_t t; + struct mthca_mailbox *mailbox; + struct mthca_eq_context *eq_context; + int err = -ENOMEM; + int i; + + eq->dev = dev; + eq->nent = roundup_pow_of_two(max(nent, 2)); + npages = ALIGN(eq->nent * MTHCA_EQ_ENTRY_SIZE, PAGE_SIZE) / PAGE_SIZE; + + eq->page_list = kmalloc(npages * sizeof *eq->page_list, + GFP_KERNEL); + if (!eq->page_list) + goto err_out; + + for (i = 0; i < npages; ++i) + eq->page_list[i].buf = NULL; + + dma_list = kmalloc(npages * sizeof *dma_list, GFP_KERNEL); + if (!dma_list) + goto err_out_free; + + mailbox = mthca_alloc_mailbox(dev, GFP_KERNEL); + if (IS_ERR(mailbox)) + goto err_out_free; + eq_context = mailbox->buf; + + for (i = 0; i < npages; ++i) { + eq->page_list[i].buf = dma_alloc_coherent(&dev->pdev->dev, + PAGE_SIZE, &t, GFP_KERNEL); + if (!eq->page_list[i].buf) + goto err_out_free_pages; + + dma_list[i] = t; + dma_unmap_addr_set(&eq->page_list[i], mapping, t); + + clear_page(eq->page_list[i].buf); + } + + for (i = 0; i < eq->nent; ++i) + set_eqe_hw(get_eqe(eq, i)); + + eq->eqn = mthca_alloc(&dev->eq_table.alloc); + if (eq->eqn == -1) + goto err_out_free_pages; + + err = mthca_mr_alloc_phys(dev, dev->driver_pd.pd_num, + dma_list, PAGE_SHIFT, npages, + 0, npages * PAGE_SIZE, + MTHCA_MPT_FLAG_LOCAL_WRITE | + MTHCA_MPT_FLAG_LOCAL_READ, + &eq->mr); + if (err) + goto err_out_free_eq; + + memset(eq_context, 0, sizeof *eq_context); + eq_context->flags = cpu_to_be32(MTHCA_EQ_STATUS_OK | + MTHCA_EQ_OWNER_HW | + MTHCA_EQ_STATE_ARMED | + MTHCA_EQ_FLAG_TR); + if (mthca_is_memfree(dev)) + eq_context->flags |= 
cpu_to_be32(MTHCA_EQ_STATE_ARBEL); + + eq_context->logsize_usrpage = cpu_to_be32((ffs(eq->nent) - 1) << 24); + if (mthca_is_memfree(dev)) { + eq_context->arbel_pd = cpu_to_be32(dev->driver_pd.pd_num); + } else { + eq_context->logsize_usrpage |= cpu_to_be32(dev->driver_uar.index); + eq_context->tavor_pd = cpu_to_be32(dev->driver_pd.pd_num); + } + eq_context->intr = intr; + eq_context->lkey = cpu_to_be32(eq->mr.ibmr.lkey); + + err = mthca_SW2HW_EQ(dev, mailbox, eq->eqn); + if (err) { + mthca_warn(dev, "SW2HW_EQ returned %d\n", err); + goto err_out_free_mr; + } + + kfree(dma_list); + mthca_free_mailbox(dev, mailbox); + + eq->eqn_mask = swab32(1 << eq->eqn); + eq->cons_index = 0; + + dev->eq_table.arm_mask |= eq->eqn_mask; + + mthca_dbg(dev, "Allocated EQ %d with %d entries\n", + eq->eqn, eq->nent); + + return err; + + err_out_free_mr: + mthca_free_mr(dev, &eq->mr); + + err_out_free_eq: + mthca_free(&dev->eq_table.alloc, eq->eqn); + + err_out_free_pages: + for (i = 0; i < npages; ++i) + if (eq->page_list[i].buf) + dma_free_coherent(&dev->pdev->dev, PAGE_SIZE, + eq->page_list[i].buf, + dma_unmap_addr(&eq->page_list[i], + mapping)); + + mthca_free_mailbox(dev, mailbox); + + err_out_free: + kfree(eq->page_list); + kfree(dma_list); + + err_out: + return err; +} + +static void mthca_free_eq(struct mthca_dev *dev, + struct mthca_eq *eq) +{ + struct mthca_mailbox *mailbox; + int err; + int npages = (eq->nent * MTHCA_EQ_ENTRY_SIZE + PAGE_SIZE - 1) / + PAGE_SIZE; + int i; + + mailbox = mthca_alloc_mailbox(dev, GFP_KERNEL); + if (IS_ERR(mailbox)) + return; + + err = mthca_HW2SW_EQ(dev, mailbox, eq->eqn); + if (err) + mthca_warn(dev, "HW2SW_EQ returned %d\n", err); + + dev->eq_table.arm_mask &= ~eq->eqn_mask; + + if (0) { + mthca_dbg(dev, "Dumping EQ context %02x:\n", eq->eqn); + for (i = 0; i < sizeof (struct mthca_eq_context) / 4; ++i) { + if (i % 4 == 0) + printk("[%02x] ", i * 4); + printk(" %08x", be32_to_cpup(mailbox->buf + i * 4)); + if ((i + 1) % 4 == 0) + printk("\n"); + } + } + + mthca_free_mr(dev, &eq->mr); + for (i = 0; i < npages; ++i) + pci_free_consistent(dev->pdev, PAGE_SIZE, + eq->page_list[i].buf, + dma_unmap_addr(&eq->page_list[i], mapping)); + + kfree(eq->page_list); + mthca_free_mailbox(dev, mailbox); +} + +static void mthca_free_irqs(struct mthca_dev *dev) +{ + int i; + + if (dev->eq_table.have_irq) + free_irq(dev->pdev->irq, dev); + for (i = 0; i < MTHCA_NUM_EQ; ++i) + if (dev->eq_table.eq[i].have_irq) { + free_irq(dev->eq_table.eq[i].msi_x_vector, + dev->eq_table.eq + i); + dev->eq_table.eq[i].have_irq = 0; + } +} + +static int mthca_map_reg(struct mthca_dev *dev, + unsigned long offset, unsigned long size, + void __iomem **map) +{ + phys_addr_t base = pci_resource_start(dev->pdev, 0); + + *map = ioremap(base + offset, size); + if (!*map) + return -ENOMEM; + + return 0; +} + +static int mthca_map_eq_regs(struct mthca_dev *dev) +{ + if (mthca_is_memfree(dev)) { + /* + * We assume that the EQ arm and EQ set CI registers + * fall within the first BAR. We can't trust the + * values firmware gives us, since those addresses are + * valid on the HCA's side of the PCI bus but not + * necessarily the host side. + */ + if (mthca_map_reg(dev, (pci_resource_len(dev->pdev, 0) - 1) & + dev->fw.arbel.clr_int_base, MTHCA_CLR_INT_SIZE, + &dev->clr_base)) { + mthca_err(dev, "Couldn't map interrupt clear register, " + "aborting.\n"); + return -ENOMEM; + } + + /* + * Add 4 because we limit ourselves to EQs 0 ... 31, + * so we only need the low word of the register. 
+ */ + if (mthca_map_reg(dev, ((pci_resource_len(dev->pdev, 0) - 1) & + dev->fw.arbel.eq_arm_base) + 4, 4, + &dev->eq_regs.arbel.eq_arm)) { + mthca_err(dev, "Couldn't map EQ arm register, aborting.\n"); + iounmap(dev->clr_base); + return -ENOMEM; + } + + if (mthca_map_reg(dev, (pci_resource_len(dev->pdev, 0) - 1) & + dev->fw.arbel.eq_set_ci_base, + MTHCA_EQ_SET_CI_SIZE, + &dev->eq_regs.arbel.eq_set_ci_base)) { + mthca_err(dev, "Couldn't map EQ CI register, aborting.\n"); + iounmap(dev->eq_regs.arbel.eq_arm); + iounmap(dev->clr_base); + return -ENOMEM; + } + } else { + if (mthca_map_reg(dev, MTHCA_CLR_INT_BASE, MTHCA_CLR_INT_SIZE, + &dev->clr_base)) { + mthca_err(dev, "Couldn't map interrupt clear register, " + "aborting.\n"); + return -ENOMEM; + } + + if (mthca_map_reg(dev, MTHCA_ECR_BASE, + MTHCA_ECR_SIZE + MTHCA_ECR_CLR_SIZE, + &dev->eq_regs.tavor.ecr_base)) { + mthca_err(dev, "Couldn't map ecr register, " + "aborting.\n"); + iounmap(dev->clr_base); + return -ENOMEM; + } + } + + return 0; + +} + +static void mthca_unmap_eq_regs(struct mthca_dev *dev) +{ + if (mthca_is_memfree(dev)) { + iounmap(dev->eq_regs.arbel.eq_set_ci_base); + iounmap(dev->eq_regs.arbel.eq_arm); + iounmap(dev->clr_base); + } else { + iounmap(dev->eq_regs.tavor.ecr_base); + iounmap(dev->clr_base); + } +} + +int mthca_map_eq_icm(struct mthca_dev *dev, u64 icm_virt) +{ + int ret; + + /* + * We assume that mapping one page is enough for the whole EQ + * context table. This is fine with all current HCAs, because + * we only use 32 EQs and each EQ uses 32 bytes of context + * memory, or 1 KB total. + */ + dev->eq_table.icm_virt = icm_virt; + dev->eq_table.icm_page = alloc_page(GFP_HIGHUSER); + if (!dev->eq_table.icm_page) + return -ENOMEM; + dev->eq_table.icm_dma = pci_map_page(dev->pdev, dev->eq_table.icm_page, 0, + PAGE_SIZE, PCI_DMA_BIDIRECTIONAL); + if (pci_dma_mapping_error(dev->pdev, dev->eq_table.icm_dma)) { + __free_page(dev->eq_table.icm_page); + return -ENOMEM; + } + + ret = mthca_MAP_ICM_page(dev, dev->eq_table.icm_dma, icm_virt); + if (ret) { + pci_unmap_page(dev->pdev, dev->eq_table.icm_dma, PAGE_SIZE, + PCI_DMA_BIDIRECTIONAL); + __free_page(dev->eq_table.icm_page); + } + + return ret; +} + +void mthca_unmap_eq_icm(struct mthca_dev *dev) +{ + mthca_UNMAP_ICM(dev, dev->eq_table.icm_virt, 1); + pci_unmap_page(dev->pdev, dev->eq_table.icm_dma, PAGE_SIZE, + PCI_DMA_BIDIRECTIONAL); + __free_page(dev->eq_table.icm_page); +} + +int mthca_init_eq_table(struct mthca_dev *dev) +{ + int err; + u8 intr; + int i; + + err = mthca_alloc_init(&dev->eq_table.alloc, + dev->limits.num_eqs, + dev->limits.num_eqs - 1, + dev->limits.reserved_eqs); + if (err) + return err; + + err = mthca_map_eq_regs(dev); + if (err) + goto err_out_free; + + if (dev->mthca_flags & MTHCA_FLAG_MSI_X) { + dev->eq_table.clr_mask = 0; + } else { + dev->eq_table.clr_mask = + swab32(1 << (dev->eq_table.inta_pin & 31)); + dev->eq_table.clr_int = dev->clr_base + + (dev->eq_table.inta_pin < 32 ? 4 : 0); + } + + dev->eq_table.arm_mask = 0; + + intr = dev->eq_table.inta_pin; + + err = mthca_create_eq(dev, dev->limits.num_cqs + MTHCA_NUM_SPARE_EQE, + (dev->mthca_flags & MTHCA_FLAG_MSI_X) ? 128 : intr, + &dev->eq_table.eq[MTHCA_EQ_COMP]); + if (err) + goto err_out_unmap; + + err = mthca_create_eq(dev, MTHCA_NUM_ASYNC_EQE + MTHCA_NUM_SPARE_EQE, + (dev->mthca_flags & MTHCA_FLAG_MSI_X) ? 
129 : intr, + &dev->eq_table.eq[MTHCA_EQ_ASYNC]); + if (err) + goto err_out_comp; + + err = mthca_create_eq(dev, MTHCA_NUM_CMD_EQE + MTHCA_NUM_SPARE_EQE, + (dev->mthca_flags & MTHCA_FLAG_MSI_X) ? 130 : intr, + &dev->eq_table.eq[MTHCA_EQ_CMD]); + if (err) + goto err_out_async; + + if (dev->mthca_flags & MTHCA_FLAG_MSI_X) { + static const char *eq_name[] = { + [MTHCA_EQ_COMP] = DRV_NAME "-comp", + [MTHCA_EQ_ASYNC] = DRV_NAME "-async", + [MTHCA_EQ_CMD] = DRV_NAME "-cmd" + }; + + for (i = 0; i < MTHCA_NUM_EQ; ++i) { + snprintf(dev->eq_table.eq[i].irq_name, + IB_DEVICE_NAME_MAX, + "%s@pci:%s", eq_name[i], + pci_name(dev->pdev)); + err = request_irq(dev->eq_table.eq[i].msi_x_vector, + mthca_is_memfree(dev) ? + mthca_arbel_msi_x_interrupt : + mthca_tavor_msi_x_interrupt, + 0, dev->eq_table.eq[i].irq_name, + dev->eq_table.eq + i); + if (err) + goto err_out_cmd; + dev->eq_table.eq[i].have_irq = 1; + } + } else { + snprintf(dev->eq_table.eq[0].irq_name, IB_DEVICE_NAME_MAX, + DRV_NAME "@pci:%s", pci_name(dev->pdev)); + err = request_irq(dev->pdev->irq, + mthca_is_memfree(dev) ? + mthca_arbel_interrupt : + mthca_tavor_interrupt, + IRQF_SHARED, dev->eq_table.eq[0].irq_name, dev); + if (err) + goto err_out_cmd; + dev->eq_table.have_irq = 1; + } + + err = mthca_MAP_EQ(dev, async_mask(dev), + 0, dev->eq_table.eq[MTHCA_EQ_ASYNC].eqn); + if (err) + mthca_warn(dev, "MAP_EQ for async EQ %d failed (%d)\n", + dev->eq_table.eq[MTHCA_EQ_ASYNC].eqn, err); + + err = mthca_MAP_EQ(dev, MTHCA_CMD_EVENT_MASK, + 0, dev->eq_table.eq[MTHCA_EQ_CMD].eqn); + if (err) + mthca_warn(dev, "MAP_EQ for cmd EQ %d failed (%d)\n", + dev->eq_table.eq[MTHCA_EQ_CMD].eqn, err); + + for (i = 0; i < MTHCA_NUM_EQ; ++i) + if (mthca_is_memfree(dev)) + arbel_eq_req_not(dev, dev->eq_table.eq[i].eqn_mask); + else + tavor_eq_req_not(dev, dev->eq_table.eq[i].eqn); + + return 0; + +err_out_cmd: + mthca_free_irqs(dev); + mthca_free_eq(dev, &dev->eq_table.eq[MTHCA_EQ_CMD]); + +err_out_async: + mthca_free_eq(dev, &dev->eq_table.eq[MTHCA_EQ_ASYNC]); + +err_out_comp: + mthca_free_eq(dev, &dev->eq_table.eq[MTHCA_EQ_COMP]); + +err_out_unmap: + mthca_unmap_eq_regs(dev); + +err_out_free: + mthca_alloc_cleanup(&dev->eq_table.alloc); + return err; +} + +void mthca_cleanup_eq_table(struct mthca_dev *dev) +{ + int i; + + mthca_free_irqs(dev); + + mthca_MAP_EQ(dev, async_mask(dev), + 1, dev->eq_table.eq[MTHCA_EQ_ASYNC].eqn); + mthca_MAP_EQ(dev, MTHCA_CMD_EVENT_MASK, + 1, dev->eq_table.eq[MTHCA_EQ_CMD].eqn); + + for (i = 0; i < MTHCA_NUM_EQ; ++i) + mthca_free_eq(dev, &dev->eq_table.eq[i]); + + mthca_unmap_eq_regs(dev); + + mthca_alloc_cleanup(&dev->eq_table.alloc); +} diff --git a/kernel/drivers/infiniband/hw/mthca/mthca_mad.c b/kernel/drivers/infiniband/hw/mthca/mthca_mad.c new file mode 100644 index 000000000..8881fa376 --- /dev/null +++ b/kernel/drivers/infiniband/hw/mthca/mthca_mad.c @@ -0,0 +1,341 @@ +/* + * Copyright (c) 2004 Topspin Communications. All rights reserved. + * Copyright (c) 2005 Mellanox Technologies. All rights reserved. + * Copyright (c) 2004 Voltaire, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. 
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include +#include + +#include +#include +#include + +#include "mthca_dev.h" +#include "mthca_cmd.h" + +enum { + MTHCA_VENDOR_CLASS1 = 0x9, + MTHCA_VENDOR_CLASS2 = 0xa +}; + +static int mthca_update_rate(struct mthca_dev *dev, u8 port_num) +{ + struct ib_port_attr *tprops = NULL; + int ret; + + tprops = kmalloc(sizeof *tprops, GFP_KERNEL); + if (!tprops) + return -ENOMEM; + + ret = ib_query_port(&dev->ib_dev, port_num, tprops); + if (ret) { + printk(KERN_WARNING "ib_query_port failed (%d) for %s port %d\n", + ret, dev->ib_dev.name, port_num); + goto out; + } + + dev->rate[port_num - 1] = tprops->active_speed * + ib_width_enum_to_int(tprops->active_width); + +out: + kfree(tprops); + return ret; +} + +static void update_sm_ah(struct mthca_dev *dev, + u8 port_num, u16 lid, u8 sl) +{ + struct ib_ah *new_ah; + struct ib_ah_attr ah_attr; + unsigned long flags; + + if (!dev->send_agent[port_num - 1][0]) + return; + + memset(&ah_attr, 0, sizeof ah_attr); + ah_attr.dlid = lid; + ah_attr.sl = sl; + ah_attr.port_num = port_num; + + new_ah = ib_create_ah(dev->send_agent[port_num - 1][0]->qp->pd, + &ah_attr); + if (IS_ERR(new_ah)) + return; + + spin_lock_irqsave(&dev->sm_lock, flags); + if (dev->sm_ah[port_num - 1]) + ib_destroy_ah(dev->sm_ah[port_num - 1]); + dev->sm_ah[port_num - 1] = new_ah; + spin_unlock_irqrestore(&dev->sm_lock, flags); +} + +/* + * Snoop SM MADs for port info and P_Key table sets, so we can + * synthesize LID change and P_Key change events. 
+ */ +static void smp_snoop(struct ib_device *ibdev, + u8 port_num, + struct ib_mad *mad, + u16 prev_lid) +{ + struct ib_event event; + + if ((mad->mad_hdr.mgmt_class == IB_MGMT_CLASS_SUBN_LID_ROUTED || + mad->mad_hdr.mgmt_class == IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE) && + mad->mad_hdr.method == IB_MGMT_METHOD_SET) { + if (mad->mad_hdr.attr_id == IB_SMP_ATTR_PORT_INFO) { + struct ib_port_info *pinfo = + (struct ib_port_info *) ((struct ib_smp *) mad)->data; + u16 lid = be16_to_cpu(pinfo->lid); + + mthca_update_rate(to_mdev(ibdev), port_num); + update_sm_ah(to_mdev(ibdev), port_num, + be16_to_cpu(pinfo->sm_lid), + pinfo->neighbormtu_mastersmsl & 0xf); + + event.device = ibdev; + event.element.port_num = port_num; + + if (pinfo->clientrereg_resv_subnetto & 0x80) { + event.event = IB_EVENT_CLIENT_REREGISTER; + ib_dispatch_event(&event); + } + + if (prev_lid != lid) { + event.event = IB_EVENT_LID_CHANGE; + ib_dispatch_event(&event); + } + } + + if (mad->mad_hdr.attr_id == IB_SMP_ATTR_PKEY_TABLE) { + event.device = ibdev; + event.event = IB_EVENT_PKEY_CHANGE; + event.element.port_num = port_num; + ib_dispatch_event(&event); + } + } +} + +static void node_desc_override(struct ib_device *dev, + struct ib_mad *mad) +{ + if ((mad->mad_hdr.mgmt_class == IB_MGMT_CLASS_SUBN_LID_ROUTED || + mad->mad_hdr.mgmt_class == IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE) && + mad->mad_hdr.method == IB_MGMT_METHOD_GET_RESP && + mad->mad_hdr.attr_id == IB_SMP_ATTR_NODE_DESC) { + mutex_lock(&to_mdev(dev)->cap_mask_mutex); + memcpy(((struct ib_smp *) mad)->data, dev->node_desc, 64); + mutex_unlock(&to_mdev(dev)->cap_mask_mutex); + } +} + +static void forward_trap(struct mthca_dev *dev, + u8 port_num, + struct ib_mad *mad) +{ + int qpn = mad->mad_hdr.mgmt_class != IB_MGMT_CLASS_SUBN_LID_ROUTED; + struct ib_mad_send_buf *send_buf; + struct ib_mad_agent *agent = dev->send_agent[port_num - 1][qpn]; + int ret; + unsigned long flags; + + if (agent) { + send_buf = ib_create_send_mad(agent, qpn, 0, 0, IB_MGMT_MAD_HDR, + IB_MGMT_MAD_DATA, GFP_ATOMIC); + if (IS_ERR(send_buf)) + return; + /* + * We rely here on the fact that MLX QPs don't use the + * address handle after the send is posted (this is + * wrong following the IB spec strictly, but we know + * it's OK for our devices). + */ + spin_lock_irqsave(&dev->sm_lock, flags); + memcpy(send_buf->mad, mad, sizeof *mad); + if ((send_buf->ah = dev->sm_ah[port_num - 1])) + ret = ib_post_send_mad(send_buf, NULL); + else + ret = -EINVAL; + spin_unlock_irqrestore(&dev->sm_lock, flags); + + if (ret) + ib_free_send_mad(send_buf); + } +} + +int mthca_process_mad(struct ib_device *ibdev, + int mad_flags, + u8 port_num, + struct ib_wc *in_wc, + struct ib_grh *in_grh, + struct ib_mad *in_mad, + struct ib_mad *out_mad) +{ + int err; + u16 slid = in_wc ? in_wc->slid : be16_to_cpu(IB_LID_PERMISSIVE); + u16 prev_lid = 0; + struct ib_port_attr pattr; + + /* Forward locally generated traps to the SM */ + if (in_mad->mad_hdr.method == IB_MGMT_METHOD_TRAP && + slid == 0) { + forward_trap(to_mdev(ibdev), port_num, in_mad); + return IB_MAD_RESULT_SUCCESS | IB_MAD_RESULT_CONSUMED; + } + + /* + * Only handle SM gets, sets and trap represses for SM class + * + * Only handle PMA and Mellanox vendor-specific class gets and + * sets for other classes. 
+ */ + if (in_mad->mad_hdr.mgmt_class == IB_MGMT_CLASS_SUBN_LID_ROUTED || + in_mad->mad_hdr.mgmt_class == IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE) { + if (in_mad->mad_hdr.method != IB_MGMT_METHOD_GET && + in_mad->mad_hdr.method != IB_MGMT_METHOD_SET && + in_mad->mad_hdr.method != IB_MGMT_METHOD_TRAP_REPRESS) + return IB_MAD_RESULT_SUCCESS; + + /* + * Don't process SMInfo queries or vendor-specific + * MADs -- the SMA can't handle them. + */ + if (in_mad->mad_hdr.attr_id == IB_SMP_ATTR_SM_INFO || + ((in_mad->mad_hdr.attr_id & IB_SMP_ATTR_VENDOR_MASK) == + IB_SMP_ATTR_VENDOR_MASK)) + return IB_MAD_RESULT_SUCCESS; + } else if (in_mad->mad_hdr.mgmt_class == IB_MGMT_CLASS_PERF_MGMT || + in_mad->mad_hdr.mgmt_class == MTHCA_VENDOR_CLASS1 || + in_mad->mad_hdr.mgmt_class == MTHCA_VENDOR_CLASS2) { + if (in_mad->mad_hdr.method != IB_MGMT_METHOD_GET && + in_mad->mad_hdr.method != IB_MGMT_METHOD_SET) + return IB_MAD_RESULT_SUCCESS; + } else + return IB_MAD_RESULT_SUCCESS; + if ((in_mad->mad_hdr.mgmt_class == IB_MGMT_CLASS_SUBN_LID_ROUTED || + in_mad->mad_hdr.mgmt_class == IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE) && + in_mad->mad_hdr.method == IB_MGMT_METHOD_SET && + in_mad->mad_hdr.attr_id == IB_SMP_ATTR_PORT_INFO && + !ib_query_port(ibdev, port_num, &pattr)) + prev_lid = pattr.lid; + + err = mthca_MAD_IFC(to_mdev(ibdev), + mad_flags & IB_MAD_IGNORE_MKEY, + mad_flags & IB_MAD_IGNORE_BKEY, + port_num, in_wc, in_grh, in_mad, out_mad); + if (err == -EBADMSG) + return IB_MAD_RESULT_SUCCESS; + else if (err) { + mthca_err(to_mdev(ibdev), "MAD_IFC returned %d\n", err); + return IB_MAD_RESULT_FAILURE; + } + + if (!out_mad->mad_hdr.status) { + smp_snoop(ibdev, port_num, in_mad, prev_lid); + node_desc_override(ibdev, out_mad); + } + + /* set return bit in status of directed route responses */ + if (in_mad->mad_hdr.mgmt_class == IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE) + out_mad->mad_hdr.status |= cpu_to_be16(1 << 15); + + if (in_mad->mad_hdr.method == IB_MGMT_METHOD_TRAP_REPRESS) + /* no response for trap repress */ + return IB_MAD_RESULT_SUCCESS | IB_MAD_RESULT_CONSUMED; + + return IB_MAD_RESULT_SUCCESS | IB_MAD_RESULT_REPLY; +} + +static void send_handler(struct ib_mad_agent *agent, + struct ib_mad_send_wc *mad_send_wc) +{ + ib_free_send_mad(mad_send_wc->send_buf); +} + +int mthca_create_agents(struct mthca_dev *dev) +{ + struct ib_mad_agent *agent; + int p, q; + int ret; + + spin_lock_init(&dev->sm_lock); + + for (p = 0; p < dev->limits.num_ports; ++p) + for (q = 0; q <= 1; ++q) { + agent = ib_register_mad_agent(&dev->ib_dev, p + 1, + q ? IB_QPT_GSI : IB_QPT_SMI, + NULL, 0, send_handler, + NULL, NULL, 0); + if (IS_ERR(agent)) { + ret = PTR_ERR(agent); + goto err; + } + dev->send_agent[p][q] = agent; + } + + + for (p = 1; p <= dev->limits.num_ports; ++p) { + ret = mthca_update_rate(dev, p); + if (ret) { + mthca_err(dev, "Failed to obtain port %d rate." 
+ " aborting.\n", p); + goto err; + } + } + + return 0; + +err: + for (p = 0; p < dev->limits.num_ports; ++p) + for (q = 0; q <= 1; ++q) + if (dev->send_agent[p][q]) + ib_unregister_mad_agent(dev->send_agent[p][q]); + + return ret; +} + +void mthca_free_agents(struct mthca_dev *dev) +{ + struct ib_mad_agent *agent; + int p, q; + + for (p = 0; p < dev->limits.num_ports; ++p) { + for (q = 0; q <= 1; ++q) { + agent = dev->send_agent[p][q]; + dev->send_agent[p][q] = NULL; + ib_unregister_mad_agent(agent); + } + + if (dev->sm_ah[p]) + ib_destroy_ah(dev->sm_ah[p]); + } +} diff --git a/kernel/drivers/infiniband/hw/mthca/mthca_main.c b/kernel/drivers/infiniband/hw/mthca/mthca_main.c new file mode 100644 index 000000000..ded76c101 --- /dev/null +++ b/kernel/drivers/infiniband/hw/mthca/mthca_main.c @@ -0,0 +1,1275 @@ +/* + * Copyright (c) 2004, 2005 Topspin Communications. All rights reserved. + * Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved. + * Copyright (c) 2005 Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include +#include +#include +#include +#include +#include + +#include "mthca_dev.h" +#include "mthca_config_reg.h" +#include "mthca_cmd.h" +#include "mthca_profile.h" +#include "mthca_memfree.h" +#include "mthca_wqe.h" + +MODULE_AUTHOR("Roland Dreier"); +MODULE_DESCRIPTION("Mellanox InfiniBand HCA low-level driver"); +MODULE_LICENSE("Dual BSD/GPL"); +MODULE_VERSION(DRV_VERSION); + +#ifdef CONFIG_INFINIBAND_MTHCA_DEBUG + +int mthca_debug_level = 0; +module_param_named(debug_level, mthca_debug_level, int, 0644); +MODULE_PARM_DESC(debug_level, "Enable debug tracing if > 0"); + +#endif /* CONFIG_INFINIBAND_MTHCA_DEBUG */ + +#ifdef CONFIG_PCI_MSI + +static int msi_x = 1; +module_param(msi_x, int, 0444); +MODULE_PARM_DESC(msi_x, "attempt to use MSI-X if nonzero"); + +#else /* CONFIG_PCI_MSI */ + +#define msi_x (0) + +#endif /* CONFIG_PCI_MSI */ + +static int tune_pci = 0; +module_param(tune_pci, int, 0444); +MODULE_PARM_DESC(tune_pci, "increase PCI burst from the default set by BIOS if nonzero"); + +DEFINE_MUTEX(mthca_device_mutex); + +#define MTHCA_DEFAULT_NUM_QP (1 << 16) +#define MTHCA_DEFAULT_RDB_PER_QP (1 << 2) +#define MTHCA_DEFAULT_NUM_CQ (1 << 16) +#define MTHCA_DEFAULT_NUM_MCG (1 << 13) +#define MTHCA_DEFAULT_NUM_MPT (1 << 17) +#define MTHCA_DEFAULT_NUM_MTT (1 << 20) +#define MTHCA_DEFAULT_NUM_UDAV (1 << 15) +#define MTHCA_DEFAULT_NUM_RESERVED_MTTS (1 << 18) +#define MTHCA_DEFAULT_NUM_UARC_SIZE (1 << 18) + +static struct mthca_profile hca_profile = { + .num_qp = MTHCA_DEFAULT_NUM_QP, + .rdb_per_qp = MTHCA_DEFAULT_RDB_PER_QP, + .num_cq = MTHCA_DEFAULT_NUM_CQ, + .num_mcg = MTHCA_DEFAULT_NUM_MCG, + .num_mpt = MTHCA_DEFAULT_NUM_MPT, + .num_mtt = MTHCA_DEFAULT_NUM_MTT, + .num_udav = MTHCA_DEFAULT_NUM_UDAV, /* Tavor only */ + .fmr_reserved_mtts = MTHCA_DEFAULT_NUM_RESERVED_MTTS, /* Tavor only */ + .uarc_size = MTHCA_DEFAULT_NUM_UARC_SIZE, /* Arbel only */ +}; + +module_param_named(num_qp, hca_profile.num_qp, int, 0444); +MODULE_PARM_DESC(num_qp, "maximum number of QPs per HCA"); + +module_param_named(rdb_per_qp, hca_profile.rdb_per_qp, int, 0444); +MODULE_PARM_DESC(rdb_per_qp, "number of RDB buffers per QP"); + +module_param_named(num_cq, hca_profile.num_cq, int, 0444); +MODULE_PARM_DESC(num_cq, "maximum number of CQs per HCA"); + +module_param_named(num_mcg, hca_profile.num_mcg, int, 0444); +MODULE_PARM_DESC(num_mcg, "maximum number of multicast groups per HCA"); + +module_param_named(num_mpt, hca_profile.num_mpt, int, 0444); +MODULE_PARM_DESC(num_mpt, + "maximum number of memory protection table entries per HCA"); + +module_param_named(num_mtt, hca_profile.num_mtt, int, 0444); +MODULE_PARM_DESC(num_mtt, + "maximum number of memory translation table segments per HCA"); + +module_param_named(num_udav, hca_profile.num_udav, int, 0444); +MODULE_PARM_DESC(num_udav, "maximum number of UD address vectors per HCA"); + +module_param_named(fmr_reserved_mtts, hca_profile.fmr_reserved_mtts, int, 0444); +MODULE_PARM_DESC(fmr_reserved_mtts, + "number of memory translation table segments reserved for FMR"); + +static int log_mtts_per_seg = ilog2(MTHCA_MTT_SEG_SIZE / 8); +module_param_named(log_mtts_per_seg, log_mtts_per_seg, int, 0444); +MODULE_PARM_DESC(log_mtts_per_seg, "Log2 number of MTT entries per segment (1-5)"); + +static char mthca_version[] = + DRV_NAME ": Mellanox InfiniBand HCA driver v" + DRV_VERSION " (" DRV_RELDATE ")\n"; + +static int mthca_tune_pci(struct mthca_dev *mdev) +{ + if (!tune_pci) + return 0; + + /* First try to max out Read Byte Count */ + if 
(pci_find_capability(mdev->pdev, PCI_CAP_ID_PCIX)) { + if (pcix_set_mmrbc(mdev->pdev, pcix_get_max_mmrbc(mdev->pdev))) { + mthca_err(mdev, "Couldn't set PCI-X max read count, " + "aborting.\n"); + return -ENODEV; + } + } else if (!(mdev->mthca_flags & MTHCA_FLAG_PCIE)) + mthca_info(mdev, "No PCI-X capability, not setting RBC.\n"); + + if (pci_is_pcie(mdev->pdev)) { + if (pcie_set_readrq(mdev->pdev, 4096)) { + mthca_err(mdev, "Couldn't write PCI Express read request, " + "aborting.\n"); + return -ENODEV; + } + } else if (mdev->mthca_flags & MTHCA_FLAG_PCIE) + mthca_info(mdev, "No PCI Express capability, " + "not setting Max Read Request Size.\n"); + + return 0; +} + +static int mthca_dev_lim(struct mthca_dev *mdev, struct mthca_dev_lim *dev_lim) +{ + int err; + + mdev->limits.mtt_seg_size = (1 << log_mtts_per_seg) * 8; + err = mthca_QUERY_DEV_LIM(mdev, dev_lim); + if (err) { + mthca_err(mdev, "QUERY_DEV_LIM command returned %d" + ", aborting.\n", err); + return err; + } + if (dev_lim->min_page_sz > PAGE_SIZE) { + mthca_err(mdev, "HCA minimum page size of %d bigger than " + "kernel PAGE_SIZE of %ld, aborting.\n", + dev_lim->min_page_sz, PAGE_SIZE); + return -ENODEV; + } + if (dev_lim->num_ports > MTHCA_MAX_PORTS) { + mthca_err(mdev, "HCA has %d ports, but we only support %d, " + "aborting.\n", + dev_lim->num_ports, MTHCA_MAX_PORTS); + return -ENODEV; + } + + if (dev_lim->uar_size > pci_resource_len(mdev->pdev, 2)) { + mthca_err(mdev, "HCA reported UAR size of 0x%x bigger than " + "PCI resource 2 size of 0x%llx, aborting.\n", + dev_lim->uar_size, + (unsigned long long)pci_resource_len(mdev->pdev, 2)); + return -ENODEV; + } + + mdev->limits.num_ports = dev_lim->num_ports; + mdev->limits.vl_cap = dev_lim->max_vl; + mdev->limits.mtu_cap = dev_lim->max_mtu; + mdev->limits.gid_table_len = dev_lim->max_gids; + mdev->limits.pkey_table_len = dev_lim->max_pkeys; + mdev->limits.local_ca_ack_delay = dev_lim->local_ca_ack_delay; + /* + * Need to allow for worst case send WQE overhead and check + * whether max_desc_sz imposes a lower limit than max_sg; UD + * send has the biggest overhead. + */ + mdev->limits.max_sg = min_t(int, dev_lim->max_sg, + (dev_lim->max_desc_sz - + sizeof (struct mthca_next_seg) - + (mthca_is_memfree(mdev) ? + sizeof (struct mthca_arbel_ud_seg) : + sizeof (struct mthca_tavor_ud_seg))) / + sizeof (struct mthca_data_seg)); + mdev->limits.max_wqes = dev_lim->max_qp_sz; + mdev->limits.max_qp_init_rdma = dev_lim->max_requester_per_qp; + mdev->limits.reserved_qps = dev_lim->reserved_qps; + mdev->limits.max_srq_wqes = dev_lim->max_srq_sz; + mdev->limits.reserved_srqs = dev_lim->reserved_srqs; + mdev->limits.reserved_eecs = dev_lim->reserved_eecs; + mdev->limits.max_desc_sz = dev_lim->max_desc_sz; + mdev->limits.max_srq_sge = mthca_max_srq_sge(mdev); + /* + * Subtract 1 from the limit because we need to allocate a + * spare CQE so the HCA HW can tell the difference between an + * empty CQ and a full CQ. 
+ */ + mdev->limits.max_cqes = dev_lim->max_cq_sz - 1; + mdev->limits.reserved_cqs = dev_lim->reserved_cqs; + mdev->limits.reserved_eqs = dev_lim->reserved_eqs; + mdev->limits.reserved_mtts = dev_lim->reserved_mtts; + mdev->limits.reserved_mrws = dev_lim->reserved_mrws; + mdev->limits.reserved_uars = dev_lim->reserved_uars; + mdev->limits.reserved_pds = dev_lim->reserved_pds; + mdev->limits.port_width_cap = dev_lim->max_port_width; + mdev->limits.page_size_cap = ~(u32) (dev_lim->min_page_sz - 1); + mdev->limits.flags = dev_lim->flags; + /* + * For old FW that doesn't return static rate support, use a + * value of 0x3 (only static rate values of 0 or 1 are handled), + * except on Sinai, where even old FW can handle static rate + * values of 2 and 3. + */ + if (dev_lim->stat_rate_support) + mdev->limits.stat_rate_support = dev_lim->stat_rate_support; + else if (mdev->mthca_flags & MTHCA_FLAG_SINAI_OPT) + mdev->limits.stat_rate_support = 0xf; + else + mdev->limits.stat_rate_support = 0x3; + + /* IB_DEVICE_RESIZE_MAX_WR not supported by driver. + May be doable since hardware supports it for SRQ. + + IB_DEVICE_N_NOTIFY_CQ is supported by hardware but not by driver. + + IB_DEVICE_SRQ_RESIZE is supported by hardware but SRQ is not + supported by driver. */ + mdev->device_cap_flags = IB_DEVICE_CHANGE_PHY_PORT | + IB_DEVICE_PORT_ACTIVE_EVENT | + IB_DEVICE_SYS_IMAGE_GUID | + IB_DEVICE_RC_RNR_NAK_GEN; + + if (dev_lim->flags & DEV_LIM_FLAG_BAD_PKEY_CNTR) + mdev->device_cap_flags |= IB_DEVICE_BAD_PKEY_CNTR; + + if (dev_lim->flags & DEV_LIM_FLAG_BAD_QKEY_CNTR) + mdev->device_cap_flags |= IB_DEVICE_BAD_QKEY_CNTR; + + if (dev_lim->flags & DEV_LIM_FLAG_RAW_MULTI) + mdev->device_cap_flags |= IB_DEVICE_RAW_MULTI; + + if (dev_lim->flags & DEV_LIM_FLAG_AUTO_PATH_MIG) + mdev->device_cap_flags |= IB_DEVICE_AUTO_PATH_MIG; + + if (dev_lim->flags & DEV_LIM_FLAG_UD_AV_PORT_ENFORCE) + mdev->device_cap_flags |= IB_DEVICE_UD_AV_PORT_ENFORCE; + + if (dev_lim->flags & DEV_LIM_FLAG_SRQ) + mdev->mthca_flags |= MTHCA_FLAG_SRQ; + + if (mthca_is_memfree(mdev)) + if (dev_lim->flags & DEV_LIM_FLAG_IPOIB_CSUM) + mdev->device_cap_flags |= IB_DEVICE_UD_IP_CSUM; + + return 0; +} + +static int mthca_init_tavor(struct mthca_dev *mdev) +{ + s64 size; + int err; + struct mthca_dev_lim dev_lim; + struct mthca_profile profile; + struct mthca_init_hca_param init_hca; + + err = mthca_SYS_EN(mdev); + if (err) { + mthca_err(mdev, "SYS_EN command returned %d, aborting.\n", err); + return err; + } + + err = mthca_QUERY_FW(mdev); + if (err) { + mthca_err(mdev, "QUERY_FW command returned %d," + " aborting.\n", err); + goto err_disable; + } + err = mthca_QUERY_DDR(mdev); + if (err) { + mthca_err(mdev, "QUERY_DDR command returned %d, aborting.\n", err); + goto err_disable; + } + + err = mthca_dev_lim(mdev, &dev_lim); + if (err) { + mthca_err(mdev, "QUERY_DEV_LIM command returned %d, aborting.\n", err); + goto err_disable; + } + + profile = hca_profile; + profile.num_uar = dev_lim.uar_size / PAGE_SIZE; + profile.uarc_size = 0; + if (mdev->mthca_flags & MTHCA_FLAG_SRQ) + profile.num_srq = dev_lim.max_srqs; + + size = mthca_make_profile(mdev, &profile, &dev_lim, &init_hca); + if (size < 0) { + err = size; + goto err_disable; + } + + err = mthca_INIT_HCA(mdev, &init_hca); + if (err) { + mthca_err(mdev, "INIT_HCA command returned %d, aborting.\n", err); + goto err_disable; + } + + return 0; + +err_disable: + mthca_SYS_DIS(mdev); + + return err; +} + +static int mthca_load_fw(struct mthca_dev *mdev) +{ + int err; + + /* FIXME: use HCA-attached memory 
for FW if present */ + + mdev->fw.arbel.fw_icm = + mthca_alloc_icm(mdev, mdev->fw.arbel.fw_pages, + GFP_HIGHUSER | __GFP_NOWARN, 0); + if (!mdev->fw.arbel.fw_icm) { + mthca_err(mdev, "Couldn't allocate FW area, aborting.\n"); + return -ENOMEM; + } + + err = mthca_MAP_FA(mdev, mdev->fw.arbel.fw_icm); + if (err) { + mthca_err(mdev, "MAP_FA command returned %d, aborting.\n", err); + goto err_free; + } + err = mthca_RUN_FW(mdev); + if (err) { + mthca_err(mdev, "RUN_FW command returned %d, aborting.\n", err); + goto err_unmap_fa; + } + + return 0; + +err_unmap_fa: + mthca_UNMAP_FA(mdev); + +err_free: + mthca_free_icm(mdev, mdev->fw.arbel.fw_icm, 0); + return err; +} + +static int mthca_init_icm(struct mthca_dev *mdev, + struct mthca_dev_lim *dev_lim, + struct mthca_init_hca_param *init_hca, + u64 icm_size) +{ + u64 aux_pages; + int err; + + err = mthca_SET_ICM_SIZE(mdev, icm_size, &aux_pages); + if (err) { + mthca_err(mdev, "SET_ICM_SIZE command returned %d, aborting.\n", err); + return err; + } + + mthca_dbg(mdev, "%lld KB of HCA context requires %lld KB aux memory.\n", + (unsigned long long) icm_size >> 10, + (unsigned long long) aux_pages << 2); + + mdev->fw.arbel.aux_icm = mthca_alloc_icm(mdev, aux_pages, + GFP_HIGHUSER | __GFP_NOWARN, 0); + if (!mdev->fw.arbel.aux_icm) { + mthca_err(mdev, "Couldn't allocate aux memory, aborting.\n"); + return -ENOMEM; + } + + err = mthca_MAP_ICM_AUX(mdev, mdev->fw.arbel.aux_icm); + if (err) { + mthca_err(mdev, "MAP_ICM_AUX returned %d, aborting.\n", err); + goto err_free_aux; + } + + err = mthca_map_eq_icm(mdev, init_hca->eqc_base); + if (err) { + mthca_err(mdev, "Failed to map EQ context memory, aborting.\n"); + goto err_unmap_aux; + } + + /* CPU writes to non-reserved MTTs, while HCA might DMA to reserved mtts */ + mdev->limits.reserved_mtts = ALIGN(mdev->limits.reserved_mtts * mdev->limits.mtt_seg_size, + dma_get_cache_alignment()) / mdev->limits.mtt_seg_size; + + mdev->mr_table.mtt_table = mthca_alloc_icm_table(mdev, init_hca->mtt_base, + mdev->limits.mtt_seg_size, + mdev->limits.num_mtt_segs, + mdev->limits.reserved_mtts, + 1, 0); + if (!mdev->mr_table.mtt_table) { + mthca_err(mdev, "Failed to map MTT context memory, aborting.\n"); + err = -ENOMEM; + goto err_unmap_eq; + } + + mdev->mr_table.mpt_table = mthca_alloc_icm_table(mdev, init_hca->mpt_base, + dev_lim->mpt_entry_sz, + mdev->limits.num_mpts, + mdev->limits.reserved_mrws, + 1, 1); + if (!mdev->mr_table.mpt_table) { + mthca_err(mdev, "Failed to map MPT context memory, aborting.\n"); + err = -ENOMEM; + goto err_unmap_mtt; + } + + mdev->qp_table.qp_table = mthca_alloc_icm_table(mdev, init_hca->qpc_base, + dev_lim->qpc_entry_sz, + mdev->limits.num_qps, + mdev->limits.reserved_qps, + 0, 0); + if (!mdev->qp_table.qp_table) { + mthca_err(mdev, "Failed to map QP context memory, aborting.\n"); + err = -ENOMEM; + goto err_unmap_mpt; + } + + mdev->qp_table.eqp_table = mthca_alloc_icm_table(mdev, init_hca->eqpc_base, + dev_lim->eqpc_entry_sz, + mdev->limits.num_qps, + mdev->limits.reserved_qps, + 0, 0); + if (!mdev->qp_table.eqp_table) { + mthca_err(mdev, "Failed to map EQP context memory, aborting.\n"); + err = -ENOMEM; + goto err_unmap_qp; + } + + mdev->qp_table.rdb_table = mthca_alloc_icm_table(mdev, init_hca->rdb_base, + MTHCA_RDB_ENTRY_SIZE, + mdev->limits.num_qps << + mdev->qp_table.rdb_shift, 0, + 0, 0); + if (!mdev->qp_table.rdb_table) { + mthca_err(mdev, "Failed to map RDB context memory, aborting\n"); + err = -ENOMEM; + goto err_unmap_eqp; + } + + mdev->cq_table.table = 
mthca_alloc_icm_table(mdev, init_hca->cqc_base, + dev_lim->cqc_entry_sz, + mdev->limits.num_cqs, + mdev->limits.reserved_cqs, + 0, 0); + if (!mdev->cq_table.table) { + mthca_err(mdev, "Failed to map CQ context memory, aborting.\n"); + err = -ENOMEM; + goto err_unmap_rdb; + } + + if (mdev->mthca_flags & MTHCA_FLAG_SRQ) { + mdev->srq_table.table = + mthca_alloc_icm_table(mdev, init_hca->srqc_base, + dev_lim->srq_entry_sz, + mdev->limits.num_srqs, + mdev->limits.reserved_srqs, + 0, 0); + if (!mdev->srq_table.table) { + mthca_err(mdev, "Failed to map SRQ context memory, " + "aborting.\n"); + err = -ENOMEM; + goto err_unmap_cq; + } + } + + /* + * It's not strictly required, but for simplicity just map the + * whole multicast group table now. The table isn't very big + * and it's a lot easier than trying to track ref counts. + */ + mdev->mcg_table.table = mthca_alloc_icm_table(mdev, init_hca->mc_base, + MTHCA_MGM_ENTRY_SIZE, + mdev->limits.num_mgms + + mdev->limits.num_amgms, + mdev->limits.num_mgms + + mdev->limits.num_amgms, + 0, 0); + if (!mdev->mcg_table.table) { + mthca_err(mdev, "Failed to map MCG context memory, aborting.\n"); + err = -ENOMEM; + goto err_unmap_srq; + } + + return 0; + +err_unmap_srq: + if (mdev->mthca_flags & MTHCA_FLAG_SRQ) + mthca_free_icm_table(mdev, mdev->srq_table.table); + +err_unmap_cq: + mthca_free_icm_table(mdev, mdev->cq_table.table); + +err_unmap_rdb: + mthca_free_icm_table(mdev, mdev->qp_table.rdb_table); + +err_unmap_eqp: + mthca_free_icm_table(mdev, mdev->qp_table.eqp_table); + +err_unmap_qp: + mthca_free_icm_table(mdev, mdev->qp_table.qp_table); + +err_unmap_mpt: + mthca_free_icm_table(mdev, mdev->mr_table.mpt_table); + +err_unmap_mtt: + mthca_free_icm_table(mdev, mdev->mr_table.mtt_table); + +err_unmap_eq: + mthca_unmap_eq_icm(mdev); + +err_unmap_aux: + mthca_UNMAP_ICM_AUX(mdev); + +err_free_aux: + mthca_free_icm(mdev, mdev->fw.arbel.aux_icm, 0); + + return err; +} + +static void mthca_free_icms(struct mthca_dev *mdev) +{ + + mthca_free_icm_table(mdev, mdev->mcg_table.table); + if (mdev->mthca_flags & MTHCA_FLAG_SRQ) + mthca_free_icm_table(mdev, mdev->srq_table.table); + mthca_free_icm_table(mdev, mdev->cq_table.table); + mthca_free_icm_table(mdev, mdev->qp_table.rdb_table); + mthca_free_icm_table(mdev, mdev->qp_table.eqp_table); + mthca_free_icm_table(mdev, mdev->qp_table.qp_table); + mthca_free_icm_table(mdev, mdev->mr_table.mpt_table); + mthca_free_icm_table(mdev, mdev->mr_table.mtt_table); + mthca_unmap_eq_icm(mdev); + + mthca_UNMAP_ICM_AUX(mdev); + mthca_free_icm(mdev, mdev->fw.arbel.aux_icm, 0); +} + +static int mthca_init_arbel(struct mthca_dev *mdev) +{ + struct mthca_dev_lim dev_lim; + struct mthca_profile profile; + struct mthca_init_hca_param init_hca; + s64 icm_size; + int err; + + err = mthca_QUERY_FW(mdev); + if (err) { + mthca_err(mdev, "QUERY_FW command failed %d, aborting.\n", err); + return err; + } + + err = mthca_ENABLE_LAM(mdev); + if (err == -EAGAIN) { + mthca_dbg(mdev, "No HCA-attached memory (running in MemFree mode)\n"); + mdev->mthca_flags |= MTHCA_FLAG_NO_LAM; + } else if (err) { + mthca_err(mdev, "ENABLE_LAM returned %d, aborting.\n", err); + return err; + } + + err = mthca_load_fw(mdev); + if (err) { + mthca_err(mdev, "Loading FW returned %d, aborting.\n", err); + goto err_disable; + } + + err = mthca_dev_lim(mdev, &dev_lim); + if (err) { + mthca_err(mdev, "QUERY_DEV_LIM returned %d, aborting.\n", err); + goto err_stop_fw; + } + + profile = hca_profile; + profile.num_uar = dev_lim.uar_size / PAGE_SIZE; + profile.num_udav = 0; + 
if (mdev->mthca_flags & MTHCA_FLAG_SRQ) + profile.num_srq = dev_lim.max_srqs; + + icm_size = mthca_make_profile(mdev, &profile, &dev_lim, &init_hca); + if (icm_size < 0) { + err = icm_size; + goto err_stop_fw; + } + + err = mthca_init_icm(mdev, &dev_lim, &init_hca, icm_size); + if (err) + goto err_stop_fw; + + err = mthca_INIT_HCA(mdev, &init_hca); + if (err) { + mthca_err(mdev, "INIT_HCA command returned %d, aborting.\n", err); + goto err_free_icm; + } + + return 0; + +err_free_icm: + mthca_free_icms(mdev); + +err_stop_fw: + mthca_UNMAP_FA(mdev); + mthca_free_icm(mdev, mdev->fw.arbel.fw_icm, 0); + +err_disable: + if (!(mdev->mthca_flags & MTHCA_FLAG_NO_LAM)) + mthca_DISABLE_LAM(mdev); + + return err; +} + +static void mthca_close_hca(struct mthca_dev *mdev) +{ + mthca_CLOSE_HCA(mdev, 0); + + if (mthca_is_memfree(mdev)) { + mthca_free_icms(mdev); + + mthca_UNMAP_FA(mdev); + mthca_free_icm(mdev, mdev->fw.arbel.fw_icm, 0); + + if (!(mdev->mthca_flags & MTHCA_FLAG_NO_LAM)) + mthca_DISABLE_LAM(mdev); + } else + mthca_SYS_DIS(mdev); +} + +static int mthca_init_hca(struct mthca_dev *mdev) +{ + int err; + struct mthca_adapter adapter; + + if (mthca_is_memfree(mdev)) + err = mthca_init_arbel(mdev); + else + err = mthca_init_tavor(mdev); + + if (err) + return err; + + err = mthca_QUERY_ADAPTER(mdev, &adapter); + if (err) { + mthca_err(mdev, "QUERY_ADAPTER command returned %d, aborting.\n", err); + goto err_close; + } + + mdev->eq_table.inta_pin = adapter.inta_pin; + if (!mthca_is_memfree(mdev)) + mdev->rev_id = adapter.revision_id; + memcpy(mdev->board_id, adapter.board_id, sizeof mdev->board_id); + + return 0; + +err_close: + mthca_close_hca(mdev); + return err; +} + +static int mthca_setup_hca(struct mthca_dev *dev) +{ + int err; + + MTHCA_INIT_DOORBELL_LOCK(&dev->doorbell_lock); + + err = mthca_init_uar_table(dev); + if (err) { + mthca_err(dev, "Failed to initialize " + "user access region table, aborting.\n"); + return err; + } + + err = mthca_uar_alloc(dev, &dev->driver_uar); + if (err) { + mthca_err(dev, "Failed to allocate driver access region, " + "aborting.\n"); + goto err_uar_table_free; + } + + dev->kar = ioremap((phys_addr_t) dev->driver_uar.pfn << PAGE_SHIFT, PAGE_SIZE); + if (!dev->kar) { + mthca_err(dev, "Couldn't map kernel access region, " + "aborting.\n"); + err = -ENOMEM; + goto err_uar_free; + } + + err = mthca_init_pd_table(dev); + if (err) { + mthca_err(dev, "Failed to initialize " + "protection domain table, aborting.\n"); + goto err_kar_unmap; + } + + err = mthca_init_mr_table(dev); + if (err) { + mthca_err(dev, "Failed to initialize " + "memory region table, aborting.\n"); + goto err_pd_table_free; + } + + err = mthca_pd_alloc(dev, 1, &dev->driver_pd); + if (err) { + mthca_err(dev, "Failed to create driver PD, " + "aborting.\n"); + goto err_mr_table_free; + } + + err = mthca_init_eq_table(dev); + if (err) { + mthca_err(dev, "Failed to initialize " + "event queue table, aborting.\n"); + goto err_pd_free; + } + + err = mthca_cmd_use_events(dev); + if (err) { + mthca_err(dev, "Failed to switch to event-driven " + "firmware commands, aborting.\n"); + goto err_eq_table_free; + } + + err = mthca_NOP(dev); + if (err) { + if (dev->mthca_flags & MTHCA_FLAG_MSI_X) { + mthca_warn(dev, "NOP command failed to generate interrupt " + "(IRQ %d).\n", + dev->eq_table.eq[MTHCA_EQ_CMD].msi_x_vector); + mthca_warn(dev, "Trying again with MSI-X disabled.\n"); + } else { + mthca_err(dev, "NOP command failed to generate interrupt " + "(IRQ %d), aborting.\n", + dev->pdev->irq); + mthca_err(dev, 
"BIOS or ACPI interrupt routing problem?\n"); + } + + goto err_cmd_poll; + } + + mthca_dbg(dev, "NOP command IRQ test passed\n"); + + err = mthca_init_cq_table(dev); + if (err) { + mthca_err(dev, "Failed to initialize " + "completion queue table, aborting.\n"); + goto err_cmd_poll; + } + + err = mthca_init_srq_table(dev); + if (err) { + mthca_err(dev, "Failed to initialize " + "shared receive queue table, aborting.\n"); + goto err_cq_table_free; + } + + err = mthca_init_qp_table(dev); + if (err) { + mthca_err(dev, "Failed to initialize " + "queue pair table, aborting.\n"); + goto err_srq_table_free; + } + + err = mthca_init_av_table(dev); + if (err) { + mthca_err(dev, "Failed to initialize " + "address vector table, aborting.\n"); + goto err_qp_table_free; + } + + err = mthca_init_mcg_table(dev); + if (err) { + mthca_err(dev, "Failed to initialize " + "multicast group table, aborting.\n"); + goto err_av_table_free; + } + + return 0; + +err_av_table_free: + mthca_cleanup_av_table(dev); + +err_qp_table_free: + mthca_cleanup_qp_table(dev); + +err_srq_table_free: + mthca_cleanup_srq_table(dev); + +err_cq_table_free: + mthca_cleanup_cq_table(dev); + +err_cmd_poll: + mthca_cmd_use_polling(dev); + +err_eq_table_free: + mthca_cleanup_eq_table(dev); + +err_pd_free: + mthca_pd_free(dev, &dev->driver_pd); + +err_mr_table_free: + mthca_cleanup_mr_table(dev); + +err_pd_table_free: + mthca_cleanup_pd_table(dev); + +err_kar_unmap: + iounmap(dev->kar); + +err_uar_free: + mthca_uar_free(dev, &dev->driver_uar); + +err_uar_table_free: + mthca_cleanup_uar_table(dev); + return err; +} + +static int mthca_enable_msi_x(struct mthca_dev *mdev) +{ + struct msix_entry entries[3]; + int err; + + entries[0].entry = 0; + entries[1].entry = 1; + entries[2].entry = 2; + + err = pci_enable_msix_exact(mdev->pdev, entries, ARRAY_SIZE(entries)); + if (err) + return err; + + mdev->eq_table.eq[MTHCA_EQ_COMP ].msi_x_vector = entries[0].vector; + mdev->eq_table.eq[MTHCA_EQ_ASYNC].msi_x_vector = entries[1].vector; + mdev->eq_table.eq[MTHCA_EQ_CMD ].msi_x_vector = entries[2].vector; + + return 0; +} + +/* Types of supported HCA */ +enum { + TAVOR, /* MT23108 */ + ARBEL_COMPAT, /* MT25208 in Tavor compat mode */ + ARBEL_NATIVE, /* MT25208 with extended features */ + SINAI /* MT25204 */ +}; + +#define MTHCA_FW_VER(major, minor, subminor) \ + (((u64) (major) << 32) | ((u64) (minor) << 16) | (u64) (subminor)) + +static struct { + u64 latest_fw; + u32 flags; +} mthca_hca_table[] = { + [TAVOR] = { .latest_fw = MTHCA_FW_VER(3, 5, 0), + .flags = 0 }, + [ARBEL_COMPAT] = { .latest_fw = MTHCA_FW_VER(4, 8, 200), + .flags = MTHCA_FLAG_PCIE }, + [ARBEL_NATIVE] = { .latest_fw = MTHCA_FW_VER(5, 3, 0), + .flags = MTHCA_FLAG_MEMFREE | + MTHCA_FLAG_PCIE }, + [SINAI] = { .latest_fw = MTHCA_FW_VER(1, 2, 0), + .flags = MTHCA_FLAG_MEMFREE | + MTHCA_FLAG_PCIE | + MTHCA_FLAG_SINAI_OPT } +}; + +static int __mthca_init_one(struct pci_dev *pdev, int hca_type) +{ + int ddr_hidden = 0; + int err; + struct mthca_dev *mdev; + + printk(KERN_INFO PFX "Initializing %s\n", + pci_name(pdev)); + + err = pci_enable_device(pdev); + if (err) { + dev_err(&pdev->dev, "Cannot enable PCI device, " + "aborting.\n"); + return err; + } + + /* + * Check for BARs. 
We expect 0: 1MB, 2: 8MB, 4: DDR (may not + * be present) + */ + if (!(pci_resource_flags(pdev, 0) & IORESOURCE_MEM) || + pci_resource_len(pdev, 0) != 1 << 20) { + dev_err(&pdev->dev, "Missing DCS, aborting.\n"); + err = -ENODEV; + goto err_disable_pdev; + } + if (!(pci_resource_flags(pdev, 2) & IORESOURCE_MEM)) { + dev_err(&pdev->dev, "Missing UAR, aborting.\n"); + err = -ENODEV; + goto err_disable_pdev; + } + if (!(pci_resource_flags(pdev, 4) & IORESOURCE_MEM)) + ddr_hidden = 1; + + err = pci_request_regions(pdev, DRV_NAME); + if (err) { + dev_err(&pdev->dev, "Cannot obtain PCI resources, " + "aborting.\n"); + goto err_disable_pdev; + } + + pci_set_master(pdev); + + err = pci_set_dma_mask(pdev, DMA_BIT_MASK(64)); + if (err) { + dev_warn(&pdev->dev, "Warning: couldn't set 64-bit PCI DMA mask.\n"); + err = pci_set_dma_mask(pdev, DMA_BIT_MASK(32)); + if (err) { + dev_err(&pdev->dev, "Can't set PCI DMA mask, aborting.\n"); + goto err_free_res; + } + } + err = pci_set_consistent_dma_mask(pdev, DMA_BIT_MASK(64)); + if (err) { + dev_warn(&pdev->dev, "Warning: couldn't set 64-bit " + "consistent PCI DMA mask.\n"); + err = pci_set_consistent_dma_mask(pdev, DMA_BIT_MASK(32)); + if (err) { + dev_err(&pdev->dev, "Can't set consistent PCI DMA mask, " + "aborting.\n"); + goto err_free_res; + } + } + + /* We can handle large RDMA requests, so allow larger segments. */ + dma_set_max_seg_size(&pdev->dev, 1024 * 1024 * 1024); + + mdev = (struct mthca_dev *) ib_alloc_device(sizeof *mdev); + if (!mdev) { + dev_err(&pdev->dev, "Device struct alloc failed, " + "aborting.\n"); + err = -ENOMEM; + goto err_free_res; + } + + mdev->pdev = pdev; + + mdev->mthca_flags = mthca_hca_table[hca_type].flags; + if (ddr_hidden) + mdev->mthca_flags |= MTHCA_FLAG_DDR_HIDDEN; + + /* + * Now reset the HCA before we touch the PCI capabilities or + * attempt a firmware command, since a boot ROM may have left + * the HCA in an undefined state. 
+ */ + err = mthca_reset(mdev); + if (err) { + mthca_err(mdev, "Failed to reset HCA, aborting.\n"); + goto err_free_dev; + } + + if (mthca_cmd_init(mdev)) { + mthca_err(mdev, "Failed to init command interface, aborting.\n"); + goto err_free_dev; + } + + err = mthca_tune_pci(mdev); + if (err) + goto err_cmd; + + err = mthca_init_hca(mdev); + if (err) + goto err_cmd; + + if (mdev->fw_ver < mthca_hca_table[hca_type].latest_fw) { + mthca_warn(mdev, "HCA FW version %d.%d.%03d is old (%d.%d.%03d is current).\n", + (int) (mdev->fw_ver >> 32), (int) (mdev->fw_ver >> 16) & 0xffff, + (int) (mdev->fw_ver & 0xffff), + (int) (mthca_hca_table[hca_type].latest_fw >> 32), + (int) (mthca_hca_table[hca_type].latest_fw >> 16) & 0xffff, + (int) (mthca_hca_table[hca_type].latest_fw & 0xffff)); + mthca_warn(mdev, "If you have problems, try updating your HCA FW.\n"); + } + + if (msi_x && !mthca_enable_msi_x(mdev)) + mdev->mthca_flags |= MTHCA_FLAG_MSI_X; + + err = mthca_setup_hca(mdev); + if (err == -EBUSY && (mdev->mthca_flags & MTHCA_FLAG_MSI_X)) { + if (mdev->mthca_flags & MTHCA_FLAG_MSI_X) + pci_disable_msix(pdev); + mdev->mthca_flags &= ~MTHCA_FLAG_MSI_X; + + err = mthca_setup_hca(mdev); + } + + if (err) + goto err_close; + + err = mthca_register_device(mdev); + if (err) + goto err_cleanup; + + err = mthca_create_agents(mdev); + if (err) + goto err_unregister; + + pci_set_drvdata(pdev, mdev); + mdev->hca_type = hca_type; + + mdev->active = true; + + return 0; + +err_unregister: + mthca_unregister_device(mdev); + +err_cleanup: + mthca_cleanup_mcg_table(mdev); + mthca_cleanup_av_table(mdev); + mthca_cleanup_qp_table(mdev); + mthca_cleanup_srq_table(mdev); + mthca_cleanup_cq_table(mdev); + mthca_cmd_use_polling(mdev); + mthca_cleanup_eq_table(mdev); + + mthca_pd_free(mdev, &mdev->driver_pd); + + mthca_cleanup_mr_table(mdev); + mthca_cleanup_pd_table(mdev); + mthca_cleanup_uar_table(mdev); + +err_close: + if (mdev->mthca_flags & MTHCA_FLAG_MSI_X) + pci_disable_msix(pdev); + + mthca_close_hca(mdev); + +err_cmd: + mthca_cmd_cleanup(mdev); + +err_free_dev: + ib_dealloc_device(&mdev->ib_dev); + +err_free_res: + pci_release_regions(pdev); + +err_disable_pdev: + pci_disable_device(pdev); + pci_set_drvdata(pdev, NULL); + return err; +} + +static void __mthca_remove_one(struct pci_dev *pdev) +{ + struct mthca_dev *mdev = pci_get_drvdata(pdev); + int p; + + if (mdev) { + mthca_free_agents(mdev); + mthca_unregister_device(mdev); + + for (p = 1; p <= mdev->limits.num_ports; ++p) + mthca_CLOSE_IB(mdev, p); + + mthca_cleanup_mcg_table(mdev); + mthca_cleanup_av_table(mdev); + mthca_cleanup_qp_table(mdev); + mthca_cleanup_srq_table(mdev); + mthca_cleanup_cq_table(mdev); + mthca_cmd_use_polling(mdev); + mthca_cleanup_eq_table(mdev); + + mthca_pd_free(mdev, &mdev->driver_pd); + + mthca_cleanup_mr_table(mdev); + mthca_cleanup_pd_table(mdev); + + iounmap(mdev->kar); + mthca_uar_free(mdev, &mdev->driver_uar); + mthca_cleanup_uar_table(mdev); + mthca_close_hca(mdev); + mthca_cmd_cleanup(mdev); + + if (mdev->mthca_flags & MTHCA_FLAG_MSI_X) + pci_disable_msix(pdev); + + ib_dealloc_device(&mdev->ib_dev); + pci_release_regions(pdev); + pci_disable_device(pdev); + pci_set_drvdata(pdev, NULL); + } +} + +int __mthca_restart_one(struct pci_dev *pdev) +{ + struct mthca_dev *mdev; + int hca_type; + + mdev = pci_get_drvdata(pdev); + if (!mdev) + return -ENODEV; + hca_type = mdev->hca_type; + __mthca_remove_one(pdev); + return __mthca_init_one(pdev, hca_type); +} + +static int mthca_init_one(struct pci_dev *pdev, const struct pci_device_id 
*id) +{ + int ret; + + mutex_lock(&mthca_device_mutex); + + printk_once(KERN_INFO "%s", mthca_version); + + if (id->driver_data >= ARRAY_SIZE(mthca_hca_table)) { + printk(KERN_ERR PFX "%s has invalid driver data %lx\n", + pci_name(pdev), id->driver_data); + mutex_unlock(&mthca_device_mutex); + return -ENODEV; + } + + ret = __mthca_init_one(pdev, id->driver_data); + + mutex_unlock(&mthca_device_mutex); + + return ret; +} + +static void mthca_remove_one(struct pci_dev *pdev) +{ + mutex_lock(&mthca_device_mutex); + __mthca_remove_one(pdev); + mutex_unlock(&mthca_device_mutex); +} + +static struct pci_device_id mthca_pci_table[] = { + { PCI_DEVICE(PCI_VENDOR_ID_MELLANOX, PCI_DEVICE_ID_MELLANOX_TAVOR), + .driver_data = TAVOR }, + { PCI_DEVICE(PCI_VENDOR_ID_TOPSPIN, PCI_DEVICE_ID_MELLANOX_TAVOR), + .driver_data = TAVOR }, + { PCI_DEVICE(PCI_VENDOR_ID_MELLANOX, PCI_DEVICE_ID_MELLANOX_ARBEL_COMPAT), + .driver_data = ARBEL_COMPAT }, + { PCI_DEVICE(PCI_VENDOR_ID_TOPSPIN, PCI_DEVICE_ID_MELLANOX_ARBEL_COMPAT), + .driver_data = ARBEL_COMPAT }, + { PCI_DEVICE(PCI_VENDOR_ID_MELLANOX, PCI_DEVICE_ID_MELLANOX_ARBEL), + .driver_data = ARBEL_NATIVE }, + { PCI_DEVICE(PCI_VENDOR_ID_TOPSPIN, PCI_DEVICE_ID_MELLANOX_ARBEL), + .driver_data = ARBEL_NATIVE }, + { PCI_DEVICE(PCI_VENDOR_ID_MELLANOX, PCI_DEVICE_ID_MELLANOX_SINAI), + .driver_data = SINAI }, + { PCI_DEVICE(PCI_VENDOR_ID_TOPSPIN, PCI_DEVICE_ID_MELLANOX_SINAI), + .driver_data = SINAI }, + { PCI_DEVICE(PCI_VENDOR_ID_MELLANOX, PCI_DEVICE_ID_MELLANOX_SINAI_OLD), + .driver_data = SINAI }, + { PCI_DEVICE(PCI_VENDOR_ID_TOPSPIN, PCI_DEVICE_ID_MELLANOX_SINAI_OLD), + .driver_data = SINAI }, + { 0, } +}; + +MODULE_DEVICE_TABLE(pci, mthca_pci_table); + +static struct pci_driver mthca_driver = { + .name = DRV_NAME, + .id_table = mthca_pci_table, + .probe = mthca_init_one, + .remove = mthca_remove_one, +}; + +static void __init __mthca_check_profile_val(const char *name, int *pval, + int pval_default) +{ + /* value must be positive and power of 2 */ + int old_pval = *pval; + + if (old_pval <= 0) + *pval = pval_default; + else + *pval = roundup_pow_of_two(old_pval); + + if (old_pval != *pval) { + printk(KERN_WARNING PFX "Invalid value %d for %s in module parameter.\n", + old_pval, name); + printk(KERN_WARNING PFX "Corrected %s to %d.\n", name, *pval); + } +} + +#define mthca_check_profile_val(name, default) \ + __mthca_check_profile_val(#name, &hca_profile.name, default) + +static void __init mthca_validate_profile(void) +{ + mthca_check_profile_val(num_qp, MTHCA_DEFAULT_NUM_QP); + mthca_check_profile_val(rdb_per_qp, MTHCA_DEFAULT_RDB_PER_QP); + mthca_check_profile_val(num_cq, MTHCA_DEFAULT_NUM_CQ); + mthca_check_profile_val(num_mcg, MTHCA_DEFAULT_NUM_MCG); + mthca_check_profile_val(num_mpt, MTHCA_DEFAULT_NUM_MPT); + mthca_check_profile_val(num_mtt, MTHCA_DEFAULT_NUM_MTT); + mthca_check_profile_val(num_udav, MTHCA_DEFAULT_NUM_UDAV); + mthca_check_profile_val(fmr_reserved_mtts, MTHCA_DEFAULT_NUM_RESERVED_MTTS); + + if (hca_profile.fmr_reserved_mtts >= hca_profile.num_mtt) { + printk(KERN_WARNING PFX "Invalid fmr_reserved_mtts module parameter %d.\n", + hca_profile.fmr_reserved_mtts); + printk(KERN_WARNING PFX "(Must be smaller than num_mtt %d)\n", + hca_profile.num_mtt); + hca_profile.fmr_reserved_mtts = hca_profile.num_mtt / 2; + printk(KERN_WARNING PFX "Corrected fmr_reserved_mtts to %d.\n", + hca_profile.fmr_reserved_mtts); + } + + if ((log_mtts_per_seg < 1) || (log_mtts_per_seg > 5)) { + printk(KERN_WARNING PFX "bad log_mtts_per_seg (%d). 
Using default - %d\n", + log_mtts_per_seg, ilog2(MTHCA_MTT_SEG_SIZE / 8)); + log_mtts_per_seg = ilog2(MTHCA_MTT_SEG_SIZE / 8); + } +} + +static int __init mthca_init(void) +{ + int ret; + + mthca_validate_profile(); + + ret = mthca_catas_init(); + if (ret) + return ret; + + ret = pci_register_driver(&mthca_driver); + if (ret < 0) { + mthca_catas_cleanup(); + return ret; + } + + return 0; +} + +static void __exit mthca_cleanup(void) +{ + pci_unregister_driver(&mthca_driver); + mthca_catas_cleanup(); +} + +module_init(mthca_init); +module_exit(mthca_cleanup); diff --git a/kernel/drivers/infiniband/hw/mthca/mthca_mcg.c b/kernel/drivers/infiniband/hw/mthca/mthca_mcg.c new file mode 100644 index 000000000..6304ae8f4 --- /dev/null +++ b/kernel/drivers/infiniband/hw/mthca/mthca_mcg.c @@ -0,0 +1,335 @@ +/* + * Copyright (c) 2004 Topspin Communications. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include +#include + +#include "mthca_dev.h" +#include "mthca_cmd.h" + +struct mthca_mgm { + __be32 next_gid_index; + u32 reserved[3]; + u8 gid[16]; + __be32 qp[MTHCA_QP_PER_MGM]; +}; + +static const u8 zero_gid[16]; /* automatically initialized to 0 */ + +/* + * Caller must hold MCG table semaphore. gid and mgm parameters must + * be properly aligned for command interface. + * + * Returns 0 unless a firmware command error occurs. + * + * If GID is found in MGM or MGM is empty, *index = *hash, *prev = -1 + * and *mgm holds MGM entry. + * + * if GID is found in AMGM, *index = index in AMGM, *prev = index of + * previous entry in hash chain and *mgm holds AMGM entry. + * + * If no AMGM exists for given gid, *index = -1, *prev = index of last + * entry in hash chain and *mgm holds end of hash chain. 
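+ * + * The chain is walked through next_gid_index: the next AMGM index is stored in bits 31:6 of that field, and a value of zero terminates the chain.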
+ */ +static int find_mgm(struct mthca_dev *dev, + u8 *gid, struct mthca_mailbox *mgm_mailbox, + u16 *hash, int *prev, int *index) +{ + struct mthca_mailbox *mailbox; + struct mthca_mgm *mgm = mgm_mailbox->buf; + u8 *mgid; + int err; + + mailbox = mthca_alloc_mailbox(dev, GFP_KERNEL); + if (IS_ERR(mailbox)) + return -ENOMEM; + mgid = mailbox->buf; + + memcpy(mgid, gid, 16); + + err = mthca_MGID_HASH(dev, mailbox, hash); + if (err) { + mthca_err(dev, "MGID_HASH failed (%d)\n", err); + goto out; + } + + if (0) + mthca_dbg(dev, "Hash for %pI6 is %04x\n", gid, *hash); + + *index = *hash; + *prev = -1; + + do { + err = mthca_READ_MGM(dev, *index, mgm_mailbox); + if (err) { + mthca_err(dev, "READ_MGM failed (%d)\n", err); + goto out; + } + + if (!memcmp(mgm->gid, zero_gid, 16)) { + if (*index != *hash) { + mthca_err(dev, "Found zero MGID in AMGM.\n"); + err = -EINVAL; + } + goto out; + } + + if (!memcmp(mgm->gid, gid, 16)) + goto out; + + *prev = *index; + *index = be32_to_cpu(mgm->next_gid_index) >> 6; + } while (*index); + + *index = -1; + + out: + mthca_free_mailbox(dev, mailbox); + return err; +} + +int mthca_multicast_attach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid) +{ + struct mthca_dev *dev = to_mdev(ibqp->device); + struct mthca_mailbox *mailbox; + struct mthca_mgm *mgm; + u16 hash; + int index, prev; + int link = 0; + int i; + int err; + + mailbox = mthca_alloc_mailbox(dev, GFP_KERNEL); + if (IS_ERR(mailbox)) + return PTR_ERR(mailbox); + mgm = mailbox->buf; + + mutex_lock(&dev->mcg_table.mutex); + + err = find_mgm(dev, gid->raw, mailbox, &hash, &prev, &index); + if (err) + goto out; + + if (index != -1) { + if (!memcmp(mgm->gid, zero_gid, 16)) + memcpy(mgm->gid, gid->raw, 16); + } else { + link = 1; + + index = mthca_alloc(&dev->mcg_table.alloc); + if (index == -1) { + mthca_err(dev, "No AMGM entries left\n"); + err = -ENOMEM; + goto out; + } + + err = mthca_READ_MGM(dev, index, mailbox); + if (err) { + mthca_err(dev, "READ_MGM failed (%d)\n", err); + goto out; + } + memset(mgm, 0, sizeof *mgm); + memcpy(mgm->gid, gid->raw, 16); + } + + for (i = 0; i < MTHCA_QP_PER_MGM; ++i) + if (mgm->qp[i] == cpu_to_be32(ibqp->qp_num | (1 << 31))) { + mthca_dbg(dev, "QP %06x already a member of MGM\n", + ibqp->qp_num); + err = 0; + goto out; + } else if (!(mgm->qp[i] & cpu_to_be32(1 << 31))) { + mgm->qp[i] = cpu_to_be32(ibqp->qp_num | (1 << 31)); + break; + } + + if (i == MTHCA_QP_PER_MGM) { + mthca_err(dev, "MGM at index %x is full.\n", index); + err = -ENOMEM; + goto out; + } + + err = mthca_WRITE_MGM(dev, index, mailbox); + if (err) { + mthca_err(dev, "WRITE_MGM failed %d\n", err); + err = -EINVAL; + goto out; + } + + if (!link) + goto out; + + err = mthca_READ_MGM(dev, prev, mailbox); + if (err) { + mthca_err(dev, "READ_MGM failed %d\n", err); + goto out; + } + + mgm->next_gid_index = cpu_to_be32(index << 6); + + err = mthca_WRITE_MGM(dev, prev, mailbox); + if (err) + mthca_err(dev, "WRITE_MGM returned %d\n", err); + + out: + if (err && link && index != -1) { + BUG_ON(index < dev->limits.num_mgms); + mthca_free(&dev->mcg_table.alloc, index); + } + mutex_unlock(&dev->mcg_table.mutex); + + mthca_free_mailbox(dev, mailbox); + return err; +} + +int mthca_multicast_detach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid) +{ + struct mthca_dev *dev = to_mdev(ibqp->device); + struct mthca_mailbox *mailbox; + struct mthca_mgm *mgm; + u16 hash; + int prev, index; + int i, loc; + int err; + + mailbox = mthca_alloc_mailbox(dev, GFP_KERNEL); + if (IS_ERR(mailbox)) + return PTR_ERR(mailbox); + mgm = 
mailbox->buf; + + mutex_lock(&dev->mcg_table.mutex); + + err = find_mgm(dev, gid->raw, mailbox, &hash, &prev, &index); + if (err) + goto out; + + if (index == -1) { + mthca_err(dev, "MGID %pI6 not found\n", gid->raw); + err = -EINVAL; + goto out; + } + + for (loc = -1, i = 0; i < MTHCA_QP_PER_MGM; ++i) { + if (mgm->qp[i] == cpu_to_be32(ibqp->qp_num | (1 << 31))) + loc = i; + if (!(mgm->qp[i] & cpu_to_be32(1 << 31))) + break; + } + + if (loc == -1) { + mthca_err(dev, "QP %06x not found in MGM\n", ibqp->qp_num); + err = -EINVAL; + goto out; + } + + mgm->qp[loc] = mgm->qp[i - 1]; + mgm->qp[i - 1] = 0; + + err = mthca_WRITE_MGM(dev, index, mailbox); + if (err) { + mthca_err(dev, "WRITE_MGM returned %d\n", err); + goto out; + } + + if (i != 1) + goto out; + + if (prev == -1) { + /* Remove entry from MGM */ + int amgm_index_to_free = be32_to_cpu(mgm->next_gid_index) >> 6; + if (amgm_index_to_free) { + err = mthca_READ_MGM(dev, amgm_index_to_free, + mailbox); + if (err) { + mthca_err(dev, "READ_MGM returned %d\n", err); + goto out; + } + } else + memset(mgm->gid, 0, 16); + + err = mthca_WRITE_MGM(dev, index, mailbox); + if (err) { + mthca_err(dev, "WRITE_MGM returned %d\n", err); + goto out; + } + if (amgm_index_to_free) { + BUG_ON(amgm_index_to_free < dev->limits.num_mgms); + mthca_free(&dev->mcg_table.alloc, amgm_index_to_free); + } + } else { + /* Remove entry from AMGM */ + int curr_next_index = be32_to_cpu(mgm->next_gid_index) >> 6; + err = mthca_READ_MGM(dev, prev, mailbox); + if (err) { + mthca_err(dev, "READ_MGM returned %d\n", err); + goto out; + } + + mgm->next_gid_index = cpu_to_be32(curr_next_index << 6); + + err = mthca_WRITE_MGM(dev, prev, mailbox); + if (err) { + mthca_err(dev, "WRITE_MGM returned %d\n", err); + goto out; + } + BUG_ON(index < dev->limits.num_mgms); + mthca_free(&dev->mcg_table.alloc, index); + } + + out: + mutex_unlock(&dev->mcg_table.mutex); + + mthca_free_mailbox(dev, mailbox); + return err; +} + +int mthca_init_mcg_table(struct mthca_dev *dev) +{ + int err; + int table_size = dev->limits.num_mgms + dev->limits.num_amgms; + + err = mthca_alloc_init(&dev->mcg_table.alloc, + table_size, + table_size - 1, + dev->limits.num_mgms); + if (err) + return err; + + mutex_init(&dev->mcg_table.mutex); + + return 0; +} + +void mthca_cleanup_mcg_table(struct mthca_dev *dev) +{ + mthca_alloc_cleanup(&dev->mcg_table.alloc); +} diff --git a/kernel/drivers/infiniband/hw/mthca/mthca_memfree.c b/kernel/drivers/infiniband/hw/mthca/mthca_memfree.c new file mode 100644 index 000000000..7d2e42dd6 --- /dev/null +++ b/kernel/drivers/infiniband/hw/mthca/mthca_memfree.c @@ -0,0 +1,760 @@ +/* + * Copyright (c) 2004, 2005 Topspin Communications. All rights reserved. + * Copyright (c) 2005 Cisco Systems. All rights reserved. + * Copyright (c) 2005 Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. 
+ * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include +#include +#include +#include + +#include + +#include "mthca_memfree.h" +#include "mthca_dev.h" +#include "mthca_cmd.h" + +/* + * We allocate in as big chunks as we can, up to a maximum of 256 KB + * per chunk. + */ +enum { + MTHCA_ICM_ALLOC_SIZE = 1 << 18, + MTHCA_TABLE_CHUNK_SIZE = 1 << 18 +}; + +struct mthca_user_db_table { + struct mutex mutex; + struct { + u64 uvirt; + struct scatterlist mem; + int refcount; + } page[0]; +}; + +static void mthca_free_icm_pages(struct mthca_dev *dev, struct mthca_icm_chunk *chunk) +{ + int i; + + if (chunk->nsg > 0) + pci_unmap_sg(dev->pdev, chunk->mem, chunk->npages, + PCI_DMA_BIDIRECTIONAL); + + for (i = 0; i < chunk->npages; ++i) + __free_pages(sg_page(&chunk->mem[i]), + get_order(chunk->mem[i].length)); +} + +static void mthca_free_icm_coherent(struct mthca_dev *dev, struct mthca_icm_chunk *chunk) +{ + int i; + + for (i = 0; i < chunk->npages; ++i) { + dma_free_coherent(&dev->pdev->dev, chunk->mem[i].length, + lowmem_page_address(sg_page(&chunk->mem[i])), + sg_dma_address(&chunk->mem[i])); + } +} + +void mthca_free_icm(struct mthca_dev *dev, struct mthca_icm *icm, int coherent) +{ + struct mthca_icm_chunk *chunk, *tmp; + + if (!icm) + return; + + list_for_each_entry_safe(chunk, tmp, &icm->chunk_list, list) { + if (coherent) + mthca_free_icm_coherent(dev, chunk); + else + mthca_free_icm_pages(dev, chunk); + + kfree(chunk); + } + + kfree(icm); +} + +static int mthca_alloc_icm_pages(struct scatterlist *mem, int order, gfp_t gfp_mask) +{ + struct page *page; + + /* + * Use __GFP_ZERO because buggy firmware assumes ICM pages are + * cleared, and subtle failures are seen if they aren't. 
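+ * If a high-order allocation fails here, the caller (mthca_alloc_icm()) simply retries with a smaller order.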
+ */ + page = alloc_pages(gfp_mask | __GFP_ZERO, order); + if (!page) + return -ENOMEM; + + sg_set_page(mem, page, PAGE_SIZE << order, 0); + return 0; +} + +static int mthca_alloc_icm_coherent(struct device *dev, struct scatterlist *mem, + int order, gfp_t gfp_mask) +{ + void *buf = dma_alloc_coherent(dev, PAGE_SIZE << order, &sg_dma_address(mem), + gfp_mask); + if (!buf) + return -ENOMEM; + + sg_set_buf(mem, buf, PAGE_SIZE << order); + BUG_ON(mem->offset); + sg_dma_len(mem) = PAGE_SIZE << order; + return 0; +} + +struct mthca_icm *mthca_alloc_icm(struct mthca_dev *dev, int npages, + gfp_t gfp_mask, int coherent) +{ + struct mthca_icm *icm; + struct mthca_icm_chunk *chunk = NULL; + int cur_order; + int ret; + + /* We use sg_set_buf for coherent allocs, which assumes low memory */ + BUG_ON(coherent && (gfp_mask & __GFP_HIGHMEM)); + + icm = kmalloc(sizeof *icm, gfp_mask & ~(__GFP_HIGHMEM | __GFP_NOWARN)); + if (!icm) + return icm; + + icm->refcount = 0; + INIT_LIST_HEAD(&icm->chunk_list); + + cur_order = get_order(MTHCA_ICM_ALLOC_SIZE); + + while (npages > 0) { + if (!chunk) { + chunk = kmalloc(sizeof *chunk, + gfp_mask & ~(__GFP_HIGHMEM | __GFP_NOWARN)); + if (!chunk) + goto fail; + + sg_init_table(chunk->mem, MTHCA_ICM_CHUNK_LEN); + chunk->npages = 0; + chunk->nsg = 0; + list_add_tail(&chunk->list, &icm->chunk_list); + } + + while (1 << cur_order > npages) + --cur_order; + + if (coherent) + ret = mthca_alloc_icm_coherent(&dev->pdev->dev, + &chunk->mem[chunk->npages], + cur_order, gfp_mask); + else + ret = mthca_alloc_icm_pages(&chunk->mem[chunk->npages], + cur_order, gfp_mask); + + if (!ret) { + ++chunk->npages; + + if (coherent) + ++chunk->nsg; + else if (chunk->npages == MTHCA_ICM_CHUNK_LEN) { + chunk->nsg = pci_map_sg(dev->pdev, chunk->mem, + chunk->npages, + PCI_DMA_BIDIRECTIONAL); + + if (chunk->nsg <= 0) + goto fail; + } + + if (chunk->npages == MTHCA_ICM_CHUNK_LEN) + chunk = NULL; + + npages -= 1 << cur_order; + } else { + --cur_order; + if (cur_order < 0) + goto fail; + } + } + + if (!coherent && chunk) { + chunk->nsg = pci_map_sg(dev->pdev, chunk->mem, + chunk->npages, + PCI_DMA_BIDIRECTIONAL); + + if (chunk->nsg <= 0) + goto fail; + } + + return icm; + +fail: + mthca_free_icm(dev, icm, coherent); + return NULL; +} + +int mthca_table_get(struct mthca_dev *dev, struct mthca_icm_table *table, int obj) +{ + int i = (obj & (table->num_obj - 1)) * table->obj_size / MTHCA_TABLE_CHUNK_SIZE; + int ret = 0; + + mutex_lock(&table->mutex); + + if (table->icm[i]) { + ++table->icm[i]->refcount; + goto out; + } + + table->icm[i] = mthca_alloc_icm(dev, MTHCA_TABLE_CHUNK_SIZE >> PAGE_SHIFT, + (table->lowmem ? 
GFP_KERNEL : GFP_HIGHUSER) | + __GFP_NOWARN, table->coherent); + if (!table->icm[i]) { + ret = -ENOMEM; + goto out; + } + + if (mthca_MAP_ICM(dev, table->icm[i], + table->virt + i * MTHCA_TABLE_CHUNK_SIZE)) { + mthca_free_icm(dev, table->icm[i], table->coherent); + table->icm[i] = NULL; + ret = -ENOMEM; + goto out; + } + + ++table->icm[i]->refcount; + +out: + mutex_unlock(&table->mutex); + return ret; +} + +void mthca_table_put(struct mthca_dev *dev, struct mthca_icm_table *table, int obj) +{ + int i; + + if (!mthca_is_memfree(dev)) + return; + + i = (obj & (table->num_obj - 1)) * table->obj_size / MTHCA_TABLE_CHUNK_SIZE; + + mutex_lock(&table->mutex); + + if (--table->icm[i]->refcount == 0) { + mthca_UNMAP_ICM(dev, table->virt + i * MTHCA_TABLE_CHUNK_SIZE, + MTHCA_TABLE_CHUNK_SIZE / MTHCA_ICM_PAGE_SIZE); + mthca_free_icm(dev, table->icm[i], table->coherent); + table->icm[i] = NULL; + } + + mutex_unlock(&table->mutex); +} + +void *mthca_table_find(struct mthca_icm_table *table, int obj, dma_addr_t *dma_handle) +{ + int idx, offset, dma_offset, i; + struct mthca_icm_chunk *chunk; + struct mthca_icm *icm; + struct page *page = NULL; + + if (!table->lowmem) + return NULL; + + mutex_lock(&table->mutex); + + idx = (obj & (table->num_obj - 1)) * table->obj_size; + icm = table->icm[idx / MTHCA_TABLE_CHUNK_SIZE]; + dma_offset = offset = idx % MTHCA_TABLE_CHUNK_SIZE; + + if (!icm) + goto out; + + list_for_each_entry(chunk, &icm->chunk_list, list) { + for (i = 0; i < chunk->npages; ++i) { + if (dma_handle && dma_offset >= 0) { + if (sg_dma_len(&chunk->mem[i]) > dma_offset) + *dma_handle = sg_dma_address(&chunk->mem[i]) + + dma_offset; + dma_offset -= sg_dma_len(&chunk->mem[i]); + } + /* DMA mapping can merge pages but not split them, + * so if we found the page, dma_handle has already + * been assigned to. */ + if (chunk->mem[i].length > offset) { + page = sg_page(&chunk->mem[i]); + goto out; + } + offset -= chunk->mem[i].length; + } + } + +out: + mutex_unlock(&table->mutex); + return page ? 
lowmem_page_address(page) + offset : NULL; +} + +int mthca_table_get_range(struct mthca_dev *dev, struct mthca_icm_table *table, + int start, int end) +{ + int inc = MTHCA_TABLE_CHUNK_SIZE / table->obj_size; + int i, err; + + for (i = start; i <= end; i += inc) { + err = mthca_table_get(dev, table, i); + if (err) + goto fail; + } + + return 0; + +fail: + while (i > start) { + i -= inc; + mthca_table_put(dev, table, i); + } + + return err; +} + +void mthca_table_put_range(struct mthca_dev *dev, struct mthca_icm_table *table, + int start, int end) +{ + int i; + + if (!mthca_is_memfree(dev)) + return; + + for (i = start; i <= end; i += MTHCA_TABLE_CHUNK_SIZE / table->obj_size) + mthca_table_put(dev, table, i); +} + +struct mthca_icm_table *mthca_alloc_icm_table(struct mthca_dev *dev, + u64 virt, int obj_size, + int nobj, int reserved, + int use_lowmem, int use_coherent) +{ + struct mthca_icm_table *table; + int obj_per_chunk; + int num_icm; + unsigned chunk_size; + int i; + + obj_per_chunk = MTHCA_TABLE_CHUNK_SIZE / obj_size; + num_icm = DIV_ROUND_UP(nobj, obj_per_chunk); + + table = kmalloc(sizeof *table + num_icm * sizeof *table->icm, GFP_KERNEL); + if (!table) + return NULL; + + table->virt = virt; + table->num_icm = num_icm; + table->num_obj = nobj; + table->obj_size = obj_size; + table->lowmem = use_lowmem; + table->coherent = use_coherent; + mutex_init(&table->mutex); + + for (i = 0; i < num_icm; ++i) + table->icm[i] = NULL; + + for (i = 0; i * MTHCA_TABLE_CHUNK_SIZE < reserved * obj_size; ++i) { + chunk_size = MTHCA_TABLE_CHUNK_SIZE; + if ((i + 1) * MTHCA_TABLE_CHUNK_SIZE > nobj * obj_size) + chunk_size = nobj * obj_size - i * MTHCA_TABLE_CHUNK_SIZE; + + table->icm[i] = mthca_alloc_icm(dev, chunk_size >> PAGE_SHIFT, + (use_lowmem ? GFP_KERNEL : GFP_HIGHUSER) | + __GFP_NOWARN, use_coherent); + if (!table->icm[i]) + goto err; + if (mthca_MAP_ICM(dev, table->icm[i], + virt + i * MTHCA_TABLE_CHUNK_SIZE)) { + mthca_free_icm(dev, table->icm[i], table->coherent); + table->icm[i] = NULL; + goto err; + } + + /* + * Add a reference to this ICM chunk so that it never + * gets freed (since it contains reserved firmware objects). 
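+ * The extra reference means mthca_table_put() never drops the chunk's refcount to zero; reserved chunks stay mapped until mthca_free_icm_table().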
+ */ + ++table->icm[i]->refcount; + } + + return table; + +err: + for (i = 0; i < num_icm; ++i) + if (table->icm[i]) { + mthca_UNMAP_ICM(dev, virt + i * MTHCA_TABLE_CHUNK_SIZE, + MTHCA_TABLE_CHUNK_SIZE / MTHCA_ICM_PAGE_SIZE); + mthca_free_icm(dev, table->icm[i], table->coherent); + } + + kfree(table); + + return NULL; +} + +void mthca_free_icm_table(struct mthca_dev *dev, struct mthca_icm_table *table) +{ + int i; + + for (i = 0; i < table->num_icm; ++i) + if (table->icm[i]) { + mthca_UNMAP_ICM(dev, + table->virt + i * MTHCA_TABLE_CHUNK_SIZE, + MTHCA_TABLE_CHUNK_SIZE / MTHCA_ICM_PAGE_SIZE); + mthca_free_icm(dev, table->icm[i], table->coherent); + } + + kfree(table); +} + +static u64 mthca_uarc_virt(struct mthca_dev *dev, struct mthca_uar *uar, int page) +{ + return dev->uar_table.uarc_base + + uar->index * dev->uar_table.uarc_size + + page * MTHCA_ICM_PAGE_SIZE; +} + +int mthca_map_user_db(struct mthca_dev *dev, struct mthca_uar *uar, + struct mthca_user_db_table *db_tab, int index, u64 uaddr) +{ + struct page *pages[1]; + int ret = 0; + int i; + + if (!mthca_is_memfree(dev)) + return 0; + + if (index < 0 || index > dev->uar_table.uarc_size / 8) + return -EINVAL; + + mutex_lock(&db_tab->mutex); + + i = index / MTHCA_DB_REC_PER_PAGE; + + if ((db_tab->page[i].refcount >= MTHCA_DB_REC_PER_PAGE) || + (db_tab->page[i].uvirt && db_tab->page[i].uvirt != uaddr) || + (uaddr & 4095)) { + ret = -EINVAL; + goto out; + } + + if (db_tab->page[i].refcount) { + ++db_tab->page[i].refcount; + goto out; + } + + ret = get_user_pages(current, current->mm, uaddr & PAGE_MASK, 1, 1, 0, + pages, NULL); + if (ret < 0) + goto out; + + sg_set_page(&db_tab->page[i].mem, pages[0], MTHCA_ICM_PAGE_SIZE, + uaddr & ~PAGE_MASK); + + ret = pci_map_sg(dev->pdev, &db_tab->page[i].mem, 1, PCI_DMA_TODEVICE); + if (ret < 0) { + put_page(pages[0]); + goto out; + } + + ret = mthca_MAP_ICM_page(dev, sg_dma_address(&db_tab->page[i].mem), + mthca_uarc_virt(dev, uar, i)); + if (ret) { + pci_unmap_sg(dev->pdev, &db_tab->page[i].mem, 1, PCI_DMA_TODEVICE); + put_page(sg_page(&db_tab->page[i].mem)); + goto out; + } + + db_tab->page[i].uvirt = uaddr; + db_tab->page[i].refcount = 1; + +out: + mutex_unlock(&db_tab->mutex); + return ret; +} + +void mthca_unmap_user_db(struct mthca_dev *dev, struct mthca_uar *uar, + struct mthca_user_db_table *db_tab, int index) +{ + if (!mthca_is_memfree(dev)) + return; + + /* + * To make our bookkeeping simpler, we don't unmap DB + * pages until we clean up the whole db table. 
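+ * Here we only drop the page's refcount; the UNMAP_ICM and DMA unmapping happen later, in mthca_cleanup_user_db_tab().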
+ */ + + mutex_lock(&db_tab->mutex); + + --db_tab->page[index / MTHCA_DB_REC_PER_PAGE].refcount; + + mutex_unlock(&db_tab->mutex); +} + +struct mthca_user_db_table *mthca_init_user_db_tab(struct mthca_dev *dev) +{ + struct mthca_user_db_table *db_tab; + int npages; + int i; + + if (!mthca_is_memfree(dev)) + return NULL; + + npages = dev->uar_table.uarc_size / MTHCA_ICM_PAGE_SIZE; + db_tab = kmalloc(sizeof *db_tab + npages * sizeof *db_tab->page, GFP_KERNEL); + if (!db_tab) + return ERR_PTR(-ENOMEM); + + mutex_init(&db_tab->mutex); + for (i = 0; i < npages; ++i) { + db_tab->page[i].refcount = 0; + db_tab->page[i].uvirt = 0; + sg_init_table(&db_tab->page[i].mem, 1); + } + + return db_tab; +} + +void mthca_cleanup_user_db_tab(struct mthca_dev *dev, struct mthca_uar *uar, + struct mthca_user_db_table *db_tab) +{ + int i; + + if (!mthca_is_memfree(dev)) + return; + + for (i = 0; i < dev->uar_table.uarc_size / MTHCA_ICM_PAGE_SIZE; ++i) { + if (db_tab->page[i].uvirt) { + mthca_UNMAP_ICM(dev, mthca_uarc_virt(dev, uar, i), 1); + pci_unmap_sg(dev->pdev, &db_tab->page[i].mem, 1, PCI_DMA_TODEVICE); + put_page(sg_page(&db_tab->page[i].mem)); + } + } + + kfree(db_tab); +} + +int mthca_alloc_db(struct mthca_dev *dev, enum mthca_db_type type, + u32 qn, __be32 **db) +{ + int group; + int start, end, dir; + int i, j; + struct mthca_db_page *page; + int ret = 0; + + mutex_lock(&dev->db_tab->mutex); + + switch (type) { + case MTHCA_DB_TYPE_CQ_ARM: + case MTHCA_DB_TYPE_SQ: + group = 0; + start = 0; + end = dev->db_tab->max_group1; + dir = 1; + break; + + case MTHCA_DB_TYPE_CQ_SET_CI: + case MTHCA_DB_TYPE_RQ: + case MTHCA_DB_TYPE_SRQ: + group = 1; + start = dev->db_tab->npages - 1; + end = dev->db_tab->min_group2; + dir = -1; + break; + + default: + ret = -EINVAL; + goto out; + } + + for (i = start; i != end; i += dir) + if (dev->db_tab->page[i].db_rec && + !bitmap_full(dev->db_tab->page[i].used, + MTHCA_DB_REC_PER_PAGE)) { + page = dev->db_tab->page + i; + goto found; + } + + for (i = start; i != end; i += dir) + if (!dev->db_tab->page[i].db_rec) { + page = dev->db_tab->page + i; + goto alloc; + } + + if (dev->db_tab->max_group1 >= dev->db_tab->min_group2 - 1) { + ret = -ENOMEM; + goto out; + } + + if (group == 0) + ++dev->db_tab->max_group1; + else + --dev->db_tab->min_group2; + + page = dev->db_tab->page + end; + +alloc: + page->db_rec = dma_alloc_coherent(&dev->pdev->dev, MTHCA_ICM_PAGE_SIZE, + &page->mapping, GFP_KERNEL); + if (!page->db_rec) { + ret = -ENOMEM; + goto out; + } + memset(page->db_rec, 0, MTHCA_ICM_PAGE_SIZE); + + ret = mthca_MAP_ICM_page(dev, page->mapping, + mthca_uarc_virt(dev, &dev->driver_uar, i)); + if (ret) { + dma_free_coherent(&dev->pdev->dev, MTHCA_ICM_PAGE_SIZE, + page->db_rec, page->mapping); + goto out; + } + + bitmap_zero(page->used, MTHCA_DB_REC_PER_PAGE); + +found: + j = find_first_zero_bit(page->used, MTHCA_DB_REC_PER_PAGE); + set_bit(j, page->used); + + if (group == 1) + j = MTHCA_DB_REC_PER_PAGE - 1 - j; + + ret = i * MTHCA_DB_REC_PER_PAGE + j; + + page->db_rec[j] = cpu_to_be64((qn << 8) | (type << 5)); + + *db = (__be32 *) &page->db_rec[j]; + +out: + mutex_unlock(&dev->db_tab->mutex); + + return ret; +} + +void mthca_free_db(struct mthca_dev *dev, int type, int db_index) +{ + int i, j; + struct mthca_db_page *page; + + i = db_index / MTHCA_DB_REC_PER_PAGE; + j = db_index % MTHCA_DB_REC_PER_PAGE; + + page = dev->db_tab->page + i; + + mutex_lock(&dev->db_tab->mutex); + + page->db_rec[j] = 0; + if (i >= dev->db_tab->min_group2) + j = MTHCA_DB_REC_PER_PAGE - 1 - j; + 
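/* For group-2 pages, db_rec is indexed by the mirrored slot while the used bitmap is not, so mirror j back before clearing its bit. */ +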
clear_bit(j, page->used); + + if (bitmap_empty(page->used, MTHCA_DB_REC_PER_PAGE) && + i >= dev->db_tab->max_group1 - 1) { + mthca_UNMAP_ICM(dev, mthca_uarc_virt(dev, &dev->driver_uar, i), 1); + + dma_free_coherent(&dev->pdev->dev, MTHCA_ICM_PAGE_SIZE, + page->db_rec, page->mapping); + page->db_rec = NULL; + + if (i == dev->db_tab->max_group1) { + --dev->db_tab->max_group1; + /* XXX may be able to unmap more pages now */ + } + if (i == dev->db_tab->min_group2) + ++dev->db_tab->min_group2; + } + + mutex_unlock(&dev->db_tab->mutex); +} + +int mthca_init_db_tab(struct mthca_dev *dev) +{ + int i; + + if (!mthca_is_memfree(dev)) + return 0; + + dev->db_tab = kmalloc(sizeof *dev->db_tab, GFP_KERNEL); + if (!dev->db_tab) + return -ENOMEM; + + mutex_init(&dev->db_tab->mutex); + + dev->db_tab->npages = dev->uar_table.uarc_size / MTHCA_ICM_PAGE_SIZE; + dev->db_tab->max_group1 = 0; + dev->db_tab->min_group2 = dev->db_tab->npages - 1; + + dev->db_tab->page = kmalloc(dev->db_tab->npages * + sizeof *dev->db_tab->page, + GFP_KERNEL); + if (!dev->db_tab->page) { + kfree(dev->db_tab); + return -ENOMEM; + } + + for (i = 0; i < dev->db_tab->npages; ++i) + dev->db_tab->page[i].db_rec = NULL; + + return 0; +} + +void mthca_cleanup_db_tab(struct mthca_dev *dev) +{ + int i; + + if (!mthca_is_memfree(dev)) + return; + + /* + * Because we don't always free our UARC pages when they + * become empty to make mthca_free_db() simpler we need to + * make a sweep through the doorbell pages and free any + * leftover pages now. + */ + for (i = 0; i < dev->db_tab->npages; ++i) { + if (!dev->db_tab->page[i].db_rec) + continue; + + if (!bitmap_empty(dev->db_tab->page[i].used, MTHCA_DB_REC_PER_PAGE)) + mthca_warn(dev, "Kernel UARC page %d not empty\n", i); + + mthca_UNMAP_ICM(dev, mthca_uarc_virt(dev, &dev->driver_uar, i), 1); + + dma_free_coherent(&dev->pdev->dev, MTHCA_ICM_PAGE_SIZE, + dev->db_tab->page[i].db_rec, + dev->db_tab->page[i].mapping); + } + + kfree(dev->db_tab->page); + kfree(dev->db_tab); +} diff --git a/kernel/drivers/infiniband/hw/mthca/mthca_memfree.h b/kernel/drivers/infiniband/hw/mthca/mthca_memfree.h new file mode 100644 index 000000000..da9b8f9b8 --- /dev/null +++ b/kernel/drivers/infiniband/hw/mthca/mthca_memfree.h @@ -0,0 +1,179 @@ +/* + * Copyright (c) 2004, 2005 Topspin Communications. All rights reserved. + * Copyright (c) 2005 Cisco Systems. All rights reserved. + * Copyright (c) 2005 Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef MTHCA_MEMFREE_H +#define MTHCA_MEMFREE_H + +#include +#include + +#define MTHCA_ICM_CHUNK_LEN \ + ((256 - sizeof (struct list_head) - 2 * sizeof (int)) / \ + (sizeof (struct scatterlist))) + +enum { + MTHCA_ICM_PAGE_SHIFT = 12, + MTHCA_ICM_PAGE_SIZE = 1 << MTHCA_ICM_PAGE_SHIFT, + MTHCA_DB_REC_PER_PAGE = MTHCA_ICM_PAGE_SIZE / 8 +}; + +struct mthca_icm_chunk { + struct list_head list; + int npages; + int nsg; + struct scatterlist mem[MTHCA_ICM_CHUNK_LEN]; +}; + +struct mthca_icm { + struct list_head chunk_list; + int refcount; +}; + +struct mthca_icm_table { + u64 virt; + int num_icm; + int num_obj; + int obj_size; + int lowmem; + int coherent; + struct mutex mutex; + struct mthca_icm *icm[0]; +}; + +struct mthca_icm_iter { + struct mthca_icm *icm; + struct mthca_icm_chunk *chunk; + int page_idx; +}; + +struct mthca_dev; + +struct mthca_icm *mthca_alloc_icm(struct mthca_dev *dev, int npages, + gfp_t gfp_mask, int coherent); +void mthca_free_icm(struct mthca_dev *dev, struct mthca_icm *icm, int coherent); + +struct mthca_icm_table *mthca_alloc_icm_table(struct mthca_dev *dev, + u64 virt, int obj_size, + int nobj, int reserved, + int use_lowmem, int use_coherent); +void mthca_free_icm_table(struct mthca_dev *dev, struct mthca_icm_table *table); +int mthca_table_get(struct mthca_dev *dev, struct mthca_icm_table *table, int obj); +void mthca_table_put(struct mthca_dev *dev, struct mthca_icm_table *table, int obj); +void *mthca_table_find(struct mthca_icm_table *table, int obj, dma_addr_t *dma_handle); +int mthca_table_get_range(struct mthca_dev *dev, struct mthca_icm_table *table, + int start, int end); +void mthca_table_put_range(struct mthca_dev *dev, struct mthca_icm_table *table, + int start, int end); + +static inline void mthca_icm_first(struct mthca_icm *icm, + struct mthca_icm_iter *iter) +{ + iter->icm = icm; + iter->chunk = list_empty(&icm->chunk_list) ? 
+ NULL : list_entry(icm->chunk_list.next, + struct mthca_icm_chunk, list); + iter->page_idx = 0; +} + +static inline int mthca_icm_last(struct mthca_icm_iter *iter) +{ + return !iter->chunk; +} + +static inline void mthca_icm_next(struct mthca_icm_iter *iter) +{ + if (++iter->page_idx >= iter->chunk->nsg) { + if (iter->chunk->list.next == &iter->icm->chunk_list) { + iter->chunk = NULL; + return; + } + + iter->chunk = list_entry(iter->chunk->list.next, + struct mthca_icm_chunk, list); + iter->page_idx = 0; + } +} + +static inline dma_addr_t mthca_icm_addr(struct mthca_icm_iter *iter) +{ + return sg_dma_address(&iter->chunk->mem[iter->page_idx]); +} + +static inline unsigned long mthca_icm_size(struct mthca_icm_iter *iter) +{ + return sg_dma_len(&iter->chunk->mem[iter->page_idx]); +} + +struct mthca_db_page { + DECLARE_BITMAP(used, MTHCA_DB_REC_PER_PAGE); + __be64 *db_rec; + dma_addr_t mapping; +}; + +struct mthca_db_table { + int npages; + int max_group1; + int min_group2; + struct mthca_db_page *page; + struct mutex mutex; +}; + +enum mthca_db_type { + MTHCA_DB_TYPE_INVALID = 0x0, + MTHCA_DB_TYPE_CQ_SET_CI = 0x1, + MTHCA_DB_TYPE_CQ_ARM = 0x2, + MTHCA_DB_TYPE_SQ = 0x3, + MTHCA_DB_TYPE_RQ = 0x4, + MTHCA_DB_TYPE_SRQ = 0x5, + MTHCA_DB_TYPE_GROUP_SEP = 0x7 +}; + +struct mthca_user_db_table; +struct mthca_uar; + +int mthca_map_user_db(struct mthca_dev *dev, struct mthca_uar *uar, + struct mthca_user_db_table *db_tab, int index, u64 uaddr); +void mthca_unmap_user_db(struct mthca_dev *dev, struct mthca_uar *uar, + struct mthca_user_db_table *db_tab, int index); +struct mthca_user_db_table *mthca_init_user_db_tab(struct mthca_dev *dev); +void mthca_cleanup_user_db_tab(struct mthca_dev *dev, struct mthca_uar *uar, + struct mthca_user_db_table *db_tab); + +int mthca_init_db_tab(struct mthca_dev *dev); +void mthca_cleanup_db_tab(struct mthca_dev *dev); +int mthca_alloc_db(struct mthca_dev *dev, enum mthca_db_type type, + u32 qn, __be32 **db); +void mthca_free_db(struct mthca_dev *dev, int type, int db_index); + +#endif /* MTHCA_MEMFREE_H */ diff --git a/kernel/drivers/infiniband/hw/mthca/mthca_mr.c b/kernel/drivers/infiniband/hw/mthca/mthca_mr.c new file mode 100644 index 000000000..ed9a989e5 --- /dev/null +++ b/kernel/drivers/infiniband/hw/mthca/mthca_mr.c @@ -0,0 +1,965 @@ +/* + * Copyright (c) 2004 Topspin Communications. All rights reserved. + * Copyright (c) 2005 Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include +#include + +#include "mthca_dev.h" +#include "mthca_cmd.h" +#include "mthca_memfree.h" + +struct mthca_mtt { + struct mthca_buddy *buddy; + int order; + u32 first_seg; +}; + +/* + * Must be packed because mtt_seg is 64 bits but only aligned to 32 bits. + */ +struct mthca_mpt_entry { + __be32 flags; + __be32 page_size; + __be32 key; + __be32 pd; + __be64 start; + __be64 length; + __be32 lkey; + __be32 window_count; + __be32 window_count_limit; + __be64 mtt_seg; + __be32 mtt_sz; /* Arbel only */ + u32 reserved[2]; +} __attribute__((packed)); + +#define MTHCA_MPT_FLAG_SW_OWNS (0xfUL << 28) +#define MTHCA_MPT_FLAG_MIO (1 << 17) +#define MTHCA_MPT_FLAG_BIND_ENABLE (1 << 15) +#define MTHCA_MPT_FLAG_PHYSICAL (1 << 9) +#define MTHCA_MPT_FLAG_REGION (1 << 8) + +#define MTHCA_MTT_FLAG_PRESENT 1 + +#define MTHCA_MPT_STATUS_SW 0xF0 +#define MTHCA_MPT_STATUS_HW 0x00 + +#define SINAI_FMR_KEY_INC 0x1000000 + +/* + * Buddy allocator for MTT segments (currently not very efficient + * since it doesn't keep a free list and just searches linearly + * through the bitmaps) + */ + +static u32 mthca_buddy_alloc(struct mthca_buddy *buddy, int order) +{ + int o; + int m; + u32 seg; + + spin_lock(&buddy->lock); + + for (o = order; o <= buddy->max_order; ++o) + if (buddy->num_free[o]) { + m = 1 << (buddy->max_order - o); + seg = find_first_bit(buddy->bits[o], m); + if (seg < m) + goto found; + } + + spin_unlock(&buddy->lock); + return -1; + + found: + clear_bit(seg, buddy->bits[o]); + --buddy->num_free[o]; + + while (o > order) { + --o; + seg <<= 1; + set_bit(seg ^ 1, buddy->bits[o]); + ++buddy->num_free[o]; + } + + spin_unlock(&buddy->lock); + + seg <<= order; + + return seg; +} + +static void mthca_buddy_free(struct mthca_buddy *buddy, u32 seg, int order) +{ + seg >>= order; + + spin_lock(&buddy->lock); + + while (test_bit(seg ^ 1, buddy->bits[order])) { + clear_bit(seg ^ 1, buddy->bits[order]); + --buddy->num_free[order]; + seg >>= 1; + ++order; + } + + set_bit(seg, buddy->bits[order]); + ++buddy->num_free[order]; + + spin_unlock(&buddy->lock); +} + +static int mthca_buddy_init(struct mthca_buddy *buddy, int max_order) +{ + int i, s; + + buddy->max_order = max_order; + spin_lock_init(&buddy->lock); + + buddy->bits = kzalloc((buddy->max_order + 1) * sizeof (long *), + GFP_KERNEL); + buddy->num_free = kcalloc((buddy->max_order + 1), sizeof *buddy->num_free, + GFP_KERNEL); + if (!buddy->bits || !buddy->num_free) + goto err_out; + + for (i = 0; i <= buddy->max_order; ++i) { + s = BITS_TO_LONGS(1 << (buddy->max_order - i)); + buddy->bits[i] = kmalloc(s * sizeof (long), GFP_KERNEL); + if (!buddy->bits[i]) + goto err_out_free; + bitmap_zero(buddy->bits[i], + 1 << (buddy->max_order - i)); + } + + set_bit(0, buddy->bits[buddy->max_order]); + buddy->num_free[buddy->max_order] = 1; + + return 0; + +err_out_free: + for (i = 0; i <= buddy->max_order; ++i) + kfree(buddy->bits[i]); + +err_out: + kfree(buddy->bits); + kfree(buddy->num_free); + + return -ENOMEM; +} + +static void mthca_buddy_cleanup(struct mthca_buddy *buddy) +{ + int i; + + for (i = 0; i <= buddy->max_order; ++i) + kfree(buddy->bits[i]); + + kfree(buddy->bits); + kfree(buddy->num_free); +} + +static u32 mthca_alloc_mtt_range(struct mthca_dev *dev, int order, + struct 
mthca_buddy *buddy) +{ + u32 seg = mthca_buddy_alloc(buddy, order); + + if (seg == -1) + return -1; + + if (mthca_is_memfree(dev)) + if (mthca_table_get_range(dev, dev->mr_table.mtt_table, seg, + seg + (1 << order) - 1)) { + mthca_buddy_free(buddy, seg, order); + seg = -1; + } + + return seg; +} + +static struct mthca_mtt *__mthca_alloc_mtt(struct mthca_dev *dev, int size, + struct mthca_buddy *buddy) +{ + struct mthca_mtt *mtt; + int i; + + if (size <= 0) + return ERR_PTR(-EINVAL); + + mtt = kmalloc(sizeof *mtt, GFP_KERNEL); + if (!mtt) + return ERR_PTR(-ENOMEM); + + mtt->buddy = buddy; + mtt->order = 0; + for (i = dev->limits.mtt_seg_size / 8; i < size; i <<= 1) + ++mtt->order; + + mtt->first_seg = mthca_alloc_mtt_range(dev, mtt->order, buddy); + if (mtt->first_seg == -1) { + kfree(mtt); + return ERR_PTR(-ENOMEM); + } + + return mtt; +} + +struct mthca_mtt *mthca_alloc_mtt(struct mthca_dev *dev, int size) +{ + return __mthca_alloc_mtt(dev, size, &dev->mr_table.mtt_buddy); +} + +void mthca_free_mtt(struct mthca_dev *dev, struct mthca_mtt *mtt) +{ + if (!mtt) + return; + + mthca_buddy_free(mtt->buddy, mtt->first_seg, mtt->order); + + mthca_table_put_range(dev, dev->mr_table.mtt_table, + mtt->first_seg, + mtt->first_seg + (1 << mtt->order) - 1); + + kfree(mtt); +} + +static int __mthca_write_mtt(struct mthca_dev *dev, struct mthca_mtt *mtt, + int start_index, u64 *buffer_list, int list_len) +{ + struct mthca_mailbox *mailbox; + __be64 *mtt_entry; + int err = 0; + int i; + + mailbox = mthca_alloc_mailbox(dev, GFP_KERNEL); + if (IS_ERR(mailbox)) + return PTR_ERR(mailbox); + mtt_entry = mailbox->buf; + + while (list_len > 0) { + mtt_entry[0] = cpu_to_be64(dev->mr_table.mtt_base + + mtt->first_seg * dev->limits.mtt_seg_size + + start_index * 8); + mtt_entry[1] = 0; + for (i = 0; i < list_len && i < MTHCA_MAILBOX_SIZE / 8 - 2; ++i) + mtt_entry[i + 2] = cpu_to_be64(buffer_list[i] | + MTHCA_MTT_FLAG_PRESENT); + + /* + * If we have an odd number of entries to write, add + * one more dummy entry for firmware efficiency. + */ + if (i & 1) + mtt_entry[i + 2] = 0; + + err = mthca_WRITE_MTT(dev, mailbox, (i + 1) & ~1); + if (err) { + mthca_warn(dev, "WRITE_MTT failed (%d)\n", err); + goto out; + } + + list_len -= i; + start_index += i; + buffer_list += i; + } + +out: + mthca_free_mailbox(dev, mailbox); + return err; +} + +int mthca_write_mtt_size(struct mthca_dev *dev) +{ + if (dev->mr_table.fmr_mtt_buddy != &dev->mr_table.mtt_buddy || + !(dev->mthca_flags & MTHCA_FLAG_FMR)) + /* + * Be friendly to WRITE_MTT command + * and leave two empty slots for the + * index and reserved fields of the + * mailbox. + */ + return PAGE_SIZE / sizeof (u64) - 2; + + /* For Arbel, all MTTs must fit in the same page. */ + return mthca_is_memfree(dev) ? (PAGE_SIZE / sizeof (u64)) : 0x7ffffff; +} + +static void mthca_tavor_write_mtt_seg(struct mthca_dev *dev, + struct mthca_mtt *mtt, int start_index, + u64 *buffer_list, int list_len) +{ + u64 __iomem *mtts; + int i; + + mtts = dev->mr_table.tavor_fmr.mtt_base + mtt->first_seg * dev->limits.mtt_seg_size + + start_index * sizeof (u64); + for (i = 0; i < list_len; ++i) + mthca_write64_raw(cpu_to_be64(buffer_list[i] | MTHCA_MTT_FLAG_PRESENT), + mtts + i); +} + +static void mthca_arbel_write_mtt_seg(struct mthca_dev *dev, + struct mthca_mtt *mtt, int start_index, + u64 *buffer_list, int list_len) +{ + __be64 *mtts; + dma_addr_t dma_handle; + int i; + int s = start_index * sizeof (u64); + + /* For Arbel, all MTTs must fit in the same page. 
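The two BUG_ONs below enforce this: the write must not cross a page boundary and must start on an MTT segment boundary.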
*/ + BUG_ON(s / PAGE_SIZE != (s + list_len * sizeof(u64) - 1) / PAGE_SIZE); + /* Require full segments */ + BUG_ON(s % dev->limits.mtt_seg_size); + + mtts = mthca_table_find(dev->mr_table.mtt_table, mtt->first_seg + + s / dev->limits.mtt_seg_size, &dma_handle); + + BUG_ON(!mtts); + + dma_sync_single_for_cpu(&dev->pdev->dev, dma_handle, + list_len * sizeof (u64), DMA_TO_DEVICE); + + for (i = 0; i < list_len; ++i) + mtts[i] = cpu_to_be64(buffer_list[i] | MTHCA_MTT_FLAG_PRESENT); + + dma_sync_single_for_device(&dev->pdev->dev, dma_handle, + list_len * sizeof (u64), DMA_TO_DEVICE); +} + +int mthca_write_mtt(struct mthca_dev *dev, struct mthca_mtt *mtt, + int start_index, u64 *buffer_list, int list_len) +{ + int size = mthca_write_mtt_size(dev); + int chunk; + + if (dev->mr_table.fmr_mtt_buddy != &dev->mr_table.mtt_buddy || + !(dev->mthca_flags & MTHCA_FLAG_FMR)) + return __mthca_write_mtt(dev, mtt, start_index, buffer_list, list_len); + + while (list_len > 0) { + chunk = min(size, list_len); + if (mthca_is_memfree(dev)) + mthca_arbel_write_mtt_seg(dev, mtt, start_index, + buffer_list, chunk); + else + mthca_tavor_write_mtt_seg(dev, mtt, start_index, + buffer_list, chunk); + + list_len -= chunk; + start_index += chunk; + buffer_list += chunk; + } + + return 0; +} + +static inline u32 tavor_hw_index_to_key(u32 ind) +{ + return ind; +} + +static inline u32 tavor_key_to_hw_index(u32 key) +{ + return key; +} + +static inline u32 arbel_hw_index_to_key(u32 ind) +{ + return (ind >> 24) | (ind << 8); +} + +static inline u32 arbel_key_to_hw_index(u32 key) +{ + return (key << 24) | (key >> 8); +} + +static inline u32 hw_index_to_key(struct mthca_dev *dev, u32 ind) +{ + if (mthca_is_memfree(dev)) + return arbel_hw_index_to_key(ind); + else + return tavor_hw_index_to_key(ind); +} + +static inline u32 key_to_hw_index(struct mthca_dev *dev, u32 key) +{ + if (mthca_is_memfree(dev)) + return arbel_key_to_hw_index(key); + else + return tavor_key_to_hw_index(key); +} + +static inline u32 adjust_key(struct mthca_dev *dev, u32 key) +{ + if (dev->mthca_flags & MTHCA_FLAG_SINAI_OPT) + return ((key << 20) & 0x800000) | (key & 0x7fffff); + else + return key; +} + +int mthca_mr_alloc(struct mthca_dev *dev, u32 pd, int buffer_size_shift, + u64 iova, u64 total_size, u32 access, struct mthca_mr *mr) +{ + struct mthca_mailbox *mailbox; + struct mthca_mpt_entry *mpt_entry; + u32 key; + int i; + int err; + + WARN_ON(buffer_size_shift >= 32); + + key = mthca_alloc(&dev->mr_table.mpt_alloc); + if (key == -1) + return -ENOMEM; + key = adjust_key(dev, key); + mr->ibmr.rkey = mr->ibmr.lkey = hw_index_to_key(dev, key); + + if (mthca_is_memfree(dev)) { + err = mthca_table_get(dev, dev->mr_table.mpt_table, key); + if (err) + goto err_out_mpt_free; + } + + mailbox = mthca_alloc_mailbox(dev, GFP_KERNEL); + if (IS_ERR(mailbox)) { + err = PTR_ERR(mailbox); + goto err_out_table; + } + mpt_entry = mailbox->buf; + + mpt_entry->flags = cpu_to_be32(MTHCA_MPT_FLAG_SW_OWNS | + MTHCA_MPT_FLAG_MIO | + MTHCA_MPT_FLAG_REGION | + access); + if (!mr->mtt) + mpt_entry->flags |= cpu_to_be32(MTHCA_MPT_FLAG_PHYSICAL); + + mpt_entry->page_size = cpu_to_be32(buffer_size_shift - 12); + mpt_entry->key = cpu_to_be32(key); + mpt_entry->pd = cpu_to_be32(pd); + mpt_entry->start = cpu_to_be64(iova); + mpt_entry->length = cpu_to_be64(total_size); + + memset(&mpt_entry->lkey, 0, + sizeof *mpt_entry - offsetof(struct mthca_mpt_entry, lkey)); + + if (mr->mtt) + mpt_entry->mtt_seg = + cpu_to_be64(dev->mr_table.mtt_base + + mr->mtt->first_seg * 
dev->limits.mtt_seg_size); + + if (0) { + mthca_dbg(dev, "Dumping MPT entry %08x:\n", mr->ibmr.lkey); + for (i = 0; i < sizeof (struct mthca_mpt_entry) / 4; ++i) { + if (i % 4 == 0) + printk("[%02x] ", i * 4); + printk(" %08x", be32_to_cpu(((__be32 *) mpt_entry)[i])); + if ((i + 1) % 4 == 0) + printk("\n"); + } + } + + err = mthca_SW2HW_MPT(dev, mailbox, + key & (dev->limits.num_mpts - 1)); + if (err) { + mthca_warn(dev, "SW2HW_MPT failed (%d)\n", err); + goto err_out_mailbox; + } + + mthca_free_mailbox(dev, mailbox); + return err; + +err_out_mailbox: + mthca_free_mailbox(dev, mailbox); + +err_out_table: + mthca_table_put(dev, dev->mr_table.mpt_table, key); + +err_out_mpt_free: + mthca_free(&dev->mr_table.mpt_alloc, key); + return err; +} + +int mthca_mr_alloc_notrans(struct mthca_dev *dev, u32 pd, + u32 access, struct mthca_mr *mr) +{ + mr->mtt = NULL; + return mthca_mr_alloc(dev, pd, 12, 0, ~0ULL, access, mr); +} + +int mthca_mr_alloc_phys(struct mthca_dev *dev, u32 pd, + u64 *buffer_list, int buffer_size_shift, + int list_len, u64 iova, u64 total_size, + u32 access, struct mthca_mr *mr) +{ + int err; + + mr->mtt = mthca_alloc_mtt(dev, list_len); + if (IS_ERR(mr->mtt)) + return PTR_ERR(mr->mtt); + + err = mthca_write_mtt(dev, mr->mtt, 0, buffer_list, list_len); + if (err) { + mthca_free_mtt(dev, mr->mtt); + return err; + } + + err = mthca_mr_alloc(dev, pd, buffer_size_shift, iova, + total_size, access, mr); + if (err) + mthca_free_mtt(dev, mr->mtt); + + return err; +} + +/* Free mr or fmr */ +static void mthca_free_region(struct mthca_dev *dev, u32 lkey) +{ + mthca_table_put(dev, dev->mr_table.mpt_table, + key_to_hw_index(dev, lkey)); + + mthca_free(&dev->mr_table.mpt_alloc, key_to_hw_index(dev, lkey)); +} + +void mthca_free_mr(struct mthca_dev *dev, struct mthca_mr *mr) +{ + int err; + + err = mthca_HW2SW_MPT(dev, NULL, + key_to_hw_index(dev, mr->ibmr.lkey) & + (dev->limits.num_mpts - 1)); + if (err) + mthca_warn(dev, "HW2SW_MPT failed (%d)\n", err); + + mthca_free_region(dev, mr->ibmr.lkey); + mthca_free_mtt(dev, mr->mtt); +} + +int mthca_fmr_alloc(struct mthca_dev *dev, u32 pd, + u32 access, struct mthca_fmr *mr) +{ + struct mthca_mpt_entry *mpt_entry; + struct mthca_mailbox *mailbox; + u64 mtt_seg; + u32 key, idx; + int list_len = mr->attr.max_pages; + int err = -ENOMEM; + int i; + + if (mr->attr.page_shift < 12 || mr->attr.page_shift >= 32) + return -EINVAL; + + /* For Arbel, all MTTs must fit in the same page. 
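FMRs whose full MTT list would exceed one page are therefore rejected below on mem-free HCAs.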
*/ + if (mthca_is_memfree(dev) && + mr->attr.max_pages * sizeof *mr->mem.arbel.mtts > PAGE_SIZE) + return -EINVAL; + + mr->maps = 0; + + key = mthca_alloc(&dev->mr_table.mpt_alloc); + if (key == -1) + return -ENOMEM; + key = adjust_key(dev, key); + + idx = key & (dev->limits.num_mpts - 1); + mr->ibmr.rkey = mr->ibmr.lkey = hw_index_to_key(dev, key); + + if (mthca_is_memfree(dev)) { + err = mthca_table_get(dev, dev->mr_table.mpt_table, key); + if (err) + goto err_out_mpt_free; + + mr->mem.arbel.mpt = mthca_table_find(dev->mr_table.mpt_table, key, NULL); + BUG_ON(!mr->mem.arbel.mpt); + } else + mr->mem.tavor.mpt = dev->mr_table.tavor_fmr.mpt_base + + sizeof *(mr->mem.tavor.mpt) * idx; + + mr->mtt = __mthca_alloc_mtt(dev, list_len, dev->mr_table.fmr_mtt_buddy); + if (IS_ERR(mr->mtt)) { + err = PTR_ERR(mr->mtt); + goto err_out_table; + } + + mtt_seg = mr->mtt->first_seg * dev->limits.mtt_seg_size; + + if (mthca_is_memfree(dev)) { + mr->mem.arbel.mtts = mthca_table_find(dev->mr_table.mtt_table, + mr->mtt->first_seg, + &mr->mem.arbel.dma_handle); + BUG_ON(!mr->mem.arbel.mtts); + } else + mr->mem.tavor.mtts = dev->mr_table.tavor_fmr.mtt_base + mtt_seg; + + mailbox = mthca_alloc_mailbox(dev, GFP_KERNEL); + if (IS_ERR(mailbox)) { + err = PTR_ERR(mailbox); + goto err_out_free_mtt; + } + + mpt_entry = mailbox->buf; + + mpt_entry->flags = cpu_to_be32(MTHCA_MPT_FLAG_SW_OWNS | + MTHCA_MPT_FLAG_MIO | + MTHCA_MPT_FLAG_REGION | + access); + + mpt_entry->page_size = cpu_to_be32(mr->attr.page_shift - 12); + mpt_entry->key = cpu_to_be32(key); + mpt_entry->pd = cpu_to_be32(pd); + memset(&mpt_entry->start, 0, + sizeof *mpt_entry - offsetof(struct mthca_mpt_entry, start)); + mpt_entry->mtt_seg = cpu_to_be64(dev->mr_table.mtt_base + mtt_seg); + + if (0) { + mthca_dbg(dev, "Dumping MPT entry %08x:\n", mr->ibmr.lkey); + for (i = 0; i < sizeof (struct mthca_mpt_entry) / 4; ++i) { + if (i % 4 == 0) + printk("[%02x] ", i * 4); + printk(" %08x", be32_to_cpu(((__be32 *) mpt_entry)[i])); + if ((i + 1) % 4 == 0) + printk("\n"); + } + } + + err = mthca_SW2HW_MPT(dev, mailbox, + key & (dev->limits.num_mpts - 1)); + if (err) { + mthca_warn(dev, "SW2HW_MPT failed (%d)\n", err); + goto err_out_mailbox_free; + } + + mthca_free_mailbox(dev, mailbox); + return 0; + +err_out_mailbox_free: + mthca_free_mailbox(dev, mailbox); + +err_out_free_mtt: + mthca_free_mtt(dev, mr->mtt); + +err_out_table: + mthca_table_put(dev, dev->mr_table.mpt_table, key); + +err_out_mpt_free: + mthca_free(&dev->mr_table.mpt_alloc, key); + return err; +} + +int mthca_free_fmr(struct mthca_dev *dev, struct mthca_fmr *fmr) +{ + if (fmr->maps) + return -EBUSY; + + mthca_free_region(dev, fmr->ibmr.lkey); + mthca_free_mtt(dev, fmr->mtt); + + return 0; +} + +static inline int mthca_check_fmr(struct mthca_fmr *fmr, u64 *page_list, + int list_len, u64 iova) +{ + int i, page_mask; + + if (list_len > fmr->attr.max_pages) + return -EINVAL; + + page_mask = (1 << fmr->attr.page_shift) - 1; + + /* We are getting page lists, so va must be page aligned. 
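page_mask covers the low attr.page_shift bits, so any iova with an offset inside an FMR page is rejected.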
*/ + if (iova & page_mask) + return -EINVAL; + + /* Trust the user not to pass misaligned data in page_list */ + if (0) + for (i = 0; i < list_len; ++i) { + if (page_list[i] & ~page_mask) + return -EINVAL; + } + + if (fmr->maps >= fmr->attr.max_maps) + return -EINVAL; + + return 0; +} + + +int mthca_tavor_map_phys_fmr(struct ib_fmr *ibfmr, u64 *page_list, + int list_len, u64 iova) +{ + struct mthca_fmr *fmr = to_mfmr(ibfmr); + struct mthca_dev *dev = to_mdev(ibfmr->device); + struct mthca_mpt_entry mpt_entry; + u32 key; + int i, err; + + err = mthca_check_fmr(fmr, page_list, list_len, iova); + if (err) + return err; + + ++fmr->maps; + + key = tavor_key_to_hw_index(fmr->ibmr.lkey); + key += dev->limits.num_mpts; + fmr->ibmr.lkey = fmr->ibmr.rkey = tavor_hw_index_to_key(key); + + writeb(MTHCA_MPT_STATUS_SW, fmr->mem.tavor.mpt); + + for (i = 0; i < list_len; ++i) { + __be64 mtt_entry = cpu_to_be64(page_list[i] | + MTHCA_MTT_FLAG_PRESENT); + mthca_write64_raw(mtt_entry, fmr->mem.tavor.mtts + i); + } + + mpt_entry.lkey = cpu_to_be32(key); + mpt_entry.length = cpu_to_be64(list_len * (1ull << fmr->attr.page_shift)); + mpt_entry.start = cpu_to_be64(iova); + + __raw_writel((__force u32) mpt_entry.lkey, &fmr->mem.tavor.mpt->key); + memcpy_toio(&fmr->mem.tavor.mpt->start, &mpt_entry.start, + offsetof(struct mthca_mpt_entry, window_count) - + offsetof(struct mthca_mpt_entry, start)); + + writeb(MTHCA_MPT_STATUS_HW, fmr->mem.tavor.mpt); + + return 0; +} + +int mthca_arbel_map_phys_fmr(struct ib_fmr *ibfmr, u64 *page_list, + int list_len, u64 iova) +{ + struct mthca_fmr *fmr = to_mfmr(ibfmr); + struct mthca_dev *dev = to_mdev(ibfmr->device); + u32 key; + int i, err; + + err = mthca_check_fmr(fmr, page_list, list_len, iova); + if (err) + return err; + + ++fmr->maps; + + key = arbel_key_to_hw_index(fmr->ibmr.lkey); + if (dev->mthca_flags & MTHCA_FLAG_SINAI_OPT) + key += SINAI_FMR_KEY_INC; + else + key += dev->limits.num_mpts; + fmr->ibmr.lkey = fmr->ibmr.rkey = arbel_hw_index_to_key(key); + + *(u8 *) fmr->mem.arbel.mpt = MTHCA_MPT_STATUS_SW; + + wmb(); + + dma_sync_single_for_cpu(&dev->pdev->dev, fmr->mem.arbel.dma_handle, + list_len * sizeof(u64), DMA_TO_DEVICE); + + for (i = 0; i < list_len; ++i) + fmr->mem.arbel.mtts[i] = cpu_to_be64(page_list[i] | + MTHCA_MTT_FLAG_PRESENT); + + dma_sync_single_for_device(&dev->pdev->dev, fmr->mem.arbel.dma_handle, + list_len * sizeof(u64), DMA_TO_DEVICE); + + fmr->mem.arbel.mpt->key = cpu_to_be32(key); + fmr->mem.arbel.mpt->lkey = cpu_to_be32(key); + fmr->mem.arbel.mpt->length = cpu_to_be64(list_len * (1ull << fmr->attr.page_shift)); + fmr->mem.arbel.mpt->start = cpu_to_be64(iova); + + wmb(); + + *(u8 *) fmr->mem.arbel.mpt = MTHCA_MPT_STATUS_HW; + + wmb(); + + return 0; +} + +void mthca_tavor_fmr_unmap(struct mthca_dev *dev, struct mthca_fmr *fmr) +{ + if (!fmr->maps) + return; + + fmr->maps = 0; + + writeb(MTHCA_MPT_STATUS_SW, fmr->mem.tavor.mpt); +} + +void mthca_arbel_fmr_unmap(struct mthca_dev *dev, struct mthca_fmr *fmr) +{ + if (!fmr->maps) + return; + + fmr->maps = 0; + + *(u8 *) fmr->mem.arbel.mpt = MTHCA_MPT_STATUS_SW; +} + +int mthca_init_mr_table(struct mthca_dev *dev) +{ + phys_addr_t addr; + int mpts, mtts, err, i; + + err = mthca_alloc_init(&dev->mr_table.mpt_alloc, + dev->limits.num_mpts, + ~0, dev->limits.reserved_mrws); + if (err) + return err; + + if (!mthca_is_memfree(dev) && + (dev->mthca_flags & MTHCA_FLAG_DDR_HIDDEN)) + dev->limits.fmr_reserved_mtts = 0; + else + dev->mthca_flags |= MTHCA_FLAG_FMR; + + if (dev->mthca_flags & MTHCA_FLAG_SINAI_OPT) 
+ mthca_dbg(dev, "Memory key throughput optimization activated.\n"); + + err = mthca_buddy_init(&dev->mr_table.mtt_buddy, + fls(dev->limits.num_mtt_segs - 1)); + + if (err) + goto err_mtt_buddy; + + dev->mr_table.tavor_fmr.mpt_base = NULL; + dev->mr_table.tavor_fmr.mtt_base = NULL; + + if (dev->limits.fmr_reserved_mtts) { + i = fls(dev->limits.fmr_reserved_mtts - 1); + + if (i >= 31) { + mthca_warn(dev, "Unable to reserve 2^31 FMR MTTs.\n"); + err = -EINVAL; + goto err_fmr_mpt; + } + mpts = mtts = 1 << i; + } else { + mtts = dev->limits.num_mtt_segs; + mpts = dev->limits.num_mpts; + } + + if (!mthca_is_memfree(dev) && + (dev->mthca_flags & MTHCA_FLAG_FMR)) { + + addr = pci_resource_start(dev->pdev, 4) + + ((pci_resource_len(dev->pdev, 4) - 1) & + dev->mr_table.mpt_base); + + dev->mr_table.tavor_fmr.mpt_base = + ioremap(addr, mpts * sizeof(struct mthca_mpt_entry)); + + if (!dev->mr_table.tavor_fmr.mpt_base) { + mthca_warn(dev, "MPT ioremap for FMR failed.\n"); + err = -ENOMEM; + goto err_fmr_mpt; + } + + addr = pci_resource_start(dev->pdev, 4) + + ((pci_resource_len(dev->pdev, 4) - 1) & + dev->mr_table.mtt_base); + + dev->mr_table.tavor_fmr.mtt_base = + ioremap(addr, mtts * dev->limits.mtt_seg_size); + if (!dev->mr_table.tavor_fmr.mtt_base) { + mthca_warn(dev, "MTT ioremap for FMR failed.\n"); + err = -ENOMEM; + goto err_fmr_mtt; + } + } + + if (dev->limits.fmr_reserved_mtts) { + err = mthca_buddy_init(&dev->mr_table.tavor_fmr.mtt_buddy, fls(mtts - 1)); + if (err) + goto err_fmr_mtt_buddy; + + /* Prevent regular MRs from using FMR keys */ + err = mthca_buddy_alloc(&dev->mr_table.mtt_buddy, fls(mtts - 1)); + if (err) + goto err_reserve_fmr; + + dev->mr_table.fmr_mtt_buddy = + &dev->mr_table.tavor_fmr.mtt_buddy; + } else + dev->mr_table.fmr_mtt_buddy = &dev->mr_table.mtt_buddy; + + /* FMR table is always the first, take reserved MTTs out of there */ + if (dev->limits.reserved_mtts) { + i = fls(dev->limits.reserved_mtts - 1); + + if (mthca_alloc_mtt_range(dev, i, + dev->mr_table.fmr_mtt_buddy) == -1) { + mthca_warn(dev, "MTT table of order %d is too small.\n", + dev->mr_table.fmr_mtt_buddy->max_order); + err = -ENOMEM; + goto err_reserve_mtts; + } + } + + return 0; + +err_reserve_mtts: +err_reserve_fmr: + if (dev->limits.fmr_reserved_mtts) + mthca_buddy_cleanup(&dev->mr_table.tavor_fmr.mtt_buddy); + +err_fmr_mtt_buddy: + if (dev->mr_table.tavor_fmr.mtt_base) + iounmap(dev->mr_table.tavor_fmr.mtt_base); + +err_fmr_mtt: + if (dev->mr_table.tavor_fmr.mpt_base) + iounmap(dev->mr_table.tavor_fmr.mpt_base); + +err_fmr_mpt: + mthca_buddy_cleanup(&dev->mr_table.mtt_buddy); + +err_mtt_buddy: + mthca_alloc_cleanup(&dev->mr_table.mpt_alloc); + + return err; +} + +void mthca_cleanup_mr_table(struct mthca_dev *dev) +{ + /* XXX check if any MRs are still allocated? */ + if (dev->limits.fmr_reserved_mtts) + mthca_buddy_cleanup(&dev->mr_table.tavor_fmr.mtt_buddy); + + mthca_buddy_cleanup(&dev->mr_table.mtt_buddy); + + if (dev->mr_table.tavor_fmr.mtt_base) + iounmap(dev->mr_table.tavor_fmr.mtt_base); + if (dev->mr_table.tavor_fmr.mpt_base) + iounmap(dev->mr_table.tavor_fmr.mpt_base); + + mthca_alloc_cleanup(&dev->mr_table.mpt_alloc); +} diff --git a/kernel/drivers/infiniband/hw/mthca/mthca_pd.c b/kernel/drivers/infiniband/hw/mthca/mthca_pd.c new file mode 100644 index 000000000..266f14e47 --- /dev/null +++ b/kernel/drivers/infiniband/hw/mthca/mthca_pd.c @@ -0,0 +1,81 @@ +/* + * Copyright (c) 2004 Topspin Communications. All rights reserved. + * Copyright (c) 2005 Cisco Systems. All rights reserved. 
+ * Copyright (c) 2005 Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include + +#include "mthca_dev.h" + +int mthca_pd_alloc(struct mthca_dev *dev, int privileged, struct mthca_pd *pd) +{ + int err = 0; + + pd->privileged = privileged; + + atomic_set(&pd->sqp_count, 0); + pd->pd_num = mthca_alloc(&dev->pd_table.alloc); + if (pd->pd_num == -1) + return -ENOMEM; + + if (privileged) { + err = mthca_mr_alloc_notrans(dev, pd->pd_num, + MTHCA_MPT_FLAG_LOCAL_READ | + MTHCA_MPT_FLAG_LOCAL_WRITE, + &pd->ntmr); + if (err) + mthca_free(&dev->pd_table.alloc, pd->pd_num); + } + + return err; +} + +void mthca_pd_free(struct mthca_dev *dev, struct mthca_pd *pd) +{ + if (pd->privileged) + mthca_free_mr(dev, &pd->ntmr); + mthca_free(&dev->pd_table.alloc, pd->pd_num); +} + +int mthca_init_pd_table(struct mthca_dev *dev) +{ + return mthca_alloc_init(&dev->pd_table.alloc, + dev->limits.num_pds, + (1 << 24) - 1, + dev->limits.reserved_pds); +} + +void mthca_cleanup_pd_table(struct mthca_dev *dev) +{ + /* XXX check if any PDs are still allocated? */ + mthca_alloc_cleanup(&dev->pd_table.alloc); +} diff --git a/kernel/drivers/infiniband/hw/mthca/mthca_profile.c b/kernel/drivers/infiniband/hw/mthca/mthca_profile.c new file mode 100644 index 000000000..8edb28a9a --- /dev/null +++ b/kernel/drivers/infiniband/hw/mthca/mthca_profile.c @@ -0,0 +1,285 @@ +/* + * Copyright (c) 2004, 2005 Topspin Communications. All rights reserved. + * Copyright (c) 2005 Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. 
+ * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include +#include +#include +#include + +#include "mthca_profile.h" + +enum { + MTHCA_RES_QP, + MTHCA_RES_EEC, + MTHCA_RES_SRQ, + MTHCA_RES_CQ, + MTHCA_RES_EQP, + MTHCA_RES_EEEC, + MTHCA_RES_EQ, + MTHCA_RES_RDB, + MTHCA_RES_MCG, + MTHCA_RES_MPT, + MTHCA_RES_MTT, + MTHCA_RES_UAR, + MTHCA_RES_UDAV, + MTHCA_RES_UARC, + MTHCA_RES_NUM +}; + +enum { + MTHCA_NUM_EQS = 32, + MTHCA_NUM_PDS = 1 << 15 +}; + +s64 mthca_make_profile(struct mthca_dev *dev, + struct mthca_profile *request, + struct mthca_dev_lim *dev_lim, + struct mthca_init_hca_param *init_hca) +{ + struct mthca_resource { + u64 size; + u64 start; + int type; + int num; + int log_num; + }; + + u64 mem_base, mem_avail; + s64 total_size = 0; + struct mthca_resource *profile; + struct mthca_resource tmp; + int i, j; + + profile = kzalloc(MTHCA_RES_NUM * sizeof *profile, GFP_KERNEL); + if (!profile) + return -ENOMEM; + + profile[MTHCA_RES_QP].size = dev_lim->qpc_entry_sz; + profile[MTHCA_RES_EEC].size = dev_lim->eec_entry_sz; + profile[MTHCA_RES_SRQ].size = dev_lim->srq_entry_sz; + profile[MTHCA_RES_CQ].size = dev_lim->cqc_entry_sz; + profile[MTHCA_RES_EQP].size = dev_lim->eqpc_entry_sz; + profile[MTHCA_RES_EEEC].size = dev_lim->eeec_entry_sz; + profile[MTHCA_RES_EQ].size = dev_lim->eqc_entry_sz; + profile[MTHCA_RES_RDB].size = MTHCA_RDB_ENTRY_SIZE; + profile[MTHCA_RES_MCG].size = MTHCA_MGM_ENTRY_SIZE; + profile[MTHCA_RES_MPT].size = dev_lim->mpt_entry_sz; + profile[MTHCA_RES_MTT].size = dev->limits.mtt_seg_size; + profile[MTHCA_RES_UAR].size = dev_lim->uar_scratch_entry_sz; + profile[MTHCA_RES_UDAV].size = MTHCA_AV_SIZE; + profile[MTHCA_RES_UARC].size = request->uarc_size; + + profile[MTHCA_RES_QP].num = request->num_qp; + profile[MTHCA_RES_SRQ].num = request->num_srq; + profile[MTHCA_RES_EQP].num = request->num_qp; + profile[MTHCA_RES_RDB].num = request->num_qp * request->rdb_per_qp; + profile[MTHCA_RES_CQ].num = request->num_cq; + profile[MTHCA_RES_EQ].num = MTHCA_NUM_EQS; + profile[MTHCA_RES_MCG].num = request->num_mcg; + profile[MTHCA_RES_MPT].num = request->num_mpt; + profile[MTHCA_RES_MTT].num = request->num_mtt; + profile[MTHCA_RES_UAR].num = request->num_uar; + profile[MTHCA_RES_UARC].num = request->num_uar; + profile[MTHCA_RES_UDAV].num = request->num_udav; + + for (i = 0; i < MTHCA_RES_NUM; ++i) { + profile[i].type = i; + profile[i].log_num = max(ffs(profile[i].num) - 1, 0); + profile[i].size *= profile[i].num; + if (mthca_is_memfree(dev)) + profile[i].size = max(profile[i].size, (u64) PAGE_SIZE); + } + + if (mthca_is_memfree(dev)) { + mem_base = 0; + mem_avail = dev_lim->hca.arbel.max_icm_sz; + } else { + mem_base = dev->ddr_start; + mem_avail = dev->fw.tavor.fw_start - dev->ddr_start; + } + + /* + * Sort the resources in decreasing order of size. 
Since they + * all have sizes that are powers of 2, we'll be able to keep + * resources aligned to their size and pack them without gaps + * using the sorted order. + */ + for (i = MTHCA_RES_NUM; i > 0; --i) + for (j = 1; j < i; ++j) { + if (profile[j].size > profile[j - 1].size) { + tmp = profile[j]; + profile[j] = profile[j - 1]; + profile[j - 1] = tmp; + } + } + + for (i = 0; i < MTHCA_RES_NUM; ++i) { + if (profile[i].size) { + profile[i].start = mem_base + total_size; + total_size += profile[i].size; + } + if (total_size > mem_avail) { + mthca_err(dev, "Profile requires 0x%llx bytes; " + "won't fit in 0x%llx bytes of context memory.\n", + (unsigned long long) total_size, + (unsigned long long) mem_avail); + kfree(profile); + return -ENOMEM; + } + + if (profile[i].size) + mthca_dbg(dev, "profile[%2d]--%2d/%2d @ 0x%16llx " + "(size 0x%8llx)\n", + i, profile[i].type, profile[i].log_num, + (unsigned long long) profile[i].start, + (unsigned long long) profile[i].size); + } + + if (mthca_is_memfree(dev)) + mthca_dbg(dev, "HCA context memory: reserving %d KB\n", + (int) (total_size >> 10)); + else + mthca_dbg(dev, "HCA memory: allocated %d KB/%d KB (%d KB free)\n", + (int) (total_size >> 10), (int) (mem_avail >> 10), + (int) ((mem_avail - total_size) >> 10)); + + for (i = 0; i < MTHCA_RES_NUM; ++i) { + switch (profile[i].type) { + case MTHCA_RES_QP: + dev->limits.num_qps = profile[i].num; + init_hca->qpc_base = profile[i].start; + init_hca->log_num_qps = profile[i].log_num; + break; + case MTHCA_RES_EEC: + dev->limits.num_eecs = profile[i].num; + init_hca->eec_base = profile[i].start; + init_hca->log_num_eecs = profile[i].log_num; + break; + case MTHCA_RES_SRQ: + dev->limits.num_srqs = profile[i].num; + init_hca->srqc_base = profile[i].start; + init_hca->log_num_srqs = profile[i].log_num; + break; + case MTHCA_RES_CQ: + dev->limits.num_cqs = profile[i].num; + init_hca->cqc_base = profile[i].start; + init_hca->log_num_cqs = profile[i].log_num; + break; + case MTHCA_RES_EQP: + init_hca->eqpc_base = profile[i].start; + break; + case MTHCA_RES_EEEC: + init_hca->eeec_base = profile[i].start; + break; + case MTHCA_RES_EQ: + dev->limits.num_eqs = profile[i].num; + init_hca->eqc_base = profile[i].start; + init_hca->log_num_eqs = profile[i].log_num; + break; + case MTHCA_RES_RDB: + for (dev->qp_table.rdb_shift = 0; + request->num_qp << dev->qp_table.rdb_shift < profile[i].num; + ++dev->qp_table.rdb_shift) + ; /* nothing */ + dev->qp_table.rdb_base = (u32) profile[i].start; + init_hca->rdb_base = profile[i].start; + break; + case MTHCA_RES_MCG: + dev->limits.num_mgms = profile[i].num >> 1; + dev->limits.num_amgms = profile[i].num >> 1; + init_hca->mc_base = profile[i].start; + init_hca->log_mc_entry_sz = ffs(MTHCA_MGM_ENTRY_SIZE) - 1; + init_hca->log_mc_table_sz = profile[i].log_num; + init_hca->mc_hash_sz = 1 << (profile[i].log_num - 1); + break; + case MTHCA_RES_MPT: + dev->limits.num_mpts = profile[i].num; + dev->mr_table.mpt_base = profile[i].start; + init_hca->mpt_base = profile[i].start; + init_hca->log_mpt_sz = profile[i].log_num; + break; + case MTHCA_RES_MTT: + dev->limits.num_mtt_segs = profile[i].num; + dev->mr_table.mtt_base = profile[i].start; + init_hca->mtt_base = profile[i].start; + init_hca->mtt_seg_sz = ffs(dev->limits.mtt_seg_size) - 7; + break; + case MTHCA_RES_UAR: + dev->limits.num_uars = profile[i].num; + init_hca->uar_scratch_base = profile[i].start; + break; + case MTHCA_RES_UDAV: + dev->av_table.ddr_av_base = profile[i].start; + dev->av_table.num_ddr_avs = profile[i].num; + 
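/*
 * Editor's illustrative sketch, not part of the patch: the layout strategy
 * used by mthca_make_profile() above.  Each resource size is rounded up to
 * a power of two, the resources are sorted in decreasing order of size, and
 * they are then packed back to back from the context memory base; because
 * every size is a power of two and larger blocks come first, each start
 * offset is automatically aligned to that resource's size.  The resource
 * names and sizes below are made-up example values.
 */
#include <stdio.h>
#include <stdlib.h>

struct res {
	const char *name;
	unsigned long long size;   /* power of two */
	unsigned long long start;
};

static int cmp_desc(const void *a, const void *b)
{
	const struct res *x = a, *y = b;

	/* sort by decreasing size */
	return (x->size < y->size) - (x->size > y->size);
}

int main(void)
{
	struct res r[] = {
		{ "QP",  1ULL << 20 }, { "CQ",  1ULL << 16 },
		{ "MPT", 1ULL << 18 }, { "MTT", 1ULL << 22 },
	};
	unsigned long long base = 0, total = 0;
	size_t i, n = sizeof(r) / sizeof(r[0]);

	qsort(r, n, sizeof(r[0]), cmp_desc);
	for (i = 0; i < n; ++i) {
		r[i].start = base + total;
		total += r[i].size;
		printf("%-3s at 0x%08llx (size 0x%llx, naturally aligned: %s)\n",
		       r[i].name, r[i].start, r[i].size,
		       r[i].start % r[i].size ? "no" : "yes");
	}
	printf("total context memory: 0x%llx\n", total);
	return 0;
}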
break; + case MTHCA_RES_UARC: + dev->uar_table.uarc_size = request->uarc_size; + dev->uar_table.uarc_base = profile[i].start; + init_hca->uarc_base = profile[i].start; + init_hca->log_uarc_sz = ffs(request->uarc_size) - 13; + init_hca->log_uar_sz = ffs(request->num_uar) - 1; + break; + default: + break; + } + } + + /* + * PDs don't take any HCA memory, but we assign them as part + * of the HCA profile anyway. + */ + dev->limits.num_pds = MTHCA_NUM_PDS; + + if (dev->mthca_flags & MTHCA_FLAG_SINAI_OPT && + init_hca->log_mpt_sz > 23) { + mthca_warn(dev, "MPT table too large (requested size 2^%d >= 2^24)\n", + init_hca->log_mpt_sz); + mthca_warn(dev, "Disabling memory key throughput optimization.\n"); + dev->mthca_flags &= ~MTHCA_FLAG_SINAI_OPT; + } + + /* + * For Tavor, FMRs use ioremapped PCI memory. For 32 bit + * systems it may use too much vmalloc space to map all MTT + * memory, so we reserve some MTTs for FMR access, taking them + * out of the MR pool. They don't use additional memory, but + * we assign them as part of the HCA profile anyway. + */ + if (mthca_is_memfree(dev) || BITS_PER_LONG == 64) + dev->limits.fmr_reserved_mtts = 0; + else + dev->limits.fmr_reserved_mtts = request->fmr_reserved_mtts; + + kfree(profile); + return total_size; +} diff --git a/kernel/drivers/infiniband/hw/mthca/mthca_profile.h b/kernel/drivers/infiniband/hw/mthca/mthca_profile.h new file mode 100644 index 000000000..62b009cc8 --- /dev/null +++ b/kernel/drivers/infiniband/hw/mthca/mthca_profile.h @@ -0,0 +1,59 @@ +/* + * Copyright (c) 2004, 2005 Topspin Communications. All rights reserved. + * Copyright (c) 2005 Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#ifndef MTHCA_PROFILE_H +#define MTHCA_PROFILE_H + +#include "mthca_dev.h" +#include "mthca_cmd.h" + +struct mthca_profile { + int num_qp; + int rdb_per_qp; + int num_srq; + int num_cq; + int num_mcg; + int num_mpt; + int num_mtt; + int num_udav; + int num_uar; + int uarc_size; + int fmr_reserved_mtts; +}; + +s64 mthca_make_profile(struct mthca_dev *mdev, + struct mthca_profile *request, + struct mthca_dev_lim *dev_lim, + struct mthca_init_hca_param *init_hca); + +#endif /* MTHCA_PROFILE_H */ diff --git a/kernel/drivers/infiniband/hw/mthca/mthca_provider.c b/kernel/drivers/infiniband/hw/mthca/mthca_provider.c new file mode 100644 index 000000000..415f8e1a5 --- /dev/null +++ b/kernel/drivers/infiniband/hw/mthca/mthca_provider.c @@ -0,0 +1,1375 @@ +/* + * Copyright (c) 2004, 2005 Topspin Communications. All rights reserved. + * Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved. + * Copyright (c) 2005, 2006 Cisco Systems. All rights reserved. + * Copyright (c) 2005 Mellanox Technologies. All rights reserved. + * Copyright (c) 2004 Voltaire, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include +#include +#include + +#include +#include +#include +#include +#include + +#include "mthca_dev.h" +#include "mthca_cmd.h" +#include "mthca_user.h" +#include "mthca_memfree.h" + +static void init_query_mad(struct ib_smp *mad) +{ + mad->base_version = 1; + mad->mgmt_class = IB_MGMT_CLASS_SUBN_LID_ROUTED; + mad->class_version = 1; + mad->method = IB_MGMT_METHOD_GET; +} + +static int mthca_query_device(struct ib_device *ibdev, + struct ib_device_attr *props) +{ + struct ib_smp *in_mad = NULL; + struct ib_smp *out_mad = NULL; + int err = -ENOMEM; + struct mthca_dev *mdev = to_mdev(ibdev); + + in_mad = kzalloc(sizeof *in_mad, GFP_KERNEL); + out_mad = kmalloc(sizeof *out_mad, GFP_KERNEL); + if (!in_mad || !out_mad) + goto out; + + memset(props, 0, sizeof *props); + + props->fw_ver = mdev->fw_ver; + + init_query_mad(in_mad); + in_mad->attr_id = IB_SMP_ATTR_NODE_INFO; + + err = mthca_MAD_IFC(mdev, 1, 1, + 1, NULL, NULL, in_mad, out_mad); + if (err) + goto out; + + props->device_cap_flags = mdev->device_cap_flags; + props->vendor_id = be32_to_cpup((__be32 *) (out_mad->data + 36)) & + 0xffffff; + props->vendor_part_id = be16_to_cpup((__be16 *) (out_mad->data + 30)); + props->hw_ver = be32_to_cpup((__be32 *) (out_mad->data + 32)); + memcpy(&props->sys_image_guid, out_mad->data + 4, 8); + + props->max_mr_size = ~0ull; + props->page_size_cap = mdev->limits.page_size_cap; + props->max_qp = mdev->limits.num_qps - mdev->limits.reserved_qps; + props->max_qp_wr = mdev->limits.max_wqes; + props->max_sge = mdev->limits.max_sg; + props->max_cq = mdev->limits.num_cqs - mdev->limits.reserved_cqs; + props->max_cqe = mdev->limits.max_cqes; + props->max_mr = mdev->limits.num_mpts - mdev->limits.reserved_mrws; + props->max_pd = mdev->limits.num_pds - mdev->limits.reserved_pds; + props->max_qp_rd_atom = 1 << mdev->qp_table.rdb_shift; + props->max_qp_init_rd_atom = mdev->limits.max_qp_init_rdma; + props->max_res_rd_atom = props->max_qp_rd_atom * props->max_qp; + props->max_srq = mdev->limits.num_srqs - mdev->limits.reserved_srqs; + props->max_srq_wr = mdev->limits.max_srq_wqes; + props->max_srq_sge = mdev->limits.max_srq_sge; + props->local_ca_ack_delay = mdev->limits.local_ca_ack_delay; + props->atomic_cap = mdev->limits.flags & DEV_LIM_FLAG_ATOMIC ? + IB_ATOMIC_HCA : IB_ATOMIC_NONE; + props->max_pkeys = mdev->limits.pkey_table_len; + props->max_mcast_grp = mdev->limits.num_mgms + mdev->limits.num_amgms; + props->max_mcast_qp_attach = MTHCA_QP_PER_MGM; + props->max_total_mcast_qp_attach = props->max_mcast_qp_attach * + props->max_mcast_grp; + /* + * If Sinai memory key optimization is being used, then only + * the 8-bit key portion will change. For other HCAs, the + * unused index bits will also be used for FMR remapping. 
+ */ + if (mdev->mthca_flags & MTHCA_FLAG_SINAI_OPT) + props->max_map_per_fmr = 255; + else + props->max_map_per_fmr = + (1 << (32 - ilog2(mdev->limits.num_mpts))) - 1; + + err = 0; + out: + kfree(in_mad); + kfree(out_mad); + return err; +} + +static int mthca_query_port(struct ib_device *ibdev, + u8 port, struct ib_port_attr *props) +{ + struct ib_smp *in_mad = NULL; + struct ib_smp *out_mad = NULL; + int err = -ENOMEM; + + in_mad = kzalloc(sizeof *in_mad, GFP_KERNEL); + out_mad = kmalloc(sizeof *out_mad, GFP_KERNEL); + if (!in_mad || !out_mad) + goto out; + + memset(props, 0, sizeof *props); + + init_query_mad(in_mad); + in_mad->attr_id = IB_SMP_ATTR_PORT_INFO; + in_mad->attr_mod = cpu_to_be32(port); + + err = mthca_MAD_IFC(to_mdev(ibdev), 1, 1, + port, NULL, NULL, in_mad, out_mad); + if (err) + goto out; + + props->lid = be16_to_cpup((__be16 *) (out_mad->data + 16)); + props->lmc = out_mad->data[34] & 0x7; + props->sm_lid = be16_to_cpup((__be16 *) (out_mad->data + 18)); + props->sm_sl = out_mad->data[36] & 0xf; + props->state = out_mad->data[32] & 0xf; + props->phys_state = out_mad->data[33] >> 4; + props->port_cap_flags = be32_to_cpup((__be32 *) (out_mad->data + 20)); + props->gid_tbl_len = to_mdev(ibdev)->limits.gid_table_len; + props->max_msg_sz = 0x80000000; + props->pkey_tbl_len = to_mdev(ibdev)->limits.pkey_table_len; + props->bad_pkey_cntr = be16_to_cpup((__be16 *) (out_mad->data + 46)); + props->qkey_viol_cntr = be16_to_cpup((__be16 *) (out_mad->data + 48)); + props->active_width = out_mad->data[31] & 0xf; + props->active_speed = out_mad->data[35] >> 4; + props->max_mtu = out_mad->data[41] & 0xf; + props->active_mtu = out_mad->data[36] >> 4; + props->subnet_timeout = out_mad->data[51] & 0x1f; + props->max_vl_num = out_mad->data[37] >> 4; + props->init_type_reply = out_mad->data[41] >> 4; + + out: + kfree(in_mad); + kfree(out_mad); + return err; +} + +static int mthca_modify_device(struct ib_device *ibdev, + int mask, + struct ib_device_modify *props) +{ + if (mask & ~IB_DEVICE_MODIFY_NODE_DESC) + return -EOPNOTSUPP; + + if (mask & IB_DEVICE_MODIFY_NODE_DESC) { + if (mutex_lock_interruptible(&to_mdev(ibdev)->cap_mask_mutex)) + return -ERESTARTSYS; + memcpy(ibdev->node_desc, props->node_desc, 64); + mutex_unlock(&to_mdev(ibdev)->cap_mask_mutex); + } + + return 0; +} + +static int mthca_modify_port(struct ib_device *ibdev, + u8 port, int port_modify_mask, + struct ib_port_modify *props) +{ + struct mthca_set_ib_param set_ib; + struct ib_port_attr attr; + int err; + + if (mutex_lock_interruptible(&to_mdev(ibdev)->cap_mask_mutex)) + return -ERESTARTSYS; + + err = mthca_query_port(ibdev, port, &attr); + if (err) + goto out; + + set_ib.set_si_guid = 0; + set_ib.reset_qkey_viol = !!(port_modify_mask & IB_PORT_RESET_QKEY_CNTR); + + set_ib.cap_mask = (attr.port_cap_flags | props->set_port_cap_mask) & + ~props->clr_port_cap_mask; + + err = mthca_SET_IB(to_mdev(ibdev), &set_ib, port); + if (err) + goto out; +out: + mutex_unlock(&to_mdev(ibdev)->cap_mask_mutex); + return err; +} + +static int mthca_query_pkey(struct ib_device *ibdev, + u8 port, u16 index, u16 *pkey) +{ + struct ib_smp *in_mad = NULL; + struct ib_smp *out_mad = NULL; + int err = -ENOMEM; + + in_mad = kzalloc(sizeof *in_mad, GFP_KERNEL); + out_mad = kmalloc(sizeof *out_mad, GFP_KERNEL); + if (!in_mad || !out_mad) + goto out; + + init_query_mad(in_mad); + in_mad->attr_id = IB_SMP_ATTR_PKEY_TABLE; + in_mad->attr_mod = cpu_to_be32(index / 32); + + err = mthca_MAD_IFC(to_mdev(ibdev), 1, 1, + port, NULL, NULL, in_mad, out_mad); + 
if (err) + goto out; + + *pkey = be16_to_cpu(((__be16 *) out_mad->data)[index % 32]); + + out: + kfree(in_mad); + kfree(out_mad); + return err; +} + +static int mthca_query_gid(struct ib_device *ibdev, u8 port, + int index, union ib_gid *gid) +{ + struct ib_smp *in_mad = NULL; + struct ib_smp *out_mad = NULL; + int err = -ENOMEM; + + in_mad = kzalloc(sizeof *in_mad, GFP_KERNEL); + out_mad = kmalloc(sizeof *out_mad, GFP_KERNEL); + if (!in_mad || !out_mad) + goto out; + + init_query_mad(in_mad); + in_mad->attr_id = IB_SMP_ATTR_PORT_INFO; + in_mad->attr_mod = cpu_to_be32(port); + + err = mthca_MAD_IFC(to_mdev(ibdev), 1, 1, + port, NULL, NULL, in_mad, out_mad); + if (err) + goto out; + + memcpy(gid->raw, out_mad->data + 8, 8); + + init_query_mad(in_mad); + in_mad->attr_id = IB_SMP_ATTR_GUID_INFO; + in_mad->attr_mod = cpu_to_be32(index / 8); + + err = mthca_MAD_IFC(to_mdev(ibdev), 1, 1, + port, NULL, NULL, in_mad, out_mad); + if (err) + goto out; + + memcpy(gid->raw + 8, out_mad->data + (index % 8) * 8, 8); + + out: + kfree(in_mad); + kfree(out_mad); + return err; +} + +static struct ib_ucontext *mthca_alloc_ucontext(struct ib_device *ibdev, + struct ib_udata *udata) +{ + struct mthca_alloc_ucontext_resp uresp; + struct mthca_ucontext *context; + int err; + + if (!(to_mdev(ibdev)->active)) + return ERR_PTR(-EAGAIN); + + memset(&uresp, 0, sizeof uresp); + + uresp.qp_tab_size = to_mdev(ibdev)->limits.num_qps; + if (mthca_is_memfree(to_mdev(ibdev))) + uresp.uarc_size = to_mdev(ibdev)->uar_table.uarc_size; + else + uresp.uarc_size = 0; + + context = kmalloc(sizeof *context, GFP_KERNEL); + if (!context) + return ERR_PTR(-ENOMEM); + + err = mthca_uar_alloc(to_mdev(ibdev), &context->uar); + if (err) { + kfree(context); + return ERR_PTR(err); + } + + context->db_tab = mthca_init_user_db_tab(to_mdev(ibdev)); + if (IS_ERR(context->db_tab)) { + err = PTR_ERR(context->db_tab); + mthca_uar_free(to_mdev(ibdev), &context->uar); + kfree(context); + return ERR_PTR(err); + } + + if (ib_copy_to_udata(udata, &uresp, sizeof uresp)) { + mthca_cleanup_user_db_tab(to_mdev(ibdev), &context->uar, context->db_tab); + mthca_uar_free(to_mdev(ibdev), &context->uar); + kfree(context); + return ERR_PTR(-EFAULT); + } + + context->reg_mr_warned = 0; + + return &context->ibucontext; +} + +static int mthca_dealloc_ucontext(struct ib_ucontext *context) +{ + mthca_cleanup_user_db_tab(to_mdev(context->device), &to_mucontext(context)->uar, + to_mucontext(context)->db_tab); + mthca_uar_free(to_mdev(context->device), &to_mucontext(context)->uar); + kfree(to_mucontext(context)); + + return 0; +} + +static int mthca_mmap_uar(struct ib_ucontext *context, + struct vm_area_struct *vma) +{ + if (vma->vm_end - vma->vm_start != PAGE_SIZE) + return -EINVAL; + + vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot); + + if (io_remap_pfn_range(vma, vma->vm_start, + to_mucontext(context)->uar.pfn, + PAGE_SIZE, vma->vm_page_prot)) + return -EAGAIN; + + return 0; +} + +static struct ib_pd *mthca_alloc_pd(struct ib_device *ibdev, + struct ib_ucontext *context, + struct ib_udata *udata) +{ + struct mthca_pd *pd; + int err; + + pd = kmalloc(sizeof *pd, GFP_KERNEL); + if (!pd) + return ERR_PTR(-ENOMEM); + + err = mthca_pd_alloc(to_mdev(ibdev), !context, pd); + if (err) { + kfree(pd); + return ERR_PTR(err); + } + + if (context) { + if (ib_copy_to_udata(udata, &pd->pd_num, sizeof (__u32))) { + mthca_pd_free(to_mdev(ibdev), pd); + kfree(pd); + return ERR_PTR(-EFAULT); + } + } + + return &pd->ibpd; +} + +static int mthca_dealloc_pd(struct ib_pd *pd) +{ + 
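/*
 * Editor's illustrative sketch, not part of the patch: how mthca_query_gid()
 * above assembles a 128-bit GID.  The first 8 bytes (the subnet prefix) are
 * copied from the PortInfo MAD reply, and the last 8 bytes (the port GUID)
 * come from the GuidInfo block index/8, entry index%8.  The byte values and
 * table index below are arbitrary placeholders.
 */
#include <stdio.h>
#include <string.h>

int main(void)
{
	unsigned char portinfo_prefix[8] = { 0xfe, 0x80, 0, 0, 0, 0, 0, 0 };
	unsigned char guid_block[8][8];    /* one GuidInfo block holds 8 GUIDs */
	unsigned char gid[16];
	int index = 3;                     /* GID table index being queried */
	int i;

	memset(guid_block, 0xab, sizeof(guid_block));

	memcpy(gid, portinfo_prefix, 8);            /* out_mad->data + 8       */
	memcpy(gid + 8, guid_block[index % 8], 8);  /* entry (index % 8) * 8   */

	for (i = 0; i < 16; ++i)
		printf("%02x%s", gid[i], i == 15 ? "\n" : ":");
	return 0;
}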
mthca_pd_free(to_mdev(pd->device), to_mpd(pd)); + kfree(pd); + + return 0; +} + +static struct ib_ah *mthca_ah_create(struct ib_pd *pd, + struct ib_ah_attr *ah_attr) +{ + int err; + struct mthca_ah *ah; + + ah = kmalloc(sizeof *ah, GFP_ATOMIC); + if (!ah) + return ERR_PTR(-ENOMEM); + + err = mthca_create_ah(to_mdev(pd->device), to_mpd(pd), ah_attr, ah); + if (err) { + kfree(ah); + return ERR_PTR(err); + } + + return &ah->ibah; +} + +static int mthca_ah_destroy(struct ib_ah *ah) +{ + mthca_destroy_ah(to_mdev(ah->device), to_mah(ah)); + kfree(ah); + + return 0; +} + +static struct ib_srq *mthca_create_srq(struct ib_pd *pd, + struct ib_srq_init_attr *init_attr, + struct ib_udata *udata) +{ + struct mthca_create_srq ucmd; + struct mthca_ucontext *context = NULL; + struct mthca_srq *srq; + int err; + + if (init_attr->srq_type != IB_SRQT_BASIC) + return ERR_PTR(-ENOSYS); + + srq = kmalloc(sizeof *srq, GFP_KERNEL); + if (!srq) + return ERR_PTR(-ENOMEM); + + if (pd->uobject) { + context = to_mucontext(pd->uobject->context); + + if (ib_copy_from_udata(&ucmd, udata, sizeof ucmd)) { + err = -EFAULT; + goto err_free; + } + + err = mthca_map_user_db(to_mdev(pd->device), &context->uar, + context->db_tab, ucmd.db_index, + ucmd.db_page); + + if (err) + goto err_free; + + srq->mr.ibmr.lkey = ucmd.lkey; + srq->db_index = ucmd.db_index; + } + + err = mthca_alloc_srq(to_mdev(pd->device), to_mpd(pd), + &init_attr->attr, srq); + + if (err && pd->uobject) + mthca_unmap_user_db(to_mdev(pd->device), &context->uar, + context->db_tab, ucmd.db_index); + + if (err) + goto err_free; + + if (context && ib_copy_to_udata(udata, &srq->srqn, sizeof (__u32))) { + mthca_free_srq(to_mdev(pd->device), srq); + err = -EFAULT; + goto err_free; + } + + return &srq->ibsrq; + +err_free: + kfree(srq); + + return ERR_PTR(err); +} + +static int mthca_destroy_srq(struct ib_srq *srq) +{ + struct mthca_ucontext *context; + + if (srq->uobject) { + context = to_mucontext(srq->uobject->context); + + mthca_unmap_user_db(to_mdev(srq->device), &context->uar, + context->db_tab, to_msrq(srq)->db_index); + } + + mthca_free_srq(to_mdev(srq->device), to_msrq(srq)); + kfree(srq); + + return 0; +} + +static struct ib_qp *mthca_create_qp(struct ib_pd *pd, + struct ib_qp_init_attr *init_attr, + struct ib_udata *udata) +{ + struct mthca_create_qp ucmd; + struct mthca_qp *qp; + int err; + + if (init_attr->create_flags) + return ERR_PTR(-EINVAL); + + switch (init_attr->qp_type) { + case IB_QPT_RC: + case IB_QPT_UC: + case IB_QPT_UD: + { + struct mthca_ucontext *context; + + qp = kmalloc(sizeof *qp, GFP_KERNEL); + if (!qp) + return ERR_PTR(-ENOMEM); + + if (pd->uobject) { + context = to_mucontext(pd->uobject->context); + + if (ib_copy_from_udata(&ucmd, udata, sizeof ucmd)) { + kfree(qp); + return ERR_PTR(-EFAULT); + } + + err = mthca_map_user_db(to_mdev(pd->device), &context->uar, + context->db_tab, + ucmd.sq_db_index, ucmd.sq_db_page); + if (err) { + kfree(qp); + return ERR_PTR(err); + } + + err = mthca_map_user_db(to_mdev(pd->device), &context->uar, + context->db_tab, + ucmd.rq_db_index, ucmd.rq_db_page); + if (err) { + mthca_unmap_user_db(to_mdev(pd->device), + &context->uar, + context->db_tab, + ucmd.sq_db_index); + kfree(qp); + return ERR_PTR(err); + } + + qp->mr.ibmr.lkey = ucmd.lkey; + qp->sq.db_index = ucmd.sq_db_index; + qp->rq.db_index = ucmd.rq_db_index; + } + + err = mthca_alloc_qp(to_mdev(pd->device), to_mpd(pd), + to_mcq(init_attr->send_cq), + to_mcq(init_attr->recv_cq), + init_attr->qp_type, init_attr->sq_sig_type, + &init_attr->cap, qp); + + 
if (err && pd->uobject) { + context = to_mucontext(pd->uobject->context); + + mthca_unmap_user_db(to_mdev(pd->device), + &context->uar, + context->db_tab, + ucmd.sq_db_index); + mthca_unmap_user_db(to_mdev(pd->device), + &context->uar, + context->db_tab, + ucmd.rq_db_index); + } + + qp->ibqp.qp_num = qp->qpn; + break; + } + case IB_QPT_SMI: + case IB_QPT_GSI: + { + /* Don't allow userspace to create special QPs */ + if (pd->uobject) + return ERR_PTR(-EINVAL); + + qp = kmalloc(sizeof (struct mthca_sqp), GFP_KERNEL); + if (!qp) + return ERR_PTR(-ENOMEM); + + qp->ibqp.qp_num = init_attr->qp_type == IB_QPT_SMI ? 0 : 1; + + err = mthca_alloc_sqp(to_mdev(pd->device), to_mpd(pd), + to_mcq(init_attr->send_cq), + to_mcq(init_attr->recv_cq), + init_attr->sq_sig_type, &init_attr->cap, + qp->ibqp.qp_num, init_attr->port_num, + to_msqp(qp)); + break; + } + default: + /* Don't support raw QPs */ + return ERR_PTR(-ENOSYS); + } + + if (err) { + kfree(qp); + return ERR_PTR(err); + } + + init_attr->cap.max_send_wr = qp->sq.max; + init_attr->cap.max_recv_wr = qp->rq.max; + init_attr->cap.max_send_sge = qp->sq.max_gs; + init_attr->cap.max_recv_sge = qp->rq.max_gs; + init_attr->cap.max_inline_data = qp->max_inline_data; + + return &qp->ibqp; +} + +static int mthca_destroy_qp(struct ib_qp *qp) +{ + if (qp->uobject) { + mthca_unmap_user_db(to_mdev(qp->device), + &to_mucontext(qp->uobject->context)->uar, + to_mucontext(qp->uobject->context)->db_tab, + to_mqp(qp)->sq.db_index); + mthca_unmap_user_db(to_mdev(qp->device), + &to_mucontext(qp->uobject->context)->uar, + to_mucontext(qp->uobject->context)->db_tab, + to_mqp(qp)->rq.db_index); + } + mthca_free_qp(to_mdev(qp->device), to_mqp(qp)); + kfree(qp); + return 0; +} + +static struct ib_cq *mthca_create_cq(struct ib_device *ibdev, int entries, + int comp_vector, + struct ib_ucontext *context, + struct ib_udata *udata) +{ + struct mthca_create_cq ucmd; + struct mthca_cq *cq; + int nent; + int err; + + if (entries < 1 || entries > to_mdev(ibdev)->limits.max_cqes) + return ERR_PTR(-EINVAL); + + if (context) { + if (ib_copy_from_udata(&ucmd, udata, sizeof ucmd)) + return ERR_PTR(-EFAULT); + + err = mthca_map_user_db(to_mdev(ibdev), &to_mucontext(context)->uar, + to_mucontext(context)->db_tab, + ucmd.set_db_index, ucmd.set_db_page); + if (err) + return ERR_PTR(err); + + err = mthca_map_user_db(to_mdev(ibdev), &to_mucontext(context)->uar, + to_mucontext(context)->db_tab, + ucmd.arm_db_index, ucmd.arm_db_page); + if (err) + goto err_unmap_set; + } + + cq = kmalloc(sizeof *cq, GFP_KERNEL); + if (!cq) { + err = -ENOMEM; + goto err_unmap_arm; + } + + if (context) { + cq->buf.mr.ibmr.lkey = ucmd.lkey; + cq->set_ci_db_index = ucmd.set_db_index; + cq->arm_db_index = ucmd.arm_db_index; + } + + for (nent = 1; nent <= entries; nent <<= 1) + ; /* nothing */ + + err = mthca_init_cq(to_mdev(ibdev), nent, + context ? to_mucontext(context) : NULL, + context ? 
ucmd.pdn : to_mdev(ibdev)->driver_pd.pd_num, + cq); + if (err) + goto err_free; + + if (context && ib_copy_to_udata(udata, &cq->cqn, sizeof (__u32))) { + mthca_free_cq(to_mdev(ibdev), cq); + err = -EFAULT; + goto err_free; + } + + cq->resize_buf = NULL; + + return &cq->ibcq; + +err_free: + kfree(cq); + +err_unmap_arm: + if (context) + mthca_unmap_user_db(to_mdev(ibdev), &to_mucontext(context)->uar, + to_mucontext(context)->db_tab, ucmd.arm_db_index); + +err_unmap_set: + if (context) + mthca_unmap_user_db(to_mdev(ibdev), &to_mucontext(context)->uar, + to_mucontext(context)->db_tab, ucmd.set_db_index); + + return ERR_PTR(err); +} + +static int mthca_alloc_resize_buf(struct mthca_dev *dev, struct mthca_cq *cq, + int entries) +{ + int ret; + + spin_lock_irq(&cq->lock); + if (cq->resize_buf) { + ret = -EBUSY; + goto unlock; + } + + cq->resize_buf = kmalloc(sizeof *cq->resize_buf, GFP_ATOMIC); + if (!cq->resize_buf) { + ret = -ENOMEM; + goto unlock; + } + + cq->resize_buf->state = CQ_RESIZE_ALLOC; + + ret = 0; + +unlock: + spin_unlock_irq(&cq->lock); + + if (ret) + return ret; + + ret = mthca_alloc_cq_buf(dev, &cq->resize_buf->buf, entries); + if (ret) { + spin_lock_irq(&cq->lock); + kfree(cq->resize_buf); + cq->resize_buf = NULL; + spin_unlock_irq(&cq->lock); + return ret; + } + + cq->resize_buf->cqe = entries - 1; + + spin_lock_irq(&cq->lock); + cq->resize_buf->state = CQ_RESIZE_READY; + spin_unlock_irq(&cq->lock); + + return 0; +} + +static int mthca_resize_cq(struct ib_cq *ibcq, int entries, struct ib_udata *udata) +{ + struct mthca_dev *dev = to_mdev(ibcq->device); + struct mthca_cq *cq = to_mcq(ibcq); + struct mthca_resize_cq ucmd; + u32 lkey; + int ret; + + if (entries < 1 || entries > dev->limits.max_cqes) + return -EINVAL; + + mutex_lock(&cq->mutex); + + entries = roundup_pow_of_two(entries + 1); + if (entries == ibcq->cqe + 1) { + ret = 0; + goto out; + } + + if (cq->is_kernel) { + ret = mthca_alloc_resize_buf(dev, cq, entries); + if (ret) + goto out; + lkey = cq->resize_buf->buf.mr.ibmr.lkey; + } else { + if (ib_copy_from_udata(&ucmd, udata, sizeof ucmd)) { + ret = -EFAULT; + goto out; + } + lkey = ucmd.lkey; + } + + ret = mthca_RESIZE_CQ(dev, cq->cqn, lkey, ilog2(entries)); + + if (ret) { + if (cq->resize_buf) { + mthca_free_cq_buf(dev, &cq->resize_buf->buf, + cq->resize_buf->cqe); + kfree(cq->resize_buf); + spin_lock_irq(&cq->lock); + cq->resize_buf = NULL; + spin_unlock_irq(&cq->lock); + } + goto out; + } + + if (cq->is_kernel) { + struct mthca_cq_buf tbuf; + int tcqe; + + spin_lock_irq(&cq->lock); + if (cq->resize_buf->state == CQ_RESIZE_READY) { + mthca_cq_resize_copy_cqes(cq); + tbuf = cq->buf; + tcqe = cq->ibcq.cqe; + cq->buf = cq->resize_buf->buf; + cq->ibcq.cqe = cq->resize_buf->cqe; + } else { + tbuf = cq->resize_buf->buf; + tcqe = cq->resize_buf->cqe; + } + + kfree(cq->resize_buf); + cq->resize_buf = NULL; + spin_unlock_irq(&cq->lock); + + mthca_free_cq_buf(dev, &tbuf, tcqe); + } else + ibcq->cqe = entries - 1; + +out: + mutex_unlock(&cq->mutex); + + return ret; +} + +static int mthca_destroy_cq(struct ib_cq *cq) +{ + if (cq->uobject) { + mthca_unmap_user_db(to_mdev(cq->device), + &to_mucontext(cq->uobject->context)->uar, + to_mucontext(cq->uobject->context)->db_tab, + to_mcq(cq)->arm_db_index); + mthca_unmap_user_db(to_mdev(cq->device), + &to_mucontext(cq->uobject->context)->uar, + to_mucontext(cq->uobject->context)->db_tab, + to_mcq(cq)->set_ci_db_index); + } + mthca_free_cq(to_mdev(cq->device), to_mcq(cq)); + kfree(cq); + + return 0; +} + +static inline u32 
convert_access(int acc) +{ + return (acc & IB_ACCESS_REMOTE_ATOMIC ? MTHCA_MPT_FLAG_ATOMIC : 0) | + (acc & IB_ACCESS_REMOTE_WRITE ? MTHCA_MPT_FLAG_REMOTE_WRITE : 0) | + (acc & IB_ACCESS_REMOTE_READ ? MTHCA_MPT_FLAG_REMOTE_READ : 0) | + (acc & IB_ACCESS_LOCAL_WRITE ? MTHCA_MPT_FLAG_LOCAL_WRITE : 0) | + MTHCA_MPT_FLAG_LOCAL_READ; +} + +static struct ib_mr *mthca_get_dma_mr(struct ib_pd *pd, int acc) +{ + struct mthca_mr *mr; + int err; + + mr = kmalloc(sizeof *mr, GFP_KERNEL); + if (!mr) + return ERR_PTR(-ENOMEM); + + err = mthca_mr_alloc_notrans(to_mdev(pd->device), + to_mpd(pd)->pd_num, + convert_access(acc), mr); + + if (err) { + kfree(mr); + return ERR_PTR(err); + } + + mr->umem = NULL; + + return &mr->ibmr; +} + +static struct ib_mr *mthca_reg_phys_mr(struct ib_pd *pd, + struct ib_phys_buf *buffer_list, + int num_phys_buf, + int acc, + u64 *iova_start) +{ + struct mthca_mr *mr; + u64 *page_list; + u64 total_size; + unsigned long mask; + int shift; + int npages; + int err; + int i, j, n; + + mask = buffer_list[0].addr ^ *iova_start; + total_size = 0; + for (i = 0; i < num_phys_buf; ++i) { + if (i != 0) + mask |= buffer_list[i].addr; + if (i != num_phys_buf - 1) + mask |= buffer_list[i].addr + buffer_list[i].size; + + total_size += buffer_list[i].size; + } + + if (mask & ~PAGE_MASK) + return ERR_PTR(-EINVAL); + + shift = __ffs(mask | 1 << 31); + + buffer_list[0].size += buffer_list[0].addr & ((1ULL << shift) - 1); + buffer_list[0].addr &= ~0ull << shift; + + mr = kmalloc(sizeof *mr, GFP_KERNEL); + if (!mr) + return ERR_PTR(-ENOMEM); + + npages = 0; + for (i = 0; i < num_phys_buf; ++i) + npages += (buffer_list[i].size + (1ULL << shift) - 1) >> shift; + + if (!npages) + return &mr->ibmr; + + page_list = kmalloc(npages * sizeof *page_list, GFP_KERNEL); + if (!page_list) { + kfree(mr); + return ERR_PTR(-ENOMEM); + } + + n = 0; + for (i = 0; i < num_phys_buf; ++i) + for (j = 0; + j < (buffer_list[i].size + (1ULL << shift) - 1) >> shift; + ++j) + page_list[n++] = buffer_list[i].addr + ((u64) j << shift); + + mthca_dbg(to_mdev(pd->device), "Registering memory at %llx (iova %llx) " + "in PD %x; shift %d, npages %d.\n", + (unsigned long long) buffer_list[0].addr, + (unsigned long long) *iova_start, + to_mpd(pd)->pd_num, + shift, npages); + + err = mthca_mr_alloc_phys(to_mdev(pd->device), + to_mpd(pd)->pd_num, + page_list, shift, npages, + *iova_start, total_size, + convert_access(acc), mr); + + if (err) { + kfree(page_list); + kfree(mr); + return ERR_PTR(err); + } + + kfree(page_list); + mr->umem = NULL; + + return &mr->ibmr; +} + +static struct ib_mr *mthca_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, + u64 virt, int acc, struct ib_udata *udata) +{ + struct mthca_dev *dev = to_mdev(pd->device); + struct scatterlist *sg; + struct mthca_mr *mr; + struct mthca_reg_mr ucmd; + u64 *pages; + int shift, n, len; + int i, k, entry; + int err = 0; + int write_mtt_size; + + if (udata->inlen - sizeof (struct ib_uverbs_cmd_hdr) < sizeof ucmd) { + if (!to_mucontext(pd->uobject->context)->reg_mr_warned) { + mthca_warn(dev, "Process '%s' did not pass in MR attrs.\n", + current->comm); + mthca_warn(dev, " Update libmthca to fix this.\n"); + } + ++to_mucontext(pd->uobject->context)->reg_mr_warned; + ucmd.mr_attrs = 0; + } else if (ib_copy_from_udata(&ucmd, udata, sizeof ucmd)) + return ERR_PTR(-EFAULT); + + mr = kmalloc(sizeof *mr, GFP_KERNEL); + if (!mr) + return ERR_PTR(-ENOMEM); + + mr->umem = ib_umem_get(pd->uobject->context, start, length, acc, + ucmd.mr_attrs & MTHCA_MR_DMASYNC); + + if 
(IS_ERR(mr->umem)) { + err = PTR_ERR(mr->umem); + goto err; + } + + shift = ffs(mr->umem->page_size) - 1; + n = mr->umem->nmap; + + mr->mtt = mthca_alloc_mtt(dev, n); + if (IS_ERR(mr->mtt)) { + err = PTR_ERR(mr->mtt); + goto err_umem; + } + + pages = (u64 *) __get_free_page(GFP_KERNEL); + if (!pages) { + err = -ENOMEM; + goto err_mtt; + } + + i = n = 0; + + write_mtt_size = min(mthca_write_mtt_size(dev), (int) (PAGE_SIZE / sizeof *pages)); + + for_each_sg(mr->umem->sg_head.sgl, sg, mr->umem->nmap, entry) { + len = sg_dma_len(sg) >> shift; + for (k = 0; k < len; ++k) { + pages[i++] = sg_dma_address(sg) + + mr->umem->page_size * k; + /* + * Be friendly to write_mtt and pass it chunks + * of appropriate size. + */ + if (i == write_mtt_size) { + err = mthca_write_mtt(dev, mr->mtt, n, pages, i); + if (err) + goto mtt_done; + n += i; + i = 0; + } + } + } + + if (i) + err = mthca_write_mtt(dev, mr->mtt, n, pages, i); +mtt_done: + free_page((unsigned long) pages); + if (err) + goto err_mtt; + + err = mthca_mr_alloc(dev, to_mpd(pd)->pd_num, shift, virt, length, + convert_access(acc), mr); + + if (err) + goto err_mtt; + + return &mr->ibmr; + +err_mtt: + mthca_free_mtt(dev, mr->mtt); + +err_umem: + ib_umem_release(mr->umem); + +err: + kfree(mr); + return ERR_PTR(err); +} + +static int mthca_dereg_mr(struct ib_mr *mr) +{ + struct mthca_mr *mmr = to_mmr(mr); + + mthca_free_mr(to_mdev(mr->device), mmr); + if (mmr->umem) + ib_umem_release(mmr->umem); + kfree(mmr); + + return 0; +} + +static struct ib_fmr *mthca_alloc_fmr(struct ib_pd *pd, int mr_access_flags, + struct ib_fmr_attr *fmr_attr) +{ + struct mthca_fmr *fmr; + int err; + + fmr = kmalloc(sizeof *fmr, GFP_KERNEL); + if (!fmr) + return ERR_PTR(-ENOMEM); + + memcpy(&fmr->attr, fmr_attr, sizeof *fmr_attr); + err = mthca_fmr_alloc(to_mdev(pd->device), to_mpd(pd)->pd_num, + convert_access(mr_access_flags), fmr); + + if (err) { + kfree(fmr); + return ERR_PTR(err); + } + + return &fmr->ibmr; +} + +static int mthca_dealloc_fmr(struct ib_fmr *fmr) +{ + struct mthca_fmr *mfmr = to_mfmr(fmr); + int err; + + err = mthca_free_fmr(to_mdev(fmr->device), mfmr); + if (err) + return err; + + kfree(mfmr); + return 0; +} + +static int mthca_unmap_fmr(struct list_head *fmr_list) +{ + struct ib_fmr *fmr; + int err; + struct mthca_dev *mdev = NULL; + + list_for_each_entry(fmr, fmr_list, list) { + if (mdev && to_mdev(fmr->device) != mdev) + return -EINVAL; + mdev = to_mdev(fmr->device); + } + + if (!mdev) + return 0; + + if (mthca_is_memfree(mdev)) { + list_for_each_entry(fmr, fmr_list, list) + mthca_arbel_fmr_unmap(mdev, to_mfmr(fmr)); + + wmb(); + } else + list_for_each_entry(fmr, fmr_list, list) + mthca_tavor_fmr_unmap(mdev, to_mfmr(fmr)); + + err = mthca_SYNC_TPT(mdev); + return err; +} + +static ssize_t show_rev(struct device *device, struct device_attribute *attr, + char *buf) +{ + struct mthca_dev *dev = + container_of(device, struct mthca_dev, ib_dev.dev); + return sprintf(buf, "%x\n", dev->rev_id); +} + +static ssize_t show_fw_ver(struct device *device, struct device_attribute *attr, + char *buf) +{ + struct mthca_dev *dev = + container_of(device, struct mthca_dev, ib_dev.dev); + return sprintf(buf, "%d.%d.%d\n", (int) (dev->fw_ver >> 32), + (int) (dev->fw_ver >> 16) & 0xffff, + (int) dev->fw_ver & 0xffff); +} + +static ssize_t show_hca(struct device *device, struct device_attribute *attr, + char *buf) +{ + struct mthca_dev *dev = + container_of(device, struct mthca_dev, ib_dev.dev); + switch (dev->pdev->device) { + case PCI_DEVICE_ID_MELLANOX_TAVOR: + return 
sprintf(buf, "MT23108\n"); + case PCI_DEVICE_ID_MELLANOX_ARBEL_COMPAT: + return sprintf(buf, "MT25208 (MT23108 compat mode)\n"); + case PCI_DEVICE_ID_MELLANOX_ARBEL: + return sprintf(buf, "MT25208\n"); + case PCI_DEVICE_ID_MELLANOX_SINAI: + case PCI_DEVICE_ID_MELLANOX_SINAI_OLD: + return sprintf(buf, "MT25204\n"); + default: + return sprintf(buf, "unknown\n"); + } +} + +static ssize_t show_board(struct device *device, struct device_attribute *attr, + char *buf) +{ + struct mthca_dev *dev = + container_of(device, struct mthca_dev, ib_dev.dev); + return sprintf(buf, "%.*s\n", MTHCA_BOARD_ID_LEN, dev->board_id); +} + +static DEVICE_ATTR(hw_rev, S_IRUGO, show_rev, NULL); +static DEVICE_ATTR(fw_ver, S_IRUGO, show_fw_ver, NULL); +static DEVICE_ATTR(hca_type, S_IRUGO, show_hca, NULL); +static DEVICE_ATTR(board_id, S_IRUGO, show_board, NULL); + +static struct device_attribute *mthca_dev_attributes[] = { + &dev_attr_hw_rev, + &dev_attr_fw_ver, + &dev_attr_hca_type, + &dev_attr_board_id +}; + +static int mthca_init_node_data(struct mthca_dev *dev) +{ + struct ib_smp *in_mad = NULL; + struct ib_smp *out_mad = NULL; + int err = -ENOMEM; + + in_mad = kzalloc(sizeof *in_mad, GFP_KERNEL); + out_mad = kmalloc(sizeof *out_mad, GFP_KERNEL); + if (!in_mad || !out_mad) + goto out; + + init_query_mad(in_mad); + in_mad->attr_id = IB_SMP_ATTR_NODE_DESC; + + err = mthca_MAD_IFC(dev, 1, 1, + 1, NULL, NULL, in_mad, out_mad); + if (err) + goto out; + + memcpy(dev->ib_dev.node_desc, out_mad->data, 64); + + in_mad->attr_id = IB_SMP_ATTR_NODE_INFO; + + err = mthca_MAD_IFC(dev, 1, 1, + 1, NULL, NULL, in_mad, out_mad); + if (err) + goto out; + + if (mthca_is_memfree(dev)) + dev->rev_id = be32_to_cpup((__be32 *) (out_mad->data + 32)); + memcpy(&dev->ib_dev.node_guid, out_mad->data + 12, 8); + +out: + kfree(in_mad); + kfree(out_mad); + return err; +} + +int mthca_register_device(struct mthca_dev *dev) +{ + int ret; + int i; + + ret = mthca_init_node_data(dev); + if (ret) + return ret; + + strlcpy(dev->ib_dev.name, "mthca%d", IB_DEVICE_NAME_MAX); + dev->ib_dev.owner = THIS_MODULE; + + dev->ib_dev.uverbs_abi_ver = MTHCA_UVERBS_ABI_VERSION; + dev->ib_dev.uverbs_cmd_mask = + (1ull << IB_USER_VERBS_CMD_GET_CONTEXT) | + (1ull << IB_USER_VERBS_CMD_QUERY_DEVICE) | + (1ull << IB_USER_VERBS_CMD_QUERY_PORT) | + (1ull << IB_USER_VERBS_CMD_ALLOC_PD) | + (1ull << IB_USER_VERBS_CMD_DEALLOC_PD) | + (1ull << IB_USER_VERBS_CMD_REG_MR) | + (1ull << IB_USER_VERBS_CMD_DEREG_MR) | + (1ull << IB_USER_VERBS_CMD_CREATE_COMP_CHANNEL) | + (1ull << IB_USER_VERBS_CMD_CREATE_CQ) | + (1ull << IB_USER_VERBS_CMD_RESIZE_CQ) | + (1ull << IB_USER_VERBS_CMD_DESTROY_CQ) | + (1ull << IB_USER_VERBS_CMD_CREATE_QP) | + (1ull << IB_USER_VERBS_CMD_QUERY_QP) | + (1ull << IB_USER_VERBS_CMD_MODIFY_QP) | + (1ull << IB_USER_VERBS_CMD_DESTROY_QP) | + (1ull << IB_USER_VERBS_CMD_ATTACH_MCAST) | + (1ull << IB_USER_VERBS_CMD_DETACH_MCAST); + dev->ib_dev.node_type = RDMA_NODE_IB_CA; + dev->ib_dev.phys_port_cnt = dev->limits.num_ports; + dev->ib_dev.num_comp_vectors = 1; + dev->ib_dev.dma_device = &dev->pdev->dev; + dev->ib_dev.query_device = mthca_query_device; + dev->ib_dev.query_port = mthca_query_port; + dev->ib_dev.modify_device = mthca_modify_device; + dev->ib_dev.modify_port = mthca_modify_port; + dev->ib_dev.query_pkey = mthca_query_pkey; + dev->ib_dev.query_gid = mthca_query_gid; + dev->ib_dev.alloc_ucontext = mthca_alloc_ucontext; + dev->ib_dev.dealloc_ucontext = mthca_dealloc_ucontext; + dev->ib_dev.mmap = mthca_mmap_uar; + dev->ib_dev.alloc_pd = mthca_alloc_pd; + 
dev->ib_dev.dealloc_pd = mthca_dealloc_pd; + dev->ib_dev.create_ah = mthca_ah_create; + dev->ib_dev.query_ah = mthca_ah_query; + dev->ib_dev.destroy_ah = mthca_ah_destroy; + + if (dev->mthca_flags & MTHCA_FLAG_SRQ) { + dev->ib_dev.create_srq = mthca_create_srq; + dev->ib_dev.modify_srq = mthca_modify_srq; + dev->ib_dev.query_srq = mthca_query_srq; + dev->ib_dev.destroy_srq = mthca_destroy_srq; + dev->ib_dev.uverbs_cmd_mask |= + (1ull << IB_USER_VERBS_CMD_CREATE_SRQ) | + (1ull << IB_USER_VERBS_CMD_MODIFY_SRQ) | + (1ull << IB_USER_VERBS_CMD_QUERY_SRQ) | + (1ull << IB_USER_VERBS_CMD_DESTROY_SRQ); + + if (mthca_is_memfree(dev)) + dev->ib_dev.post_srq_recv = mthca_arbel_post_srq_recv; + else + dev->ib_dev.post_srq_recv = mthca_tavor_post_srq_recv; + } + + dev->ib_dev.create_qp = mthca_create_qp; + dev->ib_dev.modify_qp = mthca_modify_qp; + dev->ib_dev.query_qp = mthca_query_qp; + dev->ib_dev.destroy_qp = mthca_destroy_qp; + dev->ib_dev.create_cq = mthca_create_cq; + dev->ib_dev.resize_cq = mthca_resize_cq; + dev->ib_dev.destroy_cq = mthca_destroy_cq; + dev->ib_dev.poll_cq = mthca_poll_cq; + dev->ib_dev.get_dma_mr = mthca_get_dma_mr; + dev->ib_dev.reg_phys_mr = mthca_reg_phys_mr; + dev->ib_dev.reg_user_mr = mthca_reg_user_mr; + dev->ib_dev.dereg_mr = mthca_dereg_mr; + + if (dev->mthca_flags & MTHCA_FLAG_FMR) { + dev->ib_dev.alloc_fmr = mthca_alloc_fmr; + dev->ib_dev.unmap_fmr = mthca_unmap_fmr; + dev->ib_dev.dealloc_fmr = mthca_dealloc_fmr; + if (mthca_is_memfree(dev)) + dev->ib_dev.map_phys_fmr = mthca_arbel_map_phys_fmr; + else + dev->ib_dev.map_phys_fmr = mthca_tavor_map_phys_fmr; + } + + dev->ib_dev.attach_mcast = mthca_multicast_attach; + dev->ib_dev.detach_mcast = mthca_multicast_detach; + dev->ib_dev.process_mad = mthca_process_mad; + + if (mthca_is_memfree(dev)) { + dev->ib_dev.req_notify_cq = mthca_arbel_arm_cq; + dev->ib_dev.post_send = mthca_arbel_post_send; + dev->ib_dev.post_recv = mthca_arbel_post_receive; + } else { + dev->ib_dev.req_notify_cq = mthca_tavor_arm_cq; + dev->ib_dev.post_send = mthca_tavor_post_send; + dev->ib_dev.post_recv = mthca_tavor_post_receive; + } + + mutex_init(&dev->cap_mask_mutex); + + ret = ib_register_device(&dev->ib_dev, NULL); + if (ret) + return ret; + + for (i = 0; i < ARRAY_SIZE(mthca_dev_attributes); ++i) { + ret = device_create_file(&dev->ib_dev.dev, + mthca_dev_attributes[i]); + if (ret) { + ib_unregister_device(&dev->ib_dev); + return ret; + } + } + + mthca_start_catas_poll(dev); + + return 0; +} + +void mthca_unregister_device(struct mthca_dev *dev) +{ + mthca_stop_catas_poll(dev); + ib_unregister_device(&dev->ib_dev); +} diff --git a/kernel/drivers/infiniband/hw/mthca/mthca_provider.h b/kernel/drivers/infiniband/hw/mthca/mthca_provider.h new file mode 100644 index 000000000..596acc455 --- /dev/null +++ b/kernel/drivers/infiniband/hw/mthca/mthca_provider.h @@ -0,0 +1,344 @@ +/* + * Copyright (c) 2004 Topspin Communications. All rights reserved. + * Copyright (c) 2005, 2006 Cisco Systems. All rights reserved. + * Copyright (c) 2005 Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. 
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef MTHCA_PROVIDER_H +#define MTHCA_PROVIDER_H + +#include +#include + +#define MTHCA_MPT_FLAG_ATOMIC (1 << 14) +#define MTHCA_MPT_FLAG_REMOTE_WRITE (1 << 13) +#define MTHCA_MPT_FLAG_REMOTE_READ (1 << 12) +#define MTHCA_MPT_FLAG_LOCAL_WRITE (1 << 11) +#define MTHCA_MPT_FLAG_LOCAL_READ (1 << 10) + +struct mthca_buf_list { + void *buf; + DEFINE_DMA_UNMAP_ADDR(mapping); +}; + +union mthca_buf { + struct mthca_buf_list direct; + struct mthca_buf_list *page_list; +}; + +struct mthca_uar { + unsigned long pfn; + int index; +}; + +struct mthca_user_db_table; + +struct mthca_ucontext { + struct ib_ucontext ibucontext; + struct mthca_uar uar; + struct mthca_user_db_table *db_tab; + int reg_mr_warned; +}; + +struct mthca_mtt; + +struct mthca_mr { + struct ib_mr ibmr; + struct ib_umem *umem; + struct mthca_mtt *mtt; +}; + +struct mthca_fmr { + struct ib_fmr ibmr; + struct ib_fmr_attr attr; + struct mthca_mtt *mtt; + int maps; + union { + struct { + struct mthca_mpt_entry __iomem *mpt; + u64 __iomem *mtts; + } tavor; + struct { + struct mthca_mpt_entry *mpt; + __be64 *mtts; + dma_addr_t dma_handle; + } arbel; + } mem; +}; + +struct mthca_pd { + struct ib_pd ibpd; + u32 pd_num; + atomic_t sqp_count; + struct mthca_mr ntmr; + int privileged; +}; + +struct mthca_eq { + struct mthca_dev *dev; + int eqn; + u32 eqn_mask; + u32 cons_index; + u16 msi_x_vector; + u16 msi_x_entry; + int have_irq; + int nent; + struct mthca_buf_list *page_list; + struct mthca_mr mr; + char irq_name[IB_DEVICE_NAME_MAX]; +}; + +struct mthca_av; + +enum mthca_ah_type { + MTHCA_AH_ON_HCA, + MTHCA_AH_PCI_POOL, + MTHCA_AH_KMALLOC +}; + +struct mthca_ah { + struct ib_ah ibah; + enum mthca_ah_type type; + u32 key; + struct mthca_av *av; + dma_addr_t avdma; +}; + +/* + * Quick description of our CQ/QP locking scheme: + * + * We have one global lock that protects dev->cq/qp_table. Each + * struct mthca_cq/qp also has its own lock. An individual qp lock + * may be taken inside of an individual cq lock. Both cqs attached to + * a qp may be locked, with the cq with the lower cqn locked first. + * No other nesting should be done. + * + * Each struct mthca_cq/qp also has an ref count, protected by the + * corresponding table lock. The pointer from the cq/qp_table to the + * struct counts as one reference. 
This reference also is good for + * access through the consumer API, so modifying the CQ/QP etc doesn't + * need to take another reference. Access to a QP because of a + * completion being polled does not need a reference either. + * + * Finally, each struct mthca_cq/qp has a wait_queue_head_t for the + * destroy function to sleep on. + * + * This means that access from the consumer API requires nothing but + * taking the struct's lock. + * + * Access because of a completion event should go as follows: + * - lock cq/qp_table and look up struct + * - increment ref count in struct + * - drop cq/qp_table lock + * - lock struct, do your thing, and unlock struct + * - decrement ref count; if zero, wake up waiters + * + * To destroy a CQ/QP, we can do the following: + * - lock cq/qp_table + * - remove pointer and decrement ref count + * - unlock cq/qp_table lock + * - wait_event until ref count is zero + * + * It is the consumer's responsibilty to make sure that no QP + * operations (WQE posting or state modification) are pending when a + * QP is destroyed. Also, the consumer must make sure that calls to + * qp_modify are serialized. Similarly, the consumer is responsible + * for ensuring that no CQ resize operations are pending when a CQ + * is destroyed. + * + * Possible optimizations (wait for profile data to see if/where we + * have locks bouncing between CPUs): + * - split cq/qp table lock into n separate (cache-aligned) locks, + * indexed (say) by the page in the table + * - split QP struct lock into three (one for common info, one for the + * send queue and one for the receive queue) + */ + +struct mthca_cq_buf { + union mthca_buf queue; + struct mthca_mr mr; + int is_direct; +}; + +struct mthca_cq_resize { + struct mthca_cq_buf buf; + int cqe; + enum { + CQ_RESIZE_ALLOC, + CQ_RESIZE_READY, + CQ_RESIZE_SWAPPED + } state; +}; + +struct mthca_cq { + struct ib_cq ibcq; + spinlock_t lock; + int refcount; + int cqn; + u32 cons_index; + struct mthca_cq_buf buf; + struct mthca_cq_resize *resize_buf; + int is_kernel; + + /* Next fields are Arbel only */ + int set_ci_db_index; + __be32 *set_ci_db; + int arm_db_index; + __be32 *arm_db; + int arm_sn; + + wait_queue_head_t wait; + struct mutex mutex; +}; + +struct mthca_srq { + struct ib_srq ibsrq; + spinlock_t lock; + int refcount; + int srqn; + int max; + int max_gs; + int wqe_shift; + int first_free; + int last_free; + u16 counter; /* Arbel only */ + int db_index; /* Arbel only */ + __be32 *db; /* Arbel only */ + void *last; + + int is_direct; + u64 *wrid; + union mthca_buf queue; + struct mthca_mr mr; + + wait_queue_head_t wait; + struct mutex mutex; +}; + +struct mthca_wq { + spinlock_t lock; + int max; + unsigned next_ind; + unsigned last_comp; + unsigned head; + unsigned tail; + void *last; + int max_gs; + int wqe_shift; + + int db_index; /* Arbel only */ + __be32 *db; +}; + +struct mthca_qp { + struct ib_qp ibqp; + int refcount; + u32 qpn; + int is_direct; + u8 port; /* for SQP and memfree use only */ + u8 alt_port; /* for memfree use only */ + u8 transport; + u8 state; + u8 atomic_rd_en; + u8 resp_depth; + + struct mthca_mr mr; + + struct mthca_wq rq; + struct mthca_wq sq; + enum ib_sig_type sq_policy; + int send_wqe_offset; + int max_inline_data; + + u64 *wrid; + union mthca_buf queue; + + wait_queue_head_t wait; + struct mutex mutex; +}; + +struct mthca_sqp { + struct mthca_qp qp; + int pkey_index; + u32 qkey; + u32 send_psn; + struct ib_ud_header ud_header; + int header_buf_size; + void *header_buf; + dma_addr_t header_dma; +}; + +static 
inline struct mthca_ucontext *to_mucontext(struct ib_ucontext *ibucontext) +{ + return container_of(ibucontext, struct mthca_ucontext, ibucontext); +} + +static inline struct mthca_fmr *to_mfmr(struct ib_fmr *ibmr) +{ + return container_of(ibmr, struct mthca_fmr, ibmr); +} + +static inline struct mthca_mr *to_mmr(struct ib_mr *ibmr) +{ + return container_of(ibmr, struct mthca_mr, ibmr); +} + +static inline struct mthca_pd *to_mpd(struct ib_pd *ibpd) +{ + return container_of(ibpd, struct mthca_pd, ibpd); +} + +static inline struct mthca_ah *to_mah(struct ib_ah *ibah) +{ + return container_of(ibah, struct mthca_ah, ibah); +} + +static inline struct mthca_cq *to_mcq(struct ib_cq *ibcq) +{ + return container_of(ibcq, struct mthca_cq, ibcq); +} + +static inline struct mthca_srq *to_msrq(struct ib_srq *ibsrq) +{ + return container_of(ibsrq, struct mthca_srq, ibsrq); +} + +static inline struct mthca_qp *to_mqp(struct ib_qp *ibqp) +{ + return container_of(ibqp, struct mthca_qp, ibqp); +} + +static inline struct mthca_sqp *to_msqp(struct mthca_qp *qp) +{ + return container_of(qp, struct mthca_sqp, qp); +} + +#endif /* MTHCA_PROVIDER_H */ diff --git a/kernel/drivers/infiniband/hw/mthca/mthca_qp.c b/kernel/drivers/infiniband/hw/mthca/mthca_qp.c new file mode 100644 index 000000000..e354b2f04 --- /dev/null +++ b/kernel/drivers/infiniband/hw/mthca/mthca_qp.c @@ -0,0 +1,2311 @@ +/* + * Copyright (c) 2004 Topspin Communications. All rights reserved. + * Copyright (c) 2005 Cisco Systems. All rights reserved. + * Copyright (c) 2005 Mellanox Technologies. All rights reserved. + * Copyright (c) 2004 Voltaire, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include <linux/string.h> +#include <linux/slab.h> +#include <linux/sched.h> + +#include <asm/io.h> + +#include <rdma/ib_verbs.h> +#include <rdma/ib_cache.h> +#include <rdma/ib_pack.h> + +#include "mthca_dev.h" +#include "mthca_cmd.h" +#include "mthca_memfree.h" +#include "mthca_wqe.h" + +enum { + MTHCA_MAX_DIRECT_QP_SIZE = 4 * PAGE_SIZE, + MTHCA_ACK_REQ_FREQ = 10, + MTHCA_FLIGHT_LIMIT = 9, + MTHCA_UD_HEADER_SIZE = 72, /* largest UD header possible */ + MTHCA_INLINE_HEADER_SIZE = 4, /* data segment overhead for inline */ + MTHCA_INLINE_CHUNK_SIZE = 16 /* inline data segment chunk */ +}; + +enum { + MTHCA_QP_STATE_RST = 0, + MTHCA_QP_STATE_INIT = 1, + MTHCA_QP_STATE_RTR = 2, + MTHCA_QP_STATE_RTS = 3, + MTHCA_QP_STATE_SQE = 4, + MTHCA_QP_STATE_SQD = 5, + MTHCA_QP_STATE_ERR = 6, + MTHCA_QP_STATE_DRAINING = 7 +}; + +enum { + MTHCA_QP_ST_RC = 0x0, + MTHCA_QP_ST_UC = 0x1, + MTHCA_QP_ST_RD = 0x2, + MTHCA_QP_ST_UD = 0x3, + MTHCA_QP_ST_MLX = 0x7 +}; + +enum { + MTHCA_QP_PM_MIGRATED = 0x3, + MTHCA_QP_PM_ARMED = 0x0, + MTHCA_QP_PM_REARM = 0x1 +}; + +enum { + /* qp_context flags */ + MTHCA_QP_BIT_DE = 1 << 8, + /* params1 */ + MTHCA_QP_BIT_SRE = 1 << 15, + MTHCA_QP_BIT_SWE = 1 << 14, + MTHCA_QP_BIT_SAE = 1 << 13, + MTHCA_QP_BIT_SIC = 1 << 4, + MTHCA_QP_BIT_SSC = 1 << 3, + /* params2 */ + MTHCA_QP_BIT_RRE = 1 << 15, + MTHCA_QP_BIT_RWE = 1 << 14, + MTHCA_QP_BIT_RAE = 1 << 13, + MTHCA_QP_BIT_RIC = 1 << 4, + MTHCA_QP_BIT_RSC = 1 << 3 +}; + +enum { + MTHCA_SEND_DOORBELL_FENCE = 1 << 5 +}; + +struct mthca_qp_path { + __be32 port_pkey; + u8 rnr_retry; + u8 g_mylmc; + __be16 rlid; + u8 ackto; + u8 mgid_index; + u8 static_rate; + u8 hop_limit; + __be32 sl_tclass_flowlabel; + u8 rgid[16]; +} __attribute__((packed)); + +struct mthca_qp_context { + __be32 flags; + __be32 tavor_sched_queue; /* Reserved on Arbel */ + u8 mtu_msgmax; + u8 rq_size_stride; /* Reserved on Tavor */ + u8 sq_size_stride; /* Reserved on Tavor */ + u8 rlkey_arbel_sched_queue; /* Reserved on Tavor */ + __be32 usr_page; + __be32 local_qpn; + __be32 remote_qpn; + u32 reserved1[2]; + struct mthca_qp_path pri_path; + struct mthca_qp_path alt_path; + __be32 rdd; + __be32 pd; + __be32 wqe_base; + __be32 wqe_lkey; + __be32 params1; + __be32 reserved2; + __be32 next_send_psn; + __be32 cqn_snd; + __be32 snd_wqe_base_l; /* Next send WQE on Tavor */ + __be32 snd_db_index; /* (debugging only entries) */ + __be32 last_acked_psn; + __be32 ssn; + __be32 params2; + __be32 rnr_nextrecvpsn; + __be32 ra_buff_indx; + __be32 cqn_rcv; + __be32 rcv_wqe_base_l; /* Next recv WQE on Tavor */ + __be32 rcv_db_index; /* (debugging only entries) */ + __be32 qkey; + __be32 srqn; + __be32 rmsn; + __be16 rq_wqe_counter; /* reserved on Tavor */ + __be16 sq_wqe_counter; /* reserved on Tavor */ + u32 reserved3[18]; +} __attribute__((packed)); + +struct mthca_qp_param { + __be32 opt_param_mask; + u32 reserved1; + struct mthca_qp_context context; + u32 reserved2[62]; +} __attribute__((packed)); + +enum { + MTHCA_QP_OPTPAR_ALT_ADDR_PATH = 1 << 0, + MTHCA_QP_OPTPAR_RRE = 1 << 1, + MTHCA_QP_OPTPAR_RAE = 1 << 2, + MTHCA_QP_OPTPAR_RWE = 1 << 3, + MTHCA_QP_OPTPAR_PKEY_INDEX = 1 << 4, + MTHCA_QP_OPTPAR_Q_KEY = 1 << 5, + MTHCA_QP_OPTPAR_RNR_TIMEOUT = 1 << 6, + MTHCA_QP_OPTPAR_PRIMARY_ADDR_PATH = 1 << 7, + MTHCA_QP_OPTPAR_SRA_MAX = 1 << 8, + MTHCA_QP_OPTPAR_RRA_MAX = 1 << 9, + MTHCA_QP_OPTPAR_PM_STATE = 1 << 10, + MTHCA_QP_OPTPAR_PORT_NUM = 1 << 11, + MTHCA_QP_OPTPAR_RETRY_COUNT = 1 << 12, + MTHCA_QP_OPTPAR_ALT_RNR_RETRY = 1 << 13, + MTHCA_QP_OPTPAR_ACK_TIMEOUT = 1 << 14, + MTHCA_QP_OPTPAR_RNR_RETRY = 1 << 15, + MTHCA_QP_OPTPAR_SCHED_QUEUE = 1 << 16 +}; + +static const u8 mthca_opcode[] = {
+ [IB_WR_SEND] = MTHCA_OPCODE_SEND, + [IB_WR_SEND_WITH_IMM] = MTHCA_OPCODE_SEND_IMM, + [IB_WR_RDMA_WRITE] = MTHCA_OPCODE_RDMA_WRITE, + [IB_WR_RDMA_WRITE_WITH_IMM] = MTHCA_OPCODE_RDMA_WRITE_IMM, + [IB_WR_RDMA_READ] = MTHCA_OPCODE_RDMA_READ, + [IB_WR_ATOMIC_CMP_AND_SWP] = MTHCA_OPCODE_ATOMIC_CS, + [IB_WR_ATOMIC_FETCH_AND_ADD] = MTHCA_OPCODE_ATOMIC_FA, +}; + +static int is_sqp(struct mthca_dev *dev, struct mthca_qp *qp) +{ + return qp->qpn >= dev->qp_table.sqp_start && + qp->qpn <= dev->qp_table.sqp_start + 3; +} + +static int is_qp0(struct mthca_dev *dev, struct mthca_qp *qp) +{ + return qp->qpn >= dev->qp_table.sqp_start && + qp->qpn <= dev->qp_table.sqp_start + 1; +} + +static void *get_recv_wqe(struct mthca_qp *qp, int n) +{ + if (qp->is_direct) + return qp->queue.direct.buf + (n << qp->rq.wqe_shift); + else + return qp->queue.page_list[(n << qp->rq.wqe_shift) >> PAGE_SHIFT].buf + + ((n << qp->rq.wqe_shift) & (PAGE_SIZE - 1)); +} + +static void *get_send_wqe(struct mthca_qp *qp, int n) +{ + if (qp->is_direct) + return qp->queue.direct.buf + qp->send_wqe_offset + + (n << qp->sq.wqe_shift); + else + return qp->queue.page_list[(qp->send_wqe_offset + + (n << qp->sq.wqe_shift)) >> + PAGE_SHIFT].buf + + ((qp->send_wqe_offset + (n << qp->sq.wqe_shift)) & + (PAGE_SIZE - 1)); +} + +static void mthca_wq_reset(struct mthca_wq *wq) +{ + wq->next_ind = 0; + wq->last_comp = wq->max - 1; + wq->head = 0; + wq->tail = 0; +} + +void mthca_qp_event(struct mthca_dev *dev, u32 qpn, + enum ib_event_type event_type) +{ + struct mthca_qp *qp; + struct ib_event event; + + spin_lock(&dev->qp_table.lock); + qp = mthca_array_get(&dev->qp_table.qp, qpn & (dev->limits.num_qps - 1)); + if (qp) + ++qp->refcount; + spin_unlock(&dev->qp_table.lock); + + if (!qp) { + mthca_warn(dev, "Async event %d for bogus QP %08x\n", + event_type, qpn); + return; + } + + if (event_type == IB_EVENT_PATH_MIG) + qp->port = qp->alt_port; + + event.device = &dev->ib_dev; + event.event = event_type; + event.element.qp = &qp->ibqp; + if (qp->ibqp.event_handler) + qp->ibqp.event_handler(&event, qp->ibqp.qp_context); + + spin_lock(&dev->qp_table.lock); + if (!--qp->refcount) + wake_up(&qp->wait); + spin_unlock(&dev->qp_table.lock); +} + +static int to_mthca_state(enum ib_qp_state ib_state) +{ + switch (ib_state) { + case IB_QPS_RESET: return MTHCA_QP_STATE_RST; + case IB_QPS_INIT: return MTHCA_QP_STATE_INIT; + case IB_QPS_RTR: return MTHCA_QP_STATE_RTR; + case IB_QPS_RTS: return MTHCA_QP_STATE_RTS; + case IB_QPS_SQD: return MTHCA_QP_STATE_SQD; + case IB_QPS_SQE: return MTHCA_QP_STATE_SQE; + case IB_QPS_ERR: return MTHCA_QP_STATE_ERR; + default: return -1; + } +} + +enum { RC, UC, UD, RD, RDEE, MLX, NUM_TRANS }; + +static int to_mthca_st(int transport) +{ + switch (transport) { + case RC: return MTHCA_QP_ST_RC; + case UC: return MTHCA_QP_ST_UC; + case UD: return MTHCA_QP_ST_UD; + case RD: return MTHCA_QP_ST_RD; + case MLX: return MTHCA_QP_ST_MLX; + default: return -1; + } +} + +static void store_attrs(struct mthca_sqp *sqp, const struct ib_qp_attr *attr, + int attr_mask) +{ + if (attr_mask & IB_QP_PKEY_INDEX) + sqp->pkey_index = attr->pkey_index; + if (attr_mask & IB_QP_QKEY) + sqp->qkey = attr->qkey; + if (attr_mask & IB_QP_SQ_PSN) + sqp->send_psn = attr->sq_psn; +} + +static void init_port(struct mthca_dev *dev, int port) +{ + int err; + struct mthca_init_ib_param param; + + memset(¶m, 0, sizeof param); + + param.port_width = dev->limits.port_width_cap; + param.vl_cap = dev->limits.vl_cap; + param.mtu_cap = dev->limits.mtu_cap; + param.gid_cap 
= dev->limits.gid_table_len; + param.pkey_cap = dev->limits.pkey_table_len; + + err = mthca_INIT_IB(dev, ¶m, port); + if (err) + mthca_warn(dev, "INIT_IB failed, return code %d.\n", err); +} + +static __be32 get_hw_access_flags(struct mthca_qp *qp, const struct ib_qp_attr *attr, + int attr_mask) +{ + u8 dest_rd_atomic; + u32 access_flags; + u32 hw_access_flags = 0; + + if (attr_mask & IB_QP_MAX_DEST_RD_ATOMIC) + dest_rd_atomic = attr->max_dest_rd_atomic; + else + dest_rd_atomic = qp->resp_depth; + + if (attr_mask & IB_QP_ACCESS_FLAGS) + access_flags = attr->qp_access_flags; + else + access_flags = qp->atomic_rd_en; + + if (!dest_rd_atomic) + access_flags &= IB_ACCESS_REMOTE_WRITE; + + if (access_flags & IB_ACCESS_REMOTE_READ) + hw_access_flags |= MTHCA_QP_BIT_RRE; + if (access_flags & IB_ACCESS_REMOTE_ATOMIC) + hw_access_flags |= MTHCA_QP_BIT_RAE; + if (access_flags & IB_ACCESS_REMOTE_WRITE) + hw_access_flags |= MTHCA_QP_BIT_RWE; + + return cpu_to_be32(hw_access_flags); +} + +static inline enum ib_qp_state to_ib_qp_state(int mthca_state) +{ + switch (mthca_state) { + case MTHCA_QP_STATE_RST: return IB_QPS_RESET; + case MTHCA_QP_STATE_INIT: return IB_QPS_INIT; + case MTHCA_QP_STATE_RTR: return IB_QPS_RTR; + case MTHCA_QP_STATE_RTS: return IB_QPS_RTS; + case MTHCA_QP_STATE_DRAINING: + case MTHCA_QP_STATE_SQD: return IB_QPS_SQD; + case MTHCA_QP_STATE_SQE: return IB_QPS_SQE; + case MTHCA_QP_STATE_ERR: return IB_QPS_ERR; + default: return -1; + } +} + +static inline enum ib_mig_state to_ib_mig_state(int mthca_mig_state) +{ + switch (mthca_mig_state) { + case 0: return IB_MIG_ARMED; + case 1: return IB_MIG_REARM; + case 3: return IB_MIG_MIGRATED; + default: return -1; + } +} + +static int to_ib_qp_access_flags(int mthca_flags) +{ + int ib_flags = 0; + + if (mthca_flags & MTHCA_QP_BIT_RRE) + ib_flags |= IB_ACCESS_REMOTE_READ; + if (mthca_flags & MTHCA_QP_BIT_RWE) + ib_flags |= IB_ACCESS_REMOTE_WRITE; + if (mthca_flags & MTHCA_QP_BIT_RAE) + ib_flags |= IB_ACCESS_REMOTE_ATOMIC; + + return ib_flags; +} + +static void to_ib_ah_attr(struct mthca_dev *dev, struct ib_ah_attr *ib_ah_attr, + struct mthca_qp_path *path) +{ + memset(ib_ah_attr, 0, sizeof *ib_ah_attr); + ib_ah_attr->port_num = (be32_to_cpu(path->port_pkey) >> 24) & 0x3; + + if (ib_ah_attr->port_num == 0 || ib_ah_attr->port_num > dev->limits.num_ports) + return; + + ib_ah_attr->dlid = be16_to_cpu(path->rlid); + ib_ah_attr->sl = be32_to_cpu(path->sl_tclass_flowlabel) >> 28; + ib_ah_attr->src_path_bits = path->g_mylmc & 0x7f; + ib_ah_attr->static_rate = mthca_rate_to_ib(dev, + path->static_rate & 0xf, + ib_ah_attr->port_num); + ib_ah_attr->ah_flags = (path->g_mylmc & (1 << 7)) ? 
IB_AH_GRH : 0; + if (ib_ah_attr->ah_flags) { + ib_ah_attr->grh.sgid_index = path->mgid_index & (dev->limits.gid_table_len - 1); + ib_ah_attr->grh.hop_limit = path->hop_limit; + ib_ah_attr->grh.traffic_class = + (be32_to_cpu(path->sl_tclass_flowlabel) >> 20) & 0xff; + ib_ah_attr->grh.flow_label = + be32_to_cpu(path->sl_tclass_flowlabel) & 0xfffff; + memcpy(ib_ah_attr->grh.dgid.raw, + path->rgid, sizeof ib_ah_attr->grh.dgid.raw); + } +} + +int mthca_query_qp(struct ib_qp *ibqp, struct ib_qp_attr *qp_attr, int qp_attr_mask, + struct ib_qp_init_attr *qp_init_attr) +{ + struct mthca_dev *dev = to_mdev(ibqp->device); + struct mthca_qp *qp = to_mqp(ibqp); + int err = 0; + struct mthca_mailbox *mailbox = NULL; + struct mthca_qp_param *qp_param; + struct mthca_qp_context *context; + int mthca_state; + + mutex_lock(&qp->mutex); + + if (qp->state == IB_QPS_RESET) { + qp_attr->qp_state = IB_QPS_RESET; + goto done; + } + + mailbox = mthca_alloc_mailbox(dev, GFP_KERNEL); + if (IS_ERR(mailbox)) { + err = PTR_ERR(mailbox); + goto out; + } + + err = mthca_QUERY_QP(dev, qp->qpn, 0, mailbox); + if (err) { + mthca_warn(dev, "QUERY_QP failed (%d)\n", err); + goto out_mailbox; + } + + qp_param = mailbox->buf; + context = &qp_param->context; + mthca_state = be32_to_cpu(context->flags) >> 28; + + qp->state = to_ib_qp_state(mthca_state); + qp_attr->qp_state = qp->state; + qp_attr->path_mtu = context->mtu_msgmax >> 5; + qp_attr->path_mig_state = + to_ib_mig_state((be32_to_cpu(context->flags) >> 11) & 0x3); + qp_attr->qkey = be32_to_cpu(context->qkey); + qp_attr->rq_psn = be32_to_cpu(context->rnr_nextrecvpsn) & 0xffffff; + qp_attr->sq_psn = be32_to_cpu(context->next_send_psn) & 0xffffff; + qp_attr->dest_qp_num = be32_to_cpu(context->remote_qpn) & 0xffffff; + qp_attr->qp_access_flags = + to_ib_qp_access_flags(be32_to_cpu(context->params2)); + + if (qp->transport == RC || qp->transport == UC) { + to_ib_ah_attr(dev, &qp_attr->ah_attr, &context->pri_path); + to_ib_ah_attr(dev, &qp_attr->alt_ah_attr, &context->alt_path); + qp_attr->alt_pkey_index = + be32_to_cpu(context->alt_path.port_pkey) & 0x7f; + qp_attr->alt_port_num = qp_attr->alt_ah_attr.port_num; + } + + qp_attr->pkey_index = be32_to_cpu(context->pri_path.port_pkey) & 0x7f; + qp_attr->port_num = + (be32_to_cpu(context->pri_path.port_pkey) >> 24) & 0x3; + + /* qp_attr->en_sqd_async_notify is only applicable in modify qp */ + qp_attr->sq_draining = mthca_state == MTHCA_QP_STATE_DRAINING; + + qp_attr->max_rd_atomic = 1 << ((be32_to_cpu(context->params1) >> 21) & 0x7); + + qp_attr->max_dest_rd_atomic = + 1 << ((be32_to_cpu(context->params2) >> 21) & 0x7); + qp_attr->min_rnr_timer = + (be32_to_cpu(context->rnr_nextrecvpsn) >> 24) & 0x1f; + qp_attr->timeout = context->pri_path.ackto >> 3; + qp_attr->retry_cnt = (be32_to_cpu(context->params1) >> 16) & 0x7; + qp_attr->rnr_retry = context->pri_path.rnr_retry >> 5; + qp_attr->alt_timeout = context->alt_path.ackto >> 3; + +done: + qp_attr->cur_qp_state = qp_attr->qp_state; + qp_attr->cap.max_send_wr = qp->sq.max; + qp_attr->cap.max_recv_wr = qp->rq.max; + qp_attr->cap.max_send_sge = qp->sq.max_gs; + qp_attr->cap.max_recv_sge = qp->rq.max_gs; + qp_attr->cap.max_inline_data = qp->max_inline_data; + + qp_init_attr->cap = qp_attr->cap; + qp_init_attr->sq_sig_type = qp->sq_policy; + +out_mailbox: + mthca_free_mailbox(dev, mailbox); + +out: + mutex_unlock(&qp->mutex); + return err; +} + +static int mthca_path_set(struct mthca_dev *dev, const struct ib_ah_attr *ah, + struct mthca_qp_path *path, u8 port) +{ + path->g_mylmc = 
ah->src_path_bits & 0x7f; + path->rlid = cpu_to_be16(ah->dlid); + path->static_rate = mthca_get_rate(dev, ah->static_rate, port); + + if (ah->ah_flags & IB_AH_GRH) { + if (ah->grh.sgid_index >= dev->limits.gid_table_len) { + mthca_dbg(dev, "sgid_index (%u) too large. max is %d\n", + ah->grh.sgid_index, dev->limits.gid_table_len-1); + return -1; + } + + path->g_mylmc |= 1 << 7; + path->mgid_index = ah->grh.sgid_index; + path->hop_limit = ah->grh.hop_limit; + path->sl_tclass_flowlabel = + cpu_to_be32((ah->sl << 28) | + (ah->grh.traffic_class << 20) | + (ah->grh.flow_label)); + memcpy(path->rgid, ah->grh.dgid.raw, 16); + } else + path->sl_tclass_flowlabel = cpu_to_be32(ah->sl << 28); + + return 0; +} + +static int __mthca_modify_qp(struct ib_qp *ibqp, + const struct ib_qp_attr *attr, int attr_mask, + enum ib_qp_state cur_state, enum ib_qp_state new_state) +{ + struct mthca_dev *dev = to_mdev(ibqp->device); + struct mthca_qp *qp = to_mqp(ibqp); + struct mthca_mailbox *mailbox; + struct mthca_qp_param *qp_param; + struct mthca_qp_context *qp_context; + u32 sqd_event = 0; + int err = -EINVAL; + + mailbox = mthca_alloc_mailbox(dev, GFP_KERNEL); + if (IS_ERR(mailbox)) { + err = PTR_ERR(mailbox); + goto out; + } + qp_param = mailbox->buf; + qp_context = &qp_param->context; + memset(qp_param, 0, sizeof *qp_param); + + qp_context->flags = cpu_to_be32((to_mthca_state(new_state) << 28) | + (to_mthca_st(qp->transport) << 16)); + qp_context->flags |= cpu_to_be32(MTHCA_QP_BIT_DE); + if (!(attr_mask & IB_QP_PATH_MIG_STATE)) + qp_context->flags |= cpu_to_be32(MTHCA_QP_PM_MIGRATED << 11); + else { + qp_param->opt_param_mask |= cpu_to_be32(MTHCA_QP_OPTPAR_PM_STATE); + switch (attr->path_mig_state) { + case IB_MIG_MIGRATED: + qp_context->flags |= cpu_to_be32(MTHCA_QP_PM_MIGRATED << 11); + break; + case IB_MIG_REARM: + qp_context->flags |= cpu_to_be32(MTHCA_QP_PM_REARM << 11); + break; + case IB_MIG_ARMED: + qp_context->flags |= cpu_to_be32(MTHCA_QP_PM_ARMED << 11); + break; + } + } + + /* leave tavor_sched_queue as 0 */ + + if (qp->transport == MLX || qp->transport == UD) + qp_context->mtu_msgmax = (IB_MTU_2048 << 5) | 11; + else if (attr_mask & IB_QP_PATH_MTU) { + if (attr->path_mtu < IB_MTU_256 || attr->path_mtu > IB_MTU_2048) { + mthca_dbg(dev, "path MTU (%u) is invalid\n", + attr->path_mtu); + goto out_mailbox; + } + qp_context->mtu_msgmax = (attr->path_mtu << 5) | 31; + } + + if (mthca_is_memfree(dev)) { + if (qp->rq.max) + qp_context->rq_size_stride = ilog2(qp->rq.max) << 3; + qp_context->rq_size_stride |= qp->rq.wqe_shift - 4; + + if (qp->sq.max) + qp_context->sq_size_stride = ilog2(qp->sq.max) << 3; + qp_context->sq_size_stride |= qp->sq.wqe_shift - 4; + } + + /* leave arbel_sched_queue as 0 */ + + if (qp->ibqp.uobject) + qp_context->usr_page = + cpu_to_be32(to_mucontext(qp->ibqp.uobject->context)->uar.index); + else + qp_context->usr_page = cpu_to_be32(dev->driver_uar.index); + qp_context->local_qpn = cpu_to_be32(qp->qpn); + if (attr_mask & IB_QP_DEST_QPN) { + qp_context->remote_qpn = cpu_to_be32(attr->dest_qp_num); + } + + if (qp->transport == MLX) + qp_context->pri_path.port_pkey |= + cpu_to_be32(qp->port << 24); + else { + if (attr_mask & IB_QP_PORT) { + qp_context->pri_path.port_pkey |= + cpu_to_be32(attr->port_num << 24); + qp_param->opt_param_mask |= cpu_to_be32(MTHCA_QP_OPTPAR_PORT_NUM); + } + } + + if (attr_mask & IB_QP_PKEY_INDEX) { + qp_context->pri_path.port_pkey |= + cpu_to_be32(attr->pkey_index); + qp_param->opt_param_mask |= cpu_to_be32(MTHCA_QP_OPTPAR_PKEY_INDEX); + } + + if (attr_mask 
& IB_QP_RNR_RETRY) { + qp_context->alt_path.rnr_retry = qp_context->pri_path.rnr_retry = + attr->rnr_retry << 5; + qp_param->opt_param_mask |= cpu_to_be32(MTHCA_QP_OPTPAR_RNR_RETRY | + MTHCA_QP_OPTPAR_ALT_RNR_RETRY); + } + + if (attr_mask & IB_QP_AV) { + if (mthca_path_set(dev, &attr->ah_attr, &qp_context->pri_path, + attr_mask & IB_QP_PORT ? attr->port_num : qp->port)) + goto out_mailbox; + + qp_param->opt_param_mask |= cpu_to_be32(MTHCA_QP_OPTPAR_PRIMARY_ADDR_PATH); + } + + if (ibqp->qp_type == IB_QPT_RC && + cur_state == IB_QPS_INIT && new_state == IB_QPS_RTR) { + u8 sched_queue = ibqp->uobject ? 0x2 : 0x1; + + if (mthca_is_memfree(dev)) + qp_context->rlkey_arbel_sched_queue |= sched_queue; + else + qp_context->tavor_sched_queue |= cpu_to_be32(sched_queue); + + qp_param->opt_param_mask |= + cpu_to_be32(MTHCA_QP_OPTPAR_SCHED_QUEUE); + } + + if (attr_mask & IB_QP_TIMEOUT) { + qp_context->pri_path.ackto = attr->timeout << 3; + qp_param->opt_param_mask |= cpu_to_be32(MTHCA_QP_OPTPAR_ACK_TIMEOUT); + } + + if (attr_mask & IB_QP_ALT_PATH) { + if (attr->alt_pkey_index >= dev->limits.pkey_table_len) { + mthca_dbg(dev, "Alternate P_Key index (%u) too large. max is %d\n", + attr->alt_pkey_index, dev->limits.pkey_table_len-1); + goto out_mailbox; + } + + if (attr->alt_port_num == 0 || attr->alt_port_num > dev->limits.num_ports) { + mthca_dbg(dev, "Alternate port number (%u) is invalid\n", + attr->alt_port_num); + goto out_mailbox; + } + + if (mthca_path_set(dev, &attr->alt_ah_attr, &qp_context->alt_path, + attr->alt_ah_attr.port_num)) + goto out_mailbox; + + qp_context->alt_path.port_pkey |= cpu_to_be32(attr->alt_pkey_index | + attr->alt_port_num << 24); + qp_context->alt_path.ackto = attr->alt_timeout << 3; + qp_param->opt_param_mask |= cpu_to_be32(MTHCA_QP_OPTPAR_ALT_ADDR_PATH); + } + + /* leave rdd as 0 */ + qp_context->pd = cpu_to_be32(to_mpd(ibqp->pd)->pd_num); + /* leave wqe_base as 0 (we always create an MR based at 0 for WQs) */ + qp_context->wqe_lkey = cpu_to_be32(qp->mr.ibmr.lkey); + qp_context->params1 = cpu_to_be32((MTHCA_ACK_REQ_FREQ << 28) | + (MTHCA_FLIGHT_LIMIT << 24) | + MTHCA_QP_BIT_SWE); + if (qp->sq_policy == IB_SIGNAL_ALL_WR) + qp_context->params1 |= cpu_to_be32(MTHCA_QP_BIT_SSC); + if (attr_mask & IB_QP_RETRY_CNT) { + qp_context->params1 |= cpu_to_be32(attr->retry_cnt << 16); + qp_param->opt_param_mask |= cpu_to_be32(MTHCA_QP_OPTPAR_RETRY_COUNT); + } + + if (attr_mask & IB_QP_MAX_QP_RD_ATOMIC) { + if (attr->max_rd_atomic) { + qp_context->params1 |= + cpu_to_be32(MTHCA_QP_BIT_SRE | + MTHCA_QP_BIT_SAE); + qp_context->params1 |= + cpu_to_be32(fls(attr->max_rd_atomic - 1) << 21); + } + qp_param->opt_param_mask |= cpu_to_be32(MTHCA_QP_OPTPAR_SRA_MAX); + } + + if (attr_mask & IB_QP_SQ_PSN) + qp_context->next_send_psn = cpu_to_be32(attr->sq_psn); + qp_context->cqn_snd = cpu_to_be32(to_mcq(ibqp->send_cq)->cqn); + + if (mthca_is_memfree(dev)) { + qp_context->snd_wqe_base_l = cpu_to_be32(qp->send_wqe_offset); + qp_context->snd_db_index = cpu_to_be32(qp->sq.db_index); + } + + if (attr_mask & IB_QP_MAX_DEST_RD_ATOMIC) { + if (attr->max_dest_rd_atomic) + qp_context->params2 |= + cpu_to_be32(fls(attr->max_dest_rd_atomic - 1) << 21); + + qp_param->opt_param_mask |= cpu_to_be32(MTHCA_QP_OPTPAR_RRA_MAX); + } + + if (attr_mask & (IB_QP_ACCESS_FLAGS | IB_QP_MAX_DEST_RD_ATOMIC)) { + qp_context->params2 |= get_hw_access_flags(qp, attr, attr_mask); + qp_param->opt_param_mask |= cpu_to_be32(MTHCA_QP_OPTPAR_RWE | + MTHCA_QP_OPTPAR_RRE | + MTHCA_QP_OPTPAR_RAE); + } + + qp_context->params2 |= 
cpu_to_be32(MTHCA_QP_BIT_RSC); + + if (ibqp->srq) + qp_context->params2 |= cpu_to_be32(MTHCA_QP_BIT_RIC); + + if (attr_mask & IB_QP_MIN_RNR_TIMER) { + qp_context->rnr_nextrecvpsn |= cpu_to_be32(attr->min_rnr_timer << 24); + qp_param->opt_param_mask |= cpu_to_be32(MTHCA_QP_OPTPAR_RNR_TIMEOUT); + } + if (attr_mask & IB_QP_RQ_PSN) + qp_context->rnr_nextrecvpsn |= cpu_to_be32(attr->rq_psn); + + qp_context->ra_buff_indx = + cpu_to_be32(dev->qp_table.rdb_base + + ((qp->qpn & (dev->limits.num_qps - 1)) * MTHCA_RDB_ENTRY_SIZE << + dev->qp_table.rdb_shift)); + + qp_context->cqn_rcv = cpu_to_be32(to_mcq(ibqp->recv_cq)->cqn); + + if (mthca_is_memfree(dev)) + qp_context->rcv_db_index = cpu_to_be32(qp->rq.db_index); + + if (attr_mask & IB_QP_QKEY) { + qp_context->qkey = cpu_to_be32(attr->qkey); + qp_param->opt_param_mask |= cpu_to_be32(MTHCA_QP_OPTPAR_Q_KEY); + } + + if (ibqp->srq) + qp_context->srqn = cpu_to_be32(1 << 24 | + to_msrq(ibqp->srq)->srqn); + + if (cur_state == IB_QPS_RTS && new_state == IB_QPS_SQD && + attr_mask & IB_QP_EN_SQD_ASYNC_NOTIFY && + attr->en_sqd_async_notify) + sqd_event = 1 << 31; + + err = mthca_MODIFY_QP(dev, cur_state, new_state, qp->qpn, 0, + mailbox, sqd_event); + if (err) { + mthca_warn(dev, "modify QP %d->%d returned %d.\n", + cur_state, new_state, err); + goto out_mailbox; + } + + qp->state = new_state; + if (attr_mask & IB_QP_ACCESS_FLAGS) + qp->atomic_rd_en = attr->qp_access_flags; + if (attr_mask & IB_QP_MAX_DEST_RD_ATOMIC) + qp->resp_depth = attr->max_dest_rd_atomic; + if (attr_mask & IB_QP_PORT) + qp->port = attr->port_num; + if (attr_mask & IB_QP_ALT_PATH) + qp->alt_port = attr->alt_port_num; + + if (is_sqp(dev, qp)) + store_attrs(to_msqp(qp), attr, attr_mask); + + /* + * If we moved QP0 to RTR, bring the IB link up; if we moved + * QP0 to RESET or ERROR, bring the link back down. + */ + if (is_qp0(dev, qp)) { + if (cur_state != IB_QPS_RTR && + new_state == IB_QPS_RTR) + init_port(dev, qp->port); + + if (cur_state != IB_QPS_RESET && + cur_state != IB_QPS_ERR && + (new_state == IB_QPS_RESET || + new_state == IB_QPS_ERR)) + mthca_CLOSE_IB(dev, qp->port); + } + + /* + * If we moved a kernel QP to RESET, clean up all old CQ + * entries and reinitialize the QP. + */ + if (new_state == IB_QPS_RESET && !qp->ibqp.uobject) { + mthca_cq_clean(dev, to_mcq(qp->ibqp.recv_cq), qp->qpn, + qp->ibqp.srq ? to_msrq(qp->ibqp.srq) : NULL); + if (qp->ibqp.send_cq != qp->ibqp.recv_cq) + mthca_cq_clean(dev, to_mcq(qp->ibqp.send_cq), qp->qpn, NULL); + + mthca_wq_reset(&qp->sq); + qp->sq.last = get_send_wqe(qp, qp->sq.max - 1); + + mthca_wq_reset(&qp->rq); + qp->rq.last = get_recv_wqe(qp, qp->rq.max - 1); + + if (mthca_is_memfree(dev)) { + *qp->sq.db = 0; + *qp->rq.db = 0; + } + } + +out_mailbox: + mthca_free_mailbox(dev, mailbox); +out: + return err; +} + +int mthca_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, int attr_mask, + struct ib_udata *udata) +{ + struct mthca_dev *dev = to_mdev(ibqp->device); + struct mthca_qp *qp = to_mqp(ibqp); + enum ib_qp_state cur_state, new_state; + int err = -EINVAL; + + mutex_lock(&qp->mutex); + if (attr_mask & IB_QP_CUR_STATE) { + cur_state = attr->cur_qp_state; + } else { + spin_lock_irq(&qp->sq.lock); + spin_lock(&qp->rq.lock); + cur_state = qp->state; + spin_unlock(&qp->rq.lock); + spin_unlock_irq(&qp->sq.lock); + } + + new_state = attr_mask & IB_QP_STATE ? 
attr->qp_state : cur_state; + + if (!ib_modify_qp_is_ok(cur_state, new_state, ibqp->qp_type, attr_mask, + IB_LINK_LAYER_UNSPECIFIED)) { + mthca_dbg(dev, "Bad QP transition (transport %d) " + "%d->%d with attr 0x%08x\n", + qp->transport, cur_state, new_state, + attr_mask); + goto out; + } + + if ((attr_mask & IB_QP_PKEY_INDEX) && + attr->pkey_index >= dev->limits.pkey_table_len) { + mthca_dbg(dev, "P_Key index (%u) too large. max is %d\n", + attr->pkey_index, dev->limits.pkey_table_len-1); + goto out; + } + + if ((attr_mask & IB_QP_PORT) && + (attr->port_num == 0 || attr->port_num > dev->limits.num_ports)) { + mthca_dbg(dev, "Port number (%u) is invalid\n", attr->port_num); + goto out; + } + + if (attr_mask & IB_QP_MAX_QP_RD_ATOMIC && + attr->max_rd_atomic > dev->limits.max_qp_init_rdma) { + mthca_dbg(dev, "Max rdma_atomic as initiator %u too large (max is %d)\n", + attr->max_rd_atomic, dev->limits.max_qp_init_rdma); + goto out; + } + + if (attr_mask & IB_QP_MAX_DEST_RD_ATOMIC && + attr->max_dest_rd_atomic > 1 << dev->qp_table.rdb_shift) { + mthca_dbg(dev, "Max rdma_atomic as responder %u too large (max %d)\n", + attr->max_dest_rd_atomic, 1 << dev->qp_table.rdb_shift); + goto out; + } + + if (cur_state == new_state && cur_state == IB_QPS_RESET) { + err = 0; + goto out; + } + + err = __mthca_modify_qp(ibqp, attr, attr_mask, cur_state, new_state); + +out: + mutex_unlock(&qp->mutex); + return err; +} + +static int mthca_max_data_size(struct mthca_dev *dev, struct mthca_qp *qp, int desc_sz) +{ + /* + * Calculate the maximum size of WQE s/g segments, excluding + * the next segment and other non-data segments. + */ + int max_data_size = desc_sz - sizeof (struct mthca_next_seg); + + switch (qp->transport) { + case MLX: + max_data_size -= 2 * sizeof (struct mthca_data_seg); + break; + + case UD: + if (mthca_is_memfree(dev)) + max_data_size -= sizeof (struct mthca_arbel_ud_seg); + else + max_data_size -= sizeof (struct mthca_tavor_ud_seg); + break; + + default: + max_data_size -= sizeof (struct mthca_raddr_seg); + break; + } + + return max_data_size; +} + +static inline int mthca_max_inline_data(struct mthca_pd *pd, int max_data_size) +{ + /* We don't support inline data for kernel QPs (yet). */ + return pd->ibpd.uobject ? max_data_size - MTHCA_INLINE_HEADER_SIZE : 0; +} + +static void mthca_adjust_qp_caps(struct mthca_dev *dev, + struct mthca_pd *pd, + struct mthca_qp *qp) +{ + int max_data_size = mthca_max_data_size(dev, qp, + min(dev->limits.max_desc_sz, + 1 << qp->sq.wqe_shift)); + + qp->max_inline_data = mthca_max_inline_data(pd, max_data_size); + + qp->sq.max_gs = min_t(int, dev->limits.max_sg, + max_data_size / sizeof (struct mthca_data_seg)); + qp->rq.max_gs = min_t(int, dev->limits.max_sg, + (min(dev->limits.max_desc_sz, 1 << qp->rq.wqe_shift) - + sizeof (struct mthca_next_seg)) / + sizeof (struct mthca_data_seg)); +} + +/* + * Allocate and register buffer for WQEs. qp->rq.max, sq.max, + * rq.max_gs and sq.max_gs must all be assigned. 
+ * mthca_alloc_wqe_buf will calculate rq.wqe_shift and + * sq.wqe_shift (as well as send_wqe_offset, is_direct, and + * queue) + */ +static int mthca_alloc_wqe_buf(struct mthca_dev *dev, + struct mthca_pd *pd, + struct mthca_qp *qp) +{ + int size; + int err = -ENOMEM; + + size = sizeof (struct mthca_next_seg) + + qp->rq.max_gs * sizeof (struct mthca_data_seg); + + if (size > dev->limits.max_desc_sz) + return -EINVAL; + + for (qp->rq.wqe_shift = 6; 1 << qp->rq.wqe_shift < size; + qp->rq.wqe_shift++) + ; /* nothing */ + + size = qp->sq.max_gs * sizeof (struct mthca_data_seg); + switch (qp->transport) { + case MLX: + size += 2 * sizeof (struct mthca_data_seg); + break; + + case UD: + size += mthca_is_memfree(dev) ? + sizeof (struct mthca_arbel_ud_seg) : + sizeof (struct mthca_tavor_ud_seg); + break; + + case UC: + size += sizeof (struct mthca_raddr_seg); + break; + + case RC: + size += sizeof (struct mthca_raddr_seg); + /* + * An atomic op will require an atomic segment, a + * remote address segment and one scatter entry. + */ + size = max_t(int, size, + sizeof (struct mthca_atomic_seg) + + sizeof (struct mthca_raddr_seg) + + sizeof (struct mthca_data_seg)); + break; + + default: + break; + } + + /* Make sure that we have enough space for a bind request */ + size = max_t(int, size, sizeof (struct mthca_bind_seg)); + + size += sizeof (struct mthca_next_seg); + + if (size > dev->limits.max_desc_sz) + return -EINVAL; + + for (qp->sq.wqe_shift = 6; 1 << qp->sq.wqe_shift < size; + qp->sq.wqe_shift++) + ; /* nothing */ + + qp->send_wqe_offset = ALIGN(qp->rq.max << qp->rq.wqe_shift, + 1 << qp->sq.wqe_shift); + + /* + * If this is a userspace QP, we don't actually have to + * allocate anything. All we need is to calculate the WQE + * sizes and the send_wqe_offset, so we're done now. 
+ */ + if (pd->ibpd.uobject) + return 0; + + size = PAGE_ALIGN(qp->send_wqe_offset + + (qp->sq.max << qp->sq.wqe_shift)); + + qp->wrid = kmalloc((qp->rq.max + qp->sq.max) * sizeof (u64), + GFP_KERNEL); + if (!qp->wrid) + goto err_out; + + err = mthca_buf_alloc(dev, size, MTHCA_MAX_DIRECT_QP_SIZE, + &qp->queue, &qp->is_direct, pd, 0, &qp->mr); + if (err) + goto err_out; + + return 0; + +err_out: + kfree(qp->wrid); + return err; +} + +static void mthca_free_wqe_buf(struct mthca_dev *dev, + struct mthca_qp *qp) +{ + mthca_buf_free(dev, PAGE_ALIGN(qp->send_wqe_offset + + (qp->sq.max << qp->sq.wqe_shift)), + &qp->queue, qp->is_direct, &qp->mr); + kfree(qp->wrid); +} + +static int mthca_map_memfree(struct mthca_dev *dev, + struct mthca_qp *qp) +{ + int ret; + + if (mthca_is_memfree(dev)) { + ret = mthca_table_get(dev, dev->qp_table.qp_table, qp->qpn); + if (ret) + return ret; + + ret = mthca_table_get(dev, dev->qp_table.eqp_table, qp->qpn); + if (ret) + goto err_qpc; + + ret = mthca_table_get(dev, dev->qp_table.rdb_table, + qp->qpn << dev->qp_table.rdb_shift); + if (ret) + goto err_eqpc; + + } + + return 0; + +err_eqpc: + mthca_table_put(dev, dev->qp_table.eqp_table, qp->qpn); + +err_qpc: + mthca_table_put(dev, dev->qp_table.qp_table, qp->qpn); + + return ret; +} + +static void mthca_unmap_memfree(struct mthca_dev *dev, + struct mthca_qp *qp) +{ + mthca_table_put(dev, dev->qp_table.rdb_table, + qp->qpn << dev->qp_table.rdb_shift); + mthca_table_put(dev, dev->qp_table.eqp_table, qp->qpn); + mthca_table_put(dev, dev->qp_table.qp_table, qp->qpn); +} + +static int mthca_alloc_memfree(struct mthca_dev *dev, + struct mthca_qp *qp) +{ + if (mthca_is_memfree(dev)) { + qp->rq.db_index = mthca_alloc_db(dev, MTHCA_DB_TYPE_RQ, + qp->qpn, &qp->rq.db); + if (qp->rq.db_index < 0) + return -ENOMEM; + + qp->sq.db_index = mthca_alloc_db(dev, MTHCA_DB_TYPE_SQ, + qp->qpn, &qp->sq.db); + if (qp->sq.db_index < 0) { + mthca_free_db(dev, MTHCA_DB_TYPE_RQ, qp->rq.db_index); + return -ENOMEM; + } + } + + return 0; +} + +static void mthca_free_memfree(struct mthca_dev *dev, + struct mthca_qp *qp) +{ + if (mthca_is_memfree(dev)) { + mthca_free_db(dev, MTHCA_DB_TYPE_SQ, qp->sq.db_index); + mthca_free_db(dev, MTHCA_DB_TYPE_RQ, qp->rq.db_index); + } +} + +static int mthca_alloc_qp_common(struct mthca_dev *dev, + struct mthca_pd *pd, + struct mthca_cq *send_cq, + struct mthca_cq *recv_cq, + enum ib_sig_type send_policy, + struct mthca_qp *qp) +{ + int ret; + int i; + struct mthca_next_seg *next; + + qp->refcount = 1; + init_waitqueue_head(&qp->wait); + mutex_init(&qp->mutex); + qp->state = IB_QPS_RESET; + qp->atomic_rd_en = 0; + qp->resp_depth = 0; + qp->sq_policy = send_policy; + mthca_wq_reset(&qp->sq); + mthca_wq_reset(&qp->rq); + + spin_lock_init(&qp->sq.lock); + spin_lock_init(&qp->rq.lock); + + ret = mthca_map_memfree(dev, qp); + if (ret) + return ret; + + ret = mthca_alloc_wqe_buf(dev, pd, qp); + if (ret) { + mthca_unmap_memfree(dev, qp); + return ret; + } + + mthca_adjust_qp_caps(dev, pd, qp); + + /* + * If this is a userspace QP, we're done now. The doorbells + * will be allocated and buffers will be initialized in + * userspace. 
+ */ + if (pd->ibpd.uobject) + return 0; + + ret = mthca_alloc_memfree(dev, qp); + if (ret) { + mthca_free_wqe_buf(dev, qp); + mthca_unmap_memfree(dev, qp); + return ret; + } + + if (mthca_is_memfree(dev)) { + struct mthca_data_seg *scatter; + int size = (sizeof (struct mthca_next_seg) + + qp->rq.max_gs * sizeof (struct mthca_data_seg)) / 16; + + for (i = 0; i < qp->rq.max; ++i) { + next = get_recv_wqe(qp, i); + next->nda_op = cpu_to_be32(((i + 1) & (qp->rq.max - 1)) << + qp->rq.wqe_shift); + next->ee_nds = cpu_to_be32(size); + + for (scatter = (void *) (next + 1); + (void *) scatter < (void *) next + (1 << qp->rq.wqe_shift); + ++scatter) + scatter->lkey = cpu_to_be32(MTHCA_INVAL_LKEY); + } + + for (i = 0; i < qp->sq.max; ++i) { + next = get_send_wqe(qp, i); + next->nda_op = cpu_to_be32((((i + 1) & (qp->sq.max - 1)) << + qp->sq.wqe_shift) + + qp->send_wqe_offset); + } + } else { + for (i = 0; i < qp->rq.max; ++i) { + next = get_recv_wqe(qp, i); + next->nda_op = htonl((((i + 1) % qp->rq.max) << + qp->rq.wqe_shift) | 1); + } + + } + + qp->sq.last = get_send_wqe(qp, qp->sq.max - 1); + qp->rq.last = get_recv_wqe(qp, qp->rq.max - 1); + + return 0; +} + +static int mthca_set_qp_size(struct mthca_dev *dev, struct ib_qp_cap *cap, + struct mthca_pd *pd, struct mthca_qp *qp) +{ + int max_data_size = mthca_max_data_size(dev, qp, dev->limits.max_desc_sz); + + /* Sanity check QP size before proceeding */ + if (cap->max_send_wr > dev->limits.max_wqes || + cap->max_recv_wr > dev->limits.max_wqes || + cap->max_send_sge > dev->limits.max_sg || + cap->max_recv_sge > dev->limits.max_sg || + cap->max_inline_data > mthca_max_inline_data(pd, max_data_size)) + return -EINVAL; + + /* + * For MLX transport we need 2 extra send gather entries: + * one for the header and one for the checksum at the end + */ + if (qp->transport == MLX && cap->max_send_sge + 2 > dev->limits.max_sg) + return -EINVAL; + + if (mthca_is_memfree(dev)) { + qp->rq.max = cap->max_recv_wr ? + roundup_pow_of_two(cap->max_recv_wr) : 0; + qp->sq.max = cap->max_send_wr ? + roundup_pow_of_two(cap->max_send_wr) : 0; + } else { + qp->rq.max = cap->max_recv_wr; + qp->sq.max = cap->max_send_wr; + } + + qp->rq.max_gs = cap->max_recv_sge; + qp->sq.max_gs = max_t(int, cap->max_send_sge, + ALIGN(cap->max_inline_data + MTHCA_INLINE_HEADER_SIZE, + MTHCA_INLINE_CHUNK_SIZE) / + sizeof (struct mthca_data_seg)); + + return 0; +} + +int mthca_alloc_qp(struct mthca_dev *dev, + struct mthca_pd *pd, + struct mthca_cq *send_cq, + struct mthca_cq *recv_cq, + enum ib_qp_type type, + enum ib_sig_type send_policy, + struct ib_qp_cap *cap, + struct mthca_qp *qp) +{ + int err; + + switch (type) { + case IB_QPT_RC: qp->transport = RC; break; + case IB_QPT_UC: qp->transport = UC; break; + case IB_QPT_UD: qp->transport = UD; break; + default: return -EINVAL; + } + + err = mthca_set_qp_size(dev, cap, pd, qp); + if (err) + return err; + + qp->qpn = mthca_alloc(&dev->qp_table.alloc); + if (qp->qpn == -1) + return -ENOMEM; + + /* initialize port to zero for error-catching. 
*/ + qp->port = 0; + + err = mthca_alloc_qp_common(dev, pd, send_cq, recv_cq, + send_policy, qp); + if (err) { + mthca_free(&dev->qp_table.alloc, qp->qpn); + return err; + } + + spin_lock_irq(&dev->qp_table.lock); + mthca_array_set(&dev->qp_table.qp, + qp->qpn & (dev->limits.num_qps - 1), qp); + spin_unlock_irq(&dev->qp_table.lock); + + return 0; +} + +static void mthca_lock_cqs(struct mthca_cq *send_cq, struct mthca_cq *recv_cq) + __acquires(&send_cq->lock) __acquires(&recv_cq->lock) +{ + if (send_cq == recv_cq) { + spin_lock_irq(&send_cq->lock); + __acquire(&recv_cq->lock); + } else if (send_cq->cqn < recv_cq->cqn) { + spin_lock_irq(&send_cq->lock); + spin_lock_nested(&recv_cq->lock, SINGLE_DEPTH_NESTING); + } else { + spin_lock_irq(&recv_cq->lock); + spin_lock_nested(&send_cq->lock, SINGLE_DEPTH_NESTING); + } +} + +static void mthca_unlock_cqs(struct mthca_cq *send_cq, struct mthca_cq *recv_cq) + __releases(&send_cq->lock) __releases(&recv_cq->lock) +{ + if (send_cq == recv_cq) { + __release(&recv_cq->lock); + spin_unlock_irq(&send_cq->lock); + } else if (send_cq->cqn < recv_cq->cqn) { + spin_unlock(&recv_cq->lock); + spin_unlock_irq(&send_cq->lock); + } else { + spin_unlock(&send_cq->lock); + spin_unlock_irq(&recv_cq->lock); + } +} + +int mthca_alloc_sqp(struct mthca_dev *dev, + struct mthca_pd *pd, + struct mthca_cq *send_cq, + struct mthca_cq *recv_cq, + enum ib_sig_type send_policy, + struct ib_qp_cap *cap, + int qpn, + int port, + struct mthca_sqp *sqp) +{ + u32 mqpn = qpn * 2 + dev->qp_table.sqp_start + port - 1; + int err; + + sqp->qp.transport = MLX; + err = mthca_set_qp_size(dev, cap, pd, &sqp->qp); + if (err) + return err; + + sqp->header_buf_size = sqp->qp.sq.max * MTHCA_UD_HEADER_SIZE; + sqp->header_buf = dma_alloc_coherent(&dev->pdev->dev, sqp->header_buf_size, + &sqp->header_dma, GFP_KERNEL); + if (!sqp->header_buf) + return -ENOMEM; + + spin_lock_irq(&dev->qp_table.lock); + if (mthca_array_get(&dev->qp_table.qp, mqpn)) + err = -EBUSY; + else + mthca_array_set(&dev->qp_table.qp, mqpn, sqp); + spin_unlock_irq(&dev->qp_table.lock); + + if (err) + goto err_out; + + sqp->qp.port = port; + sqp->qp.qpn = mqpn; + sqp->qp.transport = MLX; + + err = mthca_alloc_qp_common(dev, pd, send_cq, recv_cq, + send_policy, &sqp->qp); + if (err) + goto err_out_free; + + atomic_inc(&pd->sqp_count); + + return 0; + + err_out_free: + /* + * Lock CQs here, so that CQ polling code can do QP lookup + * without taking a lock. + */ + mthca_lock_cqs(send_cq, recv_cq); + + spin_lock(&dev->qp_table.lock); + mthca_array_clear(&dev->qp_table.qp, mqpn); + spin_unlock(&dev->qp_table.lock); + + mthca_unlock_cqs(send_cq, recv_cq); + + err_out: + dma_free_coherent(&dev->pdev->dev, sqp->header_buf_size, + sqp->header_buf, sqp->header_dma); + + return err; +} + +static inline int get_qp_refcount(struct mthca_dev *dev, struct mthca_qp *qp) +{ + int c; + + spin_lock_irq(&dev->qp_table.lock); + c = qp->refcount; + spin_unlock_irq(&dev->qp_table.lock); + + return c; +} + +void mthca_free_qp(struct mthca_dev *dev, + struct mthca_qp *qp) +{ + struct mthca_cq *send_cq; + struct mthca_cq *recv_cq; + + send_cq = to_mcq(qp->ibqp.send_cq); + recv_cq = to_mcq(qp->ibqp.recv_cq); + + /* + * Lock CQs here, so that CQ polling code can do QP lookup + * without taking a lock. 
+ */ + mthca_lock_cqs(send_cq, recv_cq); + + spin_lock(&dev->qp_table.lock); + mthca_array_clear(&dev->qp_table.qp, + qp->qpn & (dev->limits.num_qps - 1)); + --qp->refcount; + spin_unlock(&dev->qp_table.lock); + + mthca_unlock_cqs(send_cq, recv_cq); + + wait_event(qp->wait, !get_qp_refcount(dev, qp)); + + if (qp->state != IB_QPS_RESET) + mthca_MODIFY_QP(dev, qp->state, IB_QPS_RESET, qp->qpn, 0, + NULL, 0); + + /* + * If this is a userspace QP, the buffers, MR, CQs and so on + * will be cleaned up in userspace, so all we have to do is + * unref the mem-free tables and free the QPN in our table. + */ + if (!qp->ibqp.uobject) { + mthca_cq_clean(dev, recv_cq, qp->qpn, + qp->ibqp.srq ? to_msrq(qp->ibqp.srq) : NULL); + if (send_cq != recv_cq) + mthca_cq_clean(dev, send_cq, qp->qpn, NULL); + + mthca_free_memfree(dev, qp); + mthca_free_wqe_buf(dev, qp); + } + + mthca_unmap_memfree(dev, qp); + + if (is_sqp(dev, qp)) { + atomic_dec(&(to_mpd(qp->ibqp.pd)->sqp_count)); + dma_free_coherent(&dev->pdev->dev, + to_msqp(qp)->header_buf_size, + to_msqp(qp)->header_buf, + to_msqp(qp)->header_dma); + } else + mthca_free(&dev->qp_table.alloc, qp->qpn); +} + +/* Create UD header for an MLX send and build a data segment for it */ +static int build_mlx_header(struct mthca_dev *dev, struct mthca_sqp *sqp, + int ind, struct ib_send_wr *wr, + struct mthca_mlx_seg *mlx, + struct mthca_data_seg *data) +{ + int header_size; + int err; + u16 pkey; + + ib_ud_header_init(256, /* assume a MAD */ 1, 0, 0, + mthca_ah_grh_present(to_mah(wr->wr.ud.ah)), 0, + &sqp->ud_header); + + err = mthca_read_ah(dev, to_mah(wr->wr.ud.ah), &sqp->ud_header); + if (err) + return err; + mlx->flags &= ~cpu_to_be32(MTHCA_NEXT_SOLICIT | 1); + mlx->flags |= cpu_to_be32((!sqp->qp.ibqp.qp_num ? MTHCA_MLX_VL15 : 0) | + (sqp->ud_header.lrh.destination_lid == + IB_LID_PERMISSIVE ? MTHCA_MLX_SLR : 0) | + (sqp->ud_header.lrh.service_level << 8)); + mlx->rlid = sqp->ud_header.lrh.destination_lid; + mlx->vcrc = 0; + + switch (wr->opcode) { + case IB_WR_SEND: + sqp->ud_header.bth.opcode = IB_OPCODE_UD_SEND_ONLY; + sqp->ud_header.immediate_present = 0; + break; + case IB_WR_SEND_WITH_IMM: + sqp->ud_header.bth.opcode = IB_OPCODE_UD_SEND_ONLY_WITH_IMMEDIATE; + sqp->ud_header.immediate_present = 1; + sqp->ud_header.immediate_data = wr->ex.imm_data; + break; + default: + return -EINVAL; + } + + sqp->ud_header.lrh.virtual_lane = !sqp->qp.ibqp.qp_num ? 15 : 0; + if (sqp->ud_header.lrh.destination_lid == IB_LID_PERMISSIVE) + sqp->ud_header.lrh.source_lid = IB_LID_PERMISSIVE; + sqp->ud_header.bth.solicited_event = !!(wr->send_flags & IB_SEND_SOLICITED); + if (!sqp->qp.ibqp.qp_num) + ib_get_cached_pkey(&dev->ib_dev, sqp->qp.port, + sqp->pkey_index, &pkey); + else + ib_get_cached_pkey(&dev->ib_dev, sqp->qp.port, + wr->wr.ud.pkey_index, &pkey); + sqp->ud_header.bth.pkey = cpu_to_be16(pkey); + sqp->ud_header.bth.destination_qpn = cpu_to_be32(wr->wr.ud.remote_qpn); + sqp->ud_header.bth.psn = cpu_to_be32((sqp->send_psn++) & ((1 << 24) - 1)); + sqp->ud_header.deth.qkey = cpu_to_be32(wr->wr.ud.remote_qkey & 0x80000000 ? 
+ sqp->qkey : wr->wr.ud.remote_qkey); + sqp->ud_header.deth.source_qpn = cpu_to_be32(sqp->qp.ibqp.qp_num); + + header_size = ib_ud_header_pack(&sqp->ud_header, + sqp->header_buf + + ind * MTHCA_UD_HEADER_SIZE); + + data->byte_count = cpu_to_be32(header_size); + data->lkey = cpu_to_be32(to_mpd(sqp->qp.ibqp.pd)->ntmr.ibmr.lkey); + data->addr = cpu_to_be64(sqp->header_dma + + ind * MTHCA_UD_HEADER_SIZE); + + return 0; +} + +static inline int mthca_wq_overflow(struct mthca_wq *wq, int nreq, + struct ib_cq *ib_cq) +{ + unsigned cur; + struct mthca_cq *cq; + + cur = wq->head - wq->tail; + if (likely(cur + nreq < wq->max)) + return 0; + + cq = to_mcq(ib_cq); + spin_lock(&cq->lock); + cur = wq->head - wq->tail; + spin_unlock(&cq->lock); + + return cur + nreq >= wq->max; +} + +static __always_inline void set_raddr_seg(struct mthca_raddr_seg *rseg, + u64 remote_addr, u32 rkey) +{ + rseg->raddr = cpu_to_be64(remote_addr); + rseg->rkey = cpu_to_be32(rkey); + rseg->reserved = 0; +} + +static __always_inline void set_atomic_seg(struct mthca_atomic_seg *aseg, + struct ib_send_wr *wr) +{ + if (wr->opcode == IB_WR_ATOMIC_CMP_AND_SWP) { + aseg->swap_add = cpu_to_be64(wr->wr.atomic.swap); + aseg->compare = cpu_to_be64(wr->wr.atomic.compare_add); + } else { + aseg->swap_add = cpu_to_be64(wr->wr.atomic.compare_add); + aseg->compare = 0; + } + +} + +static void set_tavor_ud_seg(struct mthca_tavor_ud_seg *useg, + struct ib_send_wr *wr) +{ + useg->lkey = cpu_to_be32(to_mah(wr->wr.ud.ah)->key); + useg->av_addr = cpu_to_be64(to_mah(wr->wr.ud.ah)->avdma); + useg->dqpn = cpu_to_be32(wr->wr.ud.remote_qpn); + useg->qkey = cpu_to_be32(wr->wr.ud.remote_qkey); + +} + +static void set_arbel_ud_seg(struct mthca_arbel_ud_seg *useg, + struct ib_send_wr *wr) +{ + memcpy(useg->av, to_mah(wr->wr.ud.ah)->av, MTHCA_AV_SIZE); + useg->dqpn = cpu_to_be32(wr->wr.ud.remote_qpn); + useg->qkey = cpu_to_be32(wr->wr.ud.remote_qkey); +} + +int mthca_tavor_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr, + struct ib_send_wr **bad_wr) +{ + struct mthca_dev *dev = to_mdev(ibqp->device); + struct mthca_qp *qp = to_mqp(ibqp); + void *wqe; + void *prev_wqe; + unsigned long flags; + int err = 0; + int nreq; + int i; + int size; + /* + * f0 and size0 are only used if nreq != 0, and they will + * always be initialized the first time through the main loop + * before nreq is incremented. So nreq cannot become non-zero + * without initializing f0 and size0, and they are in fact + * never used uninitialized. + */ + int uninitialized_var(size0); + u32 uninitialized_var(f0); + int ind; + u8 op0 = 0; + + spin_lock_irqsave(&qp->sq.lock, flags); + + /* XXX check that state is OK to post send */ + + ind = qp->sq.next_ind; + + for (nreq = 0; wr; ++nreq, wr = wr->next) { + if (mthca_wq_overflow(&qp->sq, nreq, qp->ibqp.send_cq)) { + mthca_err(dev, "SQ %06x full (%u head, %u tail," + " %d max, %d nreq)\n", qp->qpn, + qp->sq.head, qp->sq.tail, + qp->sq.max, nreq); + err = -ENOMEM; + *bad_wr = wr; + goto out; + } + + wqe = get_send_wqe(qp, ind); + prev_wqe = qp->sq.last; + qp->sq.last = wqe; + + ((struct mthca_next_seg *) wqe)->nda_op = 0; + ((struct mthca_next_seg *) wqe)->ee_nds = 0; + ((struct mthca_next_seg *) wqe)->flags = + ((wr->send_flags & IB_SEND_SIGNALED) ? + cpu_to_be32(MTHCA_NEXT_CQ_UPDATE) : 0) | + ((wr->send_flags & IB_SEND_SOLICITED) ? 
+ cpu_to_be32(MTHCA_NEXT_SOLICIT) : 0) | + cpu_to_be32(1); + if (wr->opcode == IB_WR_SEND_WITH_IMM || + wr->opcode == IB_WR_RDMA_WRITE_WITH_IMM) + ((struct mthca_next_seg *) wqe)->imm = wr->ex.imm_data; + + wqe += sizeof (struct mthca_next_seg); + size = sizeof (struct mthca_next_seg) / 16; + + switch (qp->transport) { + case RC: + switch (wr->opcode) { + case IB_WR_ATOMIC_CMP_AND_SWP: + case IB_WR_ATOMIC_FETCH_AND_ADD: + set_raddr_seg(wqe, wr->wr.atomic.remote_addr, + wr->wr.atomic.rkey); + wqe += sizeof (struct mthca_raddr_seg); + + set_atomic_seg(wqe, wr); + wqe += sizeof (struct mthca_atomic_seg); + size += (sizeof (struct mthca_raddr_seg) + + sizeof (struct mthca_atomic_seg)) / 16; + break; + + case IB_WR_RDMA_WRITE: + case IB_WR_RDMA_WRITE_WITH_IMM: + case IB_WR_RDMA_READ: + set_raddr_seg(wqe, wr->wr.rdma.remote_addr, + wr->wr.rdma.rkey); + wqe += sizeof (struct mthca_raddr_seg); + size += sizeof (struct mthca_raddr_seg) / 16; + break; + + default: + /* No extra segments required for sends */ + break; + } + + break; + + case UC: + switch (wr->opcode) { + case IB_WR_RDMA_WRITE: + case IB_WR_RDMA_WRITE_WITH_IMM: + set_raddr_seg(wqe, wr->wr.rdma.remote_addr, + wr->wr.rdma.rkey); + wqe += sizeof (struct mthca_raddr_seg); + size += sizeof (struct mthca_raddr_seg) / 16; + break; + + default: + /* No extra segments required for sends */ + break; + } + + break; + + case UD: + set_tavor_ud_seg(wqe, wr); + wqe += sizeof (struct mthca_tavor_ud_seg); + size += sizeof (struct mthca_tavor_ud_seg) / 16; + break; + + case MLX: + err = build_mlx_header(dev, to_msqp(qp), ind, wr, + wqe - sizeof (struct mthca_next_seg), + wqe); + if (err) { + *bad_wr = wr; + goto out; + } + wqe += sizeof (struct mthca_data_seg); + size += sizeof (struct mthca_data_seg) / 16; + break; + } + + if (wr->num_sge > qp->sq.max_gs) { + mthca_err(dev, "too many gathers\n"); + err = -EINVAL; + *bad_wr = wr; + goto out; + } + + for (i = 0; i < wr->num_sge; ++i) { + mthca_set_data_seg(wqe, wr->sg_list + i); + wqe += sizeof (struct mthca_data_seg); + size += sizeof (struct mthca_data_seg) / 16; + } + + /* Add one more inline data segment for ICRC */ + if (qp->transport == MLX) { + ((struct mthca_data_seg *) wqe)->byte_count = + cpu_to_be32((1 << 31) | 4); + ((u32 *) wqe)[1] = 0; + wqe += sizeof (struct mthca_data_seg); + size += sizeof (struct mthca_data_seg) / 16; + } + + qp->wrid[ind + qp->rq.max] = wr->wr_id; + + if (wr->opcode >= ARRAY_SIZE(mthca_opcode)) { + mthca_err(dev, "opcode invalid\n"); + err = -EINVAL; + *bad_wr = wr; + goto out; + } + + ((struct mthca_next_seg *) prev_wqe)->nda_op = + cpu_to_be32(((ind << qp->sq.wqe_shift) + + qp->send_wqe_offset) | + mthca_opcode[wr->opcode]); + wmb(); + ((struct mthca_next_seg *) prev_wqe)->ee_nds = + cpu_to_be32((nreq ? 0 : MTHCA_NEXT_DBD) | size | + ((wr->send_flags & IB_SEND_FENCE) ? + MTHCA_NEXT_FENCE : 0)); + + if (!nreq) { + size0 = size; + op0 = mthca_opcode[wr->opcode]; + f0 = wr->send_flags & IB_SEND_FENCE ? 
+ MTHCA_SEND_DOORBELL_FENCE : 0; + } + + ++ind; + if (unlikely(ind >= qp->sq.max)) + ind -= qp->sq.max; + } + +out: + if (likely(nreq)) { + wmb(); + + mthca_write64(((qp->sq.next_ind << qp->sq.wqe_shift) + + qp->send_wqe_offset) | f0 | op0, + (qp->qpn << 8) | size0, + dev->kar + MTHCA_SEND_DOORBELL, + MTHCA_GET_DOORBELL_LOCK(&dev->doorbell_lock)); + /* + * Make sure doorbells don't leak out of SQ spinlock + * and reach the HCA out of order: + */ + mmiowb(); + } + + qp->sq.next_ind = ind; + qp->sq.head += nreq; + + spin_unlock_irqrestore(&qp->sq.lock, flags); + return err; +} + +int mthca_tavor_post_receive(struct ib_qp *ibqp, struct ib_recv_wr *wr, + struct ib_recv_wr **bad_wr) +{ + struct mthca_dev *dev = to_mdev(ibqp->device); + struct mthca_qp *qp = to_mqp(ibqp); + unsigned long flags; + int err = 0; + int nreq; + int i; + int size; + /* + * size0 is only used if nreq != 0, and it will always be + * initialized the first time through the main loop before + * nreq is incremented. So nreq cannot become non-zero + * without initializing size0, and it is in fact never used + * uninitialized. + */ + int uninitialized_var(size0); + int ind; + void *wqe; + void *prev_wqe; + + spin_lock_irqsave(&qp->rq.lock, flags); + + /* XXX check that state is OK to post receive */ + + ind = qp->rq.next_ind; + + for (nreq = 0; wr; wr = wr->next) { + if (mthca_wq_overflow(&qp->rq, nreq, qp->ibqp.recv_cq)) { + mthca_err(dev, "RQ %06x full (%u head, %u tail," + " %d max, %d nreq)\n", qp->qpn, + qp->rq.head, qp->rq.tail, + qp->rq.max, nreq); + err = -ENOMEM; + *bad_wr = wr; + goto out; + } + + wqe = get_recv_wqe(qp, ind); + prev_wqe = qp->rq.last; + qp->rq.last = wqe; + + ((struct mthca_next_seg *) wqe)->ee_nds = + cpu_to_be32(MTHCA_NEXT_DBD); + ((struct mthca_next_seg *) wqe)->flags = 0; + + wqe += sizeof (struct mthca_next_seg); + size = sizeof (struct mthca_next_seg) / 16; + + if (unlikely(wr->num_sge > qp->rq.max_gs)) { + err = -EINVAL; + *bad_wr = wr; + goto out; + } + + for (i = 0; i < wr->num_sge; ++i) { + mthca_set_data_seg(wqe, wr->sg_list + i); + wqe += sizeof (struct mthca_data_seg); + size += sizeof (struct mthca_data_seg) / 16; + } + + qp->wrid[ind] = wr->wr_id; + + ((struct mthca_next_seg *) prev_wqe)->ee_nds = + cpu_to_be32(MTHCA_NEXT_DBD | size); + + if (!nreq) + size0 = size; + + ++ind; + if (unlikely(ind >= qp->rq.max)) + ind -= qp->rq.max; + + ++nreq; + if (unlikely(nreq == MTHCA_TAVOR_MAX_WQES_PER_RECV_DB)) { + nreq = 0; + + wmb(); + + mthca_write64((qp->rq.next_ind << qp->rq.wqe_shift) | size0, + qp->qpn << 8, dev->kar + MTHCA_RECEIVE_DOORBELL, + MTHCA_GET_DOORBELL_LOCK(&dev->doorbell_lock)); + + qp->rq.next_ind = ind; + qp->rq.head += MTHCA_TAVOR_MAX_WQES_PER_RECV_DB; + } + } + +out: + if (likely(nreq)) { + wmb(); + + mthca_write64((qp->rq.next_ind << qp->rq.wqe_shift) | size0, + qp->qpn << 8 | nreq, dev->kar + MTHCA_RECEIVE_DOORBELL, + MTHCA_GET_DOORBELL_LOCK(&dev->doorbell_lock)); + } + + qp->rq.next_ind = ind; + qp->rq.head += nreq; + + /* + * Make sure doorbells don't leak out of RQ spinlock and reach + * the HCA out of order: + */ + mmiowb(); + + spin_unlock_irqrestore(&qp->rq.lock, flags); + return err; +} + +int mthca_arbel_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr, + struct ib_send_wr **bad_wr) +{ + struct mthca_dev *dev = to_mdev(ibqp->device); + struct mthca_qp *qp = to_mqp(ibqp); + u32 dbhi; + void *wqe; + void *prev_wqe; + unsigned long flags; + int err = 0; + int nreq; + int i; + int size; + /* + * f0 and size0 are only used if nreq != 0, and they will + * always be 
initialized the first time through the main loop + * before nreq is incremented. So nreq cannot become non-zero + * without initializing f0 and size0, and they are in fact + * never used uninitialized. + */ + int uninitialized_var(size0); + u32 uninitialized_var(f0); + int ind; + u8 op0 = 0; + + spin_lock_irqsave(&qp->sq.lock, flags); + + /* XXX check that state is OK to post send */ + + ind = qp->sq.head & (qp->sq.max - 1); + + for (nreq = 0; wr; ++nreq, wr = wr->next) { + if (unlikely(nreq == MTHCA_ARBEL_MAX_WQES_PER_SEND_DB)) { + nreq = 0; + + dbhi = (MTHCA_ARBEL_MAX_WQES_PER_SEND_DB << 24) | + ((qp->sq.head & 0xffff) << 8) | f0 | op0; + + qp->sq.head += MTHCA_ARBEL_MAX_WQES_PER_SEND_DB; + + /* + * Make sure that descriptors are written before + * doorbell record. + */ + wmb(); + *qp->sq.db = cpu_to_be32(qp->sq.head & 0xffff); + + /* + * Make sure doorbell record is written before we + * write MMIO send doorbell. + */ + wmb(); + + mthca_write64(dbhi, (qp->qpn << 8) | size0, + dev->kar + MTHCA_SEND_DOORBELL, + MTHCA_GET_DOORBELL_LOCK(&dev->doorbell_lock)); + } + + if (mthca_wq_overflow(&qp->sq, nreq, qp->ibqp.send_cq)) { + mthca_err(dev, "SQ %06x full (%u head, %u tail," + " %d max, %d nreq)\n", qp->qpn, + qp->sq.head, qp->sq.tail, + qp->sq.max, nreq); + err = -ENOMEM; + *bad_wr = wr; + goto out; + } + + wqe = get_send_wqe(qp, ind); + prev_wqe = qp->sq.last; + qp->sq.last = wqe; + + ((struct mthca_next_seg *) wqe)->flags = + ((wr->send_flags & IB_SEND_SIGNALED) ? + cpu_to_be32(MTHCA_NEXT_CQ_UPDATE) : 0) | + ((wr->send_flags & IB_SEND_SOLICITED) ? + cpu_to_be32(MTHCA_NEXT_SOLICIT) : 0) | + ((wr->send_flags & IB_SEND_IP_CSUM) ? + cpu_to_be32(MTHCA_NEXT_IP_CSUM | MTHCA_NEXT_TCP_UDP_CSUM) : 0) | + cpu_to_be32(1); + if (wr->opcode == IB_WR_SEND_WITH_IMM || + wr->opcode == IB_WR_RDMA_WRITE_WITH_IMM) + ((struct mthca_next_seg *) wqe)->imm = wr->ex.imm_data; + + wqe += sizeof (struct mthca_next_seg); + size = sizeof (struct mthca_next_seg) / 16; + + switch (qp->transport) { + case RC: + switch (wr->opcode) { + case IB_WR_ATOMIC_CMP_AND_SWP: + case IB_WR_ATOMIC_FETCH_AND_ADD: + set_raddr_seg(wqe, wr->wr.atomic.remote_addr, + wr->wr.atomic.rkey); + wqe += sizeof (struct mthca_raddr_seg); + + set_atomic_seg(wqe, wr); + wqe += sizeof (struct mthca_atomic_seg); + size += (sizeof (struct mthca_raddr_seg) + + sizeof (struct mthca_atomic_seg)) / 16; + break; + + case IB_WR_RDMA_READ: + case IB_WR_RDMA_WRITE: + case IB_WR_RDMA_WRITE_WITH_IMM: + set_raddr_seg(wqe, wr->wr.rdma.remote_addr, + wr->wr.rdma.rkey); + wqe += sizeof (struct mthca_raddr_seg); + size += sizeof (struct mthca_raddr_seg) / 16; + break; + + default: + /* No extra segments required for sends */ + break; + } + + break; + + case UC: + switch (wr->opcode) { + case IB_WR_RDMA_WRITE: + case IB_WR_RDMA_WRITE_WITH_IMM: + set_raddr_seg(wqe, wr->wr.rdma.remote_addr, + wr->wr.rdma.rkey); + wqe += sizeof (struct mthca_raddr_seg); + size += sizeof (struct mthca_raddr_seg) / 16; + break; + + default: + /* No extra segments required for sends */ + break; + } + + break; + + case UD: + set_arbel_ud_seg(wqe, wr); + wqe += sizeof (struct mthca_arbel_ud_seg); + size += sizeof (struct mthca_arbel_ud_seg) / 16; + break; + + case MLX: + err = build_mlx_header(dev, to_msqp(qp), ind, wr, + wqe - sizeof (struct mthca_next_seg), + wqe); + if (err) { + *bad_wr = wr; + goto out; + } + wqe += sizeof (struct mthca_data_seg); + size += sizeof (struct mthca_data_seg) / 16; + break; + } + + if (wr->num_sge > qp->sq.max_gs) { + mthca_err(dev, "too many gathers\n"); + 
err = -EINVAL; + *bad_wr = wr; + goto out; + } + + for (i = 0; i < wr->num_sge; ++i) { + mthca_set_data_seg(wqe, wr->sg_list + i); + wqe += sizeof (struct mthca_data_seg); + size += sizeof (struct mthca_data_seg) / 16; + } + + /* Add one more inline data segment for ICRC */ + if (qp->transport == MLX) { + ((struct mthca_data_seg *) wqe)->byte_count = + cpu_to_be32((1 << 31) | 4); + ((u32 *) wqe)[1] = 0; + wqe += sizeof (struct mthca_data_seg); + size += sizeof (struct mthca_data_seg) / 16; + } + + qp->wrid[ind + qp->rq.max] = wr->wr_id; + + if (wr->opcode >= ARRAY_SIZE(mthca_opcode)) { + mthca_err(dev, "opcode invalid\n"); + err = -EINVAL; + *bad_wr = wr; + goto out; + } + + ((struct mthca_next_seg *) prev_wqe)->nda_op = + cpu_to_be32(((ind << qp->sq.wqe_shift) + + qp->send_wqe_offset) | + mthca_opcode[wr->opcode]); + wmb(); + ((struct mthca_next_seg *) prev_wqe)->ee_nds = + cpu_to_be32(MTHCA_NEXT_DBD | size | + ((wr->send_flags & IB_SEND_FENCE) ? + MTHCA_NEXT_FENCE : 0)); + + if (!nreq) { + size0 = size; + op0 = mthca_opcode[wr->opcode]; + f0 = wr->send_flags & IB_SEND_FENCE ? + MTHCA_SEND_DOORBELL_FENCE : 0; + } + + ++ind; + if (unlikely(ind >= qp->sq.max)) + ind -= qp->sq.max; + } + +out: + if (likely(nreq)) { + dbhi = (nreq << 24) | ((qp->sq.head & 0xffff) << 8) | f0 | op0; + + qp->sq.head += nreq; + + /* + * Make sure that descriptors are written before + * doorbell record. + */ + wmb(); + *qp->sq.db = cpu_to_be32(qp->sq.head & 0xffff); + + /* + * Make sure doorbell record is written before we + * write MMIO send doorbell. + */ + wmb(); + + mthca_write64(dbhi, (qp->qpn << 8) | size0, dev->kar + MTHCA_SEND_DOORBELL, + MTHCA_GET_DOORBELL_LOCK(&dev->doorbell_lock)); + } + + /* + * Make sure doorbells don't leak out of SQ spinlock and reach + * the HCA out of order: + */ + mmiowb(); + + spin_unlock_irqrestore(&qp->sq.lock, flags); + return err; +} + +int mthca_arbel_post_receive(struct ib_qp *ibqp, struct ib_recv_wr *wr, + struct ib_recv_wr **bad_wr) +{ + struct mthca_dev *dev = to_mdev(ibqp->device); + struct mthca_qp *qp = to_mqp(ibqp); + unsigned long flags; + int err = 0; + int nreq; + int ind; + int i; + void *wqe; + + spin_lock_irqsave(&qp->rq.lock, flags); + + /* XXX check that state is OK to post receive */ + + ind = qp->rq.head & (qp->rq.max - 1); + + for (nreq = 0; wr; ++nreq, wr = wr->next) { + if (mthca_wq_overflow(&qp->rq, nreq, qp->ibqp.recv_cq)) { + mthca_err(dev, "RQ %06x full (%u head, %u tail," + " %d max, %d nreq)\n", qp->qpn, + qp->rq.head, qp->rq.tail, + qp->rq.max, nreq); + err = -ENOMEM; + *bad_wr = wr; + goto out; + } + + wqe = get_recv_wqe(qp, ind); + + ((struct mthca_next_seg *) wqe)->flags = 0; + + wqe += sizeof (struct mthca_next_seg); + + if (unlikely(wr->num_sge > qp->rq.max_gs)) { + err = -EINVAL; + *bad_wr = wr; + goto out; + } + + for (i = 0; i < wr->num_sge; ++i) { + mthca_set_data_seg(wqe, wr->sg_list + i); + wqe += sizeof (struct mthca_data_seg); + } + + if (i < qp->rq.max_gs) + mthca_set_data_seg_inval(wqe); + + qp->wrid[ind] = wr->wr_id; + + ++ind; + if (unlikely(ind >= qp->rq.max)) + ind -= qp->rq.max; + } +out: + if (likely(nreq)) { + qp->rq.head += nreq; + + /* + * Make sure that descriptors are written before + * doorbell record. 
+ */ + wmb(); + *qp->rq.db = cpu_to_be32(qp->rq.head & 0xffff); + } + + spin_unlock_irqrestore(&qp->rq.lock, flags); + return err; +} + +void mthca_free_err_wqe(struct mthca_dev *dev, struct mthca_qp *qp, int is_send, + int index, int *dbd, __be32 *new_wqe) +{ + struct mthca_next_seg *next; + + /* + * For SRQs, all receive WQEs generate a CQE, so we're always + * at the end of the doorbell chain. + */ + if (qp->ibqp.srq && !is_send) { + *new_wqe = 0; + return; + } + + if (is_send) + next = get_send_wqe(qp, index); + else + next = get_recv_wqe(qp, index); + + *dbd = !!(next->ee_nds & cpu_to_be32(MTHCA_NEXT_DBD)); + if (next->ee_nds & cpu_to_be32(0x3f)) + *new_wqe = (next->nda_op & cpu_to_be32(~0x3f)) | + (next->ee_nds & cpu_to_be32(0x3f)); + else + *new_wqe = 0; +} + +int mthca_init_qp_table(struct mthca_dev *dev) +{ + int err; + int i; + + spin_lock_init(&dev->qp_table.lock); + + /* + * We reserve 2 extra QPs per port for the special QPs. The + * special QP for port 1 has to be even, so round up. + */ + dev->qp_table.sqp_start = (dev->limits.reserved_qps + 1) & ~1UL; + err = mthca_alloc_init(&dev->qp_table.alloc, + dev->limits.num_qps, + (1 << 24) - 1, + dev->qp_table.sqp_start + + MTHCA_MAX_PORTS * 2); + if (err) + return err; + + err = mthca_array_init(&dev->qp_table.qp, + dev->limits.num_qps); + if (err) { + mthca_alloc_cleanup(&dev->qp_table.alloc); + return err; + } + + for (i = 0; i < 2; ++i) { + err = mthca_CONF_SPECIAL_QP(dev, i ? IB_QPT_GSI : IB_QPT_SMI, + dev->qp_table.sqp_start + i * 2); + if (err) { + mthca_warn(dev, "CONF_SPECIAL_QP returned " + "%d, aborting.\n", err); + goto err_out; + } + } + return 0; + + err_out: + for (i = 0; i < 2; ++i) + mthca_CONF_SPECIAL_QP(dev, i, 0); + + mthca_array_cleanup(&dev->qp_table.qp, dev->limits.num_qps); + mthca_alloc_cleanup(&dev->qp_table.alloc); + + return err; +} + +void mthca_cleanup_qp_table(struct mthca_dev *dev) +{ + int i; + + for (i = 0; i < 2; ++i) + mthca_CONF_SPECIAL_QP(dev, i, 0); + + mthca_array_cleanup(&dev->qp_table.qp, dev->limits.num_qps); + mthca_alloc_cleanup(&dev->qp_table.alloc); +} diff --git a/kernel/drivers/infiniband/hw/mthca/mthca_reset.c b/kernel/drivers/infiniband/hw/mthca/mthca_reset.c new file mode 100644 index 000000000..74c6a9426 --- /dev/null +++ b/kernel/drivers/infiniband/hw/mthca/mthca_reset.c @@ -0,0 +1,288 @@ +/* + * Copyright (c) 2004 Topspin Communications. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include +#include +#include +#include + +#include "mthca_dev.h" +#include "mthca_cmd.h" + +int mthca_reset(struct mthca_dev *mdev) +{ + int i; + int err = 0; + u32 *hca_header = NULL; + u32 *bridge_header = NULL; + struct pci_dev *bridge = NULL; + int bridge_pcix_cap = 0; + int hca_pcie_cap = 0; + int hca_pcix_cap = 0; + + u16 devctl; + u16 linkctl; + +#define MTHCA_RESET_OFFSET 0xf0010 +#define MTHCA_RESET_VALUE swab32(1) + + /* + * Reset the chip. This is somewhat ugly because we have to + * save off the PCI header before reset and then restore it + * after the chip reboots. We skip config space offsets 22 + * and 23 since those have a special meaning. + * + * To make matters worse, for Tavor (PCI-X HCA) we have to + * find the associated bridge device and save off its PCI + * header as well. + */ + + if (!(mdev->mthca_flags & MTHCA_FLAG_PCIE)) { + /* Look for the bridge -- its device ID will be 2 more + than HCA's device ID. */ + while ((bridge = pci_get_device(mdev->pdev->vendor, + mdev->pdev->device + 2, + bridge)) != NULL) { + if (bridge->hdr_type == PCI_HEADER_TYPE_BRIDGE && + bridge->subordinate == mdev->pdev->bus) { + mthca_dbg(mdev, "Found bridge: %s\n", + pci_name(bridge)); + break; + } + } + + if (!bridge) { + /* + * Didn't find a bridge for a Tavor device -- + * assume we're in no-bridge mode and hope for + * the best. + */ + mthca_warn(mdev, "No bridge found for %s\n", + pci_name(mdev->pdev)); + } + + } + + /* For Arbel do we need to save off the full 4K PCI Express header?? */ + hca_header = kmalloc(256, GFP_KERNEL); + if (!hca_header) { + err = -ENOMEM; + mthca_err(mdev, "Couldn't allocate memory to save HCA " + "PCI header, aborting.\n"); + goto out; + } + + for (i = 0; i < 64; ++i) { + if (i == 22 || i == 23) + continue; + if (pci_read_config_dword(mdev->pdev, i * 4, hca_header + i)) { + err = -ENODEV; + mthca_err(mdev, "Couldn't save HCA " + "PCI header, aborting.\n"); + goto out; + } + } + + hca_pcix_cap = pci_find_capability(mdev->pdev, PCI_CAP_ID_PCIX); + hca_pcie_cap = pci_pcie_cap(mdev->pdev); + + if (bridge) { + bridge_header = kmalloc(256, GFP_KERNEL); + if (!bridge_header) { + err = -ENOMEM; + mthca_err(mdev, "Couldn't allocate memory to save HCA " + "bridge PCI header, aborting.\n"); + goto out; + } + + for (i = 0; i < 64; ++i) { + if (i == 22 || i == 23) + continue; + if (pci_read_config_dword(bridge, i * 4, bridge_header + i)) { + err = -ENODEV; + mthca_err(mdev, "Couldn't save HCA bridge " + "PCI header, aborting.\n"); + goto out; + } + } + bridge_pcix_cap = pci_find_capability(bridge, PCI_CAP_ID_PCIX); + if (!bridge_pcix_cap) { + err = -ENODEV; + mthca_err(mdev, "Couldn't locate HCA bridge " + "PCI-X capability, aborting.\n"); + goto out; + } + } + + /* actually hit reset */ + { + void __iomem *reset = ioremap(pci_resource_start(mdev->pdev, 0) + + MTHCA_RESET_OFFSET, 4); + + if (!reset) { + err = -ENOMEM; + mthca_err(mdev, "Couldn't map HCA reset register, " + "aborting.\n"); + goto out; + } + + writel(MTHCA_RESET_VALUE, reset); + iounmap(reset); + } + + /* Docs say to wait one second before accessing device */ + msleep(1000); + + /* Now wait for PCI device to start responding again */ + { + u32 v; + int c = 0; + + for (c = 0; c < 100; ++c) { + if (pci_read_config_dword(bridge ? 
bridge : mdev->pdev, 0, &v)) { + err = -ENODEV; + mthca_err(mdev, "Couldn't access HCA after reset, " + "aborting.\n"); + goto out; + } + + if (v != 0xffffffff) + goto good; + + msleep(100); + } + + err = -ENODEV; + mthca_err(mdev, "PCI device did not come back after reset, " + "aborting.\n"); + goto out; + } + +good: + /* Now restore the PCI headers */ + if (bridge) { + if (pci_write_config_dword(bridge, bridge_pcix_cap + 0x8, + bridge_header[(bridge_pcix_cap + 0x8) / 4])) { + err = -ENODEV; + mthca_err(mdev, "Couldn't restore HCA bridge Upstream " + "split transaction control, aborting.\n"); + goto out; + } + if (pci_write_config_dword(bridge, bridge_pcix_cap + 0xc, + bridge_header[(bridge_pcix_cap + 0xc) / 4])) { + err = -ENODEV; + mthca_err(mdev, "Couldn't restore HCA bridge Downstream " + "split transaction control, aborting.\n"); + goto out; + } + /* + * Bridge control register is at 0x3e, so we'll + * naturally restore it last in this loop. + */ + for (i = 0; i < 16; ++i) { + if (i * 4 == PCI_COMMAND) + continue; + + if (pci_write_config_dword(bridge, i * 4, bridge_header[i])) { + err = -ENODEV; + mthca_err(mdev, "Couldn't restore HCA bridge reg %x, " + "aborting.\n", i); + goto out; + } + } + + if (pci_write_config_dword(bridge, PCI_COMMAND, + bridge_header[PCI_COMMAND / 4])) { + err = -ENODEV; + mthca_err(mdev, "Couldn't restore HCA bridge COMMAND, " + "aborting.\n"); + goto out; + } + } + + if (hca_pcix_cap) { + if (pci_write_config_dword(mdev->pdev, hca_pcix_cap, + hca_header[hca_pcix_cap / 4])) { + err = -ENODEV; + mthca_err(mdev, "Couldn't restore HCA PCI-X " + "command register, aborting.\n"); + goto out; + } + } + + if (hca_pcie_cap) { + devctl = hca_header[(hca_pcie_cap + PCI_EXP_DEVCTL) / 4]; + if (pcie_capability_write_word(mdev->pdev, PCI_EXP_DEVCTL, + devctl)) { + err = -ENODEV; + mthca_err(mdev, "Couldn't restore HCA PCI Express " + "Device Control register, aborting.\n"); + goto out; + } + linkctl = hca_header[(hca_pcie_cap + PCI_EXP_LNKCTL) / 4]; + if (pcie_capability_write_word(mdev->pdev, PCI_EXP_LNKCTL, + linkctl)) { + err = -ENODEV; + mthca_err(mdev, "Couldn't restore HCA PCI Express " + "Link control register, aborting.\n"); + goto out; + } + } + + for (i = 0; i < 16; ++i) { + if (i * 4 == PCI_COMMAND) + continue; + + if (pci_write_config_dword(mdev->pdev, i * 4, hca_header[i])) { + err = -ENODEV; + mthca_err(mdev, "Couldn't restore HCA reg %x, " + "aborting.\n", i); + goto out; + } + } + + if (pci_write_config_dword(mdev->pdev, PCI_COMMAND, + hca_header[PCI_COMMAND / 4])) { + err = -ENODEV; + mthca_err(mdev, "Couldn't restore HCA COMMAND, " + "aborting.\n"); + goto out; + } + +out: + if (bridge) + pci_dev_put(bridge); + kfree(bridge_header); + kfree(hca_header); + + return err; +} diff --git a/kernel/drivers/infiniband/hw/mthca/mthca_srq.c b/kernel/drivers/infiniband/hw/mthca/mthca_srq.c new file mode 100644 index 000000000..d22f97048 --- /dev/null +++ b/kernel/drivers/infiniband/hw/mthca/mthca_srq.c @@ -0,0 +1,696 @@ +/* + * Copyright (c) 2005 Cisco Systems. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. 
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include +#include +#include + +#include + +#include "mthca_dev.h" +#include "mthca_cmd.h" +#include "mthca_memfree.h" +#include "mthca_wqe.h" + +enum { + MTHCA_MAX_DIRECT_SRQ_SIZE = 4 * PAGE_SIZE +}; + +struct mthca_tavor_srq_context { + __be64 wqe_base_ds; /* low 6 bits is descriptor size */ + __be32 state_pd; + __be32 lkey; + __be32 uar; + __be16 limit_watermark; + __be16 wqe_cnt; + u32 reserved[2]; +}; + +struct mthca_arbel_srq_context { + __be32 state_logsize_srqn; + __be32 lkey; + __be32 db_index; + __be32 logstride_usrpage; + __be64 wqe_base; + __be32 eq_pd; + __be16 limit_watermark; + __be16 wqe_cnt; + u16 reserved1; + __be16 wqe_counter; + u32 reserved2[3]; +}; + +static void *get_wqe(struct mthca_srq *srq, int n) +{ + if (srq->is_direct) + return srq->queue.direct.buf + (n << srq->wqe_shift); + else + return srq->queue.page_list[(n << srq->wqe_shift) >> PAGE_SHIFT].buf + + ((n << srq->wqe_shift) & (PAGE_SIZE - 1)); +} + +/* + * Return a pointer to the location within a WQE that we're using as a + * link when the WQE is in the free list. We use the imm field + * because in the Tavor case, posting a WQE may overwrite the next + * segment of the previous WQE, but a receive WQE will never touch the + * imm field. This avoids corrupting our free list if the previous + * WQE has already completed and been put on the free list when we + * post the next WQE. 
+ */ +static inline int *wqe_to_link(void *wqe) +{ + return (int *) (wqe + offsetof(struct mthca_next_seg, imm)); +} + +static void mthca_tavor_init_srq_context(struct mthca_dev *dev, + struct mthca_pd *pd, + struct mthca_srq *srq, + struct mthca_tavor_srq_context *context) +{ + memset(context, 0, sizeof *context); + + context->wqe_base_ds = cpu_to_be64(1 << (srq->wqe_shift - 4)); + context->state_pd = cpu_to_be32(pd->pd_num); + context->lkey = cpu_to_be32(srq->mr.ibmr.lkey); + + if (pd->ibpd.uobject) + context->uar = + cpu_to_be32(to_mucontext(pd->ibpd.uobject->context)->uar.index); + else + context->uar = cpu_to_be32(dev->driver_uar.index); +} + +static void mthca_arbel_init_srq_context(struct mthca_dev *dev, + struct mthca_pd *pd, + struct mthca_srq *srq, + struct mthca_arbel_srq_context *context) +{ + int logsize, max; + + memset(context, 0, sizeof *context); + + /* + * Put max in a temporary variable to work around gcc bug + * triggered by ilog2() on sparc64. + */ + max = srq->max; + logsize = ilog2(max); + context->state_logsize_srqn = cpu_to_be32(logsize << 24 | srq->srqn); + context->lkey = cpu_to_be32(srq->mr.ibmr.lkey); + context->db_index = cpu_to_be32(srq->db_index); + context->logstride_usrpage = cpu_to_be32((srq->wqe_shift - 4) << 29); + if (pd->ibpd.uobject) + context->logstride_usrpage |= + cpu_to_be32(to_mucontext(pd->ibpd.uobject->context)->uar.index); + else + context->logstride_usrpage |= cpu_to_be32(dev->driver_uar.index); + context->eq_pd = cpu_to_be32(MTHCA_EQ_ASYNC << 24 | pd->pd_num); +} + +static void mthca_free_srq_buf(struct mthca_dev *dev, struct mthca_srq *srq) +{ + mthca_buf_free(dev, srq->max << srq->wqe_shift, &srq->queue, + srq->is_direct, &srq->mr); + kfree(srq->wrid); +} + +static int mthca_alloc_srq_buf(struct mthca_dev *dev, struct mthca_pd *pd, + struct mthca_srq *srq) +{ + struct mthca_data_seg *scatter; + void *wqe; + int err; + int i; + + if (pd->ibpd.uobject) + return 0; + + srq->wrid = kmalloc(srq->max * sizeof (u64), GFP_KERNEL); + if (!srq->wrid) + return -ENOMEM; + + err = mthca_buf_alloc(dev, srq->max << srq->wqe_shift, + MTHCA_MAX_DIRECT_SRQ_SIZE, + &srq->queue, &srq->is_direct, pd, 1, &srq->mr); + if (err) { + kfree(srq->wrid); + return err; + } + + /* + * Now initialize the SRQ buffer so that all of the WQEs are + * linked into the list of free WQEs. In addition, set the + * scatter list L_Keys to the sentry value of 0x100. 
+ */ + for (i = 0; i < srq->max; ++i) { + struct mthca_next_seg *next; + + next = wqe = get_wqe(srq, i); + + if (i < srq->max - 1) { + *wqe_to_link(wqe) = i + 1; + next->nda_op = htonl(((i + 1) << srq->wqe_shift) | 1); + } else { + *wqe_to_link(wqe) = -1; + next->nda_op = 0; + } + + for (scatter = wqe + sizeof (struct mthca_next_seg); + (void *) scatter < wqe + (1 << srq->wqe_shift); + ++scatter) + scatter->lkey = cpu_to_be32(MTHCA_INVAL_LKEY); + } + + srq->last = get_wqe(srq, srq->max - 1); + + return 0; +} + +int mthca_alloc_srq(struct mthca_dev *dev, struct mthca_pd *pd, + struct ib_srq_attr *attr, struct mthca_srq *srq) +{ + struct mthca_mailbox *mailbox; + int ds; + int err; + + /* Sanity check SRQ size before proceeding */ + if (attr->max_wr > dev->limits.max_srq_wqes || + attr->max_sge > dev->limits.max_srq_sge) + return -EINVAL; + + srq->max = attr->max_wr; + srq->max_gs = attr->max_sge; + srq->counter = 0; + + if (mthca_is_memfree(dev)) + srq->max = roundup_pow_of_two(srq->max + 1); + else + srq->max = srq->max + 1; + + ds = max(64UL, + roundup_pow_of_two(sizeof (struct mthca_next_seg) + + srq->max_gs * sizeof (struct mthca_data_seg))); + + if (!mthca_is_memfree(dev) && (ds > dev->limits.max_desc_sz)) + return -EINVAL; + + srq->wqe_shift = ilog2(ds); + + srq->srqn = mthca_alloc(&dev->srq_table.alloc); + if (srq->srqn == -1) + return -ENOMEM; + + if (mthca_is_memfree(dev)) { + err = mthca_table_get(dev, dev->srq_table.table, srq->srqn); + if (err) + goto err_out; + + if (!pd->ibpd.uobject) { + srq->db_index = mthca_alloc_db(dev, MTHCA_DB_TYPE_SRQ, + srq->srqn, &srq->db); + if (srq->db_index < 0) { + err = -ENOMEM; + goto err_out_icm; + } + } + } + + mailbox = mthca_alloc_mailbox(dev, GFP_KERNEL); + if (IS_ERR(mailbox)) { + err = PTR_ERR(mailbox); + goto err_out_db; + } + + err = mthca_alloc_srq_buf(dev, pd, srq); + if (err) + goto err_out_mailbox; + + spin_lock_init(&srq->lock); + srq->refcount = 1; + init_waitqueue_head(&srq->wait); + mutex_init(&srq->mutex); + + if (mthca_is_memfree(dev)) + mthca_arbel_init_srq_context(dev, pd, srq, mailbox->buf); + else + mthca_tavor_init_srq_context(dev, pd, srq, mailbox->buf); + + err = mthca_SW2HW_SRQ(dev, mailbox, srq->srqn); + + if (err) { + mthca_warn(dev, "SW2HW_SRQ failed (%d)\n", err); + goto err_out_free_buf; + } + + spin_lock_irq(&dev->srq_table.lock); + if (mthca_array_set(&dev->srq_table.srq, + srq->srqn & (dev->limits.num_srqs - 1), + srq)) { + spin_unlock_irq(&dev->srq_table.lock); + goto err_out_free_srq; + } + spin_unlock_irq(&dev->srq_table.lock); + + mthca_free_mailbox(dev, mailbox); + + srq->first_free = 0; + srq->last_free = srq->max - 1; + + attr->max_wr = srq->max - 1; + attr->max_sge = srq->max_gs; + + return 0; + +err_out_free_srq: + err = mthca_HW2SW_SRQ(dev, mailbox, srq->srqn); + if (err) + mthca_warn(dev, "HW2SW_SRQ failed (%d)\n", err); + +err_out_free_buf: + if (!pd->ibpd.uobject) + mthca_free_srq_buf(dev, srq); + +err_out_mailbox: + mthca_free_mailbox(dev, mailbox); + +err_out_db: + if (!pd->ibpd.uobject && mthca_is_memfree(dev)) + mthca_free_db(dev, MTHCA_DB_TYPE_SRQ, srq->db_index); + +err_out_icm: + mthca_table_put(dev, dev->srq_table.table, srq->srqn); + +err_out: + mthca_free(&dev->srq_table.alloc, srq->srqn); + + return err; +} + +static inline int get_srq_refcount(struct mthca_dev *dev, struct mthca_srq *srq) +{ + int c; + + spin_lock_irq(&dev->srq_table.lock); + c = srq->refcount; + spin_unlock_irq(&dev->srq_table.lock); + + return c; +} + +void mthca_free_srq(struct mthca_dev *dev, struct mthca_srq *srq) 
+{ + struct mthca_mailbox *mailbox; + int err; + + mailbox = mthca_alloc_mailbox(dev, GFP_KERNEL); + if (IS_ERR(mailbox)) { + mthca_warn(dev, "No memory for mailbox to free SRQ.\n"); + return; + } + + err = mthca_HW2SW_SRQ(dev, mailbox, srq->srqn); + if (err) + mthca_warn(dev, "HW2SW_SRQ failed (%d)\n", err); + + spin_lock_irq(&dev->srq_table.lock); + mthca_array_clear(&dev->srq_table.srq, + srq->srqn & (dev->limits.num_srqs - 1)); + --srq->refcount; + spin_unlock_irq(&dev->srq_table.lock); + + wait_event(srq->wait, !get_srq_refcount(dev, srq)); + + if (!srq->ibsrq.uobject) { + mthca_free_srq_buf(dev, srq); + if (mthca_is_memfree(dev)) + mthca_free_db(dev, MTHCA_DB_TYPE_SRQ, srq->db_index); + } + + mthca_table_put(dev, dev->srq_table.table, srq->srqn); + mthca_free(&dev->srq_table.alloc, srq->srqn); + mthca_free_mailbox(dev, mailbox); +} + +int mthca_modify_srq(struct ib_srq *ibsrq, struct ib_srq_attr *attr, + enum ib_srq_attr_mask attr_mask, struct ib_udata *udata) +{ + struct mthca_dev *dev = to_mdev(ibsrq->device); + struct mthca_srq *srq = to_msrq(ibsrq); + int ret = 0; + + /* We don't support resizing SRQs (yet?) */ + if (attr_mask & IB_SRQ_MAX_WR) + return -EINVAL; + + if (attr_mask & IB_SRQ_LIMIT) { + u32 max_wr = mthca_is_memfree(dev) ? srq->max - 1 : srq->max; + if (attr->srq_limit > max_wr) + return -EINVAL; + + mutex_lock(&srq->mutex); + ret = mthca_ARM_SRQ(dev, srq->srqn, attr->srq_limit); + mutex_unlock(&srq->mutex); + } + + return ret; +} + +int mthca_query_srq(struct ib_srq *ibsrq, struct ib_srq_attr *srq_attr) +{ + struct mthca_dev *dev = to_mdev(ibsrq->device); + struct mthca_srq *srq = to_msrq(ibsrq); + struct mthca_mailbox *mailbox; + struct mthca_arbel_srq_context *arbel_ctx; + struct mthca_tavor_srq_context *tavor_ctx; + int err; + + mailbox = mthca_alloc_mailbox(dev, GFP_KERNEL); + if (IS_ERR(mailbox)) + return PTR_ERR(mailbox); + + err = mthca_QUERY_SRQ(dev, srq->srqn, mailbox); + if (err) + goto out; + + if (mthca_is_memfree(dev)) { + arbel_ctx = mailbox->buf; + srq_attr->srq_limit = be16_to_cpu(arbel_ctx->limit_watermark); + } else { + tavor_ctx = mailbox->buf; + srq_attr->srq_limit = be16_to_cpu(tavor_ctx->limit_watermark); + } + + srq_attr->max_wr = srq->max - 1; + srq_attr->max_sge = srq->max_gs; + +out: + mthca_free_mailbox(dev, mailbox); + + return err; +} + +void mthca_srq_event(struct mthca_dev *dev, u32 srqn, + enum ib_event_type event_type) +{ + struct mthca_srq *srq; + struct ib_event event; + + spin_lock(&dev->srq_table.lock); + srq = mthca_array_get(&dev->srq_table.srq, srqn & (dev->limits.num_srqs - 1)); + if (srq) + ++srq->refcount; + spin_unlock(&dev->srq_table.lock); + + if (!srq) { + mthca_warn(dev, "Async event for bogus SRQ %08x\n", srqn); + return; + } + + if (!srq->ibsrq.event_handler) + goto out; + + event.device = &dev->ib_dev; + event.event = event_type; + event.element.srq = &srq->ibsrq; + srq->ibsrq.event_handler(&event, srq->ibsrq.srq_context); + +out: + spin_lock(&dev->srq_table.lock); + if (!--srq->refcount) + wake_up(&srq->wait); + spin_unlock(&dev->srq_table.lock); +} + +/* + * This function must be called with IRQs disabled. 
+ */ +void mthca_free_srq_wqe(struct mthca_srq *srq, u32 wqe_addr) +{ + int ind; + struct mthca_next_seg *last_free; + + ind = wqe_addr >> srq->wqe_shift; + + spin_lock(&srq->lock); + + last_free = get_wqe(srq, srq->last_free); + *wqe_to_link(last_free) = ind; + last_free->nda_op = htonl((ind << srq->wqe_shift) | 1); + *wqe_to_link(get_wqe(srq, ind)) = -1; + srq->last_free = ind; + + spin_unlock(&srq->lock); +} + +int mthca_tavor_post_srq_recv(struct ib_srq *ibsrq, struct ib_recv_wr *wr, + struct ib_recv_wr **bad_wr) +{ + struct mthca_dev *dev = to_mdev(ibsrq->device); + struct mthca_srq *srq = to_msrq(ibsrq); + unsigned long flags; + int err = 0; + int first_ind; + int ind; + int next_ind; + int nreq; + int i; + void *wqe; + void *prev_wqe; + + spin_lock_irqsave(&srq->lock, flags); + + first_ind = srq->first_free; + + for (nreq = 0; wr; wr = wr->next) { + ind = srq->first_free; + wqe = get_wqe(srq, ind); + next_ind = *wqe_to_link(wqe); + + if (unlikely(next_ind < 0)) { + mthca_err(dev, "SRQ %06x full\n", srq->srqn); + err = -ENOMEM; + *bad_wr = wr; + break; + } + + prev_wqe = srq->last; + srq->last = wqe; + + ((struct mthca_next_seg *) wqe)->ee_nds = 0; + /* flags field will always remain 0 */ + + wqe += sizeof (struct mthca_next_seg); + + if (unlikely(wr->num_sge > srq->max_gs)) { + err = -EINVAL; + *bad_wr = wr; + srq->last = prev_wqe; + break; + } + + for (i = 0; i < wr->num_sge; ++i) { + mthca_set_data_seg(wqe, wr->sg_list + i); + wqe += sizeof (struct mthca_data_seg); + } + + if (i < srq->max_gs) + mthca_set_data_seg_inval(wqe); + + ((struct mthca_next_seg *) prev_wqe)->ee_nds = + cpu_to_be32(MTHCA_NEXT_DBD); + + srq->wrid[ind] = wr->wr_id; + srq->first_free = next_ind; + + ++nreq; + if (unlikely(nreq == MTHCA_TAVOR_MAX_WQES_PER_RECV_DB)) { + nreq = 0; + + /* + * Make sure that descriptors are written + * before doorbell is rung. + */ + wmb(); + + mthca_write64(first_ind << srq->wqe_shift, srq->srqn << 8, + dev->kar + MTHCA_RECEIVE_DOORBELL, + MTHCA_GET_DOORBELL_LOCK(&dev->doorbell_lock)); + + first_ind = srq->first_free; + } + } + + if (likely(nreq)) { + /* + * Make sure that descriptors are written before + * doorbell is rung. 
+ */ + wmb(); + + mthca_write64(first_ind << srq->wqe_shift, (srq->srqn << 8) | nreq, + dev->kar + MTHCA_RECEIVE_DOORBELL, + MTHCA_GET_DOORBELL_LOCK(&dev->doorbell_lock)); + } + + /* + * Make sure doorbells don't leak out of SRQ spinlock and + * reach the HCA out of order: + */ + mmiowb(); + + spin_unlock_irqrestore(&srq->lock, flags); + return err; +} + +int mthca_arbel_post_srq_recv(struct ib_srq *ibsrq, struct ib_recv_wr *wr, + struct ib_recv_wr **bad_wr) +{ + struct mthca_dev *dev = to_mdev(ibsrq->device); + struct mthca_srq *srq = to_msrq(ibsrq); + unsigned long flags; + int err = 0; + int ind; + int next_ind; + int nreq; + int i; + void *wqe; + + spin_lock_irqsave(&srq->lock, flags); + + for (nreq = 0; wr; ++nreq, wr = wr->next) { + ind = srq->first_free; + wqe = get_wqe(srq, ind); + next_ind = *wqe_to_link(wqe); + + if (unlikely(next_ind < 0)) { + mthca_err(dev, "SRQ %06x full\n", srq->srqn); + err = -ENOMEM; + *bad_wr = wr; + break; + } + + ((struct mthca_next_seg *) wqe)->ee_nds = 0; + /* flags field will always remain 0 */ + + wqe += sizeof (struct mthca_next_seg); + + if (unlikely(wr->num_sge > srq->max_gs)) { + err = -EINVAL; + *bad_wr = wr; + break; + } + + for (i = 0; i < wr->num_sge; ++i) { + mthca_set_data_seg(wqe, wr->sg_list + i); + wqe += sizeof (struct mthca_data_seg); + } + + if (i < srq->max_gs) + mthca_set_data_seg_inval(wqe); + + srq->wrid[ind] = wr->wr_id; + srq->first_free = next_ind; + } + + if (likely(nreq)) { + srq->counter += nreq; + + /* + * Make sure that descriptors are written before + * we write doorbell record. + */ + wmb(); + *srq->db = cpu_to_be32(srq->counter); + } + + spin_unlock_irqrestore(&srq->lock, flags); + return err; +} + +int mthca_max_srq_sge(struct mthca_dev *dev) +{ + if (mthca_is_memfree(dev)) + return dev->limits.max_sg; + + /* + * SRQ allocations are based on powers of 2 for Tavor, + * (although they only need to be multiples of 16 bytes). + * + * Therefore, we need to base the max number of sg entries on + * the largest power of 2 descriptor size that is <= to the + * actual max WQE descriptor size, rather than return the + * max_sg value given by the firmware (which is based on WQE + * sizes as multiples of 16, not powers of 2). + * + * If SRQ implementation is changed for Tavor to be based on + * multiples of 16, the calculation below can be deleted and + * the FW max_sg value returned. + */ + return min_t(int, dev->limits.max_sg, + ((1 << (fls(dev->limits.max_desc_sz) - 1)) - + sizeof (struct mthca_next_seg)) / + sizeof (struct mthca_data_seg)); +} + +int mthca_init_srq_table(struct mthca_dev *dev) +{ + int err; + + if (!(dev->mthca_flags & MTHCA_FLAG_SRQ)) + return 0; + + spin_lock_init(&dev->srq_table.lock); + + err = mthca_alloc_init(&dev->srq_table.alloc, + dev->limits.num_srqs, + dev->limits.num_srqs - 1, + dev->limits.reserved_srqs); + if (err) + return err; + + err = mthca_array_init(&dev->srq_table.srq, + dev->limits.num_srqs); + if (err) + mthca_alloc_cleanup(&dev->srq_table.alloc); + + return err; +} + +void mthca_cleanup_srq_table(struct mthca_dev *dev) +{ + if (!(dev->mthca_flags & MTHCA_FLAG_SRQ)) + return; + + mthca_array_cleanup(&dev->srq_table.srq, dev->limits.num_srqs); + mthca_alloc_cleanup(&dev->srq_table.alloc); +} diff --git a/kernel/drivers/infiniband/hw/mthca/mthca_uar.c b/kernel/drivers/infiniband/hw/mthca/mthca_uar.c new file mode 100644 index 000000000..ca5900c96 --- /dev/null +++ b/kernel/drivers/infiniband/hw/mthca/mthca_uar.c @@ -0,0 +1,78 @@ +/* + * Copyright (c) 2005 Topspin Communications. 
All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include /* PAGE_SHIFT */ + +#include "mthca_dev.h" +#include "mthca_memfree.h" + +int mthca_uar_alloc(struct mthca_dev *dev, struct mthca_uar *uar) +{ + uar->index = mthca_alloc(&dev->uar_table.alloc); + if (uar->index == -1) + return -ENOMEM; + + uar->pfn = (pci_resource_start(dev->pdev, 2) >> PAGE_SHIFT) + uar->index; + + return 0; +} + +void mthca_uar_free(struct mthca_dev *dev, struct mthca_uar *uar) +{ + mthca_free(&dev->uar_table.alloc, uar->index); +} + +int mthca_init_uar_table(struct mthca_dev *dev) +{ + int ret; + + ret = mthca_alloc_init(&dev->uar_table.alloc, + dev->limits.num_uars, + dev->limits.num_uars - 1, + dev->limits.reserved_uars + 1); + if (ret) + return ret; + + ret = mthca_init_db_tab(dev); + if (ret) + mthca_alloc_cleanup(&dev->uar_table.alloc); + + return ret; +} + +void mthca_cleanup_uar_table(struct mthca_dev *dev) +{ + mthca_cleanup_db_tab(dev); + + /* XXX check if any UARs are still allocated? */ + mthca_alloc_cleanup(&dev->uar_table.alloc); +} diff --git a/kernel/drivers/infiniband/hw/mthca/mthca_user.h b/kernel/drivers/infiniband/hw/mthca/mthca_user.h new file mode 100644 index 000000000..5fe56e810 --- /dev/null +++ b/kernel/drivers/infiniband/hw/mthca/mthca_user.h @@ -0,0 +1,112 @@ +/* + * Copyright (c) 2005 Topspin Communications. All rights reserved. + * Copyright (c) 2005, 2006 Cisco Systems. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. 
+ * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef MTHCA_USER_H +#define MTHCA_USER_H + +#include + +/* + * Increment this value if any changes that break userspace ABI + * compatibility are made. + */ +#define MTHCA_UVERBS_ABI_VERSION 1 + +/* + * Make sure that all structs defined in this file remain laid out so + * that they pack the same way on 32-bit and 64-bit architectures (to + * avoid incompatibility between 32-bit userspace and 64-bit kernels). + * In particular do not use pointer types -- pass pointers in __u64 + * instead. + */ + +struct mthca_alloc_ucontext_resp { + __u32 qp_tab_size; + __u32 uarc_size; +}; + +struct mthca_alloc_pd_resp { + __u32 pdn; + __u32 reserved; +}; + +struct mthca_reg_mr { +/* + * Mark the memory region with a DMA attribute that causes + * in-flight DMA to be flushed when the region is written to: + */ +#define MTHCA_MR_DMASYNC 0x1 + __u32 mr_attrs; + __u32 reserved; +}; + +struct mthca_create_cq { + __u32 lkey; + __u32 pdn; + __u64 arm_db_page; + __u64 set_db_page; + __u32 arm_db_index; + __u32 set_db_index; +}; + +struct mthca_create_cq_resp { + __u32 cqn; + __u32 reserved; +}; + +struct mthca_resize_cq { + __u32 lkey; + __u32 reserved; +}; + +struct mthca_create_srq { + __u32 lkey; + __u32 db_index; + __u64 db_page; +}; + +struct mthca_create_srq_resp { + __u32 srqn; + __u32 reserved; +}; + +struct mthca_create_qp { + __u32 lkey; + __u32 reserved; + __u64 sq_db_page; + __u64 rq_db_page; + __u32 sq_db_index; + __u32 rq_db_index; +}; + +#endif /* MTHCA_USER_H */ diff --git a/kernel/drivers/infiniband/hw/mthca/mthca_wqe.h b/kernel/drivers/infiniband/hw/mthca/mthca_wqe.h new file mode 100644 index 000000000..341a5ae88 --- /dev/null +++ b/kernel/drivers/infiniband/hw/mthca/mthca_wqe.h @@ -0,0 +1,131 @@ +/* + * Copyright (c) 2005 Cisco Systems. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef MTHCA_WQE_H +#define MTHCA_WQE_H + +#include + +enum { + MTHCA_NEXT_DBD = 1 << 7, + MTHCA_NEXT_FENCE = 1 << 6, + MTHCA_NEXT_CQ_UPDATE = 1 << 3, + MTHCA_NEXT_EVENT_GEN = 1 << 2, + MTHCA_NEXT_SOLICIT = 1 << 1, + MTHCA_NEXT_IP_CSUM = 1 << 4, + MTHCA_NEXT_TCP_UDP_CSUM = 1 << 5, + + MTHCA_MLX_VL15 = 1 << 17, + MTHCA_MLX_SLR = 1 << 16 +}; + +enum { + MTHCA_INVAL_LKEY = 0x100, + MTHCA_TAVOR_MAX_WQES_PER_RECV_DB = 256, + MTHCA_ARBEL_MAX_WQES_PER_SEND_DB = 255 +}; + +struct mthca_next_seg { + __be32 nda_op; /* [31:6] next WQE [4:0] next opcode */ + __be32 ee_nds; /* [31:8] next EE [7] DBD [6] F [5:0] next WQE size */ + __be32 flags; /* [3] CQ [2] Event [1] Solicit */ + __be32 imm; /* immediate data */ +}; + +struct mthca_tavor_ud_seg { + u32 reserved1; + __be32 lkey; + __be64 av_addr; + u32 reserved2[4]; + __be32 dqpn; + __be32 qkey; + u32 reserved3[2]; +}; + +struct mthca_arbel_ud_seg { + __be32 av[8]; + __be32 dqpn; + __be32 qkey; + u32 reserved[2]; +}; + +struct mthca_bind_seg { + __be32 flags; /* [31] Atomic [30] rem write [29] rem read */ + u32 reserved; + __be32 new_rkey; + __be32 lkey; + __be64 addr; + __be64 length; +}; + +struct mthca_raddr_seg { + __be64 raddr; + __be32 rkey; + u32 reserved; +}; + +struct mthca_atomic_seg { + __be64 swap_add; + __be64 compare; +}; + +struct mthca_data_seg { + __be32 byte_count; + __be32 lkey; + __be64 addr; +}; + +struct mthca_mlx_seg { + __be32 nda_op; + __be32 nds; + __be32 flags; /* [17] VL15 [16] SLR [14:12] static rate + [11:8] SL [3] C [2] E */ + __be16 rlid; + __be16 vcrc; +}; + +static __always_inline void mthca_set_data_seg(struct mthca_data_seg *dseg, + struct ib_sge *sg) +{ + dseg->byte_count = cpu_to_be32(sg->length); + dseg->lkey = cpu_to_be32(sg->lkey); + dseg->addr = cpu_to_be64(sg->addr); +} + +static __always_inline void mthca_set_data_seg_inval(struct mthca_data_seg *dseg) +{ + dseg->byte_count = 0; + dseg->lkey = cpu_to_be32(MTHCA_INVAL_LKEY); + dseg->addr = 0; +} + +#endif /* MTHCA_WQE_H */ diff --git a/kernel/drivers/infiniband/hw/nes/Kconfig b/kernel/drivers/infiniband/hw/nes/Kconfig new file mode 100644 index 000000000..846dc97cf --- /dev/null +++ b/kernel/drivers/infiniband/hw/nes/Kconfig @@ -0,0 +1,16 @@ +config INFINIBAND_NES + tristate "NetEffect RNIC Driver" + depends on PCI && INET && INFINIBAND + select LIBCRC32C + select INET_LRO + ---help--- + This is the RDMA Network Interface Card (RNIC) driver for + NetEffect Ethernet Cluster Server Adapters. + +config INFINIBAND_NES_DEBUG + bool "Verbose debugging output" + depends on INFINIBAND_NES + default n + ---help--- + This option enables debug messages from the NetEffect RNIC + driver. Select this if you are diagnosing a problem. 
diff --git a/kernel/drivers/infiniband/hw/nes/Makefile b/kernel/drivers/infiniband/hw/nes/Makefile new file mode 100644 index 000000000..97820c23e --- /dev/null +++ b/kernel/drivers/infiniband/hw/nes/Makefile @@ -0,0 +1,3 @@ +obj-$(CONFIG_INFINIBAND_NES) += iw_nes.o + +iw_nes-objs := nes.o nes_hw.o nes_nic.o nes_utils.o nes_verbs.o nes_cm.o nes_mgt.o diff --git a/kernel/drivers/infiniband/hw/nes/nes.c b/kernel/drivers/infiniband/hw/nes/nes.c new file mode 100644 index 000000000..9f9d5c563 --- /dev/null +++ b/kernel/drivers/infiniband/hw/nes/nes.c @@ -0,0 +1,1270 @@ +/* + * Copyright (c) 2006 - 2011 Intel Corporation. All rights reserved. + * Copyright (c) 2005 Open Grid Computing, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "nes.h" + +#include +#include +#include +#include + +MODULE_AUTHOR("NetEffect"); +MODULE_DESCRIPTION("NetEffect RNIC Low-level iWARP Driver"); +MODULE_LICENSE("Dual BSD/GPL"); +MODULE_VERSION(DRV_VERSION); + +int max_mtu = 9000; +int interrupt_mod_interval = 0; + +/* Interoperability */ +int mpa_version = 1; +module_param(mpa_version, int, 0644); +MODULE_PARM_DESC(mpa_version, "MPA version to be used int MPA Req/Resp (0 or 1)"); + +/* Interoperability */ +int disable_mpa_crc = 0; +module_param(disable_mpa_crc, int, 0644); +MODULE_PARM_DESC(disable_mpa_crc, "Disable checking of MPA CRC"); + +unsigned int nes_drv_opt = NES_DRV_OPT_DISABLE_INT_MOD | NES_DRV_OPT_ENABLE_PAU; +module_param(nes_drv_opt, int, 0644); +MODULE_PARM_DESC(nes_drv_opt, "Driver option parameters"); + +unsigned int nes_debug_level = 0; +module_param_named(debug_level, nes_debug_level, uint, 0644); +MODULE_PARM_DESC(debug_level, "Enable debug output level"); + +unsigned int wqm_quanta = 0x10000; +module_param(wqm_quanta, int, 0644); +MODULE_PARM_DESC(wqm_quanta, "WQM quanta"); + +static bool limit_maxrdreqsz; +module_param(limit_maxrdreqsz, bool, 0644); +MODULE_PARM_DESC(limit_maxrdreqsz, "Limit max read request size to 256 Bytes"); + +LIST_HEAD(nes_adapter_list); +static LIST_HEAD(nes_dev_list); + +atomic_t qps_destroyed; + +static unsigned int ee_flsh_adapter; +static unsigned int sysfs_nonidx_addr; +static unsigned int sysfs_idx_addr; + +static struct pci_device_id nes_pci_table[] = { + { PCI_VDEVICE(NETEFFECT, PCI_DEVICE_ID_NETEFFECT_NE020), }, + { PCI_VDEVICE(NETEFFECT, PCI_DEVICE_ID_NETEFFECT_NE020_KR), }, + {0} +}; + +MODULE_DEVICE_TABLE(pci, nes_pci_table); + +/* registered nes netlink callbacks */ +static struct ibnl_client_cbs nes_nl_cb_table[] = { + [RDMA_NL_IWPM_REG_PID] = {.dump = iwpm_register_pid_cb}, + [RDMA_NL_IWPM_ADD_MAPPING] = {.dump = iwpm_add_mapping_cb}, + [RDMA_NL_IWPM_QUERY_MAPPING] = {.dump = iwpm_add_and_query_mapping_cb}, + [RDMA_NL_IWPM_REMOTE_INFO] = {.dump = iwpm_remote_info_cb}, + [RDMA_NL_IWPM_HANDLE_ERR] = {.dump = iwpm_mapping_error_cb}, + [RDMA_NL_IWPM_MAPINFO] = {.dump = iwpm_mapping_info_cb}, + [RDMA_NL_IWPM_MAPINFO_NUM] = {.dump = iwpm_ack_mapping_info_cb} +}; + +static int nes_inetaddr_event(struct notifier_block *, unsigned long, void *); +static int nes_net_event(struct notifier_block *, unsigned long, void *); +static int nes_notifiers_registered; + + +static struct notifier_block nes_inetaddr_notifier = { + .notifier_call = nes_inetaddr_event +}; + +static struct notifier_block nes_net_notifier = { + .notifier_call = nes_net_event +}; + +/** + * nes_inetaddr_event + */ +static int nes_inetaddr_event(struct notifier_block *notifier, + unsigned long event, void *ptr) +{ + struct in_ifaddr *ifa = ptr; + struct net_device *event_netdev = ifa->ifa_dev->dev; + struct nes_device *nesdev; + struct net_device *netdev; + struct net_device *upper_dev; + struct nes_vnic *nesvnic; + unsigned int is_bonded; + + nes_debug(NES_DBG_NETDEV, "nes_inetaddr_event: ip address %pI4, netmask %pI4.\n", + &ifa->ifa_address, &ifa->ifa_mask); + list_for_each_entry(nesdev, &nes_dev_list, list) { + nes_debug(NES_DBG_NETDEV, "Nesdev list entry = 0x%p. 
(%s)\n", + nesdev, nesdev->netdev[0]->name); + netdev = nesdev->netdev[0]; + nesvnic = netdev_priv(netdev); + upper_dev = netdev_master_upper_dev_get(netdev); + is_bonded = netif_is_bond_slave(netdev) && + (upper_dev == event_netdev); + if ((netdev == event_netdev) || is_bonded) { + if (nesvnic->rdma_enabled == 0) { + nes_debug(NES_DBG_NETDEV, "Returning without processing event for %s since" + " RDMA is not enabled.\n", + netdev->name); + return NOTIFY_OK; + } + /* we have ifa->ifa_address/mask here if we need it */ + switch (event) { + case NETDEV_DOWN: + nes_debug(NES_DBG_NETDEV, "event:DOWN\n"); + nes_write_indexed(nesdev, + NES_IDX_DST_IP_ADDR+(0x10*PCI_FUNC(nesdev->pcidev->devfn)), 0); + + nes_manage_arp_cache(netdev, netdev->dev_addr, + ntohl(nesvnic->local_ipaddr), NES_ARP_DELETE); + nesvnic->local_ipaddr = 0; + if (is_bonded) + continue; + else + return NOTIFY_OK; + break; + case NETDEV_UP: + nes_debug(NES_DBG_NETDEV, "event:UP\n"); + + if (nesvnic->local_ipaddr != 0) { + nes_debug(NES_DBG_NETDEV, "Interface already has local_ipaddr\n"); + return NOTIFY_OK; + } + /* fall through */ + case NETDEV_CHANGEADDR: + /* Add the address to the IP table */ + if (upper_dev) + nesvnic->local_ipaddr = + ((struct in_device *)upper_dev->ip_ptr)->ifa_list->ifa_address; + else + nesvnic->local_ipaddr = ifa->ifa_address; + + nes_write_indexed(nesdev, + NES_IDX_DST_IP_ADDR+(0x10*PCI_FUNC(nesdev->pcidev->devfn)), + ntohl(nesvnic->local_ipaddr)); + nes_manage_arp_cache(netdev, netdev->dev_addr, + ntohl(nesvnic->local_ipaddr), NES_ARP_ADD); + if (is_bonded) + continue; + else + return NOTIFY_OK; + break; + default: + break; + } + } + } + + return NOTIFY_DONE; +} + + +/** + * nes_net_event + */ +static int nes_net_event(struct notifier_block *notifier, + unsigned long event, void *ptr) +{ + struct neighbour *neigh = ptr; + struct nes_device *nesdev; + struct net_device *netdev; + struct nes_vnic *nesvnic; + + switch (event) { + case NETEVENT_NEIGH_UPDATE: + list_for_each_entry(nesdev, &nes_dev_list, list) { + /* nes_debug(NES_DBG_NETDEV, "Nesdev list entry = 0x%p.\n", nesdev); */ + netdev = nesdev->netdev[0]; + nesvnic = netdev_priv(netdev); + if (netdev == neigh->dev) { + if (nesvnic->rdma_enabled == 0) { + nes_debug(NES_DBG_NETDEV, "Skipping device %s since no RDMA\n", + netdev->name); + } else { + if (neigh->nud_state & NUD_VALID) { + nes_manage_arp_cache(neigh->dev, neigh->ha, + ntohl(*(__be32 *)neigh->primary_key), NES_ARP_ADD); + } else { + nes_manage_arp_cache(neigh->dev, neigh->ha, + ntohl(*(__be32 *)neigh->primary_key), NES_ARP_DELETE); + } + } + return NOTIFY_OK; + } + } + break; + default: + nes_debug(NES_DBG_NETDEV, "NETEVENT_ %lu undefined\n", event); + break; + } + + return NOTIFY_DONE; +} + + +/** + * nes_add_ref + */ +void nes_add_ref(struct ib_qp *ibqp) +{ + struct nes_qp *nesqp; + + nesqp = to_nesqp(ibqp); + nes_debug(NES_DBG_QP, "Bumping refcount for QP%u. 
Pre-inc value = %u\n", + ibqp->qp_num, atomic_read(&nesqp->refcount)); + atomic_inc(&nesqp->refcount); +} + +static void nes_cqp_rem_ref_callback(struct nes_device *nesdev, struct nes_cqp_request *cqp_request) +{ + unsigned long flags; + struct nes_qp *nesqp = cqp_request->cqp_callback_pointer; + struct nes_adapter *nesadapter = nesdev->nesadapter; + + atomic_inc(&qps_destroyed); + + /* Free the control structures */ + + if (nesqp->pbl_vbase) { + pci_free_consistent(nesdev->pcidev, nesqp->qp_mem_size, + nesqp->hwqp.q2_vbase, nesqp->hwqp.q2_pbase); + spin_lock_irqsave(&nesadapter->pbl_lock, flags); + nesadapter->free_256pbl++; + spin_unlock_irqrestore(&nesadapter->pbl_lock, flags); + pci_free_consistent(nesdev->pcidev, 256, nesqp->pbl_vbase, nesqp->pbl_pbase); + nesqp->pbl_vbase = NULL; + + } else { + pci_free_consistent(nesdev->pcidev, nesqp->qp_mem_size, + nesqp->hwqp.sq_vbase, nesqp->hwqp.sq_pbase); + } + nes_free_resource(nesadapter, nesadapter->allocated_qps, nesqp->hwqp.qp_id); + + nesadapter->qp_table[nesqp->hwqp.qp_id-NES_FIRST_QPN] = NULL; + kfree(nesqp->allocated_buffer); + +} + +/** + * nes_rem_ref + */ +void nes_rem_ref(struct ib_qp *ibqp) +{ + u64 u64temp; + struct nes_qp *nesqp; + struct nes_vnic *nesvnic = to_nesvnic(ibqp->device); + struct nes_device *nesdev = nesvnic->nesdev; + struct nes_hw_cqp_wqe *cqp_wqe; + struct nes_cqp_request *cqp_request; + u32 opcode; + + nesqp = to_nesqp(ibqp); + + if (atomic_read(&nesqp->refcount) == 0) { + printk(KERN_INFO PFX "%s: Reference count already 0 for QP%d, last aeq = 0x%04X.\n", + __func__, ibqp->qp_num, nesqp->last_aeq); + BUG(); + } + + if (atomic_dec_and_test(&nesqp->refcount)) { + if (nesqp->pau_mode) + nes_destroy_pau_qp(nesdev, nesqp); + + /* Destroy the QP */ + cqp_request = nes_get_cqp_request(nesdev); + if (cqp_request == NULL) { + nes_debug(NES_DBG_QP, "Failed to get a cqp_request.\n"); + return; + } + cqp_request->waiting = 0; + cqp_request->callback = 1; + cqp_request->cqp_callback = nes_cqp_rem_ref_callback; + cqp_request->cqp_callback_pointer = nesqp; + cqp_wqe = &cqp_request->cqp_wqe; + + nes_fill_init_cqp_wqe(cqp_wqe, nesdev); + opcode = NES_CQP_DESTROY_QP | NES_CQP_QP_TYPE_IWARP; + + if (nesqp->hte_added) { + opcode |= NES_CQP_QP_DEL_HTE; + nesqp->hte_added = 0; + } + set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_WQE_OPCODE_IDX, opcode); + set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_WQE_ID_IDX, nesqp->hwqp.qp_id); + u64temp = (u64)nesqp->nesqp_context_pbase; + set_wqe_64bit_value(cqp_wqe->wqe_words, NES_CQP_QP_WQE_CONTEXT_LOW_IDX, u64temp); + nes_post_cqp_request(nesdev, cqp_request); + } +} + + +/** + * nes_get_qp + */ +struct ib_qp *nes_get_qp(struct ib_device *device, int qpn) +{ + struct nes_vnic *nesvnic = to_nesvnic(device); + struct nes_device *nesdev = nesvnic->nesdev; + struct nes_adapter *nesadapter = nesdev->nesadapter; + + if ((qpn < NES_FIRST_QPN) || (qpn >= (NES_FIRST_QPN + nesadapter->max_qp))) + return NULL; + + return &nesadapter->qp_table[qpn - NES_FIRST_QPN]->ibqp; +} + + +/** + * nes_print_macaddr + */ +static void nes_print_macaddr(struct net_device *netdev) +{ + nes_debug(NES_DBG_INIT, "%s: %pM, IRQ %u\n", + netdev->name, netdev->dev_addr, netdev->irq); +} + +/** + * nes_interrupt - handle interrupts + */ +static irqreturn_t nes_interrupt(int irq, void *dev_id) +{ + struct nes_device *nesdev = (struct nes_device *)dev_id; + int handled = 0; + u32 int_mask; + u32 int_req; + u32 int_stat; + u32 intf_int_stat; + u32 timer_stat; + + if (nesdev->msi_enabled) { + /* No need to read the interrupt 
pending register if msi is enabled */ + handled = 1; + } else { + if (unlikely(nesdev->nesadapter->hw_rev == NE020_REV)) { + /* Master interrupt enable provides synchronization for kicking off bottom half + when interrupt sharing is going on */ + int_mask = nes_read32(nesdev->regs + NES_INT_MASK); + if (int_mask & 0x80000000) { + /* Check interrupt status to see if this might be ours */ + int_stat = nes_read32(nesdev->regs + NES_INT_STAT); + int_req = nesdev->int_req; + if (int_stat&int_req) { + /* if interesting CEQ or AEQ is pending, claim the interrupt */ + if ((int_stat&int_req) & (~(NES_INT_TIMER|NES_INT_INTF))) { + handled = 1; + } else { + if (((int_stat & int_req) & NES_INT_TIMER) == NES_INT_TIMER) { + /* Timer might be running but might be for another function */ + timer_stat = nes_read32(nesdev->regs + NES_TIMER_STAT); + if ((timer_stat & nesdev->timer_int_req) != 0) { + handled = 1; + } + } + if ((((int_stat & int_req) & NES_INT_INTF) == NES_INT_INTF) && + (handled == 0)) { + intf_int_stat = nes_read32(nesdev->regs+NES_INTF_INT_STAT); + if ((intf_int_stat & nesdev->intf_int_req) != 0) { + handled = 1; + } + } + } + if (handled) { + nes_write32(nesdev->regs+NES_INT_MASK, int_mask & (~0x80000000)); + int_mask = nes_read32(nesdev->regs+NES_INT_MASK); + /* Save off the status to save an additional read */ + nesdev->int_stat = int_stat; + nesdev->napi_isr_ran = 1; + } + } + } + } else { + handled = nes_read32(nesdev->regs+NES_INT_PENDING); + } + } + + if (handled) { + + if (nes_napi_isr(nesdev) == 0) { + tasklet_schedule(&nesdev->dpc_tasklet); + + } + return IRQ_HANDLED; + } else { + return IRQ_NONE; + } +} + + +/** + * nes_probe - Device initialization + */ +static int nes_probe(struct pci_dev *pcidev, const struct pci_device_id *ent) +{ + struct net_device *netdev = NULL; + struct nes_device *nesdev = NULL; + int ret = 0; + void __iomem *mmio_regs = NULL; + u8 hw_rev; + + assert(pcidev != NULL); + assert(ent != NULL); + + printk(KERN_INFO PFX "NetEffect RNIC driver v%s loading. (%s)\n", + DRV_VERSION, pci_name(pcidev)); + + ret = pci_enable_device(pcidev); + if (ret) { + printk(KERN_ERR PFX "Unable to enable PCI device. (%s)\n", pci_name(pcidev)); + goto bail0; + } + + nes_debug(NES_DBG_INIT, "BAR0 (@0x%08lX) size = 0x%lX bytes\n", + (long unsigned int)pci_resource_start(pcidev, BAR_0), + (long unsigned int)pci_resource_len(pcidev, BAR_0)); + nes_debug(NES_DBG_INIT, "BAR1 (@0x%08lX) size = 0x%lX bytes\n", + (long unsigned int)pci_resource_start(pcidev, BAR_1), + (long unsigned int)pci_resource_len(pcidev, BAR_1)); + + /* Make sure PCI base addr are MMIO */ + if (!(pci_resource_flags(pcidev, BAR_0) & IORESOURCE_MEM) || + !(pci_resource_flags(pcidev, BAR_1) & IORESOURCE_MEM)) { + printk(KERN_ERR PFX "PCI regions not an MMIO resource\n"); + ret = -ENODEV; + goto bail1; + } + + /* Reserve PCI I/O and memory resources */ + ret = pci_request_regions(pcidev, DRV_NAME); + if (ret) { + printk(KERN_ERR PFX "Unable to request regions. 
(%s)\n", pci_name(pcidev)); + goto bail1; + } + + if ((sizeof(dma_addr_t) > 4)) { + ret = pci_set_dma_mask(pcidev, DMA_BIT_MASK(64)); + if (ret < 0) { + printk(KERN_ERR PFX "64b DMA mask configuration failed\n"); + goto bail2; + } + ret = pci_set_consistent_dma_mask(pcidev, DMA_BIT_MASK(64)); + if (ret) { + printk(KERN_ERR PFX "64b DMA consistent mask configuration failed\n"); + goto bail2; + } + } else { + ret = pci_set_dma_mask(pcidev, DMA_BIT_MASK(32)); + if (ret < 0) { + printk(KERN_ERR PFX "32b DMA mask configuration failed\n"); + goto bail2; + } + ret = pci_set_consistent_dma_mask(pcidev, DMA_BIT_MASK(32)); + if (ret) { + printk(KERN_ERR PFX "32b DMA consistent mask configuration failed\n"); + goto bail2; + } + } + + pci_set_master(pcidev); + + /* Allocate hardware structure */ + nesdev = kzalloc(sizeof(struct nes_device), GFP_KERNEL); + if (!nesdev) { + printk(KERN_ERR PFX "%s: Unable to alloc hardware struct\n", pci_name(pcidev)); + ret = -ENOMEM; + goto bail2; + } + + nes_debug(NES_DBG_INIT, "Allocated nes device at %p\n", nesdev); + nesdev->pcidev = pcidev; + pci_set_drvdata(pcidev, nesdev); + + pci_read_config_byte(pcidev, 0x0008, &hw_rev); + nes_debug(NES_DBG_INIT, "hw_rev=%u\n", hw_rev); + + spin_lock_init(&nesdev->indexed_regs_lock); + + /* Remap the PCI registers in adapter BAR0 to kernel VA space */ + mmio_regs = ioremap_nocache(pci_resource_start(pcidev, BAR_0), + pci_resource_len(pcidev, BAR_0)); + if (mmio_regs == NULL) { + printk(KERN_ERR PFX "Unable to remap BAR0\n"); + ret = -EIO; + goto bail3; + } + nesdev->regs = mmio_regs; + nesdev->index_reg = 0x50 + (PCI_FUNC(pcidev->devfn)*8) + mmio_regs; + + /* Ensure interrupts are disabled */ + nes_write32(nesdev->regs+NES_INT_MASK, 0x7fffffff); + + if (nes_drv_opt & NES_DRV_OPT_ENABLE_MSI) { + if (!pci_enable_msi(nesdev->pcidev)) { + nesdev->msi_enabled = 1; + nes_debug(NES_DBG_INIT, "MSI is enabled for device %s\n", + pci_name(pcidev)); + } else { + nes_debug(NES_DBG_INIT, "MSI is disabled by linux for device %s\n", + pci_name(pcidev)); + } + } else { + nes_debug(NES_DBG_INIT, "MSI not requested due to driver options for device %s\n", + pci_name(pcidev)); + } + + nesdev->csr_start = pci_resource_start(nesdev->pcidev, BAR_0); + nesdev->doorbell_region = pci_resource_start(nesdev->pcidev, BAR_1); + + /* Init the adapter */ + nesdev->nesadapter = nes_init_adapter(nesdev, hw_rev); + if (!nesdev->nesadapter) { + printk(KERN_ERR PFX "Unable to initialize adapter.\n"); + ret = -ENOMEM; + goto bail5; + } + nesdev->nesadapter->et_rx_coalesce_usecs_irq = interrupt_mod_interval; + nesdev->nesadapter->wqm_quanta = wqm_quanta; + + /* nesdev->base_doorbell_index = + nesdev->nesadapter->pd_config_base[PCI_FUNC(nesdev->pcidev->devfn)]; */ + nesdev->base_doorbell_index = 1; + nesdev->doorbell_start = nesdev->nesadapter->doorbell_start; + if (nesdev->nesadapter->phy_type[0] == NES_PHY_TYPE_PUMA_1G) { + switch (PCI_FUNC(nesdev->pcidev->devfn) % + nesdev->nesadapter->port_count) { + case 1: + nesdev->mac_index = 2; + break; + case 2: + nesdev->mac_index = 1; + break; + case 3: + nesdev->mac_index = 3; + break; + case 0: + default: + nesdev->mac_index = 0; + } + } else { + nesdev->mac_index = PCI_FUNC(nesdev->pcidev->devfn) % + nesdev->nesadapter->port_count; + } + + if ((limit_maxrdreqsz || + ((nesdev->nesadapter->phy_type[0] == NES_PHY_TYPE_GLADIUS) && + (hw_rev == NE020_REV1))) && + (pcie_get_readrq(pcidev) > 256)) { + if (pcie_set_readrq(pcidev, 256)) + printk(KERN_ERR PFX "Unable to set max read request" + " to 256 bytes\n"); + else + 
nes_debug(NES_DBG_INIT, "Max read request size set" + " to 256 bytes\n"); + } + + tasklet_init(&nesdev->dpc_tasklet, nes_dpc, (unsigned long)nesdev); + + /* bring up the Control QP */ + if (nes_init_cqp(nesdev)) { + ret = -ENODEV; + goto bail6; + } + + /* Arm the CCQ */ + nes_write32(nesdev->regs+NES_CQE_ALLOC, NES_CQE_ALLOC_NOTIFY_NEXT | + PCI_FUNC(nesdev->pcidev->devfn)); + nes_read32(nesdev->regs+NES_CQE_ALLOC); + + /* Enable the interrupts */ + nesdev->int_req = (0x101 << PCI_FUNC(nesdev->pcidev->devfn)) | + (1 << (PCI_FUNC(nesdev->pcidev->devfn)+16)); + if (PCI_FUNC(nesdev->pcidev->devfn) < 4) { + nesdev->int_req |= (1 << (PCI_FUNC(nesdev->mac_index)+24)); + } + + /* TODO: This really should be the first driver to load, not function 0 */ + if (PCI_FUNC(nesdev->pcidev->devfn) == 0) { + /* pick up PCI and critical errors if the first driver to load */ + nesdev->intf_int_req = NES_INTF_INT_PCIERR | NES_INTF_INT_CRITERR; + nesdev->int_req |= NES_INT_INTF; + } else { + nesdev->intf_int_req = 0; + } + nesdev->intf_int_req |= (1 << (PCI_FUNC(nesdev->pcidev->devfn)+16)); + nes_write_indexed(nesdev, NES_IDX_DEBUG_ERROR_MASKS0, 0); + nes_write_indexed(nesdev, NES_IDX_DEBUG_ERROR_MASKS1, 0); + nes_write_indexed(nesdev, NES_IDX_DEBUG_ERROR_MASKS2, 0x00001265); + nes_write_indexed(nesdev, NES_IDX_DEBUG_ERROR_MASKS4, 0x18021804); + + nes_write_indexed(nesdev, NES_IDX_DEBUG_ERROR_MASKS3, 0x17801790); + + /* deal with both periodic and one_shot */ + nesdev->timer_int_req = 0x101 << PCI_FUNC(nesdev->pcidev->devfn); + nesdev->nesadapter->timer_int_req |= nesdev->timer_int_req; + nes_debug(NES_DBG_INIT, "setting int_req for function %u, nesdev = 0x%04X, adapter = 0x%04X\n", + PCI_FUNC(nesdev->pcidev->devfn), + nesdev->timer_int_req, nesdev->nesadapter->timer_int_req); + + nes_write32(nesdev->regs+NES_INTF_INT_MASK, ~(nesdev->intf_int_req)); + + list_add_tail(&nesdev->list, &nes_dev_list); + + /* Request an interrupt line for the driver */ + ret = request_irq(pcidev->irq, nes_interrupt, IRQF_SHARED, DRV_NAME, nesdev); + if (ret) { + printk(KERN_ERR PFX "%s: requested IRQ %u is busy\n", + pci_name(pcidev), pcidev->irq); + goto bail65; + } + + nes_write32(nesdev->regs+NES_INT_MASK, ~nesdev->int_req); + + if (nes_notifiers_registered == 0) { + register_inetaddr_notifier(&nes_inetaddr_notifier); + register_netevent_notifier(&nes_net_notifier); + } + nes_notifiers_registered++; + + if (ibnl_add_client(RDMA_NL_NES, RDMA_NL_IWPM_NUM_OPS, nes_nl_cb_table)) + printk(KERN_ERR PFX "%s[%u]: Failed to add netlink callback\n", + __func__, __LINE__); + + ret = iwpm_init(RDMA_NL_NES); + if (ret) { + printk(KERN_ERR PFX "%s: port mapper initialization failed\n", + pci_name(pcidev)); + goto bail7; + } + + INIT_DELAYED_WORK(&nesdev->work, nes_recheck_link_status); + + /* Initialize network devices */ + netdev = nes_netdev_init(nesdev, mmio_regs); + if (netdev == NULL) { + ret = -ENOMEM; + goto bail7; + } + + /* Register network device */ + ret = register_netdev(netdev); + if (ret) { + printk(KERN_ERR PFX "Unable to register netdev, ret = %d\n", ret); + nes_netdev_destroy(netdev); + goto bail7; + } + + nes_print_macaddr(netdev); + + nesdev->netdev_count++; + nesdev->nesadapter->netdev_count++; + + printk(KERN_INFO PFX "%s: NetEffect RNIC driver successfully loaded.\n", + pci_name(pcidev)); + return 0; + + bail7: + printk(KERN_ERR PFX "bail7\n"); + while (nesdev->netdev_count > 0) { + nesdev->netdev_count--; + nesdev->nesadapter->netdev_count--; + + unregister_netdev(nesdev->netdev[nesdev->netdev_count]); + 
nes_netdev_destroy(nesdev->netdev[nesdev->netdev_count]); + } + + nes_debug(NES_DBG_INIT, "netdev_count=%d, nesadapter->netdev_count=%d\n", + nesdev->netdev_count, nesdev->nesadapter->netdev_count); + ibnl_remove_client(RDMA_NL_NES); + + nes_notifiers_registered--; + if (nes_notifiers_registered == 0) { + unregister_netevent_notifier(&nes_net_notifier); + unregister_inetaddr_notifier(&nes_inetaddr_notifier); + } + + list_del(&nesdev->list); + nes_destroy_cqp(nesdev); + + bail65: + printk(KERN_ERR PFX "bail65\n"); + free_irq(pcidev->irq, nesdev); + if (nesdev->msi_enabled) { + pci_disable_msi(pcidev); + } + bail6: + printk(KERN_ERR PFX "bail6\n"); + tasklet_kill(&nesdev->dpc_tasklet); + /* Deallocate the Adapter Structure */ + nes_destroy_adapter(nesdev->nesadapter); + + bail5: + printk(KERN_ERR PFX "bail5\n"); + iounmap(nesdev->regs); + + bail3: + printk(KERN_ERR PFX "bail3\n"); + kfree(nesdev); + + bail2: + pci_release_regions(pcidev); + + bail1: + pci_disable_device(pcidev); + + bail0: + return ret; +} + + +/** + * nes_remove - unload from kernel + */ +static void nes_remove(struct pci_dev *pcidev) +{ + struct nes_device *nesdev = pci_get_drvdata(pcidev); + struct net_device *netdev; + int netdev_index = 0; + unsigned long flags; + + if (nesdev->netdev_count) { + netdev = nesdev->netdev[netdev_index]; + if (netdev) { + netif_stop_queue(netdev); + unregister_netdev(netdev); + nes_netdev_destroy(netdev); + + nesdev->netdev[netdev_index] = NULL; + nesdev->netdev_count--; + nesdev->nesadapter->netdev_count--; + } + } + ibnl_remove_client(RDMA_NL_NES); + iwpm_exit(RDMA_NL_NES); + + nes_notifiers_registered--; + if (nes_notifiers_registered == 0) { + unregister_netevent_notifier(&nes_net_notifier); + unregister_inetaddr_notifier(&nes_inetaddr_notifier); + } + + list_del(&nesdev->list); + nes_destroy_cqp(nesdev); + + free_irq(pcidev->irq, nesdev); + tasklet_kill(&nesdev->dpc_tasklet); + + spin_lock_irqsave(&nesdev->nesadapter->phy_lock, flags); + if (nesdev->link_recheck) { + spin_unlock_irqrestore(&nesdev->nesadapter->phy_lock, flags); + cancel_delayed_work_sync(&nesdev->work); + } else { + spin_unlock_irqrestore(&nesdev->nesadapter->phy_lock, flags); + } + + /* Deallocate the Adapter Structure */ + nes_destroy_adapter(nesdev->nesadapter); + + if (nesdev->msi_enabled) { + pci_disable_msi(pcidev); + } + + iounmap(nesdev->regs); + kfree(nesdev); + + /* nes_debug(NES_DBG_SHUTDOWN, "calling pci_release_regions.\n"); */ + pci_release_regions(pcidev); + pci_disable_device(pcidev); + pci_set_drvdata(pcidev, NULL); +} + + +static struct pci_driver nes_pci_driver = { + .name = DRV_NAME, + .id_table = nes_pci_table, + .probe = nes_probe, + .remove = nes_remove, +}; + +static ssize_t nes_show_adapter(struct device_driver *ddp, char *buf) +{ + unsigned int devfn = 0xffffffff; + unsigned char bus_number = 0xff; + unsigned int i = 0; + struct nes_device *nesdev; + + list_for_each_entry(nesdev, &nes_dev_list, list) { + if (i == ee_flsh_adapter) { + devfn = nesdev->pcidev->devfn; + bus_number = nesdev->pcidev->bus->number; + break; + } + i++; + } + + return snprintf(buf, PAGE_SIZE, "%x:%x\n", bus_number, devfn); +} + +static ssize_t nes_store_adapter(struct device_driver *ddp, + const char *buf, size_t count) +{ + char *p = (char *)buf; + + ee_flsh_adapter = simple_strtoul(p, &p, 10); + return strnlen(buf, count); +} + +static ssize_t nes_show_ee_cmd(struct device_driver *ddp, char *buf) +{ + u32 eeprom_cmd = 0xdead; + u32 i = 0; + struct nes_device *nesdev; + + list_for_each_entry(nesdev, &nes_dev_list, 
list) { + if (i == ee_flsh_adapter) { + eeprom_cmd = nes_read32(nesdev->regs + NES_EEPROM_COMMAND); + break; + } + i++; + } + return snprintf(buf, PAGE_SIZE, "0x%x\n", eeprom_cmd); +} + +static ssize_t nes_store_ee_cmd(struct device_driver *ddp, + const char *buf, size_t count) +{ + char *p = (char *)buf; + u32 val; + u32 i = 0; + struct nes_device *nesdev; + + if (p[1] == 'x' || p[1] == 'X' || p[0] == 'x' || p[0] == 'X') { + val = simple_strtoul(p, &p, 16); + list_for_each_entry(nesdev, &nes_dev_list, list) { + if (i == ee_flsh_adapter) { + nes_write32(nesdev->regs + NES_EEPROM_COMMAND, val); + break; + } + i++; + } + } + return strnlen(buf, count); +} + +static ssize_t nes_show_ee_data(struct device_driver *ddp, char *buf) +{ + u32 eeprom_data = 0xdead; + u32 i = 0; + struct nes_device *nesdev; + + list_for_each_entry(nesdev, &nes_dev_list, list) { + if (i == ee_flsh_adapter) { + eeprom_data = nes_read32(nesdev->regs + NES_EEPROM_DATA); + break; + } + i++; + } + + return snprintf(buf, PAGE_SIZE, "0x%x\n", eeprom_data); +} + +static ssize_t nes_store_ee_data(struct device_driver *ddp, + const char *buf, size_t count) +{ + char *p = (char *)buf; + u32 val; + u32 i = 0; + struct nes_device *nesdev; + + if (p[1] == 'x' || p[1] == 'X' || p[0] == 'x' || p[0] == 'X') { + val = simple_strtoul(p, &p, 16); + list_for_each_entry(nesdev, &nes_dev_list, list) { + if (i == ee_flsh_adapter) { + nes_write32(nesdev->regs + NES_EEPROM_DATA, val); + break; + } + i++; + } + } + return strnlen(buf, count); +} + +static ssize_t nes_show_flash_cmd(struct device_driver *ddp, char *buf) +{ + u32 flash_cmd = 0xdead; + u32 i = 0; + struct nes_device *nesdev; + + list_for_each_entry(nesdev, &nes_dev_list, list) { + if (i == ee_flsh_adapter) { + flash_cmd = nes_read32(nesdev->regs + NES_FLASH_COMMAND); + break; + } + i++; + } + + return snprintf(buf, PAGE_SIZE, "0x%x\n", flash_cmd); +} + +static ssize_t nes_store_flash_cmd(struct device_driver *ddp, + const char *buf, size_t count) +{ + char *p = (char *)buf; + u32 val; + u32 i = 0; + struct nes_device *nesdev; + + if (p[1] == 'x' || p[1] == 'X' || p[0] == 'x' || p[0] == 'X') { + val = simple_strtoul(p, &p, 16); + list_for_each_entry(nesdev, &nes_dev_list, list) { + if (i == ee_flsh_adapter) { + nes_write32(nesdev->regs + NES_FLASH_COMMAND, val); + break; + } + i++; + } + } + return strnlen(buf, count); +} + +static ssize_t nes_show_flash_data(struct device_driver *ddp, char *buf) +{ + u32 flash_data = 0xdead; + u32 i = 0; + struct nes_device *nesdev; + + list_for_each_entry(nesdev, &nes_dev_list, list) { + if (i == ee_flsh_adapter) { + flash_data = nes_read32(nesdev->regs + NES_FLASH_DATA); + break; + } + i++; + } + + return snprintf(buf, PAGE_SIZE, "0x%x\n", flash_data); +} + +static ssize_t nes_store_flash_data(struct device_driver *ddp, + const char *buf, size_t count) +{ + char *p = (char *)buf; + u32 val; + u32 i = 0; + struct nes_device *nesdev; + + if (p[1] == 'x' || p[1] == 'X' || p[0] == 'x' || p[0] == 'X') { + val = simple_strtoul(p, &p, 16); + list_for_each_entry(nesdev, &nes_dev_list, list) { + if (i == ee_flsh_adapter) { + nes_write32(nesdev->regs + NES_FLASH_DATA, val); + break; + } + i++; + } + } + return strnlen(buf, count); +} + +static ssize_t nes_show_nonidx_addr(struct device_driver *ddp, char *buf) +{ + return snprintf(buf, PAGE_SIZE, "0x%x\n", sysfs_nonidx_addr); +} + +static ssize_t nes_store_nonidx_addr(struct device_driver *ddp, + const char *buf, size_t count) +{ + char *p = (char *)buf; + + if (p[1] == 'x' || p[1] == 'X' || p[0] == 'x' || 
p[0] == 'X') + sysfs_nonidx_addr = simple_strtoul(p, &p, 16); + + return strnlen(buf, count); +} + +static ssize_t nes_show_nonidx_data(struct device_driver *ddp, char *buf) +{ + u32 nonidx_data = 0xdead; + u32 i = 0; + struct nes_device *nesdev; + + list_for_each_entry(nesdev, &nes_dev_list, list) { + if (i == ee_flsh_adapter) { + nonidx_data = nes_read32(nesdev->regs + sysfs_nonidx_addr); + break; + } + i++; + } + + return snprintf(buf, PAGE_SIZE, "0x%x\n", nonidx_data); +} + +static ssize_t nes_store_nonidx_data(struct device_driver *ddp, + const char *buf, size_t count) +{ + char *p = (char *)buf; + u32 val; + u32 i = 0; + struct nes_device *nesdev; + + if (p[1] == 'x' || p[1] == 'X' || p[0] == 'x' || p[0] == 'X') { + val = simple_strtoul(p, &p, 16); + list_for_each_entry(nesdev, &nes_dev_list, list) { + if (i == ee_flsh_adapter) { + nes_write32(nesdev->regs + sysfs_nonidx_addr, val); + break; + } + i++; + } + } + return strnlen(buf, count); +} + +static ssize_t nes_show_idx_addr(struct device_driver *ddp, char *buf) +{ + return snprintf(buf, PAGE_SIZE, "0x%x\n", sysfs_idx_addr); +} + +static ssize_t nes_store_idx_addr(struct device_driver *ddp, + const char *buf, size_t count) +{ + char *p = (char *)buf; + + if (p[1] == 'x' || p[1] == 'X' || p[0] == 'x' || p[0] == 'X') + sysfs_idx_addr = simple_strtoul(p, &p, 16); + + return strnlen(buf, count); +} + +static ssize_t nes_show_idx_data(struct device_driver *ddp, char *buf) +{ + u32 idx_data = 0xdead; + u32 i = 0; + struct nes_device *nesdev; + + list_for_each_entry(nesdev, &nes_dev_list, list) { + if (i == ee_flsh_adapter) { + idx_data = nes_read_indexed(nesdev, sysfs_idx_addr); + break; + } + i++; + } + + return snprintf(buf, PAGE_SIZE, "0x%x\n", idx_data); +} + +static ssize_t nes_store_idx_data(struct device_driver *ddp, + const char *buf, size_t count) +{ + char *p = (char *)buf; + u32 val; + u32 i = 0; + struct nes_device *nesdev; + + if (p[1] == 'x' || p[1] == 'X' || p[0] == 'x' || p[0] == 'X') { + val = simple_strtoul(p, &p, 16); + list_for_each_entry(nesdev, &nes_dev_list, list) { + if (i == ee_flsh_adapter) { + nes_write_indexed(nesdev, sysfs_idx_addr, val); + break; + } + i++; + } + } + return strnlen(buf, count); +} + + +/** + * nes_show_wqm_quanta + */ +static ssize_t nes_show_wqm_quanta(struct device_driver *ddp, char *buf) +{ + u32 wqm_quanta_value = 0xdead; + u32 i = 0; + struct nes_device *nesdev; + + list_for_each_entry(nesdev, &nes_dev_list, list) { + if (i == ee_flsh_adapter) { + wqm_quanta_value = nesdev->nesadapter->wqm_quanta; + break; + } + i++; + } + + return snprintf(buf, PAGE_SIZE, "0x%X\n", wqm_quanta_value); +} + + +/** + * nes_store_wqm_quanta + */ +static ssize_t nes_store_wqm_quanta(struct device_driver *ddp, + const char *buf, size_t count) +{ + unsigned long wqm_quanta_value; + u32 wqm_config1; + u32 i = 0; + struct nes_device *nesdev; + + if (kstrtoul(buf, 0, &wqm_quanta_value) < 0) + return -EINVAL; + + list_for_each_entry(nesdev, &nes_dev_list, list) { + if (i == ee_flsh_adapter) { + nesdev->nesadapter->wqm_quanta = wqm_quanta_value; + wqm_config1 = nes_read_indexed(nesdev, + NES_IDX_WQM_CONFIG1); + nes_write_indexed(nesdev, NES_IDX_WQM_CONFIG1, + ((wqm_quanta_value << 1) | + (wqm_config1 & 0x00000001))); + break; + } + i++; + } + return strnlen(buf, count); +} + +static DRIVER_ATTR(adapter, S_IRUSR | S_IWUSR, + nes_show_adapter, nes_store_adapter); +static DRIVER_ATTR(eeprom_cmd, S_IRUSR | S_IWUSR, + nes_show_ee_cmd, nes_store_ee_cmd); +static DRIVER_ATTR(eeprom_data, S_IRUSR | S_IWUSR, + 
nes_show_ee_data, nes_store_ee_data); +static DRIVER_ATTR(flash_cmd, S_IRUSR | S_IWUSR, + nes_show_flash_cmd, nes_store_flash_cmd); +static DRIVER_ATTR(flash_data, S_IRUSR | S_IWUSR, + nes_show_flash_data, nes_store_flash_data); +static DRIVER_ATTR(nonidx_addr, S_IRUSR | S_IWUSR, + nes_show_nonidx_addr, nes_store_nonidx_addr); +static DRIVER_ATTR(nonidx_data, S_IRUSR | S_IWUSR, + nes_show_nonidx_data, nes_store_nonidx_data); +static DRIVER_ATTR(idx_addr, S_IRUSR | S_IWUSR, + nes_show_idx_addr, nes_store_idx_addr); +static DRIVER_ATTR(idx_data, S_IRUSR | S_IWUSR, + nes_show_idx_data, nes_store_idx_data); +static DRIVER_ATTR(wqm_quanta, S_IRUSR | S_IWUSR, + nes_show_wqm_quanta, nes_store_wqm_quanta); + +static int nes_create_driver_sysfs(struct pci_driver *drv) +{ + int error; + error = driver_create_file(&drv->driver, &driver_attr_adapter); + error |= driver_create_file(&drv->driver, &driver_attr_eeprom_cmd); + error |= driver_create_file(&drv->driver, &driver_attr_eeprom_data); + error |= driver_create_file(&drv->driver, &driver_attr_flash_cmd); + error |= driver_create_file(&drv->driver, &driver_attr_flash_data); + error |= driver_create_file(&drv->driver, &driver_attr_nonidx_addr); + error |= driver_create_file(&drv->driver, &driver_attr_nonidx_data); + error |= driver_create_file(&drv->driver, &driver_attr_idx_addr); + error |= driver_create_file(&drv->driver, &driver_attr_idx_data); + error |= driver_create_file(&drv->driver, &driver_attr_wqm_quanta); + return error; +} + +static void nes_remove_driver_sysfs(struct pci_driver *drv) +{ + driver_remove_file(&drv->driver, &driver_attr_adapter); + driver_remove_file(&drv->driver, &driver_attr_eeprom_cmd); + driver_remove_file(&drv->driver, &driver_attr_eeprom_data); + driver_remove_file(&drv->driver, &driver_attr_flash_cmd); + driver_remove_file(&drv->driver, &driver_attr_flash_data); + driver_remove_file(&drv->driver, &driver_attr_nonidx_addr); + driver_remove_file(&drv->driver, &driver_attr_nonidx_data); + driver_remove_file(&drv->driver, &driver_attr_idx_addr); + driver_remove_file(&drv->driver, &driver_attr_idx_data); + driver_remove_file(&drv->driver, &driver_attr_wqm_quanta); +} + +/** + * nes_init_module - module initialization entry point + */ +static int __init nes_init_module(void) +{ + int retval; + int retval1; + + retval = nes_cm_start(); + if (retval) { + printk(KERN_ERR PFX "Unable to start NetEffect iWARP CM.\n"); + return retval; + } + retval = pci_register_driver(&nes_pci_driver); + if (retval >= 0) { + retval1 = nes_create_driver_sysfs(&nes_pci_driver); + if (retval1 < 0) + printk(KERN_ERR PFX "Unable to create NetEffect sys files.\n"); + } + return retval; +} + + +/** + * nes_exit_module - module unload entry point + */ +static void __exit nes_exit_module(void) +{ + nes_cm_stop(); + nes_remove_driver_sysfs(&nes_pci_driver); + + pci_unregister_driver(&nes_pci_driver); +} + + +module_init(nes_init_module); +module_exit(nes_exit_module); diff --git a/kernel/drivers/infiniband/hw/nes/nes.h b/kernel/drivers/infiniband/hw/nes/nes.h new file mode 100644 index 000000000..bd9d132f1 --- /dev/null +++ b/kernel/drivers/infiniband/hw/nes/nes.h @@ -0,0 +1,582 @@ +/* + * Copyright (c) 2006 - 2011 Intel Corporation. All rights reserved. + * Copyright (c) 2005 Open Grid Computing, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. 
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef __NES_H +#define __NES_H + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +#define NES_SEND_FIRST_WRITE + +#define QUEUE_DISCONNECTS + +#define DRV_NAME "iw_nes" +#define DRV_VERSION "1.5.0.1" +#define PFX DRV_NAME ": " + +/* + * NetEffect PCI vendor id and NE010 PCI device id. + */ +#ifndef PCI_VENDOR_ID_NETEFFECT /* not in pci.ids yet */ +#define PCI_VENDOR_ID_NETEFFECT 0x1678 +#define PCI_DEVICE_ID_NETEFFECT_NE020 0x0100 +#define PCI_DEVICE_ID_NETEFFECT_NE020_KR 0x0110 +#endif + +#define NE020_REV 4 +#define NE020_REV1 5 + +#define BAR_0 0 +#define BAR_1 2 + +#define RX_BUF_SIZE (1536 + 8) +#define NES_REG0_SIZE (4 * 1024) +#define NES_TX_TIMEOUT (6*HZ) +#define NES_FIRST_QPN 64 +#define NES_SW_CONTEXT_ALIGN 1024 + +#define NES_NIC_MAX_NICS 16 +#define NES_MAX_ARP_TABLE_SIZE 4096 + +#define NES_NIC_CEQ_SIZE 8 +/* NICs will be on a separate CQ */ +#define NES_CCEQ_SIZE ((nesadapter->max_cq / nesadapter->port_count) - 32) + +#define NES_MAX_PORT_COUNT 4 + +#define MAX_DPC_ITERATIONS 128 + +#define NES_DRV_OPT_ENABLE_MPA_VER_0 0x00000001 +#define NES_DRV_OPT_DISABLE_MPA_CRC 0x00000002 +#define NES_DRV_OPT_DISABLE_FIRST_WRITE 0x00000004 +#define NES_DRV_OPT_DISABLE_INTF 0x00000008 +#define NES_DRV_OPT_ENABLE_MSI 0x00000010 +#define NES_DRV_OPT_DUAL_LOGICAL_PORT 0x00000020 +#define NES_DRV_OPT_SUPRESS_OPTION_BC 0x00000040 +#define NES_DRV_OPT_NO_INLINE_DATA 0x00000080 +#define NES_DRV_OPT_DISABLE_INT_MOD 0x00000100 +#define NES_DRV_OPT_DISABLE_VIRT_WQ 0x00000200 +#define NES_DRV_OPT_ENABLE_PAU 0x00000400 + +#define NES_AEQ_EVENT_TIMEOUT 2500 +#define NES_DISCONNECT_EVENT_TIMEOUT 2000 + +/* debug levels */ +/* must match userspace */ +#define NES_DBG_HW 0x00000001 +#define NES_DBG_INIT 0x00000002 +#define NES_DBG_ISR 0x00000004 +#define NES_DBG_PHY 0x00000008 +#define NES_DBG_NETDEV 0x00000010 +#define NES_DBG_CM 0x00000020 +#define NES_DBG_CM1 0x00000040 +#define NES_DBG_NIC_RX 0x00000080 +#define NES_DBG_NIC_TX 0x00000100 +#define NES_DBG_CQP 0x00000200 +#define NES_DBG_MMAP 0x00000400 +#define NES_DBG_MR 0x00000800 +#define NES_DBG_PD 0x00001000 +#define NES_DBG_CQ 0x00002000 +#define NES_DBG_QP 0x00004000 +#define 
NES_DBG_MOD_QP 0x00008000 +#define NES_DBG_AEQ 0x00010000 +#define NES_DBG_IW_RX 0x00020000 +#define NES_DBG_IW_TX 0x00040000 +#define NES_DBG_SHUTDOWN 0x00080000 +#define NES_DBG_PAU 0x00100000 +#define NES_DBG_NLMSG 0x00200000 +#define NES_DBG_RSVD1 0x10000000 +#define NES_DBG_RSVD2 0x20000000 +#define NES_DBG_RSVD3 0x40000000 +#define NES_DBG_RSVD4 0x80000000 +#define NES_DBG_ALL 0xffffffff + +#ifdef CONFIG_INFINIBAND_NES_DEBUG +#define nes_debug(level, fmt, args...) \ +do { \ + if (level & nes_debug_level) \ + printk(KERN_ERR PFX "%s[%u]: " fmt, __func__, __LINE__, ##args); \ +} while (0) + +#define assert(expr) \ +do { \ + if (!(expr)) { \ + printk(KERN_ERR PFX "Assertion failed! %s, %s, %s, line %d\n", \ + #expr, __FILE__, __func__, __LINE__); \ + } \ +} while (0) + +#define NES_EVENT_TIMEOUT 1200000 +#else +#define nes_debug(level, fmt, args...) +#define assert(expr) do {} while (0) + +#define NES_EVENT_TIMEOUT 100000 +#endif + +#include "nes_hw.h" +#include "nes_verbs.h" +#include "nes_context.h" +#include "nes_user.h" +#include "nes_cm.h" +#include "nes_mgt.h" + +extern int max_mtu; +#define max_frame_len (max_mtu+ETH_HLEN) +extern int interrupt_mod_interval; +extern int nes_if_count; +extern int mpa_version; +extern int disable_mpa_crc; +extern unsigned int nes_drv_opt; +extern unsigned int nes_debug_level; +extern unsigned int wqm_quanta; +extern struct list_head nes_adapter_list; + +extern atomic_t cm_connects; +extern atomic_t cm_accepts; +extern atomic_t cm_disconnects; +extern atomic_t cm_closes; +extern atomic_t cm_connecteds; +extern atomic_t cm_connect_reqs; +extern atomic_t cm_rejects; +extern atomic_t mod_qp_timouts; +extern atomic_t qps_created; +extern atomic_t qps_destroyed; +extern atomic_t sw_qps_destroyed; +extern u32 mh_detected; +extern u32 mh_pauses_sent; +extern u32 cm_packets_sent; +extern u32 cm_packets_bounced; +extern u32 cm_packets_created; +extern u32 cm_packets_received; +extern u32 cm_packets_dropped; +extern u32 cm_packets_retrans; +extern atomic_t cm_listens_created; +extern atomic_t cm_listens_destroyed; +extern u32 cm_backlog_drops; +extern atomic_t cm_loopbacks; +extern atomic_t cm_nodes_created; +extern atomic_t cm_nodes_destroyed; +extern atomic_t cm_accel_dropped_pkts; +extern atomic_t cm_resets_recvd; +extern atomic_t pau_qps_created; +extern atomic_t pau_qps_destroyed; + +extern u32 int_mod_timer_init; +extern u32 int_mod_cq_depth_256; +extern u32 int_mod_cq_depth_128; +extern u32 int_mod_cq_depth_32; +extern u32 int_mod_cq_depth_24; +extern u32 int_mod_cq_depth_16; +extern u32 int_mod_cq_depth_4; +extern u32 int_mod_cq_depth_1; + +struct nes_device { + struct nes_adapter *nesadapter; + void __iomem *regs; + void __iomem *index_reg; + struct pci_dev *pcidev; + struct net_device *netdev[NES_NIC_MAX_NICS]; + u64 link_status_interrupts; + struct tasklet_struct dpc_tasklet; + spinlock_t indexed_regs_lock; + unsigned long csr_start; + unsigned long doorbell_region; + unsigned long doorbell_start; + unsigned long mac_tx_errors; + unsigned long mac_pause_frames_sent; + unsigned long mac_pause_frames_received; + unsigned long mac_rx_errors; + unsigned long mac_rx_crc_errors; + unsigned long mac_rx_symbol_err_frames; + unsigned long mac_rx_jabber_frames; + unsigned long mac_rx_oversized_frames; + unsigned long mac_rx_short_frames; + unsigned long port_rx_discards; + unsigned long port_tx_discards; + unsigned int mac_index; + unsigned int nes_stack_start; + + /* Control Structures */ + void *cqp_vbase; + dma_addr_t cqp_pbase; + u32 cqp_mem_size; + u8 
ceq_index; + u8 nic_ceq_index; + struct nes_hw_cqp cqp; + struct nes_hw_cq ccq; + struct list_head cqp_avail_reqs; + struct list_head cqp_pending_reqs; + struct nes_cqp_request *nes_cqp_requests; + + u32 int_req; + u32 int_stat; + u32 timer_int_req; + u32 timer_only_int_count; + u32 intf_int_req; + u32 last_mac_tx_pauses; + u32 last_used_chunks_tx; + struct list_head list; + + u16 base_doorbell_index; + u16 currcq_count; + u16 deepcq_count; + u8 iw_status; + u8 msi_enabled; + u8 netdev_count; + u8 napi_isr_ran; + u8 disable_rx_flow_control; + u8 disable_tx_flow_control; + + struct delayed_work work; + u8 link_recheck; +}; + +/* Receive skb private area - must fit in skb->cb area */ +struct nes_rskb_cb { + u64 busaddr; + u32 maplen; + u32 seqnum; + u8 *data_start; + struct nes_qp *nesqp; +}; + +static inline __le32 get_crc_value(struct nes_v4_quad *nes_quad) +{ + u32 crc_value; + crc_value = crc32c(~0, (void *)nes_quad, sizeof (struct nes_v4_quad)); + + /* + * With commit ef19454b ("[LIB] crc32c: Keep intermediate crc + * state in cpu order"), behavior of crc32c changes on + * big-endian platforms. Our algorithm expects the previous + * behavior; otherwise we have RDMA connection establishment + * issue on big-endian. + */ + return cpu_to_le32(crc_value); +} + +static inline void +set_wqe_64bit_value(__le32 *wqe_words, u32 index, u64 value) +{ + wqe_words[index] = cpu_to_le32((u32) value); + wqe_words[index + 1] = cpu_to_le32(upper_32_bits(value)); +} + +static inline void +set_wqe_32bit_value(__le32 *wqe_words, u32 index, u32 value) +{ + wqe_words[index] = cpu_to_le32(value); +} + +static inline void +nes_fill_init_cqp_wqe(struct nes_hw_cqp_wqe *cqp_wqe, struct nes_device *nesdev) +{ + cqp_wqe->wqe_words[NES_CQP_WQE_COMP_CTX_LOW_IDX] = 0; + cqp_wqe->wqe_words[NES_CQP_WQE_COMP_CTX_HIGH_IDX] = 0; + cqp_wqe->wqe_words[NES_CQP_WQE_COMP_SCRATCH_LOW_IDX] = 0; + cqp_wqe->wqe_words[NES_CQP_WQE_COMP_SCRATCH_HIGH_IDX] = 0; + cqp_wqe->wqe_words[NES_CQP_STAG_WQE_PBL_BLK_COUNT_IDX] = 0; + cqp_wqe->wqe_words[NES_CQP_STAG_WQE_PBL_LEN_IDX] = 0; + cqp_wqe->wqe_words[NES_CQP_STAG_WQE_LEN_LOW_IDX] = 0; + cqp_wqe->wqe_words[NES_CQP_STAG_WQE_PA_LOW_IDX] = 0; + cqp_wqe->wqe_words[NES_CQP_STAG_WQE_PA_HIGH_IDX] = 0; +} + +static inline void +nes_fill_init_qp_wqe(struct nes_hw_qp_wqe *wqe, struct nes_qp *nesqp, u32 head) +{ + u32 value; + value = ((u32)((unsigned long) nesqp)) | head; + set_wqe_32bit_value(wqe->wqe_words, NES_IWARP_SQ_WQE_COMP_CTX_HIGH_IDX, + (u32)(upper_32_bits((unsigned long)(nesqp)))); + set_wqe_32bit_value(wqe->wqe_words, NES_IWARP_SQ_WQE_COMP_CTX_LOW_IDX, value); +} + +/* Read from memory-mapped device */ +static inline u32 nes_read_indexed(struct nes_device *nesdev, u32 reg_index) +{ + unsigned long flags; + void __iomem *addr = nesdev->index_reg; + u32 value; + + spin_lock_irqsave(&nesdev->indexed_regs_lock, flags); + + writel(reg_index, addr); + value = readl((void __iomem *)addr + 4); + + spin_unlock_irqrestore(&nesdev->indexed_regs_lock, flags); + return value; +} + +static inline u32 nes_read32(const void __iomem *addr) +{ + return readl(addr); +} + +static inline u16 nes_read16(const void __iomem *addr) +{ + return readw(addr); +} + +static inline u8 nes_read8(const void __iomem *addr) +{ + return readb(addr); +} + +/* Write to memory-mapped device */ +static inline void nes_write_indexed(struct nes_device *nesdev, u32 reg_index, u32 val) +{ + unsigned long flags; + void __iomem *addr = nesdev->index_reg; + + spin_lock_irqsave(&nesdev->indexed_regs_lock, flags); + + writel(reg_index, 
addr); + writel(val, (void __iomem *)addr + 4); + + spin_unlock_irqrestore(&nesdev->indexed_regs_lock, flags); +} + +static inline void nes_write32(void __iomem *addr, u32 val) +{ + writel(val, addr); +} + +static inline void nes_write16(void __iomem *addr, u16 val) +{ + writew(val, addr); +} + +static inline void nes_write8(void __iomem *addr, u8 val) +{ + writeb(val, addr); +} + +enum nes_resource { + NES_RESOURCE_MW = 1, + NES_RESOURCE_FAST_MR, + NES_RESOURCE_PHYS_MR, + NES_RESOURCE_USER_MR, + NES_RESOURCE_PD, + NES_RESOURCE_QP, + NES_RESOURCE_CQ, + NES_RESOURCE_ARP +}; + +static inline int nes_alloc_resource(struct nes_adapter *nesadapter, + unsigned long *resource_array, u32 max_resources, + u32 *req_resource_num, u32 *next, enum nes_resource resource_type) +{ + unsigned long flags; + u32 resource_num; + + spin_lock_irqsave(&nesadapter->resource_lock, flags); + + resource_num = find_next_zero_bit(resource_array, max_resources, *next); + if (resource_num >= max_resources) { + resource_num = find_first_zero_bit(resource_array, max_resources); + if (resource_num >= max_resources) { + printk(KERN_ERR PFX "%s: No available resources [type=%u].\n", __func__, resource_type); + spin_unlock_irqrestore(&nesadapter->resource_lock, flags); + return -EMFILE; + } + } + set_bit(resource_num, resource_array); + *next = resource_num+1; + if (*next == max_resources) { + *next = 0; + } + spin_unlock_irqrestore(&nesadapter->resource_lock, flags); + *req_resource_num = resource_num; + + return 0; +} + +static inline int nes_is_resource_allocated(struct nes_adapter *nesadapter, + unsigned long *resource_array, u32 resource_num) +{ + unsigned long flags; + int bit_is_set; + + spin_lock_irqsave(&nesadapter->resource_lock, flags); + + bit_is_set = test_bit(resource_num, resource_array); + nes_debug(NES_DBG_HW, "resource_num %u is%s allocated.\n", + resource_num, (bit_is_set ? 
"": " not")); + spin_unlock_irqrestore(&nesadapter->resource_lock, flags); + + return bit_is_set; +} + +static inline void nes_free_resource(struct nes_adapter *nesadapter, + unsigned long *resource_array, u32 resource_num) +{ + unsigned long flags; + + spin_lock_irqsave(&nesadapter->resource_lock, flags); + clear_bit(resource_num, resource_array); + spin_unlock_irqrestore(&nesadapter->resource_lock, flags); +} + +static inline struct nes_vnic *to_nesvnic(struct ib_device *ibdev) +{ + return container_of(ibdev, struct nes_ib_device, ibdev)->nesvnic; +} + +static inline struct nes_pd *to_nespd(struct ib_pd *ibpd) +{ + return container_of(ibpd, struct nes_pd, ibpd); +} + +static inline struct nes_ucontext *to_nesucontext(struct ib_ucontext *ibucontext) +{ + return container_of(ibucontext, struct nes_ucontext, ibucontext); +} + +static inline struct nes_mr *to_nesmr(struct ib_mr *ibmr) +{ + return container_of(ibmr, struct nes_mr, ibmr); +} + +static inline struct nes_mr *to_nesmr_from_ibfmr(struct ib_fmr *ibfmr) +{ + return container_of(ibfmr, struct nes_mr, ibfmr); +} + +static inline struct nes_mr *to_nesmw(struct ib_mw *ibmw) +{ + return container_of(ibmw, struct nes_mr, ibmw); +} + +static inline struct nes_fmr *to_nesfmr(struct nes_mr *nesmr) +{ + return container_of(nesmr, struct nes_fmr, nesmr); +} + +static inline struct nes_cq *to_nescq(struct ib_cq *ibcq) +{ + return container_of(ibcq, struct nes_cq, ibcq); +} + +static inline struct nes_qp *to_nesqp(struct ib_qp *ibqp) +{ + return container_of(ibqp, struct nes_qp, ibqp); +} + + + +/* nes.c */ +void nes_add_ref(struct ib_qp *); +void nes_rem_ref(struct ib_qp *); +struct ib_qp *nes_get_qp(struct ib_device *, int); + + +/* nes_hw.c */ +struct nes_adapter *nes_init_adapter(struct nes_device *, u8); +void nes_nic_init_timer_defaults(struct nes_device *, u8); +void nes_destroy_adapter(struct nes_adapter *); +int nes_init_cqp(struct nes_device *); +int nes_init_phy(struct nes_device *); +int nes_init_nic_qp(struct nes_device *, struct net_device *); +void nes_destroy_nic_qp(struct nes_vnic *); +int nes_napi_isr(struct nes_device *); +void nes_dpc(unsigned long); +void nes_nic_ce_handler(struct nes_device *, struct nes_hw_nic_cq *); +void nes_iwarp_ce_handler(struct nes_device *, struct nes_hw_cq *); +int nes_destroy_cqp(struct nes_device *); +int nes_nic_cm_xmit(struct sk_buff *, struct net_device *); +void nes_recheck_link_status(struct work_struct *work); +void nes_terminate_timeout(unsigned long context); + +/* nes_nic.c */ +struct net_device *nes_netdev_init(struct nes_device *, void __iomem *); +void nes_netdev_destroy(struct net_device *); +int nes_nic_cm_xmit(struct sk_buff *, struct net_device *); + +/* nes_cm.c */ +void *nes_cm_create(struct net_device *); +int nes_cm_recv(struct sk_buff *, struct net_device *); +void nes_update_arp(unsigned char *, u32, u32, u16, u16); +void nes_manage_arp_cache(struct net_device *, unsigned char *, u32, u32); +void nes_sock_release(struct nes_qp *, unsigned long *); +void flush_wqes(struct nes_device *nesdev, struct nes_qp *, u32, u32); +int nes_manage_apbvt(struct nes_vnic *, u32, u32, u32); +int nes_cm_disconn(struct nes_qp *); +void nes_cm_disconn_worker(void *); + +/* nes_verbs.c */ +int nes_hw_modify_qp(struct nes_device *, struct nes_qp *, u32, u32, u32); +int nes_modify_qp(struct ib_qp *, struct ib_qp_attr *, int, struct ib_udata *); +struct nes_ib_device *nes_init_ofa_device(struct net_device *); +void nes_port_ibevent(struct nes_vnic *nesvnic); +void nes_destroy_ofa_device(struct 
nes_ib_device *); +int nes_register_ofa_device(struct nes_ib_device *); + +/* nes_util.c */ +int nes_read_eeprom_values(struct nes_device *, struct nes_adapter *); +void nes_write_1G_phy_reg(struct nes_device *, u8, u8, u16); +void nes_read_1G_phy_reg(struct nes_device *, u8, u8, u16 *); +void nes_write_10G_phy_reg(struct nes_device *, u16, u8, u16, u16); +void nes_read_10G_phy_reg(struct nes_device *, u8, u8, u16); +struct nes_cqp_request *nes_get_cqp_request(struct nes_device *); +void nes_free_cqp_request(struct nes_device *nesdev, + struct nes_cqp_request *cqp_request); +void nes_put_cqp_request(struct nes_device *nesdev, + struct nes_cqp_request *cqp_request); +void nes_post_cqp_request(struct nes_device *, struct nes_cqp_request *); +int nes_arp_table(struct nes_device *, u32, u8 *, u32); +void nes_mh_fix(unsigned long); +void nes_clc(unsigned long); +void nes_dump_mem(unsigned int, void *, int); +u32 nes_crc32(u32, u32, u32, u32, u8 *, u32, u32, u32); + +#endif /* __NES_H */ diff --git a/kernel/drivers/infiniband/hw/nes/nes_cm.c b/kernel/drivers/infiniband/hw/nes/nes_cm.c new file mode 100644 index 000000000..72b43417c --- /dev/null +++ b/kernel/drivers/infiniband/hw/nes/nes_cm.c @@ -0,0 +1,4184 @@ +/* + * Copyright (c) 2006 - 2014 Intel Corporation. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + */ + + +#define TCPOPT_TIMESTAMP 8 + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "nes.h" + +u32 cm_packets_sent; +u32 cm_packets_bounced; +u32 cm_packets_dropped; +u32 cm_packets_retrans; +u32 cm_packets_created; +u32 cm_packets_received; +atomic_t cm_listens_created; +atomic_t cm_listens_destroyed; +u32 cm_backlog_drops; +atomic_t cm_loopbacks; +atomic_t cm_nodes_created; +atomic_t cm_nodes_destroyed; +atomic_t cm_accel_dropped_pkts; +atomic_t cm_resets_recvd; + +static inline int mini_cm_accelerated(struct nes_cm_core *, struct nes_cm_node *); +static struct nes_cm_listener *mini_cm_listen(struct nes_cm_core *, struct nes_vnic *, struct nes_cm_info *); +static int mini_cm_del_listen(struct nes_cm_core *, struct nes_cm_listener *); +static struct nes_cm_node *mini_cm_connect(struct nes_cm_core *, struct nes_vnic *, u16, void *, struct nes_cm_info *); +static int mini_cm_close(struct nes_cm_core *, struct nes_cm_node *); +static int mini_cm_accept(struct nes_cm_core *, struct nes_cm_node *); +static int mini_cm_reject(struct nes_cm_core *, struct nes_cm_node *); +static int mini_cm_recv_pkt(struct nes_cm_core *, struct nes_vnic *, struct sk_buff *); +static int mini_cm_dealloc_core(struct nes_cm_core *); +static int mini_cm_get(struct nes_cm_core *); +static int mini_cm_set(struct nes_cm_core *, u32, u32); + +static void form_cm_frame(struct sk_buff *, struct nes_cm_node *, void *, u32, void *, u32, u8); +static int add_ref_cm_node(struct nes_cm_node *); +static int rem_ref_cm_node(struct nes_cm_core *, struct nes_cm_node *); + +static int nes_cm_disconn_true(struct nes_qp *); +static int nes_cm_post_event(struct nes_cm_event *event); +static int nes_disconnect(struct nes_qp *nesqp, int abrupt); +static void nes_disconnect_worker(struct work_struct *work); + +static int send_mpa_request(struct nes_cm_node *, struct sk_buff *); +static int send_mpa_reject(struct nes_cm_node *); +static int send_syn(struct nes_cm_node *, u32, struct sk_buff *); +static int send_reset(struct nes_cm_node *, struct sk_buff *); +static int send_ack(struct nes_cm_node *cm_node, struct sk_buff *skb); +static int send_fin(struct nes_cm_node *cm_node, struct sk_buff *skb); +static void process_packet(struct nes_cm_node *, struct sk_buff *, struct nes_cm_core *); + +static void active_open_err(struct nes_cm_node *, struct sk_buff *, int); +static void passive_open_err(struct nes_cm_node *, struct sk_buff *, int); +static void cleanup_retrans_entry(struct nes_cm_node *); +static void handle_rcv_mpa(struct nes_cm_node *, struct sk_buff *); +static void free_retrans_entry(struct nes_cm_node *cm_node); +static int handle_tcp_options(struct nes_cm_node *cm_node, struct tcphdr *tcph, struct sk_buff *skb, int optionsize, int passive); + +/* CM event handler functions */ +static void cm_event_connected(struct nes_cm_event *); +static void cm_event_connect_error(struct nes_cm_event *); +static void cm_event_reset(struct nes_cm_event *); +static void cm_event_mpa_req(struct nes_cm_event *); +static void cm_event_mpa_reject(struct nes_cm_event *); +static void handle_recv_entry(struct nes_cm_node *cm_node, u32 rem_node); + +/* MPA build functions */ +static int cm_build_mpa_frame(struct nes_cm_node *, u8 **, u16 *, u8 *, u8); +static void build_mpa_v2(struct nes_cm_node *, void *, u8); +static void 
build_mpa_v1(struct nes_cm_node *, void *, u8); +static void build_rdma0_msg(struct nes_cm_node *, struct nes_qp **); + +static void print_core(struct nes_cm_core *core); +static void record_ird_ord(struct nes_cm_node *, u16, u16); + +/* External CM API Interface */ +/* instance of function pointers for client API */ +/* set address of this instance to cm_core->cm_ops at cm_core alloc */ +static struct nes_cm_ops nes_cm_api = { + mini_cm_accelerated, + mini_cm_listen, + mini_cm_del_listen, + mini_cm_connect, + mini_cm_close, + mini_cm_accept, + mini_cm_reject, + mini_cm_recv_pkt, + mini_cm_dealloc_core, + mini_cm_get, + mini_cm_set +}; + +static struct nes_cm_core *g_cm_core; + +atomic_t cm_connects; +atomic_t cm_accepts; +atomic_t cm_disconnects; +atomic_t cm_closes; +atomic_t cm_connecteds; +atomic_t cm_connect_reqs; +atomic_t cm_rejects; + +int nes_add_ref_cm_node(struct nes_cm_node *cm_node) +{ + return add_ref_cm_node(cm_node); +} + +int nes_rem_ref_cm_node(struct nes_cm_node *cm_node) +{ + return rem_ref_cm_node(cm_node->cm_core, cm_node); +} +/** + * create_event + */ +static struct nes_cm_event *create_event(struct nes_cm_node * cm_node, + enum nes_cm_event_type type) +{ + struct nes_cm_event *event; + + if (!cm_node->cm_id) + return NULL; + + /* allocate an empty event */ + event = kzalloc(sizeof(*event), GFP_ATOMIC); + + if (!event) + return NULL; + + event->type = type; + event->cm_node = cm_node; + event->cm_info.rem_addr = cm_node->rem_addr; + event->cm_info.loc_addr = cm_node->loc_addr; + event->cm_info.rem_port = cm_node->rem_port; + event->cm_info.loc_port = cm_node->loc_port; + event->cm_info.cm_id = cm_node->cm_id; + + nes_debug(NES_DBG_CM, "cm_node=%p Created event=%p, type=%u, " + "dst_addr=%08x[%x], src_addr=%08x[%x]\n", + cm_node, event, type, event->cm_info.loc_addr, + event->cm_info.loc_port, event->cm_info.rem_addr, + event->cm_info.rem_port); + + nes_cm_post_event(event); + return event; +} + + +/** + * send_mpa_request + */ +static int send_mpa_request(struct nes_cm_node *cm_node, struct sk_buff *skb) +{ + u8 start_addr = 0; + u8 *start_ptr = &start_addr; + u8 **start_buff = &start_ptr; + u16 buff_len = 0; + + if (!skb) { + nes_debug(NES_DBG_CM, "skb set to NULL\n"); + return -1; + } + + /* send an MPA Request frame */ + cm_build_mpa_frame(cm_node, start_buff, &buff_len, NULL, MPA_KEY_REQUEST); + form_cm_frame(skb, cm_node, NULL, 0, *start_buff, buff_len, SET_ACK); + + return schedule_nes_timer(cm_node, skb, NES_TIMER_TYPE_SEND, 1, 0); +} + + + +static int send_mpa_reject(struct nes_cm_node *cm_node) +{ + struct sk_buff *skb = NULL; + u8 start_addr = 0; + u8 *start_ptr = &start_addr; + u8 **start_buff = &start_ptr; + u16 buff_len = 0; + struct ietf_mpa_v1 *mpa_frame; + + skb = dev_alloc_skb(MAX_CM_BUFFER); + if (!skb) { + nes_debug(NES_DBG_CM, "Failed to get a Free pkt\n"); + return -ENOMEM; + } + + /* send an MPA reject frame */ + cm_build_mpa_frame(cm_node, start_buff, &buff_len, NULL, MPA_KEY_REPLY); + mpa_frame = (struct ietf_mpa_v1 *)*start_buff; + mpa_frame->flags |= IETF_MPA_FLAGS_REJECT; + form_cm_frame(skb, cm_node, NULL, 0, *start_buff, buff_len, SET_ACK | SET_FIN); + + cm_node->state = NES_CM_STATE_FIN_WAIT1; + return schedule_nes_timer(cm_node, skb, NES_TIMER_TYPE_SEND, 1, 0); +} + + +/** + * recv_mpa - process a received TCP pkt, we are expecting an + * IETF MPA frame + */ +static int parse_mpa(struct nes_cm_node *cm_node, u8 *buffer, u32 *type, + u32 len) +{ + struct ietf_mpa_v1 *mpa_frame; + struct ietf_mpa_v2 *mpa_v2_frame; + struct ietf_rtr_msg 
*rtr_msg; + int mpa_hdr_len; + int priv_data_len; + + *type = NES_MPA_REQUEST_ACCEPT; + + /* assume req frame is in tcp data payload */ + if (len < sizeof(struct ietf_mpa_v1)) { + nes_debug(NES_DBG_CM, "The received ietf buffer was too small (%x)\n", len); + return -EINVAL; + } + + /* points to the beginning of the frame, which could be MPA V1 or V2 */ + mpa_frame = (struct ietf_mpa_v1 *)buffer; + mpa_hdr_len = sizeof(struct ietf_mpa_v1); + priv_data_len = ntohs(mpa_frame->priv_data_len); + + /* make sure mpa private data len is less than 512 bytes */ + if (priv_data_len > IETF_MAX_PRIV_DATA_LEN) { + nes_debug(NES_DBG_CM, "The received Length of Private" + " Data field exceeds 512 octets\n"); + return -EINVAL; + } + /* + * make sure MPA receiver interoperate with the + * received MPA version and MPA key information + * + */ + if (mpa_frame->rev != IETF_MPA_V1 && mpa_frame->rev != IETF_MPA_V2) { + nes_debug(NES_DBG_CM, "The received mpa version" + " is not supported\n"); + return -EINVAL; + } + /* + * backwards compatibility only + */ + if (mpa_frame->rev > cm_node->mpa_frame_rev) { + nes_debug(NES_DBG_CM, "The received mpa version" + " can not be interoperated\n"); + return -EINVAL; + } else { + cm_node->mpa_frame_rev = mpa_frame->rev; + } + + if (cm_node->state != NES_CM_STATE_MPAREQ_SENT) { + if (memcmp(mpa_frame->key, IEFT_MPA_KEY_REQ, IETF_MPA_KEY_SIZE)) { + nes_debug(NES_DBG_CM, "Unexpected MPA Key received \n"); + return -EINVAL; + } + } else { + if (memcmp(mpa_frame->key, IEFT_MPA_KEY_REP, IETF_MPA_KEY_SIZE)) { + nes_debug(NES_DBG_CM, "Unexpected MPA Key received \n"); + return -EINVAL; + } + } + + if (priv_data_len + mpa_hdr_len != len) { + nes_debug(NES_DBG_CM, "The received ietf buffer was not right" + " complete (%x + %x != %x)\n", + priv_data_len, mpa_hdr_len, len); + return -EINVAL; + } + /* make sure it does not exceed the max size */ + if (len > MAX_CM_BUFFER) { + nes_debug(NES_DBG_CM, "The received ietf buffer was too large" + " (%x + %x != %x)\n", + priv_data_len, mpa_hdr_len, len); + return -EINVAL; + } + + cm_node->mpa_frame_size = priv_data_len; + + switch (mpa_frame->rev) { + case IETF_MPA_V2: { + u16 ird_size; + u16 ord_size; + u16 rtr_ctrl_ird; + u16 rtr_ctrl_ord; + + mpa_v2_frame = (struct ietf_mpa_v2 *)buffer; + mpa_hdr_len += IETF_RTR_MSG_SIZE; + cm_node->mpa_frame_size -= IETF_RTR_MSG_SIZE; + rtr_msg = &mpa_v2_frame->rtr_msg; + + /* parse rtr message */ + rtr_ctrl_ird = ntohs(rtr_msg->ctrl_ird); + rtr_ctrl_ord = ntohs(rtr_msg->ctrl_ord); + ird_size = rtr_ctrl_ird & IETF_NO_IRD_ORD; + ord_size = rtr_ctrl_ord & IETF_NO_IRD_ORD; + + if (!(rtr_ctrl_ird & IETF_PEER_TO_PEER)) { + /* send reset */ + return -EINVAL; + } + if (ird_size == IETF_NO_IRD_ORD || ord_size == IETF_NO_IRD_ORD) + cm_node->mpav2_ird_ord = IETF_NO_IRD_ORD; + + if (cm_node->mpav2_ird_ord != IETF_NO_IRD_ORD) { + /* responder */ + if (cm_node->state != NES_CM_STATE_MPAREQ_SENT) { + /* we are still negotiating */ + if (ord_size > NES_MAX_IRD) { + cm_node->ird_size = NES_MAX_IRD; + } else { + cm_node->ird_size = ord_size; + if (ord_size == 0 && + (rtr_ctrl_ord & IETF_RDMA0_READ)) { + cm_node->ird_size = 1; + nes_debug(NES_DBG_CM, + "%s: Remote peer doesn't support RDMA0_READ (ord=%u)\n", + __func__, ord_size); + } + } + if (ird_size > NES_MAX_ORD) + cm_node->ord_size = NES_MAX_ORD; + else + cm_node->ord_size = ird_size; + } else { /* initiator */ + if (ord_size > NES_MAX_IRD) { + nes_debug(NES_DBG_CM, + "%s: Unable to support the requested (ord =%u)\n", + __func__, ord_size); + return -EINVAL; + } + 
cm_node->ird_size = ord_size; + + if (ird_size > NES_MAX_ORD) { + cm_node->ord_size = NES_MAX_ORD; + } else { + if (ird_size == 0 && + (rtr_ctrl_ord & IETF_RDMA0_READ)) { + nes_debug(NES_DBG_CM, + "%s: Remote peer doesn't support RDMA0_READ (ird=%u)\n", + __func__, ird_size); + return -EINVAL; + } else { + cm_node->ord_size = ird_size; + } + } + } + } + + if (rtr_ctrl_ord & IETF_RDMA0_READ) { + cm_node->send_rdma0_op = SEND_RDMA_READ_ZERO; + + } else if (rtr_ctrl_ord & IETF_RDMA0_WRITE) { + cm_node->send_rdma0_op = SEND_RDMA_WRITE_ZERO; + } else { /* Not supported RDMA0 operation */ + return -EINVAL; + } + break; + } + case IETF_MPA_V1: + default: + break; + } + + /* copy entire MPA frame to our cm_node's frame */ + memcpy(cm_node->mpa_frame_buf, buffer + mpa_hdr_len, cm_node->mpa_frame_size); + + if (mpa_frame->flags & IETF_MPA_FLAGS_REJECT) + *type = NES_MPA_REQUEST_REJECT; + return 0; +} + + +/** + * form_cm_frame - get a free packet and build empty frame Use + * node info to build. + */ +static void form_cm_frame(struct sk_buff *skb, + struct nes_cm_node *cm_node, void *options, u32 optionsize, + void *data, u32 datasize, u8 flags) +{ + struct tcphdr *tcph; + struct iphdr *iph; + struct ethhdr *ethh; + u8 *buf; + u16 packetsize = sizeof(*iph); + + packetsize += sizeof(*tcph); + packetsize += optionsize + datasize; + + skb_trim(skb, 0); + memset(skb->data, 0x00, ETH_HLEN + sizeof(*iph) + sizeof(*tcph)); + + buf = skb_put(skb, packetsize + ETH_HLEN); + + ethh = (struct ethhdr *)buf; + buf += ETH_HLEN; + + iph = (struct iphdr *)buf; + buf += sizeof(*iph); + tcph = (struct tcphdr *)buf; + skb_reset_mac_header(skb); + skb_set_network_header(skb, ETH_HLEN); + skb_set_transport_header(skb, ETH_HLEN + sizeof(*iph)); + buf += sizeof(*tcph); + + skb->ip_summed = CHECKSUM_PARTIAL; + if (!(cm_node->netdev->features & NETIF_F_IP_CSUM)) + skb->ip_summed = CHECKSUM_NONE; + skb->protocol = htons(0x800); + skb->data_len = 0; + skb->mac_len = ETH_HLEN; + + memcpy(ethh->h_dest, cm_node->rem_mac, ETH_ALEN); + memcpy(ethh->h_source, cm_node->loc_mac, ETH_ALEN); + ethh->h_proto = htons(0x0800); + + iph->version = IPVERSION; + iph->ihl = 5; /* 5 * 4Byte words, IP headr len */ + iph->tos = 0; + iph->tot_len = htons(packetsize); + iph->id = htons(++cm_node->tcp_cntxt.loc_id); + + iph->frag_off = htons(0x4000); + iph->ttl = 0x40; + iph->protocol = 0x06; /* IPPROTO_TCP */ + + iph->saddr = htonl(cm_node->mapped_loc_addr); + iph->daddr = htonl(cm_node->mapped_rem_addr); + + tcph->source = htons(cm_node->mapped_loc_port); + tcph->dest = htons(cm_node->mapped_rem_port); + tcph->seq = htonl(cm_node->tcp_cntxt.loc_seq_num); + + if (flags & SET_ACK) { + cm_node->tcp_cntxt.loc_ack_num = cm_node->tcp_cntxt.rcv_nxt; + tcph->ack_seq = htonl(cm_node->tcp_cntxt.loc_ack_num); + tcph->ack = 1; + } else { + tcph->ack_seq = 0; + } + + if (flags & SET_SYN) { + cm_node->tcp_cntxt.loc_seq_num++; + tcph->syn = 1; + } else { + cm_node->tcp_cntxt.loc_seq_num += datasize; + } + + if (flags & SET_FIN) { + cm_node->tcp_cntxt.loc_seq_num++; + tcph->fin = 1; + } + + if (flags & SET_RST) + tcph->rst = 1; + + tcph->doff = (u16)((sizeof(*tcph) + optionsize + 3) >> 2); + tcph->window = htons(cm_node->tcp_cntxt.rcv_wnd); + tcph->urg_ptr = 0; + if (optionsize) + memcpy(buf, options, optionsize); + buf += optionsize; + if (datasize) + memcpy(buf, data, datasize); + + skb_shinfo(skb)->nr_frags = 0; + cm_packets_created++; +} + +/* + * nes_create_sockaddr - Record ip addr and tcp port in a sockaddr struct + */ +static void 
nes_create_sockaddr(__be32 ip_addr, __be16 port, + struct sockaddr_storage *addr) +{ + struct sockaddr_in *nes_sockaddr = (struct sockaddr_in *)addr; + nes_sockaddr->sin_family = AF_INET; + memcpy(&nes_sockaddr->sin_addr.s_addr, &ip_addr, sizeof(__be32)); + nes_sockaddr->sin_port = port; +} + +/* + * nes_create_mapinfo - Create a mapinfo object in the port mapper data base + */ +static int nes_create_mapinfo(struct nes_cm_info *cm_info) +{ + struct sockaddr_storage local_sockaddr; + struct sockaddr_storage mapped_sockaddr; + + nes_create_sockaddr(htonl(cm_info->loc_addr), htons(cm_info->loc_port), + &local_sockaddr); + nes_create_sockaddr(htonl(cm_info->mapped_loc_addr), + htons(cm_info->mapped_loc_port), &mapped_sockaddr); + + return iwpm_create_mapinfo(&local_sockaddr, + &mapped_sockaddr, RDMA_NL_NES); +} + +/* + * nes_remove_mapinfo - Remove a mapinfo object from the port mapper data base + * and send a remove mapping op message to + * the userspace port mapper + */ +static int nes_remove_mapinfo(u32 loc_addr, u16 loc_port, + u32 mapped_loc_addr, u16 mapped_loc_port) +{ + struct sockaddr_storage local_sockaddr; + struct sockaddr_storage mapped_sockaddr; + + nes_create_sockaddr(htonl(loc_addr), htons(loc_port), &local_sockaddr); + nes_create_sockaddr(htonl(mapped_loc_addr), htons(mapped_loc_port), + &mapped_sockaddr); + + iwpm_remove_mapinfo(&local_sockaddr, &mapped_sockaddr); + return iwpm_remove_mapping(&local_sockaddr, RDMA_NL_NES); +} + +/* + * nes_form_pm_msg - Form a port mapper message with mapping info + */ +static void nes_form_pm_msg(struct nes_cm_info *cm_info, + struct iwpm_sa_data *pm_msg) +{ + nes_create_sockaddr(htonl(cm_info->loc_addr), htons(cm_info->loc_port), + &pm_msg->loc_addr); + nes_create_sockaddr(htonl(cm_info->rem_addr), htons(cm_info->rem_port), + &pm_msg->rem_addr); +} + +/* + * nes_form_reg_msg - Form a port mapper message with dev info + */ +static void nes_form_reg_msg(struct nes_vnic *nesvnic, + struct iwpm_dev_data *pm_msg) +{ + memcpy(pm_msg->dev_name, nesvnic->nesibdev->ibdev.name, + IWPM_DEVNAME_SIZE); + memcpy(pm_msg->if_name, nesvnic->netdev->name, IWPM_IFNAME_SIZE); +} + +static void record_sockaddr_info(struct sockaddr_storage *addr_info, + nes_addr_t *ip_addr, u16 *port_num) +{ + struct sockaddr_in *in_addr = (struct sockaddr_in *)addr_info; + + if (in_addr->sin_family == AF_INET) { + *ip_addr = ntohl(in_addr->sin_addr.s_addr); + *port_num = ntohs(in_addr->sin_port); + } +} + +/* + * nes_record_pm_msg - Save the received mapping info + */ +static void nes_record_pm_msg(struct nes_cm_info *cm_info, + struct iwpm_sa_data *pm_msg) +{ + record_sockaddr_info(&pm_msg->mapped_loc_addr, + &cm_info->mapped_loc_addr, &cm_info->mapped_loc_port); + + record_sockaddr_info(&pm_msg->mapped_rem_addr, + &cm_info->mapped_rem_addr, &cm_info->mapped_rem_port); +} + +/* + * nes_get_reminfo - Get the address info of the remote connecting peer + */ +static int nes_get_remote_addr(struct nes_cm_node *cm_node) +{ + struct sockaddr_storage mapped_loc_addr, mapped_rem_addr; + struct sockaddr_storage remote_addr; + int ret; + + nes_create_sockaddr(htonl(cm_node->mapped_loc_addr), + htons(cm_node->mapped_loc_port), &mapped_loc_addr); + nes_create_sockaddr(htonl(cm_node->mapped_rem_addr), + htons(cm_node->mapped_rem_port), &mapped_rem_addr); + + ret = iwpm_get_remote_info(&mapped_loc_addr, &mapped_rem_addr, + &remote_addr, RDMA_NL_NES); + if (ret) + nes_debug(NES_DBG_CM, "Unable to find remote peer address info\n"); + else + record_sockaddr_info(&remote_addr, 
&cm_node->rem_addr, + &cm_node->rem_port); + return ret; +} + +/** + * print_core - dump a cm core + */ +static void print_core(struct nes_cm_core *core) +{ + nes_debug(NES_DBG_CM, "---------------------------------------------\n"); + nes_debug(NES_DBG_CM, "CM Core -- (core = %p )\n", core); + if (!core) + return; + nes_debug(NES_DBG_CM, "---------------------------------------------\n"); + + nes_debug(NES_DBG_CM, "State : %u \n", core->state); + + nes_debug(NES_DBG_CM, "Listen Nodes : %u \n", atomic_read(&core->listen_node_cnt)); + nes_debug(NES_DBG_CM, "Active Nodes : %u \n", atomic_read(&core->node_cnt)); + + nes_debug(NES_DBG_CM, "core : %p \n", core); + + nes_debug(NES_DBG_CM, "-------------- end core ---------------\n"); +} + +static void record_ird_ord(struct nes_cm_node *cm_node, + u16 conn_ird, u16 conn_ord) +{ + if (conn_ird > NES_MAX_IRD) + conn_ird = NES_MAX_IRD; + + if (conn_ord > NES_MAX_ORD) + conn_ord = NES_MAX_ORD; + + cm_node->ird_size = conn_ird; + cm_node->ord_size = conn_ord; +} + +/** + * cm_build_mpa_frame - build a MPA V1 frame or MPA V2 frame + */ +static int cm_build_mpa_frame(struct nes_cm_node *cm_node, u8 **start_buff, + u16 *buff_len, u8 *pci_mem, u8 mpa_key) +{ + int ret = 0; + + *start_buff = (pci_mem) ? pci_mem : &cm_node->mpa_frame_buf[0]; + + switch (cm_node->mpa_frame_rev) { + case IETF_MPA_V1: + *start_buff = (u8 *)*start_buff + sizeof(struct ietf_rtr_msg); + *buff_len = sizeof(struct ietf_mpa_v1) + cm_node->mpa_frame_size; + build_mpa_v1(cm_node, *start_buff, mpa_key); + break; + case IETF_MPA_V2: + *buff_len = sizeof(struct ietf_mpa_v2) + cm_node->mpa_frame_size; + build_mpa_v2(cm_node, *start_buff, mpa_key); + break; + default: + ret = -EINVAL; + } + return ret; +} + +/** + * build_mpa_v2 - build a MPA V2 frame + */ +static void build_mpa_v2(struct nes_cm_node *cm_node, + void *start_addr, u8 mpa_key) +{ + struct ietf_mpa_v2 *mpa_frame = (struct ietf_mpa_v2 *)start_addr; + struct ietf_rtr_msg *rtr_msg = &mpa_frame->rtr_msg; + u16 ctrl_ird; + u16 ctrl_ord; + + /* initialize the upper 5 bytes of the frame */ + build_mpa_v1(cm_node, start_addr, mpa_key); + mpa_frame->flags |= IETF_MPA_V2_FLAG; /* set a bit to indicate MPA V2 */ + mpa_frame->priv_data_len += htons(IETF_RTR_MSG_SIZE); + + /* initialize RTR msg */ + if (cm_node->mpav2_ird_ord == IETF_NO_IRD_ORD) { + ctrl_ird = IETF_NO_IRD_ORD; + ctrl_ord = IETF_NO_IRD_ORD; + } else { + ctrl_ird = cm_node->ird_size & IETF_NO_IRD_ORD; + ctrl_ord = cm_node->ord_size & IETF_NO_IRD_ORD; + } + ctrl_ird |= IETF_PEER_TO_PEER; + ctrl_ird |= IETF_FLPDU_ZERO_LEN; + + switch (mpa_key) { + case MPA_KEY_REQUEST: + ctrl_ord |= IETF_RDMA0_WRITE; + ctrl_ord |= IETF_RDMA0_READ; + break; + case MPA_KEY_REPLY: + switch (cm_node->send_rdma0_op) { + case SEND_RDMA_WRITE_ZERO: + ctrl_ord |= IETF_RDMA0_WRITE; + break; + case SEND_RDMA_READ_ZERO: + ctrl_ord |= IETF_RDMA0_READ; + break; + } + } + rtr_msg->ctrl_ird = htons(ctrl_ird); + rtr_msg->ctrl_ord = htons(ctrl_ord); +} + +/** + * build_mpa_v1 - build a MPA V1 frame + */ +static void build_mpa_v1(struct nes_cm_node *cm_node, void *start_addr, u8 mpa_key) +{ + struct ietf_mpa_v1 *mpa_frame = (struct ietf_mpa_v1 *)start_addr; + + switch (mpa_key) { + case MPA_KEY_REQUEST: + memcpy(mpa_frame->key, IEFT_MPA_KEY_REQ, IETF_MPA_KEY_SIZE); + break; + case MPA_KEY_REPLY: + memcpy(mpa_frame->key, IEFT_MPA_KEY_REP, IETF_MPA_KEY_SIZE); + break; + } + mpa_frame->flags = IETF_MPA_FLAGS_CRC; + mpa_frame->rev = cm_node->mpa_frame_rev; + mpa_frame->priv_data_len = 
htons(cm_node->mpa_frame_size); +} + +static void build_rdma0_msg(struct nes_cm_node *cm_node, struct nes_qp **nesqp_addr) +{ + u64 u64temp; + struct nes_qp *nesqp = *nesqp_addr; + struct nes_hw_qp_wqe *wqe = &nesqp->hwqp.sq_vbase[0]; + + u64temp = (unsigned long)nesqp->nesuqp_addr; + u64temp |= NES_SW_CONTEXT_ALIGN >> 1; + set_wqe_64bit_value(wqe->wqe_words, NES_IWARP_SQ_WQE_COMP_CTX_LOW_IDX, u64temp); + + wqe->wqe_words[NES_IWARP_SQ_WQE_FRAG0_LOW_IDX] = 0; + wqe->wqe_words[NES_IWARP_SQ_WQE_FRAG0_HIGH_IDX] = 0; + + switch (cm_node->send_rdma0_op) { + case SEND_RDMA_WRITE_ZERO: + nes_debug(NES_DBG_CM, "Sending first write.\n"); + wqe->wqe_words[NES_IWARP_SQ_WQE_MISC_IDX] = + cpu_to_le32(NES_IWARP_SQ_OP_RDMAW); + wqe->wqe_words[NES_IWARP_SQ_WQE_TOTAL_PAYLOAD_IDX] = 0; + wqe->wqe_words[NES_IWARP_SQ_WQE_LENGTH0_IDX] = 0; + wqe->wqe_words[NES_IWARP_SQ_WQE_STAG0_IDX] = 0; + break; + + case SEND_RDMA_READ_ZERO: + default: + if (cm_node->send_rdma0_op != SEND_RDMA_READ_ZERO) + WARN(1, "Unsupported RDMA0 len operation=%u\n", + cm_node->send_rdma0_op); + nes_debug(NES_DBG_CM, "Sending first rdma operation.\n"); + wqe->wqe_words[NES_IWARP_SQ_WQE_MISC_IDX] = + cpu_to_le32(NES_IWARP_SQ_OP_RDMAR); + wqe->wqe_words[NES_IWARP_SQ_WQE_RDMA_TO_LOW_IDX] = 1; + wqe->wqe_words[NES_IWARP_SQ_WQE_RDMA_TO_HIGH_IDX] = 0; + wqe->wqe_words[NES_IWARP_SQ_WQE_RDMA_LENGTH_IDX] = 0; + wqe->wqe_words[NES_IWARP_SQ_WQE_RDMA_STAG_IDX] = 1; + wqe->wqe_words[NES_IWARP_SQ_WQE_STAG0_IDX] = 1; + break; + } + + if (nesqp->sq_kmapped) { + nesqp->sq_kmapped = 0; + kunmap(nesqp->page); + } + + /*use the reserved spot on the WQ for the extra first WQE*/ + nesqp->nesqp_context->ird_ord_sizes &= cpu_to_le32(~(NES_QPCONTEXT_ORDIRD_LSMM_PRESENT | + NES_QPCONTEXT_ORDIRD_WRPDU | + NES_QPCONTEXT_ORDIRD_ALSMM)); + nesqp->skip_lsmm = 1; + nesqp->hwqp.sq_tail = 0; +} + +/** + * schedule_nes_timer + * note - cm_node needs to be protected before calling this. 
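+ * For NES_TIMER_TYPE_SEND entries the frame passed in via skb is
+ * retransmitted from nes_cm_timer_tick() until the entry's
+ * retranscount/retrycount budget is used up, so the node has to stay
+ * referenced for that whole window.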
Encase in: + * rem_ref_cm_node(cm_core, cm_node);add_ref_cm_node(cm_node); + */ +int schedule_nes_timer(struct nes_cm_node *cm_node, struct sk_buff *skb, + enum nes_timer_type type, int send_retrans, + int close_when_complete) +{ + unsigned long flags; + struct nes_cm_core *cm_core = cm_node->cm_core; + struct nes_timer_entry *new_send; + int ret = 0; + + new_send = kzalloc(sizeof(*new_send), GFP_ATOMIC); + if (!new_send) + return -ENOMEM; + + /* new_send->timetosend = currenttime */ + new_send->retrycount = NES_DEFAULT_RETRYS; + new_send->retranscount = NES_DEFAULT_RETRANS; + new_send->skb = skb; + new_send->timetosend = jiffies; + new_send->type = type; + new_send->netdev = cm_node->netdev; + new_send->send_retrans = send_retrans; + new_send->close_when_complete = close_when_complete; + + if (type == NES_TIMER_TYPE_CLOSE) { + new_send->timetosend += (HZ / 10); + if (cm_node->recv_entry) { + kfree(new_send); + WARN_ON(1); + return -EINVAL; + } + cm_node->recv_entry = new_send; + } + + if (type == NES_TIMER_TYPE_SEND) { + new_send->seq_num = ntohl(tcp_hdr(skb)->seq); + atomic_inc(&new_send->skb->users); + spin_lock_irqsave(&cm_node->retrans_list_lock, flags); + cm_node->send_entry = new_send; + add_ref_cm_node(cm_node); + spin_unlock_irqrestore(&cm_node->retrans_list_lock, flags); + new_send->timetosend = jiffies + NES_RETRY_TIMEOUT; + + ret = nes_nic_cm_xmit(new_send->skb, cm_node->netdev); + if (ret != NETDEV_TX_OK) { + nes_debug(NES_DBG_CM, "Error sending packet %p " + "(jiffies = %lu)\n", new_send, jiffies); + new_send->timetosend = jiffies; + ret = NETDEV_TX_OK; + } else { + cm_packets_sent++; + if (!send_retrans) { + cleanup_retrans_entry(cm_node); + if (close_when_complete) + rem_ref_cm_node(cm_core, cm_node); + return ret; + } + } + } + + if (!timer_pending(&cm_core->tcp_timer)) + mod_timer(&cm_core->tcp_timer, new_send->timetosend); + + return ret; +} + +static void nes_retrans_expired(struct nes_cm_node *cm_node) +{ + struct iw_cm_id *cm_id = cm_node->cm_id; + enum nes_cm_node_state state = cm_node->state; + cm_node->state = NES_CM_STATE_CLOSED; + + switch (state) { + case NES_CM_STATE_SYN_RCVD: + case NES_CM_STATE_CLOSING: + rem_ref_cm_node(cm_node->cm_core, cm_node); + break; + case NES_CM_STATE_LAST_ACK: + case NES_CM_STATE_FIN_WAIT1: + if (cm_node->cm_id) + cm_id->rem_ref(cm_id); + send_reset(cm_node, NULL); + break; + default: + add_ref_cm_node(cm_node); + send_reset(cm_node, NULL); + create_event(cm_node, NES_CM_EVENT_ABORTED); + } +} + +static void handle_recv_entry(struct nes_cm_node *cm_node, u32 rem_node) +{ + struct nes_timer_entry *recv_entry = cm_node->recv_entry; + struct iw_cm_id *cm_id = cm_node->cm_id; + struct nes_qp *nesqp; + unsigned long qplockflags; + + if (!recv_entry) + return; + nesqp = (struct nes_qp *)recv_entry->skb; + if (nesqp) { + spin_lock_irqsave(&nesqp->lock, qplockflags); + if (nesqp->cm_id) { + nes_debug(NES_DBG_CM, "QP%u: cm_id = %p, " + "refcount = %d: HIT A " + "NES_TIMER_TYPE_CLOSE with something " + "to do!!!\n", nesqp->hwqp.qp_id, cm_id, + atomic_read(&nesqp->refcount)); + nesqp->hw_tcp_state = NES_AEQE_TCP_STATE_CLOSED; + nesqp->last_aeq = NES_AEQE_AEID_RESET_SENT; + nesqp->ibqp_state = IB_QPS_ERR; + spin_unlock_irqrestore(&nesqp->lock, qplockflags); + nes_cm_disconn(nesqp); + } else { + spin_unlock_irqrestore(&nesqp->lock, qplockflags); + nes_debug(NES_DBG_CM, "QP%u: cm_id = %p, " + "refcount = %d: HIT A " + "NES_TIMER_TYPE_CLOSE with nothing " + "to do!!!\n", nesqp->hwqp.qp_id, cm_id, + atomic_read(&nesqp->refcount)); + } + } else if 
(rem_node) { + /* TIME_WAIT state */ + rem_ref_cm_node(cm_node->cm_core, cm_node); + } + if (cm_node->cm_id) + cm_id->rem_ref(cm_id); + kfree(recv_entry); + cm_node->recv_entry = NULL; +} + +/** + * nes_cm_timer_tick + */ +static void nes_cm_timer_tick(unsigned long pass) +{ + unsigned long flags; + unsigned long nexttimeout = jiffies + NES_LONG_TIME; + struct nes_cm_node *cm_node; + struct nes_timer_entry *send_entry, *recv_entry; + struct list_head *list_core_temp; + struct list_head *list_node; + struct nes_cm_core *cm_core = g_cm_core; + u32 settimer = 0; + unsigned long timetosend; + int ret = NETDEV_TX_OK; + + struct list_head timer_list; + + INIT_LIST_HEAD(&timer_list); + spin_lock_irqsave(&cm_core->ht_lock, flags); + + list_for_each_safe(list_node, list_core_temp, + &cm_core->connected_nodes) { + cm_node = container_of(list_node, struct nes_cm_node, list); + if ((cm_node->recv_entry) || (cm_node->send_entry)) { + add_ref_cm_node(cm_node); + list_add(&cm_node->timer_entry, &timer_list); + } + } + spin_unlock_irqrestore(&cm_core->ht_lock, flags); + + list_for_each_safe(list_node, list_core_temp, &timer_list) { + cm_node = container_of(list_node, struct nes_cm_node, + timer_entry); + recv_entry = cm_node->recv_entry; + + if (recv_entry) { + if (time_after(recv_entry->timetosend, jiffies)) { + if (nexttimeout > recv_entry->timetosend || + !settimer) { + nexttimeout = recv_entry->timetosend; + settimer = 1; + } + } else { + handle_recv_entry(cm_node, 1); + } + } + + spin_lock_irqsave(&cm_node->retrans_list_lock, flags); + do { + send_entry = cm_node->send_entry; + if (!send_entry) + break; + if (time_after(send_entry->timetosend, jiffies)) { + if (cm_node->state != NES_CM_STATE_TSA) { + if ((nexttimeout > + send_entry->timetosend) || + !settimer) { + nexttimeout = + send_entry->timetosend; + settimer = 1; + } + } else { + free_retrans_entry(cm_node); + } + break; + } + + if ((cm_node->state == NES_CM_STATE_TSA) || + (cm_node->state == NES_CM_STATE_CLOSED)) { + free_retrans_entry(cm_node); + break; + } + + if (!send_entry->retranscount || + !send_entry->retrycount) { + cm_packets_dropped++; + free_retrans_entry(cm_node); + + spin_unlock_irqrestore( + &cm_node->retrans_list_lock, flags); + nes_retrans_expired(cm_node); + cm_node->state = NES_CM_STATE_CLOSED; + spin_lock_irqsave(&cm_node->retrans_list_lock, + flags); + break; + } + atomic_inc(&send_entry->skb->users); + cm_packets_retrans++; + nes_debug(NES_DBG_CM, "Retransmitting send_entry %p " + "for node %p, jiffies = %lu, time to send = " + "%lu, retranscount = %u, send_entry->seq_num = " + "0x%08X, cm_node->tcp_cntxt.rem_ack_num = " + "0x%08X\n", send_entry, cm_node, jiffies, + send_entry->timetosend, + send_entry->retranscount, + send_entry->seq_num, + cm_node->tcp_cntxt.rem_ack_num); + + spin_unlock_irqrestore(&cm_node->retrans_list_lock, + flags); + ret = nes_nic_cm_xmit(send_entry->skb, cm_node->netdev); + spin_lock_irqsave(&cm_node->retrans_list_lock, flags); + if (ret != NETDEV_TX_OK) { + nes_debug(NES_DBG_CM, "rexmit failed for " + "node=%p\n", cm_node); + cm_packets_bounced++; + send_entry->retrycount--; + nexttimeout = jiffies + NES_SHORT_TIME; + settimer = 1; + break; + } else { + cm_packets_sent++; + } + nes_debug(NES_DBG_CM, "Packet Sent: retrans count = " + "%u, retry count = %u.\n", + send_entry->retranscount, + send_entry->retrycount); + if (send_entry->send_retrans) { + send_entry->retranscount--; + timetosend = (NES_RETRY_TIMEOUT << + (NES_DEFAULT_RETRANS - send_entry->retranscount)); + + send_entry->timetosend = 
jiffies + + min(timetosend, NES_MAX_TIMEOUT); + if (nexttimeout > send_entry->timetosend || + !settimer) { + nexttimeout = send_entry->timetosend; + settimer = 1; + } + } else { + int close_when_complete; + close_when_complete = + send_entry->close_when_complete; + nes_debug(NES_DBG_CM, "cm_node=%p state=%d\n", + cm_node, cm_node->state); + free_retrans_entry(cm_node); + if (close_when_complete) + rem_ref_cm_node(cm_node->cm_core, + cm_node); + } + } while (0); + + spin_unlock_irqrestore(&cm_node->retrans_list_lock, flags); + rem_ref_cm_node(cm_node->cm_core, cm_node); + } + + if (settimer) { + if (!timer_pending(&cm_core->tcp_timer)) + mod_timer(&cm_core->tcp_timer, nexttimeout); + } +} + + +/** + * send_syn + */ +static int send_syn(struct nes_cm_node *cm_node, u32 sendack, + struct sk_buff *skb) +{ + int ret; + int flags = SET_SYN; + char optionsbuffer[sizeof(struct option_mss) + + sizeof(struct option_windowscale) + sizeof(struct option_base) + + TCP_OPTIONS_PADDING]; + + int optionssize = 0; + /* Sending MSS option */ + union all_known_options *options; + + if (!cm_node) + return -EINVAL; + + options = (union all_known_options *)&optionsbuffer[optionssize]; + options->as_mss.optionnum = OPTION_NUMBER_MSS; + options->as_mss.length = sizeof(struct option_mss); + options->as_mss.mss = htons(cm_node->tcp_cntxt.mss); + optionssize += sizeof(struct option_mss); + + options = (union all_known_options *)&optionsbuffer[optionssize]; + options->as_windowscale.optionnum = OPTION_NUMBER_WINDOW_SCALE; + options->as_windowscale.length = sizeof(struct option_windowscale); + options->as_windowscale.shiftcount = cm_node->tcp_cntxt.rcv_wscale; + optionssize += sizeof(struct option_windowscale); + + if (sendack && !(NES_DRV_OPT_SUPRESS_OPTION_BC & nes_drv_opt)) { + options = (union all_known_options *)&optionsbuffer[optionssize]; + options->as_base.optionnum = OPTION_NUMBER_WRITE0; + options->as_base.length = sizeof(struct option_base); + optionssize += sizeof(struct option_base); + /* we need the size to be a multiple of 4 */ + options = (union all_known_options *)&optionsbuffer[optionssize]; + options->as_end = 1; + optionssize += 1; + options = (union all_known_options *)&optionsbuffer[optionssize]; + options->as_end = 1; + optionssize += 1; + } + + options = (union all_known_options *)&optionsbuffer[optionssize]; + options->as_end = OPTION_NUMBER_END; + optionssize += 1; + + if (!skb) + skb = dev_alloc_skb(MAX_CM_BUFFER); + if (!skb) { + nes_debug(NES_DBG_CM, "Failed to get a Free pkt\n"); + return -1; + } + + if (sendack) + flags |= SET_ACK; + + form_cm_frame(skb, cm_node, optionsbuffer, optionssize, NULL, 0, flags); + ret = schedule_nes_timer(cm_node, skb, NES_TIMER_TYPE_SEND, 1, 0); + + return ret; +} + + +/** + * send_reset + */ +static int send_reset(struct nes_cm_node *cm_node, struct sk_buff *skb) +{ + int ret; + int flags = SET_RST | SET_ACK; + + if (!skb) + skb = dev_alloc_skb(MAX_CM_BUFFER); + if (!skb) { + nes_debug(NES_DBG_CM, "Failed to get a Free pkt\n"); + return -ENOMEM; + } + + form_cm_frame(skb, cm_node, NULL, 0, NULL, 0, flags); + ret = schedule_nes_timer(cm_node, skb, NES_TIMER_TYPE_SEND, 0, 1); + + return ret; +} + + +/** + * send_ack + */ +static int send_ack(struct nes_cm_node *cm_node, struct sk_buff *skb) +{ + int ret; + + if (!skb) + skb = dev_alloc_skb(MAX_CM_BUFFER); + + if (!skb) { + nes_debug(NES_DBG_CM, "Failed to get a Free pkt\n"); + return -1; + } + + form_cm_frame(skb, cm_node, NULL, 0, NULL, 0, SET_ACK); + ret = schedule_nes_timer(cm_node, skb, 
NES_TIMER_TYPE_SEND, 0, 0); + + return ret; +} + + +/** + * send_fin + */ +static int send_fin(struct nes_cm_node *cm_node, struct sk_buff *skb) +{ + int ret; + + /* if we didn't get a frame get one */ + if (!skb) + skb = dev_alloc_skb(MAX_CM_BUFFER); + + if (!skb) { + nes_debug(NES_DBG_CM, "Failed to get a Free pkt\n"); + return -1; + } + + form_cm_frame(skb, cm_node, NULL, 0, NULL, 0, SET_ACK | SET_FIN); + ret = schedule_nes_timer(cm_node, skb, NES_TIMER_TYPE_SEND, 1, 0); + + return ret; +} + + +/** + * find_node - find a cm node that matches the reference cm node + */ +static struct nes_cm_node *find_node(struct nes_cm_core *cm_core, + u16 rem_port, nes_addr_t rem_addr, u16 loc_port, nes_addr_t loc_addr) +{ + unsigned long flags; + struct list_head *hte; + struct nes_cm_node *cm_node; + + /* get a handle on the hte */ + hte = &cm_core->connected_nodes; + + /* walk list and find cm_node associated with this session ID */ + spin_lock_irqsave(&cm_core->ht_lock, flags); + list_for_each_entry(cm_node, hte, list) { + /* compare quad, return node handle if a match */ + nes_debug(NES_DBG_CM, "finding node %x:%x =? %x:%x ^ %x:%x =? %x:%x\n", + cm_node->loc_addr, cm_node->loc_port, + loc_addr, loc_port, + cm_node->rem_addr, cm_node->rem_port, + rem_addr, rem_port); + if ((cm_node->mapped_loc_addr == loc_addr) && + (cm_node->mapped_loc_port == loc_port) && + (cm_node->mapped_rem_addr == rem_addr) && + (cm_node->mapped_rem_port == rem_port)) { + + add_ref_cm_node(cm_node); + spin_unlock_irqrestore(&cm_core->ht_lock, flags); + return cm_node; + } + } + spin_unlock_irqrestore(&cm_core->ht_lock, flags); + + /* no owner node */ + return NULL; +} + + +/** + * find_listener - find a cm node listening on this addr-port pair + */ +static struct nes_cm_listener *find_listener(struct nes_cm_core *cm_core, + nes_addr_t dst_addr, u16 dst_port, + enum nes_cm_listener_state listener_state, int local) +{ + unsigned long flags; + struct nes_cm_listener *listen_node; + nes_addr_t listen_addr; + u16 listen_port; + + /* walk list and find cm_node associated with this session ID */ + spin_lock_irqsave(&cm_core->listen_list_lock, flags); + list_for_each_entry(listen_node, &cm_core->listen_list.list, list) { + if (local) { + listen_addr = listen_node->loc_addr; + listen_port = listen_node->loc_port; + } else { + listen_addr = listen_node->mapped_loc_addr; + listen_port = listen_node->mapped_loc_port; + } + /* compare node pair, return node handle if a match */ + if (((listen_addr == dst_addr) || + listen_addr == 0x00000000) && + (listen_port == dst_port) && + (listener_state & listen_node->listener_state)) { + atomic_inc(&listen_node->ref_count); + spin_unlock_irqrestore(&cm_core->listen_list_lock, flags); + return listen_node; + } + } + spin_unlock_irqrestore(&cm_core->listen_list_lock, flags); + + /* no listener */ + return NULL; +} + +/** + * add_hte_node - add a cm node to the hash table + */ +static int add_hte_node(struct nes_cm_core *cm_core, struct nes_cm_node *cm_node) +{ + unsigned long flags; + struct list_head *hte; + + if (!cm_node || !cm_core) + return -EINVAL; + + nes_debug(NES_DBG_CM, "Adding Node %p to Active Connection HT\n", + cm_node); + + spin_lock_irqsave(&cm_core->ht_lock, flags); + + /* get a handle on the hash table element (list head for this slot) */ + hte = &cm_core->connected_nodes; + list_add_tail(&cm_node->list, hte); + atomic_inc(&cm_core->ht_node_cnt); + + spin_unlock_irqrestore(&cm_core->ht_lock, flags); + + return 0; +} + + +/** + * mini_cm_dec_refcnt_listen + */ +static int 
mini_cm_dec_refcnt_listen(struct nes_cm_core *cm_core, + struct nes_cm_listener *listener, int free_hanging_nodes) +{ + int ret = -EINVAL; + int err = 0; + unsigned long flags; + struct list_head *list_pos = NULL; + struct list_head *list_temp = NULL; + struct nes_cm_node *cm_node = NULL; + struct list_head reset_list; + + nes_debug(NES_DBG_CM, "attempting listener= %p free_nodes= %d, " + "refcnt=%d\n", listener, free_hanging_nodes, + atomic_read(&listener->ref_count)); + /* free non-accelerated child nodes for this listener */ + INIT_LIST_HEAD(&reset_list); + if (free_hanging_nodes) { + spin_lock_irqsave(&cm_core->ht_lock, flags); + list_for_each_safe(list_pos, list_temp, + &g_cm_core->connected_nodes) { + cm_node = container_of(list_pos, struct nes_cm_node, + list); + if ((cm_node->listener == listener) && + (!cm_node->accelerated)) { + add_ref_cm_node(cm_node); + list_add(&cm_node->reset_entry, &reset_list); + } + } + spin_unlock_irqrestore(&cm_core->ht_lock, flags); + } + + list_for_each_safe(list_pos, list_temp, &reset_list) { + cm_node = container_of(list_pos, struct nes_cm_node, + reset_entry); + { + struct nes_cm_node *loopback = cm_node->loopbackpartner; + enum nes_cm_node_state old_state; + if (NES_CM_STATE_FIN_WAIT1 <= cm_node->state) { + rem_ref_cm_node(cm_node->cm_core, cm_node); + } else { + if (!loopback) { + cleanup_retrans_entry(cm_node); + err = send_reset(cm_node, NULL); + if (err) { + cm_node->state = + NES_CM_STATE_CLOSED; + WARN_ON(1); + } else { + old_state = cm_node->state; + cm_node->state = NES_CM_STATE_LISTENER_DESTROYED; + if (old_state != NES_CM_STATE_MPAREQ_RCVD) + rem_ref_cm_node( + cm_node->cm_core, + cm_node); + } + } else { + struct nes_cm_event event; + + event.cm_node = loopback; + event.cm_info.rem_addr = + loopback->rem_addr; + event.cm_info.loc_addr = + loopback->loc_addr; + event.cm_info.rem_port = + loopback->rem_port; + event.cm_info.loc_port = + loopback->loc_port; + event.cm_info.cm_id = loopback->cm_id; + add_ref_cm_node(loopback); + loopback->state = NES_CM_STATE_CLOSED; + cm_event_connect_error(&event); + cm_node->state = NES_CM_STATE_LISTENER_DESTROYED; + + rem_ref_cm_node(cm_node->cm_core, + cm_node); + + } + } + } + } + + spin_lock_irqsave(&cm_core->listen_list_lock, flags); + if (!atomic_dec_return(&listener->ref_count)) { + list_del(&listener->list); + + /* decrement our listen node count */ + atomic_dec(&cm_core->listen_node_cnt); + + spin_unlock_irqrestore(&cm_core->listen_list_lock, flags); + + if (listener->nesvnic) { + nes_manage_apbvt(listener->nesvnic, + listener->mapped_loc_port, + PCI_FUNC(listener->nesvnic->nesdev->pcidev->devfn), + NES_MANAGE_APBVT_DEL); + + nes_remove_mapinfo(listener->loc_addr, + listener->loc_port, + listener->mapped_loc_addr, + listener->mapped_loc_port); + nes_debug(NES_DBG_NLMSG, + "Delete APBVT mapped_loc_port = %04X\n", + listener->mapped_loc_port); + } + + nes_debug(NES_DBG_CM, "destroying listener (%p)\n", listener); + + kfree(listener); + listener = NULL; + ret = 0; + atomic_inc(&cm_listens_destroyed); + } else { + spin_unlock_irqrestore(&cm_core->listen_list_lock, flags); + } + if (listener) { + if (atomic_read(&listener->pend_accepts_cnt) > 0) + nes_debug(NES_DBG_CM, "destroying listener (%p)" + " with non-zero pending accepts=%u\n", + listener, atomic_read(&listener->pend_accepts_cnt)); + } + + return ret; +} + + +/** + * mini_cm_del_listen + */ +static int mini_cm_del_listen(struct nes_cm_core *cm_core, + struct nes_cm_listener *listener) +{ + listener->listener_state = 
NES_CM_LISTENER_PASSIVE_STATE; + listener->cm_id = NULL; /* going to be destroyed pretty soon */ + return mini_cm_dec_refcnt_listen(cm_core, listener, 1); +} + + +/** + * mini_cm_accelerated + */ +static inline int mini_cm_accelerated(struct nes_cm_core *cm_core, + struct nes_cm_node *cm_node) +{ + cm_node->accelerated = 1; + + if (cm_node->accept_pend) { + BUG_ON(!cm_node->listener); + atomic_dec(&cm_node->listener->pend_accepts_cnt); + cm_node->accept_pend = 0; + BUG_ON(atomic_read(&cm_node->listener->pend_accepts_cnt) < 0); + } + + if (!timer_pending(&cm_core->tcp_timer)) + mod_timer(&cm_core->tcp_timer, (jiffies + NES_SHORT_TIME)); + + return 0; +} + + +/** + * nes_addr_resolve_neigh + */ +static int nes_addr_resolve_neigh(struct nes_vnic *nesvnic, u32 dst_ip, int arpindex) +{ + struct rtable *rt; + struct neighbour *neigh; + int rc = arpindex; + struct net_device *netdev; + struct nes_adapter *nesadapter = nesvnic->nesdev->nesadapter; + + rt = ip_route_output(&init_net, htonl(dst_ip), 0, 0, 0); + if (IS_ERR(rt)) { + printk(KERN_ERR "%s: ip_route_output_key failed for 0x%08X\n", + __func__, dst_ip); + return rc; + } + + if (netif_is_bond_slave(nesvnic->netdev)) + netdev = netdev_master_upper_dev_get(nesvnic->netdev); + else + netdev = nesvnic->netdev; + + neigh = neigh_lookup(&arp_tbl, &rt->rt_gateway, netdev); + + rcu_read_lock(); + if (neigh) { + if (neigh->nud_state & NUD_VALID) { + nes_debug(NES_DBG_CM, "Neighbor MAC address for 0x%08X" + " is %pM, Gateway is 0x%08X \n", dst_ip, + neigh->ha, ntohl(rt->rt_gateway)); + + if (arpindex >= 0) { + if (ether_addr_equal(nesadapter->arp_table[arpindex].mac_addr, neigh->ha)) { + /* Mac address same as in nes_arp_table */ + goto out; + } + + nes_manage_arp_cache(nesvnic->netdev, + nesadapter->arp_table[arpindex].mac_addr, + dst_ip, NES_ARP_DELETE); + } + + nes_manage_arp_cache(nesvnic->netdev, neigh->ha, + dst_ip, NES_ARP_ADD); + rc = nes_arp_table(nesvnic->nesdev, dst_ip, NULL, + NES_ARP_RESOLVE); + } else { + neigh_event_send(neigh, NULL); + } + } +out: + rcu_read_unlock(); + + if (neigh) + neigh_release(neigh); + + ip_rt_put(rt); + return rc; +} + +/** + * make_cm_node - create a new instance of a cm node + */ +static struct nes_cm_node *make_cm_node(struct nes_cm_core *cm_core, + struct nes_vnic *nesvnic, struct nes_cm_info *cm_info, + struct nes_cm_listener *listener) +{ + struct nes_cm_node *cm_node; + struct timespec ts; + int oldarpindex = 0; + int arpindex = 0; + struct nes_device *nesdev; + struct nes_adapter *nesadapter; + + /* create an hte and cm_node for this instance */ + cm_node = kzalloc(sizeof(*cm_node), GFP_ATOMIC); + if (!cm_node) + return NULL; + + /* set our node specific transport info */ + if (listener) { + cm_node->loc_addr = listener->loc_addr; + cm_node->loc_port = listener->loc_port; + } else { + cm_node->loc_addr = cm_info->loc_addr; + cm_node->loc_port = cm_info->loc_port; + } + cm_node->rem_addr = cm_info->rem_addr; + cm_node->rem_port = cm_info->rem_port; + + cm_node->mapped_loc_addr = cm_info->mapped_loc_addr; + cm_node->mapped_rem_addr = cm_info->mapped_rem_addr; + cm_node->mapped_loc_port = cm_info->mapped_loc_port; + cm_node->mapped_rem_port = cm_info->mapped_rem_port; + + cm_node->mpa_frame_rev = mpa_version; + cm_node->send_rdma0_op = SEND_RDMA_READ_ZERO; + cm_node->mpav2_ird_ord = 0; + cm_node->ird_size = 0; + cm_node->ord_size = 0; + + nes_debug(NES_DBG_CM, "Make node addresses : loc = %pI4:%x, rem = %pI4:%x\n", + &cm_node->loc_addr, cm_node->loc_port, + &cm_node->rem_addr, cm_node->rem_port); + 
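+ /*
+  * Note: the node carries both the application-visible addresses
+  * (loc_addr/rem_addr) and the port-mapper translated ones (mapped_*);
+  * the wire frames built by form_cm_frame() and the lookups done in
+  * find_node() use the mapped values.
+  */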
cm_node->listener = listener; + cm_node->netdev = nesvnic->netdev; + cm_node->cm_id = cm_info->cm_id; + memcpy(cm_node->loc_mac, nesvnic->netdev->dev_addr, ETH_ALEN); + + nes_debug(NES_DBG_CM, "listener=%p, cm_id=%p\n", cm_node->listener, + cm_node->cm_id); + + spin_lock_init(&cm_node->retrans_list_lock); + + cm_node->loopbackpartner = NULL; + atomic_set(&cm_node->ref_count, 1); + /* associate our parent CM core */ + cm_node->cm_core = cm_core; + cm_node->tcp_cntxt.loc_id = NES_CM_DEF_LOCAL_ID; + cm_node->tcp_cntxt.rcv_wscale = NES_CM_DEFAULT_RCV_WND_SCALE; + cm_node->tcp_cntxt.rcv_wnd = NES_CM_DEFAULT_RCV_WND_SCALED >> + NES_CM_DEFAULT_RCV_WND_SCALE; + ts = current_kernel_time(); + cm_node->tcp_cntxt.loc_seq_num = htonl(ts.tv_nsec); + cm_node->tcp_cntxt.mss = nesvnic->max_frame_size - sizeof(struct iphdr) - + sizeof(struct tcphdr) - ETH_HLEN - VLAN_HLEN; + cm_node->tcp_cntxt.rcv_nxt = 0; + /* get a unique session ID , add thread_id to an upcounter to handle race */ + atomic_inc(&cm_core->node_cnt); + cm_node->conn_type = cm_info->conn_type; + cm_node->apbvt_set = 0; + cm_node->accept_pend = 0; + + cm_node->nesvnic = nesvnic; + /* get some device handles, for arp lookup */ + nesdev = nesvnic->nesdev; + nesadapter = nesdev->nesadapter; + + cm_node->loopbackpartner = NULL; + + /* get the mac addr for the remote node */ + oldarpindex = nes_arp_table(nesdev, cm_node->mapped_rem_addr, + NULL, NES_ARP_RESOLVE); + arpindex = nes_addr_resolve_neigh(nesvnic, + cm_node->mapped_rem_addr, oldarpindex); + if (arpindex < 0) { + kfree(cm_node); + return NULL; + } + + /* copy the mac addr to node context */ + memcpy(cm_node->rem_mac, nesadapter->arp_table[arpindex].mac_addr, ETH_ALEN); + nes_debug(NES_DBG_CM, "Remote mac addr from arp table: %pM\n", + cm_node->rem_mac); + + add_hte_node(cm_core, cm_node); + atomic_inc(&cm_nodes_created); + + return cm_node; +} + + +/** + * add_ref_cm_node - destroy an instance of a cm node + */ +static int add_ref_cm_node(struct nes_cm_node *cm_node) +{ + atomic_inc(&cm_node->ref_count); + return 0; +} + + +/** + * rem_ref_cm_node - destroy an instance of a cm node + */ +static int rem_ref_cm_node(struct nes_cm_core *cm_core, + struct nes_cm_node *cm_node) +{ + unsigned long flags; + struct nes_qp *nesqp; + + if (!cm_node) + return -EINVAL; + + spin_lock_irqsave(&cm_node->cm_core->ht_lock, flags); + if (atomic_dec_return(&cm_node->ref_count)) { + spin_unlock_irqrestore(&cm_node->cm_core->ht_lock, flags); + return 0; + } + list_del(&cm_node->list); + atomic_dec(&cm_core->ht_node_cnt); + spin_unlock_irqrestore(&cm_node->cm_core->ht_lock, flags); + + /* if the node is destroyed before connection was accelerated */ + if (!cm_node->accelerated && cm_node->accept_pend) { + BUG_ON(!cm_node->listener); + atomic_dec(&cm_node->listener->pend_accepts_cnt); + BUG_ON(atomic_read(&cm_node->listener->pend_accepts_cnt) < 0); + } + WARN_ON(cm_node->send_entry); + if (cm_node->recv_entry) + handle_recv_entry(cm_node, 0); + if (cm_node->listener) { + mini_cm_dec_refcnt_listen(cm_core, cm_node->listener, 0); + } else { + if (cm_node->apbvt_set && cm_node->nesvnic) { + nes_manage_apbvt(cm_node->nesvnic, cm_node->mapped_loc_port, + PCI_FUNC(cm_node->nesvnic->nesdev->pcidev->devfn), + NES_MANAGE_APBVT_DEL); + } + nes_debug(NES_DBG_NLMSG, "Delete APBVT mapped_loc_port = %04X\n", + cm_node->mapped_loc_port); + nes_remove_mapinfo(cm_node->loc_addr, cm_node->loc_port, + cm_node->mapped_loc_addr, cm_node->mapped_loc_port); + } + + atomic_dec(&cm_core->node_cnt); + atomic_inc(&cm_nodes_destroyed); + 
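+ /* drop the QP back-reference held by this node, if one was attached */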
nesqp = cm_node->nesqp; + if (nesqp) { + nesqp->cm_node = NULL; + nes_rem_ref(&nesqp->ibqp); + cm_node->nesqp = NULL; + } + + kfree(cm_node); + return 0; +} + +/** + * process_options + */ +static int process_options(struct nes_cm_node *cm_node, u8 *optionsloc, + u32 optionsize, u32 syn_packet) +{ + u32 tmp; + u32 offset = 0; + union all_known_options *all_options; + char got_mss_option = 0; + + while (offset < optionsize) { + all_options = (union all_known_options *)(optionsloc + offset); + switch (all_options->as_base.optionnum) { + case OPTION_NUMBER_END: + offset = optionsize; + break; + case OPTION_NUMBER_NONE: + offset += 1; + continue; + case OPTION_NUMBER_MSS: + nes_debug(NES_DBG_CM, "%s: MSS Length: %d Offset: %d " + "Size: %d\n", __func__, + all_options->as_mss.length, offset, optionsize); + got_mss_option = 1; + if (all_options->as_mss.length != 4) { + return 1; + } else { + tmp = ntohs(all_options->as_mss.mss); + if (tmp > 0 && tmp < + cm_node->tcp_cntxt.mss) + cm_node->tcp_cntxt.mss = tmp; + } + break; + case OPTION_NUMBER_WINDOW_SCALE: + cm_node->tcp_cntxt.snd_wscale = + all_options->as_windowscale.shiftcount; + break; + default: + nes_debug(NES_DBG_CM, "TCP Option not understood: %x\n", + all_options->as_base.optionnum); + break; + } + offset += all_options->as_base.length; + } + if ((!got_mss_option) && (syn_packet)) + cm_node->tcp_cntxt.mss = NES_CM_DEFAULT_MSS; + return 0; +} + +static void drop_packet(struct sk_buff *skb) +{ + atomic_inc(&cm_accel_dropped_pkts); + dev_kfree_skb_any(skb); +} + +static void handle_fin_pkt(struct nes_cm_node *cm_node) +{ + nes_debug(NES_DBG_CM, "Received FIN, cm_node = %p, state = %u. " + "refcnt=%d\n", cm_node, cm_node->state, + atomic_read(&cm_node->ref_count)); + switch (cm_node->state) { + case NES_CM_STATE_SYN_RCVD: + case NES_CM_STATE_SYN_SENT: + case NES_CM_STATE_ESTABLISHED: + case NES_CM_STATE_MPAREJ_RCVD: + cm_node->tcp_cntxt.rcv_nxt++; + cleanup_retrans_entry(cm_node); + cm_node->state = NES_CM_STATE_LAST_ACK; + send_fin(cm_node, NULL); + break; + case NES_CM_STATE_MPAREQ_SENT: + create_event(cm_node, NES_CM_EVENT_ABORTED); + cm_node->tcp_cntxt.rcv_nxt++; + cleanup_retrans_entry(cm_node); + cm_node->state = NES_CM_STATE_CLOSED; + add_ref_cm_node(cm_node); + send_reset(cm_node, NULL); + break; + case NES_CM_STATE_FIN_WAIT1: + cm_node->tcp_cntxt.rcv_nxt++; + cleanup_retrans_entry(cm_node); + cm_node->state = NES_CM_STATE_CLOSING; + send_ack(cm_node, NULL); + /* Wait for ACK as this is simultaneous close.. + * After we receive ACK, do not send anything.. + * Just rm the node.. Done.. */ + break; + case NES_CM_STATE_FIN_WAIT2: + cm_node->tcp_cntxt.rcv_nxt++; + cleanup_retrans_entry(cm_node); + cm_node->state = NES_CM_STATE_TIME_WAIT; + send_ack(cm_node, NULL); + schedule_nes_timer(cm_node, NULL, NES_TIMER_TYPE_CLOSE, 1, 0); + break; + case NES_CM_STATE_TIME_WAIT: + cm_node->tcp_cntxt.rcv_nxt++; + cleanup_retrans_entry(cm_node); + cm_node->state = NES_CM_STATE_CLOSED; + rem_ref_cm_node(cm_node->cm_core, cm_node); + break; + case NES_CM_STATE_TSA: + default: + nes_debug(NES_DBG_CM, "Error Rcvd FIN for node-%p state = %d\n", + cm_node, cm_node->state); + break; + } +} + + +static void handle_rst_pkt(struct nes_cm_node *cm_node, struct sk_buff *skb, + struct tcphdr *tcph) +{ + + int reset = 0; /* whether to send reset in case of err.. */ + atomic_inc(&cm_resets_recvd); + nes_debug(NES_DBG_CM, "Received Reset, cm_node = %p, state = %u." 
+ " refcnt=%d\n", cm_node, cm_node->state, + atomic_read(&cm_node->ref_count)); + cleanup_retrans_entry(cm_node); + switch (cm_node->state) { + case NES_CM_STATE_SYN_SENT: + case NES_CM_STATE_MPAREQ_SENT: + nes_debug(NES_DBG_CM, "%s[%u] create abort for cm_node=%p " + "listener=%p state=%d\n", __func__, __LINE__, cm_node, + cm_node->listener, cm_node->state); + switch (cm_node->mpa_frame_rev) { + case IETF_MPA_V2: + cm_node->mpa_frame_rev = IETF_MPA_V1; + /* send a syn and goto syn sent state */ + cm_node->state = NES_CM_STATE_SYN_SENT; + if (send_syn(cm_node, 0, NULL)) { + active_open_err(cm_node, skb, reset); + } + break; + case IETF_MPA_V1: + default: + active_open_err(cm_node, skb, reset); + break; + } + break; + case NES_CM_STATE_MPAREQ_RCVD: + atomic_inc(&cm_node->passive_state); + dev_kfree_skb_any(skb); + break; + case NES_CM_STATE_ESTABLISHED: + case NES_CM_STATE_SYN_RCVD: + case NES_CM_STATE_LISTENING: + nes_debug(NES_DBG_CM, "Bad state %s[%u]\n", __func__, __LINE__); + passive_open_err(cm_node, skb, reset); + break; + case NES_CM_STATE_TSA: + active_open_err(cm_node, skb, reset); + break; + case NES_CM_STATE_CLOSED: + drop_packet(skb); + break; + case NES_CM_STATE_FIN_WAIT2: + case NES_CM_STATE_FIN_WAIT1: + case NES_CM_STATE_LAST_ACK: + cm_node->cm_id->rem_ref(cm_node->cm_id); + case NES_CM_STATE_TIME_WAIT: + cm_node->state = NES_CM_STATE_CLOSED; + rem_ref_cm_node(cm_node->cm_core, cm_node); + drop_packet(skb); + break; + default: + drop_packet(skb); + break; + } +} + + +static void handle_rcv_mpa(struct nes_cm_node *cm_node, struct sk_buff *skb) +{ + int ret = 0; + int datasize = skb->len; + u8 *dataloc = skb->data; + + enum nes_cm_event_type type = NES_CM_EVENT_UNKNOWN; + u32 res_type; + + ret = parse_mpa(cm_node, dataloc, &res_type, datasize); + if (ret) { + nes_debug(NES_DBG_CM, "didn't like MPA Request\n"); + if (cm_node->state == NES_CM_STATE_MPAREQ_SENT) { + nes_debug(NES_DBG_CM, "%s[%u] create abort for " + "cm_node=%p listener=%p state=%d\n", __func__, + __LINE__, cm_node, cm_node->listener, + cm_node->state); + active_open_err(cm_node, skb, 1); + } else { + passive_open_err(cm_node, skb, 1); + } + return; + } + + switch (cm_node->state) { + case NES_CM_STATE_ESTABLISHED: + if (res_type == NES_MPA_REQUEST_REJECT) + /*BIG problem as we are receiving the MPA.. So should + * not be REJECT.. This is Passive Open.. 
We can + * only receive it Reject for Active Open...*/ + WARN_ON(1); + cm_node->state = NES_CM_STATE_MPAREQ_RCVD; + type = NES_CM_EVENT_MPA_REQ; + atomic_set(&cm_node->passive_state, + NES_PASSIVE_STATE_INDICATED); + break; + case NES_CM_STATE_MPAREQ_SENT: + cleanup_retrans_entry(cm_node); + if (res_type == NES_MPA_REQUEST_REJECT) { + type = NES_CM_EVENT_MPA_REJECT; + cm_node->state = NES_CM_STATE_MPAREJ_RCVD; + } else { + type = NES_CM_EVENT_CONNECTED; + cm_node->state = NES_CM_STATE_TSA; + } + + break; + default: + WARN_ON(1); + break; + } + dev_kfree_skb_any(skb); + create_event(cm_node, type); +} + +static void indicate_pkt_err(struct nes_cm_node *cm_node, struct sk_buff *skb) +{ + switch (cm_node->state) { + case NES_CM_STATE_SYN_SENT: + case NES_CM_STATE_MPAREQ_SENT: + nes_debug(NES_DBG_CM, "%s[%u] create abort for cm_node=%p " + "listener=%p state=%d\n", __func__, __LINE__, cm_node, + cm_node->listener, cm_node->state); + active_open_err(cm_node, skb, 1); + break; + case NES_CM_STATE_ESTABLISHED: + case NES_CM_STATE_SYN_RCVD: + passive_open_err(cm_node, skb, 1); + break; + case NES_CM_STATE_TSA: + default: + drop_packet(skb); + } +} + +static int check_syn(struct nes_cm_node *cm_node, struct tcphdr *tcph, + struct sk_buff *skb) +{ + int err; + + err = ((ntohl(tcph->ack_seq) == cm_node->tcp_cntxt.loc_seq_num)) ? 0 : 1; + if (err) + active_open_err(cm_node, skb, 1); + + return err; +} + +static int check_seq(struct nes_cm_node *cm_node, struct tcphdr *tcph, + struct sk_buff *skb) +{ + int err = 0; + u32 seq; + u32 ack_seq; + u32 loc_seq_num = cm_node->tcp_cntxt.loc_seq_num; + u32 rcv_nxt = cm_node->tcp_cntxt.rcv_nxt; + u32 rcv_wnd; + + seq = ntohl(tcph->seq); + ack_seq = ntohl(tcph->ack_seq); + rcv_wnd = cm_node->tcp_cntxt.rcv_wnd; + if (ack_seq != loc_seq_num) + err = 1; + else if (!between(seq, rcv_nxt, (rcv_nxt + rcv_wnd))) + err = 1; + if (err) { + nes_debug(NES_DBG_CM, "%s[%u] create abort for cm_node=%p " + "listener=%p state=%d\n", __func__, __LINE__, cm_node, + cm_node->listener, cm_node->state); + indicate_pkt_err(cm_node, skb); + nes_debug(NES_DBG_CM, "seq ERROR cm_node =%p seq=0x%08X " + "rcv_nxt=0x%08X rcv_wnd=0x%x\n", cm_node, seq, rcv_nxt, + rcv_wnd); + } + return err; +} + +/* + * handle_syn_pkt() is for Passive node. The syn packet is received when a node + * is created with a listener or it may comein as rexmitted packet which in + * that case will be just dropped. 
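+ * In NES_CM_STATE_LISTENING the handler checks the accept backlog, parses
+ * the TCP options carried on the SYN and answers with a SYN-ACK; a SYN seen
+ * in the active open states is treated as an error.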
+ */ +static void handle_syn_pkt(struct nes_cm_node *cm_node, struct sk_buff *skb, + struct tcphdr *tcph) +{ + int ret; + u32 inc_sequence; + int optionsize; + + optionsize = (tcph->doff << 2) - sizeof(struct tcphdr); + skb_trim(skb, 0); + inc_sequence = ntohl(tcph->seq); + + switch (cm_node->state) { + case NES_CM_STATE_SYN_SENT: + case NES_CM_STATE_MPAREQ_SENT: + /* Rcvd syn on active open connection*/ + active_open_err(cm_node, skb, 1); + break; + case NES_CM_STATE_LISTENING: + /* Passive OPEN */ + if (atomic_read(&cm_node->listener->pend_accepts_cnt) > + cm_node->listener->backlog) { + nes_debug(NES_DBG_CM, "drop syn due to backlog " + "pressure \n"); + cm_backlog_drops++; + passive_open_err(cm_node, skb, 0); + break; + } + ret = handle_tcp_options(cm_node, tcph, skb, optionsize, + 1); + if (ret) { + passive_open_err(cm_node, skb, 0); + /* drop pkt */ + break; + } + cm_node->tcp_cntxt.rcv_nxt = inc_sequence + 1; + BUG_ON(cm_node->send_entry); + cm_node->accept_pend = 1; + atomic_inc(&cm_node->listener->pend_accepts_cnt); + + cm_node->state = NES_CM_STATE_SYN_RCVD; + send_syn(cm_node, 1, skb); + break; + case NES_CM_STATE_CLOSED: + cleanup_retrans_entry(cm_node); + add_ref_cm_node(cm_node); + send_reset(cm_node, skb); + break; + case NES_CM_STATE_TSA: + case NES_CM_STATE_ESTABLISHED: + case NES_CM_STATE_FIN_WAIT1: + case NES_CM_STATE_FIN_WAIT2: + case NES_CM_STATE_MPAREQ_RCVD: + case NES_CM_STATE_LAST_ACK: + case NES_CM_STATE_CLOSING: + case NES_CM_STATE_UNKNOWN: + default: + drop_packet(skb); + break; + } +} + +static void handle_synack_pkt(struct nes_cm_node *cm_node, struct sk_buff *skb, + struct tcphdr *tcph) +{ + int ret; + u32 inc_sequence; + int optionsize; + + optionsize = (tcph->doff << 2) - sizeof(struct tcphdr); + skb_trim(skb, 0); + inc_sequence = ntohl(tcph->seq); + switch (cm_node->state) { + case NES_CM_STATE_SYN_SENT: + cleanup_retrans_entry(cm_node); + /* active open */ + if (check_syn(cm_node, tcph, skb)) + return; + cm_node->tcp_cntxt.rem_ack_num = ntohl(tcph->ack_seq); + /* setup options */ + ret = handle_tcp_options(cm_node, tcph, skb, optionsize, 0); + if (ret) { + nes_debug(NES_DBG_CM, "cm_node=%p tcp_options failed\n", + cm_node); + break; + } + cleanup_retrans_entry(cm_node); + cm_node->tcp_cntxt.rcv_nxt = inc_sequence + 1; + send_mpa_request(cm_node, skb); + cm_node->state = NES_CM_STATE_MPAREQ_SENT; + break; + case NES_CM_STATE_MPAREQ_RCVD: + /* passive open, so should not be here */ + passive_open_err(cm_node, skb, 1); + break; + case NES_CM_STATE_LISTENING: + cm_node->tcp_cntxt.loc_seq_num = ntohl(tcph->ack_seq); + cleanup_retrans_entry(cm_node); + cm_node->state = NES_CM_STATE_CLOSED; + send_reset(cm_node, skb); + break; + case NES_CM_STATE_CLOSED: + cm_node->tcp_cntxt.loc_seq_num = ntohl(tcph->ack_seq); + cleanup_retrans_entry(cm_node); + add_ref_cm_node(cm_node); + send_reset(cm_node, skb); + break; + case NES_CM_STATE_ESTABLISHED: + case NES_CM_STATE_FIN_WAIT1: + case NES_CM_STATE_FIN_WAIT2: + case NES_CM_STATE_LAST_ACK: + case NES_CM_STATE_TSA: + case NES_CM_STATE_CLOSING: + case NES_CM_STATE_UNKNOWN: + case NES_CM_STATE_MPAREQ_SENT: + default: + drop_packet(skb); + break; + } +} + +static int handle_ack_pkt(struct nes_cm_node *cm_node, struct sk_buff *skb, + struct tcphdr *tcph) +{ + int datasize = 0; + u32 inc_sequence; + int ret = 0; + int optionsize; + + optionsize = (tcph->doff << 2) - sizeof(struct tcphdr); + + if (check_seq(cm_node, tcph, skb)) + return -EINVAL; + + skb_pull(skb, tcph->doff << 2); + inc_sequence = ntohl(tcph->seq); + datasize = 
skb->len; + switch (cm_node->state) { + case NES_CM_STATE_SYN_RCVD: + /* Passive OPEN */ + cleanup_retrans_entry(cm_node); + ret = handle_tcp_options(cm_node, tcph, skb, optionsize, 1); + if (ret) + break; + cm_node->tcp_cntxt.rem_ack_num = ntohl(tcph->ack_seq); + cm_node->state = NES_CM_STATE_ESTABLISHED; + if (datasize) { + cm_node->tcp_cntxt.rcv_nxt = inc_sequence + datasize; + nes_get_remote_addr(cm_node); + handle_rcv_mpa(cm_node, skb); + } else { /* rcvd ACK only */ + dev_kfree_skb_any(skb); + } + break; + case NES_CM_STATE_ESTABLISHED: + /* Passive OPEN */ + cleanup_retrans_entry(cm_node); + if (datasize) { + cm_node->tcp_cntxt.rcv_nxt = inc_sequence + datasize; + handle_rcv_mpa(cm_node, skb); + } else { + drop_packet(skb); + } + break; + case NES_CM_STATE_MPAREQ_SENT: + cm_node->tcp_cntxt.rem_ack_num = ntohl(tcph->ack_seq); + if (datasize) { + cm_node->tcp_cntxt.rcv_nxt = inc_sequence + datasize; + handle_rcv_mpa(cm_node, skb); + } else { /* Could be just an ack pkt.. */ + dev_kfree_skb_any(skb); + } + break; + case NES_CM_STATE_LISTENING: + cleanup_retrans_entry(cm_node); + cm_node->state = NES_CM_STATE_CLOSED; + send_reset(cm_node, skb); + break; + case NES_CM_STATE_CLOSED: + cleanup_retrans_entry(cm_node); + add_ref_cm_node(cm_node); + send_reset(cm_node, skb); + break; + case NES_CM_STATE_LAST_ACK: + case NES_CM_STATE_CLOSING: + cleanup_retrans_entry(cm_node); + cm_node->state = NES_CM_STATE_CLOSED; + cm_node->cm_id->rem_ref(cm_node->cm_id); + rem_ref_cm_node(cm_node->cm_core, cm_node); + drop_packet(skb); + break; + case NES_CM_STATE_FIN_WAIT1: + cleanup_retrans_entry(cm_node); + drop_packet(skb); + cm_node->state = NES_CM_STATE_FIN_WAIT2; + break; + case NES_CM_STATE_SYN_SENT: + case NES_CM_STATE_FIN_WAIT2: + case NES_CM_STATE_TSA: + case NES_CM_STATE_MPAREQ_RCVD: + case NES_CM_STATE_UNKNOWN: + default: + cleanup_retrans_entry(cm_node); + drop_packet(skb); + break; + } + return ret; +} + + + +static int handle_tcp_options(struct nes_cm_node *cm_node, struct tcphdr *tcph, + struct sk_buff *skb, int optionsize, int passive) +{ + u8 *optionsloc = (u8 *)&tcph[1]; + + if (optionsize) { + if (process_options(cm_node, optionsloc, optionsize, + (u32)tcph->syn)) { + nes_debug(NES_DBG_CM, "%s: Node %p, Sending RESET\n", + __func__, cm_node); + if (passive) + passive_open_err(cm_node, skb, 1); + else + active_open_err(cm_node, skb, 1); + return 1; + } + } + + cm_node->tcp_cntxt.snd_wnd = ntohs(tcph->window) << + cm_node->tcp_cntxt.snd_wscale; + + if (cm_node->tcp_cntxt.snd_wnd > cm_node->tcp_cntxt.max_snd_wnd) + cm_node->tcp_cntxt.max_snd_wnd = cm_node->tcp_cntxt.snd_wnd; + return 0; +} + +/* + * active_open_err() will send reset() if flag set.. + * It will also send ABORT event. + */ +static void active_open_err(struct nes_cm_node *cm_node, struct sk_buff *skb, + int reset) +{ + cleanup_retrans_entry(cm_node); + if (reset) { + nes_debug(NES_DBG_CM, "ERROR active err called for cm_node=%p, " + "state=%d\n", cm_node, cm_node->state); + add_ref_cm_node(cm_node); + send_reset(cm_node, skb); + } else { + dev_kfree_skb_any(skb); + } + + cm_node->state = NES_CM_STATE_CLOSED; + create_event(cm_node, NES_CM_EVENT_ABORTED); +} + +/* + * passive_open_err() will either do a reset() or will free up the skb and + * remove the cm_node. 
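+ * Either way the node is moved to NES_CM_STATE_CLOSED first.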
+ */ +static void passive_open_err(struct nes_cm_node *cm_node, struct sk_buff *skb, + int reset) +{ + cleanup_retrans_entry(cm_node); + cm_node->state = NES_CM_STATE_CLOSED; + if (reset) { + nes_debug(NES_DBG_CM, "passive_open_err sending RST for " + "cm_node=%p state =%d\n", cm_node, cm_node->state); + send_reset(cm_node, skb); + } else { + dev_kfree_skb_any(skb); + rem_ref_cm_node(cm_node->cm_core, cm_node); + } +} + +/* + * free_retrans_entry() routines assumes that the retrans_list_lock has + * been acquired before calling. + */ +static void free_retrans_entry(struct nes_cm_node *cm_node) +{ + struct nes_timer_entry *send_entry; + + send_entry = cm_node->send_entry; + if (send_entry) { + cm_node->send_entry = NULL; + dev_kfree_skb_any(send_entry->skb); + kfree(send_entry); + rem_ref_cm_node(cm_node->cm_core, cm_node); + } +} + +static void cleanup_retrans_entry(struct nes_cm_node *cm_node) +{ + unsigned long flags; + + spin_lock_irqsave(&cm_node->retrans_list_lock, flags); + free_retrans_entry(cm_node); + spin_unlock_irqrestore(&cm_node->retrans_list_lock, flags); +} + +/** + * process_packet + * Returns skb if to be freed, else it will return NULL if already used.. + */ +static void process_packet(struct nes_cm_node *cm_node, struct sk_buff *skb, + struct nes_cm_core *cm_core) +{ + enum nes_tcpip_pkt_type pkt_type = NES_PKT_TYPE_UNKNOWN; + struct tcphdr *tcph = tcp_hdr(skb); + u32 fin_set = 0; + int ret = 0; + + skb_pull(skb, ip_hdr(skb)->ihl << 2); + + nes_debug(NES_DBG_CM, "process_packet: cm_node=%p state =%d syn=%d " + "ack=%d rst=%d fin=%d\n", cm_node, cm_node->state, tcph->syn, + tcph->ack, tcph->rst, tcph->fin); + + if (tcph->rst) { + pkt_type = NES_PKT_TYPE_RST; + } else if (tcph->syn) { + pkt_type = NES_PKT_TYPE_SYN; + if (tcph->ack) + pkt_type = NES_PKT_TYPE_SYNACK; + } else if (tcph->ack) { + pkt_type = NES_PKT_TYPE_ACK; + } + if (tcph->fin) + fin_set = 1; + + switch (pkt_type) { + case NES_PKT_TYPE_SYN: + handle_syn_pkt(cm_node, skb, tcph); + break; + case NES_PKT_TYPE_SYNACK: + handle_synack_pkt(cm_node, skb, tcph); + break; + case NES_PKT_TYPE_ACK: + ret = handle_ack_pkt(cm_node, skb, tcph); + if (fin_set && !ret) + handle_fin_pkt(cm_node); + break; + case NES_PKT_TYPE_RST: + handle_rst_pkt(cm_node, skb, tcph); + break; + default: + if ((fin_set) && (!check_seq(cm_node, tcph, skb))) + handle_fin_pkt(cm_node); + drop_packet(skb); + break; + } +} + +/** + * mini_cm_listen - create a listen node with params + */ +static struct nes_cm_listener *mini_cm_listen(struct nes_cm_core *cm_core, + struct nes_vnic *nesvnic, struct nes_cm_info *cm_info) +{ + struct nes_cm_listener *listener; + struct iwpm_dev_data pm_reg_msg; + struct iwpm_sa_data pm_msg; + unsigned long flags; + int iwpm_err = 0; + + nes_debug(NES_DBG_CM, "Search for 0x%08x : 0x%04x\n", + cm_info->loc_addr, cm_info->loc_port); + + /* cannot have multiple matching listeners */ + listener = find_listener(cm_core, cm_info->loc_addr, cm_info->loc_port, + NES_CM_LISTENER_EITHER_STATE, 1); + + if (listener && listener->listener_state == NES_CM_LISTENER_ACTIVE_STATE) { + /* find automatically incs ref count ??? 
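(it does: find_listener() takes a reference on each match, hence the drop below)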
*/ + atomic_dec(&listener->ref_count); + nes_debug(NES_DBG_CM, "Not creating listener since it already exists\n"); + return NULL; + } + + if (!listener) { + nes_form_reg_msg(nesvnic, &pm_reg_msg); + iwpm_err = iwpm_register_pid(&pm_reg_msg, RDMA_NL_NES); + if (iwpm_err) { + nes_debug(NES_DBG_NLMSG, + "Port Mapper reg pid fail (err = %d).\n", iwpm_err); + } + if (iwpm_valid_pid() && !iwpm_err) { + nes_form_pm_msg(cm_info, &pm_msg); + iwpm_err = iwpm_add_mapping(&pm_msg, RDMA_NL_NES); + if (iwpm_err) + nes_debug(NES_DBG_NLMSG, + "Port Mapper query fail (err = %d).\n", iwpm_err); + else + nes_record_pm_msg(cm_info, &pm_msg); + } + + /* create a CM listen node (1/2 node to compare incoming traffic to) */ + listener = kzalloc(sizeof(*listener), GFP_ATOMIC); + if (!listener) { + nes_debug(NES_DBG_CM, "Not creating listener memory allocation failed\n"); + return NULL; + } + + listener->loc_addr = cm_info->loc_addr; + listener->loc_port = cm_info->loc_port; + listener->mapped_loc_addr = cm_info->mapped_loc_addr; + listener->mapped_loc_port = cm_info->mapped_loc_port; + listener->reused_node = 0; + + atomic_set(&listener->ref_count, 1); + } + /* pasive case */ + /* find already inc'ed the ref count */ + else { + listener->reused_node = 1; + } + + listener->cm_id = cm_info->cm_id; + atomic_set(&listener->pend_accepts_cnt, 0); + listener->cm_core = cm_core; + listener->nesvnic = nesvnic; + atomic_inc(&cm_core->node_cnt); + + listener->conn_type = cm_info->conn_type; + listener->backlog = cm_info->backlog; + listener->listener_state = NES_CM_LISTENER_ACTIVE_STATE; + + if (!listener->reused_node) { + spin_lock_irqsave(&cm_core->listen_list_lock, flags); + list_add(&listener->list, &cm_core->listen_list.list); + spin_unlock_irqrestore(&cm_core->listen_list_lock, flags); + atomic_inc(&cm_core->listen_node_cnt); + } + + nes_debug(NES_DBG_CM, "Api - listen(): addr=0x%08X, port=0x%04x," + " listener = %p, backlog = %d, cm_id = %p.\n", + cm_info->loc_addr, cm_info->loc_port, + listener, listener->backlog, listener->cm_id); + + return listener; +} + + +/** + * mini_cm_connect - make a connection node with params + */ +static struct nes_cm_node *mini_cm_connect(struct nes_cm_core *cm_core, + struct nes_vnic *nesvnic, u16 private_data_len, + void *private_data, struct nes_cm_info *cm_info) +{ + int ret = 0; + struct nes_cm_node *cm_node; + struct nes_cm_listener *loopbackremotelistener; + struct nes_cm_node *loopbackremotenode; + struct nes_cm_info loopback_cm_info; + u8 *start_buff; + + /* create a CM connection node */ + cm_node = make_cm_node(cm_core, nesvnic, cm_info, NULL); + if (!cm_node) + return NULL; + + /* set our node side to client (active) side */ + cm_node->tcp_cntxt.client = 1; + cm_node->tcp_cntxt.rcv_wscale = NES_CM_DEFAULT_RCV_WND_SCALE; + + if (cm_info->loc_addr == cm_info->rem_addr) { + loopbackremotelistener = find_listener(cm_core, + cm_node->mapped_loc_addr, cm_node->mapped_rem_port, + NES_CM_LISTENER_ACTIVE_STATE, 0); + if (loopbackremotelistener == NULL) { + create_event(cm_node, NES_CM_EVENT_ABORTED); + } else { + loopback_cm_info = *cm_info; + loopback_cm_info.loc_port = cm_info->rem_port; + loopback_cm_info.rem_port = cm_info->loc_port; + loopback_cm_info.mapped_loc_port = + cm_info->mapped_rem_port; + loopback_cm_info.mapped_rem_port = + cm_info->mapped_loc_port; + loopback_cm_info.cm_id = loopbackremotelistener->cm_id; + loopbackremotenode = make_cm_node(cm_core, nesvnic, + &loopback_cm_info, loopbackremotelistener); + if (!loopbackremotenode) { + rem_ref_cm_node(cm_node->cm_core, 
cm_node); + return NULL; + } + atomic_inc(&cm_loopbacks); + loopbackremotenode->loopbackpartner = cm_node; + loopbackremotenode->tcp_cntxt.rcv_wscale = + NES_CM_DEFAULT_RCV_WND_SCALE; + cm_node->loopbackpartner = loopbackremotenode; + memcpy(loopbackremotenode->mpa_frame_buf, private_data, + private_data_len); + loopbackremotenode->mpa_frame_size = private_data_len; + + /* we are done handling this state. */ + /* set node to a TSA state */ + cm_node->state = NES_CM_STATE_TSA; + cm_node->tcp_cntxt.rcv_nxt = + loopbackremotenode->tcp_cntxt.loc_seq_num; + loopbackremotenode->tcp_cntxt.rcv_nxt = + cm_node->tcp_cntxt.loc_seq_num; + cm_node->tcp_cntxt.max_snd_wnd = + loopbackremotenode->tcp_cntxt.rcv_wnd; + loopbackremotenode->tcp_cntxt.max_snd_wnd = + cm_node->tcp_cntxt.rcv_wnd; + cm_node->tcp_cntxt.snd_wnd = + loopbackremotenode->tcp_cntxt.rcv_wnd; + loopbackremotenode->tcp_cntxt.snd_wnd = + cm_node->tcp_cntxt.rcv_wnd; + cm_node->tcp_cntxt.snd_wscale = + loopbackremotenode->tcp_cntxt.rcv_wscale; + loopbackremotenode->tcp_cntxt.snd_wscale = + cm_node->tcp_cntxt.rcv_wscale; + loopbackremotenode->state = NES_CM_STATE_MPAREQ_RCVD; + create_event(loopbackremotenode, NES_CM_EVENT_MPA_REQ); + } + return cm_node; + } + + start_buff = &cm_node->mpa_frame_buf[0] + sizeof(struct ietf_mpa_v2); + cm_node->mpa_frame_size = private_data_len; + + memcpy(start_buff, private_data, private_data_len); + + /* send a syn and goto syn sent state */ + cm_node->state = NES_CM_STATE_SYN_SENT; + ret = send_syn(cm_node, 0, NULL); + + if (ret) { + /* error in sending the syn free up the cm_node struct */ + nes_debug(NES_DBG_CM, "Api - connect() FAILED: dest " + "addr=0x%08X, port=0x%04x, cm_node=%p, cm_id = %p.\n", + cm_node->rem_addr, cm_node->rem_port, cm_node, + cm_node->cm_id); + rem_ref_cm_node(cm_node->cm_core, cm_node); + cm_node = NULL; + } + + if (cm_node) { + nes_debug(NES_DBG_CM, "Api - connect(): dest addr=0x%08X," + "port=0x%04x, cm_node=%p, cm_id = %p.\n", + cm_node->rem_addr, cm_node->rem_port, cm_node, + cm_node->cm_id); + } + + return cm_node; +} + + +/** + * mini_cm_accept - accept a connection + * This function is never called + */ +static int mini_cm_accept(struct nes_cm_core *cm_core, struct nes_cm_node *cm_node) +{ + return 0; +} + + +/** + * mini_cm_reject - reject and teardown a connection + */ +static int mini_cm_reject(struct nes_cm_core *cm_core, struct nes_cm_node *cm_node) +{ + int ret = 0; + int err = 0; + int passive_state; + struct nes_cm_event event; + struct iw_cm_id *cm_id = cm_node->cm_id; + struct nes_cm_node *loopback = cm_node->loopbackpartner; + + nes_debug(NES_DBG_CM, "%s cm_node=%p type=%d state=%d\n", + __func__, cm_node, cm_node->tcp_cntxt.client, cm_node->state); + + if (cm_node->tcp_cntxt.client) + return ret; + cleanup_retrans_entry(cm_node); + + if (!loopback) { + passive_state = atomic_add_return(1, &cm_node->passive_state); + if (passive_state == NES_SEND_RESET_EVENT) { + cm_node->state = NES_CM_STATE_CLOSED; + rem_ref_cm_node(cm_core, cm_node); + } else { + if (cm_node->state == NES_CM_STATE_LISTENER_DESTROYED) { + rem_ref_cm_node(cm_core, cm_node); + } else { + ret = send_mpa_reject(cm_node); + if (ret) { + cm_node->state = NES_CM_STATE_CLOSED; + err = send_reset(cm_node, NULL); + if (err) + WARN_ON(1); + } else { + cm_id->add_ref(cm_id); + } + } + } + } else { + cm_node->cm_id = NULL; + if (cm_node->state == NES_CM_STATE_LISTENER_DESTROYED) { + rem_ref_cm_node(cm_core, cm_node); + rem_ref_cm_node(cm_core, loopback); + } else { + event.cm_node = loopback; + 
event.cm_info.rem_addr = loopback->rem_addr; + event.cm_info.loc_addr = loopback->loc_addr; + event.cm_info.rem_port = loopback->rem_port; + event.cm_info.loc_port = loopback->loc_port; + event.cm_info.cm_id = loopback->cm_id; + cm_event_mpa_reject(&event); + rem_ref_cm_node(cm_core, cm_node); + loopback->state = NES_CM_STATE_CLOSING; + + cm_id = loopback->cm_id; + rem_ref_cm_node(cm_core, loopback); + cm_id->rem_ref(cm_id); + } + } + + return ret; +} + + +/** + * mini_cm_close + */ +static int mini_cm_close(struct nes_cm_core *cm_core, struct nes_cm_node *cm_node) +{ + int ret = 0; + + if (!cm_core || !cm_node) + return -EINVAL; + + switch (cm_node->state) { + case NES_CM_STATE_SYN_RCVD: + case NES_CM_STATE_SYN_SENT: + case NES_CM_STATE_ONE_SIDE_ESTABLISHED: + case NES_CM_STATE_ESTABLISHED: + case NES_CM_STATE_ACCEPTING: + case NES_CM_STATE_MPAREQ_SENT: + case NES_CM_STATE_MPAREQ_RCVD: + cleanup_retrans_entry(cm_node); + send_reset(cm_node, NULL); + break; + case NES_CM_STATE_CLOSE_WAIT: + cm_node->state = NES_CM_STATE_LAST_ACK; + send_fin(cm_node, NULL); + break; + case NES_CM_STATE_FIN_WAIT1: + case NES_CM_STATE_FIN_WAIT2: + case NES_CM_STATE_LAST_ACK: + case NES_CM_STATE_TIME_WAIT: + case NES_CM_STATE_CLOSING: + ret = -1; + break; + case NES_CM_STATE_LISTENING: + cleanup_retrans_entry(cm_node); + send_reset(cm_node, NULL); + break; + case NES_CM_STATE_MPAREJ_RCVD: + case NES_CM_STATE_UNKNOWN: + case NES_CM_STATE_INITED: + case NES_CM_STATE_CLOSED: + case NES_CM_STATE_LISTENER_DESTROYED: + ret = rem_ref_cm_node(cm_core, cm_node); + break; + case NES_CM_STATE_TSA: + if (cm_node->send_entry) + printk(KERN_ERR "ERROR Close got called from STATE_TSA " + "send_entry=%p\n", cm_node->send_entry); + ret = rem_ref_cm_node(cm_core, cm_node); + break; + } + return ret; +} + + +/** + * recv_pkt - recv an ETHERNET packet, and process it through CM + * node state machine + */ +static int mini_cm_recv_pkt(struct nes_cm_core *cm_core, + struct nes_vnic *nesvnic, struct sk_buff *skb) +{ + struct nes_cm_node *cm_node = NULL; + struct nes_cm_listener *listener = NULL; + struct iphdr *iph; + struct tcphdr *tcph; + struct nes_cm_info nfo; + int skb_handled = 1; + __be32 tmp_daddr, tmp_saddr; + + if (!skb) + return 0; + if (skb->len < sizeof(struct iphdr) + sizeof(struct tcphdr)) + return 0; + + iph = (struct iphdr *)skb->data; + tcph = (struct tcphdr *)(skb->data + sizeof(struct iphdr)); + + nfo.loc_addr = ntohl(iph->daddr); + nfo.loc_port = ntohs(tcph->dest); + nfo.rem_addr = ntohl(iph->saddr); + nfo.rem_port = ntohs(tcph->source); + + /* If port mapper is available these should be mapped address info */ + nfo.mapped_loc_addr = ntohl(iph->daddr); + nfo.mapped_loc_port = ntohs(tcph->dest); + nfo.mapped_rem_addr = ntohl(iph->saddr); + nfo.mapped_rem_port = ntohs(tcph->source); + + tmp_daddr = cpu_to_be32(iph->daddr); + tmp_saddr = cpu_to_be32(iph->saddr); + + nes_debug(NES_DBG_CM, "Received packet: dest=%pI4:0x%04X src=%pI4:0x%04X\n", + &tmp_daddr, tcph->dest, &tmp_saddr, tcph->source); + + do { + cm_node = find_node(cm_core, + nfo.mapped_rem_port, nfo.mapped_rem_addr, + nfo.mapped_loc_port, nfo.mapped_loc_addr); + + if (!cm_node) { + /* Only type of packet accepted are for */ + /* the PASSIVE open (syn only) */ + if ((!tcph->syn) || (tcph->ack)) { + skb_handled = 0; + break; + } + listener = find_listener(cm_core, nfo.mapped_loc_addr, + nfo.mapped_loc_port, + NES_CM_LISTENER_ACTIVE_STATE, 0); + if (!listener) { + nfo.cm_id = NULL; + nfo.conn_type = 0; + nes_debug(NES_DBG_CM, "Unable to find listener for the 
pkt\n"); + skb_handled = 0; + break; + } + nfo.cm_id = listener->cm_id; + nfo.conn_type = listener->conn_type; + cm_node = make_cm_node(cm_core, nesvnic, &nfo, + listener); + if (!cm_node) { + nes_debug(NES_DBG_CM, "Unable to allocate " + "node\n"); + cm_packets_dropped++; + atomic_dec(&listener->ref_count); + dev_kfree_skb_any(skb); + break; + } + if (!tcph->rst && !tcph->fin) { + cm_node->state = NES_CM_STATE_LISTENING; + } else { + cm_packets_dropped++; + rem_ref_cm_node(cm_core, cm_node); + dev_kfree_skb_any(skb); + break; + } + add_ref_cm_node(cm_node); + } else if (cm_node->state == NES_CM_STATE_TSA) { + if (cm_node->nesqp->pau_mode) + nes_queue_mgt_skbs(skb, nesvnic, cm_node->nesqp); + else { + rem_ref_cm_node(cm_core, cm_node); + atomic_inc(&cm_accel_dropped_pkts); + dev_kfree_skb_any(skb); + } + break; + } + skb_reset_network_header(skb); + skb_set_transport_header(skb, sizeof(*tcph)); + skb->len = ntohs(iph->tot_len); + process_packet(cm_node, skb, cm_core); + rem_ref_cm_node(cm_core, cm_node); + } while (0); + return skb_handled; +} + + +/** + * nes_cm_alloc_core - allocate a top level instance of a cm core + */ +static struct nes_cm_core *nes_cm_alloc_core(void) +{ + struct nes_cm_core *cm_core; + + /* setup the CM core */ + /* alloc top level core control structure */ + cm_core = kzalloc(sizeof(*cm_core), GFP_KERNEL); + if (!cm_core) + return NULL; + + INIT_LIST_HEAD(&cm_core->connected_nodes); + init_timer(&cm_core->tcp_timer); + cm_core->tcp_timer.function = nes_cm_timer_tick; + + cm_core->mtu = NES_CM_DEFAULT_MTU; + cm_core->state = NES_CM_STATE_INITED; + cm_core->free_tx_pkt_max = NES_CM_DEFAULT_FREE_PKTS; + + atomic_set(&cm_core->events_posted, 0); + + cm_core->api = &nes_cm_api; + + spin_lock_init(&cm_core->ht_lock); + spin_lock_init(&cm_core->listen_list_lock); + + INIT_LIST_HEAD(&cm_core->listen_list.list); + + nes_debug(NES_DBG_CM, "Init CM Core completed -- cm_core=%p\n", cm_core); + + nes_debug(NES_DBG_CM, "Enable QUEUE EVENTS\n"); + cm_core->event_wq = create_singlethread_workqueue("nesewq"); + cm_core->post_event = nes_cm_post_event; + nes_debug(NES_DBG_CM, "Enable QUEUE DISCONNECTS\n"); + cm_core->disconn_wq = create_singlethread_workqueue("nesdwq"); + + print_core(cm_core); + return cm_core; +} + + +/** + * mini_cm_dealloc_core - deallocate a top level instance of a cm core + */ +static int mini_cm_dealloc_core(struct nes_cm_core *cm_core) +{ + nes_debug(NES_DBG_CM, "De-Alloc CM Core (%p)\n", cm_core); + + if (!cm_core) + return -EINVAL; + + barrier(); + + if (timer_pending(&cm_core->tcp_timer)) + del_timer(&cm_core->tcp_timer); + + destroy_workqueue(cm_core->event_wq); + destroy_workqueue(cm_core->disconn_wq); + nes_debug(NES_DBG_CM, "\n"); + kfree(cm_core); + + return 0; +} + + +/** + * mini_cm_get + */ +static int mini_cm_get(struct nes_cm_core *cm_core) +{ + return cm_core->state; +} + + +/** + * mini_cm_set + */ +static int mini_cm_set(struct nes_cm_core *cm_core, u32 type, u32 value) +{ + int ret = 0; + + switch (type) { + case NES_CM_SET_PKT_SIZE: + cm_core->mtu = value; + break; + case NES_CM_SET_FREE_PKT_Q_SIZE: + cm_core->free_tx_pkt_max = value; + break; + default: + /* unknown set option */ + ret = -EINVAL; + } + + return ret; +} + + +/** + * nes_cm_init_tsa_conn setup HW; MPA frames must be + * successfully exchanged when this is called + */ +static int nes_cm_init_tsa_conn(struct nes_qp *nesqp, struct nes_cm_node *cm_node) +{ + int ret = 0; + + if (!nesqp) + return -EINVAL; + + nesqp->nesqp_context->misc |= cpu_to_le32(NES_QPCONTEXT_MISC_IPV4 | + 
NES_QPCONTEXT_MISC_NO_NAGLE | NES_QPCONTEXT_MISC_DO_NOT_FRAG | + NES_QPCONTEXT_MISC_DROS); + + if (cm_node->tcp_cntxt.snd_wscale || cm_node->tcp_cntxt.rcv_wscale) + nesqp->nesqp_context->misc |= cpu_to_le32(NES_QPCONTEXT_MISC_WSCALE); + + nesqp->nesqp_context->misc2 |= cpu_to_le32(64 << NES_QPCONTEXT_MISC2_TTL_SHIFT); + + nesqp->nesqp_context->mss |= cpu_to_le32(((u32)cm_node->tcp_cntxt.mss) << 16); + + nesqp->nesqp_context->tcp_state_flow_label |= cpu_to_le32( + (u32)NES_QPCONTEXT_TCPSTATE_EST << NES_QPCONTEXT_TCPFLOW_TCP_STATE_SHIFT); + + nesqp->nesqp_context->pd_index_wscale |= cpu_to_le32( + (cm_node->tcp_cntxt.snd_wscale << NES_QPCONTEXT_PDWSCALE_SND_WSCALE_SHIFT) & + NES_QPCONTEXT_PDWSCALE_SND_WSCALE_MASK); + + nesqp->nesqp_context->pd_index_wscale |= cpu_to_le32( + (cm_node->tcp_cntxt.rcv_wscale << NES_QPCONTEXT_PDWSCALE_RCV_WSCALE_SHIFT) & + NES_QPCONTEXT_PDWSCALE_RCV_WSCALE_MASK); + + nesqp->nesqp_context->keepalive = cpu_to_le32(0x80); + nesqp->nesqp_context->ts_recent = 0; + nesqp->nesqp_context->ts_age = 0; + nesqp->nesqp_context->snd_nxt = cpu_to_le32(cm_node->tcp_cntxt.loc_seq_num); + nesqp->nesqp_context->snd_wnd = cpu_to_le32(cm_node->tcp_cntxt.snd_wnd); + nesqp->nesqp_context->rcv_nxt = cpu_to_le32(cm_node->tcp_cntxt.rcv_nxt); + nesqp->nesqp_context->rcv_wnd = cpu_to_le32(cm_node->tcp_cntxt.rcv_wnd << + cm_node->tcp_cntxt.rcv_wscale); + nesqp->nesqp_context->snd_max = cpu_to_le32(cm_node->tcp_cntxt.loc_seq_num); + nesqp->nesqp_context->snd_una = cpu_to_le32(cm_node->tcp_cntxt.loc_seq_num); + nesqp->nesqp_context->srtt = 0; + nesqp->nesqp_context->rttvar = cpu_to_le32(0x6); + nesqp->nesqp_context->ssthresh = cpu_to_le32(0x3FFFC000); + nesqp->nesqp_context->cwnd = cpu_to_le32(2 * cm_node->tcp_cntxt.mss); + nesqp->nesqp_context->snd_wl1 = cpu_to_le32(cm_node->tcp_cntxt.rcv_nxt); + nesqp->nesqp_context->snd_wl2 = cpu_to_le32(cm_node->tcp_cntxt.loc_seq_num); + nesqp->nesqp_context->max_snd_wnd = cpu_to_le32(cm_node->tcp_cntxt.max_snd_wnd); + + nes_debug(NES_DBG_CM, "QP%u: rcv_nxt = 0x%08X, snd_nxt = 0x%08X," + " Setting MSS to %u, PDWscale = 0x%08X, rcv_wnd = %u, context misc = 0x%08X.\n", + nesqp->hwqp.qp_id, le32_to_cpu(nesqp->nesqp_context->rcv_nxt), + le32_to_cpu(nesqp->nesqp_context->snd_nxt), + cm_node->tcp_cntxt.mss, le32_to_cpu(nesqp->nesqp_context->pd_index_wscale), + le32_to_cpu(nesqp->nesqp_context->rcv_wnd), + le32_to_cpu(nesqp->nesqp_context->misc)); + nes_debug(NES_DBG_CM, " snd_wnd = 0x%08X.\n", le32_to_cpu(nesqp->nesqp_context->snd_wnd)); + nes_debug(NES_DBG_CM, " snd_cwnd = 0x%08X.\n", le32_to_cpu(nesqp->nesqp_context->cwnd)); + nes_debug(NES_DBG_CM, " max_swnd = 0x%08X.\n", le32_to_cpu(nesqp->nesqp_context->max_snd_wnd)); + + nes_debug(NES_DBG_CM, "Change cm_node state to TSA\n"); + cm_node->state = NES_CM_STATE_TSA; + + return ret; +} + + +/** + * nes_cm_disconn + */ +int nes_cm_disconn(struct nes_qp *nesqp) +{ + struct disconn_work *work; + + work = kzalloc(sizeof *work, GFP_ATOMIC); + if (!work) + return -ENOMEM; /* Timer will clean up */ + + nes_add_ref(&nesqp->ibqp); + work->nesqp = nesqp; + INIT_WORK(&work->work, nes_disconnect_worker); + queue_work(g_cm_core->disconn_wq, &work->work); + return 0; +} + + +/** + * nes_disconnect_worker + */ +static void nes_disconnect_worker(struct work_struct *work) +{ + struct disconn_work *dwork = container_of(work, struct disconn_work, work); + struct nes_qp *nesqp = dwork->nesqp; + + kfree(dwork); + nes_debug(NES_DBG_CM, "processing AEQE id 0x%04X for QP%u.\n", + nesqp->last_aeq, nesqp->hwqp.qp_id); + 
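+ /* do the actual disconnect in process context, then drop the QP reference taken in nes_cm_disconn() */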
nes_cm_disconn_true(nesqp); + nes_rem_ref(&nesqp->ibqp); +} + + +/** + * nes_cm_disconn_true + */ +static int nes_cm_disconn_true(struct nes_qp *nesqp) +{ + unsigned long flags; + int ret = 0; + struct iw_cm_id *cm_id; + struct iw_cm_event cm_event; + struct nes_vnic *nesvnic; + u16 last_ae; + u8 original_hw_tcp_state; + u8 original_ibqp_state; + int disconn_status = 0; + int issue_disconn = 0; + int issue_close = 0; + int issue_flush = 0; + u32 flush_q = NES_CQP_FLUSH_RQ; + struct ib_event ibevent; + + if (!nesqp) { + nes_debug(NES_DBG_CM, "disconnect_worker nesqp is NULL\n"); + return -1; + } + + spin_lock_irqsave(&nesqp->lock, flags); + cm_id = nesqp->cm_id; + /* make sure we havent already closed this connection */ + if (!cm_id) { + nes_debug(NES_DBG_CM, "QP%u disconnect_worker cmid is NULL\n", + nesqp->hwqp.qp_id); + spin_unlock_irqrestore(&nesqp->lock, flags); + return -1; + } + + nesvnic = to_nesvnic(nesqp->ibqp.device); + nes_debug(NES_DBG_CM, "Disconnecting QP%u\n", nesqp->hwqp.qp_id); + + original_hw_tcp_state = nesqp->hw_tcp_state; + original_ibqp_state = nesqp->ibqp_state; + last_ae = nesqp->last_aeq; + + if (nesqp->term_flags) { + issue_disconn = 1; + issue_close = 1; + nesqp->cm_id = NULL; + del_timer(&nesqp->terminate_timer); + if (nesqp->flush_issued == 0) { + nesqp->flush_issued = 1; + issue_flush = 1; + } + } else if ((original_hw_tcp_state == NES_AEQE_TCP_STATE_CLOSE_WAIT) || + ((original_ibqp_state == IB_QPS_RTS) && + (last_ae == NES_AEQE_AEID_LLP_CONNECTION_RESET))) { + issue_disconn = 1; + if (last_ae == NES_AEQE_AEID_LLP_CONNECTION_RESET) + disconn_status = -ECONNRESET; + } + + if (((original_hw_tcp_state == NES_AEQE_TCP_STATE_CLOSED) || + (original_hw_tcp_state == NES_AEQE_TCP_STATE_TIME_WAIT) || + (last_ae == NES_AEQE_AEID_RDMAP_ROE_BAD_LLP_CLOSE) || + (last_ae == NES_AEQE_AEID_LLP_CONNECTION_RESET))) { + issue_close = 1; + nesqp->cm_id = NULL; + if (nesqp->flush_issued == 0) { + nesqp->flush_issued = 1; + issue_flush = 1; + } + } + + spin_unlock_irqrestore(&nesqp->lock, flags); + + if ((issue_flush) && (nesqp->destroyed == 0)) { + /* Flush the queue(s) */ + if (nesqp->hw_iwarp_state >= NES_AEQE_IWARP_STATE_TERMINATE) + flush_q |= NES_CQP_FLUSH_SQ; + flush_wqes(nesvnic->nesdev, nesqp, flush_q, 1); + + if (nesqp->term_flags) { + ibevent.device = nesqp->ibqp.device; + ibevent.event = nesqp->terminate_eventtype; + ibevent.element.qp = &nesqp->ibqp; + if (nesqp->ibqp.event_handler) + nesqp->ibqp.event_handler(&ibevent, nesqp->ibqp.qp_context); + } + } + + if ((cm_id) && (cm_id->event_handler)) { + if (issue_disconn) { + atomic_inc(&cm_disconnects); + cm_event.event = IW_CM_EVENT_DISCONNECT; + cm_event.status = disconn_status; + cm_event.local_addr = cm_id->local_addr; + cm_event.remote_addr = cm_id->remote_addr; + cm_event.private_data = NULL; + cm_event.private_data_len = 0; + + nes_debug(NES_DBG_CM, "Generating a CM Disconnect Event" + " for QP%u, SQ Head = %u, SQ Tail = %u. 
" + "cm_id = %p, refcount = %u.\n", + nesqp->hwqp.qp_id, nesqp->hwqp.sq_head, + nesqp->hwqp.sq_tail, cm_id, + atomic_read(&nesqp->refcount)); + + ret = cm_id->event_handler(cm_id, &cm_event); + if (ret) + nes_debug(NES_DBG_CM, "OFA CM event_handler " + "returned, ret=%d\n", ret); + } + + if (issue_close) { + atomic_inc(&cm_closes); + nes_disconnect(nesqp, 1); + + cm_id->provider_data = nesqp; + /* Send up the close complete event */ + cm_event.event = IW_CM_EVENT_CLOSE; + cm_event.status = 0; + cm_event.provider_data = cm_id->provider_data; + cm_event.local_addr = cm_id->local_addr; + cm_event.remote_addr = cm_id->remote_addr; + cm_event.private_data = NULL; + cm_event.private_data_len = 0; + + ret = cm_id->event_handler(cm_id, &cm_event); + if (ret) + nes_debug(NES_DBG_CM, "OFA CM event_handler returned, ret=%d\n", ret); + + cm_id->rem_ref(cm_id); + } + } + + return 0; +} + + +/** + * nes_disconnect + */ +static int nes_disconnect(struct nes_qp *nesqp, int abrupt) +{ + int ret = 0; + struct nes_vnic *nesvnic; + struct nes_device *nesdev; + struct nes_ib_device *nesibdev; + + nesvnic = to_nesvnic(nesqp->ibqp.device); + if (!nesvnic) + return -EINVAL; + + nesdev = nesvnic->nesdev; + nesibdev = nesvnic->nesibdev; + + nes_debug(NES_DBG_CM, "netdev refcnt = %u.\n", + netdev_refcnt_read(nesvnic->netdev)); + + if (nesqp->active_conn) { + + /* indicate this connection is NOT active */ + nesqp->active_conn = 0; + } else { + /* Need to free the Last Streaming Mode Message */ + if (nesqp->ietf_frame) { + if (nesqp->lsmm_mr) + nesibdev->ibdev.dereg_mr(nesqp->lsmm_mr); + pci_free_consistent(nesdev->pcidev, + nesqp->private_data_len + nesqp->ietf_frame_size, + nesqp->ietf_frame, nesqp->ietf_frame_pbase); + } + } + + /* close the CM node down if it is still active */ + if (nesqp->cm_node) { + nes_debug(NES_DBG_CM, "Call close API\n"); + + g_cm_core->api->close(g_cm_core, nesqp->cm_node); + } + + return ret; +} + + +/** + * nes_accept + */ +int nes_accept(struct iw_cm_id *cm_id, struct iw_cm_conn_param *conn_param) +{ + u64 u64temp; + struct ib_qp *ibqp; + struct nes_qp *nesqp; + struct nes_vnic *nesvnic; + struct nes_device *nesdev; + struct nes_cm_node *cm_node; + struct nes_adapter *adapter; + struct ib_qp_attr attr; + struct iw_cm_event cm_event; + struct nes_hw_qp_wqe *wqe; + struct nes_v4_quad nes_quad; + u32 crc_value; + int ret; + int passive_state; + struct nes_ib_device *nesibdev; + struct ib_mr *ibmr = NULL; + struct ib_phys_buf ibphysbuf; + struct nes_pd *nespd; + u64 tagged_offset; + u8 mpa_frame_offset = 0; + struct ietf_mpa_v2 *mpa_v2_frame; + u8 start_addr = 0; + u8 *start_ptr = &start_addr; + u8 **start_buff = &start_ptr; + u16 buff_len = 0; + struct sockaddr_in *laddr = (struct sockaddr_in *)&cm_id->local_addr; + struct sockaddr_in *raddr = (struct sockaddr_in *)&cm_id->remote_addr; + + ibqp = nes_get_qp(cm_id->device, conn_param->qpn); + if (!ibqp) + return -EINVAL; + + /* get all our handles */ + nesqp = to_nesqp(ibqp); + nesvnic = to_nesvnic(nesqp->ibqp.device); + nesdev = nesvnic->nesdev; + adapter = nesdev->nesadapter; + + cm_node = (struct nes_cm_node *)cm_id->provider_data; + nes_debug(NES_DBG_CM, "nes_accept: cm_node= %p nesvnic=%p, netdev=%p," + "%s\n", cm_node, nesvnic, nesvnic->netdev, + nesvnic->netdev->name); + + if (NES_CM_STATE_LISTENER_DESTROYED == cm_node->state) { + if (cm_node->loopbackpartner) + rem_ref_cm_node(cm_node->cm_core, cm_node->loopbackpartner); + rem_ref_cm_node(cm_node->cm_core, cm_node); + return -EINVAL; + } + + passive_state = atomic_add_return(1, 
&cm_node->passive_state); + if (passive_state == NES_SEND_RESET_EVENT) { + rem_ref_cm_node(cm_node->cm_core, cm_node); + return -ECONNRESET; + } + /* associate the node with the QP */ + nesqp->cm_node = (void *)cm_node; + cm_node->nesqp = nesqp; + + + nes_debug(NES_DBG_CM, "QP%u, cm_node=%p, jiffies = %lu listener = %p\n", + nesqp->hwqp.qp_id, cm_node, jiffies, cm_node->listener); + atomic_inc(&cm_accepts); + + nes_debug(NES_DBG_CM, "netdev refcnt = %u.\n", + netdev_refcnt_read(nesvnic->netdev)); + + nesqp->ietf_frame_size = sizeof(struct ietf_mpa_v2); + /* allocate the ietf frame and space for private data */ + nesqp->ietf_frame = pci_alloc_consistent(nesdev->pcidev, + nesqp->ietf_frame_size + conn_param->private_data_len, + &nesqp->ietf_frame_pbase); + + if (!nesqp->ietf_frame) { + nes_debug(NES_DBG_CM, "Unable to allocate memory for private data\n"); + return -ENOMEM; + } + mpa_v2_frame = (struct ietf_mpa_v2 *)nesqp->ietf_frame; + + if (cm_node->mpa_frame_rev == IETF_MPA_V1) + mpa_frame_offset = 4; + + if (cm_node->mpa_frame_rev == IETF_MPA_V1 || + cm_node->mpav2_ird_ord == IETF_NO_IRD_ORD) { + record_ird_ord(cm_node, (u16)conn_param->ird, (u16)conn_param->ord); + } + + memcpy(mpa_v2_frame->priv_data, conn_param->private_data, + conn_param->private_data_len); + + cm_build_mpa_frame(cm_node, start_buff, &buff_len, nesqp->ietf_frame, MPA_KEY_REPLY); + nesqp->private_data_len = conn_param->private_data_len; + + /* setup our first outgoing iWarp send WQE (the IETF frame response) */ + wqe = &nesqp->hwqp.sq_vbase[0]; + + if (raddr->sin_addr.s_addr != laddr->sin_addr.s_addr) { + u64temp = (unsigned long)nesqp; + nesibdev = nesvnic->nesibdev; + nespd = nesqp->nespd; + ibphysbuf.addr = nesqp->ietf_frame_pbase + mpa_frame_offset; + ibphysbuf.size = buff_len; + tagged_offset = (u64)(unsigned long)*start_buff; + ibmr = nesibdev->ibdev.reg_phys_mr((struct ib_pd *)nespd, + &ibphysbuf, 1, + IB_ACCESS_LOCAL_WRITE, + &tagged_offset); + if (!ibmr) { + nes_debug(NES_DBG_CM, "Unable to register memory region" + "for lSMM for cm_node = %p \n", + cm_node); + pci_free_consistent(nesdev->pcidev, + nesqp->private_data_len + nesqp->ietf_frame_size, + nesqp->ietf_frame, nesqp->ietf_frame_pbase); + return -ENOMEM; + } + + ibmr->pd = &nespd->ibpd; + ibmr->device = nespd->ibpd.device; + nesqp->lsmm_mr = ibmr; + + u64temp |= NES_SW_CONTEXT_ALIGN >> 1; + set_wqe_64bit_value(wqe->wqe_words, + NES_IWARP_SQ_WQE_COMP_CTX_LOW_IDX, + u64temp); + wqe->wqe_words[NES_IWARP_SQ_WQE_MISC_IDX] = + cpu_to_le32(NES_IWARP_SQ_WQE_STREAMING | + NES_IWARP_SQ_WQE_WRPDU); + wqe->wqe_words[NES_IWARP_SQ_WQE_TOTAL_PAYLOAD_IDX] = + cpu_to_le32(buff_len); + set_wqe_64bit_value(wqe->wqe_words, + NES_IWARP_SQ_WQE_FRAG0_LOW_IDX, + (u64)(unsigned long)(*start_buff)); + wqe->wqe_words[NES_IWARP_SQ_WQE_LENGTH0_IDX] = + cpu_to_le32(buff_len); + wqe->wqe_words[NES_IWARP_SQ_WQE_STAG0_IDX] = ibmr->lkey; + if (nesqp->sq_kmapped) { + nesqp->sq_kmapped = 0; + kunmap(nesqp->page); + } + + nesqp->nesqp_context->ird_ord_sizes |= + cpu_to_le32(NES_QPCONTEXT_ORDIRD_LSMM_PRESENT | + NES_QPCONTEXT_ORDIRD_WRPDU); + } else { + nesqp->nesqp_context->ird_ord_sizes |= + cpu_to_le32(NES_QPCONTEXT_ORDIRD_WRPDU); + } + nesqp->skip_lsmm = 1; + + /* Cache the cm_id in the qp */ + nesqp->cm_id = cm_id; + cm_node->cm_id = cm_id; + + /* nesqp->cm_node = (void *)cm_id->provider_data; */ + cm_id->provider_data = nesqp; + nesqp->active_conn = 0; + + if (cm_node->state == NES_CM_STATE_TSA) + nes_debug(NES_DBG_CM, "Already state = TSA for cm_node=%p\n", + cm_node); + + 
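+ /* move the connection to accelerated (TSA) mode and load the negotiated TCP parameters into the QP context */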
nes_cm_init_tsa_conn(nesqp, cm_node); + + nesqp->nesqp_context->tcpPorts[0] = + cpu_to_le16(cm_node->mapped_loc_port); + nesqp->nesqp_context->tcpPorts[1] = + cpu_to_le16(cm_node->mapped_rem_port); + + nesqp->nesqp_context->ip0 = cpu_to_le32(cm_node->mapped_rem_addr); + + nesqp->nesqp_context->misc2 |= cpu_to_le32( + (u32)PCI_FUNC(nesdev->pcidev->devfn) << + NES_QPCONTEXT_MISC2_SRC_IP_SHIFT); + + nesqp->nesqp_context->arp_index_vlan |= + cpu_to_le32(nes_arp_table(nesdev, + le32_to_cpu(nesqp->nesqp_context->ip0), NULL, + NES_ARP_RESOLVE) << 16); + + nesqp->nesqp_context->ts_val_delta = cpu_to_le32( + jiffies - nes_read_indexed(nesdev, NES_IDX_TCP_NOW)); + + nesqp->nesqp_context->ird_index = cpu_to_le32(nesqp->hwqp.qp_id); + + nesqp->nesqp_context->ird_ord_sizes |= cpu_to_le32( + ((u32)1 << NES_QPCONTEXT_ORDIRD_IWARP_MODE_SHIFT)); + nesqp->nesqp_context->ird_ord_sizes |= + cpu_to_le32((u32)cm_node->ord_size); + + memset(&nes_quad, 0, sizeof(nes_quad)); + nes_quad.DstIpAdrIndex = + cpu_to_le32((u32)PCI_FUNC(nesdev->pcidev->devfn) << 24); + nes_quad.SrcIpadr = htonl(cm_node->mapped_rem_addr); + nes_quad.TcpPorts[0] = htons(cm_node->mapped_rem_port); + nes_quad.TcpPorts[1] = htons(cm_node->mapped_loc_port); + + /* Produce hash key */ + crc_value = get_crc_value(&nes_quad); + nesqp->hte_index = cpu_to_be32(crc_value ^ 0xffffffff); + nes_debug(NES_DBG_CM, "HTE Index = 0x%08X, CRC = 0x%08X\n", + nesqp->hte_index, nesqp->hte_index & adapter->hte_index_mask); + + nesqp->hte_index &= adapter->hte_index_mask; + nesqp->nesqp_context->hte_index = cpu_to_le32(nesqp->hte_index); + + cm_node->cm_core->api->accelerated(cm_node->cm_core, cm_node); + + nes_debug(NES_DBG_CM, "QP%u, Destination IP = 0x%08X:0x%04X, local = " + "0x%08X:0x%04X, rcv_nxt=0x%08X, snd_nxt=0x%08X, mpa + " + "private data length=%u.\n", nesqp->hwqp.qp_id, + ntohl(raddr->sin_addr.s_addr), ntohs(raddr->sin_port), + ntohl(laddr->sin_addr.s_addr), ntohs(laddr->sin_port), + le32_to_cpu(nesqp->nesqp_context->rcv_nxt), + le32_to_cpu(nesqp->nesqp_context->snd_nxt), + buff_len); + + /* notify OF layer that accept event was successful */ + cm_id->add_ref(cm_id); + nes_add_ref(&nesqp->ibqp); + + cm_event.event = IW_CM_EVENT_ESTABLISHED; + cm_event.status = 0; + cm_event.provider_data = (void *)nesqp; + cm_event.local_addr = cm_id->local_addr; + cm_event.remote_addr = cm_id->remote_addr; + cm_event.private_data = NULL; + cm_event.private_data_len = 0; + cm_event.ird = cm_node->ird_size; + cm_event.ord = cm_node->ord_size; + + ret = cm_id->event_handler(cm_id, &cm_event); + attr.qp_state = IB_QPS_RTS; + nes_modify_qp(&nesqp->ibqp, &attr, IB_QP_STATE, NULL); + if (cm_node->loopbackpartner) { + cm_node->loopbackpartner->mpa_frame_size = + nesqp->private_data_len; + /* copy entire MPA frame to our cm_node's frame */ + memcpy(cm_node->loopbackpartner->mpa_frame_buf, + conn_param->private_data, conn_param->private_data_len); + create_event(cm_node->loopbackpartner, NES_CM_EVENT_CONNECTED); + } + if (ret) + printk(KERN_ERR "%s[%u] OFA CM event_handler returned, " + "ret=%d\n", __func__, __LINE__, ret); + + return 0; +} + + +/** + * nes_reject + */ +int nes_reject(struct iw_cm_id *cm_id, const void *pdata, u8 pdata_len) +{ + struct nes_cm_node *cm_node; + struct nes_cm_node *loopback; + struct nes_cm_core *cm_core; + u8 *start_buff; + + atomic_inc(&cm_rejects); + cm_node = (struct nes_cm_node *)cm_id->provider_data; + loopback = cm_node->loopbackpartner; + cm_core = cm_node->cm_core; + cm_node->cm_id = cm_id; + + if (pdata_len + sizeof(struct ietf_mpa_v2) 
> MAX_CM_BUFFER) + return -EINVAL; + + if (loopback) { + memcpy(&loopback->mpa_frame.priv_data, pdata, pdata_len); + loopback->mpa_frame.priv_data_len = pdata_len; + loopback->mpa_frame_size = pdata_len; + } else { + start_buff = &cm_node->mpa_frame_buf[0] + sizeof(struct ietf_mpa_v2); + cm_node->mpa_frame_size = pdata_len; + memcpy(start_buff, pdata, pdata_len); + } + return cm_core->api->reject(cm_core, cm_node); +} + + +/** + * nes_connect + * setup and launch cm connect node + */ +int nes_connect(struct iw_cm_id *cm_id, struct iw_cm_conn_param *conn_param) +{ + struct ib_qp *ibqp; + struct nes_qp *nesqp; + struct nes_vnic *nesvnic; + struct nes_device *nesdev; + struct nes_cm_node *cm_node; + struct nes_cm_info cm_info; + int apbvt_set = 0; + struct sockaddr_in *laddr = (struct sockaddr_in *)&cm_id->local_addr; + struct sockaddr_in *raddr = (struct sockaddr_in *)&cm_id->remote_addr; + struct iwpm_dev_data pm_reg_msg; + struct iwpm_sa_data pm_msg; + int iwpm_err = 0; + + if (cm_id->remote_addr.ss_family != AF_INET) + return -ENOSYS; + ibqp = nes_get_qp(cm_id->device, conn_param->qpn); + if (!ibqp) + return -EINVAL; + nesqp = to_nesqp(ibqp); + if (!nesqp) + return -EINVAL; + nesvnic = to_nesvnic(nesqp->ibqp.device); + if (!nesvnic) + return -EINVAL; + nesdev = nesvnic->nesdev; + if (!nesdev) + return -EINVAL; + + if (!laddr->sin_port || !raddr->sin_port) + return -EINVAL; + + nes_debug(NES_DBG_CM, "QP%u, current IP = 0x%08X, Destination IP = " + "0x%08X:0x%04X, local = 0x%08X:0x%04X.\n", nesqp->hwqp.qp_id, + ntohl(nesvnic->local_ipaddr), ntohl(raddr->sin_addr.s_addr), + ntohs(raddr->sin_port), ntohl(laddr->sin_addr.s_addr), + ntohs(laddr->sin_port)); + + atomic_inc(&cm_connects); + nesqp->active_conn = 1; + + /* cache the cm_id in the qp */ + nesqp->cm_id = cm_id; + cm_id->provider_data = nesqp; + nesqp->private_data_len = conn_param->private_data_len; + + nes_debug(NES_DBG_CM, "requested ord = 0x%08X.\n", (u32)conn_param->ord); + nes_debug(NES_DBG_CM, "mpa private data len =%u\n", + conn_param->private_data_len); + + /* set up the connection params for the node */ + cm_info.loc_addr = ntohl(laddr->sin_addr.s_addr); + cm_info.loc_port = ntohs(laddr->sin_port); + cm_info.rem_addr = ntohl(raddr->sin_addr.s_addr); + cm_info.rem_port = ntohs(raddr->sin_port); + cm_info.cm_id = cm_id; + cm_info.conn_type = NES_CM_IWARP_CONN_TYPE; + + /* No port mapper available, go with the specified peer information */ + cm_info.mapped_loc_addr = cm_info.loc_addr; + cm_info.mapped_loc_port = cm_info.loc_port; + cm_info.mapped_rem_addr = cm_info.rem_addr; + cm_info.mapped_rem_port = cm_info.rem_port; + + nes_form_reg_msg(nesvnic, &pm_reg_msg); + iwpm_err = iwpm_register_pid(&pm_reg_msg, RDMA_NL_NES); + if (iwpm_err) { + nes_debug(NES_DBG_NLMSG, + "Port Mapper reg pid fail (err = %d).\n", iwpm_err); + } + if (iwpm_valid_pid() && !iwpm_err) { + nes_form_pm_msg(&cm_info, &pm_msg); + iwpm_err = iwpm_add_and_query_mapping(&pm_msg, RDMA_NL_NES); + if (iwpm_err) + nes_debug(NES_DBG_NLMSG, + "Port Mapper query fail (err = %d).\n", iwpm_err); + else + nes_record_pm_msg(&cm_info, &pm_msg); + } + + if (laddr->sin_addr.s_addr != raddr->sin_addr.s_addr) { + nes_manage_apbvt(nesvnic, cm_info.mapped_loc_port, + PCI_FUNC(nesdev->pcidev->devfn), NES_MANAGE_APBVT_ADD); + apbvt_set = 1; + } + + if (nes_create_mapinfo(&cm_info)) + return -ENOMEM; + + cm_id->add_ref(cm_id); + + /* create a connect CM node connection */ + cm_node = g_cm_core->api->connect(g_cm_core, nesvnic, + conn_param->private_data_len, (void 
*)conn_param->private_data, + &cm_info); + if (!cm_node) { + if (apbvt_set) + nes_manage_apbvt(nesvnic, cm_info.mapped_loc_port, + PCI_FUNC(nesdev->pcidev->devfn), + NES_MANAGE_APBVT_DEL); + + nes_debug(NES_DBG_NLMSG, "Delete mapped_loc_port = %04X\n", + cm_info.mapped_loc_port); + nes_remove_mapinfo(cm_info.loc_addr, cm_info.loc_port, + cm_info.mapped_loc_addr, cm_info.mapped_loc_port); + cm_id->rem_ref(cm_id); + return -ENOMEM; + } + + record_ird_ord(cm_node, (u16)conn_param->ird, (u16)conn_param->ord); + if (cm_node->send_rdma0_op == SEND_RDMA_READ_ZERO && + cm_node->ord_size == 0) + cm_node->ord_size = 1; + + cm_node->apbvt_set = apbvt_set; + nesqp->cm_node = cm_node; + cm_node->nesqp = nesqp; + nes_add_ref(&nesqp->ibqp); + + return 0; +} + + +/** + * nes_create_listen + */ +int nes_create_listen(struct iw_cm_id *cm_id, int backlog) +{ + struct nes_vnic *nesvnic; + struct nes_cm_listener *cm_node; + struct nes_cm_info cm_info; + int err; + struct sockaddr_in *laddr = (struct sockaddr_in *)&cm_id->local_addr; + + nes_debug(NES_DBG_CM, "cm_id = %p, local port = 0x%04X.\n", + cm_id, ntohs(laddr->sin_port)); + + if (cm_id->local_addr.ss_family != AF_INET) + return -ENOSYS; + nesvnic = to_nesvnic(cm_id->device); + if (!nesvnic) + return -EINVAL; + + nes_debug(NES_DBG_CM, "nesvnic=%p, netdev=%p, %s\n", + nesvnic, nesvnic->netdev, nesvnic->netdev->name); + + nes_debug(NES_DBG_CM, "nesvnic->local_ipaddr=0x%08x, sin_addr.s_addr=0x%08x\n", + nesvnic->local_ipaddr, laddr->sin_addr.s_addr); + + /* setup listen params in our api call struct */ + cm_info.loc_addr = ntohl(nesvnic->local_ipaddr); + cm_info.loc_port = ntohs(laddr->sin_port); + cm_info.backlog = backlog; + cm_info.cm_id = cm_id; + + cm_info.conn_type = NES_CM_IWARP_CONN_TYPE; + + /* No port mapper available, go with the specified info */ + cm_info.mapped_loc_addr = cm_info.loc_addr; + cm_info.mapped_loc_port = cm_info.loc_port; + + cm_node = g_cm_core->api->listen(g_cm_core, nesvnic, &cm_info); + if (!cm_node) { + printk(KERN_ERR "%s[%u] Error returned from listen API call\n", + __func__, __LINE__); + return -ENOMEM; + } + + cm_id->provider_data = cm_node; + + if (!cm_node->reused_node) { + if (nes_create_mapinfo(&cm_info)) + return -ENOMEM; + + err = nes_manage_apbvt(nesvnic, cm_node->mapped_loc_port, + PCI_FUNC(nesvnic->nesdev->pcidev->devfn), + NES_MANAGE_APBVT_ADD); + if (err) { + printk(KERN_ERR "nes_manage_apbvt call returned %d.\n", + err); + g_cm_core->api->stop_listener(g_cm_core, (void *)cm_node); + return err; + } + atomic_inc(&cm_listens_created); + } + + cm_id->add_ref(cm_id); + cm_id->provider_data = (void *)cm_node; + + + return 0; +} + + +/** + * nes_destroy_listen + */ +int nes_destroy_listen(struct iw_cm_id *cm_id) +{ + if (cm_id->provider_data) + g_cm_core->api->stop_listener(g_cm_core, cm_id->provider_data); + else + nes_debug(NES_DBG_CM, "cm_id->provider_data was NULL\n"); + + cm_id->rem_ref(cm_id); + + return 0; +} + + +/** + * nes_cm_recv + */ +int nes_cm_recv(struct sk_buff *skb, struct net_device *netdevice) +{ + int rc = 0; + + cm_packets_received++; + if ((g_cm_core) && (g_cm_core->api)) + rc = g_cm_core->api->recv_pkt(g_cm_core, netdev_priv(netdevice), skb); + else + nes_debug(NES_DBG_CM, "Unable to process packet for CM," + " cm is not setup properly.\n"); + + return rc; +} + + +/** + * nes_cm_start + * Start and init a cm core module + */ +int nes_cm_start(void) +{ + nes_debug(NES_DBG_CM, "\n"); + /* create the primary CM core, pass this handle to subsequent core inits */ + g_cm_core = nes_cm_alloc_core(); 
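+ /* g_cm_core is the single global CM core handle used by all subsequent CM API calls */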
+ if (g_cm_core) + return 0; + else + return -ENOMEM; +} + + +/** + * nes_cm_stop + * stop and dealloc all cm core instances + */ +int nes_cm_stop(void) +{ + g_cm_core->api->destroy_cm_core(g_cm_core); + return 0; +} + + +/** + * cm_event_connected + * handle a connected event, setup QPs and HW + */ +static void cm_event_connected(struct nes_cm_event *event) +{ + struct nes_qp *nesqp; + struct nes_vnic *nesvnic; + struct nes_device *nesdev; + struct nes_cm_node *cm_node; + struct nes_adapter *nesadapter; + struct ib_qp_attr attr; + struct iw_cm_id *cm_id; + struct iw_cm_event cm_event; + struct nes_v4_quad nes_quad; + u32 crc_value; + int ret; + struct sockaddr_in *laddr; + struct sockaddr_in *raddr; + struct sockaddr_in *cm_event_laddr; + + /* get all our handles */ + cm_node = event->cm_node; + cm_id = cm_node->cm_id; + nes_debug(NES_DBG_CM, "cm_event_connected - %p - cm_id = %p\n", cm_node, cm_id); + nesqp = (struct nes_qp *)cm_id->provider_data; + nesvnic = to_nesvnic(nesqp->ibqp.device); + nesdev = nesvnic->nesdev; + nesadapter = nesdev->nesadapter; + laddr = (struct sockaddr_in *)&cm_id->local_addr; + raddr = (struct sockaddr_in *)&cm_id->remote_addr; + cm_event_laddr = (struct sockaddr_in *)&cm_event.local_addr; + + if (nesqp->destroyed) + return; + atomic_inc(&cm_connecteds); + nes_debug(NES_DBG_CM, "QP%u attempting to connect to 0x%08X:0x%04X on" + " local port 0x%04X. jiffies = %lu.\n", + nesqp->hwqp.qp_id, ntohl(raddr->sin_addr.s_addr), + ntohs(raddr->sin_port), ntohs(laddr->sin_port), jiffies); + + nes_cm_init_tsa_conn(nesqp, cm_node); + + /* set the QP tsa context */ + nesqp->nesqp_context->tcpPorts[0] = + cpu_to_le16(cm_node->mapped_loc_port); + nesqp->nesqp_context->tcpPorts[1] = + cpu_to_le16(cm_node->mapped_rem_port); + nesqp->nesqp_context->ip0 = cpu_to_le32(cm_node->mapped_rem_addr); + + nesqp->nesqp_context->misc2 |= cpu_to_le32( + (u32)PCI_FUNC(nesdev->pcidev->devfn) << + NES_QPCONTEXT_MISC2_SRC_IP_SHIFT); + nesqp->nesqp_context->arp_index_vlan |= cpu_to_le32( + nes_arp_table(nesdev, + le32_to_cpu(nesqp->nesqp_context->ip0), + NULL, NES_ARP_RESOLVE) << 16); + nesqp->nesqp_context->ts_val_delta = cpu_to_le32( + jiffies - nes_read_indexed(nesdev, NES_IDX_TCP_NOW)); + nesqp->nesqp_context->ird_index = cpu_to_le32(nesqp->hwqp.qp_id); + nesqp->nesqp_context->ird_ord_sizes |= + cpu_to_le32((u32)1 << + NES_QPCONTEXT_ORDIRD_IWARP_MODE_SHIFT); + nesqp->nesqp_context->ird_ord_sizes |= + cpu_to_le32((u32)cm_node->ord_size); + + /* Adjust tail for not having a LSMM */ + /*nesqp->hwqp.sq_tail = 1;*/ + + build_rdma0_msg(cm_node, &nesqp); + + nes_write32(nesdev->regs + NES_WQE_ALLOC, + (1 << 24) | 0x00800000 | nesqp->hwqp.qp_id); + + memset(&nes_quad, 0, sizeof(nes_quad)); + + nes_quad.DstIpAdrIndex = + cpu_to_le32((u32)PCI_FUNC(nesdev->pcidev->devfn) << 24); + nes_quad.SrcIpadr = htonl(cm_node->mapped_rem_addr); + nes_quad.TcpPorts[0] = htons(cm_node->mapped_rem_port); + nes_quad.TcpPorts[1] = htons(cm_node->mapped_loc_port); + + /* Produce hash key */ + crc_value = get_crc_value(&nes_quad); + nesqp->hte_index = cpu_to_be32(crc_value ^ 0xffffffff); + nes_debug(NES_DBG_CM, "HTE Index = 0x%08X, After CRC = 0x%08X\n", + nesqp->hte_index, nesqp->hte_index & nesadapter->hte_index_mask); + + nesqp->hte_index &= nesadapter->hte_index_mask; + nesqp->nesqp_context->hte_index = cpu_to_le32(nesqp->hte_index); + + nesqp->ietf_frame = &cm_node->mpa_frame; + nesqp->private_data_len = (u8)cm_node->mpa_frame_size; + cm_node->cm_core->api->accelerated(cm_node->cm_core, cm_node); + + /* notify OF 
layer we successfully created the requested connection */ + cm_event.event = IW_CM_EVENT_CONNECT_REPLY; + cm_event.status = 0; + cm_event.provider_data = cm_id->provider_data; + cm_event_laddr->sin_family = AF_INET; + cm_event_laddr->sin_port = laddr->sin_port; + cm_event.remote_addr = cm_id->remote_addr; + + cm_event.private_data = (void *)event->cm_node->mpa_frame_buf; + cm_event.private_data_len = (u8)event->cm_node->mpa_frame_size; + cm_event.ird = cm_node->ird_size; + cm_event.ord = cm_node->ord_size; + + cm_event_laddr->sin_addr.s_addr = htonl(event->cm_info.rem_addr); + ret = cm_id->event_handler(cm_id, &cm_event); + nes_debug(NES_DBG_CM, "OFA CM event_handler returned, ret=%d\n", ret); + + if (ret) + printk(KERN_ERR "%s[%u] OFA CM event_handler returned, " + "ret=%d\n", __func__, __LINE__, ret); + attr.qp_state = IB_QPS_RTS; + nes_modify_qp(&nesqp->ibqp, &attr, IB_QP_STATE, NULL); + + nes_debug(NES_DBG_CM, "Exiting connect thread for QP%u. jiffies = " + "%lu\n", nesqp->hwqp.qp_id, jiffies); + + return; +} + + +/** + * cm_event_connect_error + */ +static void cm_event_connect_error(struct nes_cm_event *event) +{ + struct nes_qp *nesqp; + struct iw_cm_id *cm_id; + struct iw_cm_event cm_event; + /* struct nes_cm_info cm_info; */ + int ret; + + if (!event->cm_node) + return; + + cm_id = event->cm_node->cm_id; + if (!cm_id) + return; + + nes_debug(NES_DBG_CM, "cm_node=%p, cm_id=%p\n", event->cm_node, cm_id); + nesqp = cm_id->provider_data; + + if (!nesqp) + return; + + /* notify OF layer about this connection error event */ + /* cm_id->rem_ref(cm_id); */ + nesqp->cm_id = NULL; + cm_id->provider_data = NULL; + cm_event.event = IW_CM_EVENT_CONNECT_REPLY; + cm_event.status = -ECONNRESET; + cm_event.provider_data = cm_id->provider_data; + cm_event.local_addr = cm_id->local_addr; + cm_event.remote_addr = cm_id->remote_addr; + cm_event.private_data = NULL; + cm_event.private_data_len = 0; + +#ifdef CONFIG_INFINIBAND_NES_DEBUG + { + struct sockaddr_in *cm_event_laddr = (struct sockaddr_in *) + &cm_event.local_addr; + struct sockaddr_in *cm_event_raddr = (struct sockaddr_in *) + &cm_event.remote_addr; + nes_debug(NES_DBG_CM, "call CM_EVENT REJECTED, local_addr=%08x, remote_addr=%08x\n", + cm_event_laddr->sin_addr.s_addr, cm_event_raddr->sin_addr.s_addr); + } +#endif + + ret = cm_id->event_handler(cm_id, &cm_event); + nes_debug(NES_DBG_CM, "OFA CM event_handler returned, ret=%d\n", ret); + if (ret) + printk(KERN_ERR "%s[%u] OFA CM event_handler returned, " + "ret=%d\n", __func__, __LINE__, ret); + cm_id->rem_ref(cm_id); + + rem_ref_cm_node(event->cm_node->cm_core, event->cm_node); + return; +} + + +/** + * cm_event_reset + */ +static void cm_event_reset(struct nes_cm_event *event) +{ + struct nes_qp *nesqp; + struct iw_cm_id *cm_id; + struct iw_cm_event cm_event; + /* struct nes_cm_info cm_info; */ + int ret; + + if (!event->cm_node) + return; + + if (!event->cm_node->cm_id) + return; + + cm_id = event->cm_node->cm_id; + + nes_debug(NES_DBG_CM, "%p - cm_id = %p\n", event->cm_node, cm_id); + nesqp = cm_id->provider_data; + if (!nesqp) + return; + + nesqp->cm_id = NULL; + /* cm_id->provider_data = NULL; */ + cm_event.event = IW_CM_EVENT_DISCONNECT; + cm_event.status = -ECONNRESET; + cm_event.provider_data = cm_id->provider_data; + cm_event.local_addr = cm_id->local_addr; + cm_event.remote_addr = cm_id->remote_addr; + cm_event.private_data = NULL; + cm_event.private_data_len = 0; + + cm_id->add_ref(cm_id); + ret = cm_id->event_handler(cm_id, &cm_event); + atomic_inc(&cm_closes); + cm_event.event = 
IW_CM_EVENT_CLOSE; + cm_event.status = 0; + cm_event.provider_data = cm_id->provider_data; + cm_event.local_addr = cm_id->local_addr; + cm_event.remote_addr = cm_id->remote_addr; + cm_event.private_data = NULL; + cm_event.private_data_len = 0; + nes_debug(NES_DBG_CM, "NODE %p Generating CLOSE\n", event->cm_node); + ret = cm_id->event_handler(cm_id, &cm_event); + + nes_debug(NES_DBG_CM, "OFA CM event_handler returned, ret=%d\n", ret); + + + /* notify OF layer about this connection error event */ + cm_id->rem_ref(cm_id); + + return; +} + + +/** + * cm_event_mpa_req + */ +static void cm_event_mpa_req(struct nes_cm_event *event) +{ + struct iw_cm_id *cm_id; + struct iw_cm_event cm_event; + int ret; + struct nes_cm_node *cm_node; + struct sockaddr_in *cm_event_laddr = (struct sockaddr_in *) + &cm_event.local_addr; + struct sockaddr_in *cm_event_raddr = (struct sockaddr_in *) + &cm_event.remote_addr; + + cm_node = event->cm_node; + if (!cm_node) + return; + cm_id = cm_node->cm_id; + + atomic_inc(&cm_connect_reqs); + nes_debug(NES_DBG_CM, "cm_node = %p - cm_id = %p, jiffies = %lu\n", + cm_node, cm_id, jiffies); + + cm_event.event = IW_CM_EVENT_CONNECT_REQUEST; + cm_event.status = 0; + cm_event.provider_data = (void *)cm_node; + + cm_event_laddr->sin_family = AF_INET; + cm_event_laddr->sin_port = htons(event->cm_info.loc_port); + cm_event_laddr->sin_addr.s_addr = htonl(event->cm_info.loc_addr); + + cm_event_raddr->sin_family = AF_INET; + cm_event_raddr->sin_port = htons(event->cm_info.rem_port); + cm_event_raddr->sin_addr.s_addr = htonl(event->cm_info.rem_addr); + cm_event.private_data = cm_node->mpa_frame_buf; + cm_event.private_data_len = (u8)cm_node->mpa_frame_size; + if (cm_node->mpa_frame_rev == IETF_MPA_V1) { + cm_event.ird = NES_MAX_IRD; + cm_event.ord = NES_MAX_ORD; + } else { + cm_event.ird = cm_node->ird_size; + cm_event.ord = cm_node->ord_size; + } + + ret = cm_id->event_handler(cm_id, &cm_event); + if (ret) + printk(KERN_ERR "%s[%u] OFA CM event_handler returned, ret=%d\n", + __func__, __LINE__, ret); + return; +} + + +static void cm_event_mpa_reject(struct nes_cm_event *event) +{ + struct iw_cm_id *cm_id; + struct iw_cm_event cm_event; + struct nes_cm_node *cm_node; + int ret; + struct sockaddr_in *cm_event_laddr = (struct sockaddr_in *) + &cm_event.local_addr; + struct sockaddr_in *cm_event_raddr = (struct sockaddr_in *) + &cm_event.remote_addr; + + cm_node = event->cm_node; + if (!cm_node) + return; + cm_id = cm_node->cm_id; + + atomic_inc(&cm_connect_reqs); + nes_debug(NES_DBG_CM, "cm_node = %p - cm_id = %p, jiffies = %lu\n", + cm_node, cm_id, jiffies); + + cm_event.event = IW_CM_EVENT_CONNECT_REPLY; + cm_event.status = -ECONNREFUSED; + cm_event.provider_data = cm_id->provider_data; + + cm_event_laddr->sin_family = AF_INET; + cm_event_laddr->sin_port = htons(event->cm_info.loc_port); + cm_event_laddr->sin_addr.s_addr = htonl(event->cm_info.loc_addr); + + cm_event_raddr->sin_family = AF_INET; + cm_event_raddr->sin_port = htons(event->cm_info.rem_port); + cm_event_raddr->sin_addr.s_addr = htonl(event->cm_info.rem_addr); + + cm_event.private_data = cm_node->mpa_frame_buf; + cm_event.private_data_len = (u8)cm_node->mpa_frame_size; + + nes_debug(NES_DBG_CM, "call CM_EVENT_MPA_REJECTED, local_addr=%08x, " + "remove_addr=%08x\n", + cm_event_laddr->sin_addr.s_addr, + cm_event_raddr->sin_addr.s_addr); + + ret = cm_id->event_handler(cm_id, &cm_event); + if (ret) + printk(KERN_ERR "%s[%u] OFA CM event_handler returned, ret=%d\n", + __func__, __LINE__, ret); + + return; +} + + +static void 
nes_cm_event_handler(struct work_struct *); + +/** + * nes_cm_post_event + * post an event to the cm event handler + */ +static int nes_cm_post_event(struct nes_cm_event *event) +{ + atomic_inc(&event->cm_node->cm_core->events_posted); + add_ref_cm_node(event->cm_node); + event->cm_info.cm_id->add_ref(event->cm_info.cm_id); + INIT_WORK(&event->event_work, nes_cm_event_handler); + nes_debug(NES_DBG_CM, "cm_node=%p queue_work, event=%p\n", + event->cm_node, event); + + queue_work(event->cm_node->cm_core->event_wq, &event->event_work); + + nes_debug(NES_DBG_CM, "Exit\n"); + return 0; +} + + +/** + * nes_cm_event_handler + * worker function to handle cm events + * will free instance of nes_cm_event + */ +static void nes_cm_event_handler(struct work_struct *work) +{ + struct nes_cm_event *event = container_of(work, struct nes_cm_event, + event_work); + struct nes_cm_core *cm_core; + + if ((!event) || (!event->cm_node) || (!event->cm_node->cm_core)) + return; + + cm_core = event->cm_node->cm_core; + nes_debug(NES_DBG_CM, "event=%p, event->type=%u, events posted=%u\n", + event, event->type, atomic_read(&cm_core->events_posted)); + + switch (event->type) { + case NES_CM_EVENT_MPA_REQ: + cm_event_mpa_req(event); + nes_debug(NES_DBG_CM, "cm_node=%p CM Event: MPA REQUEST\n", + event->cm_node); + break; + case NES_CM_EVENT_RESET: + nes_debug(NES_DBG_CM, "cm_node = %p CM Event: RESET\n", + event->cm_node); + cm_event_reset(event); + break; + case NES_CM_EVENT_CONNECTED: + if ((!event->cm_node->cm_id) || + (event->cm_node->state != NES_CM_STATE_TSA)) + break; + cm_event_connected(event); + nes_debug(NES_DBG_CM, "CM Event: CONNECTED\n"); + break; + case NES_CM_EVENT_MPA_REJECT: + if ((!event->cm_node->cm_id) || + (event->cm_node->state == NES_CM_STATE_TSA)) + break; + cm_event_mpa_reject(event); + nes_debug(NES_DBG_CM, "CM Event: REJECT\n"); + break; + + case NES_CM_EVENT_ABORTED: + if ((!event->cm_node->cm_id) || + (event->cm_node->state == NES_CM_STATE_TSA)) + break; + cm_event_connect_error(event); + nes_debug(NES_DBG_CM, "CM Event: ABORTED\n"); + break; + case NES_CM_EVENT_DROPPED_PKT: + nes_debug(NES_DBG_CM, "CM Event: DROPPED PKT\n"); + break; + default: + nes_debug(NES_DBG_CM, "CM Event: UNKNOWN EVENT TYPE\n"); + break; + } + + atomic_dec(&cm_core->events_posted); + event->cm_info.cm_id->rem_ref(event->cm_info.cm_id); + rem_ref_cm_node(cm_core, event->cm_node); + kfree(event); + + return; +} diff --git a/kernel/drivers/infiniband/hw/nes/nes_cm.h b/kernel/drivers/infiniband/hw/nes/nes_cm.h new file mode 100644 index 000000000..f522cf639 --- /dev/null +++ b/kernel/drivers/infiniband/hw/nes/nes_cm.h @@ -0,0 +1,476 @@ +/* + * Copyright (c) 2006 - 2014 Intel Corporation. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. 
+ * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ + +#ifndef NES_CM_H +#define NES_CM_H + +#define QUEUE_EVENTS + +#define NES_MANAGE_APBVT_DEL 0 +#define NES_MANAGE_APBVT_ADD 1 + +#define NES_MPA_REQUEST_ACCEPT 1 +#define NES_MPA_REQUEST_REJECT 2 + +/* IETF MPA -- defines, enums, structs */ +#define IEFT_MPA_KEY_REQ "MPA ID Req Frame" +#define IEFT_MPA_KEY_REP "MPA ID Rep Frame" +#define IETF_MPA_KEY_SIZE 16 +#define IETF_MPA_VERSION 1 +#define IETF_MAX_PRIV_DATA_LEN 512 +#define IETF_MPA_FRAME_SIZE 20 +#define IETF_RTR_MSG_SIZE 4 +#define IETF_MPA_V2_FLAG 0x10 + +/* IETF RTR MSG Fields */ +#define IETF_PEER_TO_PEER 0x8000 +#define IETF_FLPDU_ZERO_LEN 0x4000 +#define IETF_RDMA0_WRITE 0x8000 +#define IETF_RDMA0_READ 0x4000 +#define IETF_NO_IRD_ORD 0x3FFF +#define NES_MAX_IRD 0x40 +#define NES_MAX_ORD 0x7F + +enum ietf_mpa_flags { + IETF_MPA_FLAGS_MARKERS = 0x80, /* receive Markers */ + IETF_MPA_FLAGS_CRC = 0x40, /* receive Markers */ + IETF_MPA_FLAGS_REJECT = 0x20, /* Reject */ +}; + +struct ietf_mpa_v1 { + u8 key[IETF_MPA_KEY_SIZE]; + u8 flags; + u8 rev; + __be16 priv_data_len; + u8 priv_data[0]; +}; + +#define ietf_mpa_req_resp_frame ietf_mpa_frame + +struct ietf_rtr_msg { + __be16 ctrl_ird; + __be16 ctrl_ord; +}; + +struct ietf_mpa_v2 { + u8 key[IETF_MPA_KEY_SIZE]; + u8 flags; + u8 rev; + __be16 priv_data_len; + struct ietf_rtr_msg rtr_msg; + u8 priv_data[0]; +}; + +struct nes_v4_quad { + u32 rsvd0; + __le32 DstIpAdrIndex; /* Only most significant 5 bits are valid */ + __be32 SrcIpadr; + __be16 TcpPorts[2]; /* src is low, dest is high */ +}; + +struct nes_cm_node; +enum nes_timer_type { + NES_TIMER_TYPE_SEND, + NES_TIMER_TYPE_RECV, + NES_TIMER_NODE_CLEANUP, + NES_TIMER_TYPE_CLOSE, +}; + +#define NES_PASSIVE_STATE_INDICATED 0 +#define NES_DO_NOT_SEND_RESET_EVENT 1 +#define NES_SEND_RESET_EVENT 2 + +#define MAX_NES_IFS 4 + +#define SET_ACK 1 +#define SET_SYN 2 +#define SET_FIN 4 +#define SET_RST 8 + +#define TCP_OPTIONS_PADDING 3 + +struct option_base { + u8 optionnum; + u8 length; +}; + +enum option_numbers { + OPTION_NUMBER_END, + OPTION_NUMBER_NONE, + OPTION_NUMBER_MSS, + OPTION_NUMBER_WINDOW_SCALE, + OPTION_NUMBER_SACK_PERM, + OPTION_NUMBER_SACK, + OPTION_NUMBER_WRITE0 = 0xbc +}; + +struct option_mss { + u8 optionnum; + u8 length; + __be16 mss; +}; + +struct option_windowscale { + u8 optionnum; + u8 length; + u8 shiftcount; +}; + +union all_known_options { + char as_end; + struct option_base as_base; + struct option_mss as_mss; + struct option_windowscale as_windowscale; +}; + +struct nes_timer_entry { + struct list_head list; + unsigned long timetosend; /* jiffies */ + struct sk_buff *skb; + u32 type; + u32 retrycount; + u32 retranscount; + u32 context; + u32 seq_num; + u32 send_retrans; + int close_when_complete; + struct net_device *netdev; +}; + +#define NES_DEFAULT_RETRYS 64 +#define NES_DEFAULT_RETRANS 8 
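+/* retransmit timeout: 1 second for debug builds, 3 seconds otherwise */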
+#ifdef CONFIG_INFINIBAND_NES_DEBUG +#define NES_RETRY_TIMEOUT (1000*HZ/1000) +#else +#define NES_RETRY_TIMEOUT (3000*HZ/1000) +#endif +#define NES_SHORT_TIME (10) +#define NES_LONG_TIME (2000*HZ/1000) +#define NES_MAX_TIMEOUT ((unsigned long) (12*HZ)) + +#define NES_CM_HASHTABLE_SIZE 1024 +#define NES_CM_TCP_TIMER_INTERVAL 3000 +#define NES_CM_DEFAULT_MTU 1540 +#define NES_CM_DEFAULT_FRAME_CNT 10 +#define NES_CM_THREAD_STACK_SIZE 256 +#define NES_CM_DEFAULT_RCV_WND 64240 // before we know that window scaling is allowed +#define NES_CM_DEFAULT_RCV_WND_SCALED 256960 // after we know that window scaling is allowed +#define NES_CM_DEFAULT_RCV_WND_SCALE 2 +#define NES_CM_DEFAULT_FREE_PKTS 0x000A +#define NES_CM_FREE_PKT_LO_WATERMARK 2 + +#define NES_CM_DEFAULT_MSS 536 + +#define NES_CM_DEF_SEQ 0x159bf75f +#define NES_CM_DEF_LOCAL_ID 0x3b47 + +#define NES_CM_DEF_SEQ2 0x18ed5740 +#define NES_CM_DEF_LOCAL_ID2 0xb807 +#define MAX_CM_BUFFER (IETF_MPA_FRAME_SIZE + IETF_RTR_MSG_SIZE + IETF_MAX_PRIV_DATA_LEN) + +typedef u32 nes_addr_t; + +#define nes_cm_tsa_context nes_qp_context + +struct nes_qp; + +/* cm node transition states */ +enum nes_cm_node_state { + NES_CM_STATE_UNKNOWN, + NES_CM_STATE_INITED, + NES_CM_STATE_LISTENING, + NES_CM_STATE_SYN_RCVD, + NES_CM_STATE_SYN_SENT, + NES_CM_STATE_ONE_SIDE_ESTABLISHED, + NES_CM_STATE_ESTABLISHED, + NES_CM_STATE_ACCEPTING, + NES_CM_STATE_MPAREQ_SENT, + NES_CM_STATE_MPAREQ_RCVD, + NES_CM_STATE_MPAREJ_RCVD, + NES_CM_STATE_TSA, + NES_CM_STATE_FIN_WAIT1, + NES_CM_STATE_FIN_WAIT2, + NES_CM_STATE_CLOSE_WAIT, + NES_CM_STATE_TIME_WAIT, + NES_CM_STATE_LAST_ACK, + NES_CM_STATE_CLOSING, + NES_CM_STATE_LISTENER_DESTROYED, + NES_CM_STATE_CLOSED +}; + +enum mpa_frame_version { + IETF_MPA_V1 = 1, + IETF_MPA_V2 = 2 +}; + +enum mpa_frame_key { + MPA_KEY_REQUEST, + MPA_KEY_REPLY +}; + +enum send_rdma0 { + SEND_RDMA_READ_ZERO = 1, + SEND_RDMA_WRITE_ZERO = 2 +}; + +enum nes_tcpip_pkt_type { + NES_PKT_TYPE_UNKNOWN, + NES_PKT_TYPE_SYN, + NES_PKT_TYPE_SYNACK, + NES_PKT_TYPE_ACK, + NES_PKT_TYPE_FIN, + NES_PKT_TYPE_RST +}; + + +/* type of nes connection */ +enum nes_cm_conn_type { + NES_CM_IWARP_CONN_TYPE, +}; + +/* CM context params */ +struct nes_cm_tcp_context { + u8 client; + + u32 loc_seq_num; + u32 loc_ack_num; + u32 rem_ack_num; + u32 rcv_nxt; + + u32 loc_id; + u32 rem_id; + + u32 snd_wnd; + u32 max_snd_wnd; + + u32 rcv_wnd; + u32 mss; + u8 snd_wscale; + u8 rcv_wscale; + + struct nes_cm_tsa_context tsa_cntxt; + struct timeval sent_ts; +}; + + +enum nes_cm_listener_state { + NES_CM_LISTENER_PASSIVE_STATE = 1, + NES_CM_LISTENER_ACTIVE_STATE = 2, + NES_CM_LISTENER_EITHER_STATE = 3 +}; + +struct nes_cm_listener { + struct list_head list; + struct nes_cm_core *cm_core; + u8 loc_mac[ETH_ALEN]; + nes_addr_t loc_addr, mapped_loc_addr; + u16 loc_port, mapped_loc_port; + struct iw_cm_id *cm_id; + enum nes_cm_conn_type conn_type; + atomic_t ref_count; + struct nes_vnic *nesvnic; + atomic_t pend_accepts_cnt; + int backlog; + enum nes_cm_listener_state listener_state; + u32 reused_node; +}; + +/* per connection node and node state information */ +struct nes_cm_node { + nes_addr_t loc_addr, rem_addr; + nes_addr_t mapped_loc_addr, mapped_rem_addr; + u16 loc_port, rem_port; + u16 mapped_loc_port, mapped_rem_port; + + u8 loc_mac[ETH_ALEN]; + u8 rem_mac[ETH_ALEN]; + + enum nes_cm_node_state state; + struct nes_cm_tcp_context tcp_cntxt; + struct nes_cm_core *cm_core; + struct sk_buff_head resend_list; + atomic_t ref_count; + struct net_device *netdev; + + struct nes_cm_node *loopbackpartner; + 
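+ /* retransmit and close timer entries for this connection */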
+ struct nes_timer_entry *send_entry; + struct nes_timer_entry *recv_entry; + spinlock_t retrans_list_lock; + enum send_rdma0 send_rdma0_op; + + union { + struct ietf_mpa_v1 mpa_frame; + struct ietf_mpa_v2 mpa_v2_frame; + u8 mpa_frame_buf[MAX_CM_BUFFER]; + }; + enum mpa_frame_version mpa_frame_rev; + u16 ird_size; + u16 ord_size; + u16 mpav2_ird_ord; + + u16 mpa_frame_size; + struct iw_cm_id *cm_id; + struct list_head list; + int accelerated; + struct nes_cm_listener *listener; + enum nes_cm_conn_type conn_type; + struct nes_vnic *nesvnic; + int apbvt_set; + int accept_pend; + struct list_head timer_entry; + struct list_head reset_entry; + struct nes_qp *nesqp; + atomic_t passive_state; +}; + +/* structure for client or CM to fill when making CM api calls. */ +/* - only need to set relevant data, based on op. */ +struct nes_cm_info { + union { + struct iw_cm_id *cm_id; + struct net_device *netdev; + }; + + u16 loc_port; + u16 rem_port; + nes_addr_t loc_addr; + nes_addr_t rem_addr; + u16 mapped_loc_port; + u16 mapped_rem_port; + nes_addr_t mapped_loc_addr; + nes_addr_t mapped_rem_addr; + + enum nes_cm_conn_type conn_type; + int backlog; +}; + +/* CM event codes */ +enum nes_cm_event_type { + NES_CM_EVENT_UNKNOWN, + NES_CM_EVENT_ESTABLISHED, + NES_CM_EVENT_MPA_REQ, + NES_CM_EVENT_MPA_CONNECT, + NES_CM_EVENT_MPA_ACCEPT, + NES_CM_EVENT_MPA_REJECT, + NES_CM_EVENT_MPA_ESTABLISHED, + NES_CM_EVENT_CONNECTED, + NES_CM_EVENT_CLOSED, + NES_CM_EVENT_RESET, + NES_CM_EVENT_DROPPED_PKT, + NES_CM_EVENT_CLOSE_IMMED, + NES_CM_EVENT_CLOSE_HARD, + NES_CM_EVENT_CLOSE_CLEAN, + NES_CM_EVENT_ABORTED, + NES_CM_EVENT_SEND_FIRST +}; + +/* event to post to CM event handler */ +struct nes_cm_event { + enum nes_cm_event_type type; + + struct nes_cm_info cm_info; + struct work_struct event_work; + struct nes_cm_node *cm_node; +}; + +struct nes_cm_core { + enum nes_cm_node_state state; + + atomic_t listen_node_cnt; + struct nes_cm_node listen_list; + spinlock_t listen_list_lock; + + u32 mtu; + u32 free_tx_pkt_max; + u32 rx_pkt_posted; + atomic_t ht_node_cnt; + struct list_head connected_nodes; + /* struct list_head hashtable[NES_CM_HASHTABLE_SIZE]; */ + spinlock_t ht_lock; + + struct timer_list tcp_timer; + + struct nes_cm_ops *api; + + int (*post_event)(struct nes_cm_event *event); + atomic_t events_posted; + struct workqueue_struct *event_wq; + struct workqueue_struct *disconn_wq; + + atomic_t node_cnt; + u64 aborted_connects; + u32 options; + + struct nes_cm_node *current_listen_node; +}; + + +#define NES_CM_SET_PKT_SIZE (1 << 1) +#define NES_CM_SET_FREE_PKT_Q_SIZE (1 << 2) + +/* CM ops/API for client interface */ +struct nes_cm_ops { + int (*accelerated)(struct nes_cm_core *, struct nes_cm_node *); + struct nes_cm_listener * (*listen)(struct nes_cm_core *, struct nes_vnic *, + struct nes_cm_info *); + int (*stop_listener)(struct nes_cm_core *, struct nes_cm_listener *); + struct nes_cm_node * (*connect)(struct nes_cm_core *, + struct nes_vnic *, u16, void *, + struct nes_cm_info *); + int (*close)(struct nes_cm_core *, struct nes_cm_node *); + int (*accept)(struct nes_cm_core *, struct nes_cm_node *); + int (*reject)(struct nes_cm_core *, struct nes_cm_node *); + int (*recv_pkt)(struct nes_cm_core *, struct nes_vnic *, + struct sk_buff *); + int (*destroy_cm_core)(struct nes_cm_core *); + int (*get)(struct nes_cm_core *); + int (*set)(struct nes_cm_core *, u32, u32); +}; + +int schedule_nes_timer(struct nes_cm_node *, struct sk_buff *, + enum nes_timer_type, int, int); + +int nes_accept(struct iw_cm_id *, struct 
iw_cm_conn_param *); +int nes_reject(struct iw_cm_id *, const void *, u8); +int nes_connect(struct iw_cm_id *, struct iw_cm_conn_param *); +int nes_create_listen(struct iw_cm_id *, int); +int nes_destroy_listen(struct iw_cm_id *); + +int nes_cm_recv(struct sk_buff *, struct net_device *); +int nes_cm_start(void); +int nes_cm_stop(void); +int nes_add_ref_cm_node(struct nes_cm_node *cm_node); +int nes_rem_ref_cm_node(struct nes_cm_node *cm_node); + +#endif /* NES_CM_H */ diff --git a/kernel/drivers/infiniband/hw/nes/nes_context.h b/kernel/drivers/infiniband/hw/nes/nes_context.h new file mode 100644 index 000000000..a69eef16d --- /dev/null +++ b/kernel/drivers/infiniband/hw/nes/nes_context.h @@ -0,0 +1,193 @@ +/* + * Copyright (c) 2006 - 2011 Intel Corporation. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#ifndef NES_CONTEXT_H +#define NES_CONTEXT_H + +struct nes_qp_context { + __le32 misc; + __le32 cqs; + __le32 sq_addr_low; + __le32 sq_addr_high; + __le32 rq_addr_low; + __le32 rq_addr_high; + __le32 misc2; + __le16 tcpPorts[2]; + __le32 ip0; + __le32 ip1; + __le32 ip2; + __le32 ip3; + __le32 mss; + __le32 arp_index_vlan; + __le32 tcp_state_flow_label; + __le32 pd_index_wscale; + __le32 keepalive; + u32 ts_recent; + u32 ts_age; + __le32 snd_nxt; + __le32 snd_wnd; + __le32 rcv_nxt; + __le32 rcv_wnd; + __le32 snd_max; + __le32 snd_una; + u32 srtt; + __le32 rttvar; + __le32 ssthresh; + __le32 cwnd; + __le32 snd_wl1; + __le32 snd_wl2; + __le32 max_snd_wnd; + __le32 ts_val_delta; + u32 retransmit; + u32 probe_cnt; + u32 hte_index; + __le32 q2_addr_low; + __le32 q2_addr_high; + __le32 ird_index; + u32 Rsvd3; + __le32 ird_ord_sizes; + u32 mrkr_offset; + __le32 aeq_token_low; + __le32 aeq_token_high; +}; + +/* QP Context Misc Field */ + +#define NES_QPCONTEXT_MISC_IWARP_VER_MASK 0x00000003 +#define NES_QPCONTEXT_MISC_IWARP_VER_SHIFT 0 +#define NES_QPCONTEXT_MISC_EFB_SIZE_MASK 0x000000C0 +#define NES_QPCONTEXT_MISC_EFB_SIZE_SHIFT 6 +#define NES_QPCONTEXT_MISC_RQ_SIZE_MASK 0x00000300 +#define NES_QPCONTEXT_MISC_RQ_SIZE_SHIFT 8 +#define NES_QPCONTEXT_MISC_SQ_SIZE_MASK 0x00000c00 +#define NES_QPCONTEXT_MISC_SQ_SIZE_SHIFT 10 +#define NES_QPCONTEXT_MISC_PCI_FCN_MASK 0x00007000 +#define NES_QPCONTEXT_MISC_PCI_FCN_SHIFT 12 +#define NES_QPCONTEXT_MISC_DUP_ACKS_MASK 0x00070000 +#define NES_QPCONTEXT_MISC_DUP_ACKS_SHIFT 16 + +enum nes_qp_context_misc_bits { + NES_QPCONTEXT_MISC_RX_WQE_SIZE = 0x00000004, + NES_QPCONTEXT_MISC_IPV4 = 0x00000008, + NES_QPCONTEXT_MISC_DO_NOT_FRAG = 0x00000010, + NES_QPCONTEXT_MISC_INSERT_VLAN = 0x00000020, + NES_QPCONTEXT_MISC_DROS = 0x00008000, + NES_QPCONTEXT_MISC_WSCALE = 0x00080000, + NES_QPCONTEXT_MISC_KEEPALIVE = 0x00100000, + NES_QPCONTEXT_MISC_TIMESTAMP = 0x00200000, + NES_QPCONTEXT_MISC_SACK = 0x00400000, + NES_QPCONTEXT_MISC_RDMA_WRITE_EN = 0x00800000, + NES_QPCONTEXT_MISC_RDMA_READ_EN = 0x01000000, + NES_QPCONTEXT_MISC_WBIND_EN = 0x10000000, + NES_QPCONTEXT_MISC_FAST_REGISTER_EN = 0x20000000, + NES_QPCONTEXT_MISC_PRIV_EN = 0x40000000, + NES_QPCONTEXT_MISC_NO_NAGLE = 0x80000000 +}; + +enum nes_qp_acc_wq_sizes { + HCONTEXT_TSA_WQ_SIZE_4 = 0, + HCONTEXT_TSA_WQ_SIZE_32 = 1, + HCONTEXT_TSA_WQ_SIZE_128 = 2, + HCONTEXT_TSA_WQ_SIZE_512 = 3 +}; + +/* QP Context Misc2 Fields */ +#define NES_QPCONTEXT_MISC2_TTL_MASK 0x000000ff +#define NES_QPCONTEXT_MISC2_TTL_SHIFT 0 +#define NES_QPCONTEXT_MISC2_HOP_LIMIT_MASK 0x000000ff +#define NES_QPCONTEXT_MISC2_HOP_LIMIT_SHIFT 0 +#define NES_QPCONTEXT_MISC2_LIMIT_MASK 0x00000300 +#define NES_QPCONTEXT_MISC2_LIMIT_SHIFT 8 +#define NES_QPCONTEXT_MISC2_NIC_INDEX_MASK 0x0000fc00 +#define NES_QPCONTEXT_MISC2_NIC_INDEX_SHIFT 10 +#define NES_QPCONTEXT_MISC2_SRC_IP_MASK 0x001f0000 +#define NES_QPCONTEXT_MISC2_SRC_IP_SHIFT 16 +#define NES_QPCONTEXT_MISC2_TOS_MASK 0xff000000 +#define NES_QPCONTEXT_MISC2_TOS_SHIFT 24 +#define NES_QPCONTEXT_MISC2_TRAFFIC_CLASS_MASK 0xff000000 +#define NES_QPCONTEXT_MISC2_TRAFFIC_CLASS_SHIFT 24 + +/* QP Context Tcp State/Flow Label Fields */ +#define NES_QPCONTEXT_TCPFLOW_FLOW_LABEL_MASK 0x000fffff +#define NES_QPCONTEXT_TCPFLOW_FLOW_LABEL_SHIFT 0 +#define NES_QPCONTEXT_TCPFLOW_TCP_STATE_MASK 0xf0000000 +#define NES_QPCONTEXT_TCPFLOW_TCP_STATE_SHIFT 28 + +enum nes_qp_tcp_state { + NES_QPCONTEXT_TCPSTATE_CLOSED = 1, + NES_QPCONTEXT_TCPSTATE_EST = 5, + NES_QPCONTEXT_TCPSTATE_TIME_WAIT = 11, +}; + +/* QP Context PD 
Index/wscale Fields */ +#define NES_QPCONTEXT_PDWSCALE_RCV_WSCALE_MASK 0x0000000f +#define NES_QPCONTEXT_PDWSCALE_RCV_WSCALE_SHIFT 0 +#define NES_QPCONTEXT_PDWSCALE_SND_WSCALE_MASK 0x00000f00 +#define NES_QPCONTEXT_PDWSCALE_SND_WSCALE_SHIFT 8 +#define NES_QPCONTEXT_PDWSCALE_PDINDEX_MASK 0xffff0000 +#define NES_QPCONTEXT_PDWSCALE_PDINDEX_SHIFT 16 + +/* QP Context Keepalive Fields */ +#define NES_QPCONTEXT_KEEPALIVE_DELTA_MASK 0x0000ffff +#define NES_QPCONTEXT_KEEPALIVE_DELTA_SHIFT 0 +#define NES_QPCONTEXT_KEEPALIVE_PROBE_CNT_MASK 0x00ff0000 +#define NES_QPCONTEXT_KEEPALIVE_PROBE_CNT_SHIFT 16 +#define NES_QPCONTEXT_KEEPALIVE_INTV_MASK 0xff000000 +#define NES_QPCONTEXT_KEEPALIVE_INTV_SHIFT 24 + +/* QP Context ORD/IRD Fields */ +#define NES_QPCONTEXT_ORDIRD_ORDSIZE_MASK 0x0000007f +#define NES_QPCONTEXT_ORDIRD_ORDSIZE_SHIFT 0 +#define NES_QPCONTEXT_ORDIRD_IRDSIZE_MASK 0x00030000 +#define NES_QPCONTEXT_ORDIRD_IRDSIZE_SHIFT 16 +#define NES_QPCONTEXT_ORDIRD_IWARP_MODE_MASK 0x30000000 +#define NES_QPCONTEXT_ORDIRD_IWARP_MODE_SHIFT 28 + +enum nes_ord_ird_bits { + NES_QPCONTEXT_ORDIRD_WRPDU = 0x02000000, + NES_QPCONTEXT_ORDIRD_LSMM_PRESENT = 0x04000000, + NES_QPCONTEXT_ORDIRD_ALSMM = 0x08000000, + NES_QPCONTEXT_ORDIRD_AAH = 0x40000000, + NES_QPCONTEXT_ORDIRD_RNMC = 0x80000000 +}; + +enum nes_iwarp_qp_state { + NES_QPCONTEXT_IWARP_STATE_NONEXIST = 0, + NES_QPCONTEXT_IWARP_STATE_IDLE = 1, + NES_QPCONTEXT_IWARP_STATE_RTS = 2, + NES_QPCONTEXT_IWARP_STATE_CLOSING = 3, + NES_QPCONTEXT_IWARP_STATE_TERMINATE = 5, + NES_QPCONTEXT_IWARP_STATE_ERROR = 6 +}; + + +#endif /* NES_CONTEXT_H */ diff --git a/kernel/drivers/infiniband/hw/nes/nes_hw.c b/kernel/drivers/infiniband/hw/nes/nes_hw.c new file mode 100644 index 000000000..02120d340 --- /dev/null +++ b/kernel/drivers/infiniband/hw/nes/nes_hw.c @@ -0,0 +1,3937 @@ +/* + * Copyright (c) 2006 - 2011 Intel Corporation. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + */ + +#include <linux/module.h> +#include <linux/moduleparam.h> +#include <linux/netdevice.h> +#include <linux/etherdevice.h> +#include <linux/if_vlan.h> +#include <linux/inet_lro.h> +#include <linux/slab.h> +#include <net/tcp.h> +#include <net/ip.h> + +#include "nes.h" + +static unsigned int nes_lro_max_aggr = NES_LRO_MAX_AGGR; +module_param(nes_lro_max_aggr, uint, 0444); +MODULE_PARM_DESC(nes_lro_max_aggr, "NIC LRO max packet aggregation"); + +static int wide_ppm_offset; +module_param(wide_ppm_offset, int, 0644); +MODULE_PARM_DESC(wide_ppm_offset, "Increase CX4 interface clock ppm offset, 0=100ppm (default), 1=300ppm"); + +static u32 crit_err_count; +u32 int_mod_timer_init; +u32 int_mod_cq_depth_256; +u32 int_mod_cq_depth_128; +u32 int_mod_cq_depth_32; +u32 int_mod_cq_depth_24; +u32 int_mod_cq_depth_16; +u32 int_mod_cq_depth_4; +u32 int_mod_cq_depth_1; +static const u8 nes_max_critical_error_count = 100; +#include "nes_cm.h" + +static void nes_cqp_ce_handler(struct nes_device *nesdev, struct nes_hw_cq *cq); +static void nes_init_csr_ne020(struct nes_device *nesdev, u8 hw_rev, u8 port_count); +static int nes_init_serdes(struct nes_device *nesdev, u8 hw_rev, u8 port_count, + struct nes_adapter *nesadapter, u8 OneG_Mode); +static void nes_nic_napi_ce_handler(struct nes_device *nesdev, struct nes_hw_nic_cq *cq); +static void nes_process_aeq(struct nes_device *nesdev, struct nes_hw_aeq *aeq); +static void nes_process_ceq(struct nes_device *nesdev, struct nes_hw_ceq *ceq); +static void nes_process_iwarp_aeqe(struct nes_device *nesdev, + struct nes_hw_aeqe *aeqe); +static void process_critical_error(struct nes_device *nesdev); +static void nes_process_mac_intr(struct nes_device *nesdev, u32 mac_number); +static unsigned int nes_reset_adapter_ne020(struct nes_device *nesdev, u8 *OneG_Mode); +static void nes_terminate_start_timer(struct nes_qp *nesqp); + +#ifdef CONFIG_INFINIBAND_NES_DEBUG +static unsigned char *nes_iwarp_state_str[] = { + "Non-Existent", + "Idle", + "RTS", + "Closing", + "RSVD1", + "Terminate", + "Error", + "RSVD2", +}; + +static unsigned char *nes_tcp_state_str[] = { + "Non-Existent", + "Closed", + "Listen", + "SYN Sent", + "SYN Rcvd", + "Established", + "Close Wait", + "FIN Wait 1", + "Closing", + "Last Ack", + "FIN Wait 2", + "Time Wait", + "RSVD1", + "RSVD2", + "RSVD3", + "RSVD4", +}; +#endif + +static inline void print_ip(struct nes_cm_node *cm_node) +{ + unsigned char *rem_addr; + if (cm_node) { + rem_addr = (unsigned char *)&cm_node->rem_addr; + printk(KERN_ERR PFX "Remote IP addr: %pI4\n", rem_addr); + } +} + +/** + * nes_nic_init_timer_defaults + */ +void nes_nic_init_timer_defaults(struct nes_device *nesdev, u8 jumbomode) +{ + unsigned long flags; + struct nes_adapter *nesadapter = nesdev->nesadapter; + struct nes_hw_tune_timer *shared_timer = &nesadapter->tune_timer; + + spin_lock_irqsave(&nesadapter->periodic_timer_lock, flags); + + shared_timer->timer_in_use_min = NES_NIC_FAST_TIMER_LOW; + shared_timer->timer_in_use_max = NES_NIC_FAST_TIMER_HIGH; + if (jumbomode) { + shared_timer->threshold_low = DEFAULT_JUMBO_NES_QL_LOW; + shared_timer->threshold_target = DEFAULT_JUMBO_NES_QL_TARGET; + shared_timer->threshold_high = DEFAULT_JUMBO_NES_QL_HIGH; + } else { + shared_timer->threshold_low = DEFAULT_NES_QL_LOW; + shared_timer->threshold_target = DEFAULT_NES_QL_TARGET; + shared_timer->threshold_high = DEFAULT_NES_QL_HIGH; + } + + /* todo use netdev->mtu to set thresholds */ + spin_unlock_irqrestore(&nesadapter->periodic_timer_lock, flags); +} + + +/** + * nes_nic_init_timer + */ +static void nes_nic_init_timer(struct nes_device *nesdev) +{ + unsigned long flags; + struct nes_adapter *nesadapter = nesdev->nesadapter; +
struct nes_hw_tune_timer *shared_timer = &nesadapter->tune_timer; + + spin_lock_irqsave(&nesadapter->periodic_timer_lock, flags); + + if (shared_timer->timer_in_use_old == 0) { + nesdev->deepcq_count = 0; + shared_timer->timer_direction_upward = 0; + shared_timer->timer_direction_downward = 0; + shared_timer->timer_in_use = NES_NIC_FAST_TIMER; + shared_timer->timer_in_use_old = 0; + + } + if (shared_timer->timer_in_use != shared_timer->timer_in_use_old) { + shared_timer->timer_in_use_old = shared_timer->timer_in_use; + nes_write32(nesdev->regs+NES_PERIODIC_CONTROL, + 0x80000000 | ((u32)(shared_timer->timer_in_use*8))); + } + /* todo use netdev->mtu to set thresholds */ + spin_unlock_irqrestore(&nesadapter->periodic_timer_lock, flags); +} + + +/** + * nes_nic_tune_timer + */ +static void nes_nic_tune_timer(struct nes_device *nesdev) +{ + unsigned long flags; + struct nes_adapter *nesadapter = nesdev->nesadapter; + struct nes_hw_tune_timer *shared_timer = &nesadapter->tune_timer; + u16 cq_count = nesdev->currcq_count; + + spin_lock_irqsave(&nesadapter->periodic_timer_lock, flags); + + if (shared_timer->cq_count_old <= cq_count) + shared_timer->cq_direction_downward = 0; + else + shared_timer->cq_direction_downward++; + shared_timer->cq_count_old = cq_count; + if (shared_timer->cq_direction_downward > NES_NIC_CQ_DOWNWARD_TREND) { + if (cq_count <= shared_timer->threshold_low && + shared_timer->threshold_low > 4) { + shared_timer->threshold_low = shared_timer->threshold_low/2; + shared_timer->cq_direction_downward=0; + nesdev->currcq_count = 0; + spin_unlock_irqrestore(&nesadapter->periodic_timer_lock, flags); + return; + } + } + + if (cq_count > 1) { + nesdev->deepcq_count += cq_count; + if (cq_count <= shared_timer->threshold_low) { /* increase timer gently */ + shared_timer->timer_direction_upward++; + shared_timer->timer_direction_downward = 0; + } else if (cq_count <= shared_timer->threshold_target) { /* balanced */ + shared_timer->timer_direction_upward = 0; + shared_timer->timer_direction_downward = 0; + } else if (cq_count <= shared_timer->threshold_high) { /* decrease timer gently */ + shared_timer->timer_direction_downward++; + shared_timer->timer_direction_upward = 0; + } else if (cq_count <= (shared_timer->threshold_high) * 2) { + shared_timer->timer_in_use -= 2; + shared_timer->timer_direction_upward = 0; + shared_timer->timer_direction_downward++; + } else { + shared_timer->timer_in_use -= 4; + shared_timer->timer_direction_upward = 0; + shared_timer->timer_direction_downward++; + } + + if (shared_timer->timer_direction_upward > 3 ) { /* using history */ + shared_timer->timer_in_use += 3; + shared_timer->timer_direction_upward = 0; + shared_timer->timer_direction_downward = 0; + } + if (shared_timer->timer_direction_downward > 5) { /* using history */ + shared_timer->timer_in_use -= 4 ; + shared_timer->timer_direction_downward = 0; + shared_timer->timer_direction_upward = 0; + } + } + + /* boundary checking */ + if (shared_timer->timer_in_use > shared_timer->threshold_high) + shared_timer->timer_in_use = shared_timer->threshold_high; + else if (shared_timer->timer_in_use < shared_timer->threshold_low) + shared_timer->timer_in_use = shared_timer->threshold_low; + + nesdev->currcq_count = 0; + + spin_unlock_irqrestore(&nesadapter->periodic_timer_lock, flags); +} + + +/** + * nes_init_adapter - initialize adapter + */ +struct nes_adapter *nes_init_adapter(struct nes_device *nesdev, u8 hw_rev) { + struct nes_adapter *nesadapter = NULL; + unsigned long num_pds; + u32 u32temp; + u32 
port_count; + u16 max_rq_wrs; + u16 max_sq_wrs; + u32 max_mr; + u32 max_256pbl; + u32 max_4kpbl; + u32 max_qp; + u32 max_irrq; + u32 max_cq; + u32 hte_index_mask; + u32 adapter_size; + u32 arp_table_size; + u16 vendor_id; + u16 device_id; + u8 OneG_Mode; + u8 func_index; + + /* search the list of existing adapters */ + list_for_each_entry(nesadapter, &nes_adapter_list, list) { + nes_debug(NES_DBG_INIT, "Searching Adapter list for PCI devfn = 0x%X," + " adapter PCI slot/bus = %u/%u, pci devices PCI slot/bus = %u/%u, .\n", + nesdev->pcidev->devfn, + PCI_SLOT(nesadapter->devfn), + nesadapter->bus_number, + PCI_SLOT(nesdev->pcidev->devfn), + nesdev->pcidev->bus->number ); + if ((PCI_SLOT(nesadapter->devfn) == PCI_SLOT(nesdev->pcidev->devfn)) && + (nesadapter->bus_number == nesdev->pcidev->bus->number)) { + nesadapter->ref_count++; + return nesadapter; + } + } + + /* no adapter found */ + num_pds = pci_resource_len(nesdev->pcidev, BAR_1) >> PAGE_SHIFT; + if ((hw_rev != NE020_REV) && (hw_rev != NE020_REV1)) { + nes_debug(NES_DBG_INIT, "NE020 driver detected unknown hardware revision 0x%x\n", + hw_rev); + return NULL; + } + + nes_debug(NES_DBG_INIT, "Determine Soft Reset, QP_control=0x%x, CPU0=0x%x, CPU1=0x%x, CPU2=0x%x\n", + nes_read_indexed(nesdev, NES_IDX_QP_CONTROL + PCI_FUNC(nesdev->pcidev->devfn) * 8), + nes_read_indexed(nesdev, NES_IDX_INT_CPU_STATUS), + nes_read_indexed(nesdev, NES_IDX_INT_CPU_STATUS + 4), + nes_read_indexed(nesdev, NES_IDX_INT_CPU_STATUS + 8)); + + nes_debug(NES_DBG_INIT, "Reset and init NE020\n"); + + + if ((port_count = nes_reset_adapter_ne020(nesdev, &OneG_Mode)) == 0) + return NULL; + + max_qp = nes_read_indexed(nesdev, NES_IDX_QP_CTX_SIZE); + nes_debug(NES_DBG_INIT, "QP_CTX_SIZE=%u\n", max_qp); + + u32temp = nes_read_indexed(nesdev, NES_IDX_QUAD_HASH_TABLE_SIZE); + if (max_qp > ((u32)1 << (u32temp & 0x001f))) { + nes_debug(NES_DBG_INIT, "Reducing Max QPs to %u due to hash table size = 0x%08X\n", + max_qp, u32temp); + max_qp = (u32)1 << (u32temp & 0x001f); + } + + hte_index_mask = ((u32)1 << ((u32temp & 0x001f)+1))-1; + nes_debug(NES_DBG_INIT, "Max QP = %u, hte_index_mask = 0x%08X.\n", + max_qp, hte_index_mask); + + u32temp = nes_read_indexed(nesdev, NES_IDX_IRRQ_COUNT); + + max_irrq = 1 << (u32temp & 0x001f); + + if (max_qp > max_irrq) { + max_qp = max_irrq; + nes_debug(NES_DBG_INIT, "Reducing Max QPs to %u due to Available Q1s.\n", + max_qp); + } + + /* there should be no reason to allocate more pds than qps */ + if (num_pds > max_qp) + num_pds = max_qp; + + u32temp = nes_read_indexed(nesdev, NES_IDX_MRT_SIZE); + max_mr = (u32)8192 << (u32temp & 0x7); + + u32temp = nes_read_indexed(nesdev, NES_IDX_PBL_REGION_SIZE); + max_256pbl = (u32)1 << (u32temp & 0x0000001f); + max_4kpbl = (u32)1 << ((u32temp >> 16) & 0x0000001f); + max_cq = nes_read_indexed(nesdev, NES_IDX_CQ_CTX_SIZE); + + u32temp = nes_read_indexed(nesdev, NES_IDX_ARP_CACHE_SIZE); + arp_table_size = 1 << u32temp; + + adapter_size = (sizeof(struct nes_adapter) + + (sizeof(unsigned long)-1)) & (~(sizeof(unsigned long)-1)); + adapter_size += sizeof(unsigned long) * BITS_TO_LONGS(max_qp); + adapter_size += sizeof(unsigned long) * BITS_TO_LONGS(max_mr); + adapter_size += sizeof(unsigned long) * BITS_TO_LONGS(max_cq); + adapter_size += sizeof(unsigned long) * BITS_TO_LONGS(num_pds); + adapter_size += sizeof(unsigned long) * BITS_TO_LONGS(arp_table_size); + adapter_size += sizeof(struct nes_qp **) * max_qp; + + /* allocate a new adapter struct */ + nesadapter = kzalloc(adapter_size, GFP_KERNEL); + if (nesadapter == 
NULL) { + return NULL; + } + + nes_debug(NES_DBG_INIT, "Allocating new nesadapter @ %p, size = %u (actual size = %u).\n", + nesadapter, (u32)sizeof(struct nes_adapter), adapter_size); + + if (nes_read_eeprom_values(nesdev, nesadapter)) { + printk(KERN_ERR PFX "Unable to read EEPROM data.\n"); + kfree(nesadapter); + return NULL; + } + + nesadapter->vendor_id = (((u32) nesadapter->mac_addr_high) << 8) | + (nesadapter->mac_addr_low >> 24); + + pci_bus_read_config_word(nesdev->pcidev->bus, nesdev->pcidev->devfn, + PCI_DEVICE_ID, &device_id); + nesadapter->vendor_part_id = device_id; + + if (nes_init_serdes(nesdev, hw_rev, port_count, nesadapter, + OneG_Mode)) { + kfree(nesadapter); + return NULL; + } + nes_init_csr_ne020(nesdev, hw_rev, port_count); + + memset(nesadapter->pft_mcast_map, 255, + sizeof nesadapter->pft_mcast_map); + + /* populate the new nesadapter */ + nesadapter->devfn = nesdev->pcidev->devfn; + nesadapter->bus_number = nesdev->pcidev->bus->number; + nesadapter->ref_count = 1; + nesadapter->timer_int_req = 0xffff0000; + nesadapter->OneG_Mode = OneG_Mode; + nesadapter->doorbell_start = nesdev->doorbell_region; + + /* nesadapter->tick_delta = clk_divisor; */ + nesadapter->hw_rev = hw_rev; + nesadapter->port_count = port_count; + + nesadapter->max_qp = max_qp; + nesadapter->hte_index_mask = hte_index_mask; + nesadapter->max_irrq = max_irrq; + nesadapter->max_mr = max_mr; + nesadapter->max_256pbl = max_256pbl - 1; + nesadapter->max_4kpbl = max_4kpbl - 1; + nesadapter->max_cq = max_cq; + nesadapter->free_256pbl = max_256pbl - 1; + nesadapter->free_4kpbl = max_4kpbl - 1; + nesadapter->max_pd = num_pds; + nesadapter->arp_table_size = arp_table_size; + + nesadapter->et_pkt_rate_low = NES_TIMER_ENABLE_LIMIT; + if (nes_drv_opt & NES_DRV_OPT_DISABLE_INT_MOD) { + nesadapter->et_use_adaptive_rx_coalesce = 0; + nesadapter->timer_int_limit = NES_TIMER_INT_LIMIT; + nesadapter->et_rx_coalesce_usecs_irq = interrupt_mod_interval; + } else { + nesadapter->et_use_adaptive_rx_coalesce = 1; + nesadapter->timer_int_limit = NES_TIMER_INT_LIMIT_DYNAMIC; + nesadapter->et_rx_coalesce_usecs_irq = 0; + printk(PFX "%s: Using Adaptive Interrupt Moderation\n", __func__); + } + /* Setup and enable the periodic timer */ + if (nesadapter->et_rx_coalesce_usecs_irq) + nes_write32(nesdev->regs+NES_PERIODIC_CONTROL, 0x80000000 | + ((u32)(nesadapter->et_rx_coalesce_usecs_irq * 8))); + else + nes_write32(nesdev->regs+NES_PERIODIC_CONTROL, 0x00000000); + + nesadapter->base_pd = 1; + + nesadapter->device_cap_flags = IB_DEVICE_LOCAL_DMA_LKEY | + IB_DEVICE_MEM_WINDOW | + IB_DEVICE_MEM_MGT_EXTENSIONS; + + nesadapter->allocated_qps = (unsigned long *)&(((unsigned char *)nesadapter) + [(sizeof(struct nes_adapter)+(sizeof(unsigned long)-1))&(~(sizeof(unsigned long)-1))]); + nesadapter->allocated_cqs = &nesadapter->allocated_qps[BITS_TO_LONGS(max_qp)]; + nesadapter->allocated_mrs = &nesadapter->allocated_cqs[BITS_TO_LONGS(max_cq)]; + nesadapter->allocated_pds = &nesadapter->allocated_mrs[BITS_TO_LONGS(max_mr)]; + nesadapter->allocated_arps = &nesadapter->allocated_pds[BITS_TO_LONGS(num_pds)]; + nesadapter->qp_table = (struct nes_qp **)(&nesadapter->allocated_arps[BITS_TO_LONGS(arp_table_size)]); + + + /* mark the usual suspect QPs, MR and CQs as in use */ + for (u32temp = 0; u32temp < NES_FIRST_QPN; u32temp++) { + set_bit(u32temp, nesadapter->allocated_qps); + set_bit(u32temp, nesadapter->allocated_cqs); + } + set_bit(0, nesadapter->allocated_mrs); + + for (u32temp = 0; u32temp < 20; u32temp++) + set_bit(u32temp, 
nesadapter->allocated_pds); + u32temp = nes_read_indexed(nesdev, NES_IDX_QP_MAX_CFG_SIZES); + + max_rq_wrs = ((u32temp >> 8) & 3); + switch (max_rq_wrs) { + case 0: + max_rq_wrs = 4; + break; + case 1: + max_rq_wrs = 16; + break; + case 2: + max_rq_wrs = 32; + break; + case 3: + max_rq_wrs = 512; + break; + } + + max_sq_wrs = (u32temp & 3); + switch (max_sq_wrs) { + case 0: + max_sq_wrs = 4; + break; + case 1: + max_sq_wrs = 16; + break; + case 2: + max_sq_wrs = 32; + break; + case 3: + max_sq_wrs = 512; + break; + } + nesadapter->max_qp_wr = min(max_rq_wrs, max_sq_wrs); + nesadapter->max_irrq_wr = (u32temp >> 16) & 3; + + nesadapter->max_sge = 4; + nesadapter->max_cqe = 32766; + + if (nes_read_eeprom_values(nesdev, nesadapter)) { + printk(KERN_ERR PFX "Unable to read EEPROM data.\n"); + kfree(nesadapter); + return NULL; + } + + u32temp = nes_read_indexed(nesdev, NES_IDX_TCP_TIMER_CONFIG); + nes_write_indexed(nesdev, NES_IDX_TCP_TIMER_CONFIG, + (u32temp & 0xff000000) | (nesadapter->tcp_timer_core_clk_divisor & 0x00ffffff)); + + /* setup port configuration */ + if (nesadapter->port_count == 1) { + nesadapter->log_port = 0x00000000; + if (nes_drv_opt & NES_DRV_OPT_DUAL_LOGICAL_PORT) + nes_write_indexed(nesdev, NES_IDX_TX_POOL_SIZE, 0x00000002); + else + nes_write_indexed(nesdev, NES_IDX_TX_POOL_SIZE, 0x00000003); + } else { + if (nesadapter->phy_type[0] == NES_PHY_TYPE_PUMA_1G) { + nesadapter->log_port = 0x000000D8; + } else { + if (nesadapter->port_count == 2) + nesadapter->log_port = 0x00000044; + else + nesadapter->log_port = 0x000000e4; + } + nes_write_indexed(nesdev, NES_IDX_TX_POOL_SIZE, 0x00000003); + } + + nes_write_indexed(nesdev, NES_IDX_NIC_LOGPORT_TO_PHYPORT, + nesadapter->log_port); + nes_debug(NES_DBG_INIT, "Probe time, LOG2PHY=%u\n", + nes_read_indexed(nesdev, NES_IDX_NIC_LOGPORT_TO_PHYPORT)); + + spin_lock_init(&nesadapter->resource_lock); + spin_lock_init(&nesadapter->phy_lock); + spin_lock_init(&nesadapter->pbl_lock); + spin_lock_init(&nesadapter->periodic_timer_lock); + + INIT_LIST_HEAD(&nesadapter->nesvnic_list[0]); + INIT_LIST_HEAD(&nesadapter->nesvnic_list[1]); + INIT_LIST_HEAD(&nesadapter->nesvnic_list[2]); + INIT_LIST_HEAD(&nesadapter->nesvnic_list[3]); + + if ((!nesadapter->OneG_Mode) && (nesadapter->port_count == 2)) { + u32 pcs_control_status0, pcs_control_status1; + u32 reset_value; + u32 i = 0; + u32 int_cnt = 0; + u32 ext_cnt = 0; + unsigned long flags; + u32 j = 0; + + pcs_control_status0 = nes_read_indexed(nesdev, + NES_IDX_PHY_PCS_CONTROL_STATUS0); + pcs_control_status1 = nes_read_indexed(nesdev, + NES_IDX_PHY_PCS_CONTROL_STATUS0 + 0x200); + + for (i = 0; i < NES_MAX_LINK_CHECK; i++) { + pcs_control_status0 = nes_read_indexed(nesdev, + NES_IDX_PHY_PCS_CONTROL_STATUS0); + pcs_control_status1 = nes_read_indexed(nesdev, + NES_IDX_PHY_PCS_CONTROL_STATUS0 + 0x200); + if ((0x0F000100 == (pcs_control_status0 & 0x0F000100)) + || (0x0F000100 == (pcs_control_status1 & 0x0F000100))) + int_cnt++; + msleep(1); + } + if (int_cnt > 1) { + spin_lock_irqsave(&nesadapter->phy_lock, flags); + nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_COMMON_CONTROL1, 0x0000F0C8); + mh_detected++; + reset_value = nes_read32(nesdev->regs+NES_SOFTWARE_RESET); + reset_value |= 0x0000003d; + nes_write32(nesdev->regs+NES_SOFTWARE_RESET, reset_value); + + while (((nes_read32(nesdev->regs+NES_SOFTWARE_RESET) + & 0x00000040) != 0x00000040) && (j++ < 5000)); + spin_unlock_irqrestore(&nesadapter->phy_lock, flags); + + pcs_control_status0 = nes_read_indexed(nesdev, + NES_IDX_PHY_PCS_CONTROL_STATUS0); + 
pcs_control_status1 = nes_read_indexed(nesdev, + NES_IDX_PHY_PCS_CONTROL_STATUS0 + 0x200); + + for (i = 0; i < NES_MAX_LINK_CHECK; i++) { + pcs_control_status0 = nes_read_indexed(nesdev, + NES_IDX_PHY_PCS_CONTROL_STATUS0); + pcs_control_status1 = nes_read_indexed(nesdev, + NES_IDX_PHY_PCS_CONTROL_STATUS0 + 0x200); + if ((0x0F000100 == (pcs_control_status0 & 0x0F000100)) + || (0x0F000100 == (pcs_control_status1 & 0x0F000100))) { + if (++ext_cnt > int_cnt) { + spin_lock_irqsave(&nesadapter->phy_lock, flags); + nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_COMMON_CONTROL1, + 0x0000F088); + mh_detected++; + reset_value = nes_read32(nesdev->regs+NES_SOFTWARE_RESET); + reset_value |= 0x0000003d; + nes_write32(nesdev->regs+NES_SOFTWARE_RESET, reset_value); + + while (((nes_read32(nesdev->regs+NES_SOFTWARE_RESET) + & 0x00000040) != 0x00000040) && (j++ < 5000)); + spin_unlock_irqrestore(&nesadapter->phy_lock, flags); + break; + } + } + msleep(1); + } + } + } + + if (nesadapter->hw_rev == NE020_REV) { + init_timer(&nesadapter->mh_timer); + nesadapter->mh_timer.function = nes_mh_fix; + nesadapter->mh_timer.expires = jiffies + (HZ/5); /* 1 second */ + nesadapter->mh_timer.data = (unsigned long)nesdev; + add_timer(&nesadapter->mh_timer); + } else { + nes_write32(nesdev->regs+NES_INTF_INT_STAT, 0x0f000000); + } + + init_timer(&nesadapter->lc_timer); + nesadapter->lc_timer.function = nes_clc; + nesadapter->lc_timer.expires = jiffies + 3600 * HZ; /* 1 hour */ + nesadapter->lc_timer.data = (unsigned long)nesdev; + add_timer(&nesadapter->lc_timer); + + list_add_tail(&nesadapter->list, &nes_adapter_list); + + for (func_index = 0; func_index < 8; func_index++) { + pci_bus_read_config_word(nesdev->pcidev->bus, + PCI_DEVFN(PCI_SLOT(nesdev->pcidev->devfn), + func_index), 0, &vendor_id); + if (vendor_id == 0xffff) + break; + } + nes_debug(NES_DBG_INIT, "%s %d functions found for %s.\n", __func__, + func_index, pci_name(nesdev->pcidev)); + nesadapter->adapter_fcn_count = func_index; + + return nesadapter; +} + + +/** + * nes_reset_adapter_ne020 + */ +static unsigned int nes_reset_adapter_ne020(struct nes_device *nesdev, u8 *OneG_Mode) +{ + u32 port_count; + u32 u32temp; + u32 i; + + u32temp = nes_read32(nesdev->regs+NES_SOFTWARE_RESET); + port_count = ((u32temp & 0x00000300) >> 8) + 1; + /* TODO: assuming that both SERDES are set the same for now */ + *OneG_Mode = (u32temp & 0x00003c00) ? 
0 : 1; + nes_debug(NES_DBG_INIT, "Initial Software Reset = 0x%08X, port_count=%u\n", + u32temp, port_count); + if (*OneG_Mode) + nes_debug(NES_DBG_INIT, "Running in 1G mode.\n"); + u32temp &= 0xff00ffc0; + switch (port_count) { + case 1: + u32temp |= 0x00ee0000; + break; + case 2: + u32temp |= 0x00cc0000; + break; + case 4: + u32temp |= 0x00000000; + break; + default: + return 0; + break; + } + + /* check and do full reset if needed */ + if (nes_read_indexed(nesdev, NES_IDX_QP_CONTROL+(PCI_FUNC(nesdev->pcidev->devfn)*8))) { + nes_debug(NES_DBG_INIT, "Issuing Full Soft reset = 0x%08X\n", u32temp | 0xd); + nes_write32(nesdev->regs+NES_SOFTWARE_RESET, u32temp | 0xd); + + i = 0; + while (((nes_read32(nesdev->regs+NES_SOFTWARE_RESET) & 0x00000040) == 0) && i++ < 10000) + mdelay(1); + if (i > 10000) { + nes_debug(NES_DBG_INIT, "Did not see full soft reset done.\n"); + return 0; + } + + i = 0; + while ((nes_read_indexed(nesdev, NES_IDX_INT_CPU_STATUS) != 0x80) && i++ < 10000) + mdelay(1); + if (i > 10000) { + printk(KERN_ERR PFX "Internal CPU not ready, status = %02X\n", + nes_read_indexed(nesdev, NES_IDX_INT_CPU_STATUS)); + return 0; + } + } + + /* port reset */ + switch (port_count) { + case 1: + u32temp |= 0x00ee0010; + break; + case 2: + u32temp |= 0x00cc0030; + break; + case 4: + u32temp |= 0x00000030; + break; + } + + nes_debug(NES_DBG_INIT, "Issuing Port Soft reset = 0x%08X\n", u32temp | 0xd); + nes_write32(nesdev->regs+NES_SOFTWARE_RESET, u32temp | 0xd); + + i = 0; + while (((nes_read32(nesdev->regs+NES_SOFTWARE_RESET) & 0x00000040) == 0) && i++ < 10000) + mdelay(1); + if (i > 10000) { + nes_debug(NES_DBG_INIT, "Did not see port soft reset done.\n"); + return 0; + } + + /* serdes 0 */ + i = 0; + while (((u32temp = (nes_read_indexed(nesdev, NES_IDX_ETH_SERDES_COMMON_STATUS0) + & 0x0000000f)) != 0x0000000f) && i++ < 5000) + mdelay(1); + if (i > 5000) { + nes_debug(NES_DBG_INIT, "Serdes 0 not ready, status=%x\n", u32temp); + return 0; + } + + /* serdes 1 */ + if (port_count > 1) { + i = 0; + while (((u32temp = (nes_read_indexed(nesdev, NES_IDX_ETH_SERDES_COMMON_STATUS1) + & 0x0000000f)) != 0x0000000f) && i++ < 5000) + mdelay(1); + if (i > 5000) { + nes_debug(NES_DBG_INIT, "Serdes 1 not ready, status=%x\n", u32temp); + return 0; + } + } + + return port_count; +} + + +/** + * nes_init_serdes + */ +static int nes_init_serdes(struct nes_device *nesdev, u8 hw_rev, u8 port_count, + struct nes_adapter *nesadapter, u8 OneG_Mode) +{ + int i; + u32 u32temp; + u32 sds; + + if (hw_rev != NE020_REV) { + /* init serdes 0 */ + switch (nesadapter->phy_type[0]) { + case NES_PHY_TYPE_CX4: + if (wide_ppm_offset) + nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_CDR_CONTROL0, 0x000FFFAA); + else + nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_CDR_CONTROL0, 0x000000FF); + break; + case NES_PHY_TYPE_KR: + nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_CDR_CONTROL0, 0x000000FF); + nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_TX_EMP0, 0x00000000); + break; + case NES_PHY_TYPE_PUMA_1G: + nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_CDR_CONTROL0, 0x000000FF); + sds = nes_read_indexed(nesdev, NES_IDX_ETH_SERDES_COMMON_CONTROL0); + sds |= 0x00000100; + nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_COMMON_CONTROL0, sds); + break; + default: + nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_CDR_CONTROL0, 0x000000FF); + break; + } + + if (!OneG_Mode) + nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_TX_HIGHZ_LANE_MODE0, 0x11110000); + + if (port_count < 2) + return 0; + + /* init serdes 1 */ + if (!(OneG_Mode && 
(nesadapter->phy_type[1] != NES_PHY_TYPE_PUMA_1G))) + nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_CDR_CONTROL1, 0x000000FF); + + switch (nesadapter->phy_type[1]) { + case NES_PHY_TYPE_ARGUS: + case NES_PHY_TYPE_SFP_D: + nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_TX_EMP0, 0x00000000); + nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_TX_EMP1, 0x00000000); + break; + case NES_PHY_TYPE_CX4: + if (wide_ppm_offset) + nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_CDR_CONTROL1, 0x000FFFAA); + break; + case NES_PHY_TYPE_KR: + nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_TX_EMP1, 0x00000000); + break; + case NES_PHY_TYPE_PUMA_1G: + sds = nes_read_indexed(nesdev, NES_IDX_ETH_SERDES_COMMON_CONTROL1); + sds |= 0x000000100; + nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_COMMON_CONTROL1, sds); + } + if (!OneG_Mode) { + nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_TX_HIGHZ_LANE_MODE1, 0x11110000); + sds = nes_read_indexed(nesdev, NES_IDX_ETH_SERDES_COMMON_CONTROL1); + sds &= 0xFFFFFFBF; + nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_COMMON_CONTROL1, sds); + } + } else { + /* init serdes 0 */ + nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_COMMON_CONTROL0, 0x00000008); + i = 0; + while (((u32temp = (nes_read_indexed(nesdev, NES_IDX_ETH_SERDES_COMMON_STATUS0) + & 0x0000000f)) != 0x0000000f) && i++ < 5000) + mdelay(1); + if (i > 5000) { + nes_debug(NES_DBG_PHY, "Init: serdes 0 not ready, status=%x\n", u32temp); + return 1; + } + nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_TX_EMP0, 0x000bdef7); + nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_TX_DRIVE0, 0x9ce73000); + nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_RX_MODE0, 0x0ff00000); + nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_RX_SIGDET0, 0x00000000); + nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_BYPASS0, 0x00000000); + nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_LOOPBACK_CONTROL0, 0x00000000); + if (OneG_Mode) + nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_RX_EQ_CONTROL0, 0xf0182222); + else + nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_RX_EQ_CONTROL0, 0xf0042222); + + nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_CDR_CONTROL0, 0x000000ff); + if (port_count > 1) { + /* init serdes 1 */ + nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_COMMON_CONTROL1, 0x00000048); + i = 0; + while (((u32temp = (nes_read_indexed(nesdev, NES_IDX_ETH_SERDES_COMMON_STATUS1) + & 0x0000000f)) != 0x0000000f) && (i++ < 5000)) + mdelay(1); + if (i > 5000) { + printk("%s: Init: serdes 1 not ready, status=%x\n", __func__, u32temp); + /* return 1; */ + } + nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_TX_EMP1, 0x000bdef7); + nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_TX_DRIVE1, 0x9ce73000); + nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_RX_MODE1, 0x0ff00000); + nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_RX_SIGDET1, 0x00000000); + nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_BYPASS1, 0x00000000); + nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_LOOPBACK_CONTROL1, 0x00000000); + nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_RX_EQ_CONTROL1, 0xf0002222); + nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_CDR_CONTROL1, 0x000000ff); + } + } + return 0; +} + + +/** + * nes_init_csr_ne020 + * Initialize registers for ne020 hardware + */ +static void nes_init_csr_ne020(struct nes_device *nesdev, u8 hw_rev, u8 port_count) +{ + u32 u32temp; + + nes_debug(NES_DBG_INIT, "port_count=%d\n", port_count); + + nes_write_indexed(nesdev, 0x000001E4, 0x00000007); + /* nes_write_indexed(nesdev, 0x000001E8, 0x000208C4); */ + nes_write_indexed(nesdev, 0x000001E8, 0x00020874); + nes_write_indexed(nesdev, 
0x000001D8, 0x00048002); + /* nes_write_indexed(nesdev, 0x000001D8, 0x0004B002); */ + nes_write_indexed(nesdev, 0x000001FC, 0x00050005); + nes_write_indexed(nesdev, 0x00000600, 0x55555555); + nes_write_indexed(nesdev, 0x00000604, 0x55555555); + + /* TODO: move these MAC register settings to NIC bringup */ + nes_write_indexed(nesdev, 0x00002000, 0x00000001); + nes_write_indexed(nesdev, 0x00002004, 0x00000001); + nes_write_indexed(nesdev, 0x00002008, 0x0000FFFF); + nes_write_indexed(nesdev, 0x0000200C, 0x00000001); + nes_write_indexed(nesdev, 0x00002010, 0x000003c1); + nes_write_indexed(nesdev, 0x0000201C, 0x75345678); + if (port_count > 1) { + nes_write_indexed(nesdev, 0x00002200, 0x00000001); + nes_write_indexed(nesdev, 0x00002204, 0x00000001); + nes_write_indexed(nesdev, 0x00002208, 0x0000FFFF); + nes_write_indexed(nesdev, 0x0000220C, 0x00000001); + nes_write_indexed(nesdev, 0x00002210, 0x000003c1); + nes_write_indexed(nesdev, 0x0000221C, 0x75345678); + nes_write_indexed(nesdev, 0x00000908, 0x20000001); + } + if (port_count > 2) { + nes_write_indexed(nesdev, 0x00002400, 0x00000001); + nes_write_indexed(nesdev, 0x00002404, 0x00000001); + nes_write_indexed(nesdev, 0x00002408, 0x0000FFFF); + nes_write_indexed(nesdev, 0x0000240C, 0x00000001); + nes_write_indexed(nesdev, 0x00002410, 0x000003c1); + nes_write_indexed(nesdev, 0x0000241C, 0x75345678); + nes_write_indexed(nesdev, 0x00000910, 0x20000001); + + nes_write_indexed(nesdev, 0x00002600, 0x00000001); + nes_write_indexed(nesdev, 0x00002604, 0x00000001); + nes_write_indexed(nesdev, 0x00002608, 0x0000FFFF); + nes_write_indexed(nesdev, 0x0000260C, 0x00000001); + nes_write_indexed(nesdev, 0x00002610, 0x000003c1); + nes_write_indexed(nesdev, 0x0000261C, 0x75345678); + nes_write_indexed(nesdev, 0x00000918, 0x20000001); + } + + nes_write_indexed(nesdev, 0x00005000, 0x00018000); + /* nes_write_indexed(nesdev, 0x00005000, 0x00010000); */ + nes_write_indexed(nesdev, NES_IDX_WQM_CONFIG1, (wqm_quanta << 1) | + 0x00000001); + nes_write_indexed(nesdev, 0x00005008, 0x1F1F1F1F); + nes_write_indexed(nesdev, 0x00005010, 0x1F1F1F1F); + nes_write_indexed(nesdev, 0x00005018, 0x1F1F1F1F); + nes_write_indexed(nesdev, 0x00005020, 0x1F1F1F1F); + nes_write_indexed(nesdev, 0x00006090, 0xFFFFFFFF); + + /* TODO: move this to code, get from EEPROM */ + nes_write_indexed(nesdev, 0x00000900, 0x20000001); + nes_write_indexed(nesdev, 0x000060C0, 0x0000028e); + nes_write_indexed(nesdev, 0x000060C8, 0x00000020); + + nes_write_indexed(nesdev, 0x000001EC, 0x7b2625a0); + /* nes_write_indexed(nesdev, 0x000001EC, 0x5f2625a0); */ + + if (hw_rev != NE020_REV) { + u32temp = nes_read_indexed(nesdev, 0x000008e8); + u32temp |= 0x80000000; + nes_write_indexed(nesdev, 0x000008e8, u32temp); + u32temp = nes_read_indexed(nesdev, 0x000021f8); + u32temp &= 0x7fffffff; + u32temp |= 0x7fff0010; + nes_write_indexed(nesdev, 0x000021f8, u32temp); + if (port_count > 1) { + u32temp = nes_read_indexed(nesdev, 0x000023f8); + u32temp &= 0x7fffffff; + u32temp |= 0x7fff0010; + nes_write_indexed(nesdev, 0x000023f8, u32temp); + } + } +} + + +/** + * nes_destroy_adapter - destroy the adapter structure + */ +void nes_destroy_adapter(struct nes_adapter *nesadapter) +{ + struct nes_adapter *tmp_adapter; + + list_for_each_entry(tmp_adapter, &nes_adapter_list, list) { + nes_debug(NES_DBG_SHUTDOWN, "Nes Adapter list entry = 0x%p.\n", + tmp_adapter); + } + + nesadapter->ref_count--; + if (!nesadapter->ref_count) { + if (nesadapter->hw_rev == NE020_REV) { + del_timer(&nesadapter->mh_timer); + } + 
del_timer(&nesadapter->lc_timer); + + list_del(&nesadapter->list); + kfree(nesadapter); + } +} + + +/** + * nes_init_cqp + */ +int nes_init_cqp(struct nes_device *nesdev) +{ + struct nes_adapter *nesadapter = nesdev->nesadapter; + struct nes_hw_cqp_qp_context *cqp_qp_context; + struct nes_hw_cqp_wqe *cqp_wqe; + struct nes_hw_ceq *ceq; + struct nes_hw_ceq *nic_ceq; + struct nes_hw_aeq *aeq; + void *vmem; + dma_addr_t pmem; + u32 count=0; + u32 cqp_head; + u64 u64temp; + u32 u32temp; + + /* allocate CQP memory */ + /* Need to add max_cq to the aeq size once cq overflow checking is added back */ + /* SQ is 512 byte aligned, others are 256 byte aligned */ + nesdev->cqp_mem_size = 512 + + (sizeof(struct nes_hw_cqp_wqe) * NES_CQP_SQ_SIZE) + + (sizeof(struct nes_hw_cqe) * NES_CCQ_SIZE) + + max(((u32)sizeof(struct nes_hw_ceqe) * NES_CCEQ_SIZE), (u32)256) + + max(((u32)sizeof(struct nes_hw_ceqe) * NES_NIC_CEQ_SIZE), (u32)256) + + (sizeof(struct nes_hw_aeqe) * nesadapter->max_qp) + + sizeof(struct nes_hw_cqp_qp_context); + + nesdev->cqp_vbase = pci_zalloc_consistent(nesdev->pcidev, + nesdev->cqp_mem_size, + &nesdev->cqp_pbase); + if (!nesdev->cqp_vbase) { + nes_debug(NES_DBG_INIT, "Unable to allocate memory for host descriptor rings\n"); + return -ENOMEM; + } + + /* Allocate a twice the number of CQP requests as the SQ size */ + nesdev->nes_cqp_requests = kzalloc(sizeof(struct nes_cqp_request) * + 2 * NES_CQP_SQ_SIZE, GFP_KERNEL); + if (nesdev->nes_cqp_requests == NULL) { + nes_debug(NES_DBG_INIT, "Unable to allocate memory CQP request entries.\n"); + pci_free_consistent(nesdev->pcidev, nesdev->cqp_mem_size, nesdev->cqp.sq_vbase, + nesdev->cqp.sq_pbase); + return -ENOMEM; + } + + nes_debug(NES_DBG_INIT, "Allocated CQP structures at %p (phys = %016lX), size = %u.\n", + nesdev->cqp_vbase, (unsigned long)nesdev->cqp_pbase, nesdev->cqp_mem_size); + + spin_lock_init(&nesdev->cqp.lock); + init_waitqueue_head(&nesdev->cqp.waitq); + + /* Setup Various Structures */ + vmem = (void *)(((unsigned long)nesdev->cqp_vbase + (512 - 1)) & + ~(unsigned long)(512 - 1)); + pmem = (dma_addr_t)(((unsigned long long)nesdev->cqp_pbase + (512 - 1)) & + ~(unsigned long long)(512 - 1)); + + nesdev->cqp.sq_vbase = vmem; + nesdev->cqp.sq_pbase = pmem; + nesdev->cqp.sq_size = NES_CQP_SQ_SIZE; + nesdev->cqp.sq_head = 0; + nesdev->cqp.sq_tail = 0; + nesdev->cqp.qp_id = PCI_FUNC(nesdev->pcidev->devfn); + + vmem += (sizeof(struct nes_hw_cqp_wqe) * nesdev->cqp.sq_size); + pmem += (sizeof(struct nes_hw_cqp_wqe) * nesdev->cqp.sq_size); + + nesdev->ccq.cq_vbase = vmem; + nesdev->ccq.cq_pbase = pmem; + nesdev->ccq.cq_size = NES_CCQ_SIZE; + nesdev->ccq.cq_head = 0; + nesdev->ccq.ce_handler = nes_cqp_ce_handler; + nesdev->ccq.cq_number = PCI_FUNC(nesdev->pcidev->devfn); + + vmem += (sizeof(struct nes_hw_cqe) * nesdev->ccq.cq_size); + pmem += (sizeof(struct nes_hw_cqe) * nesdev->ccq.cq_size); + + nesdev->ceq_index = PCI_FUNC(nesdev->pcidev->devfn); + ceq = &nesadapter->ceq[nesdev->ceq_index]; + ceq->ceq_vbase = vmem; + ceq->ceq_pbase = pmem; + ceq->ceq_size = NES_CCEQ_SIZE; + ceq->ceq_head = 0; + + vmem += max(((u32)sizeof(struct nes_hw_ceqe) * ceq->ceq_size), (u32)256); + pmem += max(((u32)sizeof(struct nes_hw_ceqe) * ceq->ceq_size), (u32)256); + + nesdev->nic_ceq_index = PCI_FUNC(nesdev->pcidev->devfn) + 8; + nic_ceq = &nesadapter->ceq[nesdev->nic_ceq_index]; + nic_ceq->ceq_vbase = vmem; + nic_ceq->ceq_pbase = pmem; + nic_ceq->ceq_size = NES_NIC_CEQ_SIZE; + nic_ceq->ceq_head = 0; + + vmem += max(((u32)sizeof(struct nes_hw_ceqe) * 
nic_ceq->ceq_size), (u32)256); + pmem += max(((u32)sizeof(struct nes_hw_ceqe) * nic_ceq->ceq_size), (u32)256); + + aeq = &nesadapter->aeq[PCI_FUNC(nesdev->pcidev->devfn)]; + aeq->aeq_vbase = vmem; + aeq->aeq_pbase = pmem; + aeq->aeq_size = nesadapter->max_qp; + aeq->aeq_head = 0; + + /* Setup QP Context */ + vmem += (sizeof(struct nes_hw_aeqe) * aeq->aeq_size); + pmem += (sizeof(struct nes_hw_aeqe) * aeq->aeq_size); + + cqp_qp_context = vmem; + cqp_qp_context->context_words[0] = + cpu_to_le32((PCI_FUNC(nesdev->pcidev->devfn) << 12) + (2 << 10)); + cqp_qp_context->context_words[1] = 0; + cqp_qp_context->context_words[2] = cpu_to_le32((u32)nesdev->cqp.sq_pbase); + cqp_qp_context->context_words[3] = cpu_to_le32(((u64)nesdev->cqp.sq_pbase) >> 32); + + + /* Write the address to Create CQP */ + if ((sizeof(dma_addr_t) > 4)) { + nes_write_indexed(nesdev, + NES_IDX_CREATE_CQP_HIGH + (PCI_FUNC(nesdev->pcidev->devfn) * 8), + ((u64)pmem) >> 32); + } else { + nes_write_indexed(nesdev, + NES_IDX_CREATE_CQP_HIGH + (PCI_FUNC(nesdev->pcidev->devfn) * 8), 0); + } + nes_write_indexed(nesdev, + NES_IDX_CREATE_CQP_LOW + (PCI_FUNC(nesdev->pcidev->devfn) * 8), + (u32)pmem); + + INIT_LIST_HEAD(&nesdev->cqp_avail_reqs); + INIT_LIST_HEAD(&nesdev->cqp_pending_reqs); + + for (count = 0; count < 2*NES_CQP_SQ_SIZE; count++) { + init_waitqueue_head(&nesdev->nes_cqp_requests[count].waitq); + list_add_tail(&nesdev->nes_cqp_requests[count].list, &nesdev->cqp_avail_reqs); + } + + /* Write Create CCQ WQE */ + cqp_head = nesdev->cqp.sq_head++; + cqp_wqe = &nesdev->cqp.sq_vbase[cqp_head]; + nes_fill_init_cqp_wqe(cqp_wqe, nesdev); + set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_WQE_OPCODE_IDX, + (NES_CQP_CREATE_CQ | NES_CQP_CQ_CEQ_VALID | + NES_CQP_CQ_CHK_OVERFLOW | ((u32)nesdev->ccq.cq_size << 16))); + set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_WQE_ID_IDX, + (nesdev->ccq.cq_number | + ((u32)nesdev->ceq_index << 16))); + u64temp = (u64)nesdev->ccq.cq_pbase; + set_wqe_64bit_value(cqp_wqe->wqe_words, NES_CQP_CQ_WQE_PBL_LOW_IDX, u64temp); + cqp_wqe->wqe_words[NES_CQP_CQ_WQE_CQ_CONTEXT_HIGH_IDX] = 0; + u64temp = (unsigned long)&nesdev->ccq; + cqp_wqe->wqe_words[NES_CQP_CQ_WQE_CQ_CONTEXT_LOW_IDX] = + cpu_to_le32((u32)(u64temp >> 1)); + cqp_wqe->wqe_words[NES_CQP_CQ_WQE_CQ_CONTEXT_HIGH_IDX] = + cpu_to_le32(((u32)((u64temp) >> 33)) & 0x7FFFFFFF); + cqp_wqe->wqe_words[NES_CQP_CQ_WQE_DOORBELL_INDEX_HIGH_IDX] = 0; + + /* Write Create CEQ WQE */ + cqp_head = nesdev->cqp.sq_head++; + cqp_wqe = &nesdev->cqp.sq_vbase[cqp_head]; + nes_fill_init_cqp_wqe(cqp_wqe, nesdev); + set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_WQE_OPCODE_IDX, + (NES_CQP_CREATE_CEQ + ((u32)nesdev->ceq_index << 8))); + set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_CEQ_WQE_ELEMENT_COUNT_IDX, ceq->ceq_size); + u64temp = (u64)ceq->ceq_pbase; + set_wqe_64bit_value(cqp_wqe->wqe_words, NES_CQP_CQ_WQE_PBL_LOW_IDX, u64temp); + + /* Write Create AEQ WQE */ + cqp_head = nesdev->cqp.sq_head++; + cqp_wqe = &nesdev->cqp.sq_vbase[cqp_head]; + nes_fill_init_cqp_wqe(cqp_wqe, nesdev); + set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_WQE_OPCODE_IDX, + (NES_CQP_CREATE_AEQ + ((u32)PCI_FUNC(nesdev->pcidev->devfn) << 8))); + set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_AEQ_WQE_ELEMENT_COUNT_IDX, aeq->aeq_size); + u64temp = (u64)aeq->aeq_pbase; + set_wqe_64bit_value(cqp_wqe->wqe_words, NES_CQP_CQ_WQE_PBL_LOW_IDX, u64temp); + + /* Write Create NIC CEQ WQE */ + cqp_head = nesdev->cqp.sq_head++; + cqp_wqe = &nesdev->cqp.sq_vbase[cqp_head]; + nes_fill_init_cqp_wqe(cqp_wqe, nesdev); 
+ set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_WQE_OPCODE_IDX, + (NES_CQP_CREATE_CEQ + ((u32)nesdev->nic_ceq_index << 8))); + set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_CEQ_WQE_ELEMENT_COUNT_IDX, nic_ceq->ceq_size); + u64temp = (u64)nic_ceq->ceq_pbase; + set_wqe_64bit_value(cqp_wqe->wqe_words, NES_CQP_CQ_WQE_PBL_LOW_IDX, u64temp); + + /* Poll until CCQP done */ + count = 0; + do { + if (count++ > 1000) { + printk(KERN_ERR PFX "Error creating CQP\n"); + pci_free_consistent(nesdev->pcidev, nesdev->cqp_mem_size, + nesdev->cqp_vbase, nesdev->cqp_pbase); + return -1; + } + udelay(10); + } while (!(nes_read_indexed(nesdev, + NES_IDX_QP_CONTROL + (PCI_FUNC(nesdev->pcidev->devfn) * 8)) & (1 << 8))); + + nes_debug(NES_DBG_INIT, "CQP Status = 0x%08X\n", nes_read_indexed(nesdev, + NES_IDX_QP_CONTROL+(PCI_FUNC(nesdev->pcidev->devfn)*8))); + + u32temp = 0x04800000; + nes_write32(nesdev->regs+NES_WQE_ALLOC, u32temp | nesdev->cqp.qp_id); + + /* wait for the CCQ, CEQ, and AEQ to get created */ + count = 0; + do { + if (count++ > 1000) { + printk(KERN_ERR PFX "Error creating CCQ, CEQ, and AEQ\n"); + pci_free_consistent(nesdev->pcidev, nesdev->cqp_mem_size, + nesdev->cqp_vbase, nesdev->cqp_pbase); + return -1; + } + udelay(10); + } while (((nes_read_indexed(nesdev, + NES_IDX_QP_CONTROL+(PCI_FUNC(nesdev->pcidev->devfn)*8)) & (15<<8)) != (15<<8))); + + /* dump the QP status value */ + nes_debug(NES_DBG_INIT, "QP Status = 0x%08X\n", nes_read_indexed(nesdev, + NES_IDX_QP_CONTROL+(PCI_FUNC(nesdev->pcidev->devfn)*8))); + + nesdev->cqp.sq_tail++; + + return 0; +} + + +/** + * nes_destroy_cqp + */ +int nes_destroy_cqp(struct nes_device *nesdev) +{ + struct nes_hw_cqp_wqe *cqp_wqe; + u32 count = 0; + u32 cqp_head; + unsigned long flags; + + do { + if (count++ > 1000) + break; + udelay(10); + } while (!(nesdev->cqp.sq_head == nesdev->cqp.sq_tail)); + + /* Reset CCQ */ + nes_write32(nesdev->regs+NES_CQE_ALLOC, NES_CQE_ALLOC_RESET | + nesdev->ccq.cq_number); + + /* Disable device interrupts */ + nes_write32(nesdev->regs+NES_INT_MASK, 0x7fffffff); + + spin_lock_irqsave(&nesdev->cqp.lock, flags); + + /* Destroy the AEQ */ + cqp_head = nesdev->cqp.sq_head++; + nesdev->cqp.sq_head &= nesdev->cqp.sq_size-1; + cqp_wqe = &nesdev->cqp.sq_vbase[cqp_head]; + cqp_wqe->wqe_words[NES_CQP_WQE_OPCODE_IDX] = cpu_to_le32(NES_CQP_DESTROY_AEQ | + ((u32)PCI_FUNC(nesdev->pcidev->devfn) << 8)); + cqp_wqe->wqe_words[NES_CQP_WQE_COMP_CTX_HIGH_IDX] = 0; + + /* Destroy the NIC CEQ */ + cqp_head = nesdev->cqp.sq_head++; + nesdev->cqp.sq_head &= nesdev->cqp.sq_size-1; + cqp_wqe = &nesdev->cqp.sq_vbase[cqp_head]; + cqp_wqe->wqe_words[NES_CQP_WQE_OPCODE_IDX] = cpu_to_le32(NES_CQP_DESTROY_CEQ | + ((u32)nesdev->nic_ceq_index << 8)); + + /* Destroy the CEQ */ + cqp_head = nesdev->cqp.sq_head++; + nesdev->cqp.sq_head &= nesdev->cqp.sq_size-1; + cqp_wqe = &nesdev->cqp.sq_vbase[cqp_head]; + cqp_wqe->wqe_words[NES_CQP_WQE_OPCODE_IDX] = cpu_to_le32(NES_CQP_DESTROY_CEQ | + (nesdev->ceq_index << 8)); + + /* Destroy the CCQ */ + cqp_head = nesdev->cqp.sq_head++; + nesdev->cqp.sq_head &= nesdev->cqp.sq_size-1; + cqp_wqe = &nesdev->cqp.sq_vbase[cqp_head]; + cqp_wqe->wqe_words[NES_CQP_WQE_OPCODE_IDX] = cpu_to_le32(NES_CQP_DESTROY_CQ); + cqp_wqe->wqe_words[NES_CQP_WQE_ID_IDX] = cpu_to_le32(nesdev->ccq.cq_number | + ((u32)nesdev->ceq_index << 16)); + + /* Destroy CQP */ + cqp_head = nesdev->cqp.sq_head++; + nesdev->cqp.sq_head &= nesdev->cqp.sq_size-1; + cqp_wqe = &nesdev->cqp.sq_vbase[cqp_head]; + cqp_wqe->wqe_words[NES_CQP_WQE_OPCODE_IDX] = 
cpu_to_le32(NES_CQP_DESTROY_QP | + NES_CQP_QP_TYPE_CQP); + cqp_wqe->wqe_words[NES_CQP_WQE_ID_IDX] = cpu_to_le32(nesdev->cqp.qp_id); + + barrier(); + /* Ring doorbell (5 WQEs) */ + nes_write32(nesdev->regs+NES_WQE_ALLOC, 0x05800000 | nesdev->cqp.qp_id); + + spin_unlock_irqrestore(&nesdev->cqp.lock, flags); + + /* wait for the CCQ, CEQ, and AEQ to get destroyed */ + count = 0; + do { + if (count++ > 1000) { + printk(KERN_ERR PFX "Function%d: Error destroying CCQ, CEQ, and AEQ\n", + PCI_FUNC(nesdev->pcidev->devfn)); + break; + } + udelay(10); + } while (((nes_read_indexed(nesdev, + NES_IDX_QP_CONTROL + (PCI_FUNC(nesdev->pcidev->devfn)*8)) & (15 << 8)) != 0)); + + /* dump the QP status value */ + nes_debug(NES_DBG_SHUTDOWN, "Function%d: QP Status = 0x%08X\n", + PCI_FUNC(nesdev->pcidev->devfn), + nes_read_indexed(nesdev, + NES_IDX_QP_CONTROL+(PCI_FUNC(nesdev->pcidev->devfn)*8))); + + kfree(nesdev->nes_cqp_requests); + + /* Free the control structures */ + pci_free_consistent(nesdev->pcidev, nesdev->cqp_mem_size, nesdev->cqp.sq_vbase, + nesdev->cqp.sq_pbase); + + return 0; +} + + +/** + * nes_init_1g_phy + */ +static int nes_init_1g_phy(struct nes_device *nesdev, u8 phy_type, u8 phy_index) +{ + u32 counter = 0; + u16 phy_data; + int ret = 0; + + nes_read_1G_phy_reg(nesdev, 1, phy_index, &phy_data); + nes_write_1G_phy_reg(nesdev, 23, phy_index, 0xb000); + + /* Reset the PHY */ + nes_write_1G_phy_reg(nesdev, 0, phy_index, 0x8000); + udelay(100); + counter = 0; + do { + nes_read_1G_phy_reg(nesdev, 0, phy_index, &phy_data); + if (counter++ > 100) { + ret = -1; + break; + } + } while (phy_data & 0x8000); + + /* Setting no phy loopback */ + phy_data &= 0xbfff; + phy_data |= 0x1140; + nes_write_1G_phy_reg(nesdev, 0, phy_index, phy_data); + nes_read_1G_phy_reg(nesdev, 0, phy_index, &phy_data); + nes_read_1G_phy_reg(nesdev, 0x17, phy_index, &phy_data); + nes_read_1G_phy_reg(nesdev, 0x1e, phy_index, &phy_data); + + /* Setting the interrupt mask */ + nes_read_1G_phy_reg(nesdev, 0x19, phy_index, &phy_data); + nes_write_1G_phy_reg(nesdev, 0x19, phy_index, 0xffee); + nes_read_1G_phy_reg(nesdev, 0x19, phy_index, &phy_data); + + /* turning on flow control */ + nes_read_1G_phy_reg(nesdev, 4, phy_index, &phy_data); + nes_write_1G_phy_reg(nesdev, 4, phy_index, (phy_data & ~(0x03E0)) | 0xc00); + nes_read_1G_phy_reg(nesdev, 4, phy_index, &phy_data); + + /* Clear Half duplex */ + nes_read_1G_phy_reg(nesdev, 9, phy_index, &phy_data); + nes_write_1G_phy_reg(nesdev, 9, phy_index, phy_data & ~(0x0100)); + nes_read_1G_phy_reg(nesdev, 9, phy_index, &phy_data); + + nes_read_1G_phy_reg(nesdev, 0, phy_index, &phy_data); + nes_write_1G_phy_reg(nesdev, 0, phy_index, phy_data | 0x0300); + + return ret; +} + + +/** + * nes_init_2025_phy + */ +static int nes_init_2025_phy(struct nes_device *nesdev, u8 phy_type, u8 phy_index) +{ + u32 temp_phy_data = 0; + u32 temp_phy_data2 = 0; + u32 counter = 0; + u32 sds; + u32 mac_index = nesdev->mac_index; + int ret = 0; + unsigned int first_attempt = 1; + + /* Check firmware heartbeat */ + nes_read_10G_phy_reg(nesdev, phy_index, 0x3, 0xd7ee); + temp_phy_data = (u16)nes_read_indexed(nesdev, NES_IDX_MAC_MDIO_CONTROL); + udelay(1500); + nes_read_10G_phy_reg(nesdev, phy_index, 0x3, 0xd7ee); + temp_phy_data2 = (u16)nes_read_indexed(nesdev, NES_IDX_MAC_MDIO_CONTROL); + + if (temp_phy_data != temp_phy_data2) { + nes_read_10G_phy_reg(nesdev, phy_index, 0x3, 0xd7fd); + temp_phy_data = (u16)nes_read_indexed(nesdev, NES_IDX_MAC_MDIO_CONTROL); + if ((temp_phy_data & 0xff) > 0x20) + return 0; + printk(PFX 
"Reinitialize external PHY\n"); + } + + /* no heartbeat, configure the PHY */ + nes_write_10G_phy_reg(nesdev, phy_index, 0x1, 0x0000, 0x8000); + nes_write_10G_phy_reg(nesdev, phy_index, 0x1, 0xc300, 0x0000); + nes_write_10G_phy_reg(nesdev, phy_index, 0x1, 0xc316, 0x000A); + nes_write_10G_phy_reg(nesdev, phy_index, 0x1, 0xc318, 0x0052); + + switch (phy_type) { + case NES_PHY_TYPE_ARGUS: + nes_write_10G_phy_reg(nesdev, phy_index, 0x1, 0xc316, 0x000A); + nes_write_10G_phy_reg(nesdev, phy_index, 0x1, 0xc318, 0x0052); + nes_write_10G_phy_reg(nesdev, phy_index, 0x1, 0xc302, 0x000C); + nes_write_10G_phy_reg(nesdev, phy_index, 0x1, 0xc319, 0x0008); + nes_write_10G_phy_reg(nesdev, phy_index, 0x3, 0x0027, 0x0001); + nes_write_10G_phy_reg(nesdev, phy_index, 0x1, 0xc31a, 0x0098); + nes_write_10G_phy_reg(nesdev, phy_index, 0x3, 0x0026, 0x0E00); + + /* setup LEDs */ + nes_write_10G_phy_reg(nesdev, phy_index, 0x1, 0xd006, 0x0007); + nes_write_10G_phy_reg(nesdev, phy_index, 0x1, 0xd007, 0x000A); + nes_write_10G_phy_reg(nesdev, phy_index, 0x1, 0xd008, 0x0009); + break; + + case NES_PHY_TYPE_SFP_D: + nes_write_10G_phy_reg(nesdev, phy_index, 0x1, 0xc316, 0x000A); + nes_write_10G_phy_reg(nesdev, phy_index, 0x1, 0xc318, 0x0052); + nes_write_10G_phy_reg(nesdev, phy_index, 0x1, 0xc302, 0x0004); + nes_write_10G_phy_reg(nesdev, phy_index, 0x1, 0xc319, 0x0038); + nes_write_10G_phy_reg(nesdev, phy_index, 0x3, 0x0027, 0x0013); + nes_write_10G_phy_reg(nesdev, phy_index, 0x1, 0xc31a, 0x0098); + nes_write_10G_phy_reg(nesdev, phy_index, 0x3, 0x0026, 0x0E00); + + /* setup LEDs */ + nes_write_10G_phy_reg(nesdev, phy_index, 0x1, 0xd006, 0x0007); + nes_write_10G_phy_reg(nesdev, phy_index, 0x1, 0xd007, 0x000A); + nes_write_10G_phy_reg(nesdev, phy_index, 0x1, 0xd008, 0x0009); + break; + + case NES_PHY_TYPE_KR: + nes_write_10G_phy_reg(nesdev, phy_index, 0x1, 0xc316, 0x000A); + nes_write_10G_phy_reg(nesdev, phy_index, 0x1, 0xc318, 0x0052); + nes_write_10G_phy_reg(nesdev, phy_index, 0x1, 0xc302, 0x000C); + nes_write_10G_phy_reg(nesdev, phy_index, 0x1, 0xc319, 0x0010); + nes_write_10G_phy_reg(nesdev, phy_index, 0x3, 0x0027, 0x0013); + nes_write_10G_phy_reg(nesdev, phy_index, 0x1, 0xc31a, 0x0080); + nes_write_10G_phy_reg(nesdev, phy_index, 0x3, 0x0026, 0x0E00); + + /* setup LEDs */ + nes_write_10G_phy_reg(nesdev, phy_index, 0x1, 0xd006, 0x000B); + nes_write_10G_phy_reg(nesdev, phy_index, 0x1, 0xd007, 0x0003); + nes_write_10G_phy_reg(nesdev, phy_index, 0x1, 0xd008, 0x0004); + + nes_write_10G_phy_reg(nesdev, phy_index, 0x3, 0x0022, 0x406D); + nes_write_10G_phy_reg(nesdev, phy_index, 0x3, 0x0023, 0x0020); + break; + } + + nes_write_10G_phy_reg(nesdev, phy_index, 0x3, 0x0028, 0xA528); + + /* Bring PHY out of reset */ + nes_write_10G_phy_reg(nesdev, phy_index, 0x1, 0xc300, 0x0002); + + /* Check for heartbeat */ + counter = 0; + mdelay(690); + nes_read_10G_phy_reg(nesdev, phy_index, 0x3, 0xd7ee); + temp_phy_data = (u16)nes_read_indexed(nesdev, NES_IDX_MAC_MDIO_CONTROL); + do { + if (counter++ > 150) { + printk(PFX "No PHY heartbeat\n"); + break; + } + mdelay(1); + nes_read_10G_phy_reg(nesdev, phy_index, 0x3, 0xd7ee); + temp_phy_data2 = (u16)nes_read_indexed(nesdev, NES_IDX_MAC_MDIO_CONTROL); + } while ((temp_phy_data2 == temp_phy_data)); + + /* wait for tracking */ + counter = 0; + do { + nes_read_10G_phy_reg(nesdev, phy_index, 0x3, 0xd7fd); + temp_phy_data = (u16)nes_read_indexed(nesdev, NES_IDX_MAC_MDIO_CONTROL); + if (counter++ > 300) { + if (((temp_phy_data & 0xff) == 0x0) && first_attempt) { + first_attempt = 0; + counter = 0; + /* 
reset AMCC PHY and try again */ + nes_write_10G_phy_reg(nesdev, phy_index, 0x3, 0xe854, 0x00c0); + nes_write_10G_phy_reg(nesdev, phy_index, 0x3, 0xe854, 0x0040); + continue; + } else { + ret = 1; + break; + } + } + mdelay(10); + } while ((temp_phy_data & 0xff) < 0x30); + + /* setup signal integrity */ + nes_write_10G_phy_reg(nesdev, phy_index, 0x1, 0xd003, 0x0000); + nes_write_10G_phy_reg(nesdev, phy_index, 0x1, 0xF00D, 0x00FE); + nes_write_10G_phy_reg(nesdev, phy_index, 0x1, 0xF00E, 0x0032); + if (phy_type == NES_PHY_TYPE_KR) { + nes_write_10G_phy_reg(nesdev, phy_index, 0x1, 0xF00F, 0x000C); + } else { + nes_write_10G_phy_reg(nesdev, phy_index, 0x1, 0xF00F, 0x0002); + nes_write_10G_phy_reg(nesdev, phy_index, 0x1, 0xc314, 0x0063); + } + + /* reset serdes */ + sds = nes_read_indexed(nesdev, NES_IDX_ETH_SERDES_COMMON_CONTROL0 + mac_index * 0x200); + sds |= 0x1; + nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_COMMON_CONTROL0 + mac_index * 0x200, sds); + sds &= 0xfffffffe; + nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_COMMON_CONTROL0 + mac_index * 0x200, sds); + + counter = 0; + while (((nes_read32(nesdev->regs + NES_SOFTWARE_RESET) & 0x00000040) != 0x00000040) + && (counter++ < 5000)) + ; + + return ret; +} + + +/** + * nes_init_phy + */ +int nes_init_phy(struct nes_device *nesdev) +{ + struct nes_adapter *nesadapter = nesdev->nesadapter; + u32 mac_index = nesdev->mac_index; + u32 tx_config = 0; + unsigned long flags; + u8 phy_type = nesadapter->phy_type[mac_index]; + u8 phy_index = nesadapter->phy_index[mac_index]; + int ret = 0; + + tx_config = nes_read_indexed(nesdev, NES_IDX_MAC_TX_CONFIG); + if (phy_type == NES_PHY_TYPE_1G) { + /* setup 1G MDIO operation */ + tx_config &= 0xFFFFFFE3; + tx_config |= 0x04; + } else { + /* setup 10G MDIO operation */ + tx_config &= 0xFFFFFFE3; + tx_config |= 0x1D; + } + nes_write_indexed(nesdev, NES_IDX_MAC_TX_CONFIG, tx_config); + + spin_lock_irqsave(&nesdev->nesadapter->phy_lock, flags); + + switch (phy_type) { + case NES_PHY_TYPE_1G: + ret = nes_init_1g_phy(nesdev, phy_type, phy_index); + break; + case NES_PHY_TYPE_ARGUS: + case NES_PHY_TYPE_SFP_D: + case NES_PHY_TYPE_KR: + ret = nes_init_2025_phy(nesdev, phy_type, phy_index); + break; + } + + spin_unlock_irqrestore(&nesdev->nesadapter->phy_lock, flags); + + return ret; +} + + +/** + * nes_replenish_nic_rq + */ +static void nes_replenish_nic_rq(struct nes_vnic *nesvnic) +{ + unsigned long flags; + dma_addr_t bus_address; + struct sk_buff *skb; + struct nes_hw_nic_rq_wqe *nic_rqe; + struct nes_hw_nic *nesnic; + struct nes_device *nesdev; + struct nes_rskb_cb *cb; + u32 rx_wqes_posted = 0; + + nesnic = &nesvnic->nic; + nesdev = nesvnic->nesdev; + spin_lock_irqsave(&nesnic->rq_lock, flags); + if (nesnic->replenishing_rq !=0) { + if (((nesnic->rq_size-1) == atomic_read(&nesvnic->rx_skbs_needed)) && + (atomic_read(&nesvnic->rx_skb_timer_running) == 0)) { + atomic_set(&nesvnic->rx_skb_timer_running, 1); + spin_unlock_irqrestore(&nesnic->rq_lock, flags); + nesvnic->rq_wqes_timer.expires = jiffies + (HZ/2); /* 1/2 second */ + add_timer(&nesvnic->rq_wqes_timer); + } else + spin_unlock_irqrestore(&nesnic->rq_lock, flags); + return; + } + nesnic->replenishing_rq = 1; + spin_unlock_irqrestore(&nesnic->rq_lock, flags); + do { + skb = dev_alloc_skb(nesvnic->max_frame_size); + if (skb) { + skb->dev = nesvnic->netdev; + + bus_address = pci_map_single(nesdev->pcidev, + skb->data, nesvnic->max_frame_size, PCI_DMA_FROMDEVICE); + cb = (struct nes_rskb_cb *)&skb->cb[0]; + cb->busaddr = bus_address; + cb->maplen = 
nesvnic->max_frame_size; + + nic_rqe = &nesnic->rq_vbase[nesvnic->nic.rq_head]; + nic_rqe->wqe_words[NES_NIC_RQ_WQE_LENGTH_1_0_IDX] = + cpu_to_le32(nesvnic->max_frame_size); + nic_rqe->wqe_words[NES_NIC_RQ_WQE_LENGTH_3_2_IDX] = 0; + nic_rqe->wqe_words[NES_NIC_RQ_WQE_FRAG0_LOW_IDX] = + cpu_to_le32((u32)bus_address); + nic_rqe->wqe_words[NES_NIC_RQ_WQE_FRAG0_HIGH_IDX] = + cpu_to_le32((u32)((u64)bus_address >> 32)); + nesnic->rx_skb[nesnic->rq_head] = skb; + nesnic->rq_head++; + nesnic->rq_head &= nesnic->rq_size - 1; + atomic_dec(&nesvnic->rx_skbs_needed); + barrier(); + if (++rx_wqes_posted == 255) { + nes_write32(nesdev->regs+NES_WQE_ALLOC, (rx_wqes_posted << 24) | nesnic->qp_id); + rx_wqes_posted = 0; + } + } else { + spin_lock_irqsave(&nesnic->rq_lock, flags); + if (((nesnic->rq_size-1) == atomic_read(&nesvnic->rx_skbs_needed)) && + (atomic_read(&nesvnic->rx_skb_timer_running) == 0)) { + atomic_set(&nesvnic->rx_skb_timer_running, 1); + spin_unlock_irqrestore(&nesnic->rq_lock, flags); + nesvnic->rq_wqes_timer.expires = jiffies + (HZ/2); /* 1/2 second */ + add_timer(&nesvnic->rq_wqes_timer); + } else + spin_unlock_irqrestore(&nesnic->rq_lock, flags); + break; + } + } while (atomic_read(&nesvnic->rx_skbs_needed)); + barrier(); + if (rx_wqes_posted) + nes_write32(nesdev->regs+NES_WQE_ALLOC, (rx_wqes_posted << 24) | nesnic->qp_id); + nesnic->replenishing_rq = 0; +} + + +/** + * nes_rq_wqes_timeout + */ +static void nes_rq_wqes_timeout(unsigned long parm) +{ + struct nes_vnic *nesvnic = (struct nes_vnic *)parm; + printk("%s: Timer fired.\n", __func__); + atomic_set(&nesvnic->rx_skb_timer_running, 0); + if (atomic_read(&nesvnic->rx_skbs_needed)) + nes_replenish_nic_rq(nesvnic); +} + + +static int nes_lro_get_skb_hdr(struct sk_buff *skb, void **iphdr, + void **tcph, u64 *hdr_flags, void *priv) +{ + unsigned int ip_len; + struct iphdr *iph; + skb_reset_network_header(skb); + iph = ip_hdr(skb); + if (iph->protocol != IPPROTO_TCP) + return -1; + ip_len = ip_hdrlen(skb); + skb_set_transport_header(skb, ip_len); + *tcph = tcp_hdr(skb); + + *hdr_flags = LRO_IPV4 | LRO_TCP; + *iphdr = iph; + return 0; +} + + +/** + * nes_init_nic_qp + */ +int nes_init_nic_qp(struct nes_device *nesdev, struct net_device *netdev) +{ + struct nes_hw_cqp_wqe *cqp_wqe; + struct nes_hw_nic_sq_wqe *nic_sqe; + struct nes_hw_nic_qp_context *nic_context; + struct sk_buff *skb; + struct nes_hw_nic_rq_wqe *nic_rqe; + struct nes_vnic *nesvnic = netdev_priv(netdev); + unsigned long flags; + void *vmem; + dma_addr_t pmem; + u64 u64temp; + int ret; + u32 cqp_head; + u32 counter; + u32 wqe_count; + struct nes_rskb_cb *cb; + u8 jumbomode=0; + + /* Allocate fragment, SQ, RQ, and CQ; Reuse CEQ based on the PCI function */ + nesvnic->nic_mem_size = 256 + + (NES_NIC_WQ_SIZE * sizeof(struct nes_first_frag)) + + (NES_NIC_WQ_SIZE * sizeof(struct nes_hw_nic_sq_wqe)) + + (NES_NIC_WQ_SIZE * sizeof(struct nes_hw_nic_rq_wqe)) + + (NES_NIC_WQ_SIZE * 2 * sizeof(struct nes_hw_nic_cqe)) + + sizeof(struct nes_hw_nic_qp_context); + + nesvnic->nic_vbase = pci_zalloc_consistent(nesdev->pcidev, + nesvnic->nic_mem_size, + &nesvnic->nic_pbase); + if (!nesvnic->nic_vbase) { + nes_debug(NES_DBG_INIT, "Unable to allocate memory for NIC host descriptor rings\n"); + return -ENOMEM; + } + nes_debug(NES_DBG_INIT, "Allocated NIC QP structures at %p (phys = %016lX), size = %u.\n", + nesvnic->nic_vbase, (unsigned long)nesvnic->nic_pbase, nesvnic->nic_mem_size); + + vmem = (void *)(((unsigned long)nesvnic->nic_vbase + (256 - 1)) & + ~(unsigned long)(256 - 1)); + pmem = 
(dma_addr_t)(((unsigned long long)nesvnic->nic_pbase + (256 - 1)) & + ~(unsigned long long)(256 - 1)); + + /* Setup the first Fragment buffers */ + nesvnic->nic.first_frag_vbase = vmem; + + for (counter = 0; counter < NES_NIC_WQ_SIZE; counter++) { + nesvnic->nic.frag_paddr[counter] = pmem; + pmem += sizeof(struct nes_first_frag); + } + + /* setup the SQ */ + vmem += (NES_NIC_WQ_SIZE * sizeof(struct nes_first_frag)); + + nesvnic->nic.sq_vbase = (void *)vmem; + nesvnic->nic.sq_pbase = pmem; + nesvnic->nic.sq_head = 0; + nesvnic->nic.sq_tail = 0; + nesvnic->nic.sq_size = NES_NIC_WQ_SIZE; + for (counter = 0; counter < NES_NIC_WQ_SIZE; counter++) { + nic_sqe = &nesvnic->nic.sq_vbase[counter]; + nic_sqe->wqe_words[NES_NIC_SQ_WQE_MISC_IDX] = + cpu_to_le32(NES_NIC_SQ_WQE_DISABLE_CHKSUM | + NES_NIC_SQ_WQE_COMPLETION); + nic_sqe->wqe_words[NES_NIC_SQ_WQE_LENGTH_0_TAG_IDX] = + cpu_to_le32((u32)NES_FIRST_FRAG_SIZE << 16); + nic_sqe->wqe_words[NES_NIC_SQ_WQE_FRAG0_LOW_IDX] = + cpu_to_le32((u32)nesvnic->nic.frag_paddr[counter]); + nic_sqe->wqe_words[NES_NIC_SQ_WQE_FRAG0_HIGH_IDX] = + cpu_to_le32((u32)((u64)nesvnic->nic.frag_paddr[counter] >> 32)); + } + + nesvnic->get_cqp_request = nes_get_cqp_request; + nesvnic->post_cqp_request = nes_post_cqp_request; + nesvnic->mcrq_mcast_filter = NULL; + + spin_lock_init(&nesvnic->nic.rq_lock); + + /* setup the RQ */ + vmem += (NES_NIC_WQ_SIZE * sizeof(struct nes_hw_nic_sq_wqe)); + pmem += (NES_NIC_WQ_SIZE * sizeof(struct nes_hw_nic_sq_wqe)); + + + nesvnic->nic.rq_vbase = vmem; + nesvnic->nic.rq_pbase = pmem; + nesvnic->nic.rq_head = 0; + nesvnic->nic.rq_tail = 0; + nesvnic->nic.rq_size = NES_NIC_WQ_SIZE; + + /* setup the CQ */ + vmem += (NES_NIC_WQ_SIZE * sizeof(struct nes_hw_nic_rq_wqe)); + pmem += (NES_NIC_WQ_SIZE * sizeof(struct nes_hw_nic_rq_wqe)); + + if (nesdev->nesadapter->netdev_count > 2) + nesvnic->mcrq_qp_id = nesvnic->nic_index + 32; + else + nesvnic->mcrq_qp_id = nesvnic->nic.qp_id + 4; + + nesvnic->nic_cq.cq_vbase = vmem; + nesvnic->nic_cq.cq_pbase = pmem; + nesvnic->nic_cq.cq_head = 0; + nesvnic->nic_cq.cq_size = NES_NIC_WQ_SIZE * 2; + + nesvnic->nic_cq.ce_handler = nes_nic_napi_ce_handler; + + /* Send CreateCQ request to CQP */ + spin_lock_irqsave(&nesdev->cqp.lock, flags); + cqp_head = nesdev->cqp.sq_head; + + cqp_wqe = &nesdev->cqp.sq_vbase[cqp_head]; + nes_fill_init_cqp_wqe(cqp_wqe, nesdev); + + cqp_wqe->wqe_words[NES_CQP_WQE_OPCODE_IDX] = cpu_to_le32( + NES_CQP_CREATE_CQ | NES_CQP_CQ_CEQ_VALID | + ((u32)nesvnic->nic_cq.cq_size << 16)); + cqp_wqe->wqe_words[NES_CQP_WQE_ID_IDX] = cpu_to_le32( + nesvnic->nic_cq.cq_number | ((u32)nesdev->nic_ceq_index << 16)); + u64temp = (u64)nesvnic->nic_cq.cq_pbase; + set_wqe_64bit_value(cqp_wqe->wqe_words, NES_CQP_CQ_WQE_PBL_LOW_IDX, u64temp); + cqp_wqe->wqe_words[NES_CQP_CQ_WQE_CQ_CONTEXT_HIGH_IDX] = 0; + u64temp = (unsigned long)&nesvnic->nic_cq; + cqp_wqe->wqe_words[NES_CQP_CQ_WQE_CQ_CONTEXT_LOW_IDX] = cpu_to_le32((u32)(u64temp >> 1)); + cqp_wqe->wqe_words[NES_CQP_CQ_WQE_CQ_CONTEXT_HIGH_IDX] = + cpu_to_le32(((u32)((u64temp) >> 33)) & 0x7FFFFFFF); + cqp_wqe->wqe_words[NES_CQP_CQ_WQE_DOORBELL_INDEX_HIGH_IDX] = 0; + if (++cqp_head >= nesdev->cqp.sq_size) + cqp_head = 0; + cqp_wqe = &nesdev->cqp.sq_vbase[cqp_head]; + nes_fill_init_cqp_wqe(cqp_wqe, nesdev); + + /* Send CreateQP request to CQP */ + nic_context = (void *)(&nesvnic->nic_cq.cq_vbase[nesvnic->nic_cq.cq_size]); + nic_context->context_words[NES_NIC_CTX_MISC_IDX] = + cpu_to_le32((u32)NES_NIC_CTX_SIZE | + ((u32)PCI_FUNC(nesdev->pcidev->devfn) << 12)); + 
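As an illustrative aside (not part of the driver source above), the SQ WQE setup stores each fragment's 64-bit DMA address as two 32-bit words, FRAG0_LOW first and FRAG0_HIGH second, each converted with cpu_to_le32(); the RQ WQEs and the QP context words later in this function follow the same convention. A minimal standalone sketch of that split, using a hypothetical helper name:

#include <stdint.h>

/* Hypothetical helper mirroring how the WQE words above carry a 64-bit
 * bus address: the low 32 bits go into the FRAG0_LOW word and the upper
 * 32 bits into the FRAG0_HIGH word (the driver additionally byte-swaps
 * each word with cpu_to_le32()). */
static void split_bus_address(uint64_t bus_addr, uint32_t *low, uint32_t *high)
{
	*low  = (uint32_t)bus_addr;
	*high = (uint32_t)(bus_addr >> 32);
}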
nes_debug(NES_DBG_INIT, "RX_WINDOW_BUFFER_PAGE_TABLE_SIZE = 0x%08X, RX_WINDOW_BUFFER_SIZE = 0x%08X\n", + nes_read_indexed(nesdev, NES_IDX_RX_WINDOW_BUFFER_PAGE_TABLE_SIZE), + nes_read_indexed(nesdev, NES_IDX_RX_WINDOW_BUFFER_SIZE)); + if (nes_read_indexed(nesdev, NES_IDX_RX_WINDOW_BUFFER_SIZE) != 0) { + nic_context->context_words[NES_NIC_CTX_MISC_IDX] |= cpu_to_le32(NES_NIC_BACK_STORE); + } + + u64temp = (u64)nesvnic->nic.sq_pbase; + nic_context->context_words[NES_NIC_CTX_SQ_LOW_IDX] = cpu_to_le32((u32)u64temp); + nic_context->context_words[NES_NIC_CTX_SQ_HIGH_IDX] = cpu_to_le32((u32)(u64temp >> 32)); + u64temp = (u64)nesvnic->nic.rq_pbase; + nic_context->context_words[NES_NIC_CTX_RQ_LOW_IDX] = cpu_to_le32((u32)u64temp); + nic_context->context_words[NES_NIC_CTX_RQ_HIGH_IDX] = cpu_to_le32((u32)(u64temp >> 32)); + + cqp_wqe->wqe_words[NES_CQP_WQE_OPCODE_IDX] = cpu_to_le32(NES_CQP_CREATE_QP | + NES_CQP_QP_TYPE_NIC); + cqp_wqe->wqe_words[NES_CQP_WQE_ID_IDX] = cpu_to_le32(nesvnic->nic.qp_id); + u64temp = (u64)nesvnic->nic_cq.cq_pbase + + (nesvnic->nic_cq.cq_size * sizeof(struct nes_hw_nic_cqe)); + set_wqe_64bit_value(cqp_wqe->wqe_words, NES_CQP_QP_WQE_CONTEXT_LOW_IDX, u64temp); + + if (++cqp_head >= nesdev->cqp.sq_size) + cqp_head = 0; + nesdev->cqp.sq_head = cqp_head; + + barrier(); + + /* Ring doorbell (2 WQEs) */ + nes_write32(nesdev->regs+NES_WQE_ALLOC, 0x02800000 | nesdev->cqp.qp_id); + + spin_unlock_irqrestore(&nesdev->cqp.lock, flags); + nes_debug(NES_DBG_INIT, "Waiting for create NIC QP%u to complete.\n", + nesvnic->nic.qp_id); + + ret = wait_event_timeout(nesdev->cqp.waitq, (nesdev->cqp.sq_tail == cqp_head), + NES_EVENT_TIMEOUT); + nes_debug(NES_DBG_INIT, "Create NIC QP%u completed, wait_event_timeout ret = %u.\n", + nesvnic->nic.qp_id, ret); + if (!ret) { + nes_debug(NES_DBG_INIT, "NIC QP%u create timeout expired\n", nesvnic->nic.qp_id); + pci_free_consistent(nesdev->pcidev, nesvnic->nic_mem_size, nesvnic->nic_vbase, + nesvnic->nic_pbase); + return -EIO; + } + + /* Populate the RQ */ + for (counter = 0; counter < (NES_NIC_WQ_SIZE - 1); counter++) { + skb = dev_alloc_skb(nesvnic->max_frame_size); + if (!skb) { + nes_debug(NES_DBG_INIT, "%s: out of memory for receive skb\n", netdev->name); + + nes_destroy_nic_qp(nesvnic); + return -ENOMEM; + } + + skb->dev = netdev; + + pmem = pci_map_single(nesdev->pcidev, skb->data, + nesvnic->max_frame_size, PCI_DMA_FROMDEVICE); + cb = (struct nes_rskb_cb *)&skb->cb[0]; + cb->busaddr = pmem; + cb->maplen = nesvnic->max_frame_size; + + nic_rqe = &nesvnic->nic.rq_vbase[counter]; + nic_rqe->wqe_words[NES_NIC_RQ_WQE_LENGTH_1_0_IDX] = cpu_to_le32(nesvnic->max_frame_size); + nic_rqe->wqe_words[NES_NIC_RQ_WQE_LENGTH_3_2_IDX] = 0; + nic_rqe->wqe_words[NES_NIC_RQ_WQE_FRAG0_LOW_IDX] = cpu_to_le32((u32)pmem); + nic_rqe->wqe_words[NES_NIC_RQ_WQE_FRAG0_HIGH_IDX] = cpu_to_le32((u32)((u64)pmem >> 32)); + nesvnic->nic.rx_skb[counter] = skb; + } + + wqe_count = NES_NIC_WQ_SIZE - 1; + nesvnic->nic.rq_head = wqe_count; + barrier(); + do { + counter = min(wqe_count, ((u32)255)); + wqe_count -= counter; + nes_write32(nesdev->regs+NES_WQE_ALLOC, (counter << 24) | nesvnic->nic.qp_id); + } while (wqe_count); + init_timer(&nesvnic->rq_wqes_timer); + nesvnic->rq_wqes_timer.function = nes_rq_wqes_timeout; + nesvnic->rq_wqes_timer.data = (unsigned long)nesvnic; + nes_debug(NES_DBG_INIT, "NAPI support Enabled\n"); + if (nesdev->nesadapter->et_use_adaptive_rx_coalesce) + { + nes_nic_init_timer(nesdev); + if (netdev->mtu > 1500) + jumbomode = 1; + 
nes_nic_init_timer_defaults(nesdev, jumbomode); + } + if ((nesdev->nesadapter->allow_unaligned_fpdus) && + (nes_init_mgt_qp(nesdev, netdev, nesvnic))) { + nes_debug(NES_DBG_INIT, "%s: Out of memory for pau nic\n", netdev->name); + nes_destroy_nic_qp(nesvnic); + return -ENOMEM; + } + + nesvnic->lro_mgr.max_aggr = nes_lro_max_aggr; + nesvnic->lro_mgr.max_desc = NES_MAX_LRO_DESCRIPTORS; + nesvnic->lro_mgr.lro_arr = nesvnic->lro_desc; + nesvnic->lro_mgr.get_skb_header = nes_lro_get_skb_hdr; + nesvnic->lro_mgr.features = LRO_F_NAPI | LRO_F_EXTRACT_VLAN_ID; + nesvnic->lro_mgr.dev = netdev; + nesvnic->lro_mgr.ip_summed = CHECKSUM_UNNECESSARY; + nesvnic->lro_mgr.ip_summed_aggr = CHECKSUM_UNNECESSARY; + return 0; +} + + +/** + * nes_destroy_nic_qp + */ +void nes_destroy_nic_qp(struct nes_vnic *nesvnic) +{ + u64 u64temp; + dma_addr_t bus_address; + struct nes_device *nesdev = nesvnic->nesdev; + struct nes_hw_cqp_wqe *cqp_wqe; + struct nes_hw_nic_sq_wqe *nic_sqe; + __le16 *wqe_fragment_length; + u16 wqe_fragment_index; + u32 cqp_head; + u32 wqm_cfg0; + unsigned long flags; + struct sk_buff *rx_skb; + struct nes_rskb_cb *cb; + int ret; + + if (nesdev->nesadapter->allow_unaligned_fpdus) + nes_destroy_mgt(nesvnic); + + /* clear wqe stall before destroying NIC QP */ + wqm_cfg0 = nes_read_indexed(nesdev, NES_IDX_WQM_CONFIG0); + nes_write_indexed(nesdev, NES_IDX_WQM_CONFIG0, wqm_cfg0 & 0xFFFF7FFF); + + /* Free remaining NIC receive buffers */ + while (nesvnic->nic.rq_head != nesvnic->nic.rq_tail) { + rx_skb = nesvnic->nic.rx_skb[nesvnic->nic.rq_tail]; + cb = (struct nes_rskb_cb *)&rx_skb->cb[0]; + pci_unmap_single(nesdev->pcidev, cb->busaddr, cb->maplen, + PCI_DMA_FROMDEVICE); + + dev_kfree_skb(nesvnic->nic.rx_skb[nesvnic->nic.rq_tail++]); + nesvnic->nic.rq_tail &= (nesvnic->nic.rq_size - 1); + } + + /* Free remaining NIC transmit buffers */ + while (nesvnic->nic.sq_head != nesvnic->nic.sq_tail) { + nic_sqe = &nesvnic->nic.sq_vbase[nesvnic->nic.sq_tail]; + wqe_fragment_index = 1; + wqe_fragment_length = (__le16 *) + &nic_sqe->wqe_words[NES_NIC_SQ_WQE_LENGTH_0_TAG_IDX]; + /* bump past the vlan tag */ + wqe_fragment_length++; + if (le16_to_cpu(wqe_fragment_length[wqe_fragment_index]) != 0) { + u64temp = (u64)le32_to_cpu( + nic_sqe->wqe_words[NES_NIC_SQ_WQE_FRAG0_LOW_IDX+ + wqe_fragment_index*2]); + u64temp += ((u64)le32_to_cpu( + nic_sqe->wqe_words[NES_NIC_SQ_WQE_FRAG0_HIGH_IDX + + wqe_fragment_index*2]))<<32; + bus_address = (dma_addr_t)u64temp; + if (test_and_clear_bit(nesvnic->nic.sq_tail, + nesvnic->nic.first_frag_overflow)) { + pci_unmap_single(nesdev->pcidev, + bus_address, + le16_to_cpu(wqe_fragment_length[ + wqe_fragment_index++]), + PCI_DMA_TODEVICE); + } + for (; wqe_fragment_index < 5; wqe_fragment_index++) { + if (wqe_fragment_length[wqe_fragment_index]) { + u64temp = le32_to_cpu( + nic_sqe->wqe_words[ + NES_NIC_SQ_WQE_FRAG0_LOW_IDX+ + wqe_fragment_index*2]); + u64temp += ((u64)le32_to_cpu( + nic_sqe->wqe_words[ + NES_NIC_SQ_WQE_FRAG0_HIGH_IDX+ + wqe_fragment_index*2]))<<32; + bus_address = (dma_addr_t)u64temp; + pci_unmap_page(nesdev->pcidev, + bus_address, + le16_to_cpu( + wqe_fragment_length[ + wqe_fragment_index]), + PCI_DMA_TODEVICE); + } else + break; + } + } + if (nesvnic->nic.tx_skb[nesvnic->nic.sq_tail]) + dev_kfree_skb( + nesvnic->nic.tx_skb[nesvnic->nic.sq_tail]); + + nesvnic->nic.sq_tail = (nesvnic->nic.sq_tail + 1) + & (nesvnic->nic.sq_size - 1); + } + + spin_lock_irqsave(&nesdev->cqp.lock, flags); + + /* Destroy NIC QP */ + cqp_head = nesdev->cqp.sq_head; + cqp_wqe = 
&nesdev->cqp.sq_vbase[cqp_head]; + nes_fill_init_cqp_wqe(cqp_wqe, nesdev); + + set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_WQE_OPCODE_IDX, + (NES_CQP_DESTROY_QP | NES_CQP_QP_TYPE_NIC)); + set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_WQE_ID_IDX, + nesvnic->nic.qp_id); + + if (++cqp_head >= nesdev->cqp.sq_size) + cqp_head = 0; + + cqp_wqe = &nesdev->cqp.sq_vbase[cqp_head]; + + /* Destroy NIC CQ */ + nes_fill_init_cqp_wqe(cqp_wqe, nesdev); + set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_WQE_OPCODE_IDX, + (NES_CQP_DESTROY_CQ | ((u32)nesvnic->nic_cq.cq_size << 16))); + set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_WQE_ID_IDX, + (nesvnic->nic_cq.cq_number | ((u32)nesdev->nic_ceq_index << 16))); + + if (++cqp_head >= nesdev->cqp.sq_size) + cqp_head = 0; + + nesdev->cqp.sq_head = cqp_head; + barrier(); + + /* Ring doorbell (2 WQEs) */ + nes_write32(nesdev->regs+NES_WQE_ALLOC, 0x02800000 | nesdev->cqp.qp_id); + + spin_unlock_irqrestore(&nesdev->cqp.lock, flags); + nes_debug(NES_DBG_SHUTDOWN, "Waiting for CQP, cqp_head=%u, cqp.sq_head=%u," + " cqp.sq_tail=%u, cqp.sq_size=%u\n", + cqp_head, nesdev->cqp.sq_head, + nesdev->cqp.sq_tail, nesdev->cqp.sq_size); + + ret = wait_event_timeout(nesdev->cqp.waitq, (nesdev->cqp.sq_tail == cqp_head), + NES_EVENT_TIMEOUT); + + nes_debug(NES_DBG_SHUTDOWN, "Destroy NIC QP returned, wait_event_timeout ret = %u, cqp_head=%u," + " cqp.sq_head=%u, cqp.sq_tail=%u\n", + ret, cqp_head, nesdev->cqp.sq_head, nesdev->cqp.sq_tail); + if (!ret) { + nes_debug(NES_DBG_SHUTDOWN, "NIC QP%u destroy timeout expired\n", + nesvnic->nic.qp_id); + } + + pci_free_consistent(nesdev->pcidev, nesvnic->nic_mem_size, nesvnic->nic_vbase, + nesvnic->nic_pbase); + + /* restore old wqm_cfg0 value */ + nes_write_indexed(nesdev, NES_IDX_WQM_CONFIG0, wqm_cfg0); +} + +/** + * nes_napi_isr + */ +int nes_napi_isr(struct nes_device *nesdev) +{ + struct nes_adapter *nesadapter = nesdev->nesadapter; + u32 int_stat; + + if (nesdev->napi_isr_ran) { + /* interrupt status has already been read in ISR */ + int_stat = nesdev->int_stat; + } else { + int_stat = nes_read32(nesdev->regs + NES_INT_STAT); + nesdev->int_stat = int_stat; + nesdev->napi_isr_ran = 1; + } + + int_stat &= nesdev->int_req; + /* iff NIC, process here, else wait for DPC */ + if ((int_stat) && ((int_stat & 0x0000ff00) == int_stat)) { + nesdev->napi_isr_ran = 0; + nes_write32(nesdev->regs + NES_INT_STAT, + (int_stat & + ~(NES_INT_INTF | NES_INT_TIMER | NES_INT_MAC0 | NES_INT_MAC1 | NES_INT_MAC2 | NES_INT_MAC3))); + + /* Process the CEQs */ + nes_process_ceq(nesdev, &nesdev->nesadapter->ceq[nesdev->nic_ceq_index]); + + if (unlikely((((nesadapter->et_rx_coalesce_usecs_irq) && + (!nesadapter->et_use_adaptive_rx_coalesce)) || + ((nesadapter->et_use_adaptive_rx_coalesce) && + (nesdev->deepcq_count > nesadapter->et_pkt_rate_low))))) { + if ((nesdev->int_req & NES_INT_TIMER) == 0) { + /* Enable Periodic timer interrupts */ + nesdev->int_req |= NES_INT_TIMER; + /* ack any pending periodic timer interrupts so we don't get an immediate interrupt */ + /* TODO: need to also ack other unused periodic timer values, get from nesadapter */ + nes_write32(nesdev->regs+NES_TIMER_STAT, + nesdev->timer_int_req | ~(nesdev->nesadapter->timer_int_req)); + nes_write32(nesdev->regs+NES_INTF_INT_MASK, + ~(nesdev->intf_int_req | NES_INTF_PERIODIC_TIMER)); + } + + if (unlikely(nesadapter->et_use_adaptive_rx_coalesce)) + { + nes_nic_init_timer(nesdev); + } + /* Enable interrupts, except CEQs */ + nes_write32(nesdev->regs+NES_INT_MASK, 0x0000ffff | 
(~nesdev->int_req)); + } else { + /* Enable interrupts, make sure timer is off */ + nesdev->int_req &= ~NES_INT_TIMER; + nes_write32(nesdev->regs+NES_INTF_INT_MASK, ~(nesdev->intf_int_req)); + nes_write32(nesdev->regs+NES_INT_MASK, ~nesdev->int_req); + } + nesdev->deepcq_count = 0; + return 1; + } else { + return 0; + } +} + +static void process_critical_error(struct nes_device *nesdev) +{ + u32 debug_error; + u32 nes_idx_debug_error_masks0 = 0; + u16 error_module = 0; + + debug_error = nes_read_indexed(nesdev, NES_IDX_DEBUG_ERROR_CONTROL_STATUS); + printk(KERN_ERR PFX "Critical Error reported by device!!! 0x%02X\n", + (u16)debug_error); + nes_write_indexed(nesdev, NES_IDX_DEBUG_ERROR_CONTROL_STATUS, + 0x01010000 | (debug_error & 0x0000ffff)); + if (crit_err_count++ > 10) + nes_write_indexed(nesdev, NES_IDX_DEBUG_ERROR_MASKS1, 1 << 0x17); + error_module = (u16) (debug_error & 0x1F00) >> 8; + if (++nesdev->nesadapter->crit_error_count[error_module-1] >= + nes_max_critical_error_count) { + printk(KERN_ERR PFX "Masking off critical error for module " + "0x%02X\n", (u16)error_module); + nes_idx_debug_error_masks0 = nes_read_indexed(nesdev, + NES_IDX_DEBUG_ERROR_MASKS0); + nes_write_indexed(nesdev, NES_IDX_DEBUG_ERROR_MASKS0, + nes_idx_debug_error_masks0 | (1 << error_module)); + } +} +/** + * nes_dpc + */ +void nes_dpc(unsigned long param) +{ + struct nes_device *nesdev = (struct nes_device *)param; + struct nes_adapter *nesadapter = nesdev->nesadapter; + u32 counter; + u32 loop_counter = 0; + u32 int_status_bit; + u32 int_stat; + u32 timer_stat; + u32 temp_int_stat; + u32 intf_int_stat; + u32 processed_intf_int = 0; + u16 processed_timer_int = 0; + u16 completion_ints = 0; + u16 timer_ints = 0; + + /* nes_debug(NES_DBG_ISR, "\n"); */ + + do { + timer_stat = 0; + if (nesdev->napi_isr_ran) { + nesdev->napi_isr_ran = 0; + int_stat = nesdev->int_stat; + } else + int_stat = nes_read32(nesdev->regs+NES_INT_STAT); + if (processed_intf_int != 0) + int_stat &= nesdev->int_req & ~NES_INT_INTF; + else + int_stat &= nesdev->int_req; + if (processed_timer_int == 0) { + processed_timer_int = 1; + if (int_stat & NES_INT_TIMER) { + timer_stat = nes_read32(nesdev->regs + NES_TIMER_STAT); + if ((timer_stat & nesdev->timer_int_req) == 0) { + int_stat &= ~NES_INT_TIMER; + } + } + } else { + int_stat &= ~NES_INT_TIMER; + } + + if (int_stat) { + if (int_stat & ~(NES_INT_INTF | NES_INT_TIMER | NES_INT_MAC0| + NES_INT_MAC1|NES_INT_MAC2 | NES_INT_MAC3)) { + /* Ack the interrupts */ + nes_write32(nesdev->regs+NES_INT_STAT, + (int_stat & ~(NES_INT_INTF | NES_INT_TIMER | NES_INT_MAC0| + NES_INT_MAC1 | NES_INT_MAC2 | NES_INT_MAC3))); + } + + temp_int_stat = int_stat; + for (counter = 0, int_status_bit = 1; counter < 16; counter++) { + if (int_stat & int_status_bit) { + nes_process_ceq(nesdev, &nesadapter->ceq[counter]); + temp_int_stat &= ~int_status_bit; + completion_ints = 1; + } + if (!(temp_int_stat & 0x0000ffff)) + break; + int_status_bit <<= 1; + } + + /* Process the AEQ for this pci function */ + int_status_bit = 1 << (16 + PCI_FUNC(nesdev->pcidev->devfn)); + if (int_stat & int_status_bit) { + nes_process_aeq(nesdev, &nesadapter->aeq[PCI_FUNC(nesdev->pcidev->devfn)]); + } + + /* Process the MAC interrupt for this pci function */ + int_status_bit = 1 << (24 + nesdev->mac_index); + if (int_stat & int_status_bit) { + nes_process_mac_intr(nesdev, nesdev->mac_index); + } + + if (int_stat & NES_INT_TIMER) { + if (timer_stat & nesdev->timer_int_req) { + nes_write32(nesdev->regs + NES_TIMER_STAT, + (timer_stat & 
nesdev->timer_int_req) | + ~(nesdev->nesadapter->timer_int_req)); + timer_ints = 1; + } + } + + if (int_stat & NES_INT_INTF) { + processed_intf_int = 1; + intf_int_stat = nes_read32(nesdev->regs+NES_INTF_INT_STAT); + intf_int_stat &= nesdev->intf_int_req; + if (NES_INTF_INT_CRITERR & intf_int_stat) { + process_critical_error(nesdev); + } + if (NES_INTF_INT_PCIERR & intf_int_stat) { + printk(KERN_ERR PFX "PCI Error reported by device!!!\n"); + BUG(); + } + if (NES_INTF_INT_AEQ_OFLOW & intf_int_stat) { + printk(KERN_ERR PFX "AEQ Overflow reported by device!!!\n"); + BUG(); + } + nes_write32(nesdev->regs+NES_INTF_INT_STAT, intf_int_stat); + } + + if (int_stat & NES_INT_TSW) { + } + } + /* Don't use the interface interrupt bit stay in loop */ + int_stat &= ~NES_INT_INTF | NES_INT_TIMER | NES_INT_MAC0 | + NES_INT_MAC1 | NES_INT_MAC2 | NES_INT_MAC3; + } while ((int_stat != 0) && (loop_counter++ < MAX_DPC_ITERATIONS)); + + if (timer_ints == 1) { + if ((nesadapter->et_rx_coalesce_usecs_irq) || (nesadapter->et_use_adaptive_rx_coalesce)) { + if (completion_ints == 0) { + nesdev->timer_only_int_count++; + if (nesdev->timer_only_int_count>=nesadapter->timer_int_limit) { + nesdev->timer_only_int_count = 0; + nesdev->int_req &= ~NES_INT_TIMER; + nes_write32(nesdev->regs + NES_INTF_INT_MASK, ~(nesdev->intf_int_req)); + nes_write32(nesdev->regs + NES_INT_MASK, ~nesdev->int_req); + } else { + nes_write32(nesdev->regs+NES_INT_MASK, 0x0000ffff | (~nesdev->int_req)); + } + } else { + if (unlikely(nesadapter->et_use_adaptive_rx_coalesce)) + { + nes_nic_init_timer(nesdev); + } + nesdev->timer_only_int_count = 0; + nes_write32(nesdev->regs+NES_INT_MASK, 0x0000ffff | (~nesdev->int_req)); + } + } else { + nesdev->timer_only_int_count = 0; + nesdev->int_req &= ~NES_INT_TIMER; + nes_write32(nesdev->regs+NES_INTF_INT_MASK, ~(nesdev->intf_int_req)); + nes_write32(nesdev->regs+NES_TIMER_STAT, + nesdev->timer_int_req | ~(nesdev->nesadapter->timer_int_req)); + nes_write32(nesdev->regs+NES_INT_MASK, ~nesdev->int_req); + } + } else { + if ( (completion_ints == 1) && + (((nesadapter->et_rx_coalesce_usecs_irq) && + (!nesadapter->et_use_adaptive_rx_coalesce)) || + ((nesdev->deepcq_count > nesadapter->et_pkt_rate_low) && + (nesadapter->et_use_adaptive_rx_coalesce) )) ) { + /* nes_debug(NES_DBG_ISR, "Enabling periodic timer interrupt.\n" ); */ + nesdev->timer_only_int_count = 0; + nesdev->int_req |= NES_INT_TIMER; + nes_write32(nesdev->regs+NES_TIMER_STAT, + nesdev->timer_int_req | ~(nesdev->nesadapter->timer_int_req)); + nes_write32(nesdev->regs+NES_INTF_INT_MASK, + ~(nesdev->intf_int_req | NES_INTF_PERIODIC_TIMER)); + nes_write32(nesdev->regs+NES_INT_MASK, 0x0000ffff | (~nesdev->int_req)); + } else { + nes_write32(nesdev->regs+NES_INT_MASK, ~nesdev->int_req); + } + } + nesdev->deepcq_count = 0; +} + + +/** + * nes_process_ceq + */ +static void nes_process_ceq(struct nes_device *nesdev, struct nes_hw_ceq *ceq) +{ + u64 u64temp; + struct nes_hw_cq *cq; + u32 head; + u32 ceq_size; + + /* nes_debug(NES_DBG_CQ, "\n"); */ + head = ceq->ceq_head; + ceq_size = ceq->ceq_size; + + do { + if (le32_to_cpu(ceq->ceq_vbase[head].ceqe_words[NES_CEQE_CQ_CTX_HIGH_IDX]) & + NES_CEQE_VALID) { + u64temp = (((u64)(le32_to_cpu(ceq->ceq_vbase[head].ceqe_words[NES_CEQE_CQ_CTX_HIGH_IDX]))) << 32) | + ((u64)(le32_to_cpu(ceq->ceq_vbase[head].ceqe_words[NES_CEQE_CQ_CTX_LOW_IDX]))); + u64temp <<= 1; + cq = *((struct nes_hw_cq **)&u64temp); + /* nes_debug(NES_DBG_CQ, "pCQ = %p\n", cq); */ + barrier(); + 
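As an illustrative aside (not part of the driver source above), nes_process_ceq() rebuilds the struct nes_hw_cq pointer by joining the CEQE high and low context words and shifting left by one, matching the CreateCQ WQE earlier in this file that stored the pointer as (ptr >> 1) in the LOW word and (ptr >> 33) in the HIGH word. A self-contained sketch of that packing, with hypothetical names, assuming the pointer's least significant bit is zero (the structure is more than byte-aligned):

#include <stdint.h>

/* Hypothetical mirror of the CQ-context packing seen above: LOW holds
 * bits 1..32 of the pointer, HIGH holds bits 33..63; unpacking shifts
 * the joined value left by one, so bit 0 (assumed zero) is the only
 * bit dropped. */
static void cq_ctx_pack(uint64_t ptr, uint32_t *low, uint32_t *high)
{
	*low  = (uint32_t)(ptr >> 1);
	*high = (uint32_t)(ptr >> 33) & 0x7FFFFFFF;
}

static uint64_t cq_ctx_unpack(uint32_t low, uint32_t high)
{
	return (((uint64_t)high << 32) | low) << 1;
}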
ceq->ceq_vbase[head].ceqe_words[NES_CEQE_CQ_CTX_HIGH_IDX] = 0; + + /* call the event handler */ + cq->ce_handler(nesdev, cq); + + if (++head >= ceq_size) + head = 0; + } else { + break; + } + + } while (1); + + ceq->ceq_head = head; +} + + +/** + * nes_process_aeq + */ +static void nes_process_aeq(struct nes_device *nesdev, struct nes_hw_aeq *aeq) +{ + /* u64 u64temp; */ + u32 head; + u32 aeq_size; + u32 aeqe_misc; + u32 aeqe_cq_id; + struct nes_hw_aeqe volatile *aeqe; + + head = aeq->aeq_head; + aeq_size = aeq->aeq_size; + + do { + aeqe = &aeq->aeq_vbase[head]; + if ((le32_to_cpu(aeqe->aeqe_words[NES_AEQE_MISC_IDX]) & NES_AEQE_VALID) == 0) + break; + aeqe_misc = le32_to_cpu(aeqe->aeqe_words[NES_AEQE_MISC_IDX]); + aeqe_cq_id = le32_to_cpu(aeqe->aeqe_words[NES_AEQE_COMP_QP_CQ_ID_IDX]); + if (aeqe_misc & (NES_AEQE_QP|NES_AEQE_CQ)) { + if (aeqe_cq_id >= NES_FIRST_QPN) { + /* dealing with an accelerated QP related AE */ + /* + * u64temp = (((u64)(le32_to_cpu(aeqe->aeqe_words[NES_AEQE_COMP_CTXT_HIGH_IDX]))) << 32) | + * ((u64)(le32_to_cpu(aeqe->aeqe_words[NES_AEQE_COMP_CTXT_LOW_IDX]))); + */ + nes_process_iwarp_aeqe(nesdev, (struct nes_hw_aeqe *)aeqe); + } else { + /* TODO: dealing with a CQP related AE */ + nes_debug(NES_DBG_AEQ, "Processing CQP related AE, misc = 0x%04X\n", + (u16)(aeqe_misc >> 16)); + } + } + + aeqe->aeqe_words[NES_AEQE_MISC_IDX] = 0; + + if (++head >= aeq_size) + head = 0; + + nes_write32(nesdev->regs + NES_AEQ_ALLOC, 1 << 16); + } + while (1); + aeq->aeq_head = head; +} + +static void nes_reset_link(struct nes_device *nesdev, u32 mac_index) +{ + struct nes_adapter *nesadapter = nesdev->nesadapter; + u32 reset_value; + u32 i=0; + u32 u32temp; + + if (nesadapter->hw_rev == NE020_REV) { + return; + } + mh_detected++; + + reset_value = nes_read32(nesdev->regs+NES_SOFTWARE_RESET); + + if ((mac_index == 0) || ((mac_index == 1) && (nesadapter->OneG_Mode))) + reset_value |= 0x0000001d; + else + reset_value |= 0x0000002d; + + if (4 <= (nesadapter->link_interrupt_count[mac_index] / ((u16)NES_MAX_LINK_INTERRUPTS))) { + if ((!nesadapter->OneG_Mode) && (nesadapter->port_count == 2)) { + nesadapter->link_interrupt_count[0] = 0; + nesadapter->link_interrupt_count[1] = 0; + u32temp = nes_read_indexed(nesdev, NES_IDX_ETH_SERDES_COMMON_CONTROL1); + if (0x00000040 & u32temp) + nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_COMMON_CONTROL1, 0x0000F088); + else + nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_COMMON_CONTROL1, 0x0000F0C8); + + reset_value |= 0x0000003d; + } + nesadapter->link_interrupt_count[mac_index] = 0; + } + + nes_write32(nesdev->regs+NES_SOFTWARE_RESET, reset_value); + + while (((nes_read32(nesdev->regs+NES_SOFTWARE_RESET) + & 0x00000040) != 0x00000040) && (i++ < 5000)); + + if (0x0000003d == (reset_value & 0x0000003d)) { + u32 pcs_control_status0, pcs_control_status1; + + for (i = 0; i < 10; i++) { + pcs_control_status0 = nes_read_indexed(nesdev, NES_IDX_PHY_PCS_CONTROL_STATUS0); + pcs_control_status1 = nes_read_indexed(nesdev, NES_IDX_PHY_PCS_CONTROL_STATUS0 + 0x200); + if (((0x0F000000 == (pcs_control_status0 & 0x0F000000)) + && (pcs_control_status0 & 0x00100000)) + || ((0x0F000000 == (pcs_control_status1 & 0x0F000000)) + && (pcs_control_status1 & 0x00100000))) + continue; + else + break; + } + if (10 == i) { + u32temp = nes_read_indexed(nesdev, NES_IDX_ETH_SERDES_COMMON_CONTROL1); + if (0x00000040 & u32temp) + nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_COMMON_CONTROL1, 0x0000F088); + else + nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_COMMON_CONTROL1, 0x0000F0C8); + + 
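As an illustrative aside (not part of the driver source above), two ring-index styles appear in this code: the CEQ and AEQ loops wrap the head with an explicit compare (if (++head >= size) head = 0;), while the NIC SQ/RQ and CQP rings wrap by masking (head &= size - 1), which relies on the ring size being a power of two. A minimal sketch contrasting the two, with hypothetical names:

#include <stdint.h>

/* Wrap by comparison: correct for any ring size. */
static uint32_t ring_advance_cmp(uint32_t head, uint32_t size)
{
	if (++head >= size)
		head = 0;
	return head;
}

/* Wrap by masking: correct only when size is a power of two. */
static uint32_t ring_advance_mask(uint32_t head, uint32_t size)
{
	return (head + 1) & (size - 1);
}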
nes_write32(nesdev->regs+NES_SOFTWARE_RESET, reset_value); + + while (((nes_read32(nesdev->regs + NES_SOFTWARE_RESET) + & 0x00000040) != 0x00000040) && (i++ < 5000)); + } + } +} + +/** + * nes_process_mac_intr + */ +static void nes_process_mac_intr(struct nes_device *nesdev, u32 mac_number) +{ + unsigned long flags; + u32 pcs_control_status; + struct nes_adapter *nesadapter = nesdev->nesadapter; + struct nes_vnic *nesvnic; + u32 mac_status; + u32 mac_index = nesdev->mac_index; + u32 u32temp; + u16 phy_data; + u16 temp_phy_data; + u32 pcs_val = 0x0f0f0000; + u32 pcs_mask = 0x0f1f0000; + u32 cdr_ctrl; + + spin_lock_irqsave(&nesadapter->phy_lock, flags); + if (nesadapter->mac_sw_state[mac_number] != NES_MAC_SW_IDLE) { + spin_unlock_irqrestore(&nesadapter->phy_lock, flags); + return; + } + nesadapter->mac_sw_state[mac_number] = NES_MAC_SW_INTERRUPT; + + /* ack the MAC interrupt */ + mac_status = nes_read_indexed(nesdev, NES_IDX_MAC_INT_STATUS + (mac_index * 0x200)); + /* Clear the interrupt */ + nes_write_indexed(nesdev, NES_IDX_MAC_INT_STATUS + (mac_index * 0x200), mac_status); + + nes_debug(NES_DBG_PHY, "MAC%u interrupt status = 0x%X.\n", mac_number, mac_status); + + if (mac_status & (NES_MAC_INT_LINK_STAT_CHG | NES_MAC_INT_XGMII_EXT)) { + nesdev->link_status_interrupts++; + if (0 == (++nesadapter->link_interrupt_count[mac_index] % ((u16)NES_MAX_LINK_INTERRUPTS))) + nes_reset_link(nesdev, mac_index); + + /* read the PHY interrupt status register */ + if ((nesadapter->OneG_Mode) && + (nesadapter->phy_type[mac_index] != NES_PHY_TYPE_PUMA_1G)) { + do { + nes_read_1G_phy_reg(nesdev, 0x1a, + nesadapter->phy_index[mac_index], &phy_data); + nes_debug(NES_DBG_PHY, "Phy%d data from register 0x1a = 0x%X.\n", + nesadapter->phy_index[mac_index], phy_data); + } while (phy_data&0x8000); + + temp_phy_data = 0; + do { + nes_read_1G_phy_reg(nesdev, 0x11, + nesadapter->phy_index[mac_index], &phy_data); + nes_debug(NES_DBG_PHY, "Phy%d data from register 0x11 = 0x%X.\n", + nesadapter->phy_index[mac_index], phy_data); + if (temp_phy_data == phy_data) + break; + temp_phy_data = phy_data; + } while (1); + + nes_read_1G_phy_reg(nesdev, 0x1e, + nesadapter->phy_index[mac_index], &phy_data); + nes_debug(NES_DBG_PHY, "Phy%d data from register 0x1e = 0x%X.\n", + nesadapter->phy_index[mac_index], phy_data); + + nes_read_1G_phy_reg(nesdev, 1, + nesadapter->phy_index[mac_index], &phy_data); + nes_debug(NES_DBG_PHY, "1G phy%u data from register 1 = 0x%X\n", + nesadapter->phy_index[mac_index], phy_data); + + if (temp_phy_data & 0x1000) { + nes_debug(NES_DBG_PHY, "The Link is up according to the PHY\n"); + phy_data = 4; + } else { + nes_debug(NES_DBG_PHY, "The Link is down according to the PHY\n"); + } + } + nes_debug(NES_DBG_PHY, "Eth SERDES Common Status: 0=0x%08X, 1=0x%08X\n", + nes_read_indexed(nesdev, NES_IDX_ETH_SERDES_COMMON_STATUS0), + nes_read_indexed(nesdev, NES_IDX_ETH_SERDES_COMMON_STATUS0+0x200)); + + if (nesadapter->phy_type[mac_index] == NES_PHY_TYPE_PUMA_1G) { + switch (mac_index) { + case 1: + case 3: + pcs_control_status = nes_read_indexed(nesdev, + NES_IDX_PHY_PCS_CONTROL_STATUS0 + 0x200); + break; + default: + pcs_control_status = nes_read_indexed(nesdev, + NES_IDX_PHY_PCS_CONTROL_STATUS0); + break; + } + } else { + pcs_control_status = nes_read_indexed(nesdev, + NES_IDX_PHY_PCS_CONTROL_STATUS0 + ((mac_index & 1) * 0x200)); + pcs_control_status = nes_read_indexed(nesdev, + NES_IDX_PHY_PCS_CONTROL_STATUS0 + ((mac_index & 1) * 0x200)); + } + + nes_debug(NES_DBG_PHY, "PCS PHY Control/Status%u: 0x%08X\n", + 
mac_index, pcs_control_status); + if ((nesadapter->OneG_Mode) && + (nesadapter->phy_type[mac_index] != NES_PHY_TYPE_PUMA_1G)) { + u32temp = 0x01010000; + if (nesadapter->port_count > 2) { + u32temp |= 0x02020000; + } + if ((pcs_control_status & u32temp)!= u32temp) { + phy_data = 0; + nes_debug(NES_DBG_PHY, "PCS says the link is down\n"); + } + } else { + switch (nesadapter->phy_type[mac_index]) { + case NES_PHY_TYPE_ARGUS: + case NES_PHY_TYPE_SFP_D: + case NES_PHY_TYPE_KR: + /* clear the alarms */ + nes_read_10G_phy_reg(nesdev, nesadapter->phy_index[mac_index], 4, 0x0008); + nes_read_10G_phy_reg(nesdev, nesadapter->phy_index[mac_index], 4, 0xc001); + nes_read_10G_phy_reg(nesdev, nesadapter->phy_index[mac_index], 4, 0xc002); + nes_read_10G_phy_reg(nesdev, nesadapter->phy_index[mac_index], 4, 0xc005); + nes_read_10G_phy_reg(nesdev, nesadapter->phy_index[mac_index], 4, 0xc006); + nes_read_10G_phy_reg(nesdev, nesadapter->phy_index[mac_index], 1, 0x9003); + nes_read_10G_phy_reg(nesdev, nesadapter->phy_index[mac_index], 1, 0x9004); + nes_read_10G_phy_reg(nesdev, nesadapter->phy_index[mac_index], 1, 0x9005); + /* check link status */ + nes_read_10G_phy_reg(nesdev, nesadapter->phy_index[mac_index], 1, 0x9003); + temp_phy_data = (u16)nes_read_indexed(nesdev, NES_IDX_MAC_MDIO_CONTROL); + + nes_read_10G_phy_reg(nesdev, nesadapter->phy_index[mac_index], 3, 0x0021); + nes_read_indexed(nesdev, NES_IDX_MAC_MDIO_CONTROL); + nes_read_10G_phy_reg(nesdev, nesadapter->phy_index[mac_index], 3, 0x0021); + phy_data = (u16)nes_read_indexed(nesdev, NES_IDX_MAC_MDIO_CONTROL); + + phy_data = (!temp_phy_data && (phy_data == 0x8000)) ? 0x4 : 0x0; + + nes_debug(NES_DBG_PHY, "%s: Phy data = 0x%04X, link was %s.\n", + __func__, phy_data, nesadapter->mac_link_down[mac_index] ? "DOWN" : "UP"); + break; + + case NES_PHY_TYPE_PUMA_1G: + if (mac_index < 2) + pcs_val = pcs_mask = 0x01010000; + else + pcs_val = pcs_mask = 0x02020000; + /* fall through */ + default: + phy_data = (pcs_val == (pcs_control_status & pcs_mask)) ? 0x4 : 0x0; + break; + } + } + + if (phy_data & 0x0004) { + if (wide_ppm_offset && + (nesadapter->phy_type[mac_index] == NES_PHY_TYPE_CX4) && + (nesadapter->hw_rev != NE020_REV)) { + cdr_ctrl = nes_read_indexed(nesdev, + NES_IDX_ETH_SERDES_CDR_CONTROL0 + + mac_index * 0x200); + nes_write_indexed(nesdev, + NES_IDX_ETH_SERDES_CDR_CONTROL0 + + mac_index * 0x200, + cdr_ctrl | 0x000F0000); + } + nesadapter->mac_link_down[mac_index] = 0; + list_for_each_entry(nesvnic, &nesadapter->nesvnic_list[mac_index], list) { + nes_debug(NES_DBG_PHY, "The Link is UP!!. 
linkup was %d\n", + nesvnic->linkup); + if (nesvnic->linkup == 0) { + printk(PFX "The Link is now up for port %s, netdev %p.\n", + nesvnic->netdev->name, nesvnic->netdev); + if (netif_queue_stopped(nesvnic->netdev)) + netif_start_queue(nesvnic->netdev); + nesvnic->linkup = 1; + netif_carrier_on(nesvnic->netdev); + + spin_lock(&nesvnic->port_ibevent_lock); + if (nesvnic->of_device_registered) { + if (nesdev->iw_status == 0) { + nesdev->iw_status = 1; + nes_port_ibevent(nesvnic); + } + } + spin_unlock(&nesvnic->port_ibevent_lock); + } + } + } else { + if (wide_ppm_offset && + (nesadapter->phy_type[mac_index] == NES_PHY_TYPE_CX4) && + (nesadapter->hw_rev != NE020_REV)) { + cdr_ctrl = nes_read_indexed(nesdev, + NES_IDX_ETH_SERDES_CDR_CONTROL0 + + mac_index * 0x200); + nes_write_indexed(nesdev, + NES_IDX_ETH_SERDES_CDR_CONTROL0 + + mac_index * 0x200, + cdr_ctrl & 0xFFF0FFFF); + } + nesadapter->mac_link_down[mac_index] = 1; + list_for_each_entry(nesvnic, &nesadapter->nesvnic_list[mac_index], list) { + nes_debug(NES_DBG_PHY, "The Link is Down!!. linkup was %d\n", + nesvnic->linkup); + if (nesvnic->linkup == 1) { + printk(PFX "The Link is now down for port %s, netdev %p.\n", + nesvnic->netdev->name, nesvnic->netdev); + if (!(netif_queue_stopped(nesvnic->netdev))) + netif_stop_queue(nesvnic->netdev); + nesvnic->linkup = 0; + netif_carrier_off(nesvnic->netdev); + + spin_lock(&nesvnic->port_ibevent_lock); + if (nesvnic->of_device_registered) { + if (nesdev->iw_status == 1) { + nesdev->iw_status = 0; + nes_port_ibevent(nesvnic); + } + } + spin_unlock(&nesvnic->port_ibevent_lock); + } + } + } + if (nesadapter->phy_type[mac_index] == NES_PHY_TYPE_SFP_D) { + nesdev->link_recheck = 1; + mod_delayed_work(system_wq, &nesdev->work, + NES_LINK_RECHECK_DELAY); + } + } + + spin_unlock_irqrestore(&nesadapter->phy_lock, flags); + + nesadapter->mac_sw_state[mac_number] = NES_MAC_SW_IDLE; +} + +void nes_recheck_link_status(struct work_struct *work) +{ + unsigned long flags; + struct nes_device *nesdev = container_of(work, struct nes_device, work.work); + struct nes_adapter *nesadapter = nesdev->nesadapter; + struct nes_vnic *nesvnic; + u32 mac_index = nesdev->mac_index; + u16 phy_data; + u16 temp_phy_data; + + spin_lock_irqsave(&nesadapter->phy_lock, flags); + + /* check link status */ + nes_read_10G_phy_reg(nesdev, nesadapter->phy_index[mac_index], 1, 0x9003); + temp_phy_data = (u16)nes_read_indexed(nesdev, NES_IDX_MAC_MDIO_CONTROL); + + nes_read_10G_phy_reg(nesdev, nesadapter->phy_index[mac_index], 3, 0x0021); + nes_read_indexed(nesdev, NES_IDX_MAC_MDIO_CONTROL); + nes_read_10G_phy_reg(nesdev, nesadapter->phy_index[mac_index], 3, 0x0021); + phy_data = (u16)nes_read_indexed(nesdev, NES_IDX_MAC_MDIO_CONTROL); + + phy_data = (!temp_phy_data && (phy_data == 0x8000)) ? 0x4 : 0x0; + + nes_debug(NES_DBG_PHY, "%s: Phy data = 0x%04X, link was %s.\n", + __func__, phy_data, + nesadapter->mac_link_down[mac_index] ? 
"DOWN" : "UP"); + + if (phy_data & 0x0004) { + nesadapter->mac_link_down[mac_index] = 0; + list_for_each_entry(nesvnic, &nesadapter->nesvnic_list[mac_index], list) { + if (nesvnic->linkup == 0) { + printk(PFX "The Link is now up for port %s, netdev %p.\n", + nesvnic->netdev->name, nesvnic->netdev); + if (netif_queue_stopped(nesvnic->netdev)) + netif_start_queue(nesvnic->netdev); + nesvnic->linkup = 1; + netif_carrier_on(nesvnic->netdev); + + spin_lock(&nesvnic->port_ibevent_lock); + if (nesvnic->of_device_registered) { + if (nesdev->iw_status == 0) { + nesdev->iw_status = 1; + nes_port_ibevent(nesvnic); + } + } + spin_unlock(&nesvnic->port_ibevent_lock); + } + } + + } else { + nesadapter->mac_link_down[mac_index] = 1; + list_for_each_entry(nesvnic, &nesadapter->nesvnic_list[mac_index], list) { + if (nesvnic->linkup == 1) { + printk(PFX "The Link is now down for port %s, netdev %p.\n", + nesvnic->netdev->name, nesvnic->netdev); + if (!(netif_queue_stopped(nesvnic->netdev))) + netif_stop_queue(nesvnic->netdev); + nesvnic->linkup = 0; + netif_carrier_off(nesvnic->netdev); + + spin_lock(&nesvnic->port_ibevent_lock); + if (nesvnic->of_device_registered) { + if (nesdev->iw_status == 1) { + nesdev->iw_status = 0; + nes_port_ibevent(nesvnic); + } + } + spin_unlock(&nesvnic->port_ibevent_lock); + } + } + } + if (nesdev->link_recheck++ < NES_LINK_RECHECK_MAX) + schedule_delayed_work(&nesdev->work, NES_LINK_RECHECK_DELAY); + else + nesdev->link_recheck = 0; + + spin_unlock_irqrestore(&nesadapter->phy_lock, flags); +} + + +static void nes_nic_napi_ce_handler(struct nes_device *nesdev, struct nes_hw_nic_cq *cq) +{ + struct nes_vnic *nesvnic = container_of(cq, struct nes_vnic, nic_cq); + + napi_schedule(&nesvnic->napi); +} + + +/* The MAX_RQES_TO_PROCESS defines how many max read requests to complete before +* getting out of nic_ce_handler +*/ +#define MAX_RQES_TO_PROCESS 384 + +/** + * nes_nic_ce_handler + */ +void nes_nic_ce_handler(struct nes_device *nesdev, struct nes_hw_nic_cq *cq) +{ + u64 u64temp; + dma_addr_t bus_address; + struct nes_hw_nic *nesnic; + struct nes_vnic *nesvnic = container_of(cq, struct nes_vnic, nic_cq); + struct nes_adapter *nesadapter = nesdev->nesadapter; + struct nes_hw_nic_rq_wqe *nic_rqe; + struct nes_hw_nic_sq_wqe *nic_sqe; + struct sk_buff *skb; + struct sk_buff *rx_skb; + struct nes_rskb_cb *cb; + __le16 *wqe_fragment_length; + u32 head; + u32 cq_size; + u32 rx_pkt_size; + u32 cqe_count=0; + u32 cqe_errv; + u32 cqe_misc; + u16 wqe_fragment_index = 1; /* first fragment (0) is used by copy buffer */ + u16 vlan_tag; + u16 pkt_type; + u16 rqes_processed = 0; + u8 sq_cqes = 0; + u8 nes_use_lro = 0; + + head = cq->cq_head; + cq_size = cq->cq_size; + cq->cqes_pending = 1; + if (nesvnic->netdev->features & NETIF_F_LRO) + nes_use_lro = 1; + do { + if (le32_to_cpu(cq->cq_vbase[head].cqe_words[NES_NIC_CQE_MISC_IDX]) & + NES_NIC_CQE_VALID) { + nesnic = &nesvnic->nic; + cqe_misc = le32_to_cpu(cq->cq_vbase[head].cqe_words[NES_NIC_CQE_MISC_IDX]); + if (cqe_misc & NES_NIC_CQE_SQ) { + sq_cqes++; + wqe_fragment_index = 1; + nic_sqe = &nesnic->sq_vbase[nesnic->sq_tail]; + skb = nesnic->tx_skb[nesnic->sq_tail]; + wqe_fragment_length = (__le16 *)&nic_sqe->wqe_words[NES_NIC_SQ_WQE_LENGTH_0_TAG_IDX]; + /* bump past the vlan tag */ + wqe_fragment_length++; + if (le16_to_cpu(wqe_fragment_length[wqe_fragment_index]) != 0) { + u64temp = (u64) le32_to_cpu(nic_sqe->wqe_words[NES_NIC_SQ_WQE_FRAG0_LOW_IDX + + wqe_fragment_index * 2]); + u64temp += 
((u64)le32_to_cpu(nic_sqe->wqe_words[NES_NIC_SQ_WQE_FRAG0_HIGH_IDX + + wqe_fragment_index * 2])) << 32; + bus_address = (dma_addr_t)u64temp; + if (test_and_clear_bit(nesnic->sq_tail, nesnic->first_frag_overflow)) { + pci_unmap_single(nesdev->pcidev, + bus_address, + le16_to_cpu(wqe_fragment_length[wqe_fragment_index++]), + PCI_DMA_TODEVICE); + } + for (; wqe_fragment_index < 5; wqe_fragment_index++) { + if (wqe_fragment_length[wqe_fragment_index]) { + u64temp = le32_to_cpu(nic_sqe->wqe_words[NES_NIC_SQ_WQE_FRAG0_LOW_IDX + + wqe_fragment_index * 2]); + u64temp += ((u64)le32_to_cpu(nic_sqe->wqe_words[NES_NIC_SQ_WQE_FRAG0_HIGH_IDX + + wqe_fragment_index * 2])) <<32; + bus_address = (dma_addr_t)u64temp; + pci_unmap_page(nesdev->pcidev, + bus_address, + le16_to_cpu(wqe_fragment_length[wqe_fragment_index]), + PCI_DMA_TODEVICE); + } else + break; + } + } + if (skb) + dev_kfree_skb_any(skb); + nesnic->sq_tail++; + nesnic->sq_tail &= nesnic->sq_size-1; + if (sq_cqes > 128) { + barrier(); + /* restart the queue if it had been stopped */ + if (netif_queue_stopped(nesvnic->netdev)) + netif_wake_queue(nesvnic->netdev); + sq_cqes = 0; + } + } else { + rqes_processed ++; + + cq->rx_cqes_completed++; + cq->rx_pkts_indicated++; + rx_pkt_size = cqe_misc & 0x0000ffff; + nic_rqe = &nesnic->rq_vbase[nesnic->rq_tail]; + /* Get the skb */ + rx_skb = nesnic->rx_skb[nesnic->rq_tail]; + nic_rqe = &nesnic->rq_vbase[nesvnic->nic.rq_tail]; + bus_address = (dma_addr_t)le32_to_cpu(nic_rqe->wqe_words[NES_NIC_RQ_WQE_FRAG0_LOW_IDX]); + bus_address += ((u64)le32_to_cpu(nic_rqe->wqe_words[NES_NIC_RQ_WQE_FRAG0_HIGH_IDX])) << 32; + pci_unmap_single(nesdev->pcidev, bus_address, + nesvnic->max_frame_size, PCI_DMA_FROMDEVICE); + cb = (struct nes_rskb_cb *)&rx_skb->cb[0]; + cb->busaddr = 0; + /* rx_skb->tail = rx_skb->data + rx_pkt_size; */ + /* rx_skb->len = rx_pkt_size; */ + rx_skb->len = 0; /* TODO: see if this is necessary */ + skb_put(rx_skb, rx_pkt_size); + rx_skb->protocol = eth_type_trans(rx_skb, nesvnic->netdev); + nesnic->rq_tail++; + nesnic->rq_tail &= nesnic->rq_size - 1; + + atomic_inc(&nesvnic->rx_skbs_needed); + if (atomic_read(&nesvnic->rx_skbs_needed) > (nesvnic->nic.rq_size>>1)) { + nes_write32(nesdev->regs+NES_CQE_ALLOC, + cq->cq_number | (cqe_count << 16)); + /* nesadapter->tune_timer.cq_count += cqe_count; */ + nesdev->currcq_count += cqe_count; + cqe_count = 0; + nes_replenish_nic_rq(nesvnic); + } + pkt_type = (u16)(le32_to_cpu(cq->cq_vbase[head].cqe_words[NES_NIC_CQE_TAG_PKT_TYPE_IDX])); + cqe_errv = (cqe_misc & NES_NIC_CQE_ERRV_MASK) >> NES_NIC_CQE_ERRV_SHIFT; + rx_skb->ip_summed = CHECKSUM_NONE; + + if ((NES_PKT_TYPE_TCPV4_BITS == (pkt_type & NES_PKT_TYPE_TCPV4_MASK)) || + (NES_PKT_TYPE_UDPV4_BITS == (pkt_type & NES_PKT_TYPE_UDPV4_MASK))) { + if ((cqe_errv & + (NES_NIC_ERRV_BITS_IPV4_CSUM_ERR | NES_NIC_ERRV_BITS_TCPUDP_CSUM_ERR | + NES_NIC_ERRV_BITS_IPH_ERR | NES_NIC_ERRV_BITS_WQE_OVERRUN)) == 0) { + if (nesvnic->netdev->features & NETIF_F_RXCSUM) + rx_skb->ip_summed = CHECKSUM_UNNECESSARY; + } else + nes_debug(NES_DBG_CQ, "%s: unsuccessfully checksummed TCP or UDP packet." 
+ " errv = 0x%X, pkt_type = 0x%X.\n", + nesvnic->netdev->name, cqe_errv, pkt_type); + + } else if ((pkt_type & NES_PKT_TYPE_IPV4_MASK) == NES_PKT_TYPE_IPV4_BITS) { + if ((cqe_errv & + (NES_NIC_ERRV_BITS_IPV4_CSUM_ERR | NES_NIC_ERRV_BITS_IPH_ERR | + NES_NIC_ERRV_BITS_WQE_OVERRUN)) == 0) { + if (nesvnic->netdev->features & NETIF_F_RXCSUM) { + rx_skb->ip_summed = CHECKSUM_UNNECESSARY; + /* nes_debug(NES_DBG_CQ, "%s: Reporting successfully checksummed IPv4 packet.\n", + nesvnic->netdev->name); */ + } + } else + nes_debug(NES_DBG_CQ, "%s: unsuccessfully checksummed TCP or UDP packet." + " errv = 0x%X, pkt_type = 0x%X.\n", + nesvnic->netdev->name, cqe_errv, pkt_type); + } + /* nes_debug(NES_DBG_CQ, "pkt_type=%x, APBVT_MASK=%x\n", + pkt_type, (pkt_type & NES_PKT_TYPE_APBVT_MASK)); */ + + if ((pkt_type & NES_PKT_TYPE_APBVT_MASK) == NES_PKT_TYPE_APBVT_BITS) { + if (nes_cm_recv(rx_skb, nesvnic->netdev)) + rx_skb = NULL; + } + if (rx_skb == NULL) + goto skip_rx_indicate0; + + + if (cqe_misc & NES_NIC_CQE_TAG_VALID) { + vlan_tag = (u16)(le32_to_cpu( + cq->cq_vbase[head].cqe_words[NES_NIC_CQE_TAG_PKT_TYPE_IDX]) + >> 16); + nes_debug(NES_DBG_CQ, "%s: Reporting stripped VLAN packet. Tag = 0x%04X\n", + nesvnic->netdev->name, vlan_tag); + + __vlan_hwaccel_put_tag(rx_skb, htons(ETH_P_8021Q), vlan_tag); + } + if (nes_use_lro) + lro_receive_skb(&nesvnic->lro_mgr, rx_skb, NULL); + else + netif_receive_skb(rx_skb); + +skip_rx_indicate0: + ; + /* nesvnic->netstats.rx_packets++; */ + /* nesvnic->netstats.rx_bytes += rx_pkt_size; */ + } + + cq->cq_vbase[head].cqe_words[NES_NIC_CQE_MISC_IDX] = 0; + /* Accounting... */ + cqe_count++; + if (++head >= cq_size) + head = 0; + if (cqe_count == 255) { + /* Replenish Nic CQ */ + nes_write32(nesdev->regs+NES_CQE_ALLOC, + cq->cq_number | (cqe_count << 16)); + /* nesdev->nesadapter->tune_timer.cq_count += cqe_count; */ + nesdev->currcq_count += cqe_count; + cqe_count = 0; + } + + if (cq->rx_cqes_completed >= nesvnic->budget) + break; + } else { + cq->cqes_pending = 0; + break; + } + + } while (1); + + if (nes_use_lro) + lro_flush_all(&nesvnic->lro_mgr); + if (sq_cqes) { + barrier(); + /* restart the queue if it had been stopped */ + if (netif_queue_stopped(nesvnic->netdev)) + netif_wake_queue(nesvnic->netdev); + } + cq->cq_head = head; + /* nes_debug(NES_DBG_CQ, "CQ%u Processed = %u cqes, new head = %u.\n", + cq->cq_number, cqe_count, cq->cq_head); */ + cq->cqe_allocs_pending = cqe_count; + if (unlikely(nesadapter->et_use_adaptive_rx_coalesce)) + { + /* nesdev->nesadapter->tune_timer.cq_count += cqe_count; */ + nesdev->currcq_count += cqe_count; + nes_nic_tune_timer(nesdev); + } + if (atomic_read(&nesvnic->rx_skbs_needed)) + nes_replenish_nic_rq(nesvnic); +} + + + +/** + * nes_cqp_ce_handler + */ +static void nes_cqp_ce_handler(struct nes_device *nesdev, struct nes_hw_cq *cq) +{ + u64 u64temp; + unsigned long flags; + struct nes_hw_cqp *cqp = NULL; + struct nes_cqp_request *cqp_request; + struct nes_hw_cqp_wqe *cqp_wqe; + u32 head; + u32 cq_size; + u32 cqe_count=0; + u32 error_code; + u32 opcode; + u32 ctx_index; + /* u32 counter; */ + + head = cq->cq_head; + cq_size = cq->cq_size; + + do { + /* process the CQE */ + /* nes_debug(NES_DBG_CQP, "head=%u cqe_words=%08X\n", head, + le32_to_cpu(cq->cq_vbase[head].cqe_words[NES_CQE_OPCODE_IDX])); */ + + opcode = le32_to_cpu(cq->cq_vbase[head].cqe_words[NES_CQE_OPCODE_IDX]); + if (opcode & NES_CQE_VALID) { + cqp = &nesdev->cqp; + + error_code = le32_to_cpu(cq->cq_vbase[head].cqe_words[NES_CQE_ERROR_CODE_IDX]); + if (error_code) { + 
nes_debug(NES_DBG_CQP, "Bad Completion code for opcode 0x%02X from CQP," + " Major/Minor codes = 0x%04X:%04X.\n", + le32_to_cpu(cq->cq_vbase[head].cqe_words[NES_CQE_OPCODE_IDX])&0x3f, + (u16)(error_code >> 16), + (u16)error_code); + } + + u64temp = (((u64)(le32_to_cpu(cq->cq_vbase[head]. + cqe_words[NES_CQE_COMP_COMP_CTX_HIGH_IDX]))) << 32) | + ((u64)(le32_to_cpu(cq->cq_vbase[head]. + cqe_words[NES_CQE_COMP_COMP_CTX_LOW_IDX]))); + + cqp_request = (struct nes_cqp_request *)(unsigned long)u64temp; + if (cqp_request) { + if (cqp_request->waiting) { + /* nes_debug(NES_DBG_CQP, "%s: Waking up requestor\n"); */ + cqp_request->major_code = (u16)(error_code >> 16); + cqp_request->minor_code = (u16)error_code; + barrier(); + cqp_request->request_done = 1; + wake_up(&cqp_request->waitq); + nes_put_cqp_request(nesdev, cqp_request); + } else { + if (cqp_request->callback) + cqp_request->cqp_callback(nesdev, cqp_request); + nes_free_cqp_request(nesdev, cqp_request); + } + } else { + wake_up(&nesdev->cqp.waitq); + } + + cq->cq_vbase[head].cqe_words[NES_CQE_OPCODE_IDX] = 0; + nes_write32(nesdev->regs + NES_CQE_ALLOC, cq->cq_number | (1 << 16)); + if (++cqp->sq_tail >= cqp->sq_size) + cqp->sq_tail = 0; + + /* Accounting... */ + cqe_count++; + if (++head >= cq_size) + head = 0; + } else { + break; + } + } while (1); + cq->cq_head = head; + + spin_lock_irqsave(&nesdev->cqp.lock, flags); + while ((!list_empty(&nesdev->cqp_pending_reqs)) && + ((((nesdev->cqp.sq_tail+nesdev->cqp.sq_size)-nesdev->cqp.sq_head) & + (nesdev->cqp.sq_size - 1)) != 1)) { + cqp_request = list_entry(nesdev->cqp_pending_reqs.next, + struct nes_cqp_request, list); + list_del_init(&cqp_request->list); + head = nesdev->cqp.sq_head++; + nesdev->cqp.sq_head &= nesdev->cqp.sq_size-1; + cqp_wqe = &nesdev->cqp.sq_vbase[head]; + memcpy(cqp_wqe, &cqp_request->cqp_wqe, sizeof(*cqp_wqe)); + barrier(); + + opcode = cqp_wqe->wqe_words[NES_CQP_WQE_OPCODE_IDX]; + if ((opcode & NES_CQP_OPCODE_MASK) == NES_CQP_DOWNLOAD_SEGMENT) + ctx_index = NES_CQP_WQE_DL_COMP_CTX_LOW_IDX; + else + ctx_index = NES_CQP_WQE_COMP_CTX_LOW_IDX; + cqp_wqe->wqe_words[ctx_index] = + cpu_to_le32((u32)((unsigned long)cqp_request)); + cqp_wqe->wqe_words[ctx_index + 1] = + cpu_to_le32((u32)(upper_32_bits((unsigned long)cqp_request))); + nes_debug(NES_DBG_CQP, "CQP request %p (opcode 0x%02X) put on CQPs SQ wqe%u.\n", + cqp_request, le32_to_cpu(cqp_wqe->wqe_words[NES_CQP_WQE_OPCODE_IDX])&0x3f, head); + /* Ring doorbell (1 WQEs) */ + barrier(); + nes_write32(nesdev->regs+NES_WQE_ALLOC, 0x01800000 | nesdev->cqp.qp_id); + } + spin_unlock_irqrestore(&nesdev->cqp.lock, flags); + + /* Arm the CCQ */ + nes_write32(nesdev->regs+NES_CQE_ALLOC, NES_CQE_ALLOC_NOTIFY_NEXT | + cq->cq_number); + nes_read32(nesdev->regs+NES_CQE_ALLOC); +} + +static u8 *locate_mpa(u8 *pkt, u32 aeq_info) +{ + if (aeq_info & NES_AEQE_Q2_DATA_ETHERNET) { + /* skip over ethernet header */ + pkt += ETH_HLEN; + + /* Skip over IP and TCP headers */ + pkt += 4 * (pkt[0] & 0x0f); + pkt += 4 * ((pkt[12] >> 4) & 0x0f); + } + return pkt; +} + +/* Determine if incoming error pkt is rdma layer */ +static u32 iwarp_opcode(struct nes_qp *nesqp, u32 aeq_info) +{ + u8 *pkt; + u16 *mpa; + u32 opcode = 0xffffffff; + + if (aeq_info & NES_AEQE_Q2_DATA_WRITTEN) { + pkt = nesqp->hwqp.q2_vbase + BAD_FRAME_OFFSET; + mpa = (u16 *)locate_mpa(pkt, aeq_info); + opcode = be16_to_cpu(mpa[1]) & 0xf; + } + + return opcode; +} + +/* Build iWARP terminate header */ +static int nes_bld_terminate_hdr(struct nes_qp *nesqp, u16 async_event_id, u32 
aeq_info) +{ + u8 *pkt = nesqp->hwqp.q2_vbase + BAD_FRAME_OFFSET; + u16 ddp_seg_len; + int copy_len = 0; + u8 is_tagged = 0; + u8 flush_code = 0; + struct nes_terminate_hdr *termhdr; + + termhdr = (struct nes_terminate_hdr *)nesqp->hwqp.q2_vbase; + memset(termhdr, 0, 64); + + if (aeq_info & NES_AEQE_Q2_DATA_WRITTEN) { + + /* Use data from offending packet to fill in ddp & rdma hdrs */ + pkt = locate_mpa(pkt, aeq_info); + ddp_seg_len = be16_to_cpu(*(u16 *)pkt); + if (ddp_seg_len) { + copy_len = 2; + termhdr->hdrct = DDP_LEN_FLAG; + if (pkt[2] & 0x80) { + is_tagged = 1; + if (ddp_seg_len >= TERM_DDP_LEN_TAGGED) { + copy_len += TERM_DDP_LEN_TAGGED; + termhdr->hdrct |= DDP_HDR_FLAG; + } + } else { + if (ddp_seg_len >= TERM_DDP_LEN_UNTAGGED) { + copy_len += TERM_DDP_LEN_UNTAGGED; + termhdr->hdrct |= DDP_HDR_FLAG; + } + + if (ddp_seg_len >= (TERM_DDP_LEN_UNTAGGED + TERM_RDMA_LEN)) { + if ((pkt[3] & RDMA_OPCODE_MASK) == RDMA_READ_REQ_OPCODE) { + copy_len += TERM_RDMA_LEN; + termhdr->hdrct |= RDMA_HDR_FLAG; + } + } + } + } + } + + switch (async_event_id) { + case NES_AEQE_AEID_AMP_UNALLOCATED_STAG: + switch (iwarp_opcode(nesqp, aeq_info)) { + case IWARP_OPCODE_WRITE: + flush_code = IB_WC_LOC_PROT_ERR; + termhdr->layer_etype = (LAYER_DDP << 4) | DDP_TAGGED_BUFFER; + termhdr->error_code = DDP_TAGGED_INV_STAG; + break; + default: + flush_code = IB_WC_REM_ACCESS_ERR; + termhdr->layer_etype = (LAYER_RDMA << 4) | RDMAP_REMOTE_PROT; + termhdr->error_code = RDMAP_INV_STAG; + } + break; + case NES_AEQE_AEID_AMP_INVALID_STAG: + flush_code = IB_WC_REM_ACCESS_ERR; + termhdr->layer_etype = (LAYER_RDMA << 4) | RDMAP_REMOTE_PROT; + termhdr->error_code = RDMAP_INV_STAG; + break; + case NES_AEQE_AEID_AMP_BAD_QP: + flush_code = IB_WC_LOC_QP_OP_ERR; + termhdr->layer_etype = (LAYER_DDP << 4) | DDP_UNTAGGED_BUFFER; + termhdr->error_code = DDP_UNTAGGED_INV_QN; + break; + case NES_AEQE_AEID_AMP_BAD_STAG_KEY: + case NES_AEQE_AEID_AMP_BAD_STAG_INDEX: + switch (iwarp_opcode(nesqp, aeq_info)) { + case IWARP_OPCODE_SEND_INV: + case IWARP_OPCODE_SEND_SE_INV: + flush_code = IB_WC_REM_OP_ERR; + termhdr->layer_etype = (LAYER_RDMA << 4) | RDMAP_REMOTE_OP; + termhdr->error_code = RDMAP_CANT_INV_STAG; + break; + default: + flush_code = IB_WC_REM_ACCESS_ERR; + termhdr->layer_etype = (LAYER_RDMA << 4) | RDMAP_REMOTE_PROT; + termhdr->error_code = RDMAP_INV_STAG; + } + break; + case NES_AEQE_AEID_AMP_BOUNDS_VIOLATION: + if (aeq_info & (NES_AEQE_Q2_DATA_ETHERNET | NES_AEQE_Q2_DATA_MPA)) { + flush_code = IB_WC_LOC_PROT_ERR; + termhdr->layer_etype = (LAYER_DDP << 4) | DDP_TAGGED_BUFFER; + termhdr->error_code = DDP_TAGGED_BOUNDS; + } else { + flush_code = IB_WC_REM_ACCESS_ERR; + termhdr->layer_etype = (LAYER_RDMA << 4) | RDMAP_REMOTE_PROT; + termhdr->error_code = RDMAP_INV_BOUNDS; + } + break; + case NES_AEQE_AEID_AMP_RIGHTS_VIOLATION: + case NES_AEQE_AEID_AMP_INVALIDATE_NO_REMOTE_ACCESS_RIGHTS: + case NES_AEQE_AEID_PRIV_OPERATION_DENIED: + flush_code = IB_WC_REM_ACCESS_ERR; + termhdr->layer_etype = (LAYER_RDMA << 4) | RDMAP_REMOTE_PROT; + termhdr->error_code = RDMAP_ACCESS; + break; + case NES_AEQE_AEID_AMP_TO_WRAP: + flush_code = IB_WC_REM_ACCESS_ERR; + termhdr->layer_etype = (LAYER_RDMA << 4) | RDMAP_REMOTE_PROT; + termhdr->error_code = RDMAP_TO_WRAP; + break; + case NES_AEQE_AEID_AMP_BAD_PD: + switch (iwarp_opcode(nesqp, aeq_info)) { + case IWARP_OPCODE_WRITE: + flush_code = IB_WC_LOC_PROT_ERR; + termhdr->layer_etype = (LAYER_DDP << 4) | DDP_TAGGED_BUFFER; + termhdr->error_code = DDP_TAGGED_UNASSOC_STAG; + break; + case 
IWARP_OPCODE_SEND_INV: + case IWARP_OPCODE_SEND_SE_INV: + flush_code = IB_WC_REM_ACCESS_ERR; + termhdr->layer_etype = (LAYER_RDMA << 4) | RDMAP_REMOTE_PROT; + termhdr->error_code = RDMAP_CANT_INV_STAG; + break; + default: + flush_code = IB_WC_REM_ACCESS_ERR; + termhdr->layer_etype = (LAYER_RDMA << 4) | RDMAP_REMOTE_PROT; + termhdr->error_code = RDMAP_UNASSOC_STAG; + } + break; + case NES_AEQE_AEID_LLP_RECEIVED_MARKER_AND_LENGTH_FIELDS_DONT_MATCH: + flush_code = IB_WC_LOC_LEN_ERR; + termhdr->layer_etype = (LAYER_MPA << 4) | DDP_LLP; + termhdr->error_code = MPA_MARKER; + break; + case NES_AEQE_AEID_LLP_RECEIVED_MPA_CRC_ERROR: + flush_code = IB_WC_GENERAL_ERR; + termhdr->layer_etype = (LAYER_MPA << 4) | DDP_LLP; + termhdr->error_code = MPA_CRC; + break; + case NES_AEQE_AEID_LLP_SEGMENT_TOO_LARGE: + case NES_AEQE_AEID_LLP_SEGMENT_TOO_SMALL: + flush_code = IB_WC_LOC_LEN_ERR; + termhdr->layer_etype = (LAYER_DDP << 4) | DDP_CATASTROPHIC; + termhdr->error_code = DDP_CATASTROPHIC_LOCAL; + break; + case NES_AEQE_AEID_DDP_LCE_LOCAL_CATASTROPHIC: + case NES_AEQE_AEID_DDP_NO_L_BIT: + flush_code = IB_WC_FATAL_ERR; + termhdr->layer_etype = (LAYER_DDP << 4) | DDP_CATASTROPHIC; + termhdr->error_code = DDP_CATASTROPHIC_LOCAL; + break; + case NES_AEQE_AEID_DDP_INVALID_MSN_GAP_IN_MSN: + case NES_AEQE_AEID_DDP_INVALID_MSN_RANGE_IS_NOT_VALID: + flush_code = IB_WC_GENERAL_ERR; + termhdr->layer_etype = (LAYER_DDP << 4) | DDP_UNTAGGED_BUFFER; + termhdr->error_code = DDP_UNTAGGED_INV_MSN_RANGE; + break; + case NES_AEQE_AEID_DDP_UBE_DDP_MESSAGE_TOO_LONG_FOR_AVAILABLE_BUFFER: + flush_code = IB_WC_LOC_LEN_ERR; + termhdr->layer_etype = (LAYER_DDP << 4) | DDP_UNTAGGED_BUFFER; + termhdr->error_code = DDP_UNTAGGED_INV_TOO_LONG; + break; + case NES_AEQE_AEID_DDP_UBE_INVALID_DDP_VERSION: + flush_code = IB_WC_GENERAL_ERR; + if (is_tagged) { + termhdr->layer_etype = (LAYER_DDP << 4) | DDP_TAGGED_BUFFER; + termhdr->error_code = DDP_TAGGED_INV_DDP_VER; + } else { + termhdr->layer_etype = (LAYER_DDP << 4) | DDP_UNTAGGED_BUFFER; + termhdr->error_code = DDP_UNTAGGED_INV_DDP_VER; + } + break; + case NES_AEQE_AEID_DDP_UBE_INVALID_MO: + flush_code = IB_WC_GENERAL_ERR; + termhdr->layer_etype = (LAYER_DDP << 4) | DDP_UNTAGGED_BUFFER; + termhdr->error_code = DDP_UNTAGGED_INV_MO; + break; + case NES_AEQE_AEID_DDP_UBE_INVALID_MSN_NO_BUFFER_AVAILABLE: + flush_code = IB_WC_REM_OP_ERR; + termhdr->layer_etype = (LAYER_DDP << 4) | DDP_UNTAGGED_BUFFER; + termhdr->error_code = DDP_UNTAGGED_INV_MSN_NO_BUF; + break; + case NES_AEQE_AEID_DDP_UBE_INVALID_QN: + flush_code = IB_WC_GENERAL_ERR; + termhdr->layer_etype = (LAYER_DDP << 4) | DDP_UNTAGGED_BUFFER; + termhdr->error_code = DDP_UNTAGGED_INV_QN; + break; + case NES_AEQE_AEID_RDMAP_ROE_INVALID_RDMAP_VERSION: + flush_code = IB_WC_GENERAL_ERR; + termhdr->layer_etype = (LAYER_RDMA << 4) | RDMAP_REMOTE_OP; + termhdr->error_code = RDMAP_INV_RDMAP_VER; + break; + case NES_AEQE_AEID_RDMAP_ROE_UNEXPECTED_OPCODE: + flush_code = IB_WC_LOC_QP_OP_ERR; + termhdr->layer_etype = (LAYER_RDMA << 4) | RDMAP_REMOTE_OP; + termhdr->error_code = RDMAP_UNEXPECTED_OP; + break; + default: + flush_code = IB_WC_FATAL_ERR; + termhdr->layer_etype = (LAYER_RDMA << 4) | RDMAP_REMOTE_OP; + termhdr->error_code = RDMAP_UNSPECIFIED; + break; + } + + if (copy_len) + memcpy(termhdr + 1, pkt, copy_len); + + if ((flush_code) && ((NES_AEQE_INBOUND_RDMA & aeq_info) == 0)) { + if (aeq_info & NES_AEQE_SQ) + nesqp->term_sq_flush_code = flush_code; + else + nesqp->term_rq_flush_code = flush_code; + } + + return sizeof(struct 
nes_terminate_hdr) + copy_len; +} + +static void nes_terminate_connection(struct nes_device *nesdev, struct nes_qp *nesqp, + struct nes_hw_aeqe *aeqe, enum ib_event_type eventtype) +{ + u64 context; + unsigned long flags; + u32 aeq_info; + u16 async_event_id; + u8 tcp_state; + u8 iwarp_state; + u32 termlen = 0; + u32 mod_qp_flags = NES_CQP_QP_IWARP_STATE_TERMINATE | + NES_CQP_QP_TERM_DONT_SEND_FIN; + struct nes_adapter *nesadapter = nesdev->nesadapter; + + if (nesqp->term_flags & NES_TERM_SENT) + return; /* Sanity check */ + + aeq_info = le32_to_cpu(aeqe->aeqe_words[NES_AEQE_MISC_IDX]); + tcp_state = (aeq_info & NES_AEQE_TCP_STATE_MASK) >> NES_AEQE_TCP_STATE_SHIFT; + iwarp_state = (aeq_info & NES_AEQE_IWARP_STATE_MASK) >> NES_AEQE_IWARP_STATE_SHIFT; + async_event_id = (u16)aeq_info; + + context = (unsigned long)nesadapter->qp_table[le32_to_cpu( + aeqe->aeqe_words[NES_AEQE_COMP_QP_CQ_ID_IDX]) - NES_FIRST_QPN]; + if (!context) { + WARN_ON(!context); + return; + } + + nesqp = (struct nes_qp *)(unsigned long)context; + spin_lock_irqsave(&nesqp->lock, flags); + nesqp->hw_iwarp_state = iwarp_state; + nesqp->hw_tcp_state = tcp_state; + nesqp->last_aeq = async_event_id; + nesqp->terminate_eventtype = eventtype; + spin_unlock_irqrestore(&nesqp->lock, flags); + + if (nesadapter->send_term_ok) + termlen = nes_bld_terminate_hdr(nesqp, async_event_id, aeq_info); + else + mod_qp_flags |= NES_CQP_QP_TERM_DONT_SEND_TERM_MSG; + + if (!nesdev->iw_status) { + nesqp->term_flags = NES_TERM_DONE; + nes_hw_modify_qp(nesdev, nesqp, NES_CQP_QP_IWARP_STATE_ERROR, 0, 0); + nes_cm_disconn(nesqp); + } else { + nes_terminate_start_timer(nesqp); + nesqp->term_flags |= NES_TERM_SENT; + nes_hw_modify_qp(nesdev, nesqp, mod_qp_flags, termlen, 0); + } +} + +static void nes_terminate_send_fin(struct nes_device *nesdev, + struct nes_qp *nesqp, struct nes_hw_aeqe *aeqe) +{ + u32 aeq_info; + u16 async_event_id; + u8 tcp_state; + u8 iwarp_state; + unsigned long flags; + + aeq_info = le32_to_cpu(aeqe->aeqe_words[NES_AEQE_MISC_IDX]); + tcp_state = (aeq_info & NES_AEQE_TCP_STATE_MASK) >> NES_AEQE_TCP_STATE_SHIFT; + iwarp_state = (aeq_info & NES_AEQE_IWARP_STATE_MASK) >> NES_AEQE_IWARP_STATE_SHIFT; + async_event_id = (u16)aeq_info; + + spin_lock_irqsave(&nesqp->lock, flags); + nesqp->hw_iwarp_state = iwarp_state; + nesqp->hw_tcp_state = tcp_state; + nesqp->last_aeq = async_event_id; + spin_unlock_irqrestore(&nesqp->lock, flags); + + /* Send the fin only */ + nes_hw_modify_qp(nesdev, nesqp, NES_CQP_QP_IWARP_STATE_TERMINATE | + NES_CQP_QP_TERM_DONT_SEND_TERM_MSG, 0, 0); +} + +/* Cleanup after a terminate sent or received */ +static void nes_terminate_done(struct nes_qp *nesqp, int timeout_occurred) +{ + u32 next_iwarp_state = NES_CQP_QP_IWARP_STATE_ERROR; + unsigned long flags; + struct nes_vnic *nesvnic = to_nesvnic(nesqp->ibqp.device); + struct nes_device *nesdev = nesvnic->nesdev; + u8 first_time = 0; + + spin_lock_irqsave(&nesqp->lock, flags); + if (nesqp->hte_added) { + nesqp->hte_added = 0; + next_iwarp_state |= NES_CQP_QP_DEL_HTE; + } + + first_time = (nesqp->term_flags & NES_TERM_DONE) == 0; + nesqp->term_flags |= NES_TERM_DONE; + spin_unlock_irqrestore(&nesqp->lock, flags); + + /* Make sure we go through this only once */ + if (first_time) { + if (timeout_occurred == 0) + del_timer(&nesqp->terminate_timer); + else + next_iwarp_state |= NES_CQP_QP_RESET; + + nes_hw_modify_qp(nesdev, nesqp, next_iwarp_state, 0, 0); + nes_cm_disconn(nesqp); + } +} + +static void nes_terminate_received(struct nes_device *nesdev, + struct nes_qp 
*nesqp, struct nes_hw_aeqe *aeqe) +{ + u32 aeq_info; + u8 *pkt; + u32 *mpa; + u8 ddp_ctl; + u8 rdma_ctl; + u16 aeq_id = 0; + + aeq_info = le32_to_cpu(aeqe->aeqe_words[NES_AEQE_MISC_IDX]); + if (aeq_info & NES_AEQE_Q2_DATA_WRITTEN) { + /* Terminate is not a performance path so the silicon */ + /* did not validate the frame - do it now */ + pkt = nesqp->hwqp.q2_vbase + BAD_FRAME_OFFSET; + mpa = (u32 *)locate_mpa(pkt, aeq_info); + ddp_ctl = (be32_to_cpu(mpa[0]) >> 8) & 0xff; + rdma_ctl = be32_to_cpu(mpa[0]) & 0xff; + if ((ddp_ctl & 0xc0) != 0x40) + aeq_id = NES_AEQE_AEID_DDP_LCE_LOCAL_CATASTROPHIC; + else if ((ddp_ctl & 0x03) != 1) + aeq_id = NES_AEQE_AEID_DDP_UBE_INVALID_DDP_VERSION; + else if (be32_to_cpu(mpa[2]) != 2) + aeq_id = NES_AEQE_AEID_DDP_UBE_INVALID_QN; + else if (be32_to_cpu(mpa[3]) != 1) + aeq_id = NES_AEQE_AEID_DDP_INVALID_MSN_GAP_IN_MSN; + else if (be32_to_cpu(mpa[4]) != 0) + aeq_id = NES_AEQE_AEID_DDP_UBE_INVALID_MO; + else if ((rdma_ctl & 0xc0) != 0x40) + aeq_id = NES_AEQE_AEID_RDMAP_ROE_INVALID_RDMAP_VERSION; + + if (aeq_id) { + /* Bad terminate recvd - send back a terminate */ + aeq_info = (aeq_info & 0xffff0000) | aeq_id; + aeqe->aeqe_words[NES_AEQE_MISC_IDX] = cpu_to_le32(aeq_info); + nes_terminate_connection(nesdev, nesqp, aeqe, IB_EVENT_QP_FATAL); + return; + } + } + + nesqp->term_flags |= NES_TERM_RCVD; + nesqp->terminate_eventtype = IB_EVENT_QP_FATAL; + nes_terminate_start_timer(nesqp); + nes_terminate_send_fin(nesdev, nesqp, aeqe); +} + +/* Timeout routine in case terminate fails to complete */ +void nes_terminate_timeout(unsigned long context) +{ + struct nes_qp *nesqp = (struct nes_qp *)(unsigned long)context; + + nes_terminate_done(nesqp, 1); +} + +/* Set a timer in case hw cannot complete the terminate sequence */ +static void nes_terminate_start_timer(struct nes_qp *nesqp) +{ + mod_timer(&nesqp->terminate_timer, (jiffies + HZ)); +} + +/** + * nes_process_iwarp_aeqe + */ +static void nes_process_iwarp_aeqe(struct nes_device *nesdev, + struct nes_hw_aeqe *aeqe) +{ + u64 context; + unsigned long flags; + struct nes_qp *nesqp; + struct nes_hw_cq *hw_cq; + struct nes_cq *nescq; + int resource_allocated; + struct nes_adapter *nesadapter = nesdev->nesadapter; + u32 aeq_info; + u32 next_iwarp_state = 0; + u32 aeqe_cq_id; + u16 async_event_id; + u8 tcp_state; + u8 iwarp_state; + struct ib_event ibevent; + + nes_debug(NES_DBG_AEQ, "\n"); + aeq_info = le32_to_cpu(aeqe->aeqe_words[NES_AEQE_MISC_IDX]); + if ((NES_AEQE_INBOUND_RDMA & aeq_info) || (!(NES_AEQE_QP & aeq_info))) { + context = le32_to_cpu(aeqe->aeqe_words[NES_AEQE_COMP_CTXT_LOW_IDX]); + context += ((u64)le32_to_cpu(aeqe->aeqe_words[NES_AEQE_COMP_CTXT_HIGH_IDX])) << 32; + } else { + context = (unsigned long)nesadapter->qp_table[le32_to_cpu( + aeqe->aeqe_words[NES_AEQE_COMP_QP_CQ_ID_IDX]) - NES_FIRST_QPN]; + BUG_ON(!context); + } + + /* context is nesqp unless async_event_id == CQ ERROR */ + nesqp = (struct nes_qp *)(unsigned long)context; + async_event_id = (u16)aeq_info; + tcp_state = (aeq_info & NES_AEQE_TCP_STATE_MASK) >> NES_AEQE_TCP_STATE_SHIFT; + iwarp_state = (aeq_info & NES_AEQE_IWARP_STATE_MASK) >> NES_AEQE_IWARP_STATE_SHIFT; + nes_debug(NES_DBG_AEQ, "aeid = 0x%04X, qp-cq id = %d, aeqe = %p," + " Tcp state = %s, iWARP state = %s\n", + async_event_id, + le32_to_cpu(aeqe->aeqe_words[NES_AEQE_COMP_QP_CQ_ID_IDX]), aeqe, + nes_tcp_state_str[tcp_state], nes_iwarp_state_str[iwarp_state]); + + aeqe_cq_id = le32_to_cpu(aeqe->aeqe_words[NES_AEQE_COMP_QP_CQ_ID_IDX]); + if (aeq_info & NES_AEQE_QP) { + if 
(!nes_is_resource_allocated(nesadapter, + nesadapter->allocated_qps, + aeqe_cq_id)) + return; + } + + switch (async_event_id) { + case NES_AEQE_AEID_LLP_FIN_RECEIVED: + if (nesqp->term_flags) + return; /* Ignore it, wait for close complete */ + + if (atomic_inc_return(&nesqp->close_timer_started) == 1) { + if ((tcp_state == NES_AEQE_TCP_STATE_CLOSE_WAIT) && + (nesqp->ibqp_state == IB_QPS_RTS)) { + spin_lock_irqsave(&nesqp->lock, flags); + nesqp->hw_iwarp_state = iwarp_state; + nesqp->hw_tcp_state = tcp_state; + nesqp->last_aeq = async_event_id; + next_iwarp_state = NES_CQP_QP_IWARP_STATE_CLOSING; + nesqp->hw_iwarp_state = NES_AEQE_IWARP_STATE_CLOSING; + spin_unlock_irqrestore(&nesqp->lock, flags); + nes_hw_modify_qp(nesdev, nesqp, next_iwarp_state, 0, 0); + nes_cm_disconn(nesqp); + } + nesqp->cm_id->add_ref(nesqp->cm_id); + schedule_nes_timer(nesqp->cm_node, (struct sk_buff *)nesqp, + NES_TIMER_TYPE_CLOSE, 1, 0); + nes_debug(NES_DBG_AEQ, "QP%u Not decrementing QP refcount (%d)," + " need ae to finish up, original_last_aeq = 0x%04X." + " last_aeq = 0x%04X, scheduling timer. TCP state = %d\n", + nesqp->hwqp.qp_id, atomic_read(&nesqp->refcount), + async_event_id, nesqp->last_aeq, tcp_state); + } + break; + case NES_AEQE_AEID_LLP_CLOSE_COMPLETE: + spin_lock_irqsave(&nesqp->lock, flags); + nesqp->hw_iwarp_state = iwarp_state; + nesqp->hw_tcp_state = tcp_state; + nesqp->last_aeq = async_event_id; + spin_unlock_irqrestore(&nesqp->lock, flags); + nes_cm_disconn(nesqp); + break; + + case NES_AEQE_AEID_RESET_SENT: + tcp_state = NES_AEQE_TCP_STATE_CLOSED; + spin_lock_irqsave(&nesqp->lock, flags); + nesqp->hw_iwarp_state = iwarp_state; + nesqp->hw_tcp_state = tcp_state; + nesqp->last_aeq = async_event_id; + nesqp->hte_added = 0; + spin_unlock_irqrestore(&nesqp->lock, flags); + next_iwarp_state = NES_CQP_QP_IWARP_STATE_ERROR | NES_CQP_QP_DEL_HTE; + nes_hw_modify_qp(nesdev, nesqp, next_iwarp_state, 0, 0); + nes_cm_disconn(nesqp); + break; + + case NES_AEQE_AEID_LLP_CONNECTION_RESET: + if (atomic_read(&nesqp->close_timer_started)) + return; + spin_lock_irqsave(&nesqp->lock, flags); + nesqp->hw_iwarp_state = iwarp_state; + nesqp->hw_tcp_state = tcp_state; + nesqp->last_aeq = async_event_id; + spin_unlock_irqrestore(&nesqp->lock, flags); + nes_cm_disconn(nesqp); + break; + + case NES_AEQE_AEID_TERMINATE_SENT: + nes_terminate_send_fin(nesdev, nesqp, aeqe); + break; + + case NES_AEQE_AEID_LLP_TERMINATE_RECEIVED: + nes_terminate_received(nesdev, nesqp, aeqe); + break; + + case NES_AEQE_AEID_AMP_BAD_STAG_KEY: + case NES_AEQE_AEID_AMP_BAD_STAG_INDEX: + case NES_AEQE_AEID_AMP_UNALLOCATED_STAG: + case NES_AEQE_AEID_AMP_INVALID_STAG: + case NES_AEQE_AEID_AMP_RIGHTS_VIOLATION: + case NES_AEQE_AEID_AMP_INVALIDATE_NO_REMOTE_ACCESS_RIGHTS: + case NES_AEQE_AEID_PRIV_OPERATION_DENIED: + case NES_AEQE_AEID_DDP_UBE_DDP_MESSAGE_TOO_LONG_FOR_AVAILABLE_BUFFER: + case NES_AEQE_AEID_AMP_BOUNDS_VIOLATION: + case NES_AEQE_AEID_AMP_TO_WRAP: + printk(KERN_ERR PFX "QP[%u] async_event_id=0x%04X IB_EVENT_QP_ACCESS_ERR\n", + nesqp->hwqp.qp_id, async_event_id); + nes_terminate_connection(nesdev, nesqp, aeqe, IB_EVENT_QP_ACCESS_ERR); + break; + + case NES_AEQE_AEID_LLP_SEGMENT_TOO_LARGE: + case NES_AEQE_AEID_LLP_SEGMENT_TOO_SMALL: + case NES_AEQE_AEID_DDP_UBE_INVALID_MO: + case NES_AEQE_AEID_DDP_UBE_INVALID_QN: + if (iwarp_opcode(nesqp, aeq_info) > IWARP_OPCODE_TERM) { + aeq_info &= 0xffff0000; + aeq_info |= NES_AEQE_AEID_RDMAP_ROE_UNEXPECTED_OPCODE; + aeqe->aeqe_words[NES_AEQE_MISC_IDX] = cpu_to_le32(aeq_info); + } + + case 
NES_AEQE_AEID_RDMAP_ROE_BAD_LLP_CLOSE: + case NES_AEQE_AEID_LLP_TOO_MANY_RETRIES: + case NES_AEQE_AEID_DDP_UBE_INVALID_MSN_NO_BUFFER_AVAILABLE: + case NES_AEQE_AEID_LLP_RECEIVED_MPA_CRC_ERROR: + case NES_AEQE_AEID_AMP_BAD_QP: + case NES_AEQE_AEID_LLP_RECEIVED_MARKER_AND_LENGTH_FIELDS_DONT_MATCH: + case NES_AEQE_AEID_DDP_LCE_LOCAL_CATASTROPHIC: + case NES_AEQE_AEID_DDP_NO_L_BIT: + case NES_AEQE_AEID_DDP_INVALID_MSN_GAP_IN_MSN: + case NES_AEQE_AEID_DDP_INVALID_MSN_RANGE_IS_NOT_VALID: + case NES_AEQE_AEID_DDP_UBE_INVALID_DDP_VERSION: + case NES_AEQE_AEID_RDMAP_ROE_INVALID_RDMAP_VERSION: + case NES_AEQE_AEID_RDMAP_ROE_UNEXPECTED_OPCODE: + case NES_AEQE_AEID_AMP_BAD_PD: + case NES_AEQE_AEID_AMP_FASTREG_SHARED: + case NES_AEQE_AEID_AMP_FASTREG_VALID_STAG: + case NES_AEQE_AEID_AMP_FASTREG_MW_STAG: + case NES_AEQE_AEID_AMP_FASTREG_INVALID_RIGHTS: + case NES_AEQE_AEID_AMP_FASTREG_PBL_TABLE_OVERFLOW: + case NES_AEQE_AEID_AMP_FASTREG_INVALID_LENGTH: + case NES_AEQE_AEID_AMP_INVALIDATE_SHARED: + case NES_AEQE_AEID_AMP_INVALIDATE_MR_WITH_BOUND_WINDOWS: + case NES_AEQE_AEID_AMP_MWBIND_VALID_STAG: + case NES_AEQE_AEID_AMP_MWBIND_OF_MR_STAG: + case NES_AEQE_AEID_AMP_MWBIND_TO_ZERO_BASED_STAG: + case NES_AEQE_AEID_AMP_MWBIND_TO_MW_STAG: + case NES_AEQE_AEID_AMP_MWBIND_INVALID_RIGHTS: + case NES_AEQE_AEID_AMP_MWBIND_INVALID_BOUNDS: + case NES_AEQE_AEID_AMP_MWBIND_TO_INVALID_PARENT: + case NES_AEQE_AEID_AMP_MWBIND_BIND_DISABLED: + case NES_AEQE_AEID_BAD_CLOSE: + case NES_AEQE_AEID_RDMA_READ_WHILE_ORD_ZERO: + case NES_AEQE_AEID_STAG_ZERO_INVALID: + case NES_AEQE_AEID_ROE_INVALID_RDMA_READ_REQUEST: + case NES_AEQE_AEID_ROE_INVALID_RDMA_WRITE_OR_READ_RESP: + printk(KERN_ERR PFX "QP[%u] async_event_id=0x%04X IB_EVENT_QP_FATAL\n", + nesqp->hwqp.qp_id, async_event_id); + print_ip(nesqp->cm_node); + if (!atomic_read(&nesqp->close_timer_started)) + nes_terminate_connection(nesdev, nesqp, aeqe, IB_EVENT_QP_FATAL); + break; + + case NES_AEQE_AEID_CQ_OPERATION_ERROR: + context <<= 1; + nes_debug(NES_DBG_AEQ, "Processing an NES_AEQE_AEID_CQ_OPERATION_ERROR event on CQ%u, %p\n", + le32_to_cpu(aeqe->aeqe_words[NES_AEQE_COMP_QP_CQ_ID_IDX]), (void *)(unsigned long)context); + resource_allocated = nes_is_resource_allocated(nesadapter, nesadapter->allocated_cqs, + le32_to_cpu(aeqe->aeqe_words[NES_AEQE_COMP_QP_CQ_ID_IDX])); + if (resource_allocated) { + printk(KERN_ERR PFX "%s: Processing an NES_AEQE_AEID_CQ_OPERATION_ERROR event on CQ%u\n", + __func__, le32_to_cpu(aeqe->aeqe_words[NES_AEQE_COMP_QP_CQ_ID_IDX])); + hw_cq = (struct nes_hw_cq *)(unsigned long)context; + if (hw_cq) { + nescq = container_of(hw_cq, struct nes_cq, hw_cq); + if (nescq->ibcq.event_handler) { + ibevent.device = nescq->ibcq.device; + ibevent.event = IB_EVENT_CQ_ERR; + ibevent.element.cq = &nescq->ibcq; + nescq->ibcq.event_handler(&ibevent, nescq->ibcq.cq_context); + } + } + } + break; + + default: + nes_debug(NES_DBG_AEQ, "Processing an iWARP related AE for QP, misc = 0x%04X\n", + async_event_id); + break; + } + +} + +/** + * nes_iwarp_ce_handler + */ +void nes_iwarp_ce_handler(struct nes_device *nesdev, struct nes_hw_cq *hw_cq) +{ + struct nes_cq *nescq = container_of(hw_cq, struct nes_cq, hw_cq); + + /* nes_debug(NES_DBG_CQ, "Processing completion event for iWARP CQ%u.\n", + nescq->hw_cq.cq_number); */ + nes_write32(nesdev->regs+NES_CQ_ACK, nescq->hw_cq.cq_number); + + if (nescq->ibcq.comp_handler) + nescq->ibcq.comp_handler(&nescq->ibcq, nescq->ibcq.cq_context); + + return; +} + + +/** + * nes_manage_apbvt() + */ +int nes_manage_apbvt(struct nes_vnic 
*nesvnic, u32 accel_local_port, + u32 nic_index, u32 add_port) +{ + struct nes_device *nesdev = nesvnic->nesdev; + struct nes_hw_cqp_wqe *cqp_wqe; + struct nes_cqp_request *cqp_request; + int ret = 0; + u16 major_code; + + /* Send manage APBVT request to CQP */ + cqp_request = nes_get_cqp_request(nesdev); + if (cqp_request == NULL) { + nes_debug(NES_DBG_QP, "Failed to get a cqp_request.\n"); + return -ENOMEM; + } + cqp_request->waiting = 1; + cqp_wqe = &cqp_request->cqp_wqe; + + nes_debug(NES_DBG_QP, "%s APBV for local port=%u(0x%04x), nic_index=%u\n", + (add_port == NES_MANAGE_APBVT_ADD) ? "ADD" : "DEL", + accel_local_port, accel_local_port, nic_index); + + nes_fill_init_cqp_wqe(cqp_wqe, nesdev); + set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_WQE_OPCODE_IDX, (NES_CQP_MANAGE_APBVT | + ((add_port == NES_MANAGE_APBVT_ADD) ? NES_CQP_APBVT_ADD : 0))); + set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_WQE_ID_IDX, + ((nic_index << NES_CQP_APBVT_NIC_SHIFT) | accel_local_port)); + + nes_debug(NES_DBG_QP, "Waiting for CQP completion for APBVT.\n"); + + atomic_set(&cqp_request->refcount, 2); + nes_post_cqp_request(nesdev, cqp_request); + + if (add_port == NES_MANAGE_APBVT_ADD) + ret = wait_event_timeout(cqp_request->waitq, (cqp_request->request_done != 0), + NES_EVENT_TIMEOUT); + nes_debug(NES_DBG_QP, "Completed, ret=%u, CQP Major:Minor codes = 0x%04X:0x%04X\n", + ret, cqp_request->major_code, cqp_request->minor_code); + major_code = cqp_request->major_code; + + nes_put_cqp_request(nesdev, cqp_request); + + if (!ret) + return -ETIME; + else if (major_code) + return -EIO; + else + return 0; +} + + +/** + * nes_manage_arp_cache + */ +void nes_manage_arp_cache(struct net_device *netdev, unsigned char *mac_addr, + u32 ip_addr, u32 action) +{ + struct nes_hw_cqp_wqe *cqp_wqe; + struct nes_vnic *nesvnic = netdev_priv(netdev); + struct nes_device *nesdev; + struct nes_cqp_request *cqp_request; + int arp_index; + + nesdev = nesvnic->nesdev; + arp_index = nes_arp_table(nesdev, ip_addr, mac_addr, action); + if (arp_index == -1) { + return; + } + + /* update the ARP entry */ + cqp_request = nes_get_cqp_request(nesdev); + if (cqp_request == NULL) { + nes_debug(NES_DBG_NETDEV, "Failed to get a cqp_request.\n"); + return; + } + cqp_request->waiting = 0; + cqp_wqe = &cqp_request->cqp_wqe; + nes_fill_init_cqp_wqe(cqp_wqe, nesdev); + + cqp_wqe->wqe_words[NES_CQP_WQE_OPCODE_IDX] = cpu_to_le32( + NES_CQP_MANAGE_ARP_CACHE | NES_CQP_ARP_PERM); + cqp_wqe->wqe_words[NES_CQP_WQE_OPCODE_IDX] |= cpu_to_le32( + (u32)PCI_FUNC(nesdev->pcidev->devfn) << NES_CQP_ARP_AEQ_INDEX_SHIFT); + cqp_wqe->wqe_words[NES_CQP_WQE_ID_IDX] = cpu_to_le32(arp_index); + + if (action == NES_ARP_ADD) { + cqp_wqe->wqe_words[NES_CQP_WQE_OPCODE_IDX] |= cpu_to_le32(NES_CQP_ARP_VALID); + cqp_wqe->wqe_words[NES_CQP_ARP_WQE_MAC_ADDR_LOW_IDX] = cpu_to_le32( + (((u32)mac_addr[2]) << 24) | (((u32)mac_addr[3]) << 16) | + (((u32)mac_addr[4]) << 8) | (u32)mac_addr[5]); + cqp_wqe->wqe_words[NES_CQP_ARP_WQE_MAC_HIGH_IDX] = cpu_to_le32( + (((u32)mac_addr[0]) << 16) | (u32)mac_addr[1]); + } else { + cqp_wqe->wqe_words[NES_CQP_ARP_WQE_MAC_ADDR_LOW_IDX] = 0; + cqp_wqe->wqe_words[NES_CQP_ARP_WQE_MAC_HIGH_IDX] = 0; + } + + nes_debug(NES_DBG_NETDEV, "Not waiting for CQP, cqp.sq_head=%u, cqp.sq_tail=%u\n", + nesdev->cqp.sq_head, nesdev->cqp.sq_tail); + + atomic_set(&cqp_request->refcount, 1); + nes_post_cqp_request(nesdev, cqp_request); +} + + +/** + * flush_wqes + */ +void flush_wqes(struct nes_device *nesdev, struct nes_qp *nesqp, + u32 which_wq, u32 wait_completion) 
+{ + struct nes_cqp_request *cqp_request; + struct nes_hw_cqp_wqe *cqp_wqe; + u32 sq_code = (NES_IWARP_CQE_MAJOR_FLUSH << 16) | NES_IWARP_CQE_MINOR_FLUSH; + u32 rq_code = (NES_IWARP_CQE_MAJOR_FLUSH << 16) | NES_IWARP_CQE_MINOR_FLUSH; + int ret; + + cqp_request = nes_get_cqp_request(nesdev); + if (cqp_request == NULL) { + nes_debug(NES_DBG_QP, "Failed to get a cqp_request.\n"); + return; + } + if (wait_completion) { + cqp_request->waiting = 1; + atomic_set(&cqp_request->refcount, 2); + } else { + cqp_request->waiting = 0; + } + cqp_wqe = &cqp_request->cqp_wqe; + nes_fill_init_cqp_wqe(cqp_wqe, nesdev); + + /* If wqe in error was identified, set code to be put into cqe */ + if ((nesqp->term_sq_flush_code) && (which_wq & NES_CQP_FLUSH_SQ)) { + which_wq |= NES_CQP_FLUSH_MAJ_MIN; + sq_code = (CQE_MAJOR_DRV << 16) | nesqp->term_sq_flush_code; + nesqp->term_sq_flush_code = 0; + } + + if ((nesqp->term_rq_flush_code) && (which_wq & NES_CQP_FLUSH_RQ)) { + which_wq |= NES_CQP_FLUSH_MAJ_MIN; + rq_code = (CQE_MAJOR_DRV << 16) | nesqp->term_rq_flush_code; + nesqp->term_rq_flush_code = 0; + } + + if (which_wq & NES_CQP_FLUSH_MAJ_MIN) { + cqp_wqe->wqe_words[NES_CQP_QP_WQE_FLUSH_SQ_CODE] = cpu_to_le32(sq_code); + cqp_wqe->wqe_words[NES_CQP_QP_WQE_FLUSH_RQ_CODE] = cpu_to_le32(rq_code); + } + + cqp_wqe->wqe_words[NES_CQP_WQE_OPCODE_IDX] = + cpu_to_le32(NES_CQP_FLUSH_WQES | which_wq); + cqp_wqe->wqe_words[NES_CQP_WQE_ID_IDX] = cpu_to_le32(nesqp->hwqp.qp_id); + + nes_post_cqp_request(nesdev, cqp_request); + + if (wait_completion) { + /* Wait for CQP */ + ret = wait_event_timeout(cqp_request->waitq, (cqp_request->request_done != 0), + NES_EVENT_TIMEOUT); + nes_debug(NES_DBG_QP, "Flush SQ QP WQEs completed, ret=%u," + " CQP Major:Minor codes = 0x%04X:0x%04X\n", + ret, cqp_request->major_code, cqp_request->minor_code); + nes_put_cqp_request(nesdev, cqp_request); + } +} diff --git a/kernel/drivers/infiniband/hw/nes/nes_hw.h b/kernel/drivers/infiniband/hw/nes/nes_hw.h new file mode 100644 index 000000000..d748e4b31 --- /dev/null +++ b/kernel/drivers/infiniband/hw/nes/nes_hw.h @@ -0,0 +1,1392 @@ +/* +* Copyright (c) 2006 - 2011 Intel Corporation. All rights reserved. +* +* This software is available to you under a choice of one of two +* licenses. You may choose to be licensed under the terms of the GNU +* General Public License (GPL) Version 2, available from the file +* COPYING in the main directory of this source tree, or the +* OpenIB.org BSD license below: +* +* Redistribution and use in source and binary forms, with or +* without modification, are permitted provided that the following +* conditions are met: +* +* - Redistributions of source code must retain the above +* copyright notice, this list of conditions and the following +* disclaimer. +* +* - Redistributions in binary form must reproduce the above +* copyright notice, this list of conditions and the following +* disclaimer in the documentation and/or other materials +* provided with the distribution. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS +* BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN +* ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN +* CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +* SOFTWARE. 
+*/ + +#ifndef __NES_HW_H +#define __NES_HW_H + +#include + +#define NES_PHY_TYPE_CX4 1 +#define NES_PHY_TYPE_1G 2 +#define NES_PHY_TYPE_ARGUS 4 +#define NES_PHY_TYPE_PUMA_1G 5 +#define NES_PHY_TYPE_PUMA_10G 6 +#define NES_PHY_TYPE_GLADIUS 7 +#define NES_PHY_TYPE_SFP_D 8 +#define NES_PHY_TYPE_KR 9 + +#define NES_MULTICAST_PF_MAX 8 +#define NES_A0 3 + +#define NES_ENABLE_PAU 0x07000001 +#define NES_DISABLE_PAU 0x07000000 +#define NES_PAU_COUNTER 10 +#define NES_CQP_OPCODE_MASK 0x3f + +enum pci_regs { + NES_INT_STAT = 0x0000, + NES_INT_MASK = 0x0004, + NES_INT_PENDING = 0x0008, + NES_INTF_INT_STAT = 0x000C, + NES_INTF_INT_MASK = 0x0010, + NES_TIMER_STAT = 0x0014, + NES_PERIODIC_CONTROL = 0x0018, + NES_ONE_SHOT_CONTROL = 0x001C, + NES_EEPROM_COMMAND = 0x0020, + NES_EEPROM_DATA = 0x0024, + NES_FLASH_COMMAND = 0x0028, + NES_FLASH_DATA = 0x002C, + NES_SOFTWARE_RESET = 0x0030, + NES_CQ_ACK = 0x0034, + NES_WQE_ALLOC = 0x0040, + NES_CQE_ALLOC = 0x0044, + NES_AEQ_ALLOC = 0x0048 +}; + +enum indexed_regs { + NES_IDX_CREATE_CQP_LOW = 0x0000, + NES_IDX_CREATE_CQP_HIGH = 0x0004, + NES_IDX_QP_CONTROL = 0x0040, + NES_IDX_FLM_CONTROL = 0x0080, + NES_IDX_INT_CPU_STATUS = 0x00a0, + NES_IDX_GPR_TRIGGER = 0x00bc, + NES_IDX_GPIO_CONTROL = 0x00f0, + NES_IDX_GPIO_DATA = 0x00f4, + NES_IDX_GPR2 = 0x010c, + NES_IDX_TCP_CONFIG0 = 0x01e4, + NES_IDX_TCP_TIMER_CONFIG = 0x01ec, + NES_IDX_TCP_NOW = 0x01f0, + NES_IDX_QP_MAX_CFG_SIZES = 0x0200, + NES_IDX_QP_CTX_SIZE = 0x0218, + NES_IDX_TCP_TIMER_SIZE0 = 0x0238, + NES_IDX_TCP_TIMER_SIZE1 = 0x0240, + NES_IDX_ARP_CACHE_SIZE = 0x0258, + NES_IDX_CQ_CTX_SIZE = 0x0260, + NES_IDX_MRT_SIZE = 0x0278, + NES_IDX_PBL_REGION_SIZE = 0x0280, + NES_IDX_IRRQ_COUNT = 0x02b0, + NES_IDX_RX_WINDOW_BUFFER_PAGE_TABLE_SIZE = 0x02f0, + NES_IDX_RX_WINDOW_BUFFER_SIZE = 0x0300, + NES_IDX_DST_IP_ADDR = 0x0400, + NES_IDX_PCIX_DIAG = 0x08e8, + NES_IDX_MPP_DEBUG = 0x0a00, + NES_IDX_PORT_RX_DISCARDS = 0x0a30, + NES_IDX_PORT_TX_DISCARDS = 0x0a34, + NES_IDX_MPP_LB_DEBUG = 0x0b00, + NES_IDX_DENALI_CTL_22 = 0x1058, + NES_IDX_MAC_TX_CONTROL = 0x2000, + NES_IDX_MAC_TX_CONFIG = 0x2004, + NES_IDX_MAC_TX_PAUSE_QUANTA = 0x2008, + NES_IDX_MAC_RX_CONTROL = 0x200c, + NES_IDX_MAC_RX_CONFIG = 0x2010, + NES_IDX_MAC_EXACT_MATCH_BOTTOM = 0x201c, + NES_IDX_MAC_MDIO_CONTROL = 0x2084, + NES_IDX_MAC_TX_OCTETS_LOW = 0x2100, + NES_IDX_MAC_TX_OCTETS_HIGH = 0x2104, + NES_IDX_MAC_TX_FRAMES_LOW = 0x2108, + NES_IDX_MAC_TX_FRAMES_HIGH = 0x210c, + NES_IDX_MAC_TX_PAUSE_FRAMES = 0x2118, + NES_IDX_MAC_TX_ERRORS = 0x2138, + NES_IDX_MAC_RX_OCTETS_LOW = 0x213c, + NES_IDX_MAC_RX_OCTETS_HIGH = 0x2140, + NES_IDX_MAC_RX_FRAMES_LOW = 0x2144, + NES_IDX_MAC_RX_FRAMES_HIGH = 0x2148, + NES_IDX_MAC_RX_BC_FRAMES_LOW = 0x214c, + NES_IDX_MAC_RX_MC_FRAMES_HIGH = 0x2150, + NES_IDX_MAC_RX_PAUSE_FRAMES = 0x2154, + NES_IDX_MAC_RX_SHORT_FRAMES = 0x2174, + NES_IDX_MAC_RX_OVERSIZED_FRAMES = 0x2178, + NES_IDX_MAC_RX_JABBER_FRAMES = 0x217c, + NES_IDX_MAC_RX_CRC_ERR_FRAMES = 0x2180, + NES_IDX_MAC_RX_LENGTH_ERR_FRAMES = 0x2184, + NES_IDX_MAC_RX_SYMBOL_ERR_FRAMES = 0x2188, + NES_IDX_MAC_INT_STATUS = 0x21f0, + NES_IDX_MAC_INT_MASK = 0x21f4, + NES_IDX_PHY_PCS_CONTROL_STATUS0 = 0x2800, + NES_IDX_PHY_PCS_CONTROL_STATUS1 = 0x2a00, + NES_IDX_ETH_SERDES_COMMON_CONTROL0 = 0x2808, + NES_IDX_ETH_SERDES_COMMON_CONTROL1 = 0x2a08, + NES_IDX_ETH_SERDES_COMMON_STATUS0 = 0x280c, + NES_IDX_ETH_SERDES_COMMON_STATUS1 = 0x2a0c, + NES_IDX_ETH_SERDES_TX_EMP0 = 0x2810, + NES_IDX_ETH_SERDES_TX_EMP1 = 0x2a10, + NES_IDX_ETH_SERDES_TX_DRIVE0 = 0x2814, + NES_IDX_ETH_SERDES_TX_DRIVE1 = 0x2a14, + 
NES_IDX_ETH_SERDES_RX_MODE0 = 0x2818, + NES_IDX_ETH_SERDES_RX_MODE1 = 0x2a18, + NES_IDX_ETH_SERDES_RX_SIGDET0 = 0x281c, + NES_IDX_ETH_SERDES_RX_SIGDET1 = 0x2a1c, + NES_IDX_ETH_SERDES_BYPASS0 = 0x2820, + NES_IDX_ETH_SERDES_BYPASS1 = 0x2a20, + NES_IDX_ETH_SERDES_LOOPBACK_CONTROL0 = 0x2824, + NES_IDX_ETH_SERDES_LOOPBACK_CONTROL1 = 0x2a24, + NES_IDX_ETH_SERDES_RX_EQ_CONTROL0 = 0x2828, + NES_IDX_ETH_SERDES_RX_EQ_CONTROL1 = 0x2a28, + NES_IDX_ETH_SERDES_RX_EQ_STATUS0 = 0x282c, + NES_IDX_ETH_SERDES_RX_EQ_STATUS1 = 0x2a2c, + NES_IDX_ETH_SERDES_CDR_RESET0 = 0x2830, + NES_IDX_ETH_SERDES_CDR_RESET1 = 0x2a30, + NES_IDX_ETH_SERDES_CDR_CONTROL0 = 0x2834, + NES_IDX_ETH_SERDES_CDR_CONTROL1 = 0x2a34, + NES_IDX_ETH_SERDES_TX_HIGHZ_LANE_MODE0 = 0x2838, + NES_IDX_ETH_SERDES_TX_HIGHZ_LANE_MODE1 = 0x2a38, + NES_IDX_ENDNODE0_NSTAT_RX_DISCARD = 0x3080, + NES_IDX_ENDNODE0_NSTAT_RX_OCTETS_LO = 0x3000, + NES_IDX_ENDNODE0_NSTAT_RX_OCTETS_HI = 0x3004, + NES_IDX_ENDNODE0_NSTAT_RX_FRAMES_LO = 0x3008, + NES_IDX_ENDNODE0_NSTAT_RX_FRAMES_HI = 0x300c, + NES_IDX_ENDNODE0_NSTAT_TX_OCTETS_LO = 0x7000, + NES_IDX_ENDNODE0_NSTAT_TX_OCTETS_HI = 0x7004, + NES_IDX_ENDNODE0_NSTAT_TX_FRAMES_LO = 0x7008, + NES_IDX_ENDNODE0_NSTAT_TX_FRAMES_HI = 0x700c, + NES_IDX_WQM_CONFIG0 = 0x5000, + NES_IDX_WQM_CONFIG1 = 0x5004, + NES_IDX_CM_CONFIG = 0x5100, + NES_IDX_NIC_LOGPORT_TO_PHYPORT = 0x6000, + NES_IDX_NIC_PHYPORT_TO_USW = 0x6008, + NES_IDX_NIC_ACTIVE = 0x6010, + NES_IDX_NIC_UNICAST_ALL = 0x6018, + NES_IDX_NIC_MULTICAST_ALL = 0x6020, + NES_IDX_NIC_MULTICAST_ENABLE = 0x6028, + NES_IDX_NIC_BROADCAST_ON = 0x6030, + NES_IDX_USED_CHUNKS_TX = 0x60b0, + NES_IDX_TX_POOL_SIZE = 0x60b8, + NES_IDX_QUAD_HASH_TABLE_SIZE = 0x6148, + NES_IDX_PERFECT_FILTER_LOW = 0x6200, + NES_IDX_PERFECT_FILTER_HIGH = 0x6204, + NES_IDX_IPV4_TCP_REXMITS = 0x7080, + NES_IDX_DEBUG_ERROR_CONTROL_STATUS = 0x913c, + NES_IDX_DEBUG_ERROR_MASKS0 = 0x9140, + NES_IDX_DEBUG_ERROR_MASKS1 = 0x9144, + NES_IDX_DEBUG_ERROR_MASKS2 = 0x9148, + NES_IDX_DEBUG_ERROR_MASKS3 = 0x914c, + NES_IDX_DEBUG_ERROR_MASKS4 = 0x9150, + NES_IDX_DEBUG_ERROR_MASKS5 = 0x9154, +}; + +#define NES_IDX_MAC_TX_CONFIG_ENABLE_PAUSE 1 +#define NES_IDX_MPP_DEBUG_PORT_DISABLE_PAUSE (1 << 17) + +enum nes_cqp_opcodes { + NES_CQP_CREATE_QP = 0x00, + NES_CQP_MODIFY_QP = 0x01, + NES_CQP_DESTROY_QP = 0x02, + NES_CQP_CREATE_CQ = 0x03, + NES_CQP_MODIFY_CQ = 0x04, + NES_CQP_DESTROY_CQ = 0x05, + NES_CQP_ALLOCATE_STAG = 0x09, + NES_CQP_REGISTER_STAG = 0x0a, + NES_CQP_QUERY_STAG = 0x0b, + NES_CQP_REGISTER_SHARED_STAG = 0x0c, + NES_CQP_DEALLOCATE_STAG = 0x0d, + NES_CQP_MANAGE_ARP_CACHE = 0x0f, + NES_CQP_DOWNLOAD_SEGMENT = 0x10, + NES_CQP_SUSPEND_QPS = 0x11, + NES_CQP_UPLOAD_CONTEXT = 0x13, + NES_CQP_CREATE_CEQ = 0x16, + NES_CQP_DESTROY_CEQ = 0x18, + NES_CQP_CREATE_AEQ = 0x19, + NES_CQP_DESTROY_AEQ = 0x1b, + NES_CQP_LMI_ACCESS = 0x20, + NES_CQP_FLUSH_WQES = 0x22, + NES_CQP_MANAGE_APBVT = 0x23, + NES_CQP_MANAGE_QUAD_HASH = 0x25 +}; + +enum nes_cqp_wqe_word_idx { + NES_CQP_WQE_OPCODE_IDX = 0, + NES_CQP_WQE_ID_IDX = 1, + NES_CQP_WQE_COMP_CTX_LOW_IDX = 2, + NES_CQP_WQE_COMP_CTX_HIGH_IDX = 3, + NES_CQP_WQE_COMP_SCRATCH_LOW_IDX = 4, + NES_CQP_WQE_COMP_SCRATCH_HIGH_IDX = 5, +}; + +enum nes_cqp_wqe_word_download_idx { /* format differs from other cqp ops */ + NES_CQP_WQE_DL_OPCODE_IDX = 0, + NES_CQP_WQE_DL_COMP_CTX_LOW_IDX = 1, + NES_CQP_WQE_DL_COMP_CTX_HIGH_IDX = 2, + NES_CQP_WQE_DL_LENGTH_0_TOTAL_IDX = 3 + /* For index values 4-15 use NES_NIC_SQ_WQE_ values */ +}; + +enum nes_cqp_cq_wqeword_idx { + NES_CQP_CQ_WQE_PBL_LOW_IDX = 6, + 
NES_CQP_CQ_WQE_PBL_HIGH_IDX = 7, + NES_CQP_CQ_WQE_CQ_CONTEXT_LOW_IDX = 8, + NES_CQP_CQ_WQE_CQ_CONTEXT_HIGH_IDX = 9, + NES_CQP_CQ_WQE_DOORBELL_INDEX_HIGH_IDX = 10, +}; + +enum nes_cqp_stag_wqeword_idx { + NES_CQP_STAG_WQE_PBL_BLK_COUNT_IDX = 1, + NES_CQP_STAG_WQE_LEN_HIGH_PD_IDX = 6, + NES_CQP_STAG_WQE_LEN_LOW_IDX = 7, + NES_CQP_STAG_WQE_STAG_IDX = 8, + NES_CQP_STAG_WQE_VA_LOW_IDX = 10, + NES_CQP_STAG_WQE_VA_HIGH_IDX = 11, + NES_CQP_STAG_WQE_PA_LOW_IDX = 12, + NES_CQP_STAG_WQE_PA_HIGH_IDX = 13, + NES_CQP_STAG_WQE_PBL_LEN_IDX = 14 +}; + +#define NES_CQP_OP_LOGICAL_PORT_SHIFT 26 +#define NES_CQP_OP_IWARP_STATE_SHIFT 28 +#define NES_CQP_OP_TERMLEN_SHIFT 28 + +enum nes_cqp_qp_bits { + NES_CQP_QP_ARP_VALID = (1<<8), + NES_CQP_QP_WINBUF_VALID = (1<<9), + NES_CQP_QP_CONTEXT_VALID = (1<<10), + NES_CQP_QP_ORD_VALID = (1<<11), + NES_CQP_QP_WINBUF_DATAIND_EN = (1<<12), + NES_CQP_QP_VIRT_WQS = (1<<13), + NES_CQP_QP_DEL_HTE = (1<<14), + NES_CQP_QP_CQS_VALID = (1<<15), + NES_CQP_QP_TYPE_TSA = 0, + NES_CQP_QP_TYPE_IWARP = (1<<16), + NES_CQP_QP_TYPE_CQP = (4<<16), + NES_CQP_QP_TYPE_NIC = (5<<16), + NES_CQP_QP_MSS_CHG = (1<<20), + NES_CQP_QP_STATIC_RESOURCES = (1<<21), + NES_CQP_QP_IGNORE_MW_BOUND = (1<<22), + NES_CQP_QP_VWQ_USE_LMI = (1<<23), + NES_CQP_QP_IWARP_STATE_IDLE = (1<netdev */ + u8 perfect_filter_index; + u8 nic_index; + u8 qp_nic_index[4]; + u8 next_qp_nic_index; + u8 of_device_registered; + u8 rdma_enabled; + u32 lro_max_aggr; + struct net_lro_mgr lro_mgr; + struct net_lro_desc lro_desc[NES_MAX_LRO_DESCRIPTORS]; + struct timer_list event_timer; + enum ib_event_type delayed_event; + enum ib_event_type last_dispatched_event; + spinlock_t port_ibevent_lock; + u32 mgt_mem_size; + void *mgt_vbase; + dma_addr_t mgt_pbase; + struct nes_vnic_mgt *mgtvnic[NES_MGT_QP_COUNT]; + struct task_struct *mgt_thread; + wait_queue_head_t mgt_wait_queue; + struct sk_buff_head mgt_skb_list; + +}; + +struct nes_ib_device { + struct ib_device ibdev; + struct nes_vnic *nesvnic; + + /* Virtual RNIC Limits */ + u32 max_mr; + u32 max_qp; + u32 max_cq; + u32 max_pd; + u32 num_mr; + u32 num_qp; + u32 num_cq; + u32 num_pd; +}; + +enum nes_hdrct_flags { + DDP_LEN_FLAG = 0x80, + DDP_HDR_FLAG = 0x40, + RDMA_HDR_FLAG = 0x20 +}; + +enum nes_term_layers { + LAYER_RDMA = 0, + LAYER_DDP = 1, + LAYER_MPA = 2 +}; + +enum nes_term_error_types { + RDMAP_CATASTROPHIC = 0, + RDMAP_REMOTE_PROT = 1, + RDMAP_REMOTE_OP = 2, + DDP_CATASTROPHIC = 0, + DDP_TAGGED_BUFFER = 1, + DDP_UNTAGGED_BUFFER = 2, + DDP_LLP = 3 +}; + +enum nes_term_rdma_errors { + RDMAP_INV_STAG = 0x00, + RDMAP_INV_BOUNDS = 0x01, + RDMAP_ACCESS = 0x02, + RDMAP_UNASSOC_STAG = 0x03, + RDMAP_TO_WRAP = 0x04, + RDMAP_INV_RDMAP_VER = 0x05, + RDMAP_UNEXPECTED_OP = 0x06, + RDMAP_CATASTROPHIC_LOCAL = 0x07, + RDMAP_CATASTROPHIC_GLOBAL = 0x08, + RDMAP_CANT_INV_STAG = 0x09, + RDMAP_UNSPECIFIED = 0xff +}; + +enum nes_term_ddp_errors { + DDP_CATASTROPHIC_LOCAL = 0x00, + DDP_TAGGED_INV_STAG = 0x00, + DDP_TAGGED_BOUNDS = 0x01, + DDP_TAGGED_UNASSOC_STAG = 0x02, + DDP_TAGGED_TO_WRAP = 0x03, + DDP_TAGGED_INV_DDP_VER = 0x04, + DDP_UNTAGGED_INV_QN = 0x01, + DDP_UNTAGGED_INV_MSN_NO_BUF = 0x02, + DDP_UNTAGGED_INV_MSN_RANGE = 0x03, + DDP_UNTAGGED_INV_MO = 0x04, + DDP_UNTAGGED_INV_TOO_LONG = 0x05, + DDP_UNTAGGED_INV_DDP_VER = 0x06 +}; + +enum nes_term_mpa_errors { + MPA_CLOSED = 0x01, + MPA_CRC = 0x02, + MPA_MARKER = 0x03, + MPA_REQ_RSP = 0x04, +}; + +struct nes_terminate_hdr { + u8 layer_etype; + u8 error_code; + u8 hdrct; + u8 rsvd; +}; + +/* Used to determine how to fill in terminate error codes 
*/ +#define IWARP_OPCODE_WRITE 0 +#define IWARP_OPCODE_READREQ 1 +#define IWARP_OPCODE_READRSP 2 +#define IWARP_OPCODE_SEND 3 +#define IWARP_OPCODE_SEND_INV 4 +#define IWARP_OPCODE_SEND_SE 5 +#define IWARP_OPCODE_SEND_SE_INV 6 +#define IWARP_OPCODE_TERM 7 + +/* These values are used only during terminate processing */ +#define TERM_DDP_LEN_TAGGED 14 +#define TERM_DDP_LEN_UNTAGGED 18 +#define TERM_RDMA_LEN 28 +#define RDMA_OPCODE_MASK 0x0f +#define RDMA_READ_REQ_OPCODE 1 +#define BAD_FRAME_OFFSET 64 +#define CQE_MAJOR_DRV 0x8000 + +/* Used for link status recheck after interrupt processing */ +#define NES_LINK_RECHECK_DELAY msecs_to_jiffies(50) +#define NES_LINK_RECHECK_MAX 60 + +#endif /* __NES_HW_H */ diff --git a/kernel/drivers/infiniband/hw/nes/nes_mgt.c b/kernel/drivers/infiniband/hw/nes/nes_mgt.c new file mode 100644 index 000000000..416645259 --- /dev/null +++ b/kernel/drivers/infiniband/hw/nes/nes_mgt.c @@ -0,0 +1,1160 @@ +/* + * Copyright (c) 2006 - 2011 Intel-NE, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + */ + +#include <linux/skbuff.h> +#include <linux/etherdevice.h> +#include <linux/kthread.h> +#include <linux/ip.h> +#include <linux/tcp.h> +#include <net/tcp.h> +#include "nes.h" +#include "nes_mgt.h" + +atomic_t pau_qps_created; +atomic_t pau_qps_destroyed; + +static void nes_replenish_mgt_rq(struct nes_vnic_mgt *mgtvnic) +{ + unsigned long flags; + dma_addr_t bus_address; + struct sk_buff *skb; + struct nes_hw_nic_rq_wqe *nic_rqe; + struct nes_hw_mgt *nesmgt; + struct nes_device *nesdev; + struct nes_rskb_cb *cb; + u32 rx_wqes_posted = 0; + + nesmgt = &mgtvnic->mgt; + nesdev = mgtvnic->nesvnic->nesdev; + spin_lock_irqsave(&nesmgt->rq_lock, flags); + if (nesmgt->replenishing_rq != 0) { + if (((nesmgt->rq_size - 1) == atomic_read(&mgtvnic->rx_skbs_needed)) && + (atomic_read(&mgtvnic->rx_skb_timer_running) == 0)) { + atomic_set(&mgtvnic->rx_skb_timer_running, 1); + spin_unlock_irqrestore(&nesmgt->rq_lock, flags); + mgtvnic->rq_wqes_timer.expires = jiffies + (HZ / 2); /* 1/2 second */ + add_timer(&mgtvnic->rq_wqes_timer); + } else { + spin_unlock_irqrestore(&nesmgt->rq_lock, flags); + } + return; + } + nesmgt->replenishing_rq = 1; + spin_unlock_irqrestore(&nesmgt->rq_lock, flags); + do { + skb = dev_alloc_skb(mgtvnic->nesvnic->max_frame_size); + if (skb) { + skb->dev = mgtvnic->nesvnic->netdev; + + bus_address = pci_map_single(nesdev->pcidev, + skb->data, mgtvnic->nesvnic->max_frame_size, PCI_DMA_FROMDEVICE); + cb = (struct nes_rskb_cb *)&skb->cb[0]; + cb->busaddr = bus_address; + cb->maplen = mgtvnic->nesvnic->max_frame_size; + + nic_rqe = &nesmgt->rq_vbase[mgtvnic->mgt.rq_head]; + nic_rqe->wqe_words[NES_NIC_RQ_WQE_LENGTH_1_0_IDX] = + cpu_to_le32(mgtvnic->nesvnic->max_frame_size); + nic_rqe->wqe_words[NES_NIC_RQ_WQE_LENGTH_3_2_IDX] = 0; + nic_rqe->wqe_words[NES_NIC_RQ_WQE_FRAG0_LOW_IDX] = + cpu_to_le32((u32)bus_address); + nic_rqe->wqe_words[NES_NIC_RQ_WQE_FRAG0_HIGH_IDX] = + cpu_to_le32((u32)((u64)bus_address >> 32)); + nesmgt->rx_skb[nesmgt->rq_head] = skb; + nesmgt->rq_head++; + nesmgt->rq_head &= nesmgt->rq_size - 1; + atomic_dec(&mgtvnic->rx_skbs_needed); + barrier(); + if (++rx_wqes_posted == 255) { + nes_write32(nesdev->regs + NES_WQE_ALLOC, (rx_wqes_posted << 24) | nesmgt->qp_id); + rx_wqes_posted = 0; + } + } else { + spin_lock_irqsave(&nesmgt->rq_lock, flags); + if (((nesmgt->rq_size - 1) == atomic_read(&mgtvnic->rx_skbs_needed)) && + (atomic_read(&mgtvnic->rx_skb_timer_running) == 0)) { + atomic_set(&mgtvnic->rx_skb_timer_running, 1); + spin_unlock_irqrestore(&nesmgt->rq_lock, flags); + mgtvnic->rq_wqes_timer.expires = jiffies + (HZ / 2); /* 1/2 second */ + add_timer(&mgtvnic->rq_wqes_timer); + } else { + spin_unlock_irqrestore(&nesmgt->rq_lock, flags); + } + break; + } + } while (atomic_read(&mgtvnic->rx_skbs_needed)); + barrier(); + if (rx_wqes_posted) + nes_write32(nesdev->regs + NES_WQE_ALLOC, (rx_wqes_posted << 24) | nesmgt->qp_id); + nesmgt->replenishing_rq = 0; +} + +/** + * nes_mgt_rq_wqes_timeout + */ +static void nes_mgt_rq_wqes_timeout(unsigned long parm) +{ + struct nes_vnic_mgt *mgtvnic = (struct nes_vnic_mgt *)parm; + + atomic_set(&mgtvnic->rx_skb_timer_running, 0); + if (atomic_read(&mgtvnic->rx_skbs_needed)) + nes_replenish_mgt_rq(mgtvnic); +} + +/** + * nes_mgt_free_skb - unmap and free skb + */ +static void nes_mgt_free_skb(struct nes_device *nesdev, struct sk_buff *skb, u32 dir) +{ + struct nes_rskb_cb *cb; + + cb = (struct nes_rskb_cb *)&skb->cb[0]; + pci_unmap_single(nesdev->pcidev, cb->busaddr, cb->maplen, dir); + cb->busaddr = 0; + dev_kfree_skb_any(skb); +} + +/** + * nes_download_callback - handle download completions + */ +static void 
nes_download_callback(struct nes_device *nesdev, struct nes_cqp_request *cqp_request) +{ + struct pau_fpdu_info *fpdu_info = cqp_request->cqp_callback_pointer; + struct nes_qp *nesqp = fpdu_info->nesqp; + struct sk_buff *skb; + int i; + + for (i = 0; i < fpdu_info->frag_cnt; i++) { + skb = fpdu_info->frags[i].skb; + if (fpdu_info->frags[i].cmplt) { + nes_mgt_free_skb(nesdev, skb, PCI_DMA_TODEVICE); + nes_rem_ref_cm_node(nesqp->cm_node); + } + } + + if (fpdu_info->hdr_vbase) + pci_free_consistent(nesdev->pcidev, fpdu_info->hdr_len, + fpdu_info->hdr_vbase, fpdu_info->hdr_pbase); + kfree(fpdu_info); +} + +/** + * nes_get_seq - Get the seq, ack_seq and window from the packet + */ +static u32 nes_get_seq(struct sk_buff *skb, u32 *ack, u16 *wnd, u32 *fin_rcvd, u32 *rst_rcvd) +{ + struct nes_rskb_cb *cb = (struct nes_rskb_cb *)&skb->cb[0]; + struct iphdr *iph = (struct iphdr *)(cb->data_start + ETH_HLEN); + struct tcphdr *tcph = (struct tcphdr *)(((char *)iph) + (4 * iph->ihl)); + + *ack = be32_to_cpu(tcph->ack_seq); + *wnd = be16_to_cpu(tcph->window); + *fin_rcvd = tcph->fin; + *rst_rcvd = tcph->rst; + return be32_to_cpu(tcph->seq); +} + +/** + * nes_get_next_skb - Get the next skb based on where current skb is in the queue + */ +static struct sk_buff *nes_get_next_skb(struct nes_device *nesdev, struct nes_qp *nesqp, + struct sk_buff *skb, u32 nextseq, u32 *ack, + u16 *wnd, u32 *fin_rcvd, u32 *rst_rcvd) +{ + u32 seq; + bool processacks; + struct sk_buff *old_skb; + + if (skb) { + /* Continue processing fpdu */ + if (skb->next == (struct sk_buff *)&nesqp->pau_list) + goto out; + skb = skb->next; + processacks = false; + } else { + /* Starting a new one */ + if (skb_queue_empty(&nesqp->pau_list)) + goto out; + skb = skb_peek(&nesqp->pau_list); + processacks = true; + } + + while (1) { + if (skb_queue_empty(&nesqp->pau_list)) + goto out; + + seq = nes_get_seq(skb, ack, wnd, fin_rcvd, rst_rcvd); + if (seq == nextseq) { + if (skb->len || processacks) + break; + } else if (after(seq, nextseq)) { + goto out; + } + + old_skb = skb; + skb = skb->next; + skb_unlink(old_skb, &nesqp->pau_list); + nes_mgt_free_skb(nesdev, old_skb, PCI_DMA_TODEVICE); + nes_rem_ref_cm_node(nesqp->cm_node); + if (skb == (struct sk_buff *)&nesqp->pau_list) + goto out; + } + return skb; + +out: + return NULL; +} + +/** + * get_fpdu_info - Find the next complete fpdu and return its fragments. 
+ */ +static int get_fpdu_info(struct nes_device *nesdev, struct nes_qp *nesqp, + struct pau_fpdu_info **pau_fpdu_info) +{ + struct sk_buff *skb; + struct iphdr *iph; + struct tcphdr *tcph; + struct nes_rskb_cb *cb; + struct pau_fpdu_info *fpdu_info = NULL; + struct pau_fpdu_frag frags[MAX_FPDU_FRAGS]; + u32 fpdu_len = 0; + u32 tmp_len; + int frag_cnt = 0; + u32 tot_len; + u32 frag_tot; + u32 ack; + u32 fin_rcvd; + u32 rst_rcvd; + u16 wnd; + int i; + int rc = 0; + + *pau_fpdu_info = NULL; + + skb = nes_get_next_skb(nesdev, nesqp, NULL, nesqp->pau_rcv_nxt, &ack, &wnd, &fin_rcvd, &rst_rcvd); + if (!skb) + goto out; + + cb = (struct nes_rskb_cb *)&skb->cb[0]; + if (skb->len) { + fpdu_len = be16_to_cpu(*(__be16 *) skb->data) + MPA_FRAMING; + fpdu_len = (fpdu_len + 3) & 0xfffffffc; + tmp_len = fpdu_len; + + /* See if we have all of the fpdu */ + frag_tot = 0; + memset(&frags, 0, sizeof frags); + for (i = 0; i < MAX_FPDU_FRAGS; i++) { + frags[i].physaddr = cb->busaddr; + frags[i].physaddr += skb->data - cb->data_start; + frags[i].frag_len = min(tmp_len, skb->len); + frags[i].skb = skb; + frags[i].cmplt = (skb->len == frags[i].frag_len); + frag_tot += frags[i].frag_len; + frag_cnt++; + + tmp_len -= frags[i].frag_len; + if (tmp_len == 0) + break; + + skb = nes_get_next_skb(nesdev, nesqp, skb, + nesqp->pau_rcv_nxt + frag_tot, &ack, &wnd, &fin_rcvd, &rst_rcvd); + if (!skb) + goto out; + if (rst_rcvd) { + /* rst received in the middle of fpdu */ + for (; i >= 0; i--) { + skb_unlink(frags[i].skb, &nesqp->pau_list); + nes_mgt_free_skb(nesdev, frags[i].skb, PCI_DMA_TODEVICE); + } + cb = (struct nes_rskb_cb *)&skb->cb[0]; + frags[0].physaddr = cb->busaddr; + frags[0].physaddr += skb->data - cb->data_start; + frags[0].frag_len = skb->len; + frags[0].skb = skb; + frags[0].cmplt = true; + frag_cnt = 1; + break; + } + + cb = (struct nes_rskb_cb *)&skb->cb[0]; + } + } else { + /* no data */ + frags[0].physaddr = cb->busaddr; + frags[0].frag_len = 0; + frags[0].skb = skb; + frags[0].cmplt = true; + frag_cnt = 1; + } + + /* Found one */ + fpdu_info = kzalloc(sizeof(*fpdu_info), GFP_ATOMIC); + if (fpdu_info == NULL) { + nes_debug(NES_DBG_PAU, "Failed to alloc a fpdu_info.\n"); + rc = -ENOMEM; + goto out; + } + + fpdu_info->cqp_request = nes_get_cqp_request(nesdev); + if (fpdu_info->cqp_request == NULL) { + nes_debug(NES_DBG_PAU, "Failed to get a cqp_request.\n"); + rc = -ENOMEM; + goto out; + } + + cb = (struct nes_rskb_cb *)&frags[0].skb->cb[0]; + iph = (struct iphdr *)(cb->data_start + ETH_HLEN); + tcph = (struct tcphdr *)(((char *)iph) + (4 * iph->ihl)); + fpdu_info->hdr_len = (((unsigned char *)tcph) + 4 * (tcph->doff)) - cb->data_start; + fpdu_info->data_len = fpdu_len; + tot_len = fpdu_info->hdr_len + fpdu_len - ETH_HLEN; + + if (frags[0].cmplt) { + fpdu_info->hdr_pbase = cb->busaddr; + fpdu_info->hdr_vbase = NULL; + } else { + fpdu_info->hdr_vbase = pci_alloc_consistent(nesdev->pcidev, + fpdu_info->hdr_len, &fpdu_info->hdr_pbase); + if (!fpdu_info->hdr_vbase) { + nes_debug(NES_DBG_PAU, "Unable to allocate memory for pau first frag\n"); + rc = -ENOMEM; + goto out; + } + + /* Copy hdrs, adjusting len and seqnum */ + memcpy(fpdu_info->hdr_vbase, cb->data_start, fpdu_info->hdr_len); + iph = (struct iphdr *)(fpdu_info->hdr_vbase + ETH_HLEN); + tcph = (struct tcphdr *)(((char *)iph) + (4 * iph->ihl)); + } + + iph->tot_len = cpu_to_be16(tot_len); + iph->saddr = cpu_to_be32(0x7f000001); + + tcph->seq = cpu_to_be32(nesqp->pau_rcv_nxt); + tcph->ack_seq = cpu_to_be32(ack); + tcph->window = cpu_to_be16(wnd); + + 
nesqp->pau_rcv_nxt += fpdu_len + fin_rcvd; + + memcpy(fpdu_info->frags, frags, sizeof(fpdu_info->frags)); + fpdu_info->frag_cnt = frag_cnt; + fpdu_info->nesqp = nesqp; + *pau_fpdu_info = fpdu_info; + + /* Update skb's for next pass */ + for (i = 0; i < frag_cnt; i++) { + cb = (struct nes_rskb_cb *)&frags[i].skb->cb[0]; + skb_pull(frags[i].skb, frags[i].frag_len); + + if (frags[i].skb->len == 0) { + /* Pull skb off the list - it will be freed in the callback */ + if (!skb_queue_empty(&nesqp->pau_list)) + skb_unlink(frags[i].skb, &nesqp->pau_list); + } else { + /* Last skb still has data so update the seq */ + iph = (struct iphdr *)(cb->data_start + ETH_HLEN); + tcph = (struct tcphdr *)(((char *)iph) + (4 * iph->ihl)); + tcph->seq = cpu_to_be32(nesqp->pau_rcv_nxt); + } + } + +out: + if (rc) { + if (fpdu_info) { + if (fpdu_info->cqp_request) + nes_put_cqp_request(nesdev, fpdu_info->cqp_request); + kfree(fpdu_info); + } + } + return rc; +} + +/** + * forward_fpdu - send complete fpdus, one at a time + */ +static int forward_fpdus(struct nes_vnic *nesvnic, struct nes_qp *nesqp) +{ + struct nes_device *nesdev = nesvnic->nesdev; + struct pau_fpdu_info *fpdu_info; + struct nes_hw_cqp_wqe *cqp_wqe; + struct nes_cqp_request *cqp_request; + unsigned long flags; + u64 u64tmp; + u32 u32tmp; + int rc; + + while (1) { + spin_lock_irqsave(&nesqp->pau_lock, flags); + rc = get_fpdu_info(nesdev, nesqp, &fpdu_info); + if (rc || (fpdu_info == NULL)) { + spin_unlock_irqrestore(&nesqp->pau_lock, flags); + return rc; + } + + cqp_request = fpdu_info->cqp_request; + cqp_wqe = &cqp_request->cqp_wqe; + nes_fill_init_cqp_wqe(cqp_wqe, nesdev); + set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_WQE_DL_OPCODE_IDX, + NES_CQP_DOWNLOAD_SEGMENT | + (((u32)nesvnic->logical_port) << NES_CQP_OP_LOGICAL_PORT_SHIFT)); + + u32tmp = fpdu_info->hdr_len << 16; + u32tmp |= fpdu_info->hdr_len + (u32)fpdu_info->data_len; + set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_WQE_DL_LENGTH_0_TOTAL_IDX, + u32tmp); + + u32tmp = (fpdu_info->frags[1].frag_len << 16) | fpdu_info->frags[0].frag_len; + set_wqe_32bit_value(cqp_wqe->wqe_words, NES_NIC_SQ_WQE_LENGTH_2_1_IDX, + u32tmp); + + u32tmp = (fpdu_info->frags[3].frag_len << 16) | fpdu_info->frags[2].frag_len; + set_wqe_32bit_value(cqp_wqe->wqe_words, NES_NIC_SQ_WQE_LENGTH_4_3_IDX, + u32tmp); + + u64tmp = (u64)fpdu_info->hdr_pbase; + set_wqe_32bit_value(cqp_wqe->wqe_words, NES_NIC_SQ_WQE_FRAG0_LOW_IDX, + lower_32_bits(u64tmp)); + set_wqe_32bit_value(cqp_wqe->wqe_words, NES_NIC_SQ_WQE_FRAG0_HIGH_IDX, + upper_32_bits(u64tmp)); + + set_wqe_32bit_value(cqp_wqe->wqe_words, NES_NIC_SQ_WQE_FRAG1_LOW_IDX, + lower_32_bits(fpdu_info->frags[0].physaddr)); + set_wqe_32bit_value(cqp_wqe->wqe_words, NES_NIC_SQ_WQE_FRAG1_HIGH_IDX, + upper_32_bits(fpdu_info->frags[0].physaddr)); + + set_wqe_32bit_value(cqp_wqe->wqe_words, NES_NIC_SQ_WQE_FRAG2_LOW_IDX, + lower_32_bits(fpdu_info->frags[1].physaddr)); + set_wqe_32bit_value(cqp_wqe->wqe_words, NES_NIC_SQ_WQE_FRAG2_HIGH_IDX, + upper_32_bits(fpdu_info->frags[1].physaddr)); + + set_wqe_32bit_value(cqp_wqe->wqe_words, NES_NIC_SQ_WQE_FRAG3_LOW_IDX, + lower_32_bits(fpdu_info->frags[2].physaddr)); + set_wqe_32bit_value(cqp_wqe->wqe_words, NES_NIC_SQ_WQE_FRAG3_HIGH_IDX, + upper_32_bits(fpdu_info->frags[2].physaddr)); + + set_wqe_32bit_value(cqp_wqe->wqe_words, NES_NIC_SQ_WQE_FRAG4_LOW_IDX, + lower_32_bits(fpdu_info->frags[3].physaddr)); + set_wqe_32bit_value(cqp_wqe->wqe_words, NES_NIC_SQ_WQE_FRAG4_HIGH_IDX, + upper_32_bits(fpdu_info->frags[3].physaddr)); + + 
cqp_request->cqp_callback_pointer = fpdu_info; + cqp_request->callback = 1; + cqp_request->cqp_callback = nes_download_callback; + + atomic_set(&cqp_request->refcount, 1); + nes_post_cqp_request(nesdev, cqp_request); + spin_unlock_irqrestore(&nesqp->pau_lock, flags); + } + + return 0; +} + +static void process_fpdus(struct nes_vnic *nesvnic, struct nes_qp *nesqp) +{ + int again = 1; + unsigned long flags; + + do { + /* Ignore rc - if it failed, tcp retries will cause it to try again */ + forward_fpdus(nesvnic, nesqp); + + spin_lock_irqsave(&nesqp->pau_lock, flags); + if (nesqp->pau_pending) { + nesqp->pau_pending = 0; + } else { + nesqp->pau_busy = 0; + again = 0; + } + + spin_unlock_irqrestore(&nesqp->pau_lock, flags); + } while (again); +} + +/** + * queue_fpdus - Handle fpdu's that hw passed up to sw + */ +static void queue_fpdus(struct sk_buff *skb, struct nes_vnic *nesvnic, struct nes_qp *nesqp) +{ + struct sk_buff *tmpskb; + struct nes_rskb_cb *cb; + struct iphdr *iph; + struct tcphdr *tcph; + unsigned char *tcph_end; + u32 rcv_nxt; + u32 rcv_wnd; + u32 seqnum; + u32 len; + bool process_it = false; + unsigned long flags; + + /* Move data ptr to after tcp header */ + iph = (struct iphdr *)skb->data; + tcph = (struct tcphdr *)(((char *)iph) + (4 * iph->ihl)); + seqnum = be32_to_cpu(tcph->seq); + tcph_end = (((char *)tcph) + (4 * tcph->doff)); + + len = be16_to_cpu(iph->tot_len); + if (skb->len > len) + skb_trim(skb, len); + skb_pull(skb, tcph_end - skb->data); + + /* Initialize tracking values */ + cb = (struct nes_rskb_cb *)&skb->cb[0]; + cb->seqnum = seqnum; + + /* Make sure data is in the receive window */ + rcv_nxt = nesqp->pau_rcv_nxt; + rcv_wnd = le32_to_cpu(nesqp->nesqp_context->rcv_wnd); + if (!between(seqnum, rcv_nxt, (rcv_nxt + rcv_wnd))) { + nes_mgt_free_skb(nesvnic->nesdev, skb, PCI_DMA_TODEVICE); + nes_rem_ref_cm_node(nesqp->cm_node); + return; + } + + spin_lock_irqsave(&nesqp->pau_lock, flags); + + if (nesqp->pau_busy) + nesqp->pau_pending = 1; + else + nesqp->pau_busy = 1; + + /* Queue skb by sequence number */ + if (skb_queue_len(&nesqp->pau_list) == 0) { + skb_queue_head(&nesqp->pau_list, skb); + } else { + tmpskb = nesqp->pau_list.next; + while (tmpskb != (struct sk_buff *)&nesqp->pau_list) { + cb = (struct nes_rskb_cb *)&tmpskb->cb[0]; + if (before(seqnum, cb->seqnum)) + break; + tmpskb = tmpskb->next; + } + skb_insert(tmpskb, skb, &nesqp->pau_list); + } + if (nesqp->pau_state == PAU_READY) + process_it = true; + spin_unlock_irqrestore(&nesqp->pau_lock, flags); + + if (process_it) + process_fpdus(nesvnic, nesqp); + + return; +} + +/** + * mgt_thread - Handle mgt skbs in a safe context + */ +static int mgt_thread(void *context) +{ + struct nes_vnic *nesvnic = context; + struct sk_buff *skb; + struct nes_rskb_cb *cb; + + while (!kthread_should_stop()) { + wait_event_interruptible(nesvnic->mgt_wait_queue, + skb_queue_len(&nesvnic->mgt_skb_list) || kthread_should_stop()); + while ((skb_queue_len(&nesvnic->mgt_skb_list)) && !kthread_should_stop()) { + skb = skb_dequeue(&nesvnic->mgt_skb_list); + cb = (struct nes_rskb_cb *)&skb->cb[0]; + cb->data_start = skb->data - ETH_HLEN; + cb->busaddr = pci_map_single(nesvnic->nesdev->pcidev, cb->data_start, + nesvnic->max_frame_size, PCI_DMA_TODEVICE); + queue_fpdus(skb, nesvnic, cb->nesqp); + } + } + + /* Closing down so delete any entries on the queue */ + while (skb_queue_len(&nesvnic->mgt_skb_list)) { + skb = skb_dequeue(&nesvnic->mgt_skb_list); + cb = (struct nes_rskb_cb *)&skb->cb[0]; + nes_rem_ref_cm_node(cb->nesqp->cm_node); 
+ dev_kfree_skb_any(skb); + } + return 0; +} + +/** + * nes_queue_skbs - Queue skb so it can be handled in a thread context + */ +void nes_queue_mgt_skbs(struct sk_buff *skb, struct nes_vnic *nesvnic, struct nes_qp *nesqp) +{ + struct nes_rskb_cb *cb; + + cb = (struct nes_rskb_cb *)&skb->cb[0]; + cb->nesqp = nesqp; + skb_queue_tail(&nesvnic->mgt_skb_list, skb); + wake_up_interruptible(&nesvnic->mgt_wait_queue); +} + +void nes_destroy_pau_qp(struct nes_device *nesdev, struct nes_qp *nesqp) +{ + struct sk_buff *skb; + unsigned long flags; + atomic_inc(&pau_qps_destroyed); + + /* Free packets that have not yet been forwarded */ + /* Lock is acquired by skb_dequeue when removing the skb */ + spin_lock_irqsave(&nesqp->pau_lock, flags); + while (skb_queue_len(&nesqp->pau_list)) { + skb = skb_dequeue(&nesqp->pau_list); + nes_mgt_free_skb(nesdev, skb, PCI_DMA_TODEVICE); + nes_rem_ref_cm_node(nesqp->cm_node); + } + spin_unlock_irqrestore(&nesqp->pau_lock, flags); +} + +static void nes_chg_qh_handler(struct nes_device *nesdev, struct nes_cqp_request *cqp_request) +{ + struct pau_qh_chg *qh_chg = cqp_request->cqp_callback_pointer; + struct nes_cqp_request *new_request; + struct nes_hw_cqp_wqe *cqp_wqe; + struct nes_adapter *nesadapter; + struct nes_qp *nesqp; + struct nes_v4_quad nes_quad; + u32 crc_value; + u64 u64temp; + + nesadapter = nesdev->nesadapter; + nesqp = qh_chg->nesqp; + + /* Should we handle the bad completion */ + if (cqp_request->major_code) + WARN(1, PFX "Invalid cqp_request major_code=0x%x\n", + cqp_request->major_code); + + switch (nesqp->pau_state) { + case PAU_DEL_QH: + /* Old hash code deleted, now set the new one */ + nesqp->pau_state = PAU_ADD_LB_QH; + new_request = nes_get_cqp_request(nesdev); + if (new_request == NULL) { + nes_debug(NES_DBG_PAU, "Failed to get a new_request.\n"); + WARN_ON(1); + return; + } + + memset(&nes_quad, 0, sizeof(nes_quad)); + nes_quad.DstIpAdrIndex = + cpu_to_le32((u32)PCI_FUNC(nesdev->pcidev->devfn) << 24); + nes_quad.SrcIpadr = cpu_to_be32(0x7f000001); + nes_quad.TcpPorts[0] = swab16(nesqp->nesqp_context->tcpPorts[1]); + nes_quad.TcpPorts[1] = swab16(nesqp->nesqp_context->tcpPorts[0]); + + /* Produce hash key */ + crc_value = get_crc_value(&nes_quad); + nesqp->hte_index = cpu_to_be32(crc_value ^ 0xffffffff); + nes_debug(NES_DBG_PAU, "new HTE Index = 0x%08X, CRC = 0x%08X\n", + nesqp->hte_index, nesqp->hte_index & nesadapter->hte_index_mask); + + nesqp->hte_index &= nesadapter->hte_index_mask; + nesqp->nesqp_context->hte_index = cpu_to_le32(nesqp->hte_index); + nesqp->nesqp_context->ip0 = cpu_to_le32(0x7f000001); + nesqp->nesqp_context->rcv_nxt = cpu_to_le32(nesqp->pau_rcv_nxt); + + cqp_wqe = &new_request->cqp_wqe; + nes_fill_init_cqp_wqe(cqp_wqe, nesdev); + set_wqe_32bit_value(cqp_wqe->wqe_words, + NES_CQP_WQE_OPCODE_IDX, NES_CQP_MANAGE_QUAD_HASH | + NES_CQP_QP_TYPE_IWARP | NES_CQP_QP_CONTEXT_VALID | NES_CQP_QP_IWARP_STATE_RTS); + set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_WQE_ID_IDX, nesqp->hwqp.qp_id); + u64temp = (u64)nesqp->nesqp_context_pbase; + set_wqe_64bit_value(cqp_wqe->wqe_words, NES_CQP_QP_WQE_CONTEXT_LOW_IDX, u64temp); + + nes_debug(NES_DBG_PAU, "Waiting for CQP completion for adding the quad hash.\n"); + + new_request->cqp_callback_pointer = qh_chg; + new_request->callback = 1; + new_request->cqp_callback = nes_chg_qh_handler; + atomic_set(&new_request->refcount, 1); + nes_post_cqp_request(nesdev, new_request); + break; + + case PAU_ADD_LB_QH: + /* Start processing the queued fpdu's */ + nesqp->pau_state = PAU_READY; + 
process_fpdus(qh_chg->nesvnic, qh_chg->nesqp); + kfree(qh_chg); + break; + } +} + +/** + * nes_change_quad_hash + */ +static int nes_change_quad_hash(struct nes_device *nesdev, + struct nes_vnic *nesvnic, struct nes_qp *nesqp) +{ + struct nes_cqp_request *cqp_request = NULL; + struct pau_qh_chg *qh_chg = NULL; + u64 u64temp; + struct nes_hw_cqp_wqe *cqp_wqe; + int ret = 0; + + cqp_request = nes_get_cqp_request(nesdev); + if (cqp_request == NULL) { + nes_debug(NES_DBG_PAU, "Failed to get a cqp_request.\n"); + ret = -ENOMEM; + goto chg_qh_err; + } + + qh_chg = kmalloc(sizeof *qh_chg, GFP_ATOMIC); + if (qh_chg == NULL) { + nes_debug(NES_DBG_PAU, "Failed to get a cqp_request.\n"); + ret = -ENOMEM; + goto chg_qh_err; + } + qh_chg->nesdev = nesdev; + qh_chg->nesvnic = nesvnic; + qh_chg->nesqp = nesqp; + nesqp->pau_state = PAU_DEL_QH; + + cqp_wqe = &cqp_request->cqp_wqe; + nes_fill_init_cqp_wqe(cqp_wqe, nesdev); + set_wqe_32bit_value(cqp_wqe->wqe_words, + NES_CQP_WQE_OPCODE_IDX, NES_CQP_MANAGE_QUAD_HASH | NES_CQP_QP_DEL_HTE | + NES_CQP_QP_TYPE_IWARP | NES_CQP_QP_CONTEXT_VALID | NES_CQP_QP_IWARP_STATE_RTS); + set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_WQE_ID_IDX, nesqp->hwqp.qp_id); + u64temp = (u64)nesqp->nesqp_context_pbase; + set_wqe_64bit_value(cqp_wqe->wqe_words, NES_CQP_QP_WQE_CONTEXT_LOW_IDX, u64temp); + + nes_debug(NES_DBG_PAU, "Waiting for CQP completion for deleting the quad hash.\n"); + + cqp_request->cqp_callback_pointer = qh_chg; + cqp_request->callback = 1; + cqp_request->cqp_callback = nes_chg_qh_handler; + atomic_set(&cqp_request->refcount, 1); + nes_post_cqp_request(nesdev, cqp_request); + + return ret; + +chg_qh_err: + kfree(qh_chg); + if (cqp_request) + nes_put_cqp_request(nesdev, cqp_request); + return ret; +} + +/** + * nes_mgt_ce_handler + * This management code deals with any packed and unaligned (pau) fpdu's + * that the hardware cannot handle. 
+ */ +static void nes_mgt_ce_handler(struct nes_device *nesdev, struct nes_hw_nic_cq *cq) +{ + struct nes_vnic_mgt *mgtvnic = container_of(cq, struct nes_vnic_mgt, mgt_cq); + struct nes_adapter *nesadapter = nesdev->nesadapter; + u32 head; + u32 cq_size; + u32 cqe_count = 0; + u32 cqe_misc; + u32 qp_id = 0; + u32 skbs_needed; + unsigned long context; + struct nes_qp *nesqp; + struct sk_buff *rx_skb; + struct nes_rskb_cb *cb; + + head = cq->cq_head; + cq_size = cq->cq_size; + + while (1) { + cqe_misc = le32_to_cpu(cq->cq_vbase[head].cqe_words[NES_NIC_CQE_MISC_IDX]); + if (!(cqe_misc & NES_NIC_CQE_VALID)) + break; + + nesqp = NULL; + if (cqe_misc & NES_NIC_CQE_ACCQP_VALID) { + qp_id = le32_to_cpu(cq->cq_vbase[head].cqe_words[NES_NIC_CQE_ACCQP_ID_IDX]); + qp_id &= 0x001fffff; + if (qp_id < nesadapter->max_qp) { + context = (unsigned long)nesadapter->qp_table[qp_id - NES_FIRST_QPN]; + nesqp = (struct nes_qp *)context; + } + } + + if (nesqp) { + if (nesqp->pau_mode == false) { + nesqp->pau_mode = true; /* First time for this qp */ + nesqp->pau_rcv_nxt = le32_to_cpu( + cq->cq_vbase[head].cqe_words[NES_NIC_CQE_HASH_RCVNXT]); + skb_queue_head_init(&nesqp->pau_list); + spin_lock_init(&nesqp->pau_lock); + atomic_inc(&pau_qps_created); + nes_change_quad_hash(nesdev, mgtvnic->nesvnic, nesqp); + } + + rx_skb = mgtvnic->mgt.rx_skb[mgtvnic->mgt.rq_tail]; + rx_skb->len = 0; + skb_put(rx_skb, cqe_misc & 0x0000ffff); + rx_skb->protocol = eth_type_trans(rx_skb, mgtvnic->nesvnic->netdev); + cb = (struct nes_rskb_cb *)&rx_skb->cb[0]; + pci_unmap_single(nesdev->pcidev, cb->busaddr, cb->maplen, PCI_DMA_FROMDEVICE); + cb->busaddr = 0; + mgtvnic->mgt.rq_tail++; + mgtvnic->mgt.rq_tail &= mgtvnic->mgt.rq_size - 1; + + nes_add_ref_cm_node(nesqp->cm_node); + nes_queue_mgt_skbs(rx_skb, mgtvnic->nesvnic, nesqp); + } else { + printk(KERN_ERR PFX "Invalid QP %d for packed/unaligned handling\n", qp_id); + } + + cq->cq_vbase[head].cqe_words[NES_NIC_CQE_MISC_IDX] = 0; + cqe_count++; + if (++head >= cq_size) + head = 0; + + if (cqe_count == 255) { + /* Replenish mgt CQ */ + nes_write32(nesdev->regs + NES_CQE_ALLOC, cq->cq_number | (cqe_count << 16)); + nesdev->currcq_count += cqe_count; + cqe_count = 0; + } + + skbs_needed = atomic_inc_return(&mgtvnic->rx_skbs_needed); + if (skbs_needed > (mgtvnic->mgt.rq_size >> 1)) + nes_replenish_mgt_rq(mgtvnic); + } + + cq->cq_head = head; + nes_write32(nesdev->regs + NES_CQE_ALLOC, NES_CQE_ALLOC_NOTIFY_NEXT | + cq->cq_number | (cqe_count << 16)); + nes_read32(nesdev->regs + NES_CQE_ALLOC); + nesdev->currcq_count += cqe_count; +} + +/** + * nes_init_mgt_qp + */ +int nes_init_mgt_qp(struct nes_device *nesdev, struct net_device *netdev, struct nes_vnic *nesvnic) +{ + struct nes_vnic_mgt *mgtvnic; + u32 counter; + void *vmem; + dma_addr_t pmem; + struct nes_hw_cqp_wqe *cqp_wqe; + u32 cqp_head; + unsigned long flags; + struct nes_hw_nic_qp_context *mgt_context; + u64 u64temp; + struct nes_hw_nic_rq_wqe *mgt_rqe; + struct sk_buff *skb; + u32 wqe_count; + struct nes_rskb_cb *cb; + u32 mgt_mem_size; + void *mgt_vbase; + dma_addr_t mgt_pbase; + int i; + int ret; + + /* Allocate space the all mgt QPs once */ + mgtvnic = kzalloc(NES_MGT_QP_COUNT * sizeof(struct nes_vnic_mgt), GFP_KERNEL); + if (mgtvnic == NULL) { + nes_debug(NES_DBG_INIT, "Unable to allocate memory for mgt structure\n"); + return -ENOMEM; + } + + /* Allocate fragment, RQ, and CQ; Reuse CEQ based on the PCI function */ + /* We are not sending from this NIC so sq is not allocated */ + mgt_mem_size = 256 + + (NES_MGT_WQ_COUNT * 
sizeof(struct nes_hw_nic_rq_wqe)) + + (NES_MGT_WQ_COUNT * sizeof(struct nes_hw_nic_cqe)) + + sizeof(struct nes_hw_nic_qp_context); + mgt_mem_size = (mgt_mem_size + PAGE_SIZE - 1) & ~(PAGE_SIZE - 1); + mgt_vbase = pci_alloc_consistent(nesdev->pcidev, NES_MGT_QP_COUNT * mgt_mem_size, &mgt_pbase); + if (!mgt_vbase) { + kfree(mgtvnic); + nes_debug(NES_DBG_INIT, "Unable to allocate memory for mgt host descriptor rings\n"); + return -ENOMEM; + } + + nesvnic->mgt_mem_size = NES_MGT_QP_COUNT * mgt_mem_size; + nesvnic->mgt_vbase = mgt_vbase; + nesvnic->mgt_pbase = mgt_pbase; + + skb_queue_head_init(&nesvnic->mgt_skb_list); + init_waitqueue_head(&nesvnic->mgt_wait_queue); + nesvnic->mgt_thread = kthread_run(mgt_thread, nesvnic, "nes_mgt_thread"); + + for (i = 0; i < NES_MGT_QP_COUNT; i++) { + mgtvnic->nesvnic = nesvnic; + mgtvnic->mgt.qp_id = nesdev->mac_index + NES_MGT_QP_OFFSET + i; + memset(mgt_vbase, 0, mgt_mem_size); + nes_debug(NES_DBG_INIT, "Allocated mgt QP structures at %p (phys = %016lX), size = %u.\n", + mgt_vbase, (unsigned long)mgt_pbase, mgt_mem_size); + + vmem = (void *)(((unsigned long)mgt_vbase + (256 - 1)) & + ~(unsigned long)(256 - 1)); + pmem = (dma_addr_t)(((unsigned long long)mgt_pbase + (256 - 1)) & + ~(unsigned long long)(256 - 1)); + + spin_lock_init(&mgtvnic->mgt.rq_lock); + + /* setup the RQ */ + mgtvnic->mgt.rq_vbase = vmem; + mgtvnic->mgt.rq_pbase = pmem; + mgtvnic->mgt.rq_head = 0; + mgtvnic->mgt.rq_tail = 0; + mgtvnic->mgt.rq_size = NES_MGT_WQ_COUNT; + + /* setup the CQ */ + vmem += (NES_MGT_WQ_COUNT * sizeof(struct nes_hw_nic_rq_wqe)); + pmem += (NES_MGT_WQ_COUNT * sizeof(struct nes_hw_nic_rq_wqe)); + + mgtvnic->mgt_cq.cq_number = mgtvnic->mgt.qp_id; + mgtvnic->mgt_cq.cq_vbase = vmem; + mgtvnic->mgt_cq.cq_pbase = pmem; + mgtvnic->mgt_cq.cq_head = 0; + mgtvnic->mgt_cq.cq_size = NES_MGT_WQ_COUNT; + + mgtvnic->mgt_cq.ce_handler = nes_mgt_ce_handler; + + /* Send CreateCQ request to CQP */ + spin_lock_irqsave(&nesdev->cqp.lock, flags); + cqp_head = nesdev->cqp.sq_head; + + cqp_wqe = &nesdev->cqp.sq_vbase[cqp_head]; + nes_fill_init_cqp_wqe(cqp_wqe, nesdev); + + cqp_wqe->wqe_words[NES_CQP_WQE_OPCODE_IDX] = cpu_to_le32( + NES_CQP_CREATE_CQ | NES_CQP_CQ_CEQ_VALID | + ((u32)mgtvnic->mgt_cq.cq_size << 16)); + cqp_wqe->wqe_words[NES_CQP_WQE_ID_IDX] = cpu_to_le32( + mgtvnic->mgt_cq.cq_number | ((u32)nesdev->ceq_index << 16)); + u64temp = (u64)mgtvnic->mgt_cq.cq_pbase; + set_wqe_64bit_value(cqp_wqe->wqe_words, NES_CQP_CQ_WQE_PBL_LOW_IDX, u64temp); + cqp_wqe->wqe_words[NES_CQP_CQ_WQE_CQ_CONTEXT_HIGH_IDX] = 0; + u64temp = (unsigned long)&mgtvnic->mgt_cq; + cqp_wqe->wqe_words[NES_CQP_CQ_WQE_CQ_CONTEXT_LOW_IDX] = cpu_to_le32((u32)(u64temp >> 1)); + cqp_wqe->wqe_words[NES_CQP_CQ_WQE_CQ_CONTEXT_HIGH_IDX] = + cpu_to_le32(((u32)((u64temp) >> 33)) & 0x7FFFFFFF); + cqp_wqe->wqe_words[NES_CQP_CQ_WQE_DOORBELL_INDEX_HIGH_IDX] = 0; + + if (++cqp_head >= nesdev->cqp.sq_size) + cqp_head = 0; + cqp_wqe = &nesdev->cqp.sq_vbase[cqp_head]; + nes_fill_init_cqp_wqe(cqp_wqe, nesdev); + + /* Send CreateQP request to CQP */ + mgt_context = (void *)(&mgtvnic->mgt_cq.cq_vbase[mgtvnic->mgt_cq.cq_size]); + mgt_context->context_words[NES_NIC_CTX_MISC_IDX] = + cpu_to_le32((u32)NES_MGT_CTX_SIZE | + ((u32)PCI_FUNC(nesdev->pcidev->devfn) << 12)); + nes_debug(NES_DBG_INIT, "RX_WINDOW_BUFFER_PAGE_TABLE_SIZE = 0x%08X, RX_WINDOW_BUFFER_SIZE = 0x%08X\n", + nes_read_indexed(nesdev, NES_IDX_RX_WINDOW_BUFFER_PAGE_TABLE_SIZE), + nes_read_indexed(nesdev, NES_IDX_RX_WINDOW_BUFFER_SIZE)); + if (nes_read_indexed(nesdev, 
NES_IDX_RX_WINDOW_BUFFER_SIZE) != 0) + mgt_context->context_words[NES_NIC_CTX_MISC_IDX] |= cpu_to_le32(NES_NIC_BACK_STORE); + + u64temp = (u64)mgtvnic->mgt.rq_pbase; + mgt_context->context_words[NES_NIC_CTX_SQ_LOW_IDX] = cpu_to_le32((u32)u64temp); + mgt_context->context_words[NES_NIC_CTX_SQ_HIGH_IDX] = cpu_to_le32((u32)(u64temp >> 32)); + u64temp = (u64)mgtvnic->mgt.rq_pbase; + mgt_context->context_words[NES_NIC_CTX_RQ_LOW_IDX] = cpu_to_le32((u32)u64temp); + mgt_context->context_words[NES_NIC_CTX_RQ_HIGH_IDX] = cpu_to_le32((u32)(u64temp >> 32)); + + cqp_wqe->wqe_words[NES_CQP_WQE_OPCODE_IDX] = cpu_to_le32(NES_CQP_CREATE_QP | + NES_CQP_QP_TYPE_NIC); + cqp_wqe->wqe_words[NES_CQP_WQE_ID_IDX] = cpu_to_le32(mgtvnic->mgt.qp_id); + u64temp = (u64)mgtvnic->mgt_cq.cq_pbase + + (mgtvnic->mgt_cq.cq_size * sizeof(struct nes_hw_nic_cqe)); + set_wqe_64bit_value(cqp_wqe->wqe_words, NES_CQP_QP_WQE_CONTEXT_LOW_IDX, u64temp); + + if (++cqp_head >= nesdev->cqp.sq_size) + cqp_head = 0; + nesdev->cqp.sq_head = cqp_head; + + barrier(); + + /* Ring doorbell (2 WQEs) */ + nes_write32(nesdev->regs + NES_WQE_ALLOC, 0x02800000 | nesdev->cqp.qp_id); + + spin_unlock_irqrestore(&nesdev->cqp.lock, flags); + nes_debug(NES_DBG_INIT, "Waiting for create MGT QP%u to complete.\n", + mgtvnic->mgt.qp_id); + + ret = wait_event_timeout(nesdev->cqp.waitq, (nesdev->cqp.sq_tail == cqp_head), + NES_EVENT_TIMEOUT); + nes_debug(NES_DBG_INIT, "Create MGT QP%u completed, wait_event_timeout ret = %u.\n", + mgtvnic->mgt.qp_id, ret); + if (!ret) { + nes_debug(NES_DBG_INIT, "MGT QP%u create timeout expired\n", mgtvnic->mgt.qp_id); + if (i == 0) { + pci_free_consistent(nesdev->pcidev, nesvnic->mgt_mem_size, nesvnic->mgt_vbase, + nesvnic->mgt_pbase); + kfree(mgtvnic); + } else { + nes_destroy_mgt(nesvnic); + } + return -EIO; + } + + /* Populate the RQ */ + for (counter = 0; counter < (NES_MGT_WQ_COUNT - 1); counter++) { + skb = dev_alloc_skb(nesvnic->max_frame_size); + if (!skb) { + nes_debug(NES_DBG_INIT, "%s: out of memory for receive skb\n", netdev->name); + return -ENOMEM; + } + + skb->dev = netdev; + + pmem = pci_map_single(nesdev->pcidev, skb->data, + nesvnic->max_frame_size, PCI_DMA_FROMDEVICE); + cb = (struct nes_rskb_cb *)&skb->cb[0]; + cb->busaddr = pmem; + cb->maplen = nesvnic->max_frame_size; + + mgt_rqe = &mgtvnic->mgt.rq_vbase[counter]; + mgt_rqe->wqe_words[NES_NIC_RQ_WQE_LENGTH_1_0_IDX] = cpu_to_le32((u32)nesvnic->max_frame_size); + mgt_rqe->wqe_words[NES_NIC_RQ_WQE_LENGTH_3_2_IDX] = 0; + mgt_rqe->wqe_words[NES_NIC_RQ_WQE_FRAG0_LOW_IDX] = cpu_to_le32((u32)pmem); + mgt_rqe->wqe_words[NES_NIC_RQ_WQE_FRAG0_HIGH_IDX] = cpu_to_le32((u32)((u64)pmem >> 32)); + mgtvnic->mgt.rx_skb[counter] = skb; + } + + init_timer(&mgtvnic->rq_wqes_timer); + mgtvnic->rq_wqes_timer.function = nes_mgt_rq_wqes_timeout; + mgtvnic->rq_wqes_timer.data = (unsigned long)mgtvnic; + + wqe_count = NES_MGT_WQ_COUNT - 1; + mgtvnic->mgt.rq_head = wqe_count; + barrier(); + do { + counter = min(wqe_count, ((u32)255)); + wqe_count -= counter; + nes_write32(nesdev->regs + NES_WQE_ALLOC, (counter << 24) | mgtvnic->mgt.qp_id); + } while (wqe_count); + + nes_write32(nesdev->regs + NES_CQE_ALLOC, NES_CQE_ALLOC_NOTIFY_NEXT | + mgtvnic->mgt_cq.cq_number); + nes_read32(nesdev->regs + NES_CQE_ALLOC); + + mgt_vbase += mgt_mem_size; + mgt_pbase += mgt_mem_size; + nesvnic->mgtvnic[i] = mgtvnic++; + } + return 0; +} + + +void nes_destroy_mgt(struct nes_vnic *nesvnic) +{ + struct nes_device *nesdev = nesvnic->nesdev; + struct nes_vnic_mgt *mgtvnic; + struct nes_vnic_mgt 
*first_mgtvnic; + unsigned long flags; + struct nes_hw_cqp_wqe *cqp_wqe; + u32 cqp_head; + struct sk_buff *rx_skb; + int i; + int ret; + + kthread_stop(nesvnic->mgt_thread); + + /* Free remaining NIC receive buffers */ + first_mgtvnic = nesvnic->mgtvnic[0]; + for (i = 0; i < NES_MGT_QP_COUNT; i++) { + mgtvnic = nesvnic->mgtvnic[i]; + if (mgtvnic == NULL) + continue; + + while (mgtvnic->mgt.rq_head != mgtvnic->mgt.rq_tail) { + rx_skb = mgtvnic->mgt.rx_skb[mgtvnic->mgt.rq_tail]; + nes_mgt_free_skb(nesdev, rx_skb, PCI_DMA_FROMDEVICE); + mgtvnic->mgt.rq_tail++; + mgtvnic->mgt.rq_tail &= (mgtvnic->mgt.rq_size - 1); + } + + spin_lock_irqsave(&nesdev->cqp.lock, flags); + + /* Destroy NIC QP */ + cqp_head = nesdev->cqp.sq_head; + cqp_wqe = &nesdev->cqp.sq_vbase[cqp_head]; + nes_fill_init_cqp_wqe(cqp_wqe, nesdev); + + set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_WQE_OPCODE_IDX, + (NES_CQP_DESTROY_QP | NES_CQP_QP_TYPE_NIC)); + set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_WQE_ID_IDX, + mgtvnic->mgt.qp_id); + + if (++cqp_head >= nesdev->cqp.sq_size) + cqp_head = 0; + + cqp_wqe = &nesdev->cqp.sq_vbase[cqp_head]; + + /* Destroy NIC CQ */ + nes_fill_init_cqp_wqe(cqp_wqe, nesdev); + set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_WQE_OPCODE_IDX, + (NES_CQP_DESTROY_CQ | ((u32)mgtvnic->mgt_cq.cq_size << 16))); + set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_WQE_ID_IDX, + (mgtvnic->mgt_cq.cq_number | ((u32)nesdev->ceq_index << 16))); + + if (++cqp_head >= nesdev->cqp.sq_size) + cqp_head = 0; + + nesdev->cqp.sq_head = cqp_head; + barrier(); + + /* Ring doorbell (2 WQEs) */ + nes_write32(nesdev->regs + NES_WQE_ALLOC, 0x02800000 | nesdev->cqp.qp_id); + + spin_unlock_irqrestore(&nesdev->cqp.lock, flags); + nes_debug(NES_DBG_SHUTDOWN, "Waiting for CQP, cqp_head=%u, cqp.sq_head=%u," + " cqp.sq_tail=%u, cqp.sq_size=%u\n", + cqp_head, nesdev->cqp.sq_head, + nesdev->cqp.sq_tail, nesdev->cqp.sq_size); + + ret = wait_event_timeout(nesdev->cqp.waitq, (nesdev->cqp.sq_tail == cqp_head), + NES_EVENT_TIMEOUT); + + nes_debug(NES_DBG_SHUTDOWN, "Destroy MGT QP returned, wait_event_timeout ret = %u, cqp_head=%u," + " cqp.sq_head=%u, cqp.sq_tail=%u\n", + ret, cqp_head, nesdev->cqp.sq_head, nesdev->cqp.sq_tail); + if (!ret) + nes_debug(NES_DBG_SHUTDOWN, "MGT QP%u destroy timeout expired\n", + mgtvnic->mgt.qp_id); + + nesvnic->mgtvnic[i] = NULL; + } + + if (nesvnic->mgt_vbase) { + pci_free_consistent(nesdev->pcidev, nesvnic->mgt_mem_size, nesvnic->mgt_vbase, + nesvnic->mgt_pbase); + nesvnic->mgt_vbase = NULL; + nesvnic->mgt_pbase = 0; + } + + kfree(first_mgtvnic); +} diff --git a/kernel/drivers/infiniband/hw/nes/nes_mgt.h b/kernel/drivers/infiniband/hw/nes/nes_mgt.h new file mode 100644 index 000000000..4f7f701c4 --- /dev/null +++ b/kernel/drivers/infiniband/hw/nes/nes_mgt.h @@ -0,0 +1,97 @@ +/* +* Copyright (c) 2006 - 2011 Intel-NE, Inc. All rights reserved. +* +* This software is available to you under a choice of one of two +* licenses. You may choose to be licensed under the terms of the GNU +* General Public License (GPL) Version 2, available from the file +* COPYING in the main directory of this source tree, or the +* OpenIB.org BSD license below: +* +* Redistribution and use in source and binary forms, with or +* without modification, are permitted provided that the following +* conditions are met: +* +* - Redistributions of source code must retain the above +* copyright notice, this list of conditions and the following +* disclaimer. 
+* +* - Redistributions in binary form must reproduce the above +* copyright notice, this list of conditions and the following +* disclaimer in the documentation and/or other materials +* provided with the distribution. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS +* BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN +* ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN +* CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +* SOFTWARE. +*/ + +#ifndef __NES_MGT_H +#define __NES_MGT_H + +#define MPA_FRAMING 6 /* length is 2 bytes, crc is 4 bytes */ + +int nes_init_mgt_qp(struct nes_device *nesdev, struct net_device *netdev, struct nes_vnic *nesvnic); +void nes_queue_mgt_skbs(struct sk_buff *skb, struct nes_vnic *nesvnic, struct nes_qp *nesqp); +void nes_destroy_mgt(struct nes_vnic *nesvnic); +void nes_destroy_pau_qp(struct nes_device *nesdev, struct nes_qp *nesqp); + +struct nes_hw_mgt { + struct nes_hw_nic_rq_wqe *rq_vbase; /* virtual address of rq */ + dma_addr_t rq_pbase; /* PCI memory for host rings */ + struct sk_buff *rx_skb[NES_NIC_WQ_SIZE]; + u16 qp_id; + u16 sq_head; + u16 rq_head; + u16 rq_tail; + u16 rq_size; + u8 replenishing_rq; + u8 reserved; + spinlock_t rq_lock; +}; + +struct nes_vnic_mgt { + struct nes_vnic *nesvnic; + struct nes_hw_mgt mgt; + struct nes_hw_nic_cq mgt_cq; + atomic_t rx_skbs_needed; + struct timer_list rq_wqes_timer; + atomic_t rx_skb_timer_running; +}; + +#define MAX_FPDU_FRAGS 4 +struct pau_fpdu_frag { + struct sk_buff *skb; + u64 physaddr; + u32 frag_len; + bool cmplt; +}; + +struct pau_fpdu_info { + struct nes_qp *nesqp; + struct nes_cqp_request *cqp_request; + void *hdr_vbase; + dma_addr_t hdr_pbase; + int hdr_len; + u16 data_len; + u16 frag_cnt; + struct pau_fpdu_frag frags[MAX_FPDU_FRAGS]; +}; + +enum pau_qh_state { + PAU_DEL_QH, + PAU_ADD_LB_QH, + PAU_READY +}; + +struct pau_qh_chg { + struct nes_device *nesdev; + struct nes_vnic *nesvnic; + struct nes_qp *nesqp; +}; + +#endif /* __NES_MGT_H */ diff --git a/kernel/drivers/infiniband/hw/nes/nes_nic.c b/kernel/drivers/infiniband/hw/nes/nes_nic.c new file mode 100644 index 000000000..70acda91e --- /dev/null +++ b/kernel/drivers/infiniband/hw/nes/nes_nic.c @@ -0,0 +1,1883 @@ +/* + * Copyright (c) 2006 - 2011 Intel Corporation. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include "nes.h" + +static struct nic_qp_map nic_qp_mapping_0[] = { + {16,0,0,1},{24,4,0,0},{28,8,0,0},{32,12,0,0}, + {20,2,2,1},{26,6,2,0},{30,10,2,0},{34,14,2,0}, + {18,1,1,1},{25,5,1,0},{29,9,1,0},{33,13,1,0}, + {22,3,3,1},{27,7,3,0},{31,11,3,0},{35,15,3,0} +}; + +static struct nic_qp_map nic_qp_mapping_1[] = { + {18,1,1,1},{25,5,1,0},{29,9,1,0},{33,13,1,0}, + {22,3,3,1},{27,7,3,0},{31,11,3,0},{35,15,3,0} +}; + +static struct nic_qp_map nic_qp_mapping_2[] = { + {20,2,2,1},{26,6,2,0},{30,10,2,0},{34,14,2,0} +}; + +static struct nic_qp_map nic_qp_mapping_3[] = { + {22,3,3,1},{27,7,3,0},{31,11,3,0},{35,15,3,0} +}; + +static struct nic_qp_map nic_qp_mapping_4[] = { + {28,8,0,0},{32,12,0,0} +}; + +static struct nic_qp_map nic_qp_mapping_5[] = { + {29,9,1,0},{33,13,1,0} +}; + +static struct nic_qp_map nic_qp_mapping_6[] = { + {30,10,2,0},{34,14,2,0} +}; + +static struct nic_qp_map nic_qp_mapping_7[] = { + {31,11,3,0},{35,15,3,0} +}; + +static struct nic_qp_map *nic_qp_mapping_per_function[] = { + nic_qp_mapping_0, nic_qp_mapping_1, nic_qp_mapping_2, nic_qp_mapping_3, + nic_qp_mapping_4, nic_qp_mapping_5, nic_qp_mapping_6, nic_qp_mapping_7 +}; + +static const u32 default_msg = NETIF_MSG_DRV | NETIF_MSG_PROBE | NETIF_MSG_LINK + | NETIF_MSG_IFUP | NETIF_MSG_IFDOWN; +static int debug = -1; +static int nics_per_function = 1; + +/** + * nes_netdev_poll + */ +static int nes_netdev_poll(struct napi_struct *napi, int budget) +{ + struct nes_vnic *nesvnic = container_of(napi, struct nes_vnic, napi); + struct nes_device *nesdev = nesvnic->nesdev; + struct nes_hw_nic_cq *nescq = &nesvnic->nic_cq; + + nesvnic->budget = budget; + nescq->cqes_pending = 0; + nescq->rx_cqes_completed = 0; + nescq->cqe_allocs_pending = 0; + nescq->rx_pkts_indicated = 0; + + nes_nic_ce_handler(nesdev, nescq); + + if (nescq->cqes_pending == 0) { + napi_complete(napi); + /* clear out completed cqes and arm */ + nes_write32(nesdev->regs+NES_CQE_ALLOC, NES_CQE_ALLOC_NOTIFY_NEXT | + nescq->cq_number | (nescq->cqe_allocs_pending << 16)); + nes_read32(nesdev->regs+NES_CQE_ALLOC); + } else { + /* clear out completed cqes but don't arm */ + nes_write32(nesdev->regs+NES_CQE_ALLOC, + nescq->cq_number | (nescq->cqe_allocs_pending << 16)); + nes_debug(NES_DBG_NETDEV, "%s: exiting with work pending\n", + nesvnic->netdev->name); + } + return nescq->rx_pkts_indicated; +} + + +/** + * nes_netdev_open - Activate the network interface; ifconfig + * ethx up. 
+ */ +static int nes_netdev_open(struct net_device *netdev) +{ + u32 macaddr_low; + u16 macaddr_high; + struct nes_vnic *nesvnic = netdev_priv(netdev); + struct nes_device *nesdev = nesvnic->nesdev; + int ret; + int i; + struct nes_vnic *first_nesvnic = NULL; + u32 nic_active_bit; + u32 nic_active; + struct list_head *list_pos, *list_temp; + unsigned long flags; + + assert(nesdev != NULL); + + if (nesvnic->netdev_open == 1) + return 0; + + if (netif_msg_ifup(nesvnic)) + printk(KERN_INFO PFX "%s: enabling interface\n", netdev->name); + + ret = nes_init_nic_qp(nesdev, netdev); + if (ret) { + return ret; + } + + netif_carrier_off(netdev); + netif_stop_queue(netdev); + + if ((!nesvnic->of_device_registered) && (nesvnic->rdma_enabled)) { + nesvnic->nesibdev = nes_init_ofa_device(netdev); + if (nesvnic->nesibdev == NULL) { + printk(KERN_ERR PFX "%s: nesvnic->nesibdev alloc failed", netdev->name); + } else { + nesvnic->nesibdev->nesvnic = nesvnic; + ret = nes_register_ofa_device(nesvnic->nesibdev); + if (ret) { + printk(KERN_ERR PFX "%s: Unable to register RDMA device, ret = %d\n", + netdev->name, ret); + } + } + } + /* Set packet filters */ + nic_active_bit = 1 << nesvnic->nic_index; + nic_active = nes_read_indexed(nesdev, NES_IDX_NIC_ACTIVE); + nic_active |= nic_active_bit; + nes_write_indexed(nesdev, NES_IDX_NIC_ACTIVE, nic_active); + nic_active = nes_read_indexed(nesdev, NES_IDX_NIC_MULTICAST_ENABLE); + nic_active |= nic_active_bit; + nes_write_indexed(nesdev, NES_IDX_NIC_MULTICAST_ENABLE, nic_active); + nic_active = nes_read_indexed(nesdev, NES_IDX_NIC_BROADCAST_ON); + nic_active |= nic_active_bit; + nes_write_indexed(nesdev, NES_IDX_NIC_BROADCAST_ON, nic_active); + + macaddr_high = ((u16)netdev->dev_addr[0]) << 8; + macaddr_high += (u16)netdev->dev_addr[1]; + + macaddr_low = ((u32)netdev->dev_addr[2]) << 24; + macaddr_low += ((u32)netdev->dev_addr[3]) << 16; + macaddr_low += ((u32)netdev->dev_addr[4]) << 8; + macaddr_low += (u32)netdev->dev_addr[5]; + + /* Program the various MAC regs */ + for (i = 0; i < NES_MAX_PORT_COUNT; i++) { + if (nesvnic->qp_nic_index[i] == 0xf) { + break; + } + nes_debug(NES_DBG_NETDEV, "i=%d, perfect filter table index= %d, PERF FILTER LOW" + " (Addr:%08X) = %08X, HIGH = %08X.\n", + i, nesvnic->qp_nic_index[i], + NES_IDX_PERFECT_FILTER_LOW+ + (nesvnic->qp_nic_index[i] * 8), + macaddr_low, + (u32)macaddr_high | NES_MAC_ADDR_VALID | + ((((u32)nesvnic->nic_index) << 16))); + nes_write_indexed(nesdev, + NES_IDX_PERFECT_FILTER_LOW + (nesvnic->qp_nic_index[i] * 8), + macaddr_low); + nes_write_indexed(nesdev, + NES_IDX_PERFECT_FILTER_HIGH + (nesvnic->qp_nic_index[i] * 8), + (u32)macaddr_high | NES_MAC_ADDR_VALID | + ((((u32)nesvnic->nic_index) << 16))); + } + + + nes_write32(nesdev->regs+NES_CQE_ALLOC, NES_CQE_ALLOC_NOTIFY_NEXT | + nesvnic->nic_cq.cq_number); + nes_read32(nesdev->regs+NES_CQE_ALLOC); + list_for_each_safe(list_pos, list_temp, &nesdev->nesadapter->nesvnic_list[nesdev->mac_index]) { + first_nesvnic = container_of(list_pos, struct nes_vnic, list); + if (first_nesvnic->netdev_open == 1) + break; + } + if (first_nesvnic->netdev_open == 0) { + nes_debug(NES_DBG_INIT, "Setting up MAC interrupt mask.\n"); + nes_write_indexed(nesdev, NES_IDX_MAC_INT_MASK + (0x200 * nesdev->mac_index), + ~(NES_MAC_INT_LINK_STAT_CHG | NES_MAC_INT_XGMII_EXT | + NES_MAC_INT_TX_UNDERFLOW | NES_MAC_INT_TX_ERROR)); + first_nesvnic = nesvnic; + } + + if (first_nesvnic->linkup) { + /* Enable network packets */ + nesvnic->linkup = 1; + netif_start_queue(netdev); + netif_carrier_on(netdev); 
+ } + + spin_lock_irqsave(&nesdev->nesadapter->phy_lock, flags); + if (nesdev->nesadapter->phy_type[nesdev->mac_index] == NES_PHY_TYPE_SFP_D) { + nesdev->link_recheck = 1; + mod_delayed_work(system_wq, &nesdev->work, + NES_LINK_RECHECK_DELAY); + } + spin_unlock_irqrestore(&nesdev->nesadapter->phy_lock, flags); + + spin_lock_irqsave(&nesvnic->port_ibevent_lock, flags); + if (nesvnic->of_device_registered) { + nesdev->nesadapter->send_term_ok = 1; + if (nesvnic->linkup == 1) { + if (nesdev->iw_status == 0) { + nesdev->iw_status = 1; + nes_port_ibevent(nesvnic); + } + } else { + nesdev->iw_status = 0; + } + } + spin_unlock_irqrestore(&nesvnic->port_ibevent_lock, flags); + + napi_enable(&nesvnic->napi); + nesvnic->netdev_open = 1; + + return 0; +} + + +/** + * nes_netdev_stop + */ +static int nes_netdev_stop(struct net_device *netdev) +{ + struct nes_vnic *nesvnic = netdev_priv(netdev); + struct nes_device *nesdev = nesvnic->nesdev; + u32 nic_active_mask; + u32 nic_active; + struct nes_vnic *first_nesvnic = NULL; + struct list_head *list_pos, *list_temp; + unsigned long flags; + + nes_debug(NES_DBG_SHUTDOWN, "nesvnic=%p, nesdev=%p, netdev=%p %s\n", + nesvnic, nesdev, netdev, netdev->name); + if (nesvnic->netdev_open == 0) + return 0; + + if (netif_msg_ifdown(nesvnic)) + printk(KERN_INFO PFX "%s: disabling interface\n", netdev->name); + netif_carrier_off(netdev); + + /* Disable network packets */ + napi_disable(&nesvnic->napi); + netif_stop_queue(netdev); + list_for_each_safe(list_pos, list_temp, &nesdev->nesadapter->nesvnic_list[nesdev->mac_index]) { + first_nesvnic = container_of(list_pos, struct nes_vnic, list); + if ((first_nesvnic->netdev_open == 1) && (first_nesvnic != nesvnic)) + break; + } + + if ((first_nesvnic->netdev_open == 1) && (first_nesvnic != nesvnic) && + (PCI_FUNC(first_nesvnic->nesdev->pcidev->devfn) != + PCI_FUNC(nesvnic->nesdev->pcidev->devfn))) { + nes_write_indexed(nesdev, NES_IDX_MAC_INT_MASK+ + (0x200*nesdev->mac_index), 0xffffffff); + nes_write_indexed(first_nesvnic->nesdev, + NES_IDX_MAC_INT_MASK+ + (0x200*first_nesvnic->nesdev->mac_index), + ~(NES_MAC_INT_LINK_STAT_CHG | NES_MAC_INT_XGMII_EXT | + NES_MAC_INT_TX_UNDERFLOW | NES_MAC_INT_TX_ERROR)); + } else { + nes_write_indexed(nesdev, NES_IDX_MAC_INT_MASK+(0x200*nesdev->mac_index), 0xffffffff); + } + + nic_active_mask = ~((u32)(1 << nesvnic->nic_index)); + nes_write_indexed(nesdev, NES_IDX_PERFECT_FILTER_HIGH+ + (nesvnic->perfect_filter_index*8), 0); + nic_active = nes_read_indexed(nesdev, NES_IDX_NIC_ACTIVE); + nic_active &= nic_active_mask; + nes_write_indexed(nesdev, NES_IDX_NIC_ACTIVE, nic_active); + nic_active = nes_read_indexed(nesdev, NES_IDX_NIC_MULTICAST_ALL); + nic_active &= nic_active_mask; + nes_write_indexed(nesdev, NES_IDX_NIC_MULTICAST_ALL, nic_active); + nic_active = nes_read_indexed(nesdev, NES_IDX_NIC_MULTICAST_ENABLE); + nic_active &= nic_active_mask; + nes_write_indexed(nesdev, NES_IDX_NIC_MULTICAST_ENABLE, nic_active); + nic_active = nes_read_indexed(nesdev, NES_IDX_NIC_UNICAST_ALL); + nic_active &= nic_active_mask; + nes_write_indexed(nesdev, NES_IDX_NIC_UNICAST_ALL, nic_active); + nic_active = nes_read_indexed(nesdev, NES_IDX_NIC_BROADCAST_ON); + nic_active &= nic_active_mask; + nes_write_indexed(nesdev, NES_IDX_NIC_BROADCAST_ON, nic_active); + + spin_lock_irqsave(&nesvnic->port_ibevent_lock, flags); + if (nesvnic->of_device_registered) { + nesdev->nesadapter->send_term_ok = 0; + nesdev->iw_status = 0; + if (nesvnic->linkup == 1) + nes_port_ibevent(nesvnic); + } + 
del_timer_sync(&nesvnic->event_timer); + nesvnic->event_timer.function = NULL; + spin_unlock_irqrestore(&nesvnic->port_ibevent_lock, flags); + + nes_destroy_nic_qp(nesvnic); + + nesvnic->netdev_open = 0; + + return 0; +} + + +/** + * nes_nic_send + */ +static int nes_nic_send(struct sk_buff *skb, struct net_device *netdev) +{ + struct nes_vnic *nesvnic = netdev_priv(netdev); + struct nes_device *nesdev = nesvnic->nesdev; + struct nes_hw_nic *nesnic = &nesvnic->nic; + struct nes_hw_nic_sq_wqe *nic_sqe; + struct tcphdr *tcph; + __le16 *wqe_fragment_length; + u32 wqe_misc; + u16 wqe_fragment_index = 1; /* first fragment (0) is used by copy buffer */ + u16 skb_fragment_index; + dma_addr_t bus_address; + + nic_sqe = &nesnic->sq_vbase[nesnic->sq_head]; + wqe_fragment_length = (__le16 *)&nic_sqe->wqe_words[NES_NIC_SQ_WQE_LENGTH_0_TAG_IDX]; + + /* setup the VLAN tag if present */ + if (skb_vlan_tag_present(skb)) { + nes_debug(NES_DBG_NIC_TX, "%s: VLAN packet to send... VLAN = %08X\n", + netdev->name, skb_vlan_tag_get(skb)); + wqe_misc = NES_NIC_SQ_WQE_TAGVALUE_ENABLE; + wqe_fragment_length[0] = (__force __le16) skb_vlan_tag_get(skb); + } else + wqe_misc = 0; + + /* bump past the vlan tag */ + wqe_fragment_length++; + /* wqe_fragment_address = (u64 *)&nic_sqe->wqe_words[NES_NIC_SQ_WQE_FRAG0_LOW_IDX]; */ + wqe_misc |= NES_NIC_SQ_WQE_COMPLETION; + + if (skb->ip_summed == CHECKSUM_PARTIAL) { + if (skb_is_gso(skb)) { + tcph = tcp_hdr(skb); + /* nes_debug(NES_DBG_NIC_TX, "%s: TSO request... is_gso = %u seg size = %u\n", + netdev->name, skb_is_gso(skb), skb_shinfo(skb)->gso_size); */ + wqe_misc |= NES_NIC_SQ_WQE_LSO_ENABLE | (u16)skb_shinfo(skb)->gso_size; + set_wqe_32bit_value(nic_sqe->wqe_words, NES_NIC_SQ_WQE_LSO_INFO_IDX, + ((u32)tcph->doff) | + (((u32)(((unsigned char *)tcph) - skb->data)) << 4)); + } + } else { /* CHECKSUM_HW */ + wqe_misc |= NES_NIC_SQ_WQE_DISABLE_CHKSUM; + } + + set_wqe_32bit_value(nic_sqe->wqe_words, NES_NIC_SQ_WQE_TOTAL_LENGTH_IDX, + skb->len); + memcpy(&nesnic->first_frag_vbase[nesnic->sq_head].buffer, + skb->data, min(((unsigned int)NES_FIRST_FRAG_SIZE), skb_headlen(skb))); + wqe_fragment_length[0] = cpu_to_le16(min(((unsigned int)NES_FIRST_FRAG_SIZE), + skb_headlen(skb))); + wqe_fragment_length[1] = 0; + if (skb_headlen(skb) > NES_FIRST_FRAG_SIZE) { + if ((skb_shinfo(skb)->nr_frags + 1) > 4) { + nes_debug(NES_DBG_NIC_TX, "%s: Packet with %u fragments not sent, skb_headlen=%u\n", + netdev->name, skb_shinfo(skb)->nr_frags + 2, skb_headlen(skb)); + kfree_skb(skb); + nesvnic->tx_sw_dropped++; + return NETDEV_TX_LOCKED; + } + set_bit(nesnic->sq_head, nesnic->first_frag_overflow); + bus_address = pci_map_single(nesdev->pcidev, skb->data + NES_FIRST_FRAG_SIZE, + skb_headlen(skb) - NES_FIRST_FRAG_SIZE, PCI_DMA_TODEVICE); + wqe_fragment_length[wqe_fragment_index++] = + cpu_to_le16(skb_headlen(skb) - NES_FIRST_FRAG_SIZE); + wqe_fragment_length[wqe_fragment_index] = 0; + set_wqe_64bit_value(nic_sqe->wqe_words, NES_NIC_SQ_WQE_FRAG1_LOW_IDX, + ((u64)(bus_address))); + nesnic->tx_skb[nesnic->sq_head] = skb; + } + + if (skb_headlen(skb) == skb->len) { + if (skb_headlen(skb) <= NES_FIRST_FRAG_SIZE) { + nic_sqe->wqe_words[NES_NIC_SQ_WQE_LENGTH_2_1_IDX] = 0; + nesnic->tx_skb[nesnic->sq_head] = skb; + } + } else { + /* Deal with Fragments */ + nesnic->tx_skb[nesnic->sq_head] = skb; + for (skb_fragment_index = 0; skb_fragment_index < skb_shinfo(skb)->nr_frags; + skb_fragment_index++) { + skb_frag_t *frag = + &skb_shinfo(skb)->frags[skb_fragment_index]; + bus_address = 
skb_frag_dma_map(&nesdev->pcidev->dev, + frag, 0, skb_frag_size(frag), + DMA_TO_DEVICE); + wqe_fragment_length[wqe_fragment_index] = + cpu_to_le16(skb_frag_size(&skb_shinfo(skb)->frags[skb_fragment_index])); + set_wqe_64bit_value(nic_sqe->wqe_words, NES_NIC_SQ_WQE_FRAG0_LOW_IDX+(2*wqe_fragment_index), + bus_address); + wqe_fragment_index++; + if (wqe_fragment_index < 5) + wqe_fragment_length[wqe_fragment_index] = 0; + } + } + + set_wqe_32bit_value(nic_sqe->wqe_words, NES_NIC_SQ_WQE_MISC_IDX, wqe_misc); + nesnic->sq_head++; + nesnic->sq_head &= nesnic->sq_size - 1; + + return NETDEV_TX_OK; +} + + +/** + * nes_netdev_start_xmit + */ +static int nes_netdev_start_xmit(struct sk_buff *skb, struct net_device *netdev) +{ + struct nes_vnic *nesvnic = netdev_priv(netdev); + struct nes_device *nesdev = nesvnic->nesdev; + struct nes_hw_nic *nesnic = &nesvnic->nic; + struct nes_hw_nic_sq_wqe *nic_sqe; + struct tcphdr *tcph; + /* struct udphdr *udph; */ +#define NES_MAX_TSO_FRAGS MAX_SKB_FRAGS + /* 64K segment plus overflow on each side */ + dma_addr_t tso_bus_address[NES_MAX_TSO_FRAGS]; + dma_addr_t bus_address; + u32 tso_frag_index; + u32 tso_frag_count; + u32 tso_wqe_length; + u32 curr_tcp_seq; + u32 wqe_count=1; + u32 send_rc; + struct iphdr *iph; + __le16 *wqe_fragment_length; + u32 nr_frags; + u32 original_first_length; + /* u64 *wqe_fragment_address; */ + /* first fragment (0) is used by copy buffer */ + u16 wqe_fragment_index=1; + u16 hoffset; + u16 nhoffset; + u16 wqes_needed; + u16 wqes_available; + u32 wqe_misc; + + /* + * nes_debug(NES_DBG_NIC_TX, "%s Request to tx NIC packet length %u, headlen %u," + * " (%u frags), tso_size=%u\n", + * netdev->name, skb->len, skb_headlen(skb), + * skb_shinfo(skb)->nr_frags, skb_is_gso(skb)); + */ + + if (!netif_carrier_ok(netdev)) + return NETDEV_TX_OK; + + if (netif_queue_stopped(netdev)) + return NETDEV_TX_BUSY; + + /* Check if SQ is full */ + if ((((nesnic->sq_tail+(nesnic->sq_size*2))-nesnic->sq_head) & (nesnic->sq_size - 1)) == 1) { + if (!netif_queue_stopped(netdev)) { + netif_stop_queue(netdev); + barrier(); + if ((((((volatile u16)nesnic->sq_tail)+(nesnic->sq_size*2))-nesnic->sq_head) & (nesnic->sq_size - 1)) != 1) { + netif_start_queue(netdev); + goto sq_no_longer_full; + } + } + nesvnic->sq_full++; + return NETDEV_TX_BUSY; + } + +sq_no_longer_full: + nr_frags = skb_shinfo(skb)->nr_frags; + if (skb_headlen(skb) > NES_FIRST_FRAG_SIZE) { + nr_frags++; + } + /* Check if too many fragments */ + if (unlikely((nr_frags > 4))) { + if (skb_is_gso(skb)) { + nesvnic->segmented_tso_requests++; + nesvnic->tso_requests++; + /* Basically 4 fragments available per WQE with extended fragments */ + wqes_needed = nr_frags >> 2; + wqes_needed += (nr_frags&3)?1:0; + wqes_available = (((nesnic->sq_tail+nesnic->sq_size)-nesnic->sq_head) - 1) & + (nesnic->sq_size - 1); + + if (unlikely(wqes_needed > wqes_available)) { + if (!netif_queue_stopped(netdev)) { + netif_stop_queue(netdev); + barrier(); + wqes_available = (((((volatile u16)nesnic->sq_tail)+nesnic->sq_size)-nesnic->sq_head) - 1) & + (nesnic->sq_size - 1); + if (wqes_needed <= wqes_available) { + netif_start_queue(netdev); + goto tso_sq_no_longer_full; + } + } + nesvnic->sq_full++; + nes_debug(NES_DBG_NIC_TX, "%s: HNIC SQ full- TSO request has too many frags!\n", + netdev->name); + return NETDEV_TX_BUSY; + } +tso_sq_no_longer_full: + /* Map all the buffers */ + for (tso_frag_count=0; tso_frag_count < skb_shinfo(skb)->nr_frags; + tso_frag_count++) { + skb_frag_t *frag = + &skb_shinfo(skb)->frags[tso_frag_count]; + 
tso_bus_address[tso_frag_count] = + skb_frag_dma_map(&nesdev->pcidev->dev, + frag, 0, skb_frag_size(frag), + DMA_TO_DEVICE); + } + + tso_frag_index = 0; + curr_tcp_seq = ntohl(tcp_hdr(skb)->seq); + hoffset = skb_transport_header(skb) - skb->data; + nhoffset = skb_network_header(skb) - skb->data; + original_first_length = hoffset + ((((struct tcphdr *)skb_transport_header(skb))->doff)<<2); + + for (wqe_count=0; wqe_count<((u32)wqes_needed); wqe_count++) { + tso_wqe_length = 0; + nic_sqe = &nesnic->sq_vbase[nesnic->sq_head]; + wqe_fragment_length = + (__le16 *)&nic_sqe->wqe_words[NES_NIC_SQ_WQE_LENGTH_0_TAG_IDX]; + /* setup the VLAN tag if present */ + if (skb_vlan_tag_present(skb)) { + nes_debug(NES_DBG_NIC_TX, "%s: VLAN packet to send... VLAN = %08X\n", + netdev->name, + skb_vlan_tag_get(skb)); + wqe_misc = NES_NIC_SQ_WQE_TAGVALUE_ENABLE; + wqe_fragment_length[0] = (__force __le16) skb_vlan_tag_get(skb); + } else + wqe_misc = 0; + + /* bump past the vlan tag */ + wqe_fragment_length++; + + /* Assumes header totally fits in allocated buffer and is in first fragment */ + if (original_first_length > NES_FIRST_FRAG_SIZE) { + nes_debug(NES_DBG_NIC_TX, "ERROR: SKB header too big, headlen=%u, FIRST_FRAG_SIZE=%u\n", + original_first_length, NES_FIRST_FRAG_SIZE); + nes_debug(NES_DBG_NIC_TX, "%s Request to tx NIC packet length %u, headlen %u," + " (%u frags), is_gso = %u tso_size=%u\n", + netdev->name, + skb->len, skb_headlen(skb), + skb_shinfo(skb)->nr_frags, skb_is_gso(skb), skb_shinfo(skb)->gso_size); + } + memcpy(&nesnic->first_frag_vbase[nesnic->sq_head].buffer, + skb->data, min(((unsigned int)NES_FIRST_FRAG_SIZE), + original_first_length)); + iph = (struct iphdr *) + (&nesnic->first_frag_vbase[nesnic->sq_head].buffer[nhoffset]); + tcph = (struct tcphdr *) + (&nesnic->first_frag_vbase[nesnic->sq_head].buffer[hoffset]); + if ((wqe_count+1)!=(u32)wqes_needed) { + tcph->fin = 0; + tcph->psh = 0; + tcph->rst = 0; + tcph->urg = 0; + } + if (wqe_count) { + tcph->syn = 0; + } + tcph->seq = htonl(curr_tcp_seq); + wqe_fragment_length[0] = cpu_to_le16(min(((unsigned int)NES_FIRST_FRAG_SIZE), + original_first_length)); + + wqe_fragment_index = 1; + if ((wqe_count==0) && (skb_headlen(skb) > original_first_length)) { + set_bit(nesnic->sq_head, nesnic->first_frag_overflow); + bus_address = pci_map_single(nesdev->pcidev, skb->data + original_first_length, + skb_headlen(skb) - original_first_length, PCI_DMA_TODEVICE); + wqe_fragment_length[wqe_fragment_index++] = + cpu_to_le16(skb_headlen(skb) - original_first_length); + wqe_fragment_length[wqe_fragment_index] = 0; + set_wqe_64bit_value(nic_sqe->wqe_words, NES_NIC_SQ_WQE_FRAG1_LOW_IDX, + bus_address); + tso_wqe_length += skb_headlen(skb) - + original_first_length; + } + while (wqe_fragment_index < 5) { + wqe_fragment_length[wqe_fragment_index] = + cpu_to_le16(skb_frag_size(&skb_shinfo(skb)->frags[tso_frag_index])); + set_wqe_64bit_value(nic_sqe->wqe_words, NES_NIC_SQ_WQE_FRAG0_LOW_IDX+(2*wqe_fragment_index), + (u64)tso_bus_address[tso_frag_index]); + wqe_fragment_index++; + tso_wqe_length += skb_frag_size(&skb_shinfo(skb)->frags[tso_frag_index++]); + if (wqe_fragment_index < 5) + wqe_fragment_length[wqe_fragment_index] = 0; + if (tso_frag_index == tso_frag_count) + break; + } + if ((wqe_count+1) == (u32)wqes_needed) { + nesnic->tx_skb[nesnic->sq_head] = skb; + } else { + nesnic->tx_skb[nesnic->sq_head] = NULL; + } + wqe_misc |= NES_NIC_SQ_WQE_COMPLETION | (u16)skb_shinfo(skb)->gso_size; + if ((tso_wqe_length + original_first_length) > 
skb_shinfo(skb)->gso_size) { + wqe_misc |= NES_NIC_SQ_WQE_LSO_ENABLE; + } else { + iph->tot_len = htons(tso_wqe_length + original_first_length - nhoffset); + } + + set_wqe_32bit_value(nic_sqe->wqe_words, NES_NIC_SQ_WQE_MISC_IDX, + wqe_misc); + set_wqe_32bit_value(nic_sqe->wqe_words, NES_NIC_SQ_WQE_LSO_INFO_IDX, + ((u32)tcph->doff) | (((u32)hoffset) << 4)); + + set_wqe_32bit_value(nic_sqe->wqe_words, NES_NIC_SQ_WQE_TOTAL_LENGTH_IDX, + tso_wqe_length + original_first_length); + curr_tcp_seq += tso_wqe_length; + nesnic->sq_head++; + nesnic->sq_head &= nesnic->sq_size-1; + } + } else { + nesvnic->linearized_skbs++; + hoffset = skb_transport_header(skb) - skb->data; + nhoffset = skb_network_header(skb) - skb->data; + skb_linearize(skb); + skb_set_transport_header(skb, hoffset); + skb_set_network_header(skb, nhoffset); + send_rc = nes_nic_send(skb, netdev); + if (send_rc != NETDEV_TX_OK) + return NETDEV_TX_OK; + } + } else { + send_rc = nes_nic_send(skb, netdev); + if (send_rc != NETDEV_TX_OK) + return NETDEV_TX_OK; + } + + barrier(); + + if (wqe_count) + nes_write32(nesdev->regs+NES_WQE_ALLOC, + (wqe_count << 24) | (1 << 23) | nesvnic->nic.qp_id); + + netdev->trans_start = jiffies; + + return NETDEV_TX_OK; +} + + +/** + * nes_netdev_get_stats + */ +static struct net_device_stats *nes_netdev_get_stats(struct net_device *netdev) +{ + struct nes_vnic *nesvnic = netdev_priv(netdev); + struct nes_device *nesdev = nesvnic->nesdev; + u64 u64temp; + u32 u32temp; + + u32temp = nes_read_indexed(nesdev, + NES_IDX_ENDNODE0_NSTAT_RX_DISCARD + (nesvnic->nic_index*0x200)); + nesvnic->netstats.rx_dropped += u32temp; + nesvnic->endnode_nstat_rx_discard += u32temp; + + u64temp = (u64)nes_read_indexed(nesdev, + NES_IDX_ENDNODE0_NSTAT_RX_OCTETS_LO + (nesvnic->nic_index*0x200)); + u64temp += ((u64)nes_read_indexed(nesdev, + NES_IDX_ENDNODE0_NSTAT_RX_OCTETS_HI + (nesvnic->nic_index*0x200))) << 32; + + nesvnic->endnode_nstat_rx_octets += u64temp; + nesvnic->netstats.rx_bytes += u64temp; + + u64temp = (u64)nes_read_indexed(nesdev, + NES_IDX_ENDNODE0_NSTAT_RX_FRAMES_LO + (nesvnic->nic_index*0x200)); + u64temp += ((u64)nes_read_indexed(nesdev, + NES_IDX_ENDNODE0_NSTAT_RX_FRAMES_HI + (nesvnic->nic_index*0x200))) << 32; + + nesvnic->endnode_nstat_rx_frames += u64temp; + nesvnic->netstats.rx_packets += u64temp; + + u64temp = (u64)nes_read_indexed(nesdev, + NES_IDX_ENDNODE0_NSTAT_TX_OCTETS_LO + (nesvnic->nic_index*0x200)); + u64temp += ((u64)nes_read_indexed(nesdev, + NES_IDX_ENDNODE0_NSTAT_TX_OCTETS_HI + (nesvnic->nic_index*0x200))) << 32; + + nesvnic->endnode_nstat_tx_octets += u64temp; + nesvnic->netstats.tx_bytes += u64temp; + + u64temp = (u64)nes_read_indexed(nesdev, + NES_IDX_ENDNODE0_NSTAT_TX_FRAMES_LO + (nesvnic->nic_index*0x200)); + u64temp += ((u64)nes_read_indexed(nesdev, + NES_IDX_ENDNODE0_NSTAT_TX_FRAMES_HI + (nesvnic->nic_index*0x200))) << 32; + + nesvnic->endnode_nstat_tx_frames += u64temp; + nesvnic->netstats.tx_packets += u64temp; + + u32temp = nes_read_indexed(nesdev, + NES_IDX_MAC_RX_SHORT_FRAMES + (nesvnic->nesdev->mac_index*0x200)); + nesvnic->netstats.rx_dropped += u32temp; + nesvnic->nesdev->mac_rx_errors += u32temp; + nesvnic->nesdev->mac_rx_short_frames += u32temp; + + u32temp = nes_read_indexed(nesdev, + NES_IDX_MAC_RX_OVERSIZED_FRAMES + (nesvnic->nesdev->mac_index*0x200)); + nesvnic->netstats.rx_dropped += u32temp; + nesvnic->nesdev->mac_rx_errors += u32temp; + nesvnic->nesdev->mac_rx_oversized_frames += u32temp; + + u32temp = nes_read_indexed(nesdev, + NES_IDX_MAC_RX_JABBER_FRAMES + 
(nesvnic->nesdev->mac_index*0x200)); + nesvnic->netstats.rx_dropped += u32temp; + nesvnic->nesdev->mac_rx_errors += u32temp; + nesvnic->nesdev->mac_rx_jabber_frames += u32temp; + + u32temp = nes_read_indexed(nesdev, + NES_IDX_MAC_RX_SYMBOL_ERR_FRAMES + (nesvnic->nesdev->mac_index*0x200)); + nesvnic->netstats.rx_dropped += u32temp; + nesvnic->nesdev->mac_rx_errors += u32temp; + nesvnic->nesdev->mac_rx_symbol_err_frames += u32temp; + + u32temp = nes_read_indexed(nesdev, + NES_IDX_MAC_RX_LENGTH_ERR_FRAMES + (nesvnic->nesdev->mac_index*0x200)); + nesvnic->netstats.rx_length_errors += u32temp; + nesvnic->nesdev->mac_rx_errors += u32temp; + + u32temp = nes_read_indexed(nesdev, + NES_IDX_MAC_RX_CRC_ERR_FRAMES + (nesvnic->nesdev->mac_index*0x200)); + nesvnic->nesdev->mac_rx_errors += u32temp; + nesvnic->nesdev->mac_rx_crc_errors += u32temp; + nesvnic->netstats.rx_crc_errors += u32temp; + + u32temp = nes_read_indexed(nesdev, + NES_IDX_MAC_TX_ERRORS + (nesvnic->nesdev->mac_index*0x200)); + nesvnic->nesdev->mac_tx_errors += u32temp; + nesvnic->netstats.tx_errors += u32temp; + + return &nesvnic->netstats; +} + + +/** + * nes_netdev_tx_timeout + */ +static void nes_netdev_tx_timeout(struct net_device *netdev) +{ + struct nes_vnic *nesvnic = netdev_priv(netdev); + + if (netif_msg_timer(nesvnic)) + nes_debug(NES_DBG_NIC_TX, "%s: tx timeout\n", netdev->name); +} + + +/** + * nes_netdev_set_mac_address + */ +static int nes_netdev_set_mac_address(struct net_device *netdev, void *p) +{ + struct nes_vnic *nesvnic = netdev_priv(netdev); + struct nes_device *nesdev = nesvnic->nesdev; + struct sockaddr *mac_addr = p; + int i; + u32 macaddr_low; + u16 macaddr_high; + + if (!is_valid_ether_addr(mac_addr->sa_data)) + return -EADDRNOTAVAIL; + + memcpy(netdev->dev_addr, mac_addr->sa_data, netdev->addr_len); + printk(PFX "%s: Address length = %d, Address = %pM\n", + __func__, netdev->addr_len, mac_addr->sa_data); + macaddr_high = ((u16)netdev->dev_addr[0]) << 8; + macaddr_high += (u16)netdev->dev_addr[1]; + macaddr_low = ((u32)netdev->dev_addr[2]) << 24; + macaddr_low += ((u32)netdev->dev_addr[3]) << 16; + macaddr_low += ((u32)netdev->dev_addr[4]) << 8; + macaddr_low += (u32)netdev->dev_addr[5]; + + for (i = 0; i < NES_MAX_PORT_COUNT; i++) { + if (nesvnic->qp_nic_index[i] == 0xf) { + break; + } + nes_write_indexed(nesdev, + NES_IDX_PERFECT_FILTER_LOW + (nesvnic->qp_nic_index[i] * 8), + macaddr_low); + nes_write_indexed(nesdev, + NES_IDX_PERFECT_FILTER_HIGH + (nesvnic->qp_nic_index[i] * 8), + (u32)macaddr_high | NES_MAC_ADDR_VALID | + ((((u32)nesvnic->nic_index) << 16))); + } + return 0; +} + + +static void set_allmulti(struct nes_device *nesdev, u32 nic_active_bit) +{ + u32 nic_active; + + nic_active = nes_read_indexed(nesdev, NES_IDX_NIC_MULTICAST_ALL); + nic_active |= nic_active_bit; + nes_write_indexed(nesdev, NES_IDX_NIC_MULTICAST_ALL, nic_active); + nic_active = nes_read_indexed(nesdev, NES_IDX_NIC_UNICAST_ALL); + nic_active &= ~nic_active_bit; + nes_write_indexed(nesdev, NES_IDX_NIC_UNICAST_ALL, nic_active); +} + +#define get_addr(addrs, index) ((addrs) + (index) * ETH_ALEN) + +/** + * nes_netdev_set_multicast_list + */ +static void nes_netdev_set_multicast_list(struct net_device *netdev) +{ + struct nes_vnic *nesvnic = netdev_priv(netdev); + struct nes_device *nesdev = nesvnic->nesdev; + struct nes_adapter *nesadapter = nesvnic->nesdev->nesadapter; + u32 nic_active_bit; + u32 nic_active; + u32 perfect_filter_register_address; + u32 macaddr_low; + u16 macaddr_high; + u8 mc_all_on = 0; + u8 mc_index; + int 
mc_nic_index = -1; + u8 pft_entries_preallocated = max(nesadapter->adapter_fcn_count * + nics_per_function, 4); + u8 max_pft_entries_avaiable = NES_PFT_SIZE - pft_entries_preallocated; + unsigned long flags; + int mc_count = netdev_mc_count(netdev); + + spin_lock_irqsave(&nesadapter->resource_lock, flags); + nic_active_bit = 1 << nesvnic->nic_index; + + if (netdev->flags & IFF_PROMISC) { + nic_active = nes_read_indexed(nesdev, NES_IDX_NIC_MULTICAST_ALL); + nic_active |= nic_active_bit; + nes_write_indexed(nesdev, NES_IDX_NIC_MULTICAST_ALL, nic_active); + nic_active = nes_read_indexed(nesdev, NES_IDX_NIC_UNICAST_ALL); + nic_active |= nic_active_bit; + nes_write_indexed(nesdev, NES_IDX_NIC_UNICAST_ALL, nic_active); + mc_all_on = 1; + } else if ((netdev->flags & IFF_ALLMULTI) || + (nesvnic->nic_index > 3)) { + set_allmulti(nesdev, nic_active_bit); + mc_all_on = 1; + } else { + nic_active = nes_read_indexed(nesdev, NES_IDX_NIC_MULTICAST_ALL); + nic_active &= ~nic_active_bit; + nes_write_indexed(nesdev, NES_IDX_NIC_MULTICAST_ALL, nic_active); + nic_active = nes_read_indexed(nesdev, NES_IDX_NIC_UNICAST_ALL); + nic_active &= ~nic_active_bit; + nes_write_indexed(nesdev, NES_IDX_NIC_UNICAST_ALL, nic_active); + } + + nes_debug(NES_DBG_NIC_RX, "Number of MC entries = %d, Promiscuous = %d, All Multicast = %d.\n", + mc_count, !!(netdev->flags & IFF_PROMISC), + !!(netdev->flags & IFF_ALLMULTI)); + if (!mc_all_on) { + char *addrs; + int i; + struct netdev_hw_addr *ha; + + addrs = kmalloc(ETH_ALEN * mc_count, GFP_ATOMIC); + if (!addrs) { + set_allmulti(nesdev, nic_active_bit); + goto unlock; + } + i = 0; + netdev_for_each_mc_addr(ha, netdev) + memcpy(get_addr(addrs, i++), ha->addr, ETH_ALEN); + + perfect_filter_register_address = NES_IDX_PERFECT_FILTER_LOW + + pft_entries_preallocated * 0x8; + for (i = 0, mc_index = 0; mc_index < max_pft_entries_avaiable; + mc_index++) { + while (i < mc_count && nesvnic->mcrq_mcast_filter && + ((mc_nic_index = nesvnic->mcrq_mcast_filter(nesvnic, + get_addr(addrs, i++))) == 0)); + if (mc_nic_index < 0) + mc_nic_index = nesvnic->nic_index; + while (nesadapter->pft_mcast_map[mc_index] < 16 && + nesadapter->pft_mcast_map[mc_index] != + nesvnic->nic_index && + mc_index < max_pft_entries_avaiable) { + nes_debug(NES_DBG_NIC_RX, + "mc_index=%d skipping nic_index=%d, " + "used for=%d \n", mc_index, + nesvnic->nic_index, + nesadapter->pft_mcast_map[mc_index]); + mc_index++; + } + if (mc_index >= max_pft_entries_avaiable) + break; + if (i < mc_count) { + char *addr = get_addr(addrs, i++); + + nes_debug(NES_DBG_NIC_RX, "Assigning MC Address %pM to register 0x%04X nic_idx=%d\n", + addr, + perfect_filter_register_address+(mc_index * 8), + mc_nic_index); + macaddr_high = ((u8) addr[0]) << 8; + macaddr_high += (u8) addr[1]; + macaddr_low = ((u8) addr[2]) << 24; + macaddr_low += ((u8) addr[3]) << 16; + macaddr_low += ((u8) addr[4]) << 8; + macaddr_low += (u8) addr[5]; + + nes_write_indexed(nesdev, + perfect_filter_register_address+(mc_index * 8), + macaddr_low); + nes_write_indexed(nesdev, + perfect_filter_register_address+4+(mc_index * 8), + (u32)macaddr_high | NES_MAC_ADDR_VALID | + ((((u32)(1 << mc_nic_index)) << 16))); + nesadapter->pft_mcast_map[mc_index] = + nesvnic->nic_index; + } else { + nes_debug(NES_DBG_NIC_RX, "Clearing MC Address at register 0x%04X\n", + perfect_filter_register_address+(mc_index * 8)); + nes_write_indexed(nesdev, + perfect_filter_register_address+4+(mc_index * 8), + 0); + nesadapter->pft_mcast_map[mc_index] = 255; + } + } + kfree(addrs); + /* PFT is not large enough */ + if (i < mc_count) +
set_allmulti(nesdev, nic_active_bit); + } + +unlock: + spin_unlock_irqrestore(&nesadapter->resource_lock, flags); +} + + +/** + * nes_netdev_change_mtu + */ +static int nes_netdev_change_mtu(struct net_device *netdev, int new_mtu) +{ + struct nes_vnic *nesvnic = netdev_priv(netdev); + struct nes_device *nesdev = nesvnic->nesdev; + int ret = 0; + u8 jumbomode = 0; + u32 nic_active; + u32 nic_active_bit; + u32 uc_all_active; + u32 mc_all_active; + + if ((new_mtu < ETH_ZLEN) || (new_mtu > max_mtu)) + return -EINVAL; + + netdev->mtu = new_mtu; + nesvnic->max_frame_size = new_mtu + VLAN_ETH_HLEN; + + if (netdev->mtu > 1500) { + jumbomode=1; + } + nes_nic_init_timer_defaults(nesdev, jumbomode); + + if (netif_running(netdev)) { + nic_active_bit = 1 << nesvnic->nic_index; + mc_all_active = nes_read_indexed(nesdev, + NES_IDX_NIC_MULTICAST_ALL) & nic_active_bit; + uc_all_active = nes_read_indexed(nesdev, + NES_IDX_NIC_UNICAST_ALL) & nic_active_bit; + + nes_netdev_stop(netdev); + nes_netdev_open(netdev); + + nic_active = nes_read_indexed(nesdev, + NES_IDX_NIC_MULTICAST_ALL); + nic_active |= mc_all_active; + nes_write_indexed(nesdev, NES_IDX_NIC_MULTICAST_ALL, + nic_active); + + nic_active = nes_read_indexed(nesdev, NES_IDX_NIC_UNICAST_ALL); + nic_active |= uc_all_active; + nes_write_indexed(nesdev, NES_IDX_NIC_UNICAST_ALL, nic_active); + } + + return ret; +} + + +static const char nes_ethtool_stringset[][ETH_GSTRING_LEN] = { + "Link Change Interrupts", + "Linearized SKBs", + "T/GSO Requests", + "Pause Frames Sent", + "Pause Frames Received", + "Internal Routing Errors", + "SQ SW Dropped SKBs", + "SQ Full", + "Segmented TSO Requests", + "Rx Symbol Errors", + "Rx Jabber Errors", + "Rx Oversized Frames", + "Rx Short Frames", + "Rx Length Errors", + "Rx CRC Errors", + "Rx Port Discard", + "Endnode Rx Discards", + "Endnode Rx Octets", + "Endnode Rx Frames", + "Endnode Tx Octets", + "Endnode Tx Frames", + "Tx Errors", + "mh detected", + "mh pauses", + "Retransmission Count", + "CM Connects", + "CM Accepts", + "Disconnects", + "Connected Events", + "Connect Requests", + "CM Rejects", + "ModifyQP Timeouts", + "CreateQPs", + "SW DestroyQPs", + "DestroyQPs", + "CM Closes", + "CM Packets Sent", + "CM Packets Bounced", + "CM Packets Created", + "CM Packets Rcvd", + "CM Packets Dropped", + "CM Packets Retrans", + "CM Listens Created", + "CM Listens Destroyed", + "CM Backlog Drops", + "CM Loopbacks", + "CM Nodes Created", + "CM Nodes Destroyed", + "CM Accel Drops", + "CM Resets Received", + "Free 4Kpbls", + "Free 256pbls", + "Timer Inits", + "LRO aggregated", + "LRO flushed", + "LRO no_desc", + "PAU CreateQPs", + "PAU DestroyQPs", +}; +#define NES_ETHTOOL_STAT_COUNT ARRAY_SIZE(nes_ethtool_stringset) + + +/** + * nes_netdev_get_sset_count + */ +static int nes_netdev_get_sset_count(struct net_device *netdev, int stringset) +{ + if (stringset == ETH_SS_STATS) + return NES_ETHTOOL_STAT_COUNT; + else + return -EINVAL; +} + + +/** + * nes_netdev_get_strings + */ +static void nes_netdev_get_strings(struct net_device *netdev, u32 stringset, + u8 *ethtool_strings) +{ + if (stringset == ETH_SS_STATS) + memcpy(ethtool_strings, + &nes_ethtool_stringset, + sizeof(nes_ethtool_stringset)); +} + + +/** + * nes_netdev_get_ethtool_stats + */ + +static void nes_netdev_get_ethtool_stats(struct net_device *netdev, + struct ethtool_stats *target_ethtool_stats, u64 *target_stat_values) +{ + u64 u64temp; + struct nes_vnic *nesvnic = netdev_priv(netdev); + struct nes_device *nesdev = nesvnic->nesdev; + struct nes_adapter *nesadapter = 
nesdev->nesadapter; + u32 nic_count; + u32 u32temp; + u32 index = 0; + + target_ethtool_stats->n_stats = NES_ETHTOOL_STAT_COUNT; + target_stat_values[index] = nesvnic->nesdev->link_status_interrupts; + target_stat_values[++index] = nesvnic->linearized_skbs; + target_stat_values[++index] = nesvnic->tso_requests; + + u32temp = nes_read_indexed(nesdev, + NES_IDX_MAC_TX_PAUSE_FRAMES + (nesvnic->nesdev->mac_index*0x200)); + nesvnic->nesdev->mac_pause_frames_sent += u32temp; + target_stat_values[++index] = nesvnic->nesdev->mac_pause_frames_sent; + + u32temp = nes_read_indexed(nesdev, + NES_IDX_MAC_RX_PAUSE_FRAMES + (nesvnic->nesdev->mac_index*0x200)); + nesvnic->nesdev->mac_pause_frames_received += u32temp; + + u32temp = nes_read_indexed(nesdev, + NES_IDX_PORT_RX_DISCARDS + (nesvnic->nesdev->mac_index*0x40)); + nesvnic->nesdev->port_rx_discards += u32temp; + nesvnic->netstats.rx_dropped += u32temp; + + u32temp = nes_read_indexed(nesdev, + NES_IDX_PORT_TX_DISCARDS + (nesvnic->nesdev->mac_index*0x40)); + nesvnic->nesdev->port_tx_discards += u32temp; + nesvnic->netstats.tx_dropped += u32temp; + + u32temp = nes_read_indexed(nesdev, + NES_IDX_MAC_RX_SHORT_FRAMES + (nesvnic->nesdev->mac_index*0x200)); + nesvnic->netstats.rx_dropped += u32temp; + nesvnic->nesdev->mac_rx_errors += u32temp; + nesvnic->nesdev->mac_rx_short_frames += u32temp; + + u32temp = nes_read_indexed(nesdev, + NES_IDX_MAC_RX_OVERSIZED_FRAMES + (nesvnic->nesdev->mac_index*0x200)); + nesvnic->netstats.rx_dropped += u32temp; + nesvnic->nesdev->mac_rx_errors += u32temp; + nesvnic->nesdev->mac_rx_oversized_frames += u32temp; + + u32temp = nes_read_indexed(nesdev, + NES_IDX_MAC_RX_JABBER_FRAMES + (nesvnic->nesdev->mac_index*0x200)); + nesvnic->netstats.rx_dropped += u32temp; + nesvnic->nesdev->mac_rx_errors += u32temp; + nesvnic->nesdev->mac_rx_jabber_frames += u32temp; + + u32temp = nes_read_indexed(nesdev, + NES_IDX_MAC_RX_SYMBOL_ERR_FRAMES + (nesvnic->nesdev->mac_index*0x200)); + nesvnic->netstats.rx_dropped += u32temp; + nesvnic->nesdev->mac_rx_errors += u32temp; + nesvnic->nesdev->mac_rx_symbol_err_frames += u32temp; + + u32temp = nes_read_indexed(nesdev, + NES_IDX_MAC_RX_LENGTH_ERR_FRAMES + (nesvnic->nesdev->mac_index*0x200)); + nesvnic->netstats.rx_length_errors += u32temp; + nesvnic->nesdev->mac_rx_errors += u32temp; + + u32temp = nes_read_indexed(nesdev, + NES_IDX_MAC_RX_CRC_ERR_FRAMES + (nesvnic->nesdev->mac_index*0x200)); + nesvnic->nesdev->mac_rx_errors += u32temp; + nesvnic->nesdev->mac_rx_crc_errors += u32temp; + nesvnic->netstats.rx_crc_errors += u32temp; + + u32temp = nes_read_indexed(nesdev, + NES_IDX_MAC_TX_ERRORS + (nesvnic->nesdev->mac_index*0x200)); + nesvnic->nesdev->mac_tx_errors += u32temp; + nesvnic->netstats.tx_errors += u32temp; + + for (nic_count = 0; nic_count < NES_MAX_PORT_COUNT; nic_count++) { + if (nesvnic->qp_nic_index[nic_count] == 0xf) + break; + + u32temp = nes_read_indexed(nesdev, + NES_IDX_ENDNODE0_NSTAT_RX_DISCARD + + (nesvnic->qp_nic_index[nic_count]*0x200)); + nesvnic->netstats.rx_dropped += u32temp; + nesvnic->endnode_nstat_rx_discard += u32temp; + + u64temp = (u64)nes_read_indexed(nesdev, + NES_IDX_ENDNODE0_NSTAT_RX_OCTETS_LO + + (nesvnic->qp_nic_index[nic_count]*0x200)); + u64temp += ((u64)nes_read_indexed(nesdev, + NES_IDX_ENDNODE0_NSTAT_RX_OCTETS_HI + + (nesvnic->qp_nic_index[nic_count]*0x200))) << 32; + + nesvnic->endnode_nstat_rx_octets += u64temp; + nesvnic->netstats.rx_bytes += u64temp; + + u64temp = (u64)nes_read_indexed(nesdev, + NES_IDX_ENDNODE0_NSTAT_RX_FRAMES_LO + + 
(nesvnic->qp_nic_index[nic_count]*0x200)); + u64temp += ((u64)nes_read_indexed(nesdev, + NES_IDX_ENDNODE0_NSTAT_RX_FRAMES_HI + + (nesvnic->qp_nic_index[nic_count]*0x200))) << 32; + + nesvnic->endnode_nstat_rx_frames += u64temp; + nesvnic->netstats.rx_packets += u64temp; + + u64temp = (u64)nes_read_indexed(nesdev, + NES_IDX_ENDNODE0_NSTAT_TX_OCTETS_LO + + (nesvnic->qp_nic_index[nic_count]*0x200)); + u64temp += ((u64)nes_read_indexed(nesdev, + NES_IDX_ENDNODE0_NSTAT_TX_OCTETS_HI + + (nesvnic->qp_nic_index[nic_count]*0x200))) << 32; + + nesvnic->endnode_nstat_tx_octets += u64temp; + nesvnic->netstats.tx_bytes += u64temp; + + u64temp = (u64)nes_read_indexed(nesdev, + NES_IDX_ENDNODE0_NSTAT_TX_FRAMES_LO + + (nesvnic->qp_nic_index[nic_count]*0x200)); + u64temp += ((u64)nes_read_indexed(nesdev, + NES_IDX_ENDNODE0_NSTAT_TX_FRAMES_HI + + (nesvnic->qp_nic_index[nic_count]*0x200))) << 32; + + nesvnic->endnode_nstat_tx_frames += u64temp; + nesvnic->netstats.tx_packets += u64temp; + + u32temp = nes_read_indexed(nesdev, + NES_IDX_IPV4_TCP_REXMITS + (nesvnic->qp_nic_index[nic_count]*0x200)); + nesvnic->endnode_ipv4_tcp_retransmits += u32temp; + } + + target_stat_values[++index] = nesvnic->nesdev->mac_pause_frames_received; + target_stat_values[++index] = nesdev->nesadapter->nic_rx_eth_route_err; + target_stat_values[++index] = nesvnic->tx_sw_dropped; + target_stat_values[++index] = nesvnic->sq_full; + target_stat_values[++index] = nesvnic->segmented_tso_requests; + target_stat_values[++index] = nesvnic->nesdev->mac_rx_symbol_err_frames; + target_stat_values[++index] = nesvnic->nesdev->mac_rx_jabber_frames; + target_stat_values[++index] = nesvnic->nesdev->mac_rx_oversized_frames; + target_stat_values[++index] = nesvnic->nesdev->mac_rx_short_frames; + target_stat_values[++index] = nesvnic->netstats.rx_length_errors; + target_stat_values[++index] = nesvnic->nesdev->mac_rx_crc_errors; + target_stat_values[++index] = nesvnic->nesdev->port_rx_discards; + target_stat_values[++index] = nesvnic->endnode_nstat_rx_discard; + target_stat_values[++index] = nesvnic->endnode_nstat_rx_octets; + target_stat_values[++index] = nesvnic->endnode_nstat_rx_frames; + target_stat_values[++index] = nesvnic->endnode_nstat_tx_octets; + target_stat_values[++index] = nesvnic->endnode_nstat_tx_frames; + target_stat_values[++index] = nesvnic->nesdev->mac_tx_errors; + target_stat_values[++index] = mh_detected; + target_stat_values[++index] = mh_pauses_sent; + target_stat_values[++index] = nesvnic->endnode_ipv4_tcp_retransmits; + target_stat_values[++index] = atomic_read(&cm_connects); + target_stat_values[++index] = atomic_read(&cm_accepts); + target_stat_values[++index] = atomic_read(&cm_disconnects); + target_stat_values[++index] = atomic_read(&cm_connecteds); + target_stat_values[++index] = atomic_read(&cm_connect_reqs); + target_stat_values[++index] = atomic_read(&cm_rejects); + target_stat_values[++index] = atomic_read(&mod_qp_timouts); + target_stat_values[++index] = atomic_read(&qps_created); + target_stat_values[++index] = atomic_read(&sw_qps_destroyed); + target_stat_values[++index] = atomic_read(&qps_destroyed); + target_stat_values[++index] = atomic_read(&cm_closes); + target_stat_values[++index] = cm_packets_sent; + target_stat_values[++index] = cm_packets_bounced; + target_stat_values[++index] = cm_packets_created; + target_stat_values[++index] = cm_packets_received; + target_stat_values[++index] = cm_packets_dropped; + target_stat_values[++index] = cm_packets_retrans; + target_stat_values[++index] = 
atomic_read(&cm_listens_created); + target_stat_values[++index] = atomic_read(&cm_listens_destroyed); + target_stat_values[++index] = cm_backlog_drops; + target_stat_values[++index] = atomic_read(&cm_loopbacks); + target_stat_values[++index] = atomic_read(&cm_nodes_created); + target_stat_values[++index] = atomic_read(&cm_nodes_destroyed); + target_stat_values[++index] = atomic_read(&cm_accel_dropped_pkts); + target_stat_values[++index] = atomic_read(&cm_resets_recvd); + target_stat_values[++index] = nesadapter->free_4kpbl; + target_stat_values[++index] = nesadapter->free_256pbl; + target_stat_values[++index] = int_mod_timer_init; + target_stat_values[++index] = nesvnic->lro_mgr.stats.aggregated; + target_stat_values[++index] = nesvnic->lro_mgr.stats.flushed; + target_stat_values[++index] = nesvnic->lro_mgr.stats.no_desc; + target_stat_values[++index] = atomic_read(&pau_qps_created); + target_stat_values[++index] = atomic_read(&pau_qps_destroyed); +} + +/** + * nes_netdev_get_drvinfo + */ +static void nes_netdev_get_drvinfo(struct net_device *netdev, + struct ethtool_drvinfo *drvinfo) +{ + struct nes_vnic *nesvnic = netdev_priv(netdev); + struct nes_adapter *nesadapter = nesvnic->nesdev->nesadapter; + + strlcpy(drvinfo->driver, DRV_NAME, sizeof(drvinfo->driver)); + strlcpy(drvinfo->bus_info, pci_name(nesvnic->nesdev->pcidev), + sizeof(drvinfo->bus_info)); + snprintf(drvinfo->fw_version, sizeof(drvinfo->fw_version), + "%u.%u", nesadapter->firmware_version >> 16, + nesadapter->firmware_version & 0x000000ff); + strlcpy(drvinfo->version, DRV_VERSION, sizeof(drvinfo->version)); + drvinfo->testinfo_len = 0; + drvinfo->eedump_len = 0; + drvinfo->regdump_len = 0; +} + + +/** + * nes_netdev_set_coalesce + */ +static int nes_netdev_set_coalesce(struct net_device *netdev, + struct ethtool_coalesce *et_coalesce) +{ + struct nes_vnic *nesvnic = netdev_priv(netdev); + struct nes_device *nesdev = nesvnic->nesdev; + struct nes_adapter *nesadapter = nesdev->nesadapter; + struct nes_hw_tune_timer *shared_timer = &nesadapter->tune_timer; + unsigned long flags; + + spin_lock_irqsave(&nesadapter->periodic_timer_lock, flags); + if (et_coalesce->rx_max_coalesced_frames_low) { + shared_timer->threshold_low = et_coalesce->rx_max_coalesced_frames_low; + } + if (et_coalesce->rx_max_coalesced_frames_irq) { + shared_timer->threshold_target = et_coalesce->rx_max_coalesced_frames_irq; + } + if (et_coalesce->rx_max_coalesced_frames_high) { + shared_timer->threshold_high = et_coalesce->rx_max_coalesced_frames_high; + } + if (et_coalesce->rx_coalesce_usecs_low) { + shared_timer->timer_in_use_min = et_coalesce->rx_coalesce_usecs_low; + } + if (et_coalesce->rx_coalesce_usecs_high) { + shared_timer->timer_in_use_max = et_coalesce->rx_coalesce_usecs_high; + } + spin_unlock_irqrestore(&nesadapter->periodic_timer_lock, flags); + + /* using this to drive total interrupt moderation */ + nesadapter->et_rx_coalesce_usecs_irq = et_coalesce->rx_coalesce_usecs_irq; + if (et_coalesce->use_adaptive_rx_coalesce) { + nesadapter->et_use_adaptive_rx_coalesce = 1; + nesadapter->timer_int_limit = NES_TIMER_INT_LIMIT_DYNAMIC; + nesadapter->et_rx_coalesce_usecs_irq = 0; + if (et_coalesce->pkt_rate_low) { + nesadapter->et_pkt_rate_low = et_coalesce->pkt_rate_low; + } + } else { + nesadapter->et_use_adaptive_rx_coalesce = 0; + nesadapter->timer_int_limit = NES_TIMER_INT_LIMIT; + if (nesadapter->et_rx_coalesce_usecs_irq) { + nes_write32(nesdev->regs+NES_PERIODIC_CONTROL, + 0x80000000 | ((u32)(nesadapter->et_rx_coalesce_usecs_irq*8))); + } + } + 
return 0; +} + + +/** + * nes_netdev_get_coalesce + */ +static int nes_netdev_get_coalesce(struct net_device *netdev, + struct ethtool_coalesce *et_coalesce) +{ + struct nes_vnic *nesvnic = netdev_priv(netdev); + struct nes_device *nesdev = nesvnic->nesdev; + struct nes_adapter *nesadapter = nesdev->nesadapter; + struct ethtool_coalesce temp_et_coalesce; + struct nes_hw_tune_timer *shared_timer = &nesadapter->tune_timer; + unsigned long flags; + + memset(&temp_et_coalesce, 0, sizeof(temp_et_coalesce)); + temp_et_coalesce.rx_coalesce_usecs_irq = nesadapter->et_rx_coalesce_usecs_irq; + temp_et_coalesce.use_adaptive_rx_coalesce = nesadapter->et_use_adaptive_rx_coalesce; + temp_et_coalesce.rate_sample_interval = nesadapter->et_rate_sample_interval; + temp_et_coalesce.pkt_rate_low = nesadapter->et_pkt_rate_low; + spin_lock_irqsave(&nesadapter->periodic_timer_lock, flags); + temp_et_coalesce.rx_max_coalesced_frames_low = shared_timer->threshold_low; + temp_et_coalesce.rx_max_coalesced_frames_irq = shared_timer->threshold_target; + temp_et_coalesce.rx_max_coalesced_frames_high = shared_timer->threshold_high; + temp_et_coalesce.rx_coalesce_usecs_low = shared_timer->timer_in_use_min; + temp_et_coalesce.rx_coalesce_usecs_high = shared_timer->timer_in_use_max; + if (nesadapter->et_use_adaptive_rx_coalesce) { + temp_et_coalesce.rx_coalesce_usecs_irq = shared_timer->timer_in_use; + } + spin_unlock_irqrestore(&nesadapter->periodic_timer_lock, flags); + memcpy(et_coalesce, &temp_et_coalesce, sizeof(*et_coalesce)); + return 0; +} + + +/** + * nes_netdev_get_pauseparam + */ +static void nes_netdev_get_pauseparam(struct net_device *netdev, + struct ethtool_pauseparam *et_pauseparam) +{ + struct nes_vnic *nesvnic = netdev_priv(netdev); + + et_pauseparam->autoneg = 0; + et_pauseparam->rx_pause = (nesvnic->nesdev->disable_rx_flow_control == 0) ? 1:0; + et_pauseparam->tx_pause = (nesvnic->nesdev->disable_tx_flow_control == 0) ? 
1:0; +} + + +/** + * nes_netdev_set_pauseparam + */ +static int nes_netdev_set_pauseparam(struct net_device *netdev, + struct ethtool_pauseparam *et_pauseparam) +{ + struct nes_vnic *nesvnic = netdev_priv(netdev); + struct nes_device *nesdev = nesvnic->nesdev; + u32 u32temp; + + if (et_pauseparam->autoneg) { + /* TODO: should return unsupported */ + return 0; + } + if ((et_pauseparam->tx_pause == 1) && (nesdev->disable_tx_flow_control == 1)) { + u32temp = nes_read_indexed(nesdev, + NES_IDX_MAC_TX_CONFIG + (nesdev->mac_index*0x200)); + u32temp |= NES_IDX_MAC_TX_CONFIG_ENABLE_PAUSE; + nes_write_indexed(nesdev, + NES_IDX_MAC_TX_CONFIG + (nesdev->mac_index*0x200), u32temp); + nesdev->disable_tx_flow_control = 0; + } else if ((et_pauseparam->tx_pause == 0) && (nesdev->disable_tx_flow_control == 0)) { + u32temp = nes_read_indexed(nesdev, + NES_IDX_MAC_TX_CONFIG + (nesdev->mac_index*0x200)); + u32temp &= ~NES_IDX_MAC_TX_CONFIG_ENABLE_PAUSE; + nes_write_indexed(nesdev, + NES_IDX_MAC_TX_CONFIG + (nesdev->mac_index*0x200), u32temp); + nesdev->disable_tx_flow_control = 1; + } + if ((et_pauseparam->rx_pause == 1) && (nesdev->disable_rx_flow_control == 1)) { + u32temp = nes_read_indexed(nesdev, + NES_IDX_MPP_DEBUG + (nesdev->mac_index*0x40)); + u32temp &= ~NES_IDX_MPP_DEBUG_PORT_DISABLE_PAUSE; + nes_write_indexed(nesdev, + NES_IDX_MPP_DEBUG + (nesdev->mac_index*0x40), u32temp); + nesdev->disable_rx_flow_control = 0; + } else if ((et_pauseparam->rx_pause == 0) && (nesdev->disable_rx_flow_control == 0)) { + u32temp = nes_read_indexed(nesdev, + NES_IDX_MPP_DEBUG + (nesdev->mac_index*0x40)); + u32temp |= NES_IDX_MPP_DEBUG_PORT_DISABLE_PAUSE; + nes_write_indexed(nesdev, + NES_IDX_MPP_DEBUG + (nesdev->mac_index*0x40), u32temp); + nesdev->disable_rx_flow_control = 1; + } + + return 0; +} + + +/** + * nes_netdev_get_settings + */ +static int nes_netdev_get_settings(struct net_device *netdev, struct ethtool_cmd *et_cmd) +{ + struct nes_vnic *nesvnic = netdev_priv(netdev); + struct nes_device *nesdev = nesvnic->nesdev; + struct nes_adapter *nesadapter = nesdev->nesadapter; + u32 mac_index = nesdev->mac_index; + u8 phy_type = nesadapter->phy_type[mac_index]; + u8 phy_index = nesadapter->phy_index[mac_index]; + u16 phy_data; + + et_cmd->duplex = DUPLEX_FULL; + et_cmd->port = PORT_MII; + et_cmd->maxtxpkt = 511; + et_cmd->maxrxpkt = 511; + + if (nesadapter->OneG_Mode) { + ethtool_cmd_speed_set(et_cmd, SPEED_1000); + if (phy_type == NES_PHY_TYPE_PUMA_1G) { + et_cmd->supported = SUPPORTED_1000baseT_Full; + et_cmd->advertising = ADVERTISED_1000baseT_Full; + et_cmd->autoneg = AUTONEG_DISABLE; + et_cmd->transceiver = XCVR_INTERNAL; + et_cmd->phy_address = mac_index; + } else { + unsigned long flags; + et_cmd->supported = SUPPORTED_1000baseT_Full + | SUPPORTED_Autoneg; + et_cmd->advertising = ADVERTISED_1000baseT_Full + | ADVERTISED_Autoneg; + spin_lock_irqsave(&nesadapter->phy_lock, flags); + nes_read_1G_phy_reg(nesdev, 0, phy_index, &phy_data); + spin_unlock_irqrestore(&nesadapter->phy_lock, flags); + if (phy_data & 0x1000) + et_cmd->autoneg = AUTONEG_ENABLE; + else + et_cmd->autoneg = AUTONEG_DISABLE; + et_cmd->transceiver = XCVR_EXTERNAL; + et_cmd->phy_address = phy_index; + } + return 0; + } + if ((phy_type == NES_PHY_TYPE_ARGUS) || + (phy_type == NES_PHY_TYPE_SFP_D) || + (phy_type == NES_PHY_TYPE_KR)) { + et_cmd->transceiver = XCVR_EXTERNAL; + et_cmd->port = PORT_FIBRE; + et_cmd->supported = SUPPORTED_FIBRE; + et_cmd->advertising = ADVERTISED_FIBRE; + et_cmd->phy_address = phy_index; + } else { + et_cmd->transceiver 
= XCVR_INTERNAL; + et_cmd->supported = SUPPORTED_10000baseT_Full; + et_cmd->advertising = ADVERTISED_10000baseT_Full; + et_cmd->phy_address = mac_index; + } + ethtool_cmd_speed_set(et_cmd, SPEED_10000); + et_cmd->autoneg = AUTONEG_DISABLE; + return 0; +} + + +/** + * nes_netdev_set_settings + */ +static int nes_netdev_set_settings(struct net_device *netdev, struct ethtool_cmd *et_cmd) +{ + struct nes_vnic *nesvnic = netdev_priv(netdev); + struct nes_device *nesdev = nesvnic->nesdev; + struct nes_adapter *nesadapter = nesdev->nesadapter; + + if ((nesadapter->OneG_Mode) && + (nesadapter->phy_type[nesdev->mac_index] != NES_PHY_TYPE_PUMA_1G)) { + unsigned long flags; + u16 phy_data; + u8 phy_index = nesadapter->phy_index[nesdev->mac_index]; + + spin_lock_irqsave(&nesadapter->phy_lock, flags); + nes_read_1G_phy_reg(nesdev, 0, phy_index, &phy_data); + if (et_cmd->autoneg) { + /* Turn on Full duplex, Autoneg, and restart autonegotiation */ + phy_data |= 0x1300; + } else { + /* Turn off autoneg */ + phy_data &= ~0x1000; + } + nes_write_1G_phy_reg(nesdev, 0, phy_index, phy_data); + spin_unlock_irqrestore(&nesadapter->phy_lock, flags); + } + + return 0; +} + + +static const struct ethtool_ops nes_ethtool_ops = { + .get_link = ethtool_op_get_link, + .get_settings = nes_netdev_get_settings, + .set_settings = nes_netdev_set_settings, + .get_strings = nes_netdev_get_strings, + .get_sset_count = nes_netdev_get_sset_count, + .get_ethtool_stats = nes_netdev_get_ethtool_stats, + .get_drvinfo = nes_netdev_get_drvinfo, + .get_coalesce = nes_netdev_get_coalesce, + .set_coalesce = nes_netdev_set_coalesce, + .get_pauseparam = nes_netdev_get_pauseparam, + .set_pauseparam = nes_netdev_set_pauseparam, +}; + +static void nes_vlan_mode(struct net_device *netdev, struct nes_device *nesdev, netdev_features_t features) +{ + struct nes_adapter *nesadapter = nesdev->nesadapter; + u32 u32temp; + unsigned long flags; + + spin_lock_irqsave(&nesadapter->phy_lock, flags); + + nes_debug(NES_DBG_NETDEV, "%s: %s\n", __func__, netdev->name); + + /* Enable/Disable VLAN Stripping */ + u32temp = nes_read_indexed(nesdev, NES_IDX_PCIX_DIAG); + if (features & NETIF_F_HW_VLAN_CTAG_RX) + u32temp &= 0xfdffffff; + else + u32temp |= 0x02000000; + + nes_write_indexed(nesdev, NES_IDX_PCIX_DIAG, u32temp); + spin_unlock_irqrestore(&nesadapter->phy_lock, flags); +} + +static netdev_features_t nes_fix_features(struct net_device *netdev, netdev_features_t features) +{ + /* + * Since there is no support for separate rx/tx vlan accel + * enable/disable make sure tx flag is always in same state as rx. 
+ */ + if (features & NETIF_F_HW_VLAN_CTAG_RX) + features |= NETIF_F_HW_VLAN_CTAG_TX; + else + features &= ~NETIF_F_HW_VLAN_CTAG_TX; + + return features; +} + +static int nes_set_features(struct net_device *netdev, netdev_features_t features) +{ + struct nes_vnic *nesvnic = netdev_priv(netdev); + struct nes_device *nesdev = nesvnic->nesdev; + u32 changed = netdev->features ^ features; + + if (changed & NETIF_F_HW_VLAN_CTAG_RX) + nes_vlan_mode(netdev, nesdev, features); + + return 0; +} + +static const struct net_device_ops nes_netdev_ops = { + .ndo_open = nes_netdev_open, + .ndo_stop = nes_netdev_stop, + .ndo_start_xmit = nes_netdev_start_xmit, + .ndo_get_stats = nes_netdev_get_stats, + .ndo_tx_timeout = nes_netdev_tx_timeout, + .ndo_set_mac_address = nes_netdev_set_mac_address, + .ndo_set_rx_mode = nes_netdev_set_multicast_list, + .ndo_change_mtu = nes_netdev_change_mtu, + .ndo_validate_addr = eth_validate_addr, + .ndo_fix_features = nes_fix_features, + .ndo_set_features = nes_set_features, +}; + +/** + * nes_netdev_init - initialize network device + */ +struct net_device *nes_netdev_init(struct nes_device *nesdev, + void __iomem *mmio_addr) +{ + u64 u64temp; + struct nes_vnic *nesvnic; + struct net_device *netdev; + struct nic_qp_map *curr_qp_map; + u8 phy_type = nesdev->nesadapter->phy_type[nesdev->mac_index]; + + netdev = alloc_etherdev(sizeof(struct nes_vnic)); + if (!netdev) { + printk(KERN_ERR PFX "nesvnic etherdev alloc failed"); + return NULL; + } + nesvnic = netdev_priv(netdev); + + nes_debug(NES_DBG_INIT, "netdev = %p, %s\n", netdev, netdev->name); + + SET_NETDEV_DEV(netdev, &nesdev->pcidev->dev); + + netdev->watchdog_timeo = NES_TX_TIMEOUT; + netdev->irq = nesdev->pcidev->irq; + netdev->mtu = ETH_DATA_LEN; + netdev->hard_header_len = ETH_HLEN; + netdev->addr_len = ETH_ALEN; + netdev->type = ARPHRD_ETHER; + netdev->netdev_ops = &nes_netdev_ops; + netdev->ethtool_ops = &nes_ethtool_ops; + netif_napi_add(netdev, &nesvnic->napi, nes_netdev_poll, 128); + nes_debug(NES_DBG_INIT, "Enabling VLAN Insert/Delete.\n"); + + /* Fill in the port structure */ + nesvnic->netdev = netdev; + nesvnic->nesdev = nesdev; + nesvnic->msg_enable = netif_msg_init(debug, default_msg); + nesvnic->netdev_index = nesdev->netdev_count; + nesvnic->perfect_filter_index = nesdev->nesadapter->netdev_count; + nesvnic->max_frame_size = netdev->mtu + netdev->hard_header_len + VLAN_HLEN; + + curr_qp_map = nic_qp_mapping_per_function[PCI_FUNC(nesdev->pcidev->devfn)]; + nesvnic->nic.qp_id = curr_qp_map[nesdev->netdev_count].qpid; + nesvnic->nic_index = curr_qp_map[nesdev->netdev_count].nic_index; + nesvnic->logical_port = curr_qp_map[nesdev->netdev_count].logical_port; + + /* Setup the burned in MAC address */ + u64temp = (u64)nesdev->nesadapter->mac_addr_low; + u64temp += ((u64)nesdev->nesadapter->mac_addr_high) << 32; + u64temp += nesvnic->nic_index; + netdev->dev_addr[0] = (u8)(u64temp>>40); + netdev->dev_addr[1] = (u8)(u64temp>>32); + netdev->dev_addr[2] = (u8)(u64temp>>24); + netdev->dev_addr[3] = (u8)(u64temp>>16); + netdev->dev_addr[4] = (u8)(u64temp>>8); + netdev->dev_addr[5] = (u8)u64temp; + + netdev->hw_features = NETIF_F_SG | NETIF_F_IP_CSUM | NETIF_F_RXCSUM | NETIF_F_HW_VLAN_CTAG_RX; + if ((nesvnic->logical_port < 2) || (nesdev->nesadapter->hw_rev != NE020_REV)) + netdev->hw_features |= NETIF_F_TSO; + + netdev->features = netdev->hw_features | NETIF_F_HIGHDMA | NETIF_F_HW_VLAN_CTAG_TX; + netdev->hw_features |= NETIF_F_LRO; + + nes_debug(NES_DBG_INIT, "nesvnic = %p, reported features = 0x%lX, QPid = %d," + " 
nic_index = %d, logical_port = %d, mac_index = %d.\n", + nesvnic, (unsigned long)netdev->features, nesvnic->nic.qp_id, + nesvnic->nic_index, nesvnic->logical_port, nesdev->mac_index); + + if (nesvnic->nesdev->nesadapter->port_count == 1 && + nesvnic->nesdev->nesadapter->adapter_fcn_count == 1) { + + nesvnic->qp_nic_index[0] = nesvnic->nic_index; + nesvnic->qp_nic_index[1] = nesvnic->nic_index + 1; + if (nes_drv_opt & NES_DRV_OPT_DUAL_LOGICAL_PORT) { + nesvnic->qp_nic_index[2] = 0xf; + nesvnic->qp_nic_index[3] = 0xf; + } else { + nesvnic->qp_nic_index[2] = nesvnic->nic_index + 2; + nesvnic->qp_nic_index[3] = nesvnic->nic_index + 3; + } + } else { + if (nesvnic->nesdev->nesadapter->port_count == 2 || + (nesvnic->nesdev->nesadapter->port_count == 1 && + nesvnic->nesdev->nesadapter->adapter_fcn_count == 2)) { + nesvnic->qp_nic_index[0] = nesvnic->nic_index; + nesvnic->qp_nic_index[1] = nesvnic->nic_index + + 2; + nesvnic->qp_nic_index[2] = 0xf; + nesvnic->qp_nic_index[3] = 0xf; + } else { + nesvnic->qp_nic_index[0] = nesvnic->nic_index; + nesvnic->qp_nic_index[1] = 0xf; + nesvnic->qp_nic_index[2] = 0xf; + nesvnic->qp_nic_index[3] = 0xf; + } + } + nesvnic->next_qp_nic_index = 0; + + if (nesdev->netdev_count == 0) { + nesvnic->rdma_enabled = 1; + } else { + nesvnic->rdma_enabled = 0; + } + nesvnic->nic_cq.cq_number = nesvnic->nic.qp_id; + init_timer(&nesvnic->event_timer); + nesvnic->event_timer.function = NULL; + spin_lock_init(&nesvnic->tx_lock); + spin_lock_init(&nesvnic->port_ibevent_lock); + nesdev->netdev[nesdev->netdev_count] = netdev; + + nes_debug(NES_DBG_INIT, "Adding nesvnic (%p) to the adapters nesvnic_list for MAC%d.\n", + nesvnic, nesdev->mac_index); + list_add_tail(&nesvnic->list, &nesdev->nesadapter->nesvnic_list[nesdev->mac_index]); + + if ((nesdev->netdev_count == 0) && + ((PCI_FUNC(nesdev->pcidev->devfn) == nesdev->mac_index) || + ((phy_type == NES_PHY_TYPE_PUMA_1G) && + (((PCI_FUNC(nesdev->pcidev->devfn) == 1) && (nesdev->mac_index == 2)) || + ((PCI_FUNC(nesdev->pcidev->devfn) == 2) && (nesdev->mac_index == 1)))))) { + u32 u32temp; + u32 link_mask = 0; + u32 link_val = 0; + u16 temp_phy_data; + u16 phy_data = 0; + unsigned long flags; + + u32temp = nes_read_indexed(nesdev, NES_IDX_PHY_PCS_CONTROL_STATUS0 + + (0x200 * (nesdev->mac_index & 1))); + if (phy_type != NES_PHY_TYPE_PUMA_1G) { + u32temp |= 0x00200000; + nes_write_indexed(nesdev, NES_IDX_PHY_PCS_CONTROL_STATUS0 + + (0x200 * (nesdev->mac_index & 1)), u32temp); + } + + /* Check and set linkup here. This is for back to back */ + /* configuration where second port won't get link interrupt */ + switch (phy_type) { + case NES_PHY_TYPE_PUMA_1G: + if (nesdev->mac_index < 2) { + link_mask = 0x01010000; + link_val = 0x01010000; + } else { + link_mask = 0x02020000; + link_val = 0x02020000; + } + break; + case NES_PHY_TYPE_SFP_D: + spin_lock_irqsave(&nesdev->nesadapter->phy_lock, flags); + nes_read_10G_phy_reg(nesdev, + nesdev->nesadapter->phy_index[nesdev->mac_index], + 1, 0x9003); + temp_phy_data = (u16)nes_read_indexed(nesdev, NES_IDX_MAC_MDIO_CONTROL); + nes_read_10G_phy_reg(nesdev, + nesdev->nesadapter->phy_index[nesdev->mac_index], + 3, 0x0021); + nes_read_indexed(nesdev, NES_IDX_MAC_MDIO_CONTROL); + nes_read_10G_phy_reg(nesdev, + nesdev->nesadapter->phy_index[nesdev->mac_index], + 3, 0x0021); + phy_data = (u16)nes_read_indexed(nesdev, NES_IDX_MAC_MDIO_CONTROL); + spin_unlock_irqrestore(&nesdev->nesadapter->phy_lock, flags); + phy_data = (!temp_phy_data && (phy_data == 0x8000)) ? 
0x4 : 0x0; + break; + default: + link_mask = 0x0f1f0000; + link_val = 0x0f0f0000; + break; + } + + u32temp = nes_read_indexed(nesdev, + NES_IDX_PHY_PCS_CONTROL_STATUS0 + + (0x200 * (nesdev->mac_index & 1))); + + if (phy_type == NES_PHY_TYPE_SFP_D) { + if (phy_data & 0x0004) + nesvnic->linkup = 1; + } else { + if ((u32temp & link_mask) == link_val) + nesvnic->linkup = 1; + } + + /* clear the MAC interrupt status, assumes direct logical to physical mapping */ + u32temp = nes_read_indexed(nesdev, NES_IDX_MAC_INT_STATUS + (0x200 * nesdev->mac_index)); + nes_debug(NES_DBG_INIT, "Phy interrupt status = 0x%X.\n", u32temp); + nes_write_indexed(nesdev, NES_IDX_MAC_INT_STATUS + (0x200 * nesdev->mac_index), u32temp); + + nes_init_phy(nesdev); + } + + nes_vlan_mode(netdev, nesdev, netdev->features); + + return netdev; +} + + +/** + * nes_netdev_destroy - destroy network device structure + */ +void nes_netdev_destroy(struct net_device *netdev) +{ + struct nes_vnic *nesvnic = netdev_priv(netdev); + + /* make sure 'stop' method is called by Linux stack */ + /* nes_netdev_stop(netdev); */ + + list_del(&nesvnic->list); + + if (nesvnic->of_device_registered) { + nes_destroy_ofa_device(nesvnic->nesibdev); + } + + free_netdev(netdev); +} + + +/** + * nes_nic_cm_xmit -- CM calls this to send out pkts + */ +int nes_nic_cm_xmit(struct sk_buff *skb, struct net_device *netdev) +{ + int ret; + + skb->dev = netdev; + ret = dev_queue_xmit(skb); + if (ret) { + nes_debug(NES_DBG_CM, "Bad return code from dev_queue_xmit %d\n", ret); + } + + return ret; +} diff --git a/kernel/drivers/infiniband/hw/nes/nes_user.h b/kernel/drivers/infiniband/hw/nes/nes_user.h new file mode 100644 index 000000000..529c421bb --- /dev/null +++ b/kernel/drivers/infiniband/hw/nes/nes_user.h @@ -0,0 +1,114 @@ +/* + * Copyright (c) 2006 - 2011 Intel Corporation. All rights reserved. + * Copyright (c) 2005 Topspin Communications. All rights reserved. + * Copyright (c) 2005 Cisco Systems. All rights reserved. + * Copyright (c) 2005 Open Grid Computing, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + */ + +#ifndef NES_USER_H +#define NES_USER_H + +#include + +#define NES_ABI_USERSPACE_VER 2 +#define NES_ABI_KERNEL_VER 2 + +/* + * Make sure that all structs defined in this file remain laid out so + * that they pack the same way on 32-bit and 64-bit architectures (to + * avoid incompatibility between 32-bit userspace and 64-bit kernels). + * In particular do not use pointer types -- pass pointers in __u64 + * instead. + */ + +struct nes_alloc_ucontext_req { + __u32 reserved32; + __u8 userspace_ver; + __u8 reserved8[3]; +}; + +struct nes_alloc_ucontext_resp { + __u32 max_pds; /* maximum pds allowed for this user process */ + __u32 max_qps; /* maximum qps allowed for this user process */ + __u32 wq_size; /* size of the WQs (sq+rq) allocated to the mmaped area */ + __u8 virtwq; /* flag to indicate if virtual WQ are to be used or not */ + __u8 kernel_ver; + __u8 reserved[2]; +}; + +struct nes_alloc_pd_resp { + __u32 pd_id; + __u32 mmap_db_index; +}; + +struct nes_create_cq_req { + __u64 user_cq_buffer; + __u32 mcrqf; + __u8 reserved[4]; +}; + +struct nes_create_qp_req { + __u64 user_wqe_buffers; + __u64 user_qp_buffer; +}; + +enum iwnes_memreg_type { + IWNES_MEMREG_TYPE_MEM = 0x0000, + IWNES_MEMREG_TYPE_QP = 0x0001, + IWNES_MEMREG_TYPE_CQ = 0x0002, + IWNES_MEMREG_TYPE_MW = 0x0003, + IWNES_MEMREG_TYPE_FMR = 0x0004, + IWNES_MEMREG_TYPE_FMEM = 0x0005, +}; + +struct nes_mem_reg_req { + __u32 reg_type; /* indicates if id is memory, QP or CQ */ + __u32 reserved; +}; + +struct nes_create_cq_resp { + __u32 cq_id; + __u32 cq_size; + __u32 mmap_db_index; + __u32 reserved; +}; + +struct nes_create_qp_resp { + __u32 qp_id; + __u32 actual_sq_size; + __u32 actual_rq_size; + __u32 mmap_sq_db_index; + __u32 mmap_rq_db_index; + __u32 nes_drv_opt; +}; + +#endif /* NES_USER_H */ diff --git a/kernel/drivers/infiniband/hw/nes/nes_utils.c b/kernel/drivers/infiniband/hw/nes/nes_utils.c new file mode 100644 index 000000000..2042c0f29 --- /dev/null +++ b/kernel/drivers/infiniband/hw/nes/nes_utils.c @@ -0,0 +1,972 @@ +/* + * Copyright (c) 2006 - 2011 Intel Corporation. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#include "nes.h" + +static u16 nes_read16_eeprom(void __iomem *addr, u16 offset); + +u32 mh_detected; +u32 mh_pauses_sent; + +static u32 nes_set_pau(struct nes_device *nesdev) +{ + u32 ret = 0; + u32 counter; + + nes_write_indexed(nesdev, NES_IDX_GPR2, NES_ENABLE_PAU); + nes_write_indexed(nesdev, NES_IDX_GPR_TRIGGER, 1); + + for (counter = 0; counter < NES_PAU_COUNTER; counter++) { + udelay(30); + if (!nes_read_indexed(nesdev, NES_IDX_GPR2)) { + printk(KERN_INFO PFX "PAU is supported.\n"); + break; + } + nes_write_indexed(nesdev, NES_IDX_GPR_TRIGGER, 1); + } + if (counter == NES_PAU_COUNTER) { + printk(KERN_INFO PFX "PAU is not supported.\n"); + return -EPERM; + } + return ret; +} + +/** + * nes_read_eeprom_values - + */ +int nes_read_eeprom_values(struct nes_device *nesdev, struct nes_adapter *nesadapter) +{ + u32 mac_addr_low; + u16 mac_addr_high; + u16 eeprom_data; + u16 eeprom_offset; + u16 next_section_address; + u16 sw_section_ver; + u8 major_ver = 0; + u8 minor_ver = 0; + + /* TODO: deal with EEPROM endian issues */ + if (nesadapter->firmware_eeprom_offset == 0) { + /* Read the EEPROM Parameters */ + eeprom_data = nes_read16_eeprom(nesdev->regs, 0); + nes_debug(NES_DBG_HW, "EEPROM Offset 0 = 0x%04X\n", eeprom_data); + eeprom_offset = 2 + (((eeprom_data & 0x007f) << 3) << + ((eeprom_data & 0x0080) >> 7)); + nes_debug(NES_DBG_HW, "Firmware Offset = 0x%04X\n", eeprom_offset); + nesadapter->firmware_eeprom_offset = eeprom_offset; + eeprom_data = nes_read16_eeprom(nesdev->regs, eeprom_offset + 4); + if (eeprom_data != 0x5746) { + nes_debug(NES_DBG_HW, "Not a valid Firmware Image = 0x%04X\n", eeprom_data); + return -1; + } + + eeprom_data = nes_read16_eeprom(nesdev->regs, eeprom_offset + 2); + nes_debug(NES_DBG_HW, "EEPROM Offset %u = 0x%04X\n", + eeprom_offset + 2, eeprom_data); + eeprom_offset += ((eeprom_data & 0x00ff) << 3) << ((eeprom_data & 0x0100) >> 8); + nes_debug(NES_DBG_HW, "Software Offset = 0x%04X\n", eeprom_offset); + nesadapter->software_eeprom_offset = eeprom_offset; + eeprom_data = nes_read16_eeprom(nesdev->regs, eeprom_offset + 4); + if (eeprom_data != 0x5753) { + printk("Not a valid Software Image = 0x%04X\n", eeprom_data); + return -1; + } + sw_section_ver = nes_read16_eeprom(nesdev->regs, nesadapter->software_eeprom_offset + 6); + nes_debug(NES_DBG_HW, "Software section version number = 0x%04X\n", + sw_section_ver); + + eeprom_data = nes_read16_eeprom(nesdev->regs, eeprom_offset + 2); + nes_debug(NES_DBG_HW, "EEPROM Offset %u (next section) = 0x%04X\n", + eeprom_offset + 2, eeprom_data); + next_section_address = eeprom_offset + (((eeprom_data & 0x00ff) << 3) << + ((eeprom_data & 0x0100) >> 8)); + eeprom_data = nes_read16_eeprom(nesdev->regs, next_section_address + 4); + if (eeprom_data != 0x414d) { + nes_debug(NES_DBG_HW, "EEPROM Changed offset should be 0x414d but was 0x%04X\n", + eeprom_data); + goto no_fw_rev; + } + eeprom_offset = next_section_address; + + eeprom_data = nes_read16_eeprom(nesdev->regs, eeprom_offset + 2); + nes_debug(NES_DBG_HW, "EEPROM Offset %u (next section) = 0x%04X\n", + eeprom_offset + 2, eeprom_data); + next_section_address = eeprom_offset + (((eeprom_data & 0x00ff) << 3) << + ((eeprom_data & 0x0100) >> 8)); + eeprom_data = nes_read16_eeprom(nesdev->regs, next_section_address + 4); + if (eeprom_data != 0x4f52) { + nes_debug(NES_DBG_HW, "EEPROM Changed offset should be 
0x4f52 but was 0x%04X\n", + eeprom_data); + goto no_fw_rev; + } + eeprom_offset = next_section_address; + + eeprom_data = nes_read16_eeprom(nesdev->regs, eeprom_offset + 2); + nes_debug(NES_DBG_HW, "EEPROM Offset %u (next section) = 0x%04X\n", + eeprom_offset + 2, eeprom_data); + next_section_address = eeprom_offset + ((eeprom_data & 0x00ff) << 3); + eeprom_data = nes_read16_eeprom(nesdev->regs, next_section_address + 4); + if (eeprom_data != 0x5746) { + nes_debug(NES_DBG_HW, "EEPROM Changed offset should be 0x5746 but was 0x%04X\n", + eeprom_data); + goto no_fw_rev; + } + eeprom_offset = next_section_address; + + eeprom_data = nes_read16_eeprom(nesdev->regs, eeprom_offset + 2); + nes_debug(NES_DBG_HW, "EEPROM Offset %u (next section) = 0x%04X\n", + eeprom_offset + 2, eeprom_data); + next_section_address = eeprom_offset + ((eeprom_data & 0x00ff) << 3); + eeprom_data = nes_read16_eeprom(nesdev->regs, next_section_address + 4); + if (eeprom_data != 0x5753) { + nes_debug(NES_DBG_HW, "EEPROM Changed offset should be 0x5753 but was 0x%04X\n", + eeprom_data); + goto no_fw_rev; + } + eeprom_offset = next_section_address; + + eeprom_data = nes_read16_eeprom(nesdev->regs, eeprom_offset + 2); + nes_debug(NES_DBG_HW, "EEPROM Offset %u (next section) = 0x%04X\n", + eeprom_offset + 2, eeprom_data); + next_section_address = eeprom_offset + ((eeprom_data & 0x00ff) << 3); + eeprom_data = nes_read16_eeprom(nesdev->regs, next_section_address + 4); + if (eeprom_data != 0x414d) { + nes_debug(NES_DBG_HW, "EEPROM Changed offset should be 0x414d but was 0x%04X\n", + eeprom_data); + goto no_fw_rev; + } + eeprom_offset = next_section_address; + + eeprom_data = nes_read16_eeprom(nesdev->regs, eeprom_offset + 2); + nes_debug(NES_DBG_HW, "EEPROM Offset %u (next section) = 0x%04X\n", + eeprom_offset + 2, eeprom_data); + next_section_address = eeprom_offset + ((eeprom_data & 0x00ff) << 3); + eeprom_data = nes_read16_eeprom(nesdev->regs, next_section_address + 4); + if (eeprom_data != 0x464e) { + nes_debug(NES_DBG_HW, "EEPROM Changed offset should be 0x464e but was 0x%04X\n", + eeprom_data); + goto no_fw_rev; + } + eeprom_data = nes_read16_eeprom(nesdev->regs, next_section_address + 8); + printk(PFX "Firmware version %u.%u\n", (u8)(eeprom_data>>8), (u8)eeprom_data); + major_ver = (u8)(eeprom_data >> 8); + minor_ver = (u8)(eeprom_data); + + if (nes_drv_opt & NES_DRV_OPT_DISABLE_VIRT_WQ) { + nes_debug(NES_DBG_HW, "Virtual WQs have been disabled\n"); + } else if (((major_ver == 2) && (minor_ver > 21)) || ((major_ver > 2) && (major_ver != 255))) { + nesadapter->virtwq = 1; + } + if (((major_ver == 3) && (minor_ver >= 16)) || (major_ver > 3)) + nesadapter->send_term_ok = 1; + + if (nes_drv_opt & NES_DRV_OPT_ENABLE_PAU) { + if (!nes_set_pau(nesdev)) + nesadapter->allow_unaligned_fpdus = 1; + } + + nesadapter->firmware_version = (((u32)(u8)(eeprom_data>>8)) << 16) + + (u32)((u8)eeprom_data); + + eeprom_data = nes_read16_eeprom(nesdev->regs, next_section_address + 10); + printk(PFX "EEPROM version %u.%u\n", (u8)(eeprom_data>>8), (u8)eeprom_data); + nesadapter->eeprom_version = (((u32)(u8)(eeprom_data>>8)) << 16) + + (u32)((u8)eeprom_data); + +no_fw_rev: + /* eeprom is valid */ + eeprom_offset = nesadapter->software_eeprom_offset; + eeprom_offset += 8; + nesadapter->netdev_max = (u8)nes_read16_eeprom(nesdev->regs, eeprom_offset); + eeprom_offset += 2; + mac_addr_high = nes_read16_eeprom(nesdev->regs, eeprom_offset); + eeprom_offset += 2; + mac_addr_low = (u32)nes_read16_eeprom(nesdev->regs, eeprom_offset); + eeprom_offset += 2; 
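An aside on the EEPROM walk above: each section is located by re-reading a header word at offset + 2, checking a two-byte signature at offset + 4 (0x5746, 0x5753, 0x414d, 0x4f52, 0x464e), and advancing by the header's low byte shifted left by 3, optionally doubled by a flag bit in the early sections. The standalone sketch below (not part of the patch) models only that address arithmetic; the 8-byte-unit reading of the length field and the sample values are assumptions for illustration.

#include <stdio.h>

static unsigned next_section(unsigned cur_offset, unsigned header_word, int scaled)
{
        unsigned step = (header_word & 0x00ff) << 3;      /* low byte, in 8-byte units */

        if (scaled)
                step <<= (header_word & 0x0100) >> 8;     /* optional doubling flag */
        return cur_offset + step;
}

int main(void)
{
        unsigned off = 0x0040;   /* hypothetical current section offset */
        unsigned hdr = 0x0110;   /* hypothetical header word: flag bit set, length 0x10 */

        printf("next (scaled form)   = 0x%04X\n", next_section(off, hdr, 1));
        printf("next (unscaled form) = 0x%04X\n", next_section(off, hdr, 0));
        return 0;
}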
+ mac_addr_low <<= 16; + mac_addr_low += (u32)nes_read16_eeprom(nesdev->regs, eeprom_offset); + nes_debug(NES_DBG_HW, "Base MAC Address = 0x%04X%08X\n", + mac_addr_high, mac_addr_low); + nes_debug(NES_DBG_HW, "MAC Address count = %u\n", nesadapter->netdev_max); + + nesadapter->mac_addr_low = mac_addr_low; + nesadapter->mac_addr_high = mac_addr_high; + + /* Read the Phy Type array */ + eeprom_offset += 10; + eeprom_data = nes_read16_eeprom(nesdev->regs, eeprom_offset); + nesadapter->phy_type[0] = (u8)(eeprom_data >> 8); + nesadapter->phy_type[1] = (u8)eeprom_data; + + /* Read the port array */ + eeprom_offset += 2; + eeprom_data = nes_read16_eeprom(nesdev->regs, eeprom_offset); + nesadapter->phy_type[2] = (u8)(eeprom_data >> 8); + nesadapter->phy_type[3] = (u8)eeprom_data; + /* port_count is set by soft reset reg */ + nes_debug(NES_DBG_HW, "port_count = %u, port 0 -> %u, port 1 -> %u," + " port 2 -> %u, port 3 -> %u\n", + nesadapter->port_count, + nesadapter->phy_type[0], nesadapter->phy_type[1], + nesadapter->phy_type[2], nesadapter->phy_type[3]); + + /* Read PD config array */ + eeprom_offset += 10; + eeprom_data = nes_read16_eeprom(nesdev->regs, eeprom_offset); + nesadapter->pd_config_size[0] = eeprom_data; + eeprom_offset += 2; + eeprom_data = nes_read16_eeprom(nesdev->regs, eeprom_offset); + nesadapter->pd_config_base[0] = eeprom_data; + nes_debug(NES_DBG_HW, "PD0 config, size=0x%04x, base=0x%04x\n", + nesadapter->pd_config_size[0], nesadapter->pd_config_base[0]); + + eeprom_offset += 2; + eeprom_data = nes_read16_eeprom(nesdev->regs, eeprom_offset); + nesadapter->pd_config_size[1] = eeprom_data; + eeprom_offset += 2; + eeprom_data = nes_read16_eeprom(nesdev->regs, eeprom_offset); + nesadapter->pd_config_base[1] = eeprom_data; + nes_debug(NES_DBG_HW, "PD1 config, size=0x%04x, base=0x%04x\n", + nesadapter->pd_config_size[1], nesadapter->pd_config_base[1]); + + eeprom_offset += 2; + eeprom_data = nes_read16_eeprom(nesdev->regs, eeprom_offset); + nesadapter->pd_config_size[2] = eeprom_data; + eeprom_offset += 2; + eeprom_data = nes_read16_eeprom(nesdev->regs, eeprom_offset); + nesadapter->pd_config_base[2] = eeprom_data; + nes_debug(NES_DBG_HW, "PD2 config, size=0x%04x, base=0x%04x\n", + nesadapter->pd_config_size[2], nesadapter->pd_config_base[2]); + + eeprom_offset += 2; + eeprom_data = nes_read16_eeprom(nesdev->regs, eeprom_offset); + nesadapter->pd_config_size[3] = eeprom_data; + eeprom_offset += 2; + eeprom_data = nes_read16_eeprom(nesdev->regs, eeprom_offset); + nesadapter->pd_config_base[3] = eeprom_data; + nes_debug(NES_DBG_HW, "PD3 config, size=0x%04x, base=0x%04x\n", + nesadapter->pd_config_size[3], nesadapter->pd_config_base[3]); + + /* Read Rx Pool Size */ + eeprom_offset += 22; /* 46 */ + eeprom_data = nes_read16_eeprom(nesdev->regs, eeprom_offset); + eeprom_offset += 2; + nesadapter->rx_pool_size = (((u32)eeprom_data) << 16) + + nes_read16_eeprom(nesdev->regs, eeprom_offset); + nes_debug(NES_DBG_HW, "rx_pool_size = 0x%08X\n", nesadapter->rx_pool_size); + + eeprom_offset += 2; + eeprom_data = nes_read16_eeprom(nesdev->regs, eeprom_offset); + eeprom_offset += 2; + nesadapter->tx_pool_size = (((u32)eeprom_data) << 16) + + nes_read16_eeprom(nesdev->regs, eeprom_offset); + nes_debug(NES_DBG_HW, "tx_pool_size = 0x%08X\n", nesadapter->tx_pool_size); + + eeprom_offset += 2; + eeprom_data = nes_read16_eeprom(nesdev->regs, eeprom_offset); + eeprom_offset += 2; + nesadapter->rx_threshold = (((u32)eeprom_data) << 16) + + nes_read16_eeprom(nesdev->regs, eeprom_offset); + 
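An aside on the base MAC address read above: the 16-bit high word and 32-bit low word from the EEPROM form the 48-bit base that nes_netdev_init() earlier combined with the per-port nic_index to derive each netdev's burned-in address. A minimal standalone sketch of that derivation, using a fabricated base address, not part of the patch:

#include <stdio.h>

static void print_mac(unsigned high, unsigned low, unsigned nic_index)
{
        unsigned long long addr = (unsigned long long)low +
                                  ((unsigned long long)high << 32) + nic_index;

        printf("port %u: %02llX:%02llX:%02llX:%02llX:%02llX:%02llX\n", nic_index,
               (addr >> 40) & 0xff, (addr >> 32) & 0xff, (addr >> 24) & 0xff,
               (addr >> 16) & 0xff, (addr >> 8) & 0xff, addr & 0xff);
}

int main(void)
{
        unsigned base_high = 0x0012;     /* hypothetical EEPROM mac_addr_high */
        unsigned base_low = 0x34560000;  /* hypothetical EEPROM mac_addr_low  */
        unsigned port;

        for (port = 0; port < 4; port++)
                print_mac(base_high, base_low, port);
        return 0;
}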
nes_debug(NES_DBG_HW, "rx_threshold = 0x%08X\n", nesadapter->rx_threshold); + + eeprom_offset += 2; + eeprom_data = nes_read16_eeprom(nesdev->regs, eeprom_offset); + eeprom_offset += 2; + nesadapter->tcp_timer_core_clk_divisor = (((u32)eeprom_data) << 16) + + nes_read16_eeprom(nesdev->regs, eeprom_offset); + nes_debug(NES_DBG_HW, "tcp_timer_core_clk_divisor = 0x%08X\n", + nesadapter->tcp_timer_core_clk_divisor); + + eeprom_offset += 2; + eeprom_data = nes_read16_eeprom(nesdev->regs, eeprom_offset); + eeprom_offset += 2; + nesadapter->iwarp_config = (((u32)eeprom_data) << 16) + + nes_read16_eeprom(nesdev->regs, eeprom_offset); + nes_debug(NES_DBG_HW, "iwarp_config = 0x%08X\n", nesadapter->iwarp_config); + + eeprom_offset += 2; + eeprom_data = nes_read16_eeprom(nesdev->regs, eeprom_offset); + eeprom_offset += 2; + nesadapter->cm_config = (((u32)eeprom_data) << 16) + + nes_read16_eeprom(nesdev->regs, eeprom_offset); + nes_debug(NES_DBG_HW, "cm_config = 0x%08X\n", nesadapter->cm_config); + + eeprom_offset += 2; + eeprom_data = nes_read16_eeprom(nesdev->regs, eeprom_offset); + eeprom_offset += 2; + nesadapter->sws_timer_config = (((u32)eeprom_data) << 16) + + nes_read16_eeprom(nesdev->regs, eeprom_offset); + nes_debug(NES_DBG_HW, "sws_timer_config = 0x%08X\n", nesadapter->sws_timer_config); + + eeprom_offset += 2; + eeprom_data = nes_read16_eeprom(nesdev->regs, eeprom_offset); + eeprom_offset += 2; + nesadapter->tcp_config1 = (((u32)eeprom_data) << 16) + + nes_read16_eeprom(nesdev->regs, eeprom_offset); + nes_debug(NES_DBG_HW, "tcp_config1 = 0x%08X\n", nesadapter->tcp_config1); + + eeprom_offset += 2; + eeprom_data = nes_read16_eeprom(nesdev->regs, eeprom_offset); + eeprom_offset += 2; + nesadapter->wqm_wat = (((u32)eeprom_data) << 16) + + nes_read16_eeprom(nesdev->regs, eeprom_offset); + nes_debug(NES_DBG_HW, "wqm_wat = 0x%08X\n", nesadapter->wqm_wat); + + eeprom_offset += 2; + eeprom_data = nes_read16_eeprom(nesdev->regs, eeprom_offset); + eeprom_offset += 2; + nesadapter->core_clock = (((u32)eeprom_data) << 16) + + nes_read16_eeprom(nesdev->regs, eeprom_offset); + nes_debug(NES_DBG_HW, "core_clock = 0x%08X\n", nesadapter->core_clock); + + if ((sw_section_ver) && (nesadapter->hw_rev != NE020_REV)) { + eeprom_offset += 2; + eeprom_data = nes_read16_eeprom(nesdev->regs, eeprom_offset); + nesadapter->phy_index[0] = (eeprom_data & 0xff00)>>8; + nesadapter->phy_index[1] = eeprom_data & 0x00ff; + eeprom_offset += 2; + eeprom_data = nes_read16_eeprom(nesdev->regs, eeprom_offset); + nesadapter->phy_index[2] = (eeprom_data & 0xff00)>>8; + nesadapter->phy_index[3] = eeprom_data & 0x00ff; + } else { + nesadapter->phy_index[0] = 4; + nesadapter->phy_index[1] = 5; + nesadapter->phy_index[2] = 6; + nesadapter->phy_index[3] = 7; + } + nes_debug(NES_DBG_HW, "Phy address map = 0 > %u, 1 > %u, 2 > %u, 3 > %u\n", + nesadapter->phy_index[0],nesadapter->phy_index[1], + nesadapter->phy_index[2],nesadapter->phy_index[3]); + } + + return 0; +} + + +/** + * nes_read16_eeprom + */ +static u16 nes_read16_eeprom(void __iomem *addr, u16 offset) +{ + writel(NES_EEPROM_READ_REQUEST + (offset >> 1), + (void __iomem *)addr + NES_EEPROM_COMMAND); + + do { + } while (readl((void __iomem *)addr + NES_EEPROM_COMMAND) & + NES_EEPROM_READ_REQUEST); + + return readw((void __iomem *)addr + NES_EEPROM_DATA); +} + + +/** + * nes_write_1G_phy_reg + */ +void nes_write_1G_phy_reg(struct nes_device *nesdev, u8 phy_reg, u8 phy_addr, u16 data) +{ + u32 u32temp; + u32 counter; + + nes_write_indexed(nesdev, NES_IDX_MAC_MDIO_CONTROL, + 
0x50020000 | data | ((u32)phy_reg << 18) | ((u32)phy_addr << 23)); + for (counter = 0; counter < 100 ; counter++) { + udelay(30); + u32temp = nes_read_indexed(nesdev, NES_IDX_MAC_INT_STATUS); + if (u32temp & 1) { + /* nes_debug(NES_DBG_PHY, "Phy interrupt status = 0x%X.\n", u32temp); */ + nes_write_indexed(nesdev, NES_IDX_MAC_INT_STATUS, 1); + break; + } + } + if (!(u32temp & 1)) + nes_debug(NES_DBG_PHY, "Phy is not responding. interrupt status = 0x%X.\n", + u32temp); +} + + +/** + * nes_read_1G_phy_reg + * This routine only issues the read, the data must be read + * separately. + */ +void nes_read_1G_phy_reg(struct nes_device *nesdev, u8 phy_reg, u8 phy_addr, u16 *data) +{ + u32 u32temp; + u32 counter; + + /* nes_debug(NES_DBG_PHY, "phy addr = %d, mac_index = %d\n", + phy_addr, nesdev->mac_index); */ + + nes_write_indexed(nesdev, NES_IDX_MAC_MDIO_CONTROL, + 0x60020000 | ((u32)phy_reg << 18) | ((u32)phy_addr << 23)); + for (counter = 0; counter < 100 ; counter++) { + udelay(30); + u32temp = nes_read_indexed(nesdev, NES_IDX_MAC_INT_STATUS); + if (u32temp & 1) { + /* nes_debug(NES_DBG_PHY, "Phy interrupt status = 0x%X.\n", u32temp); */ + nes_write_indexed(nesdev, NES_IDX_MAC_INT_STATUS, 1); + break; + } + } + if (!(u32temp & 1)) { + nes_debug(NES_DBG_PHY, "Phy is not responding. interrupt status = 0x%X.\n", + u32temp); + *data = 0xffff; + } else { + *data = (u16)nes_read_indexed(nesdev, NES_IDX_MAC_MDIO_CONTROL); + } +} + + +/** + * nes_write_10G_phy_reg + */ +void nes_write_10G_phy_reg(struct nes_device *nesdev, u16 phy_addr, u8 dev_addr, u16 phy_reg, + u16 data) +{ + u32 port_addr; + u32 u32temp; + u32 counter; + + port_addr = phy_addr; + + /* set address */ + nes_write_indexed(nesdev, NES_IDX_MAC_MDIO_CONTROL, + 0x00020000 | (u32)phy_reg | (((u32)dev_addr) << 18) | (((u32)port_addr) << 23)); + for (counter = 0; counter < 100 ; counter++) { + udelay(30); + u32temp = nes_read_indexed(nesdev, NES_IDX_MAC_INT_STATUS); + if (u32temp & 1) { + nes_write_indexed(nesdev, NES_IDX_MAC_INT_STATUS, 1); + break; + } + } + if (!(u32temp & 1)) + nes_debug(NES_DBG_PHY, "Phy is not responding. interrupt status = 0x%X.\n", + u32temp); + + /* set data */ + nes_write_indexed(nesdev, NES_IDX_MAC_MDIO_CONTROL, + 0x10020000 | (u32)data | (((u32)dev_addr) << 18) | (((u32)port_addr) << 23)); + for (counter = 0; counter < 100 ; counter++) { + udelay(30); + u32temp = nes_read_indexed(nesdev, NES_IDX_MAC_INT_STATUS); + if (u32temp & 1) { + nes_write_indexed(nesdev, NES_IDX_MAC_INT_STATUS, 1); + break; + } + } + if (!(u32temp & 1)) + nes_debug(NES_DBG_PHY, "Phy is not responding. interrupt status = 0x%X.\n", + u32temp); +} + + +/** + * nes_read_10G_phy_reg + * This routine only issues the read, the data must be read + * separately. + */ +void nes_read_10G_phy_reg(struct nes_device *nesdev, u8 phy_addr, u8 dev_addr, u16 phy_reg) +{ + u32 port_addr; + u32 u32temp; + u32 counter; + + port_addr = phy_addr; + + /* set address */ + nes_write_indexed(nesdev, NES_IDX_MAC_MDIO_CONTROL, + 0x00020000 | (u32)phy_reg | (((u32)dev_addr) << 18) | (((u32)port_addr) << 23)); + for (counter = 0; counter < 100 ; counter++) { + udelay(30); + u32temp = nes_read_indexed(nesdev, NES_IDX_MAC_INT_STATUS); + if (u32temp & 1) { + nes_write_indexed(nesdev, NES_IDX_MAC_INT_STATUS, 1); + break; + } + } + if (!(u32temp & 1)) + nes_debug(NES_DBG_PHY, "Phy is not responding. 
interrupt status = 0x%X.\n", + u32temp); + + /* issue read */ + nes_write_indexed(nesdev, NES_IDX_MAC_MDIO_CONTROL, + 0x30020000 | (((u32)dev_addr) << 18) | (((u32)port_addr) << 23)); + for (counter = 0; counter < 100 ; counter++) { + udelay(30); + u32temp = nes_read_indexed(nesdev, NES_IDX_MAC_INT_STATUS); + if (u32temp & 1) { + nes_write_indexed(nesdev, NES_IDX_MAC_INT_STATUS, 1); + break; + } + } + if (!(u32temp & 1)) + nes_debug(NES_DBG_PHY, "Phy is not responding. interrupt status = 0x%X.\n", + u32temp); +} + + +/** + * nes_get_cqp_request + */ +struct nes_cqp_request *nes_get_cqp_request(struct nes_device *nesdev) +{ + unsigned long flags; + struct nes_cqp_request *cqp_request = NULL; + + if (!list_empty(&nesdev->cqp_avail_reqs)) { + spin_lock_irqsave(&nesdev->cqp.lock, flags); + if (!list_empty(&nesdev->cqp_avail_reqs)) { + cqp_request = list_entry(nesdev->cqp_avail_reqs.next, + struct nes_cqp_request, list); + list_del_init(&cqp_request->list); + } + spin_unlock_irqrestore(&nesdev->cqp.lock, flags); + } + if (cqp_request == NULL) { + cqp_request = kzalloc(sizeof(struct nes_cqp_request), GFP_ATOMIC); + if (cqp_request) { + cqp_request->dynamic = 1; + INIT_LIST_HEAD(&cqp_request->list); + } + } + + if (cqp_request) { + init_waitqueue_head(&cqp_request->waitq); + cqp_request->waiting = 0; + cqp_request->request_done = 0; + cqp_request->callback = 0; + init_waitqueue_head(&cqp_request->waitq); + nes_debug(NES_DBG_CQP, "Got cqp request %p from the available list \n", + cqp_request); + } else + printk(KERN_ERR PFX "%s: Could not allocated a CQP request.\n", + __func__); + + return cqp_request; +} + +void nes_free_cqp_request(struct nes_device *nesdev, + struct nes_cqp_request *cqp_request) +{ + unsigned long flags; + + nes_debug(NES_DBG_CQP, "CQP request %p (opcode 0x%02X) freed.\n", + cqp_request, + le32_to_cpu(cqp_request->cqp_wqe.wqe_words[NES_CQP_WQE_OPCODE_IDX]) & 0x3f); + + if (cqp_request->dynamic) { + kfree(cqp_request); + } else { + spin_lock_irqsave(&nesdev->cqp.lock, flags); + list_add_tail(&cqp_request->list, &nesdev->cqp_avail_reqs); + spin_unlock_irqrestore(&nesdev->cqp.lock, flags); + } +} + +void nes_put_cqp_request(struct nes_device *nesdev, + struct nes_cqp_request *cqp_request) +{ + if (atomic_dec_and_test(&cqp_request->refcount)) + nes_free_cqp_request(nesdev, cqp_request); +} + + +/** + * nes_post_cqp_request + */ +void nes_post_cqp_request(struct nes_device *nesdev, + struct nes_cqp_request *cqp_request) +{ + struct nes_hw_cqp_wqe *cqp_wqe; + unsigned long flags; + u32 cqp_head; + u64 u64temp; + u32 opcode; + int ctx_index = NES_CQP_WQE_COMP_CTX_LOW_IDX; + + spin_lock_irqsave(&nesdev->cqp.lock, flags); + + if (((((nesdev->cqp.sq_tail+(nesdev->cqp.sq_size*2))-nesdev->cqp.sq_head) & + (nesdev->cqp.sq_size - 1)) != 1) + && (list_empty(&nesdev->cqp_pending_reqs))) { + cqp_head = nesdev->cqp.sq_head++; + nesdev->cqp.sq_head &= nesdev->cqp.sq_size-1; + cqp_wqe = &nesdev->cqp.sq_vbase[cqp_head]; + memcpy(cqp_wqe, &cqp_request->cqp_wqe, sizeof(*cqp_wqe)); + opcode = le32_to_cpu(cqp_wqe->wqe_words[NES_CQP_WQE_OPCODE_IDX]); + if ((opcode & NES_CQP_OPCODE_MASK) == NES_CQP_DOWNLOAD_SEGMENT) + ctx_index = NES_CQP_WQE_DL_COMP_CTX_LOW_IDX; + barrier(); + u64temp = (unsigned long)cqp_request; + set_wqe_64bit_value(cqp_wqe->wqe_words, ctx_index, u64temp); + nes_debug(NES_DBG_CQP, "CQP request (opcode 0x%02X), line 1 = 0x%08X put on CQPs SQ," + " request = %p, cqp_head = %u, cqp_tail = %u, cqp_size = %u," + " waiting = %d, refcount = %d.\n", + opcode & NES_CQP_OPCODE_MASK, + 
le32_to_cpu(cqp_wqe->wqe_words[NES_CQP_WQE_ID_IDX]), cqp_request, + nesdev->cqp.sq_head, nesdev->cqp.sq_tail, nesdev->cqp.sq_size, + cqp_request->waiting, atomic_read(&cqp_request->refcount)); + + barrier(); + + /* Ring doorbell (1 WQEs) */ + nes_write32(nesdev->regs+NES_WQE_ALLOC, 0x01800000 | nesdev->cqp.qp_id); + + barrier(); + } else { + nes_debug(NES_DBG_CQP, "CQP request %p (opcode 0x%02X), line 1 = 0x%08X" + " put on the pending queue.\n", + cqp_request, + le32_to_cpu(cqp_request->cqp_wqe.wqe_words[NES_CQP_WQE_OPCODE_IDX])&0x3f, + le32_to_cpu(cqp_request->cqp_wqe.wqe_words[NES_CQP_WQE_ID_IDX])); + list_add_tail(&cqp_request->list, &nesdev->cqp_pending_reqs); + } + + spin_unlock_irqrestore(&nesdev->cqp.lock, flags); + + return; +} + +/** + * nes_arp_table + */ +int nes_arp_table(struct nes_device *nesdev, u32 ip_addr, u8 *mac_addr, u32 action) +{ + struct nes_adapter *nesadapter = nesdev->nesadapter; + int arp_index; + int err = 0; + __be32 tmp_addr; + + for (arp_index = 0; (u32) arp_index < nesadapter->arp_table_size; arp_index++) { + if (nesadapter->arp_table[arp_index].ip_addr == ip_addr) + break; + } + + if (action == NES_ARP_ADD) { + if (arp_index != nesadapter->arp_table_size) { + return -1; + } + + arp_index = 0; + err = nes_alloc_resource(nesadapter, nesadapter->allocated_arps, + nesadapter->arp_table_size, (u32 *)&arp_index, &nesadapter->next_arp_index, NES_RESOURCE_ARP); + if (err) { + nes_debug(NES_DBG_NETDEV, "nes_alloc_resource returned error = %u\n", err); + return err; + } + nes_debug(NES_DBG_NETDEV, "ADD, arp_index=%d\n", arp_index); + + nesadapter->arp_table[arp_index].ip_addr = ip_addr; + memcpy(nesadapter->arp_table[arp_index].mac_addr, mac_addr, ETH_ALEN); + return arp_index; + } + + /* DELETE or RESOLVE */ + if (arp_index == nesadapter->arp_table_size) { + tmp_addr = cpu_to_be32(ip_addr); + nes_debug(NES_DBG_NETDEV, "MAC for %pI4 not in ARP table - cannot %s\n", + &tmp_addr, action == NES_ARP_RESOLVE ? 
"resolve" : "delete"); + return -1; + } + + if (action == NES_ARP_RESOLVE) { + nes_debug(NES_DBG_NETDEV, "RESOLVE, arp_index=%d\n", arp_index); + return arp_index; + } + + if (action == NES_ARP_DELETE) { + nes_debug(NES_DBG_NETDEV, "DELETE, arp_index=%d\n", arp_index); + nesadapter->arp_table[arp_index].ip_addr = 0; + memset(nesadapter->arp_table[arp_index].mac_addr, 0x00, ETH_ALEN); + nes_free_resource(nesadapter, nesadapter->allocated_arps, arp_index); + return arp_index; + } + + return -1; +} + + +/** + * nes_mh_fix + */ +void nes_mh_fix(unsigned long parm) +{ + unsigned long flags; + struct nes_device *nesdev = (struct nes_device *)parm; + struct nes_adapter *nesadapter = nesdev->nesadapter; + struct nes_vnic *nesvnic; + u32 used_chunks_tx; + u32 temp_used_chunks_tx; + u32 temp_last_used_chunks_tx; + u32 used_chunks_mask; + u32 mac_tx_frames_low; + u32 mac_tx_frames_high; + u32 mac_tx_pauses; + u32 serdes_status; + u32 reset_value; + u32 tx_control; + u32 tx_config; + u32 tx_pause_quanta; + u32 rx_control; + u32 rx_config; + u32 mac_exact_match; + u32 mpp_debug; + u32 i=0; + u32 chunks_tx_progress = 0; + + spin_lock_irqsave(&nesadapter->phy_lock, flags); + if ((nesadapter->mac_sw_state[0] != NES_MAC_SW_IDLE) || (nesadapter->mac_link_down[0])) { + spin_unlock_irqrestore(&nesadapter->phy_lock, flags); + goto no_mh_work; + } + nesadapter->mac_sw_state[0] = NES_MAC_SW_MH; + spin_unlock_irqrestore(&nesadapter->phy_lock, flags); + do { + mac_tx_frames_low = nes_read_indexed(nesdev, NES_IDX_MAC_TX_FRAMES_LOW); + mac_tx_frames_high = nes_read_indexed(nesdev, NES_IDX_MAC_TX_FRAMES_HIGH); + mac_tx_pauses = nes_read_indexed(nesdev, NES_IDX_MAC_TX_PAUSE_FRAMES); + used_chunks_tx = nes_read_indexed(nesdev, NES_IDX_USED_CHUNKS_TX); + nesdev->mac_pause_frames_sent += mac_tx_pauses; + used_chunks_mask = 0; + temp_used_chunks_tx = used_chunks_tx; + temp_last_used_chunks_tx = nesdev->last_used_chunks_tx; + + if (nesdev->netdev[0]) { + nesvnic = netdev_priv(nesdev->netdev[0]); + } else { + break; + } + + for (i=0; i<4; i++) { + used_chunks_mask <<= 8; + if (nesvnic->qp_nic_index[i] != 0xff) { + used_chunks_mask |= 0xff; + if ((temp_used_chunks_tx&0xff)<(temp_last_used_chunks_tx&0xff)) { + chunks_tx_progress = 1; + } + } + temp_used_chunks_tx >>= 8; + temp_last_used_chunks_tx >>= 8; + } + if ((mac_tx_frames_low) || (mac_tx_frames_high) || + (!(used_chunks_tx&used_chunks_mask)) || + (!(nesdev->last_used_chunks_tx&used_chunks_mask)) || + (chunks_tx_progress) ) { + nesdev->last_used_chunks_tx = used_chunks_tx; + break; + } + nesdev->last_used_chunks_tx = used_chunks_tx; + barrier(); + + nes_write_indexed(nesdev, NES_IDX_MAC_TX_CONTROL, 0x00000005); + mh_pauses_sent++; + mac_tx_pauses = nes_read_indexed(nesdev, NES_IDX_MAC_TX_PAUSE_FRAMES); + if (mac_tx_pauses) { + nesdev->mac_pause_frames_sent += mac_tx_pauses; + break; + } + + tx_control = nes_read_indexed(nesdev, NES_IDX_MAC_TX_CONTROL); + tx_config = nes_read_indexed(nesdev, NES_IDX_MAC_TX_CONFIG); + tx_pause_quanta = nes_read_indexed(nesdev, NES_IDX_MAC_TX_PAUSE_QUANTA); + rx_control = nes_read_indexed(nesdev, NES_IDX_MAC_RX_CONTROL); + rx_config = nes_read_indexed(nesdev, NES_IDX_MAC_RX_CONFIG); + mac_exact_match = nes_read_indexed(nesdev, NES_IDX_MAC_EXACT_MATCH_BOTTOM); + mpp_debug = nes_read_indexed(nesdev, NES_IDX_MPP_DEBUG); + + /* one last ditch effort to avoid a false positive */ + mac_tx_pauses = nes_read_indexed(nesdev, NES_IDX_MAC_TX_PAUSE_FRAMES); + if (mac_tx_pauses) { + nesdev->last_mac_tx_pauses = nesdev->mac_pause_frames_sent; + 
nes_debug(NES_DBG_HW, "failsafe caught slow outbound pause\n"); + break; + } + mh_detected++; + + nes_write_indexed(nesdev, NES_IDX_MAC_TX_CONTROL, 0x00000000); + nes_write_indexed(nesdev, NES_IDX_MAC_TX_CONFIG, 0x00000000); + reset_value = nes_read32(nesdev->regs+NES_SOFTWARE_RESET); + + nes_write32(nesdev->regs+NES_SOFTWARE_RESET, reset_value | 0x0000001d); + + while (((nes_read32(nesdev->regs+NES_SOFTWARE_RESET) + & 0x00000040) != 0x00000040) && (i++ < 5000)) { + /* mdelay(1); */ + } + + nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_COMMON_CONTROL0, 0x00000008); + serdes_status = nes_read_indexed(nesdev, NES_IDX_ETH_SERDES_COMMON_STATUS0); + + nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_TX_EMP0, 0x000bdef7); + nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_TX_DRIVE0, 0x9ce73000); + nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_RX_MODE0, 0x0ff00000); + nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_RX_SIGDET0, 0x00000000); + nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_BYPASS0, 0x00000000); + nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_LOOPBACK_CONTROL0, 0x00000000); + if (nesadapter->OneG_Mode) { + nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_RX_EQ_CONTROL0, 0xf0182222); + } else { + nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_RX_EQ_CONTROL0, 0xf0042222); + } + serdes_status = nes_read_indexed(nesdev, NES_IDX_ETH_SERDES_RX_EQ_STATUS0); + nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_CDR_CONTROL0, 0x000000ff); + + nes_write_indexed(nesdev, NES_IDX_MAC_TX_CONTROL, tx_control); + nes_write_indexed(nesdev, NES_IDX_MAC_TX_CONFIG, tx_config); + nes_write_indexed(nesdev, NES_IDX_MAC_TX_PAUSE_QUANTA, tx_pause_quanta); + nes_write_indexed(nesdev, NES_IDX_MAC_RX_CONTROL, rx_control); + nes_write_indexed(nesdev, NES_IDX_MAC_RX_CONFIG, rx_config); + nes_write_indexed(nesdev, NES_IDX_MAC_EXACT_MATCH_BOTTOM, mac_exact_match); + nes_write_indexed(nesdev, NES_IDX_MPP_DEBUG, mpp_debug); + + } while (0); + + nesadapter->mac_sw_state[0] = NES_MAC_SW_IDLE; +no_mh_work: + nesdev->nesadapter->mh_timer.expires = jiffies + (HZ/5); + add_timer(&nesdev->nesadapter->mh_timer); +} + +/** + * nes_clc + */ +void nes_clc(unsigned long parm) +{ + unsigned long flags; + struct nes_device *nesdev = (struct nes_device *)parm; + struct nes_adapter *nesadapter = nesdev->nesadapter; + + spin_lock_irqsave(&nesadapter->phy_lock, flags); + nesadapter->link_interrupt_count[0] = 0; + nesadapter->link_interrupt_count[1] = 0; + nesadapter->link_interrupt_count[2] = 0; + nesadapter->link_interrupt_count[3] = 0; + spin_unlock_irqrestore(&nesadapter->phy_lock, flags); + + nesadapter->lc_timer.expires = jiffies + 3600 * HZ; /* 1 hour */ + add_timer(&nesadapter->lc_timer); +} + + +/** + * nes_dump_mem + */ +void nes_dump_mem(unsigned int dump_debug_level, void *addr, int length) +{ + char xlate[] = {'0', '1', '2', '3', '4', '5', '6', '7', '8', '9', + 'a', 'b', 'c', 'd', 'e', 'f'}; + char *ptr; + char hex_buf[80]; + char ascii_buf[20]; + int num_char; + int num_ascii; + int num_hex; + + if (!(nes_debug_level & dump_debug_level)) { + return; + } + + ptr = addr; + if (length > 0x100) { + nes_debug(dump_debug_level, "Length truncated from %x to %x\n", length, 0x100); + length = 0x100; + } + nes_debug(dump_debug_level, "Address=0x%p, length=0x%x (%d)\n", ptr, length, length); + + memset(ascii_buf, 0, 20); + memset(hex_buf, 0, 80); + + num_ascii = 0; + num_hex = 0; + for (num_char = 0; num_char < length; num_char++) { + if (num_ascii == 8) { + ascii_buf[num_ascii++] = ' '; + hex_buf[num_hex++] = '-'; + hex_buf[num_hex++] = ' '; + } + + if (*ptr 
< 0x20 || *ptr > 0x7e) + ascii_buf[num_ascii++] = '.'; + else + ascii_buf[num_ascii++] = *ptr; + hex_buf[num_hex++] = xlate[((*ptr & 0xf0) >> 4)]; + hex_buf[num_hex++] = xlate[*ptr & 0x0f]; + hex_buf[num_hex++] = ' '; + ptr++; + + if (num_ascii >= 17) { + /* output line and reset */ + nes_debug(dump_debug_level, " %s | %s\n", hex_buf, ascii_buf); + memset(ascii_buf, 0, 20); + memset(hex_buf, 0, 80); + num_ascii = 0; + num_hex = 0; + } + } + + /* output the rest */ + if (num_ascii) { + while (num_ascii < 17) { + if (num_ascii == 8) { + hex_buf[num_hex++] = ' '; + hex_buf[num_hex++] = ' '; + } + hex_buf[num_hex++] = ' '; + hex_buf[num_hex++] = ' '; + hex_buf[num_hex++] = ' '; + num_ascii++; + } + + nes_debug(dump_debug_level, " %s | %s\n", hex_buf, ascii_buf); + } +} diff --git a/kernel/drivers/infiniband/hw/nes/nes_verbs.c b/kernel/drivers/infiniband/hw/nes/nes_verbs.c new file mode 100644 index 000000000..c0d0296e7 --- /dev/null +++ b/kernel/drivers/infiniband/hw/nes/nes_verbs.c @@ -0,0 +1,4054 @@ +/* + * Copyright (c) 2006 - 2011 Intel Corporation. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + */ + +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#include "nes.h" + +#include + +atomic_t mod_qp_timouts; +atomic_t qps_created; +atomic_t sw_qps_destroyed; + +static void nes_unregister_ofa_device(struct nes_ib_device *nesibdev); + +/** + * nes_alloc_mw + */ +static struct ib_mw *nes_alloc_mw(struct ib_pd *ibpd, enum ib_mw_type type) +{ + struct nes_pd *nespd = to_nespd(ibpd); + struct nes_vnic *nesvnic = to_nesvnic(ibpd->device); + struct nes_device *nesdev = nesvnic->nesdev; + struct nes_adapter *nesadapter = nesdev->nesadapter; + struct nes_cqp_request *cqp_request; + struct nes_mr *nesmr; + struct ib_mw *ibmw; + struct nes_hw_cqp_wqe *cqp_wqe; + int ret; + u32 stag; + u32 stag_index = 0; + u32 next_stag_index = 0; + u32 driver_key = 0; + u8 stag_key = 0; + + if (type != IB_MW_TYPE_1) + return ERR_PTR(-EINVAL); + + get_random_bytes(&next_stag_index, sizeof(next_stag_index)); + stag_key = (u8)next_stag_index; + + driver_key = 0; + + next_stag_index >>= 8; + next_stag_index %= nesadapter->max_mr; + + ret = nes_alloc_resource(nesadapter, nesadapter->allocated_mrs, + nesadapter->max_mr, &stag_index, &next_stag_index, NES_RESOURCE_MW); + if (ret) { + return ERR_PTR(ret); + } + + nesmr = kzalloc(sizeof(*nesmr), GFP_KERNEL); + if (!nesmr) { + nes_free_resource(nesadapter, nesadapter->allocated_mrs, stag_index); + return ERR_PTR(-ENOMEM); + } + + stag = stag_index << 8; + stag |= driver_key; + stag += (u32)stag_key; + + nes_debug(NES_DBG_MR, "Registering STag 0x%08X, index = 0x%08X\n", + stag, stag_index); + + /* Register the region with the adapter */ + cqp_request = nes_get_cqp_request(nesdev); + if (cqp_request == NULL) { + kfree(nesmr); + nes_free_resource(nesadapter, nesadapter->allocated_mrs, stag_index); + return ERR_PTR(-ENOMEM); + } + + cqp_request->waiting = 1; + cqp_wqe = &cqp_request->cqp_wqe; + + cqp_wqe->wqe_words[NES_CQP_WQE_OPCODE_IDX] = + cpu_to_le32( NES_CQP_ALLOCATE_STAG | NES_CQP_STAG_RIGHTS_REMOTE_READ | + NES_CQP_STAG_RIGHTS_REMOTE_WRITE | NES_CQP_STAG_VA_TO | + NES_CQP_STAG_REM_ACC_EN); + + nes_fill_init_cqp_wqe(cqp_wqe, nesdev); + set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_STAG_WQE_LEN_HIGH_PD_IDX, (nespd->pd_id & 0x00007fff)); + set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_STAG_WQE_STAG_IDX, stag); + + atomic_set(&cqp_request->refcount, 2); + nes_post_cqp_request(nesdev, cqp_request); + + /* Wait for CQP */ + ret = wait_event_timeout(cqp_request->waitq, (cqp_request->request_done != 0), + NES_EVENT_TIMEOUT); + nes_debug(NES_DBG_MR, "Register STag 0x%08X completed, wait_event_timeout ret = %u," + " CQP Major:Minor codes = 0x%04X:0x%04X.\n", + stag, ret, cqp_request->major_code, cqp_request->minor_code); + if ((!ret) || (cqp_request->major_code)) { + nes_put_cqp_request(nesdev, cqp_request); + kfree(nesmr); + nes_free_resource(nesadapter, nesadapter->allocated_mrs, stag_index); + if (!ret) { + return ERR_PTR(-ETIME); + } else { + return ERR_PTR(-ENOMEM); + } + } + nes_put_cqp_request(nesdev, cqp_request); + + nesmr->ibmw.rkey = stag; + nesmr->mode = IWNES_MEMREG_TYPE_MW; + ibmw = &nesmr->ibmw; + nesmr->pbl_4k = 0; + nesmr->pbls_used = 0; + + return ibmw; +} + + +/** + * nes_dealloc_mw + */ +static int nes_dealloc_mw(struct ib_mw *ibmw) +{ + struct nes_mr *nesmr = to_nesmw(ibmw); + struct nes_vnic *nesvnic = to_nesvnic(ibmw->device); + struct nes_device *nesdev = nesvnic->nesdev; + struct nes_adapter *nesadapter = nesdev->nesadapter; + struct nes_hw_cqp_wqe *cqp_wqe; + struct nes_cqp_request *cqp_request; + 
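An aside on the STag handling above: nes_alloc_mw() packs the allocated resource index into bits 8 and up of the rkey and a randomized 8-bit key into the low byte (driver_key is zero here), and nes_dealloc_mw() recovers the index by masking with 0x0fffff00 and shifting right by 8. A minimal standalone sketch of that packing, not part of the patch:

#include <stdio.h>

static unsigned make_stag(unsigned stag_index, unsigned char stag_key)
{
        unsigned stag = stag_index << 8;   /* resource index in bits 8..31 */

        stag += stag_key;                  /* randomized low-byte key */
        return stag;                       /* driver_key is 0 in this code */
}

static unsigned stag_to_index(unsigned rkey)
{
        return (rkey & 0x0fffff00) >> 8;   /* same recovery as nes_dealloc_mw() */
}

int main(void)
{
        unsigned stag = make_stag(0x1234, 0xab);   /* hypothetical index and key */

        printf("stag  = 0x%08X\n", stag);
        printf("index = 0x%05X\n", stag_to_index(stag));
        return 0;
}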
int err = 0; + int ret; + + /* Deallocate the window with the adapter */ + cqp_request = nes_get_cqp_request(nesdev); + if (cqp_request == NULL) { + nes_debug(NES_DBG_MR, "Failed to get a cqp_request.\n"); + return -ENOMEM; + } + cqp_request->waiting = 1; + cqp_wqe = &cqp_request->cqp_wqe; + nes_fill_init_cqp_wqe(cqp_wqe, nesdev); + set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_WQE_OPCODE_IDX, NES_CQP_DEALLOCATE_STAG); + set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_STAG_WQE_STAG_IDX, ibmw->rkey); + + atomic_set(&cqp_request->refcount, 2); + nes_post_cqp_request(nesdev, cqp_request); + + /* Wait for CQP */ + nes_debug(NES_DBG_MR, "Waiting for deallocate STag 0x%08X to complete.\n", + ibmw->rkey); + ret = wait_event_timeout(cqp_request->waitq, (0 != cqp_request->request_done), + NES_EVENT_TIMEOUT); + nes_debug(NES_DBG_MR, "Deallocate STag completed, wait_event_timeout ret = %u," + " CQP Major:Minor codes = 0x%04X:0x%04X.\n", + ret, cqp_request->major_code, cqp_request->minor_code); + if (!ret) + err = -ETIME; + else if (cqp_request->major_code) + err = -EIO; + + nes_put_cqp_request(nesdev, cqp_request); + + nes_free_resource(nesadapter, nesadapter->allocated_mrs, + (ibmw->rkey & 0x0fffff00) >> 8); + kfree(nesmr); + + return err; +} + + +/** + * nes_bind_mw + */ +static int nes_bind_mw(struct ib_qp *ibqp, struct ib_mw *ibmw, + struct ib_mw_bind *ibmw_bind) +{ + u64 u64temp; + struct nes_vnic *nesvnic = to_nesvnic(ibqp->device); + struct nes_device *nesdev = nesvnic->nesdev; + /* struct nes_mr *nesmr = to_nesmw(ibmw); */ + struct nes_qp *nesqp = to_nesqp(ibqp); + struct nes_hw_qp_wqe *wqe; + unsigned long flags = 0; + u32 head; + u32 wqe_misc = 0; + u32 qsize; + + if (nesqp->ibqp_state > IB_QPS_RTS) + return -EINVAL; + + spin_lock_irqsave(&nesqp->lock, flags); + + head = nesqp->hwqp.sq_head; + qsize = nesqp->hwqp.sq_tail; + + /* Check for SQ overflow */ + if (((head + (2 * qsize) - nesqp->hwqp.sq_tail) % qsize) == (qsize - 1)) { + spin_unlock_irqrestore(&nesqp->lock, flags); + return -ENOMEM; + } + + wqe = &nesqp->hwqp.sq_vbase[head]; + /* nes_debug(NES_DBG_MR, "processing sq wqe at %p, head = %u.\n", wqe, head); */ + nes_fill_init_qp_wqe(wqe, nesqp, head); + u64temp = ibmw_bind->wr_id; + set_wqe_64bit_value(wqe->wqe_words, NES_IWARP_SQ_WQE_COMP_SCRATCH_LOW_IDX, u64temp); + wqe_misc = NES_IWARP_SQ_OP_BIND; + + wqe_misc |= NES_IWARP_SQ_WQE_LOCAL_FENCE; + + if (ibmw_bind->send_flags & IB_SEND_SIGNALED) + wqe_misc |= NES_IWARP_SQ_WQE_SIGNALED_COMPL; + + if (ibmw_bind->bind_info.mw_access_flags & IB_ACCESS_REMOTE_WRITE) + wqe_misc |= NES_CQP_STAG_RIGHTS_REMOTE_WRITE; + if (ibmw_bind->bind_info.mw_access_flags & IB_ACCESS_REMOTE_READ) + wqe_misc |= NES_CQP_STAG_RIGHTS_REMOTE_READ; + + set_wqe_32bit_value(wqe->wqe_words, NES_IWARP_SQ_WQE_MISC_IDX, wqe_misc); + set_wqe_32bit_value(wqe->wqe_words, NES_IWARP_SQ_BIND_WQE_MR_IDX, + ibmw_bind->bind_info.mr->lkey); + set_wqe_32bit_value(wqe->wqe_words, NES_IWARP_SQ_BIND_WQE_MW_IDX, ibmw->rkey); + set_wqe_32bit_value(wqe->wqe_words, NES_IWARP_SQ_BIND_WQE_LENGTH_LOW_IDX, + ibmw_bind->bind_info.length); + wqe->wqe_words[NES_IWARP_SQ_BIND_WQE_LENGTH_HIGH_IDX] = 0; + u64temp = (u64)ibmw_bind->bind_info.addr; + set_wqe_64bit_value(wqe->wqe_words, NES_IWARP_SQ_BIND_WQE_VA_FBO_LOW_IDX, u64temp); + + head++; + if (head >= qsize) + head = 0; + + nesqp->hwqp.sq_head = head; + barrier(); + + nes_write32(nesdev->regs+NES_WQE_ALLOC, + (1 << 24) | 0x00800000 | nesqp->hwqp.qp_id); + + spin_unlock_irqrestore(&nesqp->lock, flags); + + return 0; +} + + +/* + * 
nes_alloc_fast_mr + */ +static int alloc_fast_reg_mr(struct nes_device *nesdev, struct nes_pd *nespd, + u32 stag, u32 page_count) +{ + struct nes_hw_cqp_wqe *cqp_wqe; + struct nes_cqp_request *cqp_request; + unsigned long flags; + int ret; + struct nes_adapter *nesadapter = nesdev->nesadapter; + u32 opcode = 0; + u16 major_code; + u64 region_length = page_count * PAGE_SIZE; + + + cqp_request = nes_get_cqp_request(nesdev); + if (cqp_request == NULL) { + nes_debug(NES_DBG_MR, "Failed to get a cqp_request.\n"); + return -ENOMEM; + } + nes_debug(NES_DBG_MR, "alloc_fast_reg_mr: page_count = %d, " + "region_length = %llu\n", + page_count, region_length); + cqp_request->waiting = 1; + cqp_wqe = &cqp_request->cqp_wqe; + + spin_lock_irqsave(&nesadapter->pbl_lock, flags); + if (nesadapter->free_4kpbl > 0) { + nesadapter->free_4kpbl--; + spin_unlock_irqrestore(&nesadapter->pbl_lock, flags); + } else { + /* No 4kpbl's available: */ + spin_unlock_irqrestore(&nesadapter->pbl_lock, flags); + nes_debug(NES_DBG_MR, "Out of Pbls\n"); + nes_free_cqp_request(nesdev, cqp_request); + return -ENOMEM; + } + + opcode = NES_CQP_ALLOCATE_STAG | NES_CQP_STAG_MR | + NES_CQP_STAG_PBL_BLK_SIZE | NES_CQP_STAG_VA_TO | + NES_CQP_STAG_REM_ACC_EN; + /* + * The current OFED API does not support the zero based TO option. + * If added then need to changed the NES_CQP_STAG_VA* option. Also, + * the API does not support that ability to have the MR set for local + * access only when created and not allow the SQ op to override. Given + * this the remote enable must be set here. + */ + + nes_fill_init_cqp_wqe(cqp_wqe, nesdev); + set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_WQE_OPCODE_IDX, opcode); + set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_STAG_WQE_PBL_BLK_COUNT_IDX, 1); + + cqp_wqe->wqe_words[NES_CQP_STAG_WQE_LEN_HIGH_PD_IDX] = + cpu_to_le32((u32)(region_length >> 8) & 0xff000000); + cqp_wqe->wqe_words[NES_CQP_STAG_WQE_LEN_HIGH_PD_IDX] |= + cpu_to_le32(nespd->pd_id & 0x00007fff); + + set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_STAG_WQE_STAG_IDX, stag); + set_wqe_64bit_value(cqp_wqe->wqe_words, NES_CQP_STAG_WQE_VA_LOW_IDX, 0); + set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_STAG_WQE_LEN_LOW_IDX, 0); + set_wqe_64bit_value(cqp_wqe->wqe_words, NES_CQP_STAG_WQE_PA_LOW_IDX, 0); + set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_STAG_WQE_PBL_LEN_IDX, (page_count * 8)); + cqp_wqe->wqe_words[NES_CQP_WQE_OPCODE_IDX] |= cpu_to_le32(NES_CQP_STAG_PBL_BLK_SIZE); + barrier(); + + atomic_set(&cqp_request->refcount, 2); + nes_post_cqp_request(nesdev, cqp_request); + + /* Wait for CQP */ + ret = wait_event_timeout(cqp_request->waitq, + (0 != cqp_request->request_done), + NES_EVENT_TIMEOUT); + + nes_debug(NES_DBG_MR, "Allocate STag 0x%08X completed, " + "wait_event_timeout ret = %u, CQP Major:Minor codes = " + "0x%04X:0x%04X.\n", stag, ret, cqp_request->major_code, + cqp_request->minor_code); + major_code = cqp_request->major_code; + nes_put_cqp_request(nesdev, cqp_request); + + if (!ret || major_code) { + spin_lock_irqsave(&nesadapter->pbl_lock, flags); + nesadapter->free_4kpbl++; + spin_unlock_irqrestore(&nesadapter->pbl_lock, flags); + } + + if (!ret) + return -ETIME; + else if (major_code) + return -EIO; + return 0; +} + +/* + * nes_alloc_fast_reg_mr + */ +static struct ib_mr *nes_alloc_fast_reg_mr(struct ib_pd *ibpd, int max_page_list_len) +{ + struct nes_pd *nespd = to_nespd(ibpd); + struct nes_vnic *nesvnic = to_nesvnic(ibpd->device); + struct nes_device *nesdev = nesvnic->nesdev; + struct nes_adapter *nesadapter = 
nesdev->nesadapter; + + u32 next_stag_index; + u8 stag_key = 0; + u32 driver_key = 0; + int err = 0; + u32 stag_index = 0; + struct nes_mr *nesmr; + u32 stag; + int ret; + struct ib_mr *ibmr; +/* + * Note: Set to always use a fixed length single page entry PBL. This is to allow + * for the fast_reg_mr operation to always know the size of the PBL. + */ + if (max_page_list_len > (NES_4K_PBL_CHUNK_SIZE / sizeof(u64))) + return ERR_PTR(-E2BIG); + + get_random_bytes(&next_stag_index, sizeof(next_stag_index)); + stag_key = (u8)next_stag_index; + next_stag_index >>= 8; + next_stag_index %= nesadapter->max_mr; + + err = nes_alloc_resource(nesadapter, nesadapter->allocated_mrs, + nesadapter->max_mr, &stag_index, + &next_stag_index, NES_RESOURCE_FAST_MR); + if (err) + return ERR_PTR(err); + + nesmr = kzalloc(sizeof(*nesmr), GFP_KERNEL); + if (!nesmr) { + nes_free_resource(nesadapter, nesadapter->allocated_mrs, stag_index); + return ERR_PTR(-ENOMEM); + } + + stag = stag_index << 8; + stag |= driver_key; + stag += (u32)stag_key; + + nes_debug(NES_DBG_MR, "Allocating STag 0x%08X index = 0x%08X\n", + stag, stag_index); + + ret = alloc_fast_reg_mr(nesdev, nespd, stag, max_page_list_len); + + if (ret == 0) { + nesmr->ibmr.rkey = stag; + nesmr->ibmr.lkey = stag; + nesmr->mode = IWNES_MEMREG_TYPE_FMEM; + ibmr = &nesmr->ibmr; + } else { + kfree(nesmr); + nes_free_resource(nesadapter, nesadapter->allocated_mrs, stag_index); + ibmr = ERR_PTR(-ENOMEM); + } + return ibmr; +} + +/* + * nes_alloc_fast_reg_page_list + */ +static struct ib_fast_reg_page_list *nes_alloc_fast_reg_page_list( + struct ib_device *ibdev, + int page_list_len) +{ + struct nes_vnic *nesvnic = to_nesvnic(ibdev); + struct nes_device *nesdev = nesvnic->nesdev; + struct ib_fast_reg_page_list *pifrpl; + struct nes_ib_fast_reg_page_list *pnesfrpl; + + if (page_list_len > (NES_4K_PBL_CHUNK_SIZE / sizeof(u64))) + return ERR_PTR(-E2BIG); + /* + * Allocate the ib_fast_reg_page_list structure, the + * nes_fast_bpl structure, and the PLB table. + */ + pnesfrpl = kmalloc(sizeof(struct nes_ib_fast_reg_page_list) + + page_list_len * sizeof(u64), GFP_KERNEL); + + if (!pnesfrpl) + return ERR_PTR(-ENOMEM); + + pifrpl = &pnesfrpl->ibfrpl; + pifrpl->page_list = &pnesfrpl->pbl; + pifrpl->max_page_list_len = page_list_len; + /* + * Allocate the WQE PBL + */ + pnesfrpl->nes_wqe_pbl.kva = pci_alloc_consistent(nesdev->pcidev, + page_list_len * sizeof(u64), + &pnesfrpl->nes_wqe_pbl.paddr); + + if (!pnesfrpl->nes_wqe_pbl.kva) { + kfree(pnesfrpl); + return ERR_PTR(-ENOMEM); + } + nes_debug(NES_DBG_MR, "nes_alloc_fast_reg_pbl: nes_frpl = %p, " + "ibfrpl = %p, ibfrpl.page_list = %p, pbl.kva = %p, " + "pbl.paddr = %llx\n", pnesfrpl, &pnesfrpl->ibfrpl, + pnesfrpl->ibfrpl.page_list, pnesfrpl->nes_wqe_pbl.kva, + (unsigned long long) pnesfrpl->nes_wqe_pbl.paddr); + + return pifrpl; +} + +/* + * nes_free_fast_reg_page_list + */ +static void nes_free_fast_reg_page_list(struct ib_fast_reg_page_list *pifrpl) +{ + struct nes_vnic *nesvnic = to_nesvnic(pifrpl->device); + struct nes_device *nesdev = nesvnic->nesdev; + struct nes_ib_fast_reg_page_list *pnesfrpl; + + pnesfrpl = container_of(pifrpl, struct nes_ib_fast_reg_page_list, ibfrpl); + /* + * Free the WQE PBL. 
+ */ + pci_free_consistent(nesdev->pcidev, + pifrpl->max_page_list_len * sizeof(u64), + pnesfrpl->nes_wqe_pbl.kva, + pnesfrpl->nes_wqe_pbl.paddr); + /* + * Free the PBL structure + */ + kfree(pnesfrpl); +} + +/** + * nes_query_device + */ +static int nes_query_device(struct ib_device *ibdev, struct ib_device_attr *props) +{ + struct nes_vnic *nesvnic = to_nesvnic(ibdev); + struct nes_device *nesdev = nesvnic->nesdev; + struct nes_ib_device *nesibdev = nesvnic->nesibdev; + + memset(props, 0, sizeof(*props)); + memcpy(&props->sys_image_guid, nesvnic->netdev->dev_addr, 6); + + props->fw_ver = nesdev->nesadapter->firmware_version; + props->device_cap_flags = nesdev->nesadapter->device_cap_flags; + props->vendor_id = nesdev->nesadapter->vendor_id; + props->vendor_part_id = nesdev->nesadapter->vendor_part_id; + props->hw_ver = nesdev->nesadapter->hw_rev; + props->max_mr_size = 0x80000000; + props->max_qp = nesibdev->max_qp; + props->max_qp_wr = nesdev->nesadapter->max_qp_wr - 2; + props->max_sge = nesdev->nesadapter->max_sge; + props->max_cq = nesibdev->max_cq; + props->max_cqe = nesdev->nesadapter->max_cqe; + props->max_mr = nesibdev->max_mr; + props->max_mw = nesibdev->max_mr; + props->max_pd = nesibdev->max_pd; + props->max_sge_rd = 1; + switch (nesdev->nesadapter->max_irrq_wr) { + case 0: + props->max_qp_rd_atom = 2; + break; + case 1: + props->max_qp_rd_atom = 8; + break; + case 2: + props->max_qp_rd_atom = 32; + break; + case 3: + props->max_qp_rd_atom = 64; + break; + default: + props->max_qp_rd_atom = 0; + } + props->max_qp_init_rd_atom = props->max_qp_rd_atom; + props->atomic_cap = IB_ATOMIC_NONE; + props->max_map_per_fmr = 1; + + return 0; +} + + +/** + * nes_query_port + */ +static int nes_query_port(struct ib_device *ibdev, u8 port, struct ib_port_attr *props) +{ + struct nes_vnic *nesvnic = to_nesvnic(ibdev); + struct net_device *netdev = nesvnic->netdev; + + memset(props, 0, sizeof(*props)); + + props->max_mtu = IB_MTU_4096; + + if (netdev->mtu >= 4096) + props->active_mtu = IB_MTU_4096; + else if (netdev->mtu >= 2048) + props->active_mtu = IB_MTU_2048; + else if (netdev->mtu >= 1024) + props->active_mtu = IB_MTU_1024; + else if (netdev->mtu >= 512) + props->active_mtu = IB_MTU_512; + else + props->active_mtu = IB_MTU_256; + + props->lid = 1; + props->lmc = 0; + props->sm_lid = 0; + props->sm_sl = 0; + if (netif_queue_stopped(netdev)) + props->state = IB_PORT_DOWN; + else if (nesvnic->linkup) + props->state = IB_PORT_ACTIVE; + else + props->state = IB_PORT_DOWN; + props->phys_state = 0; + props->port_cap_flags = IB_PORT_CM_SUP | IB_PORT_REINIT_SUP | + IB_PORT_VENDOR_CLASS_SUP | IB_PORT_BOOT_MGMT_SUP; + props->gid_tbl_len = 1; + props->pkey_tbl_len = 1; + props->qkey_viol_cntr = 0; + props->active_width = IB_WIDTH_4X; + props->active_speed = IB_SPEED_SDR; + props->max_msg_sz = 0x80000000; + + return 0; +} + + +/** + * nes_query_pkey + */ +static int nes_query_pkey(struct ib_device *ibdev, u8 port, u16 index, u16 *pkey) +{ + *pkey = 0; + return 0; +} + + +/** + * nes_query_gid + */ +static int nes_query_gid(struct ib_device *ibdev, u8 port, + int index, union ib_gid *gid) +{ + struct nes_vnic *nesvnic = to_nesvnic(ibdev); + + memset(&(gid->raw[0]), 0, sizeof(gid->raw)); + memcpy(&(gid->raw[0]), nesvnic->netdev->dev_addr, 6); + + return 0; +} + + +/** + * nes_alloc_ucontext - Allocate the user context data structure. This keeps track + * of all objects associated with a particular user-mode client. 
+ */ +static struct ib_ucontext *nes_alloc_ucontext(struct ib_device *ibdev, + struct ib_udata *udata) +{ + struct nes_vnic *nesvnic = to_nesvnic(ibdev); + struct nes_device *nesdev = nesvnic->nesdev; + struct nes_adapter *nesadapter = nesdev->nesadapter; + struct nes_alloc_ucontext_req req; + struct nes_alloc_ucontext_resp uresp; + struct nes_ucontext *nes_ucontext; + struct nes_ib_device *nesibdev = nesvnic->nesibdev; + + + if (ib_copy_from_udata(&req, udata, sizeof(struct nes_alloc_ucontext_req))) { + printk(KERN_ERR PFX "Invalid structure size on allocate user context.\n"); + return ERR_PTR(-EINVAL); + } + + if (req.userspace_ver != NES_ABI_USERSPACE_VER) { + printk(KERN_ERR PFX "Invalid userspace driver version detected. Detected version %d, should be %d\n", + req.userspace_ver, NES_ABI_USERSPACE_VER); + return ERR_PTR(-EINVAL); + } + + + memset(&uresp, 0, sizeof uresp); + + uresp.max_qps = nesibdev->max_qp; + uresp.max_pds = nesibdev->max_pd; + uresp.wq_size = nesdev->nesadapter->max_qp_wr * 2; + uresp.virtwq = nesadapter->virtwq; + uresp.kernel_ver = NES_ABI_KERNEL_VER; + + nes_ucontext = kzalloc(sizeof *nes_ucontext, GFP_KERNEL); + if (!nes_ucontext) + return ERR_PTR(-ENOMEM); + + nes_ucontext->nesdev = nesdev; + nes_ucontext->mmap_wq_offset = uresp.max_pds; + nes_ucontext->mmap_cq_offset = nes_ucontext->mmap_wq_offset + + ((sizeof(struct nes_hw_qp_wqe) * uresp.max_qps * 2) + PAGE_SIZE-1) / + PAGE_SIZE; + + + if (ib_copy_to_udata(udata, &uresp, sizeof uresp)) { + kfree(nes_ucontext); + return ERR_PTR(-EFAULT); + } + + INIT_LIST_HEAD(&nes_ucontext->cq_reg_mem_list); + INIT_LIST_HEAD(&nes_ucontext->qp_reg_mem_list); + atomic_set(&nes_ucontext->usecnt, 1); + return &nes_ucontext->ibucontext; +} + + +/** + * nes_dealloc_ucontext + */ +static int nes_dealloc_ucontext(struct ib_ucontext *context) +{ + /* struct nes_vnic *nesvnic = to_nesvnic(context->device); */ + /* struct nes_device *nesdev = nesvnic->nesdev; */ + struct nes_ucontext *nes_ucontext = to_nesucontext(context); + + if (!atomic_dec_and_test(&nes_ucontext->usecnt)) + return 0; + kfree(nes_ucontext); + return 0; +} + + +/** + * nes_mmap + */ +static int nes_mmap(struct ib_ucontext *context, struct vm_area_struct *vma) +{ + unsigned long index; + struct nes_vnic *nesvnic = to_nesvnic(context->device); + struct nes_device *nesdev = nesvnic->nesdev; + /* struct nes_adapter *nesadapter = nesdev->nesadapter; */ + struct nes_ucontext *nes_ucontext; + struct nes_qp *nesqp; + + nes_ucontext = to_nesucontext(context); + + + if (vma->vm_pgoff >= nes_ucontext->mmap_wq_offset) { + index = (vma->vm_pgoff - nes_ucontext->mmap_wq_offset) * PAGE_SIZE; + index /= ((sizeof(struct nes_hw_qp_wqe) * nesdev->nesadapter->max_qp_wr * 2) + + PAGE_SIZE-1) & (~(PAGE_SIZE-1)); + if (!test_bit(index, nes_ucontext->allocated_wqs)) { + nes_debug(NES_DBG_MMAP, "wq %lu not allocated\n", index); + return -EFAULT; + } + nesqp = nes_ucontext->mmap_nesqp[index]; + if (nesqp == NULL) { + nes_debug(NES_DBG_MMAP, "wq %lu has a NULL QP base.\n", index); + return -EFAULT; + } + if (remap_pfn_range(vma, vma->vm_start, + virt_to_phys(nesqp->hwqp.sq_vbase) >> PAGE_SHIFT, + vma->vm_end - vma->vm_start, + vma->vm_page_prot)) { + nes_debug(NES_DBG_MMAP, "remap_pfn_range failed.\n"); + return -EAGAIN; + } + vma->vm_private_data = nesqp; + return 0; + } else { + index = vma->vm_pgoff; + if (!test_bit(index, nes_ucontext->allocated_doorbells)) + return -EFAULT; + + vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot); + if (io_remap_pfn_range(vma, vma->vm_start, + 
(nesdev->doorbell_start + + ((nes_ucontext->mmap_db_index[index] - nesdev->base_doorbell_index) * 4096)) + >> PAGE_SHIFT, PAGE_SIZE, vma->vm_page_prot)) + return -EAGAIN; + vma->vm_private_data = nes_ucontext; + return 0; + } + + return -ENOSYS; +} + + +/** + * nes_alloc_pd + */ +static struct ib_pd *nes_alloc_pd(struct ib_device *ibdev, + struct ib_ucontext *context, struct ib_udata *udata) +{ + struct nes_pd *nespd; + struct nes_vnic *nesvnic = to_nesvnic(ibdev); + struct nes_device *nesdev = nesvnic->nesdev; + struct nes_adapter *nesadapter = nesdev->nesadapter; + struct nes_ucontext *nesucontext; + struct nes_alloc_pd_resp uresp; + u32 pd_num = 0; + int err; + + nes_debug(NES_DBG_PD, "nesvnic=%p, netdev=%p %s, ibdev=%p, context=%p, netdev refcnt=%u\n", + nesvnic, nesdev->netdev[0], nesdev->netdev[0]->name, ibdev, context, + netdev_refcnt_read(nesvnic->netdev)); + + err = nes_alloc_resource(nesadapter, nesadapter->allocated_pds, + nesadapter->max_pd, &pd_num, &nesadapter->next_pd, NES_RESOURCE_PD); + if (err) { + return ERR_PTR(err); + } + + nespd = kzalloc(sizeof (struct nes_pd), GFP_KERNEL); + if (!nespd) { + nes_free_resource(nesadapter, nesadapter->allocated_pds, pd_num); + return ERR_PTR(-ENOMEM); + } + + nes_debug(NES_DBG_PD, "Allocating PD (%p) for ib device %s\n", + nespd, nesvnic->nesibdev->ibdev.name); + + nespd->pd_id = (pd_num << (PAGE_SHIFT-12)) + nesadapter->base_pd; + + if (context) { + nesucontext = to_nesucontext(context); + nespd->mmap_db_index = find_next_zero_bit(nesucontext->allocated_doorbells, + NES_MAX_USER_DB_REGIONS, nesucontext->first_free_db); + nes_debug(NES_DBG_PD, "find_first_zero_biton doorbells returned %u, mapping pd_id %u.\n", + nespd->mmap_db_index, nespd->pd_id); + if (nespd->mmap_db_index >= NES_MAX_USER_DB_REGIONS) { + nes_debug(NES_DBG_PD, "mmap_db_index > MAX\n"); + nes_free_resource(nesadapter, nesadapter->allocated_pds, pd_num); + kfree(nespd); + return ERR_PTR(-ENOMEM); + } + + uresp.pd_id = nespd->pd_id; + uresp.mmap_db_index = nespd->mmap_db_index; + if (ib_copy_to_udata(udata, &uresp, sizeof (struct nes_alloc_pd_resp))) { + nes_free_resource(nesadapter, nesadapter->allocated_pds, pd_num); + kfree(nespd); + return ERR_PTR(-EFAULT); + } + + set_bit(nespd->mmap_db_index, nesucontext->allocated_doorbells); + nesucontext->mmap_db_index[nespd->mmap_db_index] = nespd->pd_id; + nesucontext->first_free_db = nespd->mmap_db_index + 1; + } + + nes_debug(NES_DBG_PD, "PD%u structure located @%p.\n", nespd->pd_id, nespd); + return &nespd->ibpd; +} + + +/** + * nes_dealloc_pd + */ +static int nes_dealloc_pd(struct ib_pd *ibpd) +{ + struct nes_ucontext *nesucontext; + struct nes_pd *nespd = to_nespd(ibpd); + struct nes_vnic *nesvnic = to_nesvnic(ibpd->device); + struct nes_device *nesdev = nesvnic->nesdev; + struct nes_adapter *nesadapter = nesdev->nesadapter; + + if ((ibpd->uobject) && (ibpd->uobject->context)) { + nesucontext = to_nesucontext(ibpd->uobject->context); + nes_debug(NES_DBG_PD, "Clearing bit %u from allocated doorbells\n", + nespd->mmap_db_index); + clear_bit(nespd->mmap_db_index, nesucontext->allocated_doorbells); + nesucontext->mmap_db_index[nespd->mmap_db_index] = 0; + if (nesucontext->first_free_db > nespd->mmap_db_index) { + nesucontext->first_free_db = nespd->mmap_db_index; + } + } + + nes_debug(NES_DBG_PD, "Deallocating PD%u structure located @%p.\n", + nespd->pd_id, nespd); + nes_free_resource(nesadapter, nesadapter->allocated_pds, + (nespd->pd_id-nesadapter->base_pd)>>(PAGE_SHIFT-12)); + kfree(nespd); + + return 0; +} + + +/** + * 
nes_create_ah + */ +static struct ib_ah *nes_create_ah(struct ib_pd *pd, struct ib_ah_attr *ah_attr) +{ + return ERR_PTR(-ENOSYS); +} + + +/** + * nes_destroy_ah + */ +static int nes_destroy_ah(struct ib_ah *ah) +{ + return -ENOSYS; +} + + +/** + * nes_get_encoded_size + */ +static inline u8 nes_get_encoded_size(int *size) +{ + u8 encoded_size = 0; + if (*size <= 32) { + *size = 32; + encoded_size = 1; + } else if (*size <= 128) { + *size = 128; + encoded_size = 2; + } else if (*size <= 512) { + *size = 512; + encoded_size = 3; + } + return (encoded_size); +} + + + +/** + * nes_setup_virt_qp + */ +static int nes_setup_virt_qp(struct nes_qp *nesqp, struct nes_pbl *nespbl, + struct nes_vnic *nesvnic, int sq_size, int rq_size) +{ + unsigned long flags; + void *mem; + __le64 *pbl = NULL; + __le64 *tpbl; + __le64 *pblbuffer; + struct nes_device *nesdev = nesvnic->nesdev; + struct nes_adapter *nesadapter = nesdev->nesadapter; + u32 pbl_entries; + u8 rq_pbl_entries; + u8 sq_pbl_entries; + + pbl_entries = nespbl->pbl_size >> 3; + nes_debug(NES_DBG_QP, "Userspace PBL, pbl_size=%u, pbl_entries = %d pbl_vbase=%p, pbl_pbase=%lx\n", + nespbl->pbl_size, pbl_entries, + (void *)nespbl->pbl_vbase, + (unsigned long) nespbl->pbl_pbase); + pbl = (__le64 *) nespbl->pbl_vbase; /* points to first pbl entry */ + /* now lets set the sq_vbase as well as rq_vbase addrs we will assign */ + /* the first pbl to be fro the rq_vbase... */ + rq_pbl_entries = (rq_size * sizeof(struct nes_hw_qp_wqe)) >> 12; + sq_pbl_entries = (sq_size * sizeof(struct nes_hw_qp_wqe)) >> 12; + nesqp->hwqp.sq_pbase = (le32_to_cpu(((__le32 *)pbl)[0])) | ((u64)((le32_to_cpu(((__le32 *)pbl)[1]))) << 32); + if (!nespbl->page) { + nes_debug(NES_DBG_QP, "QP nespbl->page is NULL \n"); + kfree(nespbl); + return -ENOMEM; + } + + nesqp->hwqp.sq_vbase = kmap(nespbl->page); + nesqp->page = nespbl->page; + if (!nesqp->hwqp.sq_vbase) { + nes_debug(NES_DBG_QP, "QP sq_vbase kmap failed\n"); + kfree(nespbl); + return -ENOMEM; + } + + /* Now to get to sq.. we need to calculate how many */ + /* PBL entries were used by the rq.. */ + pbl += sq_pbl_entries; + nesqp->hwqp.rq_pbase = (le32_to_cpu(((__le32 *)pbl)[0])) | ((u64)((le32_to_cpu(((__le32 *)pbl)[1]))) << 32); + /* nesqp->hwqp.rq_vbase = bus_to_virt(*pbl); */ + /*nesqp->hwqp.rq_vbase = phys_to_virt(*pbl); */ + + nes_debug(NES_DBG_QP, "QP sq_vbase= %p sq_pbase=%lx rq_vbase=%p rq_pbase=%lx\n", + nesqp->hwqp.sq_vbase, (unsigned long) nesqp->hwqp.sq_pbase, + nesqp->hwqp.rq_vbase, (unsigned long) nesqp->hwqp.rq_pbase); + spin_lock_irqsave(&nesadapter->pbl_lock, flags); + if (!nesadapter->free_256pbl) { + pci_free_consistent(nesdev->pcidev, nespbl->pbl_size, nespbl->pbl_vbase, + nespbl->pbl_pbase); + spin_unlock_irqrestore(&nesadapter->pbl_lock, flags); + kunmap(nesqp->page); + kfree(nespbl); + return -ENOMEM; + } + nesadapter->free_256pbl--; + spin_unlock_irqrestore(&nesadapter->pbl_lock, flags); + + nesqp->pbl_vbase = pci_alloc_consistent(nesdev->pcidev, 256, &nesqp->pbl_pbase); + pblbuffer = nesqp->pbl_vbase; + if (!nesqp->pbl_vbase) { + /* memory allocated during nes_reg_user_mr() */ + pci_free_consistent(nesdev->pcidev, nespbl->pbl_size, nespbl->pbl_vbase, + nespbl->pbl_pbase); + kfree(nespbl); + spin_lock_irqsave(&nesadapter->pbl_lock, flags); + nesadapter->free_256pbl++; + spin_unlock_irqrestore(&nesadapter->pbl_lock, flags); + kunmap(nesqp->page); + return -ENOMEM; + } + memset(nesqp->pbl_vbase, 0, 256); + /* fill in the page address in the pbl buffer.. 
*/ + tpbl = pblbuffer + 16; + pbl = (__le64 *)nespbl->pbl_vbase; + while (sq_pbl_entries--) + *tpbl++ = *pbl++; + tpbl = pblbuffer; + while (rq_pbl_entries--) + *tpbl++ = *pbl++; + + /* done with memory allocated during nes_reg_user_mr() */ + pci_free_consistent(nesdev->pcidev, nespbl->pbl_size, nespbl->pbl_vbase, + nespbl->pbl_pbase); + kfree(nespbl); + + nesqp->qp_mem_size = + max((u32)sizeof(struct nes_qp_context), ((u32)256)) + 256; /* this is Q2 */ + /* Round up to a multiple of a page */ + nesqp->qp_mem_size += PAGE_SIZE - 1; + nesqp->qp_mem_size &= ~(PAGE_SIZE - 1); + + mem = pci_alloc_consistent(nesdev->pcidev, nesqp->qp_mem_size, + &nesqp->hwqp.q2_pbase); + + if (!mem) { + pci_free_consistent(nesdev->pcidev, 256, nesqp->pbl_vbase, nesqp->pbl_pbase); + nesqp->pbl_vbase = NULL; + spin_lock_irqsave(&nesadapter->pbl_lock, flags); + nesadapter->free_256pbl++; + spin_unlock_irqrestore(&nesadapter->pbl_lock, flags); + kunmap(nesqp->page); + return -ENOMEM; + } + nesqp->sq_kmapped = 1; + nesqp->hwqp.q2_vbase = mem; + mem += 256; + memset(nesqp->hwqp.q2_vbase, 0, 256); + nesqp->nesqp_context = mem; + memset(nesqp->nesqp_context, 0, sizeof(*nesqp->nesqp_context)); + nesqp->nesqp_context_pbase = nesqp->hwqp.q2_pbase + 256; + + return 0; +} + + +/** + * nes_setup_mmap_qp + */ +static int nes_setup_mmap_qp(struct nes_qp *nesqp, struct nes_vnic *nesvnic, + int sq_size, int rq_size) +{ + void *mem; + struct nes_device *nesdev = nesvnic->nesdev; + + nesqp->qp_mem_size = (sizeof(struct nes_hw_qp_wqe) * sq_size) + + (sizeof(struct nes_hw_qp_wqe) * rq_size) + + max((u32)sizeof(struct nes_qp_context), ((u32)256)) + + 256; /* this is Q2 */ + /* Round up to a multiple of a page */ + nesqp->qp_mem_size += PAGE_SIZE - 1; + nesqp->qp_mem_size &= ~(PAGE_SIZE - 1); + + mem = pci_alloc_consistent(nesdev->pcidev, nesqp->qp_mem_size, + &nesqp->hwqp.sq_pbase); + if (!mem) + return -ENOMEM; + nes_debug(NES_DBG_QP, "PCI consistent memory for " + "host descriptor rings located @ %p (pa = 0x%08lX.) size = %u.\n", + mem, (unsigned long)nesqp->hwqp.sq_pbase, nesqp->qp_mem_size); + + memset(mem, 0, nesqp->qp_mem_size); + + nesqp->hwqp.sq_vbase = mem; + mem += sizeof(struct nes_hw_qp_wqe) * sq_size; + + nesqp->hwqp.rq_vbase = mem; + nesqp->hwqp.rq_pbase = nesqp->hwqp.sq_pbase + + sizeof(struct nes_hw_qp_wqe) * sq_size; + mem += sizeof(struct nes_hw_qp_wqe) * rq_size; + + nesqp->hwqp.q2_vbase = mem; + nesqp->hwqp.q2_pbase = nesqp->hwqp.rq_pbase + + sizeof(struct nes_hw_qp_wqe) * rq_size; + mem += 256; + memset(nesqp->hwqp.q2_vbase, 0, 256); + + nesqp->nesqp_context = mem; + nesqp->nesqp_context_pbase = nesqp->hwqp.q2_pbase + 256; + memset(nesqp->nesqp_context, 0, sizeof(*nesqp->nesqp_context)); + return 0; +} + + +/** + * nes_free_qp_mem() is to free up the qp's pci_alloc_consistent() memory. 
+ */ +static inline void nes_free_qp_mem(struct nes_device *nesdev, + struct nes_qp *nesqp, int virt_wqs) +{ + unsigned long flags; + struct nes_adapter *nesadapter = nesdev->nesadapter; + if (!virt_wqs) { + pci_free_consistent(nesdev->pcidev, nesqp->qp_mem_size, + nesqp->hwqp.sq_vbase, nesqp->hwqp.sq_pbase); + }else { + spin_lock_irqsave(&nesadapter->pbl_lock, flags); + nesadapter->free_256pbl++; + spin_unlock_irqrestore(&nesadapter->pbl_lock, flags); + pci_free_consistent(nesdev->pcidev, nesqp->qp_mem_size, nesqp->hwqp.q2_vbase, nesqp->hwqp.q2_pbase); + pci_free_consistent(nesdev->pcidev, 256, nesqp->pbl_vbase, nesqp->pbl_pbase ); + nesqp->pbl_vbase = NULL; + if (nesqp->sq_kmapped) { + nesqp->sq_kmapped = 0; + kunmap(nesqp->page); + } + } +} + + +/** + * nes_create_qp + */ +static struct ib_qp *nes_create_qp(struct ib_pd *ibpd, + struct ib_qp_init_attr *init_attr, struct ib_udata *udata) +{ + u64 u64temp= 0; + u64 u64nesqp = 0; + struct nes_pd *nespd = to_nespd(ibpd); + struct nes_vnic *nesvnic = to_nesvnic(ibpd->device); + struct nes_device *nesdev = nesvnic->nesdev; + struct nes_adapter *nesadapter = nesdev->nesadapter; + struct nes_qp *nesqp; + struct nes_cq *nescq; + struct nes_ucontext *nes_ucontext; + struct nes_hw_cqp_wqe *cqp_wqe; + struct nes_cqp_request *cqp_request; + struct nes_create_qp_req req; + struct nes_create_qp_resp uresp; + struct nes_pbl *nespbl = NULL; + u32 qp_num = 0; + u32 opcode = 0; + /* u32 counter = 0; */ + void *mem; + unsigned long flags; + int ret; + int err; + int virt_wqs = 0; + int sq_size; + int rq_size; + u8 sq_encoded_size; + u8 rq_encoded_size; + /* int counter; */ + + if (init_attr->create_flags) + return ERR_PTR(-EINVAL); + + atomic_inc(&qps_created); + switch (init_attr->qp_type) { + case IB_QPT_RC: + if (nes_drv_opt & NES_DRV_OPT_NO_INLINE_DATA) { + init_attr->cap.max_inline_data = 0; + } else { + init_attr->cap.max_inline_data = 64; + } + sq_size = init_attr->cap.max_send_wr; + rq_size = init_attr->cap.max_recv_wr; + + /* check if the encoded sizes are OK or not... */ + sq_encoded_size = nes_get_encoded_size(&sq_size); + rq_encoded_size = nes_get_encoded_size(&rq_size); + + if ((!sq_encoded_size) || (!rq_encoded_size)) { + nes_debug(NES_DBG_QP, "ERROR bad rq (%u) or sq (%u) size\n", + rq_size, sq_size); + return ERR_PTR(-EINVAL); + } + + init_attr->cap.max_send_wr = sq_size -2; + init_attr->cap.max_recv_wr = rq_size -1; + nes_debug(NES_DBG_QP, "RQ size=%u, SQ Size=%u\n", rq_size, sq_size); + + ret = nes_alloc_resource(nesadapter, nesadapter->allocated_qps, + nesadapter->max_qp, &qp_num, &nesadapter->next_qp, NES_RESOURCE_QP); + if (ret) { + return ERR_PTR(ret); + } + + /* Need 512 (actually now 1024) byte alignment on this structure */ + mem = kzalloc(sizeof(*nesqp)+NES_SW_CONTEXT_ALIGN-1, GFP_KERNEL); + if (!mem) { + nes_free_resource(nesadapter, nesadapter->allocated_qps, qp_num); + nes_debug(NES_DBG_QP, "Unable to allocate QP\n"); + return ERR_PTR(-ENOMEM); + } + u64nesqp = (unsigned long)mem; + u64nesqp += ((u64)NES_SW_CONTEXT_ALIGN) - 1; + u64temp = ((u64)NES_SW_CONTEXT_ALIGN) - 1; + u64nesqp &= ~u64temp; + nesqp = (struct nes_qp *)(unsigned long)u64nesqp; + /* nes_debug(NES_DBG_QP, "nesqp=%p, allocated buffer=%p. 
Rounded to closest %u\n", + nesqp, mem, NES_SW_CONTEXT_ALIGN); */ + nesqp->allocated_buffer = mem; + + if (udata) { + if (ib_copy_from_udata(&req, udata, sizeof(struct nes_create_qp_req))) { + nes_free_resource(nesadapter, nesadapter->allocated_qps, qp_num); + kfree(nesqp->allocated_buffer); + nes_debug(NES_DBG_QP, "ib_copy_from_udata() Failed \n"); + return ERR_PTR(-EFAULT); + } + if (req.user_wqe_buffers) { + virt_wqs = 1; + } + if (req.user_qp_buffer) + nesqp->nesuqp_addr = req.user_qp_buffer; + if ((ibpd->uobject) && (ibpd->uobject->context)) { + nesqp->user_mode = 1; + nes_ucontext = to_nesucontext(ibpd->uobject->context); + if (virt_wqs) { + err = 1; + list_for_each_entry(nespbl, &nes_ucontext->qp_reg_mem_list, list) { + if (nespbl->user_base == (unsigned long )req.user_wqe_buffers) { + list_del(&nespbl->list); + err = 0; + nes_debug(NES_DBG_QP, "Found PBL for virtual QP. nespbl=%p. user_base=0x%lx\n", + nespbl, nespbl->user_base); + break; + } + } + if (err) { + nes_debug(NES_DBG_QP, "Didn't Find PBL for virtual QP. address = %llx.\n", + (long long unsigned int)req.user_wqe_buffers); + nes_free_resource(nesadapter, nesadapter->allocated_qps, qp_num); + kfree(nesqp->allocated_buffer); + return ERR_PTR(-EFAULT); + } + } + + nes_ucontext = to_nesucontext(ibpd->uobject->context); + nesqp->mmap_sq_db_index = + find_next_zero_bit(nes_ucontext->allocated_wqs, + NES_MAX_USER_WQ_REGIONS, nes_ucontext->first_free_wq); + /* nes_debug(NES_DBG_QP, "find_first_zero_biton wqs returned %u\n", + nespd->mmap_db_index); */ + if (nesqp->mmap_sq_db_index >= NES_MAX_USER_WQ_REGIONS) { + nes_debug(NES_DBG_QP, + "db index > max user regions, failing create QP\n"); + nes_free_resource(nesadapter, nesadapter->allocated_qps, qp_num); + if (virt_wqs) { + pci_free_consistent(nesdev->pcidev, nespbl->pbl_size, nespbl->pbl_vbase, + nespbl->pbl_pbase); + kfree(nespbl); + } + kfree(nesqp->allocated_buffer); + return ERR_PTR(-ENOMEM); + } + set_bit(nesqp->mmap_sq_db_index, nes_ucontext->allocated_wqs); + nes_ucontext->mmap_nesqp[nesqp->mmap_sq_db_index] = nesqp; + nes_ucontext->first_free_wq = nesqp->mmap_sq_db_index + 1; + } else { + nes_free_resource(nesadapter, nesadapter->allocated_qps, qp_num); + kfree(nesqp->allocated_buffer); + return ERR_PTR(-EFAULT); + } + } + err = (!virt_wqs) ? 
nes_setup_mmap_qp(nesqp, nesvnic, sq_size, rq_size) : + nes_setup_virt_qp(nesqp, nespbl, nesvnic, sq_size, rq_size); + if (err) { + nes_debug(NES_DBG_QP, + "error geting qp mem code = %d\n", err); + nes_free_resource(nesadapter, nesadapter->allocated_qps, qp_num); + kfree(nesqp->allocated_buffer); + return ERR_PTR(-ENOMEM); + } + + nesqp->hwqp.sq_size = sq_size; + nesqp->hwqp.sq_encoded_size = sq_encoded_size; + nesqp->hwqp.sq_head = 1; + nesqp->hwqp.rq_size = rq_size; + nesqp->hwqp.rq_encoded_size = rq_encoded_size; + /* nes_debug(NES_DBG_QP, "nesqp->nesqp_context_pbase = %p\n", + (void *)nesqp->nesqp_context_pbase); + */ + nesqp->hwqp.qp_id = qp_num; + nesqp->ibqp.qp_num = nesqp->hwqp.qp_id; + nesqp->nespd = nespd; + + nescq = to_nescq(init_attr->send_cq); + nesqp->nesscq = nescq; + nescq = to_nescq(init_attr->recv_cq); + nesqp->nesrcq = nescq; + + nesqp->nesqp_context->misc |= cpu_to_le32((u32)PCI_FUNC(nesdev->pcidev->devfn) << + NES_QPCONTEXT_MISC_PCI_FCN_SHIFT); + nesqp->nesqp_context->misc |= cpu_to_le32((u32)nesqp->hwqp.rq_encoded_size << + NES_QPCONTEXT_MISC_RQ_SIZE_SHIFT); + nesqp->nesqp_context->misc |= cpu_to_le32((u32)nesqp->hwqp.sq_encoded_size << + NES_QPCONTEXT_MISC_SQ_SIZE_SHIFT); + if (!udata) { + nesqp->nesqp_context->misc |= cpu_to_le32(NES_QPCONTEXT_MISC_PRIV_EN); + nesqp->nesqp_context->misc |= cpu_to_le32(NES_QPCONTEXT_MISC_FAST_REGISTER_EN); + } + nesqp->nesqp_context->cqs = cpu_to_le32(nesqp->nesscq->hw_cq.cq_number + + ((u32)nesqp->nesrcq->hw_cq.cq_number << 16)); + u64temp = (u64)nesqp->hwqp.sq_pbase; + nesqp->nesqp_context->sq_addr_low = cpu_to_le32((u32)u64temp); + nesqp->nesqp_context->sq_addr_high = cpu_to_le32((u32)(u64temp >> 32)); + + + if (!virt_wqs) { + u64temp = (u64)nesqp->hwqp.sq_pbase; + nesqp->nesqp_context->sq_addr_low = cpu_to_le32((u32)u64temp); + nesqp->nesqp_context->sq_addr_high = cpu_to_le32((u32)(u64temp >> 32)); + u64temp = (u64)nesqp->hwqp.rq_pbase; + nesqp->nesqp_context->rq_addr_low = cpu_to_le32((u32)u64temp); + nesqp->nesqp_context->rq_addr_high = cpu_to_le32((u32)(u64temp >> 32)); + } else { + u64temp = (u64)nesqp->pbl_pbase; + nesqp->nesqp_context->rq_addr_low = cpu_to_le32((u32)u64temp); + nesqp->nesqp_context->rq_addr_high = cpu_to_le32((u32)(u64temp >> 32)); + } + + /* nes_debug(NES_DBG_QP, "next_qp_nic_index=%u, using nic_index=%d\n", + nesvnic->next_qp_nic_index, + nesvnic->qp_nic_index[nesvnic->next_qp_nic_index]); */ + spin_lock_irqsave(&nesdev->cqp.lock, flags); + nesqp->nesqp_context->misc2 |= cpu_to_le32( + (u32)nesvnic->qp_nic_index[nesvnic->next_qp_nic_index] << + NES_QPCONTEXT_MISC2_NIC_INDEX_SHIFT); + nesvnic->next_qp_nic_index++; + if ((nesvnic->next_qp_nic_index > 3) || + (nesvnic->qp_nic_index[nesvnic->next_qp_nic_index] == 0xf)) { + nesvnic->next_qp_nic_index = 0; + } + spin_unlock_irqrestore(&nesdev->cqp.lock, flags); + + nesqp->nesqp_context->pd_index_wscale |= cpu_to_le32((u32)nesqp->nespd->pd_id << 16); + u64temp = (u64)nesqp->hwqp.q2_pbase; + nesqp->nesqp_context->q2_addr_low = cpu_to_le32((u32)u64temp); + nesqp->nesqp_context->q2_addr_high = cpu_to_le32((u32)(u64temp >> 32)); + nesqp->nesqp_context->aeq_token_low = cpu_to_le32((u32)((unsigned long)(nesqp))); + nesqp->nesqp_context->aeq_token_high = cpu_to_le32((u32)(upper_32_bits((unsigned long)(nesqp)))); + nesqp->nesqp_context->ird_ord_sizes = cpu_to_le32(NES_QPCONTEXT_ORDIRD_ALSMM | + NES_QPCONTEXT_ORDIRD_AAH | + ((((u32)nesadapter->max_irrq_wr) << + NES_QPCONTEXT_ORDIRD_IRDSIZE_SHIFT) & NES_QPCONTEXT_ORDIRD_IRDSIZE_MASK)); + if (disable_mpa_crc) { + 
nes_debug(NES_DBG_QP, "Disabling MPA crc checking due to module option.\n"); + nesqp->nesqp_context->ird_ord_sizes |= cpu_to_le32(NES_QPCONTEXT_ORDIRD_RNMC); + } + + + /* Create the QP */ + cqp_request = nes_get_cqp_request(nesdev); + if (cqp_request == NULL) { + nes_debug(NES_DBG_QP, "Failed to get a cqp_request\n"); + nes_free_resource(nesadapter, nesadapter->allocated_qps, qp_num); + nes_free_qp_mem(nesdev, nesqp,virt_wqs); + kfree(nesqp->allocated_buffer); + return ERR_PTR(-ENOMEM); + } + cqp_request->waiting = 1; + cqp_wqe = &cqp_request->cqp_wqe; + + if (!virt_wqs) { + opcode = NES_CQP_CREATE_QP | NES_CQP_QP_TYPE_IWARP | + NES_CQP_QP_IWARP_STATE_IDLE; + } else { + opcode = NES_CQP_CREATE_QP | NES_CQP_QP_TYPE_IWARP | NES_CQP_QP_VIRT_WQS | + NES_CQP_QP_IWARP_STATE_IDLE; + } + opcode |= NES_CQP_QP_CQS_VALID; + nes_fill_init_cqp_wqe(cqp_wqe, nesdev); + set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_WQE_OPCODE_IDX, opcode); + set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_WQE_ID_IDX, nesqp->hwqp.qp_id); + + u64temp = (u64)nesqp->nesqp_context_pbase; + set_wqe_64bit_value(cqp_wqe->wqe_words, NES_CQP_QP_WQE_CONTEXT_LOW_IDX, u64temp); + + atomic_set(&cqp_request->refcount, 2); + nes_post_cqp_request(nesdev, cqp_request); + + /* Wait for CQP */ + nes_debug(NES_DBG_QP, "Waiting for create iWARP QP%u to complete.\n", + nesqp->hwqp.qp_id); + ret = wait_event_timeout(cqp_request->waitq, + (cqp_request->request_done != 0), NES_EVENT_TIMEOUT); + nes_debug(NES_DBG_QP, "Create iwarp QP%u completed, wait_event_timeout ret=%u," + " nesdev->cqp_head = %u, nesdev->cqp.sq_tail = %u," + " CQP Major:Minor codes = 0x%04X:0x%04X.\n", + nesqp->hwqp.qp_id, ret, nesdev->cqp.sq_head, nesdev->cqp.sq_tail, + cqp_request->major_code, cqp_request->minor_code); + if ((!ret) || (cqp_request->major_code)) { + nes_put_cqp_request(nesdev, cqp_request); + nes_free_resource(nesadapter, nesadapter->allocated_qps, qp_num); + nes_free_qp_mem(nesdev, nesqp,virt_wqs); + kfree(nesqp->allocated_buffer); + if (!ret) { + return ERR_PTR(-ETIME); + } else { + return ERR_PTR(-EIO); + } + } + + nes_put_cqp_request(nesdev, cqp_request); + + if (ibpd->uobject) { + uresp.mmap_sq_db_index = nesqp->mmap_sq_db_index; + uresp.mmap_rq_db_index = 0; + uresp.actual_sq_size = sq_size; + uresp.actual_rq_size = rq_size; + uresp.qp_id = nesqp->hwqp.qp_id; + uresp.nes_drv_opt = nes_drv_opt; + if (ib_copy_to_udata(udata, &uresp, sizeof uresp)) { + nes_free_resource(nesadapter, nesadapter->allocated_qps, qp_num); + nes_free_qp_mem(nesdev, nesqp,virt_wqs); + kfree(nesqp->allocated_buffer); + return ERR_PTR(-EFAULT); + } + } + + nes_debug(NES_DBG_QP, "QP%u structure located @%p.Size = %u.\n", + nesqp->hwqp.qp_id, nesqp, (u32)sizeof(*nesqp)); + spin_lock_init(&nesqp->lock); + nes_add_ref(&nesqp->ibqp); + break; + default: + nes_debug(NES_DBG_QP, "Invalid QP type: %d\n", init_attr->qp_type); + return ERR_PTR(-EINVAL); + } + + nesqp->sig_all = (init_attr->sq_sig_type == IB_SIGNAL_ALL_WR); + init_timer(&nesqp->terminate_timer); + nesqp->terminate_timer.function = nes_terminate_timeout; + nesqp->terminate_timer.data = (unsigned long)nesqp; + + /* update the QP table */ + nesdev->nesadapter->qp_table[nesqp->hwqp.qp_id-NES_FIRST_QPN] = nesqp; + nes_debug(NES_DBG_QP, "netdev refcnt=%u\n", + netdev_refcnt_read(nesvnic->netdev)); + + return &nesqp->ibqp; +} + +/** + * nes_clean_cq + */ +static void nes_clean_cq(struct nes_qp *nesqp, struct nes_cq *nescq) +{ + u32 cq_head; + u32 lo; + u32 hi; + u64 u64temp; + unsigned long flags = 0; + + 
spin_lock_irqsave(&nescq->lock, flags); + + cq_head = nescq->hw_cq.cq_head; + while (le32_to_cpu(nescq->hw_cq.cq_vbase[cq_head].cqe_words[NES_CQE_OPCODE_IDX]) & NES_CQE_VALID) { + rmb(); + lo = le32_to_cpu(nescq->hw_cq.cq_vbase[cq_head].cqe_words[NES_CQE_COMP_COMP_CTX_LOW_IDX]); + hi = le32_to_cpu(nescq->hw_cq.cq_vbase[cq_head].cqe_words[NES_CQE_COMP_COMP_CTX_HIGH_IDX]); + u64temp = (((u64)hi) << 32) | ((u64)lo); + u64temp &= ~(NES_SW_CONTEXT_ALIGN-1); + if (u64temp == (u64)(unsigned long)nesqp) { + /* Zero the context value so cqe will be ignored */ + nescq->hw_cq.cq_vbase[cq_head].cqe_words[NES_CQE_COMP_COMP_CTX_LOW_IDX] = 0; + nescq->hw_cq.cq_vbase[cq_head].cqe_words[NES_CQE_COMP_COMP_CTX_HIGH_IDX] = 0; + } + + if (++cq_head >= nescq->hw_cq.cq_size) + cq_head = 0; + } + + spin_unlock_irqrestore(&nescq->lock, flags); +} + + +/** + * nes_destroy_qp + */ +static int nes_destroy_qp(struct ib_qp *ibqp) +{ + struct nes_qp *nesqp = to_nesqp(ibqp); + struct nes_ucontext *nes_ucontext; + struct ib_qp_attr attr; + struct iw_cm_id *cm_id; + struct iw_cm_event cm_event; + int ret = 0; + + atomic_inc(&sw_qps_destroyed); + nesqp->destroyed = 1; + + /* Blow away the connection if it exists. */ + if (nesqp->ibqp_state >= IB_QPS_INIT && nesqp->ibqp_state <= IB_QPS_RTS) { + /* if (nesqp->ibqp_state == IB_QPS_RTS) { */ + attr.qp_state = IB_QPS_ERR; + nes_modify_qp(&nesqp->ibqp, &attr, IB_QP_STATE, NULL); + } + + if (((nesqp->ibqp_state == IB_QPS_INIT) || + (nesqp->ibqp_state == IB_QPS_RTR)) && (nesqp->cm_id)) { + cm_id = nesqp->cm_id; + cm_event.event = IW_CM_EVENT_CONNECT_REPLY; + cm_event.status = -ETIMEDOUT; + cm_event.local_addr = cm_id->local_addr; + cm_event.remote_addr = cm_id->remote_addr; + cm_event.private_data = NULL; + cm_event.private_data_len = 0; + + nes_debug(NES_DBG_QP, "Generating a CM Timeout Event for " + "QP%u. cm_id = %p, refcount = %u. 
\n", + nesqp->hwqp.qp_id, cm_id, atomic_read(&nesqp->refcount)); + + cm_id->rem_ref(cm_id); + ret = cm_id->event_handler(cm_id, &cm_event); + if (ret) + nes_debug(NES_DBG_QP, "OFA CM event_handler returned, ret=%d\n", ret); + } + + if (nesqp->user_mode) { + if ((ibqp->uobject)&&(ibqp->uobject->context)) { + nes_ucontext = to_nesucontext(ibqp->uobject->context); + clear_bit(nesqp->mmap_sq_db_index, nes_ucontext->allocated_wqs); + nes_ucontext->mmap_nesqp[nesqp->mmap_sq_db_index] = NULL; + if (nes_ucontext->first_free_wq > nesqp->mmap_sq_db_index) { + nes_ucontext->first_free_wq = nesqp->mmap_sq_db_index; + } + } + if (nesqp->pbl_pbase && nesqp->sq_kmapped) { + nesqp->sq_kmapped = 0; + kunmap(nesqp->page); + } + } else { + /* Clean any pending completions from the cq(s) */ + if (nesqp->nesscq) + nes_clean_cq(nesqp, nesqp->nesscq); + + if ((nesqp->nesrcq) && (nesqp->nesrcq != nesqp->nesscq)) + nes_clean_cq(nesqp, nesqp->nesrcq); + } + nes_rem_ref(&nesqp->ibqp); + return 0; +} + + +/** + * nes_create_cq + */ +static struct ib_cq *nes_create_cq(struct ib_device *ibdev, int entries, + int comp_vector, + struct ib_ucontext *context, struct ib_udata *udata) +{ + u64 u64temp; + struct nes_vnic *nesvnic = to_nesvnic(ibdev); + struct nes_device *nesdev = nesvnic->nesdev; + struct nes_adapter *nesadapter = nesdev->nesadapter; + struct nes_cq *nescq; + struct nes_ucontext *nes_ucontext = NULL; + struct nes_cqp_request *cqp_request; + void *mem = NULL; + struct nes_hw_cqp_wqe *cqp_wqe; + struct nes_pbl *nespbl = NULL; + struct nes_create_cq_req req; + struct nes_create_cq_resp resp; + u32 cq_num = 0; + u32 opcode = 0; + u32 pbl_entries = 1; + int err; + unsigned long flags; + int ret; + + if (entries > nesadapter->max_cqe) + return ERR_PTR(-EINVAL); + + err = nes_alloc_resource(nesadapter, nesadapter->allocated_cqs, + nesadapter->max_cq, &cq_num, &nesadapter->next_cq, NES_RESOURCE_CQ); + if (err) { + return ERR_PTR(err); + } + + nescq = kzalloc(sizeof(struct nes_cq), GFP_KERNEL); + if (!nescq) { + nes_free_resource(nesadapter, nesadapter->allocated_cqs, cq_num); + nes_debug(NES_DBG_CQ, "Unable to allocate nes_cq struct\n"); + return ERR_PTR(-ENOMEM); + } + + nescq->hw_cq.cq_size = max(entries + 1, 5); + nescq->hw_cq.cq_number = cq_num; + nescq->ibcq.cqe = nescq->hw_cq.cq_size - 1; + + + if (context) { + nes_ucontext = to_nesucontext(context); + if (ib_copy_from_udata(&req, udata, sizeof (struct nes_create_cq_req))) { + nes_free_resource(nesadapter, nesadapter->allocated_cqs, cq_num); + kfree(nescq); + return ERR_PTR(-EFAULT); + } + nesvnic->mcrq_ucontext = nes_ucontext; + nes_ucontext->mcrqf = req.mcrqf; + if (nes_ucontext->mcrqf) { + if (nes_ucontext->mcrqf & 0x80000000) + nescq->hw_cq.cq_number = nesvnic->nic.qp_id + 28 + 2 * ((nes_ucontext->mcrqf & 0xf) - 1); + else if (nes_ucontext->mcrqf & 0x40000000) + nescq->hw_cq.cq_number = nes_ucontext->mcrqf & 0xffff; + else + nescq->hw_cq.cq_number = nesvnic->mcrq_qp_id + nes_ucontext->mcrqf-1; + nescq->mcrqf = nes_ucontext->mcrqf; + nes_free_resource(nesadapter, nesadapter->allocated_cqs, cq_num); + } + nes_debug(NES_DBG_CQ, "CQ Virtual Address = %08lX, size = %u.\n", + (unsigned long)req.user_cq_buffer, entries); + err = 1; + list_for_each_entry(nespbl, &nes_ucontext->cq_reg_mem_list, list) { + if (nespbl->user_base == (unsigned long )req.user_cq_buffer) { + list_del(&nespbl->list); + err = 0; + nes_debug(NES_DBG_CQ, "Found PBL for virtual CQ. 
nespbl=%p.\n", + nespbl); + break; + } + } + if (err) { + nes_free_resource(nesadapter, nesadapter->allocated_cqs, cq_num); + kfree(nescq); + return ERR_PTR(-EFAULT); + } + + pbl_entries = nespbl->pbl_size >> 3; + nescq->cq_mem_size = 0; + } else { + nescq->cq_mem_size = nescq->hw_cq.cq_size * sizeof(struct nes_hw_cqe); + nes_debug(NES_DBG_CQ, "Attempting to allocate pci memory (%u entries, %u bytes) for CQ%u.\n", + entries, nescq->cq_mem_size, nescq->hw_cq.cq_number); + + /* allocate the physical buffer space */ + mem = pci_zalloc_consistent(nesdev->pcidev, nescq->cq_mem_size, + &nescq->hw_cq.cq_pbase); + if (!mem) { + printk(KERN_ERR PFX "Unable to allocate pci memory for cq\n"); + nes_free_resource(nesadapter, nesadapter->allocated_cqs, cq_num); + kfree(nescq); + return ERR_PTR(-ENOMEM); + } + + nescq->hw_cq.cq_vbase = mem; + nescq->hw_cq.cq_head = 0; + nes_debug(NES_DBG_CQ, "CQ%u virtual address @ %p, phys = 0x%08X\n", + nescq->hw_cq.cq_number, nescq->hw_cq.cq_vbase, + (u32)nescq->hw_cq.cq_pbase); + } + + nescq->hw_cq.ce_handler = nes_iwarp_ce_handler; + spin_lock_init(&nescq->lock); + + /* send CreateCQ request to CQP */ + cqp_request = nes_get_cqp_request(nesdev); + if (cqp_request == NULL) { + nes_debug(NES_DBG_CQ, "Failed to get a cqp_request.\n"); + if (!context) + pci_free_consistent(nesdev->pcidev, nescq->cq_mem_size, mem, + nescq->hw_cq.cq_pbase); + else { + pci_free_consistent(nesdev->pcidev, nespbl->pbl_size, + nespbl->pbl_vbase, nespbl->pbl_pbase); + kfree(nespbl); + } + + nes_free_resource(nesadapter, nesadapter->allocated_cqs, cq_num); + kfree(nescq); + return ERR_PTR(-ENOMEM); + } + cqp_request->waiting = 1; + cqp_wqe = &cqp_request->cqp_wqe; + + opcode = NES_CQP_CREATE_CQ | NES_CQP_CQ_CEQ_VALID | + NES_CQP_CQ_CHK_OVERFLOW | + NES_CQP_CQ_CEQE_MASK | ((u32)nescq->hw_cq.cq_size << 16); + + spin_lock_irqsave(&nesadapter->pbl_lock, flags); + + if (pbl_entries != 1) { + if (pbl_entries > 32) { + /* use 4k pbl */ + nes_debug(NES_DBG_CQ, "pbl_entries=%u, use a 4k PBL\n", pbl_entries); + if (nesadapter->free_4kpbl == 0) { + spin_unlock_irqrestore(&nesadapter->pbl_lock, flags); + nes_free_cqp_request(nesdev, cqp_request); + if (!context) + pci_free_consistent(nesdev->pcidev, nescq->cq_mem_size, mem, + nescq->hw_cq.cq_pbase); + else { + pci_free_consistent(nesdev->pcidev, nespbl->pbl_size, + nespbl->pbl_vbase, nespbl->pbl_pbase); + kfree(nespbl); + } + nes_free_resource(nesadapter, nesadapter->allocated_cqs, cq_num); + kfree(nescq); + return ERR_PTR(-ENOMEM); + } else { + opcode |= (NES_CQP_CQ_VIRT | NES_CQP_CQ_4KB_CHUNK); + nescq->virtual_cq = 2; + nesadapter->free_4kpbl--; + } + } else { + /* use 256 byte pbl */ + nes_debug(NES_DBG_CQ, "pbl_entries=%u, use a 256 byte PBL\n", pbl_entries); + if (nesadapter->free_256pbl == 0) { + spin_unlock_irqrestore(&nesadapter->pbl_lock, flags); + nes_free_cqp_request(nesdev, cqp_request); + if (!context) + pci_free_consistent(nesdev->pcidev, nescq->cq_mem_size, mem, + nescq->hw_cq.cq_pbase); + else { + pci_free_consistent(nesdev->pcidev, nespbl->pbl_size, + nespbl->pbl_vbase, nespbl->pbl_pbase); + kfree(nespbl); + } + nes_free_resource(nesadapter, nesadapter->allocated_cqs, cq_num); + kfree(nescq); + return ERR_PTR(-ENOMEM); + } else { + opcode |= NES_CQP_CQ_VIRT; + nescq->virtual_cq = 1; + nesadapter->free_256pbl--; + } + } + } + + spin_unlock_irqrestore(&nesadapter->pbl_lock, flags); + + nes_fill_init_cqp_wqe(cqp_wqe, nesdev); + set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_WQE_OPCODE_IDX, opcode); + set_wqe_32bit_value(cqp_wqe->wqe_words, 
NES_CQP_WQE_ID_IDX, + (nescq->hw_cq.cq_number | ((u32)nesdev->ceq_index << 16))); + + if (context) { + if (pbl_entries != 1) + u64temp = (u64)nespbl->pbl_pbase; + else + u64temp = le64_to_cpu(nespbl->pbl_vbase[0]); + set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_CQ_WQE_DOORBELL_INDEX_HIGH_IDX, + nes_ucontext->mmap_db_index[0]); + } else { + u64temp = (u64)nescq->hw_cq.cq_pbase; + cqp_wqe->wqe_words[NES_CQP_CQ_WQE_DOORBELL_INDEX_HIGH_IDX] = 0; + } + set_wqe_64bit_value(cqp_wqe->wqe_words, NES_CQP_CQ_WQE_PBL_LOW_IDX, u64temp); + cqp_wqe->wqe_words[NES_CQP_CQ_WQE_CQ_CONTEXT_HIGH_IDX] = 0; + u64temp = (u64)(unsigned long)&nescq->hw_cq; + cqp_wqe->wqe_words[NES_CQP_CQ_WQE_CQ_CONTEXT_LOW_IDX] = + cpu_to_le32((u32)(u64temp >> 1)); + cqp_wqe->wqe_words[NES_CQP_CQ_WQE_CQ_CONTEXT_HIGH_IDX] = + cpu_to_le32(((u32)((u64temp) >> 33)) & 0x7FFFFFFF); + + atomic_set(&cqp_request->refcount, 2); + nes_post_cqp_request(nesdev, cqp_request); + + /* Wait for CQP */ + nes_debug(NES_DBG_CQ, "Waiting for create iWARP CQ%u to complete.\n", + nescq->hw_cq.cq_number); + ret = wait_event_timeout(cqp_request->waitq, (0 != cqp_request->request_done), + NES_EVENT_TIMEOUT * 2); + nes_debug(NES_DBG_CQ, "Create iWARP CQ%u completed, wait_event_timeout ret = %d.\n", + nescq->hw_cq.cq_number, ret); + if ((!ret) || (cqp_request->major_code)) { + nes_put_cqp_request(nesdev, cqp_request); + if (!context) + pci_free_consistent(nesdev->pcidev, nescq->cq_mem_size, mem, + nescq->hw_cq.cq_pbase); + else { + pci_free_consistent(nesdev->pcidev, nespbl->pbl_size, + nespbl->pbl_vbase, nespbl->pbl_pbase); + kfree(nespbl); + } + nes_free_resource(nesadapter, nesadapter->allocated_cqs, cq_num); + kfree(nescq); + return ERR_PTR(-EIO); + } + nes_put_cqp_request(nesdev, cqp_request); + + if (context) { + /* free the nespbl */ + pci_free_consistent(nesdev->pcidev, nespbl->pbl_size, nespbl->pbl_vbase, + nespbl->pbl_pbase); + kfree(nespbl); + resp.cq_id = nescq->hw_cq.cq_number; + resp.cq_size = nescq->hw_cq.cq_size; + resp.mmap_db_index = 0; + if (ib_copy_to_udata(udata, &resp, sizeof resp - sizeof resp.reserved)) { + nes_free_resource(nesadapter, nesadapter->allocated_cqs, cq_num); + kfree(nescq); + return ERR_PTR(-EFAULT); + } + } + + return &nescq->ibcq; +} + + +/** + * nes_destroy_cq + */ +static int nes_destroy_cq(struct ib_cq *ib_cq) +{ + struct nes_cq *nescq; + struct nes_device *nesdev; + struct nes_vnic *nesvnic; + struct nes_adapter *nesadapter; + struct nes_hw_cqp_wqe *cqp_wqe; + struct nes_cqp_request *cqp_request; + unsigned long flags; + u32 opcode = 0; + int ret; + + if (ib_cq == NULL) + return 0; + + nescq = to_nescq(ib_cq); + nesvnic = to_nesvnic(ib_cq->device); + nesdev = nesvnic->nesdev; + nesadapter = nesdev->nesadapter; + + nes_debug(NES_DBG_CQ, "Destroy CQ%u\n", nescq->hw_cq.cq_number); + + /* Send DestroyCQ request to CQP */ + cqp_request = nes_get_cqp_request(nesdev); + if (cqp_request == NULL) { + nes_debug(NES_DBG_CQ, "Failed to get a cqp_request.\n"); + return -ENOMEM; + } + cqp_request->waiting = 1; + cqp_wqe = &cqp_request->cqp_wqe; + opcode = NES_CQP_DESTROY_CQ | (nescq->hw_cq.cq_size << 16); + spin_lock_irqsave(&nesadapter->pbl_lock, flags); + if (nescq->virtual_cq == 1) { + nesadapter->free_256pbl++; + if (nesadapter->free_256pbl > nesadapter->max_256pbl) { + printk(KERN_ERR PFX "%s: free 256B PBLs(%u) has exceeded the max(%u)\n", + __func__, nesadapter->free_256pbl, nesadapter->max_256pbl); + } + } else if (nescq->virtual_cq == 2) { + nesadapter->free_4kpbl++; + if (nesadapter->free_4kpbl > 
nesadapter->max_4kpbl) { + printk(KERN_ERR PFX "%s: free 4K PBLs(%u) has exceeded the max(%u)\n", + __func__, nesadapter->free_4kpbl, nesadapter->max_4kpbl); + } + opcode |= NES_CQP_CQ_4KB_CHUNK; + } + + spin_unlock_irqrestore(&nesadapter->pbl_lock, flags); + + nes_fill_init_cqp_wqe(cqp_wqe, nesdev); + set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_WQE_OPCODE_IDX, opcode); + set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_WQE_ID_IDX, + (nescq->hw_cq.cq_number | ((u32)PCI_FUNC(nesdev->pcidev->devfn) << 16))); + if (!nescq->mcrqf) + nes_free_resource(nesadapter, nesadapter->allocated_cqs, nescq->hw_cq.cq_number); + + atomic_set(&cqp_request->refcount, 2); + nes_post_cqp_request(nesdev, cqp_request); + + /* Wait for CQP */ + nes_debug(NES_DBG_CQ, "Waiting for destroy iWARP CQ%u to complete.\n", + nescq->hw_cq.cq_number); + ret = wait_event_timeout(cqp_request->waitq, (0 != cqp_request->request_done), + NES_EVENT_TIMEOUT); + nes_debug(NES_DBG_CQ, "Destroy iWARP CQ%u completed, wait_event_timeout ret = %u," + " CQP Major:Minor codes = 0x%04X:0x%04X.\n", + nescq->hw_cq.cq_number, ret, cqp_request->major_code, + cqp_request->minor_code); + if (!ret) { + nes_debug(NES_DBG_CQ, "iWARP CQ%u destroy timeout expired\n", + nescq->hw_cq.cq_number); + ret = -ETIME; + } else if (cqp_request->major_code) { + nes_debug(NES_DBG_CQ, "iWARP CQ%u destroy failed\n", + nescq->hw_cq.cq_number); + ret = -EIO; + } else { + ret = 0; + } + nes_put_cqp_request(nesdev, cqp_request); + + if (nescq->cq_mem_size) + pci_free_consistent(nesdev->pcidev, nescq->cq_mem_size, + nescq->hw_cq.cq_vbase, nescq->hw_cq.cq_pbase); + kfree(nescq); + + return ret; +} + +/** + * root_256 + */ +static u32 root_256(struct nes_device *nesdev, + struct nes_root_vpbl *root_vpbl, + struct nes_root_vpbl *new_root, + u16 pbl_count_4k) +{ + u64 leaf_pbl; + int i, j, k; + + if (pbl_count_4k == 1) { + new_root->pbl_vbase = pci_alloc_consistent(nesdev->pcidev, + 512, &new_root->pbl_pbase); + + if (new_root->pbl_vbase == NULL) + return 0; + + leaf_pbl = (u64)root_vpbl->pbl_pbase; + for (i = 0; i < 16; i++) { + new_root->pbl_vbase[i].pa_low = + cpu_to_le32((u32)leaf_pbl); + new_root->pbl_vbase[i].pa_high = + cpu_to_le32((u32)((((u64)leaf_pbl) >> 32))); + leaf_pbl += 256; + } + } else { + for (i = 3; i >= 0; i--) { + j = i * 16; + root_vpbl->pbl_vbase[j] = root_vpbl->pbl_vbase[i]; + leaf_pbl = le32_to_cpu(root_vpbl->pbl_vbase[j].pa_low) + + (((u64)le32_to_cpu(root_vpbl->pbl_vbase[j].pa_high)) + << 32); + for (k = 1; k < 16; k++) { + leaf_pbl += 256; + root_vpbl->pbl_vbase[j + k].pa_low = + cpu_to_le32((u32)leaf_pbl); + root_vpbl->pbl_vbase[j + k].pa_high = + cpu_to_le32((u32)((((u64)leaf_pbl) >> 32))); + } + } + } + + return 1; +} + + +/** + * nes_reg_mr + */ +static int nes_reg_mr(struct nes_device *nesdev, struct nes_pd *nespd, + u32 stag, u64 region_length, struct nes_root_vpbl *root_vpbl, + dma_addr_t single_buffer, u16 pbl_count_4k, + u16 residual_page_count_4k, int acc, u64 *iova_start, + u16 *actual_pbl_cnt, u8 *used_4k_pbls) +{ + struct nes_hw_cqp_wqe *cqp_wqe; + struct nes_cqp_request *cqp_request; + unsigned long flags; + int ret; + struct nes_adapter *nesadapter = nesdev->nesadapter; + uint pg_cnt = 0; + u16 pbl_count_256 = 0; + u16 pbl_count = 0; + u8 use_256_pbls = 0; + u8 use_4k_pbls = 0; + u16 use_two_level = (pbl_count_4k > 1) ? 
1 : 0; + struct nes_root_vpbl new_root = { 0, NULL, NULL }; + u32 opcode = 0; + u16 major_code; + + /* Register the region with the adapter */ + cqp_request = nes_get_cqp_request(nesdev); + if (cqp_request == NULL) { + nes_debug(NES_DBG_MR, "Failed to get a cqp_request.\n"); + return -ENOMEM; + } + cqp_request->waiting = 1; + cqp_wqe = &cqp_request->cqp_wqe; + + if (pbl_count_4k) { + spin_lock_irqsave(&nesadapter->pbl_lock, flags); + + pg_cnt = ((pbl_count_4k - 1) * 512) + residual_page_count_4k; + pbl_count_256 = (pg_cnt + 31) / 32; + if (pg_cnt <= 32) { + if (pbl_count_256 <= nesadapter->free_256pbl) + use_256_pbls = 1; + else if (pbl_count_4k <= nesadapter->free_4kpbl) + use_4k_pbls = 1; + } else if (pg_cnt <= 2048) { + if (((pbl_count_4k + use_two_level) <= nesadapter->free_4kpbl) && + (nesadapter->free_4kpbl > (nesadapter->max_4kpbl >> 1))) { + use_4k_pbls = 1; + } else if ((pbl_count_256 + 1) <= nesadapter->free_256pbl) { + use_256_pbls = 1; + use_two_level = 1; + } else if ((pbl_count_4k + use_two_level) <= nesadapter->free_4kpbl) { + use_4k_pbls = 1; + } + } else { + if ((pbl_count_4k + 1) <= nesadapter->free_4kpbl) + use_4k_pbls = 1; + } + + if (use_256_pbls) { + pbl_count = pbl_count_256; + nesadapter->free_256pbl -= pbl_count + use_two_level; + } else if (use_4k_pbls) { + pbl_count = pbl_count_4k; + nesadapter->free_4kpbl -= pbl_count + use_two_level; + } else { + spin_unlock_irqrestore(&nesadapter->pbl_lock, flags); + nes_debug(NES_DBG_MR, "Out of Pbls\n"); + nes_free_cqp_request(nesdev, cqp_request); + return -ENOMEM; + } + + spin_unlock_irqrestore(&nesadapter->pbl_lock, flags); + } + + if (use_256_pbls && use_two_level) { + if (root_256(nesdev, root_vpbl, &new_root, pbl_count_4k) == 1) { + if (new_root.pbl_pbase != 0) + root_vpbl = &new_root; + } else { + spin_lock_irqsave(&nesadapter->pbl_lock, flags); + nesadapter->free_256pbl += pbl_count_256 + use_two_level; + use_256_pbls = 0; + + if (pbl_count_4k == 1) + use_two_level = 0; + pbl_count = pbl_count_4k; + + if ((pbl_count_4k + use_two_level) <= nesadapter->free_4kpbl) { + nesadapter->free_4kpbl -= pbl_count + use_two_level; + use_4k_pbls = 1; + } + spin_unlock_irqrestore(&nesadapter->pbl_lock, flags); + + if (use_4k_pbls == 0) + return -ENOMEM; + } + } + + opcode = NES_CQP_REGISTER_STAG | NES_CQP_STAG_RIGHTS_LOCAL_READ | + NES_CQP_STAG_VA_TO | NES_CQP_STAG_MR; + if (acc & IB_ACCESS_LOCAL_WRITE) + opcode |= NES_CQP_STAG_RIGHTS_LOCAL_WRITE; + if (acc & IB_ACCESS_REMOTE_WRITE) + opcode |= NES_CQP_STAG_RIGHTS_REMOTE_WRITE | NES_CQP_STAG_REM_ACC_EN; + if (acc & IB_ACCESS_REMOTE_READ) + opcode |= NES_CQP_STAG_RIGHTS_REMOTE_READ | NES_CQP_STAG_REM_ACC_EN; + if (acc & IB_ACCESS_MW_BIND) + opcode |= NES_CQP_STAG_RIGHTS_WINDOW_BIND | NES_CQP_STAG_REM_ACC_EN; + + nes_fill_init_cqp_wqe(cqp_wqe, nesdev); + set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_WQE_OPCODE_IDX, opcode); + set_wqe_64bit_value(cqp_wqe->wqe_words, NES_CQP_STAG_WQE_VA_LOW_IDX, *iova_start); + set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_STAG_WQE_LEN_LOW_IDX, region_length); + + cqp_wqe->wqe_words[NES_CQP_STAG_WQE_LEN_HIGH_PD_IDX] = + cpu_to_le32((u32)(region_length >> 8) & 0xff000000); + cqp_wqe->wqe_words[NES_CQP_STAG_WQE_LEN_HIGH_PD_IDX] |= + cpu_to_le32(nespd->pd_id & 0x00007fff); + set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_STAG_WQE_STAG_IDX, stag); + + if (pbl_count == 0) { + set_wqe_64bit_value(cqp_wqe->wqe_words, NES_CQP_STAG_WQE_PA_LOW_IDX, single_buffer); + } else { + set_wqe_64bit_value(cqp_wqe->wqe_words, NES_CQP_STAG_WQE_PA_LOW_IDX, 
root_vpbl->pbl_pbase); + set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_STAG_WQE_PBL_BLK_COUNT_IDX, pbl_count); + set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_STAG_WQE_PBL_LEN_IDX, (pg_cnt * 8)); + + if (use_4k_pbls) + cqp_wqe->wqe_words[NES_CQP_WQE_OPCODE_IDX] |= cpu_to_le32(NES_CQP_STAG_PBL_BLK_SIZE); + } + barrier(); + + atomic_set(&cqp_request->refcount, 2); + nes_post_cqp_request(nesdev, cqp_request); + + /* Wait for CQP */ + ret = wait_event_timeout(cqp_request->waitq, (0 != cqp_request->request_done), + NES_EVENT_TIMEOUT); + nes_debug(NES_DBG_MR, "Register STag 0x%08X completed, wait_event_timeout ret = %u," + " CQP Major:Minor codes = 0x%04X:0x%04X.\n", + stag, ret, cqp_request->major_code, cqp_request->minor_code); + major_code = cqp_request->major_code; + nes_put_cqp_request(nesdev, cqp_request); + + if ((!ret || major_code) && pbl_count != 0) { + spin_lock_irqsave(&nesadapter->pbl_lock, flags); + if (use_256_pbls) + nesadapter->free_256pbl += pbl_count + use_two_level; + else if (use_4k_pbls) + nesadapter->free_4kpbl += pbl_count + use_two_level; + spin_unlock_irqrestore(&nesadapter->pbl_lock, flags); + } + if (new_root.pbl_pbase) + pci_free_consistent(nesdev->pcidev, 512, new_root.pbl_vbase, + new_root.pbl_pbase); + + if (!ret) + return -ETIME; + else if (major_code) + return -EIO; + + *actual_pbl_cnt = pbl_count + use_two_level; + *used_4k_pbls = use_4k_pbls; + return 0; +} + + +/** + * nes_reg_phys_mr + */ +static struct ib_mr *nes_reg_phys_mr(struct ib_pd *ib_pd, + struct ib_phys_buf *buffer_list, int num_phys_buf, int acc, + u64 * iova_start) +{ + u64 region_length; + struct nes_pd *nespd = to_nespd(ib_pd); + struct nes_vnic *nesvnic = to_nesvnic(ib_pd->device); + struct nes_device *nesdev = nesvnic->nesdev; + struct nes_adapter *nesadapter = nesdev->nesadapter; + struct nes_mr *nesmr; + struct ib_mr *ibmr; + struct nes_vpbl vpbl; + struct nes_root_vpbl root_vpbl; + u32 stag; + u32 i; + unsigned long mask; + u32 stag_index = 0; + u32 next_stag_index = 0; + u32 driver_key = 0; + u32 root_pbl_index = 0; + u32 cur_pbl_index = 0; + int err = 0; + int ret = 0; + u16 pbl_count = 0; + u8 single_page = 1; + u8 stag_key = 0; + + region_length = 0; + vpbl.pbl_vbase = NULL; + root_vpbl.pbl_vbase = NULL; + root_vpbl.pbl_pbase = 0; + + get_random_bytes(&next_stag_index, sizeof(next_stag_index)); + stag_key = (u8)next_stag_index; + + driver_key = 0; + + next_stag_index >>= 8; + next_stag_index %= nesadapter->max_mr; + if (num_phys_buf > (1024*512)) { + return ERR_PTR(-E2BIG); + } + + if ((buffer_list[0].addr ^ *iova_start) & ~PAGE_MASK) + return ERR_PTR(-EINVAL); + + err = nes_alloc_resource(nesadapter, nesadapter->allocated_mrs, nesadapter->max_mr, + &stag_index, &next_stag_index, NES_RESOURCE_PHYS_MR); + if (err) { + return ERR_PTR(err); + } + + nesmr = kzalloc(sizeof(*nesmr), GFP_KERNEL); + if (!nesmr) { + nes_free_resource(nesadapter, nesadapter->allocated_mrs, stag_index); + return ERR_PTR(-ENOMEM); + } + + for (i = 0; i < num_phys_buf; i++) { + + if ((i & 0x01FF) == 0) { + if (root_pbl_index == 1) { + /* Allocate the root PBL */ + root_vpbl.pbl_vbase = pci_alloc_consistent(nesdev->pcidev, 8192, + &root_vpbl.pbl_pbase); + nes_debug(NES_DBG_MR, "Allocating root PBL, va = %p, pa = 0x%08X\n", + root_vpbl.pbl_vbase, (unsigned int)root_vpbl.pbl_pbase); + if (!root_vpbl.pbl_vbase) { + pci_free_consistent(nesdev->pcidev, 4096, vpbl.pbl_vbase, + vpbl.pbl_pbase); + nes_free_resource(nesadapter, nesadapter->allocated_mrs, stag_index); + kfree(nesmr); + return ERR_PTR(-ENOMEM); + } + 
root_vpbl.leaf_vpbl = kzalloc(sizeof(*root_vpbl.leaf_vpbl)*1024, GFP_KERNEL); + if (!root_vpbl.leaf_vpbl) { + pci_free_consistent(nesdev->pcidev, 8192, root_vpbl.pbl_vbase, + root_vpbl.pbl_pbase); + pci_free_consistent(nesdev->pcidev, 4096, vpbl.pbl_vbase, + vpbl.pbl_pbase); + nes_free_resource(nesadapter, nesadapter->allocated_mrs, stag_index); + kfree(nesmr); + return ERR_PTR(-ENOMEM); + } + root_vpbl.pbl_vbase[0].pa_low = cpu_to_le32((u32)vpbl.pbl_pbase); + root_vpbl.pbl_vbase[0].pa_high = + cpu_to_le32((u32)((((u64)vpbl.pbl_pbase) >> 32))); + root_vpbl.leaf_vpbl[0] = vpbl; + } + /* Allocate a 4K buffer for the PBL */ + vpbl.pbl_vbase = pci_alloc_consistent(nesdev->pcidev, 4096, + &vpbl.pbl_pbase); + nes_debug(NES_DBG_MR, "Allocating leaf PBL, va = %p, pa = 0x%016lX\n", + vpbl.pbl_vbase, (unsigned long)vpbl.pbl_pbase); + if (!vpbl.pbl_vbase) { + nes_free_resource(nesadapter, nesadapter->allocated_mrs, stag_index); + ibmr = ERR_PTR(-ENOMEM); + kfree(nesmr); + goto reg_phys_err; + } + /* Fill in the root table */ + if (1 <= root_pbl_index) { + root_vpbl.pbl_vbase[root_pbl_index].pa_low = + cpu_to_le32((u32)vpbl.pbl_pbase); + root_vpbl.pbl_vbase[root_pbl_index].pa_high = + cpu_to_le32((u32)((((u64)vpbl.pbl_pbase) >> 32))); + root_vpbl.leaf_vpbl[root_pbl_index] = vpbl; + } + root_pbl_index++; + cur_pbl_index = 0; + } + + mask = !buffer_list[i].size; + if (i != 0) + mask |= buffer_list[i].addr; + if (i != num_phys_buf - 1) + mask |= buffer_list[i].addr + buffer_list[i].size; + + if (mask & ~PAGE_MASK) { + nes_free_resource(nesadapter, nesadapter->allocated_mrs, stag_index); + nes_debug(NES_DBG_MR, "Invalid buffer addr or size\n"); + ibmr = ERR_PTR(-EINVAL); + kfree(nesmr); + goto reg_phys_err; + } + + region_length += buffer_list[i].size; + if ((i != 0) && (single_page)) { + if ((buffer_list[i-1].addr+PAGE_SIZE) != buffer_list[i].addr) + single_page = 0; + } + vpbl.pbl_vbase[cur_pbl_index].pa_low = cpu_to_le32((u32)buffer_list[i].addr & PAGE_MASK); + vpbl.pbl_vbase[cur_pbl_index++].pa_high = + cpu_to_le32((u32)((((u64)buffer_list[i].addr) >> 32))); + } + + stag = stag_index << 8; + stag |= driver_key; + stag += (u32)stag_key; + + nes_debug(NES_DBG_MR, "Registering STag 0x%08X, VA = 0x%016lX," + " length = 0x%016lX, index = 0x%08X\n", + stag, (unsigned long)*iova_start, (unsigned long)region_length, stag_index); + + /* Make the leaf PBL the root if only one PBL */ + if (root_pbl_index == 1) { + root_vpbl.pbl_pbase = vpbl.pbl_pbase; + } + + if (single_page) { + pbl_count = 0; + } else { + pbl_count = root_pbl_index; + } + ret = nes_reg_mr(nesdev, nespd, stag, region_length, &root_vpbl, + buffer_list[0].addr, pbl_count, (u16)cur_pbl_index, acc, iova_start, + &nesmr->pbls_used, &nesmr->pbl_4k); + + if (ret == 0) { + nesmr->ibmr.rkey = stag; + nesmr->ibmr.lkey = stag; + nesmr->mode = IWNES_MEMREG_TYPE_MEM; + ibmr = &nesmr->ibmr; + } else { + kfree(nesmr); + ibmr = ERR_PTR(-ENOMEM); + } + + reg_phys_err: + /* free the resources */ + if (root_pbl_index == 1) { + /* single PBL case */ + pci_free_consistent(nesdev->pcidev, 4096, vpbl.pbl_vbase, vpbl.pbl_pbase); + } else { + for (i=0; i<root_pbl_index; i++) { + pci_free_consistent(nesdev->pcidev, 4096, root_vpbl.leaf_vpbl[i].pbl_vbase, + root_vpbl.leaf_vpbl[i].pbl_pbase); + } + kfree(root_vpbl.leaf_vpbl); + pci_free_consistent(nesdev->pcidev, 8192, root_vpbl.pbl_vbase, + root_vpbl.pbl_pbase); + } + + return ibmr; +} + + +/** + * nes_get_dma_mr + */ +static struct ib_mr *nes_get_dma_mr(struct ib_pd *pd, int acc) +{ + struct ib_phys_buf bl; + u64 kva = 0; + + nes_debug(NES_DBG_MR, "\n"); + + bl.size =
(u64)0xffffffffffULL; + bl.addr = 0; + return nes_reg_phys_mr(pd, &bl, 1, acc, &kva); +} + + +/** + * nes_reg_user_mr + */ +static struct ib_mr *nes_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, + u64 virt, int acc, struct ib_udata *udata) +{ + u64 iova_start; + __le64 *pbl; + u64 region_length; + dma_addr_t last_dma_addr = 0; + dma_addr_t first_dma_addr = 0; + struct nes_pd *nespd = to_nespd(pd); + struct nes_vnic *nesvnic = to_nesvnic(pd->device); + struct nes_device *nesdev = nesvnic->nesdev; + struct nes_adapter *nesadapter = nesdev->nesadapter; + struct ib_mr *ibmr = ERR_PTR(-EINVAL); + struct scatterlist *sg; + struct nes_ucontext *nes_ucontext; + struct nes_pbl *nespbl; + struct nes_mr *nesmr; + struct ib_umem *region; + struct nes_mem_reg_req req; + struct nes_vpbl vpbl; + struct nes_root_vpbl root_vpbl; + int entry, page_index; + int page_count = 0; + int err, pbl_depth = 0; + int chunk_pages; + int ret; + u32 stag; + u32 stag_index = 0; + u32 next_stag_index; + u32 driver_key; + u32 root_pbl_index = 0; + u32 cur_pbl_index = 0; + u32 skip_pages; + u16 pbl_count; + u8 single_page = 1; + u8 stag_key; + int first_page = 1; + + region = ib_umem_get(pd->uobject->context, start, length, acc, 0); + if (IS_ERR(region)) { + return (struct ib_mr *)region; + } + + nes_debug(NES_DBG_MR, "User base = 0x%lX, Virt base = 0x%lX, length = %u," + " offset = %u, page size = %u.\n", + (unsigned long int)start, (unsigned long int)virt, (u32)length, + ib_umem_offset(region), region->page_size); + + skip_pages = ((u32)ib_umem_offset(region)) >> 12; + + if (ib_copy_from_udata(&req, udata, sizeof(req))) { + ib_umem_release(region); + return ERR_PTR(-EFAULT); + } + nes_debug(NES_DBG_MR, "Memory Registration type = %08X.\n", req.reg_type); + + switch (req.reg_type) { + case IWNES_MEMREG_TYPE_MEM: + pbl_depth = 0; + region_length = 0; + vpbl.pbl_vbase = NULL; + root_vpbl.pbl_vbase = NULL; + root_vpbl.pbl_pbase = 0; + + get_random_bytes(&next_stag_index, sizeof(next_stag_index)); + stag_key = (u8)next_stag_index; + + driver_key = next_stag_index & 0x70000000; + + next_stag_index >>= 8; + next_stag_index %= nesadapter->max_mr; + + err = nes_alloc_resource(nesadapter, nesadapter->allocated_mrs, + nesadapter->max_mr, &stag_index, &next_stag_index, NES_RESOURCE_USER_MR); + if (err) { + ib_umem_release(region); + return ERR_PTR(err); + } + + nesmr = kzalloc(sizeof(*nesmr), GFP_KERNEL); + if (!nesmr) { + ib_umem_release(region); + nes_free_resource(nesadapter, nesadapter->allocated_mrs, stag_index); + return ERR_PTR(-ENOMEM); + } + nesmr->region = region; + + for_each_sg(region->sg_head.sgl, sg, region->nmap, entry) { + if (sg_dma_address(sg) & ~PAGE_MASK) { + ib_umem_release(region); + nes_free_resource(nesadapter, nesadapter->allocated_mrs, stag_index); + nes_debug(NES_DBG_MR, "Unaligned Memory Buffer: 0x%x\n", + (unsigned int) sg_dma_address(sg)); + ibmr = ERR_PTR(-EINVAL); + kfree(nesmr); + goto reg_user_mr_err; + } + + if (!sg_dma_len(sg)) { + ib_umem_release(region); + nes_free_resource(nesadapter, nesadapter->allocated_mrs, + stag_index); + nes_debug(NES_DBG_MR, "Invalid Buffer Size\n"); + ibmr = ERR_PTR(-EINVAL); + kfree(nesmr); + goto reg_user_mr_err; + } + + region_length += sg_dma_len(sg); + chunk_pages = sg_dma_len(sg) >> 12; + region_length -= skip_pages << 12; + for (page_index = skip_pages; page_index < chunk_pages; page_index++) { + skip_pages = 0; + if ((page_count != 0) && (page_count << 12) - (ib_umem_offset(region) & (4096 - 1)) >= region->length) + goto enough_pages; + if 
((page_count&0x01FF) == 0) { + if (page_count >= 1024 * 512) { + ib_umem_release(region); + nes_free_resource(nesadapter, + nesadapter->allocated_mrs, stag_index); + kfree(nesmr); + ibmr = ERR_PTR(-E2BIG); + goto reg_user_mr_err; + } + if (root_pbl_index == 1) { + root_vpbl.pbl_vbase = pci_alloc_consistent(nesdev->pcidev, + 8192, &root_vpbl.pbl_pbase); + nes_debug(NES_DBG_MR, "Allocating root PBL, va = %p, pa = 0x%08X\n", + root_vpbl.pbl_vbase, (unsigned int)root_vpbl.pbl_pbase); + if (!root_vpbl.pbl_vbase) { + ib_umem_release(region); + pci_free_consistent(nesdev->pcidev, 4096, vpbl.pbl_vbase, + vpbl.pbl_pbase); + nes_free_resource(nesadapter, nesadapter->allocated_mrs, + stag_index); + kfree(nesmr); + ibmr = ERR_PTR(-ENOMEM); + goto reg_user_mr_err; + } + root_vpbl.leaf_vpbl = kzalloc(sizeof(*root_vpbl.leaf_vpbl)*1024, + GFP_KERNEL); + if (!root_vpbl.leaf_vpbl) { + ib_umem_release(region); + pci_free_consistent(nesdev->pcidev, 8192, root_vpbl.pbl_vbase, + root_vpbl.pbl_pbase); + pci_free_consistent(nesdev->pcidev, 4096, vpbl.pbl_vbase, + vpbl.pbl_pbase); + nes_free_resource(nesadapter, nesadapter->allocated_mrs, + stag_index); + kfree(nesmr); + ibmr = ERR_PTR(-ENOMEM); + goto reg_user_mr_err; + } + root_vpbl.pbl_vbase[0].pa_low = + cpu_to_le32((u32)vpbl.pbl_pbase); + root_vpbl.pbl_vbase[0].pa_high = + cpu_to_le32((u32)((((u64)vpbl.pbl_pbase) >> 32))); + root_vpbl.leaf_vpbl[0] = vpbl; + } + vpbl.pbl_vbase = pci_alloc_consistent(nesdev->pcidev, 4096, + &vpbl.pbl_pbase); + nes_debug(NES_DBG_MR, "Allocating leaf PBL, va = %p, pa = 0x%08X\n", + vpbl.pbl_vbase, (unsigned int)vpbl.pbl_pbase); + if (!vpbl.pbl_vbase) { + ib_umem_release(region); + nes_free_resource(nesadapter, nesadapter->allocated_mrs, stag_index); + ibmr = ERR_PTR(-ENOMEM); + kfree(nesmr); + goto reg_user_mr_err; + } + if (1 <= root_pbl_index) { + root_vpbl.pbl_vbase[root_pbl_index].pa_low = + cpu_to_le32((u32)vpbl.pbl_pbase); + root_vpbl.pbl_vbase[root_pbl_index].pa_high = + cpu_to_le32((u32)((((u64)vpbl.pbl_pbase)>>32))); + root_vpbl.leaf_vpbl[root_pbl_index] = vpbl; + } + root_pbl_index++; + cur_pbl_index = 0; + } + if (single_page) { + if (page_count != 0) { + if ((last_dma_addr+4096) != + (sg_dma_address(sg)+ + (page_index*4096))) + single_page = 0; + last_dma_addr = sg_dma_address(sg)+ + (page_index*4096); + } else { + first_dma_addr = sg_dma_address(sg)+ + (page_index*4096); + last_dma_addr = first_dma_addr; + } + } + + vpbl.pbl_vbase[cur_pbl_index].pa_low = + cpu_to_le32((u32)(sg_dma_address(sg)+ + (page_index*4096))); + vpbl.pbl_vbase[cur_pbl_index].pa_high = + cpu_to_le32((u32)((((u64)(sg_dma_address(sg)+ + (page_index*4096))) >> 32))); + cur_pbl_index++; + page_count++; + } + } + + enough_pages: + nes_debug(NES_DBG_MR, "calculating stag, stag_index=0x%08x, driver_key=0x%08x," + " stag_key=0x%08x\n", + stag_index, driver_key, stag_key); + stag = stag_index << 8; + stag |= driver_key; + stag += (u32)stag_key; + + iova_start = virt; + /* Make the leaf PBL the root if only one PBL */ + if (root_pbl_index == 1) { + root_vpbl.pbl_pbase = vpbl.pbl_pbase; + } + + if (single_page) { + pbl_count = 0; + } else { + pbl_count = root_pbl_index; + first_dma_addr = 0; + } + nes_debug(NES_DBG_MR, "Registering STag 0x%08X, VA = 0x%08X, length = 0x%08X," + " index = 0x%08X, region->length=0x%08llx, pbl_count = %u\n", + stag, (unsigned int)iova_start, + (unsigned int)region_length, stag_index, + (unsigned long long)region->length, pbl_count); + ret = nes_reg_mr(nesdev, nespd, stag, region->length, &root_vpbl, + first_dma_addr, 
pbl_count, (u16)cur_pbl_index, acc, + &iova_start, &nesmr->pbls_used, &nesmr->pbl_4k); + + nes_debug(NES_DBG_MR, "ret=%d\n", ret); + + if (ret == 0) { + nesmr->ibmr.rkey = stag; + nesmr->ibmr.lkey = stag; + nesmr->mode = IWNES_MEMREG_TYPE_MEM; + ibmr = &nesmr->ibmr; + } else { + ib_umem_release(region); + kfree(nesmr); + ibmr = ERR_PTR(-ENOMEM); + } + + reg_user_mr_err: + /* free the resources */ + if (root_pbl_index == 1) { + pci_free_consistent(nesdev->pcidev, 4096, vpbl.pbl_vbase, + vpbl.pbl_pbase); + } else { + for (page_index=0; page_index<root_pbl_index; page_index++) { + pci_free_consistent(nesdev->pcidev, 4096, + root_vpbl.leaf_vpbl[page_index].pbl_vbase, + root_vpbl.leaf_vpbl[page_index].pbl_pbase); + } + kfree(root_vpbl.leaf_vpbl); + pci_free_consistent(nesdev->pcidev, 8192, root_vpbl.pbl_vbase, + root_vpbl.pbl_pbase); + } + + nes_debug(NES_DBG_MR, "Leaving, ibmr=%p", ibmr); + + return ibmr; + case IWNES_MEMREG_TYPE_QP: + case IWNES_MEMREG_TYPE_CQ: + if (!region->length) { + nes_debug(NES_DBG_MR, "Unable to register zero length region for CQ\n"); + ib_umem_release(region); + return ERR_PTR(-EINVAL); + } + nespbl = kzalloc(sizeof(*nespbl), GFP_KERNEL); + if (!nespbl) { + nes_debug(NES_DBG_MR, "Unable to allocate PBL\n"); + ib_umem_release(region); + return ERR_PTR(-ENOMEM); + } + nesmr = kzalloc(sizeof(*nesmr), GFP_KERNEL); + if (!nesmr) { + ib_umem_release(region); + kfree(nespbl); + nes_debug(NES_DBG_MR, "Unable to allocate nesmr\n"); + return ERR_PTR(-ENOMEM); + } + nesmr->region = region; + nes_ucontext = to_nesucontext(pd->uobject->context); + pbl_depth = region->length >> 12; + pbl_depth += (region->length & (4096-1)) ? 1 : 0; + nespbl->pbl_size = pbl_depth*sizeof(u64); + if (req.reg_type == IWNES_MEMREG_TYPE_QP) { + nes_debug(NES_DBG_MR, "Attempting to allocate QP PBL memory"); + } else { + nes_debug(NES_DBG_MR, "Attempting to allocate CP PBL memory"); + } + + nes_debug(NES_DBG_MR, " %u bytes, %u entries.\n", + nespbl->pbl_size, pbl_depth); + pbl = pci_alloc_consistent(nesdev->pcidev, nespbl->pbl_size, + &nespbl->pbl_pbase); + if (!pbl) { + ib_umem_release(region); + kfree(nesmr); + kfree(nespbl); + nes_debug(NES_DBG_MR, "Unable to allocate PBL memory\n"); + return ERR_PTR(-ENOMEM); + } + + nespbl->pbl_vbase = (u64 *)pbl; + nespbl->user_base = start; + nes_debug(NES_DBG_MR, "Allocated PBL memory, %u bytes, pbl_pbase=%lx," + " pbl_vbase=%p user_base=0x%lx\n", + nespbl->pbl_size, (unsigned long) nespbl->pbl_pbase, + (void *) nespbl->pbl_vbase, nespbl->user_base); + + for_each_sg(region->sg_head.sgl, sg, region->nmap, entry) { + chunk_pages = sg_dma_len(sg) >> 12; + chunk_pages += (sg_dma_len(sg) & (4096-1)) ?
1 : 0; + if (first_page) { + nespbl->page = sg_page(sg); + first_page = 0; + } + + for (page_index = 0; page_index < chunk_pages; page_index++) { + ((__le32 *)pbl)[0] = cpu_to_le32((u32) + (sg_dma_address(sg)+ + (page_index*4096))); + ((__le32 *)pbl)[1] = cpu_to_le32(((u64) + (sg_dma_address(sg)+ + (page_index*4096)))>>32); + nes_debug(NES_DBG_MR, "pbl=%p, *pbl=0x%016llx, 0x%08x%08x\n", pbl, + (unsigned long long)*pbl, + le32_to_cpu(((__le32 *)pbl)[1]), le32_to_cpu(((__le32 *)pbl)[0])); + pbl++; + } + } + + if (req.reg_type == IWNES_MEMREG_TYPE_QP) { + list_add_tail(&nespbl->list, &nes_ucontext->qp_reg_mem_list); + } else { + list_add_tail(&nespbl->list, &nes_ucontext->cq_reg_mem_list); + } + nesmr->ibmr.rkey = -1; + nesmr->ibmr.lkey = -1; + nesmr->mode = req.reg_type; + return &nesmr->ibmr; + } + + ib_umem_release(region); + return ERR_PTR(-ENOSYS); +} + + +/** + * nes_dereg_mr + */ +static int nes_dereg_mr(struct ib_mr *ib_mr) +{ + struct nes_mr *nesmr = to_nesmr(ib_mr); + struct nes_vnic *nesvnic = to_nesvnic(ib_mr->device); + struct nes_device *nesdev = nesvnic->nesdev; + struct nes_adapter *nesadapter = nesdev->nesadapter; + struct nes_hw_cqp_wqe *cqp_wqe; + struct nes_cqp_request *cqp_request; + unsigned long flags; + int ret; + u16 major_code; + u16 minor_code; + + if (nesmr->region) { + ib_umem_release(nesmr->region); + } + if (nesmr->mode != IWNES_MEMREG_TYPE_MEM) { + kfree(nesmr); + return 0; + } + + /* Deallocate the region with the adapter */ + + cqp_request = nes_get_cqp_request(nesdev); + if (cqp_request == NULL) { + nes_debug(NES_DBG_MR, "Failed to get a cqp_request.\n"); + return -ENOMEM; + } + cqp_request->waiting = 1; + cqp_wqe = &cqp_request->cqp_wqe; + + nes_fill_init_cqp_wqe(cqp_wqe, nesdev); + set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_WQE_OPCODE_IDX, + NES_CQP_DEALLOCATE_STAG | NES_CQP_STAG_VA_TO | + NES_CQP_STAG_DEALLOC_PBLS | NES_CQP_STAG_MR); + set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_STAG_WQE_STAG_IDX, ib_mr->rkey); + + atomic_set(&cqp_request->refcount, 2); + nes_post_cqp_request(nesdev, cqp_request); + + /* Wait for CQP */ + nes_debug(NES_DBG_MR, "Waiting for deallocate STag 0x%08X completed\n", ib_mr->rkey); + ret = wait_event_timeout(cqp_request->waitq, (cqp_request->request_done != 0), + NES_EVENT_TIMEOUT); + nes_debug(NES_DBG_MR, "Deallocate STag 0x%08X completed, wait_event_timeout ret = %u," + " CQP Major:Minor codes = 0x%04X:0x%04X\n", + ib_mr->rkey, ret, cqp_request->major_code, cqp_request->minor_code); + + major_code = cqp_request->major_code; + minor_code = cqp_request->minor_code; + + nes_put_cqp_request(nesdev, cqp_request); + + if (!ret) { + nes_debug(NES_DBG_MR, "Timeout waiting to destroy STag," + " ib_mr=%p, rkey = 0x%08X\n", + ib_mr, ib_mr->rkey); + return -ETIME; + } else if (major_code) { + nes_debug(NES_DBG_MR, "Error (0x%04X:0x%04X) while attempting" + " to destroy STag, ib_mr=%p, rkey = 0x%08X\n", + major_code, minor_code, ib_mr, ib_mr->rkey); + return -EIO; + } + + if (nesmr->pbls_used != 0) { + spin_lock_irqsave(&nesadapter->pbl_lock, flags); + if (nesmr->pbl_4k) { + nesadapter->free_4kpbl += nesmr->pbls_used; + if (nesadapter->free_4kpbl > nesadapter->max_4kpbl) + printk(KERN_ERR PFX "free 4KB PBLs(%u) has " + "exceeded the max(%u)\n", + nesadapter->free_4kpbl, + nesadapter->max_4kpbl); + } else { + nesadapter->free_256pbl += nesmr->pbls_used; + if (nesadapter->free_256pbl > nesadapter->max_256pbl) + printk(KERN_ERR PFX "free 256B PBLs(%u) has " + "exceeded the max(%u)\n", + nesadapter->free_256pbl, + 
nesadapter->max_256pbl); + } + spin_unlock_irqrestore(&nesadapter->pbl_lock, flags); + } + nes_free_resource(nesadapter, nesadapter->allocated_mrs, + (ib_mr->rkey & 0x0fffff00) >> 8); + + kfree(nesmr); + + return 0; +} + + +/** + * show_rev + */ +static ssize_t show_rev(struct device *dev, struct device_attribute *attr, + char *buf) +{ + struct nes_ib_device *nesibdev = + container_of(dev, struct nes_ib_device, ibdev.dev); + struct nes_vnic *nesvnic = nesibdev->nesvnic; + + nes_debug(NES_DBG_INIT, "\n"); + return sprintf(buf, "%x\n", nesvnic->nesdev->nesadapter->hw_rev); +} + + +/** + * show_fw_ver + */ +static ssize_t show_fw_ver(struct device *dev, struct device_attribute *attr, + char *buf) +{ + struct nes_ib_device *nesibdev = + container_of(dev, struct nes_ib_device, ibdev.dev); + struct nes_vnic *nesvnic = nesibdev->nesvnic; + + nes_debug(NES_DBG_INIT, "\n"); + return sprintf(buf, "%u.%u\n", + (nesvnic->nesdev->nesadapter->firmware_version >> 16), + (nesvnic->nesdev->nesadapter->firmware_version & 0x000000ff)); +} + + +/** + * show_hca + */ +static ssize_t show_hca(struct device *dev, struct device_attribute *attr, + char *buf) +{ + nes_debug(NES_DBG_INIT, "\n"); + return sprintf(buf, "NES020\n"); +} + + +/** + * show_board + */ +static ssize_t show_board(struct device *dev, struct device_attribute *attr, + char *buf) +{ + nes_debug(NES_DBG_INIT, "\n"); + return sprintf(buf, "%.*s\n", 32, "NES020 Board ID"); +} + + +static DEVICE_ATTR(hw_rev, S_IRUGO, show_rev, NULL); +static DEVICE_ATTR(fw_ver, S_IRUGO, show_fw_ver, NULL); +static DEVICE_ATTR(hca_type, S_IRUGO, show_hca, NULL); +static DEVICE_ATTR(board_id, S_IRUGO, show_board, NULL); + +static struct device_attribute *nes_dev_attributes[] = { + &dev_attr_hw_rev, + &dev_attr_fw_ver, + &dev_attr_hca_type, + &dev_attr_board_id +}; + + +/** + * nes_query_qp + */ +static int nes_query_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, + int attr_mask, struct ib_qp_init_attr *init_attr) +{ + struct nes_qp *nesqp = to_nesqp(ibqp); + + nes_debug(NES_DBG_QP, "\n"); + + attr->qp_access_flags = 0; + attr->cap.max_send_wr = nesqp->hwqp.sq_size; + attr->cap.max_recv_wr = nesqp->hwqp.rq_size; + attr->cap.max_recv_sge = 1; + if (nes_drv_opt & NES_DRV_OPT_NO_INLINE_DATA) + attr->cap.max_inline_data = 0; + else + attr->cap.max_inline_data = 64; + + init_attr->event_handler = nesqp->ibqp.event_handler; + init_attr->qp_context = nesqp->ibqp.qp_context; + init_attr->send_cq = nesqp->ibqp.send_cq; + init_attr->recv_cq = nesqp->ibqp.recv_cq; + init_attr->srq = nesqp->ibqp.srq; + init_attr->cap = attr->cap; + + return 0; +} + + +/** + * nes_hw_modify_qp + */ +int nes_hw_modify_qp(struct nes_device *nesdev, struct nes_qp *nesqp, + u32 next_iwarp_state, u32 termlen, u32 wait_completion) +{ + struct nes_hw_cqp_wqe *cqp_wqe; + /* struct iw_cm_id *cm_id = nesqp->cm_id; */ + /* struct iw_cm_event cm_event; */ + struct nes_cqp_request *cqp_request; + int ret; + u16 major_code; + + nes_debug(NES_DBG_MOD_QP, "QP%u, refcount=%d\n", + nesqp->hwqp.qp_id, atomic_read(&nesqp->refcount)); + + cqp_request = nes_get_cqp_request(nesdev); + if (cqp_request == NULL) { + nes_debug(NES_DBG_MOD_QP, "Failed to get a cqp_request.\n"); + return -ENOMEM; + } + if (wait_completion) { + cqp_request->waiting = 1; + } else { + cqp_request->waiting = 0; + } + cqp_wqe = &cqp_request->cqp_wqe; + + set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_WQE_OPCODE_IDX, + NES_CQP_MODIFY_QP | NES_CQP_QP_TYPE_IWARP | next_iwarp_state); + nes_debug(NES_DBG_MOD_QP, "using next_iwarp_state=%08x, 
wqe_words=%08x\n", + next_iwarp_state, le32_to_cpu(cqp_wqe->wqe_words[NES_CQP_WQE_OPCODE_IDX])); + nes_fill_init_cqp_wqe(cqp_wqe, nesdev); + set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_WQE_ID_IDX, nesqp->hwqp.qp_id); + set_wqe_64bit_value(cqp_wqe->wqe_words, NES_CQP_QP_WQE_CONTEXT_LOW_IDX, (u64)nesqp->nesqp_context_pbase); + + /* If sending a terminate message, fill in the length (in words) */ + if (((next_iwarp_state & NES_CQP_QP_IWARP_STATE_MASK) == NES_CQP_QP_IWARP_STATE_TERMINATE) && + !(next_iwarp_state & NES_CQP_QP_TERM_DONT_SEND_TERM_MSG)) { + termlen = ((termlen + 3) >> 2) << NES_CQP_OP_TERMLEN_SHIFT; + set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_QP_WQE_NEW_MSS_IDX, termlen); + } + + atomic_set(&cqp_request->refcount, 2); + nes_post_cqp_request(nesdev, cqp_request); + + /* Wait for CQP */ + if (wait_completion) { + /* nes_debug(NES_DBG_MOD_QP, "Waiting for modify iWARP QP%u to complete.\n", + nesqp->hwqp.qp_id); */ + ret = wait_event_timeout(cqp_request->waitq, (cqp_request->request_done != 0), + NES_EVENT_TIMEOUT); + nes_debug(NES_DBG_MOD_QP, "Modify iwarp QP%u completed, wait_event_timeout ret=%u, " + "CQP Major:Minor codes = 0x%04X:0x%04X.\n", + nesqp->hwqp.qp_id, ret, cqp_request->major_code, cqp_request->minor_code); + major_code = cqp_request->major_code; + if (major_code) { + nes_debug(NES_DBG_MOD_QP, "Modify iwarp QP%u failed" + "CQP Major:Minor codes = 0x%04X:0x%04X, intended next state = 0x%08X.\n", + nesqp->hwqp.qp_id, cqp_request->major_code, + cqp_request->minor_code, next_iwarp_state); + } + + nes_put_cqp_request(nesdev, cqp_request); + + if (!ret) + return -ETIME; + else if (major_code) + return -EIO; + else + return 0; + } else { + return 0; + } +} + + +/** + * nes_modify_qp + */ +int nes_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, + int attr_mask, struct ib_udata *udata) +{ + struct nes_qp *nesqp = to_nesqp(ibqp); + struct nes_vnic *nesvnic = to_nesvnic(ibqp->device); + struct nes_device *nesdev = nesvnic->nesdev; + /* u32 cqp_head; */ + /* u32 counter; */ + u32 next_iwarp_state = 0; + int err; + unsigned long qplockflags; + int ret; + u16 original_last_aeq; + u8 issue_modify_qp = 0; + u8 dont_wait = 0; + + nes_debug(NES_DBG_MOD_QP, "QP%u: QP State=%u, cur QP State=%u," + " iwarp_state=0x%X, refcount=%d\n", + nesqp->hwqp.qp_id, attr->qp_state, nesqp->ibqp_state, + nesqp->iwarp_state, atomic_read(&nesqp->refcount)); + + spin_lock_irqsave(&nesqp->lock, qplockflags); + + nes_debug(NES_DBG_MOD_QP, "QP%u: hw_iwarp_state=0x%X, hw_tcp_state=0x%X," + " QP Access Flags=0x%X, attr_mask = 0x%0x\n", + nesqp->hwqp.qp_id, nesqp->hw_iwarp_state, + nesqp->hw_tcp_state, attr->qp_access_flags, attr_mask); + + if (attr_mask & IB_QP_STATE) { + switch (attr->qp_state) { + case IB_QPS_INIT: + nes_debug(NES_DBG_MOD_QP, "QP%u: new state = init\n", + nesqp->hwqp.qp_id); + if (nesqp->iwarp_state > (u32)NES_CQP_QP_IWARP_STATE_IDLE) { + spin_unlock_irqrestore(&nesqp->lock, qplockflags); + return -EINVAL; + } + next_iwarp_state = NES_CQP_QP_IWARP_STATE_IDLE; + issue_modify_qp = 1; + break; + case IB_QPS_RTR: + nes_debug(NES_DBG_MOD_QP, "QP%u: new state = rtr\n", + nesqp->hwqp.qp_id); + if (nesqp->iwarp_state>(u32)NES_CQP_QP_IWARP_STATE_IDLE) { + spin_unlock_irqrestore(&nesqp->lock, qplockflags); + return -EINVAL; + } + next_iwarp_state = NES_CQP_QP_IWARP_STATE_IDLE; + issue_modify_qp = 1; + break; + case IB_QPS_RTS: + nes_debug(NES_DBG_MOD_QP, "QP%u: new state = rts\n", + nesqp->hwqp.qp_id); + if (nesqp->iwarp_state>(u32)NES_CQP_QP_IWARP_STATE_RTS) { + 
spin_unlock_irqrestore(&nesqp->lock, qplockflags); + return -EINVAL; + } + if (nesqp->cm_id == NULL) { + nes_debug(NES_DBG_MOD_QP, "QP%u: Failing attempt to move QP to RTS without a CM_ID. \n", + nesqp->hwqp.qp_id ); + spin_unlock_irqrestore(&nesqp->lock, qplockflags); + return -EINVAL; + } + next_iwarp_state = NES_CQP_QP_IWARP_STATE_RTS; + if (nesqp->iwarp_state != NES_CQP_QP_IWARP_STATE_RTS) + next_iwarp_state |= NES_CQP_QP_CONTEXT_VALID | + NES_CQP_QP_ARP_VALID | NES_CQP_QP_ORD_VALID; + issue_modify_qp = 1; + nesqp->hw_tcp_state = NES_AEQE_TCP_STATE_ESTABLISHED; + nesqp->hw_iwarp_state = NES_AEQE_IWARP_STATE_RTS; + nesqp->hte_added = 1; + break; + case IB_QPS_SQD: + issue_modify_qp = 1; + nes_debug(NES_DBG_MOD_QP, "QP%u: new state=closing. SQ head=%u, SQ tail=%u\n", + nesqp->hwqp.qp_id, nesqp->hwqp.sq_head, nesqp->hwqp.sq_tail); + if (nesqp->iwarp_state == (u32)NES_CQP_QP_IWARP_STATE_CLOSING) { + spin_unlock_irqrestore(&nesqp->lock, qplockflags); + return 0; + } else { + if (nesqp->iwarp_state > (u32)NES_CQP_QP_IWARP_STATE_CLOSING) { + nes_debug(NES_DBG_MOD_QP, "QP%u: State change to closing" + " ignored due to current iWARP state\n", + nesqp->hwqp.qp_id); + spin_unlock_irqrestore(&nesqp->lock, qplockflags); + return -EINVAL; + } + if (nesqp->hw_iwarp_state != NES_AEQE_IWARP_STATE_RTS) { + nes_debug(NES_DBG_MOD_QP, "QP%u: State change to closing" + " already done based on hw state.\n", + nesqp->hwqp.qp_id); + issue_modify_qp = 0; + } + switch (nesqp->hw_iwarp_state) { + case NES_AEQE_IWARP_STATE_CLOSING: + next_iwarp_state = NES_CQP_QP_IWARP_STATE_CLOSING; + break; + case NES_AEQE_IWARP_STATE_TERMINATE: + next_iwarp_state = NES_CQP_QP_IWARP_STATE_TERMINATE; + break; + case NES_AEQE_IWARP_STATE_ERROR: + next_iwarp_state = NES_CQP_QP_IWARP_STATE_ERROR; + break; + default: + next_iwarp_state = NES_CQP_QP_IWARP_STATE_CLOSING; + nesqp->hw_iwarp_state = NES_AEQE_IWARP_STATE_CLOSING; + break; + } + } + break; + case IB_QPS_SQE: + nes_debug(NES_DBG_MOD_QP, "QP%u: new state = terminate\n", + nesqp->hwqp.qp_id); + if (nesqp->iwarp_state>=(u32)NES_CQP_QP_IWARP_STATE_TERMINATE) { + spin_unlock_irqrestore(&nesqp->lock, qplockflags); + return -EINVAL; + } + /* next_iwarp_state = (NES_CQP_QP_IWARP_STATE_TERMINATE | 0x02000000); */ + next_iwarp_state = NES_CQP_QP_IWARP_STATE_TERMINATE; + nesqp->hw_iwarp_state = NES_AEQE_IWARP_STATE_TERMINATE; + issue_modify_qp = 1; + break; + case IB_QPS_ERR: + case IB_QPS_RESET: + if (nesqp->iwarp_state == (u32)NES_CQP_QP_IWARP_STATE_ERROR) { + spin_unlock_irqrestore(&nesqp->lock, qplockflags); + return -EINVAL; + } + nes_debug(NES_DBG_MOD_QP, "QP%u: new state = error\n", + nesqp->hwqp.qp_id); + if (nesqp->term_flags) + del_timer(&nesqp->terminate_timer); + + next_iwarp_state = NES_CQP_QP_IWARP_STATE_ERROR; + /* next_iwarp_state = (NES_CQP_QP_IWARP_STATE_TERMINATE | 0x02000000); */ + if (nesqp->hte_added) { + nes_debug(NES_DBG_MOD_QP, "set CQP_QP_DEL_HTE\n"); + next_iwarp_state |= NES_CQP_QP_DEL_HTE; + nesqp->hte_added = 0; + } + if ((nesqp->hw_tcp_state > NES_AEQE_TCP_STATE_CLOSED) && + (nesdev->iw_status) && + (nesqp->hw_tcp_state != NES_AEQE_TCP_STATE_TIME_WAIT)) { + next_iwarp_state |= NES_CQP_QP_RESET; + } else { + nes_debug(NES_DBG_MOD_QP, "QP%u NOT setting NES_CQP_QP_RESET since TCP state = %u\n", + nesqp->hwqp.qp_id, nesqp->hw_tcp_state); + dont_wait = 1; + } + issue_modify_qp = 1; + nesqp->hw_iwarp_state = NES_AEQE_IWARP_STATE_ERROR; + break; + default: + spin_unlock_irqrestore(&nesqp->lock, qplockflags); + return -EINVAL; + break; + } + + nesqp->ibqp_state = 
attr->qp_state; + nesqp->iwarp_state = next_iwarp_state & NES_CQP_QP_IWARP_STATE_MASK; + nes_debug(NES_DBG_MOD_QP, "Change nesqp->iwarp_state=%08x\n", + nesqp->iwarp_state); + } + + if (attr_mask & IB_QP_ACCESS_FLAGS) { + if (attr->qp_access_flags & IB_ACCESS_LOCAL_WRITE) { + nesqp->nesqp_context->misc |= cpu_to_le32(NES_QPCONTEXT_MISC_RDMA_WRITE_EN | + NES_QPCONTEXT_MISC_RDMA_READ_EN); + issue_modify_qp = 1; + } + if (attr->qp_access_flags & IB_ACCESS_REMOTE_WRITE) { + nesqp->nesqp_context->misc |= cpu_to_le32(NES_QPCONTEXT_MISC_RDMA_WRITE_EN); + issue_modify_qp = 1; + } + if (attr->qp_access_flags & IB_ACCESS_REMOTE_READ) { + nesqp->nesqp_context->misc |= cpu_to_le32(NES_QPCONTEXT_MISC_RDMA_READ_EN); + issue_modify_qp = 1; + } + if (attr->qp_access_flags & IB_ACCESS_MW_BIND) { + nesqp->nesqp_context->misc |= cpu_to_le32(NES_QPCONTEXT_MISC_WBIND_EN); + issue_modify_qp = 1; + } + + if (nesqp->user_mode) { + nesqp->nesqp_context->misc |= cpu_to_le32(NES_QPCONTEXT_MISC_RDMA_WRITE_EN | + NES_QPCONTEXT_MISC_RDMA_READ_EN); + issue_modify_qp = 1; + } + } + + original_last_aeq = nesqp->last_aeq; + spin_unlock_irqrestore(&nesqp->lock, qplockflags); + + nes_debug(NES_DBG_MOD_QP, "issue_modify_qp=%u\n", issue_modify_qp); + + ret = 0; + + + if (issue_modify_qp) { + nes_debug(NES_DBG_MOD_QP, "call nes_hw_modify_qp\n"); + ret = nes_hw_modify_qp(nesdev, nesqp, next_iwarp_state, 0, 1); + if (ret) + nes_debug(NES_DBG_MOD_QP, "nes_hw_modify_qp (next_iwarp_state = 0x%08X)" + " failed for QP%u.\n", + next_iwarp_state, nesqp->hwqp.qp_id); + + } + + if ((issue_modify_qp) && (nesqp->ibqp_state > IB_QPS_RTS)) { + nes_debug(NES_DBG_MOD_QP, "QP%u Issued ModifyQP refcount (%d)," + " original_last_aeq = 0x%04X. last_aeq = 0x%04X.\n", + nesqp->hwqp.qp_id, atomic_read(&nesqp->refcount), + original_last_aeq, nesqp->last_aeq); + if (!ret || original_last_aeq != NES_AEQE_AEID_RDMAP_ROE_BAD_LLP_CLOSE) { + if (dont_wait) { + if (nesqp->cm_id && nesqp->hw_tcp_state != 0) { + nes_debug(NES_DBG_MOD_QP, "QP%u Queuing fake disconnect for QP refcount (%d)," + " original_last_aeq = 0x%04X. last_aeq = 0x%04X.\n", + nesqp->hwqp.qp_id, atomic_read(&nesqp->refcount), + original_last_aeq, nesqp->last_aeq); + /* this one is for the cm_disconnect thread */ + spin_lock_irqsave(&nesqp->lock, qplockflags); + nesqp->hw_tcp_state = NES_AEQE_TCP_STATE_CLOSED; + nesqp->last_aeq = NES_AEQE_AEID_RESET_SENT; + spin_unlock_irqrestore(&nesqp->lock, qplockflags); + nes_cm_disconn(nesqp); + } else { + nes_debug(NES_DBG_MOD_QP, "QP%u No fake disconnect, QP refcount=%d\n", + nesqp->hwqp.qp_id, atomic_read(&nesqp->refcount)); + } + } else { + spin_lock_irqsave(&nesqp->lock, qplockflags); + if (nesqp->cm_id) { + /* These two are for the timer thread */ + if (atomic_inc_return(&nesqp->close_timer_started) == 1) { + nesqp->cm_id->add_ref(nesqp->cm_id); + nes_debug(NES_DBG_MOD_QP, "QP%u Not decrementing QP refcount (%d)," + " need ae to finish up, original_last_aeq = 0x%04X." + " last_aeq = 0x%04X, scheduling timer.\n", + nesqp->hwqp.qp_id, atomic_read(&nesqp->refcount), + original_last_aeq, nesqp->last_aeq); + schedule_nes_timer(nesqp->cm_node, (struct sk_buff *) nesqp, NES_TIMER_TYPE_CLOSE, 1, 0); + } + spin_unlock_irqrestore(&nesqp->lock, qplockflags); + } else { + spin_unlock_irqrestore(&nesqp->lock, qplockflags); + nes_debug(NES_DBG_MOD_QP, "QP%u Not decrementing QP refcount (%d)," + " need ae to finish up, original_last_aeq = 0x%04X." 
+ " last_aeq = 0x%04X.\n", + nesqp->hwqp.qp_id, atomic_read(&nesqp->refcount), + original_last_aeq, nesqp->last_aeq); + } + } + } else { + nes_debug(NES_DBG_MOD_QP, "QP%u Decrementing QP refcount (%d), No ae to finish up," + " original_last_aeq = 0x%04X. last_aeq = 0x%04X.\n", + nesqp->hwqp.qp_id, atomic_read(&nesqp->refcount), + original_last_aeq, nesqp->last_aeq); + } + } else { + nes_debug(NES_DBG_MOD_QP, "QP%u Decrementing QP refcount (%d), No ae to finish up," + " original_last_aeq = 0x%04X. last_aeq = 0x%04X.\n", + nesqp->hwqp.qp_id, atomic_read(&nesqp->refcount), + original_last_aeq, nesqp->last_aeq); + } + + err = 0; + + nes_debug(NES_DBG_MOD_QP, "QP%u Leaving, refcount=%d\n", + nesqp->hwqp.qp_id, atomic_read(&nesqp->refcount)); + + return err; +} + + +/** + * nes_muticast_attach + */ +static int nes_multicast_attach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid) +{ + nes_debug(NES_DBG_INIT, "\n"); + return -ENOSYS; +} + + +/** + * nes_multicast_detach + */ +static int nes_multicast_detach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid) +{ + nes_debug(NES_DBG_INIT, "\n"); + return -ENOSYS; +} + + +/** + * nes_process_mad + */ +static int nes_process_mad(struct ib_device *ibdev, int mad_flags, + u8 port_num, struct ib_wc *in_wc, struct ib_grh *in_grh, + struct ib_mad *in_mad, struct ib_mad *out_mad) +{ + nes_debug(NES_DBG_INIT, "\n"); + return -ENOSYS; +} + +static inline void +fill_wqe_sg_send(struct nes_hw_qp_wqe *wqe, struct ib_send_wr *ib_wr, u32 uselkey) +{ + int sge_index; + int total_payload_length = 0; + for (sge_index = 0; sge_index < ib_wr->num_sge; sge_index++) { + set_wqe_64bit_value(wqe->wqe_words, NES_IWARP_SQ_WQE_FRAG0_LOW_IDX+(sge_index*4), + ib_wr->sg_list[sge_index].addr); + set_wqe_32bit_value(wqe->wqe_words, NES_IWARP_SQ_WQE_LENGTH0_IDX + (sge_index*4), + ib_wr->sg_list[sge_index].length); + if (uselkey) + set_wqe_32bit_value(wqe->wqe_words, NES_IWARP_SQ_WQE_STAG0_IDX + (sge_index*4), + (ib_wr->sg_list[sge_index].lkey)); + else + set_wqe_32bit_value(wqe->wqe_words, NES_IWARP_SQ_WQE_STAG0_IDX + (sge_index*4), 0); + + total_payload_length += ib_wr->sg_list[sge_index].length; + } + nes_debug(NES_DBG_IW_TX, "UC UC UC, sending total_payload_length=%u \n", + total_payload_length); + set_wqe_32bit_value(wqe->wqe_words, NES_IWARP_SQ_WQE_TOTAL_PAYLOAD_IDX, + total_payload_length); +} + +/** + * nes_post_send + */ +static int nes_post_send(struct ib_qp *ibqp, struct ib_send_wr *ib_wr, + struct ib_send_wr **bad_wr) +{ + u64 u64temp; + unsigned long flags = 0; + struct nes_vnic *nesvnic = to_nesvnic(ibqp->device); + struct nes_device *nesdev = nesvnic->nesdev; + struct nes_qp *nesqp = to_nesqp(ibqp); + struct nes_hw_qp_wqe *wqe; + int err = 0; + u32 qsize = nesqp->hwqp.sq_size; + u32 head; + u32 wqe_misc = 0; + u32 wqe_count = 0; + u32 counter; + + if (nesqp->ibqp_state > IB_QPS_RTS) { + err = -EINVAL; + goto out; + } + + spin_lock_irqsave(&nesqp->lock, flags); + + head = nesqp->hwqp.sq_head; + + while (ib_wr) { + /* Check for QP error */ + if (nesqp->term_flags) { + err = -EINVAL; + break; + } + + /* Check for SQ overflow */ + if (((head + (2 * qsize) - nesqp->hwqp.sq_tail) % qsize) == (qsize - 1)) { + err = -ENOMEM; + break; + } + + wqe = &nesqp->hwqp.sq_vbase[head]; + /* nes_debug(NES_DBG_IW_TX, "processing sq wqe for QP%u at %p, head = %u.\n", + nesqp->hwqp.qp_id, wqe, head); */ + nes_fill_init_qp_wqe(wqe, nesqp, head); + u64temp = (u64)(ib_wr->wr_id); + set_wqe_64bit_value(wqe->wqe_words, NES_IWARP_SQ_WQE_COMP_SCRATCH_LOW_IDX, + u64temp); + switch (ib_wr->opcode) { + 
case IB_WR_SEND: + case IB_WR_SEND_WITH_INV: + if (IB_WR_SEND == ib_wr->opcode) { + if (ib_wr->send_flags & IB_SEND_SOLICITED) + wqe_misc = NES_IWARP_SQ_OP_SENDSE; + else + wqe_misc = NES_IWARP_SQ_OP_SEND; + } else { + if (ib_wr->send_flags & IB_SEND_SOLICITED) + wqe_misc = NES_IWARP_SQ_OP_SENDSEINV; + else + wqe_misc = NES_IWARP_SQ_OP_SENDINV; + + set_wqe_32bit_value(wqe->wqe_words, NES_IWARP_SQ_WQE_INV_STAG_LOW_IDX, + ib_wr->ex.invalidate_rkey); + } + + if (ib_wr->num_sge > nesdev->nesadapter->max_sge) { + err = -EINVAL; + break; + } + + if (ib_wr->send_flags & IB_SEND_FENCE) + wqe_misc |= NES_IWARP_SQ_WQE_LOCAL_FENCE; + + if ((ib_wr->send_flags & IB_SEND_INLINE) && + ((nes_drv_opt & NES_DRV_OPT_NO_INLINE_DATA) == 0) && + (ib_wr->sg_list[0].length <= 64)) { + memcpy(&wqe->wqe_words[NES_IWARP_SQ_WQE_IMM_DATA_START_IDX], + (void *)(unsigned long)ib_wr->sg_list[0].addr, ib_wr->sg_list[0].length); + set_wqe_32bit_value(wqe->wqe_words, NES_IWARP_SQ_WQE_TOTAL_PAYLOAD_IDX, + ib_wr->sg_list[0].length); + wqe_misc |= NES_IWARP_SQ_WQE_IMM_DATA; + } else { + fill_wqe_sg_send(wqe, ib_wr, 1); + } + + break; + case IB_WR_RDMA_WRITE: + wqe_misc = NES_IWARP_SQ_OP_RDMAW; + if (ib_wr->num_sge > nesdev->nesadapter->max_sge) { + nes_debug(NES_DBG_IW_TX, "Exceeded max sge, ib_wr=%u, max=%u\n", + ib_wr->num_sge, nesdev->nesadapter->max_sge); + err = -EINVAL; + break; + } + + if (ib_wr->send_flags & IB_SEND_FENCE) + wqe_misc |= NES_IWARP_SQ_WQE_LOCAL_FENCE; + + set_wqe_32bit_value(wqe->wqe_words, NES_IWARP_SQ_WQE_RDMA_STAG_IDX, + ib_wr->wr.rdma.rkey); + set_wqe_64bit_value(wqe->wqe_words, NES_IWARP_SQ_WQE_RDMA_TO_LOW_IDX, + ib_wr->wr.rdma.remote_addr); + + if ((ib_wr->send_flags & IB_SEND_INLINE) && + ((nes_drv_opt & NES_DRV_OPT_NO_INLINE_DATA) == 0) && + (ib_wr->sg_list[0].length <= 64)) { + memcpy(&wqe->wqe_words[NES_IWARP_SQ_WQE_IMM_DATA_START_IDX], + (void *)(unsigned long)ib_wr->sg_list[0].addr, ib_wr->sg_list[0].length); + set_wqe_32bit_value(wqe->wqe_words, NES_IWARP_SQ_WQE_TOTAL_PAYLOAD_IDX, + ib_wr->sg_list[0].length); + wqe_misc |= NES_IWARP_SQ_WQE_IMM_DATA; + } else { + fill_wqe_sg_send(wqe, ib_wr, 1); + } + + wqe->wqe_words[NES_IWARP_SQ_WQE_RDMA_LENGTH_IDX] = + wqe->wqe_words[NES_IWARP_SQ_WQE_TOTAL_PAYLOAD_IDX]; + break; + case IB_WR_RDMA_READ: + case IB_WR_RDMA_READ_WITH_INV: + /* iWARP only supports 1 sge for RDMA reads */ + if (ib_wr->num_sge > 1) { + nes_debug(NES_DBG_IW_TX, "Exceeded max sge, ib_wr=%u, max=1\n", + ib_wr->num_sge); + err = -EINVAL; + break; + } + if (ib_wr->opcode == IB_WR_RDMA_READ) { + wqe_misc = NES_IWARP_SQ_OP_RDMAR; + } else { + wqe_misc = NES_IWARP_SQ_OP_RDMAR_LOCINV; + set_wqe_32bit_value(wqe->wqe_words, NES_IWARP_SQ_WQE_INV_STAG_LOW_IDX, + ib_wr->ex.invalidate_rkey); + } + + set_wqe_64bit_value(wqe->wqe_words, NES_IWARP_SQ_WQE_RDMA_TO_LOW_IDX, + ib_wr->wr.rdma.remote_addr); + set_wqe_32bit_value(wqe->wqe_words, NES_IWARP_SQ_WQE_RDMA_STAG_IDX, + ib_wr->wr.rdma.rkey); + set_wqe_32bit_value(wqe->wqe_words, NES_IWARP_SQ_WQE_RDMA_LENGTH_IDX, + ib_wr->sg_list->length); + set_wqe_64bit_value(wqe->wqe_words, NES_IWARP_SQ_WQE_FRAG0_LOW_IDX, + ib_wr->sg_list->addr); + set_wqe_32bit_value(wqe->wqe_words, NES_IWARP_SQ_WQE_STAG0_IDX, + ib_wr->sg_list->lkey); + break; + case IB_WR_LOCAL_INV: + wqe_misc = NES_IWARP_SQ_OP_LOCINV; + set_wqe_32bit_value(wqe->wqe_words, + NES_IWARP_SQ_LOCINV_WQE_INV_STAG_IDX, + ib_wr->ex.invalidate_rkey); + break; + case IB_WR_FAST_REG_MR: + { + int i; + int flags = ib_wr->wr.fast_reg.access_flags; + struct nes_ib_fast_reg_page_list *pnesfrpl = + 
container_of(ib_wr->wr.fast_reg.page_list, + struct nes_ib_fast_reg_page_list, + ibfrpl); + u64 *src_page_list = pnesfrpl->ibfrpl.page_list; + u64 *dst_page_list = pnesfrpl->nes_wqe_pbl.kva; + + if (ib_wr->wr.fast_reg.page_list_len > + (NES_4K_PBL_CHUNK_SIZE / sizeof(u64))) { + nes_debug(NES_DBG_IW_TX, "SQ_FMR: bad page_list_len\n"); + err = -EINVAL; + break; + } + wqe_misc = NES_IWARP_SQ_OP_FAST_REG; + set_wqe_64bit_value(wqe->wqe_words, + NES_IWARP_SQ_FMR_WQE_VA_FBO_LOW_IDX, + ib_wr->wr.fast_reg.iova_start); + set_wqe_32bit_value(wqe->wqe_words, + NES_IWARP_SQ_FMR_WQE_LENGTH_LOW_IDX, + ib_wr->wr.fast_reg.length); + set_wqe_32bit_value(wqe->wqe_words, + NES_IWARP_SQ_FMR_WQE_LENGTH_HIGH_IDX, 0); + set_wqe_32bit_value(wqe->wqe_words, + NES_IWARP_SQ_FMR_WQE_MR_STAG_IDX, + ib_wr->wr.fast_reg.rkey); + /* Set page size: */ + if (ib_wr->wr.fast_reg.page_shift == 12) { + wqe_misc |= NES_IWARP_SQ_FMR_WQE_PAGE_SIZE_4K; + } else if (ib_wr->wr.fast_reg.page_shift == 21) { + wqe_misc |= NES_IWARP_SQ_FMR_WQE_PAGE_SIZE_2M; + } else { + nes_debug(NES_DBG_IW_TX, "Invalid page shift," + " ib_wr=%u, max=1\n", ib_wr->num_sge); + err = -EINVAL; + break; + } + /* Set access_flags */ + wqe_misc |= NES_IWARP_SQ_FMR_WQE_RIGHTS_ENABLE_LOCAL_READ; + if (flags & IB_ACCESS_LOCAL_WRITE) + wqe_misc |= NES_IWARP_SQ_FMR_WQE_RIGHTS_ENABLE_LOCAL_WRITE; + + if (flags & IB_ACCESS_REMOTE_WRITE) + wqe_misc |= NES_IWARP_SQ_FMR_WQE_RIGHTS_ENABLE_REMOTE_WRITE; + + if (flags & IB_ACCESS_REMOTE_READ) + wqe_misc |= NES_IWARP_SQ_FMR_WQE_RIGHTS_ENABLE_REMOTE_READ; + + if (flags & IB_ACCESS_MW_BIND) + wqe_misc |= NES_IWARP_SQ_FMR_WQE_RIGHTS_ENABLE_WINDOW_BIND; + + /* Fill in PBL info: */ + if (ib_wr->wr.fast_reg.page_list_len > + pnesfrpl->ibfrpl.max_page_list_len) { + nes_debug(NES_DBG_IW_TX, "Invalid page list length," + " ib_wr=%p, value=%u, max=%u\n", + ib_wr, ib_wr->wr.fast_reg.page_list_len, + pnesfrpl->ibfrpl.max_page_list_len); + err = -EINVAL; + break; + } + + set_wqe_64bit_value(wqe->wqe_words, + NES_IWARP_SQ_FMR_WQE_PBL_ADDR_LOW_IDX, + pnesfrpl->nes_wqe_pbl.paddr); + + set_wqe_32bit_value(wqe->wqe_words, + NES_IWARP_SQ_FMR_WQE_PBL_LENGTH_IDX, + ib_wr->wr.fast_reg.page_list_len * 8); + + for (i = 0; i < ib_wr->wr.fast_reg.page_list_len; i++) + dst_page_list[i] = cpu_to_le64(src_page_list[i]); + + nes_debug(NES_DBG_IW_TX, "SQ_FMR: iova_start: %llx, " + "length: %d, rkey: %0x, pgl_paddr: %llx, " + "page_list_len: %u, wqe_misc: %x\n", + (unsigned long long) ib_wr->wr.fast_reg.iova_start, + ib_wr->wr.fast_reg.length, + ib_wr->wr.fast_reg.rkey, + (unsigned long long) pnesfrpl->nes_wqe_pbl.paddr, + ib_wr->wr.fast_reg.page_list_len, + wqe_misc); + break; + } + default: + /* error */ + err = -EINVAL; + break; + } + + if (err) + break; + + if ((ib_wr->send_flags & IB_SEND_SIGNALED) || nesqp->sig_all) + wqe_misc |= NES_IWARP_SQ_WQE_SIGNALED_COMPL; + + wqe->wqe_words[NES_IWARP_SQ_WQE_MISC_IDX] = cpu_to_le32(wqe_misc); + + ib_wr = ib_wr->next; + head++; + wqe_count++; + if (head >= qsize) + head = 0; + + } + + nesqp->hwqp.sq_head = head; + barrier(); + while (wqe_count) { + counter = min(wqe_count, ((u32)255)); + wqe_count -= counter; + nes_write32(nesdev->regs + NES_WQE_ALLOC, + (counter << 24) | 0x00800000 | nesqp->hwqp.qp_id); + } + + spin_unlock_irqrestore(&nesqp->lock, flags); + +out: + if (err) + *bad_wr = ib_wr; + return err; +} + + +/** + * nes_post_recv + */ +static int nes_post_recv(struct ib_qp *ibqp, struct ib_recv_wr *ib_wr, + struct ib_recv_wr **bad_wr) +{ + u64 u64temp; + unsigned long flags = 0; + struct nes_vnic *nesvnic 
= to_nesvnic(ibqp->device); + struct nes_device *nesdev = nesvnic->nesdev; + struct nes_qp *nesqp = to_nesqp(ibqp); + struct nes_hw_qp_wqe *wqe; + int err = 0; + int sge_index; + u32 qsize = nesqp->hwqp.rq_size; + u32 head; + u32 wqe_count = 0; + u32 counter; + u32 total_payload_length; + + if (nesqp->ibqp_state > IB_QPS_RTS) { + err = -EINVAL; + goto out; + } + + spin_lock_irqsave(&nesqp->lock, flags); + + head = nesqp->hwqp.rq_head; + + while (ib_wr) { + /* Check for QP error */ + if (nesqp->term_flags) { + err = -EINVAL; + break; + } + + if (ib_wr->num_sge > nesdev->nesadapter->max_sge) { + err = -EINVAL; + break; + } + /* Check for RQ overflow */ + if (((head + (2 * qsize) - nesqp->hwqp.rq_tail) % qsize) == (qsize - 1)) { + err = -ENOMEM; + break; + } + + nes_debug(NES_DBG_IW_RX, "ibwr sge count = %u.\n", ib_wr->num_sge); + wqe = &nesqp->hwqp.rq_vbase[head]; + + /* nes_debug(NES_DBG_IW_RX, "QP%u:processing rq wqe at %p, head = %u.\n", + nesqp->hwqp.qp_id, wqe, head); */ + nes_fill_init_qp_wqe(wqe, nesqp, head); + u64temp = (u64)(ib_wr->wr_id); + set_wqe_64bit_value(wqe->wqe_words, NES_IWARP_SQ_WQE_COMP_SCRATCH_LOW_IDX, + u64temp); + total_payload_length = 0; + for (sge_index=0; sge_index < ib_wr->num_sge; sge_index++) { + set_wqe_64bit_value(wqe->wqe_words, NES_IWARP_RQ_WQE_FRAG0_LOW_IDX+(sge_index*4), + ib_wr->sg_list[sge_index].addr); + set_wqe_32bit_value(wqe->wqe_words, NES_IWARP_RQ_WQE_LENGTH0_IDX+(sge_index*4), + ib_wr->sg_list[sge_index].length); + set_wqe_32bit_value(wqe->wqe_words,NES_IWARP_RQ_WQE_STAG0_IDX+(sge_index*4), + ib_wr->sg_list[sge_index].lkey); + + total_payload_length += ib_wr->sg_list[sge_index].length; + } + set_wqe_32bit_value(wqe->wqe_words, NES_IWARP_RQ_WQE_TOTAL_PAYLOAD_IDX, + total_payload_length); + + ib_wr = ib_wr->next; + head++; + wqe_count++; + if (head >= qsize) + head = 0; + } + + nesqp->hwqp.rq_head = head; + barrier(); + while (wqe_count) { + counter = min(wqe_count, ((u32)255)); + wqe_count -= counter; + nes_write32(nesdev->regs+NES_WQE_ALLOC, (counter<<24) | nesqp->hwqp.qp_id); + } + + spin_unlock_irqrestore(&nesqp->lock, flags); + +out: + if (err) + *bad_wr = ib_wr; + return err; +} + + +/** + * nes_poll_cq + */ +static int nes_poll_cq(struct ib_cq *ibcq, int num_entries, struct ib_wc *entry) +{ + u64 u64temp; + u64 wrid; + unsigned long flags = 0; + struct nes_vnic *nesvnic = to_nesvnic(ibcq->device); + struct nes_device *nesdev = nesvnic->nesdev; + struct nes_cq *nescq = to_nescq(ibcq); + struct nes_qp *nesqp; + struct nes_hw_cqe cqe; + u32 head; + u32 wq_tail = 0; + u32 cq_size; + u32 cqe_count = 0; + u32 wqe_index; + u32 u32temp; + u32 move_cq_head = 1; + u32 err_code; + + nes_debug(NES_DBG_CQ, "\n"); + + spin_lock_irqsave(&nescq->lock, flags); + + head = nescq->hw_cq.cq_head; + cq_size = nescq->hw_cq.cq_size; + + while (cqe_count < num_entries) { + if ((le32_to_cpu(nescq->hw_cq.cq_vbase[head].cqe_words[NES_CQE_OPCODE_IDX]) & + NES_CQE_VALID) == 0) + break; + + /* + * Make sure we read CQ entry contents *after* + * we've checked the valid bit. 
+ */ + rmb(); + + cqe = nescq->hw_cq.cq_vbase[head]; + u32temp = le32_to_cpu(cqe.cqe_words[NES_CQE_COMP_COMP_CTX_LOW_IDX]); + wqe_index = u32temp & (nesdev->nesadapter->max_qp_wr - 1); + u32temp &= ~(NES_SW_CONTEXT_ALIGN-1); + /* parse CQE, get completion context from WQE (either rq or sq) */ + u64temp = (((u64)(le32_to_cpu(cqe.cqe_words[NES_CQE_COMP_COMP_CTX_HIGH_IDX])))<<32) | + ((u64)u32temp); + + if (u64temp) { + nesqp = (struct nes_qp *)(unsigned long)u64temp; + memset(entry, 0, sizeof *entry); + if (cqe.cqe_words[NES_CQE_ERROR_CODE_IDX] == 0) { + entry->status = IB_WC_SUCCESS; + } else { + err_code = le32_to_cpu(cqe.cqe_words[NES_CQE_ERROR_CODE_IDX]); + if (NES_IWARP_CQE_MAJOR_DRV == (err_code >> 16)) { + entry->status = err_code & 0x0000ffff; + + /* The rest of the cqe's will be marked as flushed */ + nescq->hw_cq.cq_vbase[head].cqe_words[NES_CQE_ERROR_CODE_IDX] = + cpu_to_le32((NES_IWARP_CQE_MAJOR_FLUSH << 16) | + NES_IWARP_CQE_MINOR_FLUSH); + } else + entry->status = IB_WC_WR_FLUSH_ERR; + } + + entry->qp = &nesqp->ibqp; + entry->src_qp = nesqp->hwqp.qp_id; + + if (le32_to_cpu(cqe.cqe_words[NES_CQE_OPCODE_IDX]) & NES_CQE_SQ) { + if (nesqp->skip_lsmm) { + nesqp->skip_lsmm = 0; + nesqp->hwqp.sq_tail++; + } + + /* Working on a SQ Completion*/ + wrid = (((u64)(cpu_to_le32((u32)nesqp->hwqp.sq_vbase[wqe_index]. + wqe_words[NES_IWARP_SQ_WQE_COMP_SCRATCH_HIGH_IDX]))) << 32) | + ((u64)(cpu_to_le32((u32)nesqp->hwqp.sq_vbase[wqe_index]. + wqe_words[NES_IWARP_SQ_WQE_COMP_SCRATCH_LOW_IDX]))); + entry->byte_len = le32_to_cpu(nesqp->hwqp.sq_vbase[wqe_index]. + wqe_words[NES_IWARP_SQ_WQE_TOTAL_PAYLOAD_IDX]); + + switch (le32_to_cpu(nesqp->hwqp.sq_vbase[wqe_index]. + wqe_words[NES_IWARP_SQ_WQE_MISC_IDX]) & 0x3f) { + case NES_IWARP_SQ_OP_RDMAW: + nes_debug(NES_DBG_CQ, "Operation = RDMA WRITE.\n"); + entry->opcode = IB_WC_RDMA_WRITE; + break; + case NES_IWARP_SQ_OP_RDMAR: + nes_debug(NES_DBG_CQ, "Operation = RDMA READ.\n"); + entry->opcode = IB_WC_RDMA_READ; + entry->byte_len = le32_to_cpu(nesqp->hwqp.sq_vbase[wqe_index]. 
+ wqe_words[NES_IWARP_SQ_WQE_RDMA_LENGTH_IDX]); + break; + case NES_IWARP_SQ_OP_SENDINV: + case NES_IWARP_SQ_OP_SENDSEINV: + case NES_IWARP_SQ_OP_SEND: + case NES_IWARP_SQ_OP_SENDSE: + nes_debug(NES_DBG_CQ, "Operation = Send.\n"); + entry->opcode = IB_WC_SEND; + break; + case NES_IWARP_SQ_OP_LOCINV: + entry->opcode = IB_WC_LOCAL_INV; + break; + case NES_IWARP_SQ_OP_FAST_REG: + entry->opcode = IB_WC_FAST_REG_MR; + break; + } + + nesqp->hwqp.sq_tail = (wqe_index+1)&(nesqp->hwqp.sq_size - 1); + if ((entry->status != IB_WC_SUCCESS) && (nesqp->hwqp.sq_tail != nesqp->hwqp.sq_head)) { + move_cq_head = 0; + wq_tail = nesqp->hwqp.sq_tail; + } + } else { + /* Working on a RQ Completion*/ + entry->byte_len = le32_to_cpu(cqe.cqe_words[NES_CQE_PAYLOAD_LENGTH_IDX]); + wrid = ((u64)(le32_to_cpu(nesqp->hwqp.rq_vbase[wqe_index].wqe_words[NES_IWARP_RQ_WQE_COMP_SCRATCH_LOW_IDX]))) | + ((u64)(le32_to_cpu(nesqp->hwqp.rq_vbase[wqe_index].wqe_words[NES_IWARP_RQ_WQE_COMP_SCRATCH_HIGH_IDX]))<<32); + entry->opcode = IB_WC_RECV; + + nesqp->hwqp.rq_tail = (wqe_index+1)&(nesqp->hwqp.rq_size - 1); + if ((entry->status != IB_WC_SUCCESS) && (nesqp->hwqp.rq_tail != nesqp->hwqp.rq_head)) { + move_cq_head = 0; + wq_tail = nesqp->hwqp.rq_tail; + } + } + + entry->wr_id = wrid; + entry++; + cqe_count++; + } + + if (move_cq_head) { + nescq->hw_cq.cq_vbase[head].cqe_words[NES_CQE_OPCODE_IDX] = 0; + if (++head >= cq_size) + head = 0; + nescq->polled_completions++; + + if ((nescq->polled_completions > (cq_size / 2)) || + (nescq->polled_completions == 255)) { + nes_debug(NES_DBG_CQ, "CQ%u Issuing CQE Allocate since more than half of cqes" + " are pending %u of %u.\n", + nescq->hw_cq.cq_number, nescq->polled_completions, cq_size); + nes_write32(nesdev->regs+NES_CQE_ALLOC, + nescq->hw_cq.cq_number | (nescq->polled_completions << 16)); + nescq->polled_completions = 0; + } + } else { + /* Update the wqe index and set status to flush */ + wqe_index = le32_to_cpu(cqe.cqe_words[NES_CQE_COMP_COMP_CTX_LOW_IDX]); + wqe_index = (wqe_index & (~(nesdev->nesadapter->max_qp_wr - 1))) | wq_tail; + nescq->hw_cq.cq_vbase[head].cqe_words[NES_CQE_COMP_COMP_CTX_LOW_IDX] = + cpu_to_le32(wqe_index); + move_cq_head = 1; /* ready for next pass */ + } + } + + if (nescq->polled_completions) { + nes_write32(nesdev->regs+NES_CQE_ALLOC, + nescq->hw_cq.cq_number | (nescq->polled_completions << 16)); + nescq->polled_completions = 0; + } + + nescq->hw_cq.cq_head = head; + nes_debug(NES_DBG_CQ, "Reporting %u completions for CQ%u.\n", + cqe_count, nescq->hw_cq.cq_number); + + spin_unlock_irqrestore(&nescq->lock, flags); + + return cqe_count; +} + + +/** + * nes_req_notify_cq + */ +static int nes_req_notify_cq(struct ib_cq *ibcq, enum ib_cq_notify_flags notify_flags) + { + struct nes_vnic *nesvnic = to_nesvnic(ibcq->device); + struct nes_device *nesdev = nesvnic->nesdev; + struct nes_cq *nescq = to_nescq(ibcq); + u32 cq_arm; + + nes_debug(NES_DBG_CQ, "Requesting notification for CQ%u.\n", + nescq->hw_cq.cq_number); + + cq_arm = nescq->hw_cq.cq_number; + if ((notify_flags & IB_CQ_SOLICITED_MASK) == IB_CQ_NEXT_COMP) + cq_arm |= NES_CQE_ALLOC_NOTIFY_NEXT; + else if ((notify_flags & IB_CQ_SOLICITED_MASK) == IB_CQ_SOLICITED) + cq_arm |= NES_CQE_ALLOC_NOTIFY_SE; + else + return -EINVAL; + + nes_write32(nesdev->regs+NES_CQE_ALLOC, cq_arm); + nes_read32(nesdev->regs+NES_CQE_ALLOC); + + return 0; +} + + +/** + * nes_init_ofa_device + */ +struct nes_ib_device *nes_init_ofa_device(struct net_device *netdev) +{ + struct nes_ib_device *nesibdev; + struct nes_vnic *nesvnic = 
netdev_priv(netdev); + struct nes_device *nesdev = nesvnic->nesdev; + + nesibdev = (struct nes_ib_device *)ib_alloc_device(sizeof(struct nes_ib_device)); + if (nesibdev == NULL) { + return NULL; + } + strlcpy(nesibdev->ibdev.name, "nes%d", IB_DEVICE_NAME_MAX); + nesibdev->ibdev.owner = THIS_MODULE; + + nesibdev->ibdev.node_type = RDMA_NODE_RNIC; + memset(&nesibdev->ibdev.node_guid, 0, sizeof(nesibdev->ibdev.node_guid)); + memcpy(&nesibdev->ibdev.node_guid, netdev->dev_addr, 6); + + nesibdev->ibdev.uverbs_cmd_mask = + (1ull << IB_USER_VERBS_CMD_GET_CONTEXT) | + (1ull << IB_USER_VERBS_CMD_QUERY_DEVICE) | + (1ull << IB_USER_VERBS_CMD_QUERY_PORT) | + (1ull << IB_USER_VERBS_CMD_ALLOC_PD) | + (1ull << IB_USER_VERBS_CMD_DEALLOC_PD) | + (1ull << IB_USER_VERBS_CMD_REG_MR) | + (1ull << IB_USER_VERBS_CMD_DEREG_MR) | + (1ull << IB_USER_VERBS_CMD_CREATE_COMP_CHANNEL) | + (1ull << IB_USER_VERBS_CMD_CREATE_CQ) | + (1ull << IB_USER_VERBS_CMD_DESTROY_CQ) | + (1ull << IB_USER_VERBS_CMD_CREATE_AH) | + (1ull << IB_USER_VERBS_CMD_DESTROY_AH) | + (1ull << IB_USER_VERBS_CMD_REQ_NOTIFY_CQ) | + (1ull << IB_USER_VERBS_CMD_CREATE_QP) | + (1ull << IB_USER_VERBS_CMD_MODIFY_QP) | + (1ull << IB_USER_VERBS_CMD_POLL_CQ) | + (1ull << IB_USER_VERBS_CMD_DESTROY_QP) | + (1ull << IB_USER_VERBS_CMD_ALLOC_MW) | + (1ull << IB_USER_VERBS_CMD_BIND_MW) | + (1ull << IB_USER_VERBS_CMD_DEALLOC_MW) | + (1ull << IB_USER_VERBS_CMD_POST_RECV) | + (1ull << IB_USER_VERBS_CMD_POST_SEND); + + nesibdev->ibdev.phys_port_cnt = 1; + nesibdev->ibdev.num_comp_vectors = 1; + nesibdev->ibdev.dma_device = &nesdev->pcidev->dev; + nesibdev->ibdev.dev.parent = &nesdev->pcidev->dev; + nesibdev->ibdev.query_device = nes_query_device; + nesibdev->ibdev.query_port = nes_query_port; + nesibdev->ibdev.query_pkey = nes_query_pkey; + nesibdev->ibdev.query_gid = nes_query_gid; + nesibdev->ibdev.alloc_ucontext = nes_alloc_ucontext; + nesibdev->ibdev.dealloc_ucontext = nes_dealloc_ucontext; + nesibdev->ibdev.mmap = nes_mmap; + nesibdev->ibdev.alloc_pd = nes_alloc_pd; + nesibdev->ibdev.dealloc_pd = nes_dealloc_pd; + nesibdev->ibdev.create_ah = nes_create_ah; + nesibdev->ibdev.destroy_ah = nes_destroy_ah; + nesibdev->ibdev.create_qp = nes_create_qp; + nesibdev->ibdev.modify_qp = nes_modify_qp; + nesibdev->ibdev.query_qp = nes_query_qp; + nesibdev->ibdev.destroy_qp = nes_destroy_qp; + nesibdev->ibdev.create_cq = nes_create_cq; + nesibdev->ibdev.destroy_cq = nes_destroy_cq; + nesibdev->ibdev.poll_cq = nes_poll_cq; + nesibdev->ibdev.get_dma_mr = nes_get_dma_mr; + nesibdev->ibdev.reg_phys_mr = nes_reg_phys_mr; + nesibdev->ibdev.reg_user_mr = nes_reg_user_mr; + nesibdev->ibdev.dereg_mr = nes_dereg_mr; + nesibdev->ibdev.alloc_mw = nes_alloc_mw; + nesibdev->ibdev.dealloc_mw = nes_dealloc_mw; + nesibdev->ibdev.bind_mw = nes_bind_mw; + + nesibdev->ibdev.alloc_fast_reg_mr = nes_alloc_fast_reg_mr; + nesibdev->ibdev.alloc_fast_reg_page_list = nes_alloc_fast_reg_page_list; + nesibdev->ibdev.free_fast_reg_page_list = nes_free_fast_reg_page_list; + + nesibdev->ibdev.attach_mcast = nes_multicast_attach; + nesibdev->ibdev.detach_mcast = nes_multicast_detach; + nesibdev->ibdev.process_mad = nes_process_mad; + + nesibdev->ibdev.req_notify_cq = nes_req_notify_cq; + nesibdev->ibdev.post_send = nes_post_send; + nesibdev->ibdev.post_recv = nes_post_recv; + + nesibdev->ibdev.iwcm = kzalloc(sizeof(*nesibdev->ibdev.iwcm), GFP_KERNEL); + if (nesibdev->ibdev.iwcm == NULL) { + ib_dealloc_device(&nesibdev->ibdev); + return NULL; + } + nesibdev->ibdev.iwcm->add_ref = nes_add_ref; + 
nesibdev->ibdev.iwcm->rem_ref = nes_rem_ref; + nesibdev->ibdev.iwcm->get_qp = nes_get_qp; + nesibdev->ibdev.iwcm->connect = nes_connect; + nesibdev->ibdev.iwcm->accept = nes_accept; + nesibdev->ibdev.iwcm->reject = nes_reject; + nesibdev->ibdev.iwcm->create_listen = nes_create_listen; + nesibdev->ibdev.iwcm->destroy_listen = nes_destroy_listen; + + return nesibdev; +} + + +/** + * nes_handle_delayed_event + */ +static void nes_handle_delayed_event(unsigned long data) +{ + struct nes_vnic *nesvnic = (void *) data; + + if (nesvnic->delayed_event != nesvnic->last_dispatched_event) { + struct ib_event event; + + event.device = &nesvnic->nesibdev->ibdev; + if (!event.device) + goto stop_timer; + event.event = nesvnic->delayed_event; + event.element.port_num = nesvnic->logical_port + 1; + ib_dispatch_event(&event); + } + +stop_timer: + nesvnic->event_timer.function = NULL; +} + + +void nes_port_ibevent(struct nes_vnic *nesvnic) +{ + struct nes_ib_device *nesibdev = nesvnic->nesibdev; + struct nes_device *nesdev = nesvnic->nesdev; + struct ib_event event; + event.device = &nesibdev->ibdev; + event.element.port_num = nesvnic->logical_port + 1; + event.event = nesdev->iw_status ? IB_EVENT_PORT_ACTIVE : IB_EVENT_PORT_ERR; + + if (!nesvnic->event_timer.function) { + ib_dispatch_event(&event); + nesvnic->last_dispatched_event = event.event; + nesvnic->event_timer.function = nes_handle_delayed_event; + nesvnic->event_timer.data = (unsigned long) nesvnic; + nesvnic->event_timer.expires = jiffies + NES_EVENT_DELAY; + add_timer(&nesvnic->event_timer); + } else { + mod_timer(&nesvnic->event_timer, jiffies + NES_EVENT_DELAY); + } + nesvnic->delayed_event = event.event; +} + + +/** + * nes_destroy_ofa_device + */ +void nes_destroy_ofa_device(struct nes_ib_device *nesibdev) +{ + if (nesibdev == NULL) + return; + + nes_unregister_ofa_device(nesibdev); + + kfree(nesibdev->ibdev.iwcm); + ib_dealloc_device(&nesibdev->ibdev); +} + + +/** + * nes_register_ofa_device + */ +int nes_register_ofa_device(struct nes_ib_device *nesibdev) +{ + struct nes_vnic *nesvnic = nesibdev->nesvnic; + struct nes_device *nesdev = nesvnic->nesdev; + struct nes_adapter *nesadapter = nesdev->nesadapter; + int i, ret; + + ret = ib_register_device(&nesvnic->nesibdev->ibdev, NULL); + if (ret) { + return ret; + } + + /* Get the resources allocated to this device */ + nesibdev->max_cq = (nesadapter->max_cq-NES_FIRST_QPN) / nesadapter->port_count; + nesibdev->max_mr = nesadapter->max_mr / nesadapter->port_count; + nesibdev->max_qp = (nesadapter->max_qp-NES_FIRST_QPN) / nesadapter->port_count; + nesibdev->max_pd = nesadapter->max_pd / nesadapter->port_count; + + for (i = 0; i < ARRAY_SIZE(nes_dev_attributes); ++i) { + ret = device_create_file(&nesibdev->ibdev.dev, nes_dev_attributes[i]); + if (ret) { + while (i > 0) { + i--; + device_remove_file(&nesibdev->ibdev.dev, + nes_dev_attributes[i]); + } + ib_unregister_device(&nesibdev->ibdev); + return ret; + } + } + + nesvnic->of_device_registered = 1; + + return 0; +} + + +/** + * nes_unregister_ofa_device + */ +static void nes_unregister_ofa_device(struct nes_ib_device *nesibdev) +{ + struct nes_vnic *nesvnic = nesibdev->nesvnic; + int i; + + for (i = 0; i < ARRAY_SIZE(nes_dev_attributes); ++i) { + device_remove_file(&nesibdev->ibdev.dev, nes_dev_attributes[i]); + } + + if (nesvnic->of_device_registered) { + ib_unregister_device(&nesibdev->ibdev); + } + + nesvnic->of_device_registered = 0; +} diff --git a/kernel/drivers/infiniband/hw/nes/nes_verbs.h b/kernel/drivers/infiniband/hw/nes/nes_verbs.h 
new file mode 100644 index 000000000..309b31c31 --- /dev/null +++ b/kernel/drivers/infiniband/hw/nes/nes_verbs.h @@ -0,0 +1,189 @@ +/* + * Copyright (c) 2006 - 2011 Intel Corporation. All rights reserved. + * Copyright (c) 2005 Open Grid Computing, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ + +#ifndef NES_VERBS_H +#define NES_VERBS_H + +struct nes_device; + +#define NES_MAX_USER_DB_REGIONS 4096 +#define NES_MAX_USER_WQ_REGIONS 4096 + +#define NES_TERM_SENT 0x01 +#define NES_TERM_RCVD 0x02 +#define NES_TERM_DONE 0x04 + +struct nes_ucontext { + struct ib_ucontext ibucontext; + struct nes_device *nesdev; + unsigned long mmap_wq_offset; + unsigned long mmap_cq_offset; /* to be removed */ + int index; /* rnic index (minor) */ + unsigned long allocated_doorbells[BITS_TO_LONGS(NES_MAX_USER_DB_REGIONS)]; + u16 mmap_db_index[NES_MAX_USER_DB_REGIONS]; + u16 first_free_db; + unsigned long allocated_wqs[BITS_TO_LONGS(NES_MAX_USER_WQ_REGIONS)]; + struct nes_qp *mmap_nesqp[NES_MAX_USER_WQ_REGIONS]; + u16 first_free_wq; + struct list_head cq_reg_mem_list; + struct list_head qp_reg_mem_list; + u32 mcrqf; + atomic_t usecnt; +}; + +struct nes_pd { + struct ib_pd ibpd; + u16 pd_id; + atomic_t sqp_count; + u16 mmap_db_index; +}; + +struct nes_mr { + union { + struct ib_mr ibmr; + struct ib_mw ibmw; + struct ib_fmr ibfmr; + }; + struct ib_umem *region; + u16 pbls_used; + u8 mode; + u8 pbl_4k; +}; + +struct nes_hw_pb { + __le32 pa_low; + __le32 pa_high; +}; + +struct nes_vpbl { + dma_addr_t pbl_pbase; + struct nes_hw_pb *pbl_vbase; +}; + +struct nes_root_vpbl { + dma_addr_t pbl_pbase; + struct nes_hw_pb *pbl_vbase; + struct nes_vpbl *leaf_vpbl; +}; + +struct nes_fmr { + struct nes_mr nesmr; + u32 leaf_pbl_cnt; + struct nes_root_vpbl root_vpbl; + struct ib_qp *ib_qp; + int access_rights; + struct ib_fmr_attr attr; +}; + +struct nes_av; + +struct nes_cq { + struct ib_cq ibcq; + struct nes_hw_cq hw_cq; + u32 polled_completions; + u32 cq_mem_size; + spinlock_t lock; + u8 virtual_cq; + u8 pad[3]; + u32 mcrqf; +}; + +struct nes_wq { + spinlock_t lock; +}; + +struct disconn_work { + struct work_struct work; + struct nes_qp *nesqp; +}; + +struct iw_cm_id; +struct ietf_mpa_frame; + +struct nes_qp { + struct ib_qp ibqp; + 
void *allocated_buffer;
+	struct iw_cm_id *cm_id;
+	struct nes_cq *nesscq;
+	struct nes_cq *nesrcq;
+	struct nes_pd *nespd;
+	void *cm_node; /* handle of the node this QP is associated with */
+	void *ietf_frame;
+	u8 ietf_frame_size;
+	dma_addr_t ietf_frame_pbase;
+	struct ib_mr *lsmm_mr;
+	struct nes_hw_qp hwqp;
+	struct work_struct work;
+	enum ib_qp_state ibqp_state;
+	u32 iwarp_state;
+	u32 hte_index;
+	u32 last_aeq;
+	u32 qp_mem_size;
+	atomic_t refcount;
+	atomic_t close_timer_started;
+	u32 mmap_sq_db_index;
+	u32 mmap_rq_db_index;
+	spinlock_t lock;
+	spinlock_t pau_lock;
+	struct nes_qp_context *nesqp_context;
+	dma_addr_t nesqp_context_pbase;
+	void *pbl_vbase;
+	dma_addr_t pbl_pbase;
+	struct page *page;
+	struct timer_list terminate_timer;
+	enum ib_event_type terminate_eventtype;
+	struct sk_buff_head pau_list;
+	u32 pau_rcv_nxt;
+	u16 active_conn:1;
+	u16 skip_lsmm:1;
+	u16 user_mode:1;
+	u16 hte_added:1;
+	u16 flush_issued:1;
+	u16 destroyed:1;
+	u16 sig_all:1;
+	u16 pau_mode:1;
+	u16 rsvd:8;
+	u16 private_data_len;
+	u16 term_sq_flush_code;
+	u16 term_rq_flush_code;
+	u8 hw_iwarp_state;
+	u8 hw_tcp_state;
+	u8 term_flags;
+	u8 sq_kmapped;
+	u8 pau_busy;
+	u8 pau_pending;
+	u8 pau_state;
+	__u64 nesuqp_addr;
+};
+#endif /* NES_VERBS_H */
diff --git a/kernel/drivers/infiniband/hw/ocrdma/Kconfig b/kernel/drivers/infiniband/hw/ocrdma/Kconfig
new file mode 100644
index 000000000..c0cddc019
--- /dev/null
+++ b/kernel/drivers/infiniband/hw/ocrdma/Kconfig
@@ -0,0 +1,8 @@
+config INFINIBAND_OCRDMA
+	tristate "Emulex One Connect HCA support"
+	depends on ETHERNET && NETDEVICES && PCI && INET && (IPV6 || IPV6=n)
+	select NET_VENDOR_EMULEX
+	select BE2NET
+	---help---
+	  This driver provides low-level InfiniBand over Ethernet
+	  support for Emulex One Connect host channel adapters (HCAs).
diff --git a/kernel/drivers/infiniband/hw/ocrdma/Makefile b/kernel/drivers/infiniband/hw/ocrdma/Makefile
new file mode 100644
index 000000000..d1bfd4f4c
--- /dev/null
+++ b/kernel/drivers/infiniband/hw/ocrdma/Makefile
@@ -0,0 +1,5 @@
+ccflags-y := -Idrivers/net/ethernet/emulex/benet
+
+obj-$(CONFIG_INFINIBAND_OCRDMA) += ocrdma.o
+
+ocrdma-y := ocrdma_main.o ocrdma_verbs.o ocrdma_hw.o ocrdma_ah.o ocrdma_stats.o
diff --git a/kernel/drivers/infiniband/hw/ocrdma/ocrdma.h b/kernel/drivers/infiniband/hw/ocrdma/ocrdma.h
new file mode 100644
index 000000000..b396344fa
--- /dev/null
+++ b/kernel/drivers/infiniband/hw/ocrdma/ocrdma.h
@@ -0,0 +1,579 @@
+/*******************************************************************
+ * This file is part of the Emulex RoCE Device Driver for *
+ * RoCE (RDMA over Converged Ethernet) adapters. *
+ * Copyright (C) 2008-2012 Emulex. All rights reserved. *
+ * EMULEX and SLI are trademarks of Emulex. *
+ * www.emulex.com *
+ * *
+ * This program is free software; you can redistribute it and/or *
+ * modify it under the terms of version 2 of the GNU General *
+ * Public License as published by the Free Software Foundation. *
+ * This program is distributed in the hope that it will be useful. *
+ * ALL EXPRESS OR IMPLIED CONDITIONS, REPRESENTATIONS AND *
+ * WARRANTIES, INCLUDING ANY IMPLIED WARRANTY OF MERCHANTABILITY, *
+ * FITNESS FOR A PARTICULAR PURPOSE, OR NON-INFRINGEMENT, ARE *
+ * DISCLAIMED, EXCEPT TO THE EXTENT THAT SUCH DISCLAIMERS ARE HELD *
+ * TO BE LEGALLY INVALID. See the GNU General Public License for *
+ * more details, a copy of which can be found in the file COPYING *
+ * included with this package.
* + * + * Contact Information: + * linux-drivers@emulex.com + * + * Emulex + * 3333 Susan Street + * Costa Mesa, CA 92626 + *******************************************************************/ + +#ifndef __OCRDMA_H__ +#define __OCRDMA_H__ + +#include +#include +#include +#include + +#include +#include +#include + +#include +#include "ocrdma_sli.h" + +#define OCRDMA_ROCE_DRV_VERSION "10.6.0.0" + +#define OCRDMA_ROCE_DRV_DESC "Emulex OneConnect RoCE Driver" +#define OCRDMA_NODE_DESC "Emulex OneConnect RoCE HCA" + +#define OC_NAME_SH OCRDMA_NODE_DESC "(Skyhawk)" +#define OC_NAME_UNKNOWN OCRDMA_NODE_DESC "(Unknown)" + +#define OC_SKH_DEVICE_PF 0x720 +#define OC_SKH_DEVICE_VF 0x728 +#define OCRDMA_MAX_AH 512 + +#define OCRDMA_UVERBS(CMD_NAME) (1ull << IB_USER_VERBS_CMD_##CMD_NAME) + +#define convert_to_64bit(lo, hi) ((u64)hi << 32 | (u64)lo) +#define EQ_INTR_PER_SEC_THRSH_HI 150000 +#define EQ_INTR_PER_SEC_THRSH_LOW 100000 +#define EQ_AIC_MAX_EQD 20 +#define EQ_AIC_MIN_EQD 0 + +void ocrdma_eqd_set_task(struct work_struct *work); + +struct ocrdma_dev_attr { + u8 fw_ver[32]; + u32 vendor_id; + u32 device_id; + u16 max_pd; + u16 max_dpp_pds; + u16 max_cq; + u16 max_cqe; + u16 max_qp; + u16 max_wqe; + u16 max_rqe; + u16 max_srq; + u32 max_inline_data; + int max_send_sge; + int max_recv_sge; + int max_srq_sge; + int max_rdma_sge; + int max_mr; + u64 max_mr_size; + u32 max_num_mr_pbl; + int max_mw; + int max_fmr; + int max_map_per_fmr; + int max_pages_per_frmr; + u16 max_ord_per_qp; + u16 max_ird_per_qp; + + int device_cap_flags; + u8 cq_overflow_detect; + u8 srq_supported; + + u32 wqe_size; + u32 rqe_size; + u32 ird_page_size; + u8 local_ca_ack_delay; + u8 ird; + u8 num_ird_pages; +}; + +struct ocrdma_dma_mem { + void *va; + dma_addr_t pa; + u32 size; +}; + +struct ocrdma_pbl { + void *va; + dma_addr_t pa; +}; + +struct ocrdma_queue_info { + void *va; + dma_addr_t dma; + u32 size; + u16 len; + u16 entry_size; /* Size of an element in the queue */ + u16 id; /* qid, where to ring the doorbell. 
*/ + u16 head, tail; + bool created; +}; + +struct ocrdma_aic_obj { /* Adaptive interrupt coalescing (AIC) info */ + u32 prev_eqd; + u64 eq_intr_cnt; + u64 prev_eq_intr_cnt; +}; + +struct ocrdma_eq { + struct ocrdma_queue_info q; + u32 vector; + int cq_cnt; + struct ocrdma_dev *dev; + char irq_name[32]; + struct ocrdma_aic_obj aic_obj; +}; + +struct ocrdma_mq { + struct ocrdma_queue_info sq; + struct ocrdma_queue_info cq; + bool rearm_cq; +}; + +struct mqe_ctx { + struct mutex lock; /* for serializing mailbox commands on MQ */ + wait_queue_head_t cmd_wait; + u32 tag; + u16 cqe_status; + u16 ext_status; + bool cmd_done; + bool fw_error_state; +}; + +struct ocrdma_hw_mr { + u32 lkey; + u8 fr_mr; + u8 remote_atomic; + u8 remote_rd; + u8 remote_wr; + u8 local_rd; + u8 local_wr; + u8 mw_bind; + u8 rsvd; + u64 len; + struct ocrdma_pbl *pbl_table; + u32 num_pbls; + u32 num_pbes; + u32 pbl_size; + u32 pbe_size; + u64 fbo; + u64 va; +}; + +struct ocrdma_mr { + struct ib_mr ibmr; + struct ib_umem *umem; + struct ocrdma_hw_mr hwmr; +}; + +struct ocrdma_stats { + u8 type; + struct ocrdma_dev *dev; +}; + +struct ocrdma_pd_resource_mgr { + u32 pd_norm_start; + u16 pd_norm_count; + u16 pd_norm_thrsh; + u16 max_normal_pd; + u32 pd_dpp_start; + u16 pd_dpp_count; + u16 pd_dpp_thrsh; + u16 max_dpp_pd; + u16 dpp_page_index; + unsigned long *pd_norm_bitmap; + unsigned long *pd_dpp_bitmap; + bool pd_prealloc_valid; +}; + +struct stats_mem { + struct ocrdma_mqe mqe; + void *va; + dma_addr_t pa; + u32 size; + char *debugfs_mem; +}; + +struct phy_info { + u16 auto_speeds_supported; + u16 fixed_speeds_supported; + u16 phy_type; + u16 interface_type; +}; + +struct ocrdma_dev { + struct ib_device ibdev; + struct ocrdma_dev_attr attr; + + struct mutex dev_lock; /* provides syncronise access to device data */ + spinlock_t flush_q_lock ____cacheline_aligned; + + struct ocrdma_cq **cq_tbl; + struct ocrdma_qp **qp_tbl; + + struct ocrdma_eq *eq_tbl; + int eq_cnt; + struct delayed_work eqd_work; + u16 base_eqid; + u16 max_eq; + + union ib_gid *sgid_tbl; + /* provided synchronization to sgid table for + * updating gid entries triggered by notifier. + */ + spinlock_t sgid_lock; + + int gsi_qp_created; + struct ocrdma_cq *gsi_sqcq; + struct ocrdma_cq *gsi_rqcq; + + struct { + struct ocrdma_av *va; + dma_addr_t pa; + u32 size; + u32 num_ah; + /* provide synchronization for av + * entry allocations. 
+ */ + spinlock_t lock; + u32 ahid; + struct ocrdma_pbl pbl; + } av_tbl; + + void *mbx_cmd; + struct ocrdma_mq mq; + struct mqe_ctx mqe_ctx; + + struct be_dev_info nic_info; + struct phy_info phy; + char model_number[32]; + u32 hba_port_num; + + struct list_head entry; + struct rcu_head rcu; + int id; + u64 *stag_arr; + u8 sl; /* service level */ + bool pfc_state; + atomic_t update_sl; + u16 pvid; + u32 asic_id; + + ulong last_stats_time; + struct mutex stats_lock; /* provide synch for debugfs operations */ + struct stats_mem stats_mem; + struct ocrdma_stats rsrc_stats; + struct ocrdma_stats rx_stats; + struct ocrdma_stats wqe_stats; + struct ocrdma_stats tx_stats; + struct ocrdma_stats db_err_stats; + struct ocrdma_stats tx_qp_err_stats; + struct ocrdma_stats rx_qp_err_stats; + struct ocrdma_stats tx_dbg_stats; + struct ocrdma_stats rx_dbg_stats; + struct ocrdma_stats driver_stats; + struct ocrdma_stats reset_stats; + struct dentry *dir; + atomic_t async_err_stats[OCRDMA_MAX_ASYNC_ERRORS]; + atomic_t cqe_err_stats[OCRDMA_MAX_CQE_ERR]; + struct ocrdma_pd_resource_mgr *pd_mgr; +}; + +struct ocrdma_cq { + struct ib_cq ibcq; + struct ocrdma_cqe *va; + u32 phase; + u32 getp; /* pointer to pending wrs to + * return to stack, wrap arounds + * at max_hw_cqe + */ + u32 max_hw_cqe; + bool phase_change; + bool deferred_arm, deferred_sol; + bool first_arm; + + spinlock_t cq_lock ____cacheline_aligned; /* provide synchronization + * to cq polling + */ + /* syncronizes cq completion handler invoked from multiple context */ + spinlock_t comp_handler_lock ____cacheline_aligned; + u16 id; + u16 eqn; + + struct ocrdma_ucontext *ucontext; + dma_addr_t pa; + u32 len; + u32 cqe_cnt; + + /* head of all qp's sq and rq for which cqes need to be flushed + * by the software. + */ + struct list_head sq_head, rq_head; +}; + +struct ocrdma_pd { + struct ib_pd ibpd; + struct ocrdma_ucontext *uctx; + u32 id; + int num_dpp_qp; + u32 dpp_page; + bool dpp_enabled; +}; + +struct ocrdma_ah { + struct ib_ah ibah; + struct ocrdma_av *av; + u16 sgid_index; + u32 id; +}; + +struct ocrdma_qp_hwq_info { + u8 *va; /* virtual address */ + u32 max_sges; + u32 head, tail; + u32 entry_size; + u32 max_cnt; + u32 max_wqe_idx; + u16 dbid; /* qid, where to ring the doorbell. 
*/ + u32 len; + dma_addr_t pa; +}; + +struct ocrdma_srq { + struct ib_srq ibsrq; + u8 __iomem *db; + struct ocrdma_qp_hwq_info rq; + u64 *rqe_wr_id_tbl; + u32 *idx_bit_fields; + u32 bit_fields_len; + + /* provide synchronization to multiple context(s) posting rqe */ + spinlock_t q_lock ____cacheline_aligned; + + struct ocrdma_pd *pd; + u32 id; +}; + +struct ocrdma_qp { + struct ib_qp ibqp; + + u8 __iomem *sq_db; + struct ocrdma_qp_hwq_info sq; + struct { + uint64_t wrid; + uint16_t dpp_wqe_idx; + uint16_t dpp_wqe; + uint8_t signaled; + uint8_t rsvd[3]; + } *wqe_wr_id_tbl; + u32 max_inline_data; + + /* provide synchronization to multiple context(s) posting wqe, rqe */ + spinlock_t q_lock ____cacheline_aligned; + struct ocrdma_cq *sq_cq; + /* list maintained per CQ to flush SQ errors */ + struct list_head sq_entry; + + u8 __iomem *rq_db; + struct ocrdma_qp_hwq_info rq; + u64 *rqe_wr_id_tbl; + struct ocrdma_cq *rq_cq; + struct ocrdma_srq *srq; + /* list maintained per CQ to flush RQ errors */ + struct list_head rq_entry; + + enum ocrdma_qp_state state; /* QP state */ + int cap_flags; + u32 max_ord, max_ird; + + u32 id; + struct ocrdma_pd *pd; + + enum ib_qp_type qp_type; + + int sgid_idx; + u32 qkey; + bool dpp_enabled; + u8 *ird_q_va; + bool signaled; +}; + +struct ocrdma_ucontext { + struct ib_ucontext ibucontext; + + struct list_head mm_head; + struct mutex mm_list_lock; /* protects list entries of mm type */ + struct ocrdma_pd *cntxt_pd; + int pd_in_use; + + struct { + u32 *va; + dma_addr_t pa; + u32 len; + } ah_tbl; +}; + +struct ocrdma_mm { + struct { + u64 phy_addr; + unsigned long len; + } key; + struct list_head entry; +}; + +static inline struct ocrdma_dev *get_ocrdma_dev(struct ib_device *ibdev) +{ + return container_of(ibdev, struct ocrdma_dev, ibdev); +} + +static inline struct ocrdma_ucontext *get_ocrdma_ucontext(struct ib_ucontext + *ibucontext) +{ + return container_of(ibucontext, struct ocrdma_ucontext, ibucontext); +} + +static inline struct ocrdma_pd *get_ocrdma_pd(struct ib_pd *ibpd) +{ + return container_of(ibpd, struct ocrdma_pd, ibpd); +} + +static inline struct ocrdma_cq *get_ocrdma_cq(struct ib_cq *ibcq) +{ + return container_of(ibcq, struct ocrdma_cq, ibcq); +} + +static inline struct ocrdma_qp *get_ocrdma_qp(struct ib_qp *ibqp) +{ + return container_of(ibqp, struct ocrdma_qp, ibqp); +} + +static inline struct ocrdma_mr *get_ocrdma_mr(struct ib_mr *ibmr) +{ + return container_of(ibmr, struct ocrdma_mr, ibmr); +} + +static inline struct ocrdma_ah *get_ocrdma_ah(struct ib_ah *ibah) +{ + return container_of(ibah, struct ocrdma_ah, ibah); +} + +static inline struct ocrdma_srq *get_ocrdma_srq(struct ib_srq *ibsrq) +{ + return container_of(ibsrq, struct ocrdma_srq, ibsrq); +} + +static inline int is_cqe_valid(struct ocrdma_cq *cq, struct ocrdma_cqe *cqe) +{ + int cqe_valid; + cqe_valid = le32_to_cpu(cqe->flags_status_srcqpn) & OCRDMA_CQE_VALID; + return (cqe_valid == cq->phase); +} + +static inline int is_cqe_for_sq(struct ocrdma_cqe *cqe) +{ + return (le32_to_cpu(cqe->flags_status_srcqpn) & + OCRDMA_CQE_QTYPE) ? 0 : 1; +} + +static inline int is_cqe_invalidated(struct ocrdma_cqe *cqe) +{ + return (le32_to_cpu(cqe->flags_status_srcqpn) & + OCRDMA_CQE_INVALIDATE) ? 1 : 0; +} + +static inline int is_cqe_imm(struct ocrdma_cqe *cqe) +{ + return (le32_to_cpu(cqe->flags_status_srcqpn) & + OCRDMA_CQE_IMM) ? 1 : 0; +} + +static inline int is_cqe_wr_imm(struct ocrdma_cqe *cqe) +{ + return (le32_to_cpu(cqe->flags_status_srcqpn) & + OCRDMA_CQE_WRITE_IMM) ? 
1 : 0; +} + +static inline int ocrdma_resolve_dmac(struct ocrdma_dev *dev, + struct ib_ah_attr *ah_attr, u8 *mac_addr) +{ + struct in6_addr in6; + + memcpy(&in6, ah_attr->grh.dgid.raw, sizeof(in6)); + if (rdma_is_multicast_addr(&in6)) + rdma_get_mcast_mac(&in6, mac_addr); + else if (rdma_link_local_addr(&in6)) + rdma_get_ll_mac(&in6, mac_addr); + else + memcpy(mac_addr, ah_attr->dmac, ETH_ALEN); + return 0; +} + +static inline char *hca_name(struct ocrdma_dev *dev) +{ + switch (dev->nic_info.pdev->device) { + case OC_SKH_DEVICE_PF: + case OC_SKH_DEVICE_VF: + return OC_NAME_SH; + default: + return OC_NAME_UNKNOWN; + } +} + +static inline int ocrdma_get_eq_table_index(struct ocrdma_dev *dev, + int eqid) +{ + int indx; + + for (indx = 0; indx < dev->eq_cnt; indx++) { + if (dev->eq_tbl[indx].q.id == eqid) + return indx; + } + + return -EINVAL; +} + +static inline u8 ocrdma_get_asic_type(struct ocrdma_dev *dev) +{ + if (dev->nic_info.dev_family == 0xF && !dev->asic_id) { + pci_read_config_dword( + dev->nic_info.pdev, + OCRDMA_SLI_ASIC_ID_OFFSET, &dev->asic_id); + } + + return (dev->asic_id & OCRDMA_SLI_ASIC_GEN_NUM_MASK) >> + OCRDMA_SLI_ASIC_GEN_NUM_SHIFT; +} + +static inline u8 ocrdma_get_pfc_prio(u8 *pfc, u8 prio) +{ + return *(pfc + prio); +} + +static inline u8 ocrdma_get_app_prio(u8 *app_prio, u8 prio) +{ + return *(app_prio + prio); +} + +static inline u8 ocrdma_is_enabled_and_synced(u32 state) +{ /* May also be used to interpret TC-state, QCN-state + * Appl-state and Logical-link-state in future. + */ + return (state & OCRDMA_STATE_FLAG_ENABLED) && + (state & OCRDMA_STATE_FLAG_SYNC); +} + +#endif diff --git a/kernel/drivers/infiniband/hw/ocrdma/ocrdma_abi.h b/kernel/drivers/infiniband/hw/ocrdma/ocrdma_abi.h new file mode 100644 index 000000000..1554cca57 --- /dev/null +++ b/kernel/drivers/infiniband/hw/ocrdma/ocrdma_abi.h @@ -0,0 +1,134 @@ +/******************************************************************* + * This file is part of the Emulex RoCE Device Driver for * + * RoCE (RDMA over Converged Ethernet) adapters. * + * Copyright (C) 2008-2012 Emulex. All rights reserved. * + * EMULEX and SLI are trademarks of Emulex. * + * www.emulex.com * + * * + * This program is free software; you can redistribute it and/or * + * modify it under the terms of version 2 of the GNU General * + * Public License as published by the Free Software Foundation. * + * This program is distributed in the hope that it will be useful. * + * ALL EXPRESS OR IMPLIED CONDITIONS, REPRESENTATIONS AND * + * WARRANTIES, INCLUDING ANY IMPLIED WARRANTY OF MERCHANTABILITY, * + * FITNESS FOR A PARTICULAR PURPOSE, OR NON-INFRINGEMENT, ARE * + * DISCLAIMED, EXCEPT TO THE EXTENT THAT SUCH DISCLAIMERS ARE HELD * + * TO BE LEGALLY INVALID. See the GNU General Public License for * + * more details, a copy of which can be found in the file COPYING * + * included with this package. * + * + * Contact Information: + * linux-drivers@emulex.com + * + * Emulex + * 3333 Susan Street + * Costa Mesa, CA 92626 + *******************************************************************/ + +#ifndef __OCRDMA_ABI_H__ +#define __OCRDMA_ABI_H__ + +#define OCRDMA_ABI_VERSION 2 +#define OCRDMA_BE_ROCE_ABI_VERSION 1 +/* user kernel communication data structures. 
*/ + +struct ocrdma_alloc_ucontext_resp { + u32 dev_id; + u32 wqe_size; + u32 max_inline_data; + u32 dpp_wqe_size; + u64 ah_tbl_page; + u32 ah_tbl_len; + u32 rqe_size; + u8 fw_ver[32]; + /* for future use/new features in progress */ + u64 rsvd1; + u64 rsvd2; +}; + +struct ocrdma_alloc_pd_ureq { + u64 rsvd1; +}; + +struct ocrdma_alloc_pd_uresp { + u32 id; + u32 dpp_enabled; + u32 dpp_page_addr_hi; + u32 dpp_page_addr_lo; + u64 rsvd1; +}; + +struct ocrdma_create_cq_ureq { + u32 dpp_cq; + u32 rsvd; /* pad */ +}; + +#define MAX_CQ_PAGES 8 +struct ocrdma_create_cq_uresp { + u32 cq_id; + u32 page_size; + u32 num_pages; + u32 max_hw_cqe; + u64 page_addr[MAX_CQ_PAGES]; + u64 db_page_addr; + u32 db_page_size; + u32 phase_change; + /* for future use/new features in progress */ + u64 rsvd1; + u64 rsvd2; +}; + +#define MAX_QP_PAGES 8 +#define MAX_UD_AV_PAGES 8 + +struct ocrdma_create_qp_ureq { + u8 enable_dpp_cq; + u8 rsvd; + u16 dpp_cq_id; + u32 rsvd1; /* pad */ +}; + +struct ocrdma_create_qp_uresp { + u16 qp_id; + u16 sq_dbid; + u16 rq_dbid; + u16 resv0; /* pad */ + u32 sq_page_size; + u32 rq_page_size; + u32 num_sq_pages; + u32 num_rq_pages; + u64 sq_page_addr[MAX_QP_PAGES]; + u64 rq_page_addr[MAX_QP_PAGES]; + u64 db_page_addr; + u32 db_page_size; + u32 dpp_credit; + u32 dpp_offset; + u32 num_wqe_allocated; + u32 num_rqe_allocated; + u32 db_sq_offset; + u32 db_rq_offset; + u32 db_shift; + u64 rsvd[11]; +} __packed; + +struct ocrdma_create_srq_uresp { + u16 rq_dbid; + u16 resv0; /* pad */ + u32 resv1; + + u32 rq_page_size; + u32 num_rq_pages; + + u64 rq_page_addr[MAX_QP_PAGES]; + u64 db_page_addr; + + u32 db_page_size; + u32 num_rqe_allocated; + u32 db_rq_offset; + u32 db_shift; + + u64 rsvd2; + u64 rsvd3; +}; + +#endif /* __OCRDMA_ABI_H__ */ diff --git a/kernel/drivers/infiniband/hw/ocrdma/ocrdma_ah.c b/kernel/drivers/infiniband/hw/ocrdma/ocrdma_ah.c new file mode 100644 index 000000000..f5a5ea836 --- /dev/null +++ b/kernel/drivers/infiniband/hw/ocrdma/ocrdma_ah.c @@ -0,0 +1,227 @@ +/******************************************************************* + * This file is part of the Emulex RoCE Device Driver for * + * RoCE (RDMA over Converged Ethernet) adapters. * + * Copyright (C) 2008-2012 Emulex. All rights reserved. * + * EMULEX and SLI are trademarks of Emulex. * + * www.emulex.com * + * * + * This program is free software; you can redistribute it and/or * + * modify it under the terms of version 2 of the GNU General * + * Public License as published by the Free Software Foundation. * + * This program is distributed in the hope that it will be useful. * + * ALL EXPRESS OR IMPLIED CONDITIONS, REPRESENTATIONS AND * + * WARRANTIES, INCLUDING ANY IMPLIED WARRANTY OF MERCHANTABILITY, * + * FITNESS FOR A PARTICULAR PURPOSE, OR NON-INFRINGEMENT, ARE * + * DISCLAIMED, EXCEPT TO THE EXTENT THAT SUCH DISCLAIMERS ARE HELD * + * TO BE LEGALLY INVALID. See the GNU General Public License for * + * more details, a copy of which can be found in the file COPYING * + * included with this package. 
* + * + * Contact Information: + * linux-drivers@emulex.com + * + * Emulex + * 3333 Susan Street + * Costa Mesa, CA 92626 + *******************************************************************/ + +#include +#include + +#include +#include + +#include "ocrdma.h" +#include "ocrdma_verbs.h" +#include "ocrdma_ah.h" +#include "ocrdma_hw.h" +#include "ocrdma_stats.h" + +#define OCRDMA_VID_PCP_SHIFT 0xD + +static inline int set_av_attr(struct ocrdma_dev *dev, struct ocrdma_ah *ah, + struct ib_ah_attr *attr, union ib_gid *sgid, + int pdid, bool *isvlan) +{ + int status = 0; + u16 vlan_tag; + struct ocrdma_eth_vlan eth; + struct ocrdma_grh grh; + int eth_sz; + + memset(ð, 0, sizeof(eth)); + memset(&grh, 0, sizeof(grh)); + + /* VLAN */ + vlan_tag = attr->vlan_id; + if (!vlan_tag || (vlan_tag > 0xFFF)) + vlan_tag = dev->pvid; + if (vlan_tag || dev->pfc_state) { + if (!vlan_tag) { + pr_err("ocrdma%d:Using VLAN with PFC is recommended\n", + dev->id); + pr_err("ocrdma%d:Using VLAN 0 for this connection\n", + dev->id); + } + eth.eth_type = cpu_to_be16(0x8100); + eth.roce_eth_type = cpu_to_be16(OCRDMA_ROCE_ETH_TYPE); + vlan_tag |= (dev->sl & 0x07) << OCRDMA_VID_PCP_SHIFT; + eth.vlan_tag = cpu_to_be16(vlan_tag); + eth_sz = sizeof(struct ocrdma_eth_vlan); + *isvlan = true; + } else { + eth.eth_type = cpu_to_be16(OCRDMA_ROCE_ETH_TYPE); + eth_sz = sizeof(struct ocrdma_eth_basic); + } + /* MAC */ + memcpy(ð.smac[0], &dev->nic_info.mac_addr[0], ETH_ALEN); + status = ocrdma_resolve_dmac(dev, attr, ð.dmac[0]); + if (status) + return status; + ah->sgid_index = attr->grh.sgid_index; + memcpy(&grh.sgid[0], sgid->raw, sizeof(union ib_gid)); + memcpy(&grh.dgid[0], attr->grh.dgid.raw, sizeof(attr->grh.dgid.raw)); + + grh.tclass_flow = cpu_to_be32((6 << 28) | + (attr->grh.traffic_class << 24) | + attr->grh.flow_label); + /* 0x1b is next header value in GRH */ + grh.pdid_hoplimit = cpu_to_be32((pdid << 16) | + (0x1b << 8) | attr->grh.hop_limit); + /* Eth HDR */ + memcpy(&ah->av->eth_hdr, ð, eth_sz); + memcpy((u8 *)ah->av + eth_sz, &grh, sizeof(struct ocrdma_grh)); + if (*isvlan) + ah->av->valid |= OCRDMA_AV_VLAN_VALID; + ah->av->valid = cpu_to_le32(ah->av->valid); + return status; +} + +struct ib_ah *ocrdma_create_ah(struct ib_pd *ibpd, struct ib_ah_attr *attr) +{ + u32 *ahid_addr; + bool isvlan = false; + int status; + struct ocrdma_ah *ah; + struct ocrdma_pd *pd = get_ocrdma_pd(ibpd); + struct ocrdma_dev *dev = get_ocrdma_dev(ibpd->device); + union ib_gid sgid; + + if (!(attr->ah_flags & IB_AH_GRH)) + return ERR_PTR(-EINVAL); + + if (atomic_cmpxchg(&dev->update_sl, 1, 0)) + ocrdma_init_service_level(dev); + ah = kzalloc(sizeof(*ah), GFP_ATOMIC); + if (!ah) + return ERR_PTR(-ENOMEM); + + status = ocrdma_alloc_av(dev, ah); + if (status) + goto av_err; + + status = ocrdma_query_gid(&dev->ibdev, 1, attr->grh.sgid_index, &sgid); + if (status) { + pr_err("%s(): Failed to query sgid, status = %d\n", + __func__, status); + goto av_conf_err; + } + + if ((pd->uctx) && + (!rdma_is_multicast_addr((struct in6_addr *)attr->grh.dgid.raw)) && + (!rdma_link_local_addr((struct in6_addr *)attr->grh.dgid.raw))) { + status = rdma_addr_find_dmac_by_grh(&sgid, &attr->grh.dgid, + attr->dmac, &attr->vlan_id); + if (status) { + pr_err("%s(): Failed to resolve dmac from gid." 
+ "status = %d\n", __func__, status); + goto av_conf_err; + } + } + + status = set_av_attr(dev, ah, attr, &sgid, pd->id, &isvlan); + if (status) + goto av_conf_err; + + /* if pd is for the user process, pass the ah_id to user space */ + if ((pd->uctx) && (pd->uctx->ah_tbl.va)) { + ahid_addr = pd->uctx->ah_tbl.va + attr->dlid; + *ahid_addr = 0; + *ahid_addr |= ah->id & OCRDMA_AH_ID_MASK; + if (isvlan) + *ahid_addr |= (OCRDMA_AH_VLAN_VALID_MASK << + OCRDMA_AH_VLAN_VALID_SHIFT); + } + + return &ah->ibah; + +av_conf_err: + ocrdma_free_av(dev, ah); +av_err: + kfree(ah); + return ERR_PTR(status); +} + +int ocrdma_destroy_ah(struct ib_ah *ibah) +{ + struct ocrdma_ah *ah = get_ocrdma_ah(ibah); + struct ocrdma_dev *dev = get_ocrdma_dev(ibah->device); + + ocrdma_free_av(dev, ah); + kfree(ah); + return 0; +} + +int ocrdma_query_ah(struct ib_ah *ibah, struct ib_ah_attr *attr) +{ + struct ocrdma_ah *ah = get_ocrdma_ah(ibah); + struct ocrdma_av *av = ah->av; + struct ocrdma_grh *grh; + attr->ah_flags |= IB_AH_GRH; + if (ah->av->valid & OCRDMA_AV_VALID) { + grh = (struct ocrdma_grh *)((u8 *)ah->av + + sizeof(struct ocrdma_eth_vlan)); + attr->sl = be16_to_cpu(av->eth_hdr.vlan_tag) >> 13; + } else { + grh = (struct ocrdma_grh *)((u8 *)ah->av + + sizeof(struct ocrdma_eth_basic)); + attr->sl = 0; + } + memcpy(&attr->grh.dgid.raw[0], &grh->dgid[0], sizeof(grh->dgid)); + attr->grh.sgid_index = ah->sgid_index; + attr->grh.hop_limit = be32_to_cpu(grh->pdid_hoplimit) & 0xff; + attr->grh.traffic_class = be32_to_cpu(grh->tclass_flow) >> 24; + attr->grh.flow_label = be32_to_cpu(grh->tclass_flow) & 0x00ffffffff; + return 0; +} + +int ocrdma_modify_ah(struct ib_ah *ibah, struct ib_ah_attr *attr) +{ + /* modify_ah is unsupported */ + return -ENOSYS; +} + +int ocrdma_process_mad(struct ib_device *ibdev, + int process_mad_flags, + u8 port_num, + struct ib_wc *in_wc, + struct ib_grh *in_grh, + struct ib_mad *in_mad, struct ib_mad *out_mad) +{ + int status; + struct ocrdma_dev *dev; + + switch (in_mad->mad_hdr.mgmt_class) { + case IB_MGMT_CLASS_PERF_MGMT: + dev = get_ocrdma_dev(ibdev); + if (!ocrdma_pma_counters(dev, out_mad)) + status = IB_MAD_RESULT_SUCCESS | IB_MAD_RESULT_REPLY; + else + status = IB_MAD_RESULT_SUCCESS; + break; + default: + status = IB_MAD_RESULT_SUCCESS; + break; + } + return status; +} diff --git a/kernel/drivers/infiniband/hw/ocrdma/ocrdma_ah.h b/kernel/drivers/infiniband/hw/ocrdma/ocrdma_ah.h new file mode 100644 index 000000000..726a87cf2 --- /dev/null +++ b/kernel/drivers/infiniband/hw/ocrdma/ocrdma_ah.h @@ -0,0 +1,48 @@ +/******************************************************************* + * This file is part of the Emulex RoCE Device Driver for * + * RoCE (RDMA over Converged Ethernet) adapters. * + * Copyright (C) 2008-2012 Emulex. All rights reserved. * + * EMULEX and SLI are trademarks of Emulex. * + * www.emulex.com * + * * + * This program is free software; you can redistribute it and/or * + * modify it under the terms of version 2 of the GNU General * + * Public License as published by the Free Software Foundation. * + * This program is distributed in the hope that it will be useful. * + * ALL EXPRESS OR IMPLIED CONDITIONS, REPRESENTATIONS AND * + * WARRANTIES, INCLUDING ANY IMPLIED WARRANTY OF MERCHANTABILITY, * + * FITNESS FOR A PARTICULAR PURPOSE, OR NON-INFRINGEMENT, ARE * + * DISCLAIMED, EXCEPT TO THE EXTENT THAT SUCH DISCLAIMERS ARE HELD * + * TO BE LEGALLY INVALID. 
See the GNU General Public License for *
+ * more details, a copy of which can be found in the file COPYING *
+ * included with this package. *
+ *
+ * Contact Information:
+ * linux-drivers@emulex.com
+ *
+ * Emulex
+ * 3333 Susan Street
+ * Costa Mesa, CA 92626
+ *******************************************************************/
+
+#ifndef __OCRDMA_AH_H__
+#define __OCRDMA_AH_H__
+
+enum {
+	OCRDMA_AH_ID_MASK = 0x3FF,
+	OCRDMA_AH_VLAN_VALID_MASK = 0x01,
+	OCRDMA_AH_VLAN_VALID_SHIFT = 0x1F
+};
+
+struct ib_ah *ocrdma_create_ah(struct ib_pd *, struct ib_ah_attr *);
+int ocrdma_destroy_ah(struct ib_ah *);
+int ocrdma_query_ah(struct ib_ah *, struct ib_ah_attr *);
+int ocrdma_modify_ah(struct ib_ah *, struct ib_ah_attr *);
+
+int ocrdma_process_mad(struct ib_device *,
+		       int process_mad_flags,
+		       u8 port_num,
+		       struct ib_wc *in_wc,
+		       struct ib_grh *in_grh,
+		       struct ib_mad *in_mad, struct ib_mad *out_mad);
+#endif /* __OCRDMA_AH_H__ */
diff --git a/kernel/drivers/infiniband/hw/ocrdma/ocrdma_hw.c b/kernel/drivers/infiniband/hw/ocrdma/ocrdma_hw.c
new file mode 100644
index 000000000..47615ff33
--- /dev/null
+++ b/kernel/drivers/infiniband/hw/ocrdma/ocrdma_hw.c
@@ -0,0 +1,3172 @@
+/*******************************************************************
+ * This file is part of the Emulex RoCE Device Driver for *
+ * RoCE (RDMA over Converged Ethernet) CNA Adapters. *
+ * Copyright (C) 2008-2012 Emulex. All rights reserved. *
+ * EMULEX and SLI are trademarks of Emulex. *
+ * www.emulex.com *
+ * *
+ * This program is free software; you can redistribute it and/or *
+ * modify it under the terms of version 2 of the GNU General *
+ * Public License as published by the Free Software Foundation. *
+ * This program is distributed in the hope that it will be useful. *
+ * ALL EXPRESS OR IMPLIED CONDITIONS, REPRESENTATIONS AND *
+ * WARRANTIES, INCLUDING ANY IMPLIED WARRANTY OF MERCHANTABILITY, *
+ * FITNESS FOR A PARTICULAR PURPOSE, OR NON-INFRINGEMENT, ARE *
+ * DISCLAIMED, EXCEPT TO THE EXTENT THAT SUCH DISCLAIMERS ARE HELD *
+ * TO BE LEGALLY INVALID. See the GNU General Public License for *
+ * more details, a copy of which can be found in the file COPYING *
+ * included with this package.
* + * + * Contact Information: + * linux-drivers@emulex.com + * + * Emulex + * 3333 Susan Street + * Costa Mesa, CA 92626 + *******************************************************************/ + +#include +#include +#include +#include + +#include +#include + +#include "ocrdma.h" +#include "ocrdma_hw.h" +#include "ocrdma_verbs.h" +#include "ocrdma_ah.h" + +enum mbx_status { + OCRDMA_MBX_STATUS_FAILED = 1, + OCRDMA_MBX_STATUS_ILLEGAL_FIELD = 3, + OCRDMA_MBX_STATUS_OOR = 100, + OCRDMA_MBX_STATUS_INVALID_PD = 101, + OCRDMA_MBX_STATUS_PD_INUSE = 102, + OCRDMA_MBX_STATUS_INVALID_CQ = 103, + OCRDMA_MBX_STATUS_INVALID_QP = 104, + OCRDMA_MBX_STATUS_INVALID_LKEY = 105, + OCRDMA_MBX_STATUS_ORD_EXCEEDS = 106, + OCRDMA_MBX_STATUS_IRD_EXCEEDS = 107, + OCRDMA_MBX_STATUS_SENDQ_WQE_EXCEEDS = 108, + OCRDMA_MBX_STATUS_RECVQ_RQE_EXCEEDS = 109, + OCRDMA_MBX_STATUS_SGE_SEND_EXCEEDS = 110, + OCRDMA_MBX_STATUS_SGE_WRITE_EXCEEDS = 111, + OCRDMA_MBX_STATUS_SGE_RECV_EXCEEDS = 112, + OCRDMA_MBX_STATUS_INVALID_STATE_CHANGE = 113, + OCRDMA_MBX_STATUS_MW_BOUND = 114, + OCRDMA_MBX_STATUS_INVALID_VA = 115, + OCRDMA_MBX_STATUS_INVALID_LENGTH = 116, + OCRDMA_MBX_STATUS_INVALID_FBO = 117, + OCRDMA_MBX_STATUS_INVALID_ACC_RIGHTS = 118, + OCRDMA_MBX_STATUS_INVALID_PBE_SIZE = 119, + OCRDMA_MBX_STATUS_INVALID_PBL_ENTRY = 120, + OCRDMA_MBX_STATUS_INVALID_PBL_SHIFT = 121, + OCRDMA_MBX_STATUS_INVALID_SRQ_ID = 129, + OCRDMA_MBX_STATUS_SRQ_ERROR = 133, + OCRDMA_MBX_STATUS_RQE_EXCEEDS = 134, + OCRDMA_MBX_STATUS_MTU_EXCEEDS = 135, + OCRDMA_MBX_STATUS_MAX_QP_EXCEEDS = 136, + OCRDMA_MBX_STATUS_SRQ_LIMIT_EXCEEDS = 137, + OCRDMA_MBX_STATUS_SRQ_SIZE_UNDERUNS = 138, + OCRDMA_MBX_STATUS_QP_BOUND = 130, + OCRDMA_MBX_STATUS_INVALID_CHANGE = 139, + OCRDMA_MBX_STATUS_ATOMIC_OPS_UNSUP = 140, + OCRDMA_MBX_STATUS_INVALID_RNR_NAK_TIMER = 141, + OCRDMA_MBX_STATUS_MW_STILL_BOUND = 142, + OCRDMA_MBX_STATUS_PKEY_INDEX_INVALID = 143, + OCRDMA_MBX_STATUS_PKEY_INDEX_EXCEEDS = 144 +}; + +enum additional_status { + OCRDMA_MBX_ADDI_STATUS_INSUFFICIENT_RESOURCES = 22 +}; + +enum cqe_status { + OCRDMA_MBX_CQE_STATUS_INSUFFICIENT_PRIVILEDGES = 1, + OCRDMA_MBX_CQE_STATUS_INVALID_PARAMETER = 2, + OCRDMA_MBX_CQE_STATUS_INSUFFICIENT_RESOURCES = 3, + OCRDMA_MBX_CQE_STATUS_QUEUE_FLUSHING = 4, + OCRDMA_MBX_CQE_STATUS_DMA_FAILED = 5 +}; + +static inline void *ocrdma_get_eqe(struct ocrdma_eq *eq) +{ + return eq->q.va + (eq->q.tail * sizeof(struct ocrdma_eqe)); +} + +static inline void ocrdma_eq_inc_tail(struct ocrdma_eq *eq) +{ + eq->q.tail = (eq->q.tail + 1) & (OCRDMA_EQ_LEN - 1); +} + +static inline void *ocrdma_get_mcqe(struct ocrdma_dev *dev) +{ + struct ocrdma_mcqe *cqe = (struct ocrdma_mcqe *) + (dev->mq.cq.va + (dev->mq.cq.tail * sizeof(struct ocrdma_mcqe))); + + if (!(le32_to_cpu(cqe->valid_ae_cmpl_cons) & OCRDMA_MCQE_VALID_MASK)) + return NULL; + return cqe; +} + +static inline void ocrdma_mcq_inc_tail(struct ocrdma_dev *dev) +{ + dev->mq.cq.tail = (dev->mq.cq.tail + 1) & (OCRDMA_MQ_CQ_LEN - 1); +} + +static inline struct ocrdma_mqe *ocrdma_get_mqe(struct ocrdma_dev *dev) +{ + return dev->mq.sq.va + (dev->mq.sq.head * sizeof(struct ocrdma_mqe)); +} + +static inline void ocrdma_mq_inc_head(struct ocrdma_dev *dev) +{ + dev->mq.sq.head = (dev->mq.sq.head + 1) & (OCRDMA_MQ_LEN - 1); +} + +static inline void *ocrdma_get_mqe_rsp(struct ocrdma_dev *dev) +{ + return dev->mq.sq.va + (dev->mqe_ctx.tag * sizeof(struct ocrdma_mqe)); +} + +enum ib_qp_state get_ibqp_state(enum ocrdma_qp_state qps) +{ + switch (qps) { + case OCRDMA_QPS_RST: + return IB_QPS_RESET; + case 
OCRDMA_QPS_INIT: + return IB_QPS_INIT; + case OCRDMA_QPS_RTR: + return IB_QPS_RTR; + case OCRDMA_QPS_RTS: + return IB_QPS_RTS; + case OCRDMA_QPS_SQD: + case OCRDMA_QPS_SQ_DRAINING: + return IB_QPS_SQD; + case OCRDMA_QPS_SQE: + return IB_QPS_SQE; + case OCRDMA_QPS_ERR: + return IB_QPS_ERR; + } + return IB_QPS_ERR; +} + +static enum ocrdma_qp_state get_ocrdma_qp_state(enum ib_qp_state qps) +{ + switch (qps) { + case IB_QPS_RESET: + return OCRDMA_QPS_RST; + case IB_QPS_INIT: + return OCRDMA_QPS_INIT; + case IB_QPS_RTR: + return OCRDMA_QPS_RTR; + case IB_QPS_RTS: + return OCRDMA_QPS_RTS; + case IB_QPS_SQD: + return OCRDMA_QPS_SQD; + case IB_QPS_SQE: + return OCRDMA_QPS_SQE; + case IB_QPS_ERR: + return OCRDMA_QPS_ERR; + } + return OCRDMA_QPS_ERR; +} + +static int ocrdma_get_mbx_errno(u32 status) +{ + int err_num; + u8 mbox_status = (status & OCRDMA_MBX_RSP_STATUS_MASK) >> + OCRDMA_MBX_RSP_STATUS_SHIFT; + u8 add_status = (status & OCRDMA_MBX_RSP_ASTATUS_MASK) >> + OCRDMA_MBX_RSP_ASTATUS_SHIFT; + + switch (mbox_status) { + case OCRDMA_MBX_STATUS_OOR: + case OCRDMA_MBX_STATUS_MAX_QP_EXCEEDS: + err_num = -EAGAIN; + break; + + case OCRDMA_MBX_STATUS_INVALID_PD: + case OCRDMA_MBX_STATUS_INVALID_CQ: + case OCRDMA_MBX_STATUS_INVALID_SRQ_ID: + case OCRDMA_MBX_STATUS_INVALID_QP: + case OCRDMA_MBX_STATUS_INVALID_CHANGE: + case OCRDMA_MBX_STATUS_MTU_EXCEEDS: + case OCRDMA_MBX_STATUS_INVALID_RNR_NAK_TIMER: + case OCRDMA_MBX_STATUS_PKEY_INDEX_INVALID: + case OCRDMA_MBX_STATUS_PKEY_INDEX_EXCEEDS: + case OCRDMA_MBX_STATUS_ILLEGAL_FIELD: + case OCRDMA_MBX_STATUS_INVALID_PBL_ENTRY: + case OCRDMA_MBX_STATUS_INVALID_LKEY: + case OCRDMA_MBX_STATUS_INVALID_VA: + case OCRDMA_MBX_STATUS_INVALID_LENGTH: + case OCRDMA_MBX_STATUS_INVALID_FBO: + case OCRDMA_MBX_STATUS_INVALID_ACC_RIGHTS: + case OCRDMA_MBX_STATUS_INVALID_PBE_SIZE: + case OCRDMA_MBX_STATUS_ATOMIC_OPS_UNSUP: + case OCRDMA_MBX_STATUS_SRQ_ERROR: + case OCRDMA_MBX_STATUS_SRQ_SIZE_UNDERUNS: + err_num = -EINVAL; + break; + + case OCRDMA_MBX_STATUS_PD_INUSE: + case OCRDMA_MBX_STATUS_QP_BOUND: + case OCRDMA_MBX_STATUS_MW_STILL_BOUND: + case OCRDMA_MBX_STATUS_MW_BOUND: + err_num = -EBUSY; + break; + + case OCRDMA_MBX_STATUS_RECVQ_RQE_EXCEEDS: + case OCRDMA_MBX_STATUS_SGE_RECV_EXCEEDS: + case OCRDMA_MBX_STATUS_RQE_EXCEEDS: + case OCRDMA_MBX_STATUS_SRQ_LIMIT_EXCEEDS: + case OCRDMA_MBX_STATUS_ORD_EXCEEDS: + case OCRDMA_MBX_STATUS_IRD_EXCEEDS: + case OCRDMA_MBX_STATUS_SENDQ_WQE_EXCEEDS: + case OCRDMA_MBX_STATUS_SGE_SEND_EXCEEDS: + case OCRDMA_MBX_STATUS_SGE_WRITE_EXCEEDS: + err_num = -ENOBUFS; + break; + + case OCRDMA_MBX_STATUS_FAILED: + switch (add_status) { + case OCRDMA_MBX_ADDI_STATUS_INSUFFICIENT_RESOURCES: + err_num = -EAGAIN; + break; + } + default: + err_num = -EFAULT; + } + return err_num; +} + +char *port_speed_string(struct ocrdma_dev *dev) +{ + char *str = ""; + u16 speeds_supported; + + speeds_supported = dev->phy.fixed_speeds_supported | + dev->phy.auto_speeds_supported; + if (speeds_supported & OCRDMA_PHY_SPEED_40GBPS) + str = "40Gbps "; + else if (speeds_supported & OCRDMA_PHY_SPEED_10GBPS) + str = "10Gbps "; + else if (speeds_supported & OCRDMA_PHY_SPEED_1GBPS) + str = "1Gbps "; + + return str; +} + +static int ocrdma_get_mbx_cqe_errno(u16 cqe_status) +{ + int err_num = -EINVAL; + + switch (cqe_status) { + case OCRDMA_MBX_CQE_STATUS_INSUFFICIENT_PRIVILEDGES: + err_num = -EPERM; + break; + case OCRDMA_MBX_CQE_STATUS_INVALID_PARAMETER: + err_num = -EINVAL; + break; + case OCRDMA_MBX_CQE_STATUS_INSUFFICIENT_RESOURCES: + case 
OCRDMA_MBX_CQE_STATUS_QUEUE_FLUSHING: + err_num = -EINVAL; + break; + case OCRDMA_MBX_CQE_STATUS_DMA_FAILED: + default: + err_num = -EINVAL; + break; + } + return err_num; +} + +void ocrdma_ring_cq_db(struct ocrdma_dev *dev, u16 cq_id, bool armed, + bool solicited, u16 cqe_popped) +{ + u32 val = cq_id & OCRDMA_DB_CQ_RING_ID_MASK; + + val |= ((cq_id & OCRDMA_DB_CQ_RING_ID_EXT_MASK) << + OCRDMA_DB_CQ_RING_ID_EXT_MASK_SHIFT); + + if (armed) + val |= (1 << OCRDMA_DB_CQ_REARM_SHIFT); + if (solicited) + val |= (1 << OCRDMA_DB_CQ_SOLICIT_SHIFT); + val |= (cqe_popped << OCRDMA_DB_CQ_NUM_POPPED_SHIFT); + iowrite32(val, dev->nic_info.db + OCRDMA_DB_CQ_OFFSET); +} + +static void ocrdma_ring_mq_db(struct ocrdma_dev *dev) +{ + u32 val = 0; + + val |= dev->mq.sq.id & OCRDMA_MQ_ID_MASK; + val |= 1 << OCRDMA_MQ_NUM_MQE_SHIFT; + iowrite32(val, dev->nic_info.db + OCRDMA_DB_MQ_OFFSET); +} + +static void ocrdma_ring_eq_db(struct ocrdma_dev *dev, u16 eq_id, + bool arm, bool clear_int, u16 num_eqe) +{ + u32 val = 0; + + val |= eq_id & OCRDMA_EQ_ID_MASK; + val |= ((eq_id & OCRDMA_EQ_ID_EXT_MASK) << OCRDMA_EQ_ID_EXT_MASK_SHIFT); + if (arm) + val |= (1 << OCRDMA_REARM_SHIFT); + if (clear_int) + val |= (1 << OCRDMA_EQ_CLR_SHIFT); + val |= (1 << OCRDMA_EQ_TYPE_SHIFT); + val |= (num_eqe << OCRDMA_NUM_EQE_SHIFT); + iowrite32(val, dev->nic_info.db + OCRDMA_DB_EQ_OFFSET); +} + +static void ocrdma_init_mch(struct ocrdma_mbx_hdr *cmd_hdr, + u8 opcode, u8 subsys, u32 cmd_len) +{ + cmd_hdr->subsys_op = (opcode | (subsys << OCRDMA_MCH_SUBSYS_SHIFT)); + cmd_hdr->timeout = 20; /* seconds */ + cmd_hdr->cmd_len = cmd_len - sizeof(struct ocrdma_mbx_hdr); +} + +static void *ocrdma_init_emb_mqe(u8 opcode, u32 cmd_len) +{ + struct ocrdma_mqe *mqe; + + mqe = kzalloc(sizeof(struct ocrdma_mqe), GFP_KERNEL); + if (!mqe) + return NULL; + mqe->hdr.spcl_sge_cnt_emb |= + (OCRDMA_MQE_EMBEDDED << OCRDMA_MQE_HDR_EMB_SHIFT) & + OCRDMA_MQE_HDR_EMB_MASK; + mqe->hdr.pyld_len = cmd_len - sizeof(struct ocrdma_mqe_hdr); + + ocrdma_init_mch(&mqe->u.emb_req.mch, opcode, OCRDMA_SUBSYS_ROCE, + mqe->hdr.pyld_len); + return mqe; +} + +static void ocrdma_free_q(struct ocrdma_dev *dev, struct ocrdma_queue_info *q) +{ + dma_free_coherent(&dev->nic_info.pdev->dev, q->size, q->va, q->dma); +} + +static int ocrdma_alloc_q(struct ocrdma_dev *dev, + struct ocrdma_queue_info *q, u16 len, u16 entry_size) +{ + memset(q, 0, sizeof(*q)); + q->len = len; + q->entry_size = entry_size; + q->size = len * entry_size; + q->va = dma_alloc_coherent(&dev->nic_info.pdev->dev, q->size, + &q->dma, GFP_KERNEL); + if (!q->va) + return -ENOMEM; + memset(q->va, 0, q->size); + return 0; +} + +static void ocrdma_build_q_pages(struct ocrdma_pa *q_pa, int cnt, + dma_addr_t host_pa, int hw_page_size) +{ + int i; + + for (i = 0; i < cnt; i++) { + q_pa[i].lo = (u32) (host_pa & 0xffffffff); + q_pa[i].hi = (u32) upper_32_bits(host_pa); + host_pa += hw_page_size; + } +} + +static int ocrdma_mbx_delete_q(struct ocrdma_dev *dev, + struct ocrdma_queue_info *q, int queue_type) +{ + u8 opcode = 0; + int status; + struct ocrdma_delete_q_req *cmd = dev->mbx_cmd; + + switch (queue_type) { + case QTYPE_MCCQ: + opcode = OCRDMA_CMD_DELETE_MQ; + break; + case QTYPE_CQ: + opcode = OCRDMA_CMD_DELETE_CQ; + break; + case QTYPE_EQ: + opcode = OCRDMA_CMD_DELETE_EQ; + break; + default: + BUG(); + } + memset(cmd, 0, sizeof(*cmd)); + ocrdma_init_mch(&cmd->req, opcode, OCRDMA_SUBSYS_COMMON, sizeof(*cmd)); + cmd->id = q->id; + + status = be_roce_mcc_cmd(dev->nic_info.netdev, + cmd, sizeof(*cmd), NULL, NULL); + if 
(!status) + q->created = false; + return status; +} + +static int ocrdma_mbx_create_eq(struct ocrdma_dev *dev, struct ocrdma_eq *eq) +{ + int status; + struct ocrdma_create_eq_req *cmd = dev->mbx_cmd; + struct ocrdma_create_eq_rsp *rsp = dev->mbx_cmd; + + memset(cmd, 0, sizeof(*cmd)); + ocrdma_init_mch(&cmd->req, OCRDMA_CMD_CREATE_EQ, OCRDMA_SUBSYS_COMMON, + sizeof(*cmd)); + + cmd->req.rsvd_version = 2; + cmd->num_pages = 4; + cmd->valid = OCRDMA_CREATE_EQ_VALID; + cmd->cnt = 4 << OCRDMA_CREATE_EQ_CNT_SHIFT; + + ocrdma_build_q_pages(&cmd->pa[0], cmd->num_pages, eq->q.dma, + PAGE_SIZE_4K); + status = be_roce_mcc_cmd(dev->nic_info.netdev, cmd, sizeof(*cmd), NULL, + NULL); + if (!status) { + eq->q.id = rsp->vector_eqid & 0xffff; + eq->vector = (rsp->vector_eqid >> 16) & 0xffff; + eq->q.created = true; + } + return status; +} + +static int ocrdma_create_eq(struct ocrdma_dev *dev, + struct ocrdma_eq *eq, u16 q_len) +{ + int status; + + status = ocrdma_alloc_q(dev, &eq->q, OCRDMA_EQ_LEN, + sizeof(struct ocrdma_eqe)); + if (status) + return status; + + status = ocrdma_mbx_create_eq(dev, eq); + if (status) + goto mbx_err; + eq->dev = dev; + ocrdma_ring_eq_db(dev, eq->q.id, true, true, 0); + + return 0; +mbx_err: + ocrdma_free_q(dev, &eq->q); + return status; +} + +int ocrdma_get_irq(struct ocrdma_dev *dev, struct ocrdma_eq *eq) +{ + int irq; + + if (dev->nic_info.intr_mode == BE_INTERRUPT_MODE_INTX) + irq = dev->nic_info.pdev->irq; + else + irq = dev->nic_info.msix.vector_list[eq->vector]; + return irq; +} + +static void _ocrdma_destroy_eq(struct ocrdma_dev *dev, struct ocrdma_eq *eq) +{ + if (eq->q.created) { + ocrdma_mbx_delete_q(dev, &eq->q, QTYPE_EQ); + ocrdma_free_q(dev, &eq->q); + } +} + +static void ocrdma_destroy_eq(struct ocrdma_dev *dev, struct ocrdma_eq *eq) +{ + int irq; + + /* disarm EQ so that interrupts are not generated + * during freeing and EQ delete is in progress. 
+ */ + ocrdma_ring_eq_db(dev, eq->q.id, false, false, 0); + + irq = ocrdma_get_irq(dev, eq); + free_irq(irq, eq); + _ocrdma_destroy_eq(dev, eq); +} + +static void ocrdma_destroy_eqs(struct ocrdma_dev *dev) +{ + int i; + + for (i = 0; i < dev->eq_cnt; i++) + ocrdma_destroy_eq(dev, &dev->eq_tbl[i]); +} + +static int ocrdma_mbx_mq_cq_create(struct ocrdma_dev *dev, + struct ocrdma_queue_info *cq, + struct ocrdma_queue_info *eq) +{ + struct ocrdma_create_cq_cmd *cmd = dev->mbx_cmd; + struct ocrdma_create_cq_cmd_rsp *rsp = dev->mbx_cmd; + int status; + + memset(cmd, 0, sizeof(*cmd)); + ocrdma_init_mch(&cmd->req, OCRDMA_CMD_CREATE_CQ, + OCRDMA_SUBSYS_COMMON, sizeof(*cmd)); + + cmd->req.rsvd_version = OCRDMA_CREATE_CQ_VER2; + cmd->pgsz_pgcnt = (cq->size / OCRDMA_MIN_Q_PAGE_SIZE) << + OCRDMA_CREATE_CQ_PAGE_SIZE_SHIFT; + cmd->pgsz_pgcnt |= PAGES_4K_SPANNED(cq->va, cq->size); + + cmd->ev_cnt_flags = OCRDMA_CREATE_CQ_DEF_FLAGS; + cmd->eqn = eq->id; + cmd->pdid_cqecnt = cq->size / sizeof(struct ocrdma_mcqe); + + ocrdma_build_q_pages(&cmd->pa[0], cq->size / OCRDMA_MIN_Q_PAGE_SIZE, + cq->dma, PAGE_SIZE_4K); + status = be_roce_mcc_cmd(dev->nic_info.netdev, + cmd, sizeof(*cmd), NULL, NULL); + if (!status) { + cq->id = (u16) (rsp->cq_id & OCRDMA_CREATE_CQ_RSP_CQ_ID_MASK); + cq->created = true; + } + return status; +} + +static u32 ocrdma_encoded_q_len(int q_len) +{ + u32 len_encoded = fls(q_len); /* log2(len) + 1 */ + + if (len_encoded == 16) + len_encoded = 0; + return len_encoded; +} + +static int ocrdma_mbx_create_mq(struct ocrdma_dev *dev, + struct ocrdma_queue_info *mq, + struct ocrdma_queue_info *cq) +{ + int num_pages, status; + struct ocrdma_create_mq_req *cmd = dev->mbx_cmd; + struct ocrdma_create_mq_rsp *rsp = dev->mbx_cmd; + struct ocrdma_pa *pa; + + memset(cmd, 0, sizeof(*cmd)); + num_pages = PAGES_4K_SPANNED(mq->va, mq->size); + + ocrdma_init_mch(&cmd->req, OCRDMA_CMD_CREATE_MQ_EXT, + OCRDMA_SUBSYS_COMMON, sizeof(*cmd)); + cmd->req.rsvd_version = 1; + cmd->cqid_pages = num_pages; + cmd->cqid_pages |= (cq->id << OCRDMA_CREATE_MQ_CQ_ID_SHIFT); + cmd->async_cqid_valid = OCRDMA_CREATE_MQ_ASYNC_CQ_VALID; + + cmd->async_event_bitmap = BIT(OCRDMA_ASYNC_GRP5_EVE_CODE); + cmd->async_event_bitmap |= BIT(OCRDMA_ASYNC_RDMA_EVE_CODE); + + cmd->async_cqid_ringsize = cq->id; + cmd->async_cqid_ringsize |= (ocrdma_encoded_q_len(mq->len) << + OCRDMA_CREATE_MQ_RING_SIZE_SHIFT); + cmd->valid = OCRDMA_CREATE_MQ_VALID; + pa = &cmd->pa[0]; + + ocrdma_build_q_pages(pa, num_pages, mq->dma, PAGE_SIZE_4K); + status = be_roce_mcc_cmd(dev->nic_info.netdev, + cmd, sizeof(*cmd), NULL, NULL); + if (!status) { + mq->id = rsp->id; + mq->created = true; + } + return status; +} + +static int ocrdma_create_mq(struct ocrdma_dev *dev) +{ + int status; + + /* Alloc completion queue for Mailbox queue */ + status = ocrdma_alloc_q(dev, &dev->mq.cq, OCRDMA_MQ_CQ_LEN, + sizeof(struct ocrdma_mcqe)); + if (status) + goto alloc_err; + + dev->eq_tbl[0].cq_cnt++; + status = ocrdma_mbx_mq_cq_create(dev, &dev->mq.cq, &dev->eq_tbl[0].q); + if (status) + goto mbx_cq_free; + + memset(&dev->mqe_ctx, 0, sizeof(dev->mqe_ctx)); + init_waitqueue_head(&dev->mqe_ctx.cmd_wait); + mutex_init(&dev->mqe_ctx.lock); + + /* Alloc Mailbox queue */ + status = ocrdma_alloc_q(dev, &dev->mq.sq, OCRDMA_MQ_LEN, + sizeof(struct ocrdma_mqe)); + if (status) + goto mbx_cq_destroy; + status = ocrdma_mbx_create_mq(dev, &dev->mq.sq, &dev->mq.cq); + if (status) + goto mbx_q_free; + ocrdma_ring_cq_db(dev, dev->mq.cq.id, true, false, 0); + return 0; + +mbx_q_free: + 
ocrdma_free_q(dev, &dev->mq.sq); +mbx_cq_destroy: + ocrdma_mbx_delete_q(dev, &dev->mq.cq, QTYPE_CQ); +mbx_cq_free: + ocrdma_free_q(dev, &dev->mq.cq); +alloc_err: + return status; +} + +static void ocrdma_destroy_mq(struct ocrdma_dev *dev) +{ + struct ocrdma_queue_info *mbxq, *cq; + + /* mqe_ctx lock synchronizes with any other pending cmds. */ + mutex_lock(&dev->mqe_ctx.lock); + mbxq = &dev->mq.sq; + if (mbxq->created) { + ocrdma_mbx_delete_q(dev, mbxq, QTYPE_MCCQ); + ocrdma_free_q(dev, mbxq); + } + mutex_unlock(&dev->mqe_ctx.lock); + + cq = &dev->mq.cq; + if (cq->created) { + ocrdma_mbx_delete_q(dev, cq, QTYPE_CQ); + ocrdma_free_q(dev, cq); + } +} + +static void ocrdma_process_qpcat_error(struct ocrdma_dev *dev, + struct ocrdma_qp *qp) +{ + enum ib_qp_state new_ib_qps = IB_QPS_ERR; + enum ib_qp_state old_ib_qps; + + if (qp == NULL) + BUG(); + ocrdma_qp_state_change(qp, new_ib_qps, &old_ib_qps); +} + +static void ocrdma_dispatch_ibevent(struct ocrdma_dev *dev, + struct ocrdma_ae_mcqe *cqe) +{ + struct ocrdma_qp *qp = NULL; + struct ocrdma_cq *cq = NULL; + struct ib_event ib_evt; + int cq_event = 0; + int qp_event = 1; + int srq_event = 0; + int dev_event = 0; + int type = (cqe->valid_ae_event & OCRDMA_AE_MCQE_EVENT_TYPE_MASK) >> + OCRDMA_AE_MCQE_EVENT_TYPE_SHIFT; + + if (cqe->qpvalid_qpid & OCRDMA_AE_MCQE_QPVALID) + qp = dev->qp_tbl[cqe->qpvalid_qpid & OCRDMA_AE_MCQE_QPID_MASK]; + if (cqe->cqvalid_cqid & OCRDMA_AE_MCQE_CQVALID) + cq = dev->cq_tbl[cqe->cqvalid_cqid & OCRDMA_AE_MCQE_CQID_MASK]; + + memset(&ib_evt, 0, sizeof(ib_evt)); + + ib_evt.device = &dev->ibdev; + + switch (type) { + case OCRDMA_CQ_ERROR: + ib_evt.element.cq = &cq->ibcq; + ib_evt.event = IB_EVENT_CQ_ERR; + cq_event = 1; + qp_event = 0; + break; + case OCRDMA_CQ_OVERRUN_ERROR: + ib_evt.element.cq = &cq->ibcq; + ib_evt.event = IB_EVENT_CQ_ERR; + cq_event = 1; + qp_event = 0; + break; + case OCRDMA_CQ_QPCAT_ERROR: + ib_evt.element.qp = &qp->ibqp; + ib_evt.event = IB_EVENT_QP_FATAL; + ocrdma_process_qpcat_error(dev, qp); + break; + case OCRDMA_QP_ACCESS_ERROR: + ib_evt.element.qp = &qp->ibqp; + ib_evt.event = IB_EVENT_QP_ACCESS_ERR; + break; + case OCRDMA_QP_COMM_EST_EVENT: + ib_evt.element.qp = &qp->ibqp; + ib_evt.event = IB_EVENT_COMM_EST; + break; + case OCRDMA_SQ_DRAINED_EVENT: + ib_evt.element.qp = &qp->ibqp; + ib_evt.event = IB_EVENT_SQ_DRAINED; + break; + case OCRDMA_DEVICE_FATAL_EVENT: + ib_evt.element.port_num = 1; + ib_evt.event = IB_EVENT_DEVICE_FATAL; + qp_event = 0; + dev_event = 1; + break; + case OCRDMA_SRQCAT_ERROR: + ib_evt.element.srq = &qp->srq->ibsrq; + ib_evt.event = IB_EVENT_SRQ_ERR; + srq_event = 1; + qp_event = 0; + break; + case OCRDMA_SRQ_LIMIT_EVENT: + ib_evt.element.srq = &qp->srq->ibsrq; + ib_evt.event = IB_EVENT_SRQ_LIMIT_REACHED; + srq_event = 1; + qp_event = 0; + break; + case OCRDMA_QP_LAST_WQE_EVENT: + ib_evt.element.qp = &qp->ibqp; + ib_evt.event = IB_EVENT_QP_LAST_WQE_REACHED; + break; + default: + cq_event = 0; + qp_event = 0; + srq_event = 0; + dev_event = 0; + pr_err("%s() unknown type=0x%x\n", __func__, type); + break; + } + + if (type < OCRDMA_MAX_ASYNC_ERRORS) + atomic_inc(&dev->async_err_stats[type]); + + if (qp_event) { + if (qp->ibqp.event_handler) + qp->ibqp.event_handler(&ib_evt, qp->ibqp.qp_context); + } else if (cq_event) { + if (cq->ibcq.event_handler) + cq->ibcq.event_handler(&ib_evt, cq->ibcq.cq_context); + } else if (srq_event) { + if (qp->srq->ibsrq.event_handler) + qp->srq->ibsrq.event_handler(&ib_evt, + qp->srq->ibsrq. 
+ srq_context); + } else if (dev_event) { + pr_err("%s: Fatal event received\n", dev->ibdev.name); + ib_dispatch_event(&ib_evt); + } + +} + +static void ocrdma_process_grp5_aync(struct ocrdma_dev *dev, + struct ocrdma_ae_mcqe *cqe) +{ + struct ocrdma_ae_pvid_mcqe *evt; + int type = (cqe->valid_ae_event & OCRDMA_AE_MCQE_EVENT_TYPE_MASK) >> + OCRDMA_AE_MCQE_EVENT_TYPE_SHIFT; + + switch (type) { + case OCRDMA_ASYNC_EVENT_PVID_STATE: + evt = (struct ocrdma_ae_pvid_mcqe *)cqe; + if ((evt->tag_enabled & OCRDMA_AE_PVID_MCQE_ENABLED_MASK) >> + OCRDMA_AE_PVID_MCQE_ENABLED_SHIFT) + dev->pvid = ((evt->tag_enabled & + OCRDMA_AE_PVID_MCQE_TAG_MASK) >> + OCRDMA_AE_PVID_MCQE_TAG_SHIFT); + break; + + case OCRDMA_ASYNC_EVENT_COS_VALUE: + atomic_set(&dev->update_sl, 1); + break; + default: + /* Not interested evts. */ + break; + } +} + +static void ocrdma_process_acqe(struct ocrdma_dev *dev, void *ae_cqe) +{ + /* async CQE processing */ + struct ocrdma_ae_mcqe *cqe = ae_cqe; + u32 evt_code = (cqe->valid_ae_event & OCRDMA_AE_MCQE_EVENT_CODE_MASK) >> + OCRDMA_AE_MCQE_EVENT_CODE_SHIFT; + + if (evt_code == OCRDMA_ASYNC_RDMA_EVE_CODE) + ocrdma_dispatch_ibevent(dev, cqe); + else if (evt_code == OCRDMA_ASYNC_GRP5_EVE_CODE) + ocrdma_process_grp5_aync(dev, cqe); + else + pr_err("%s(%d) invalid evt code=0x%x\n", __func__, + dev->id, evt_code); +} + +static void ocrdma_process_mcqe(struct ocrdma_dev *dev, struct ocrdma_mcqe *cqe) +{ + if (dev->mqe_ctx.tag == cqe->tag_lo && dev->mqe_ctx.cmd_done == false) { + dev->mqe_ctx.cqe_status = (cqe->status & + OCRDMA_MCQE_STATUS_MASK) >> OCRDMA_MCQE_STATUS_SHIFT; + dev->mqe_ctx.ext_status = + (cqe->status & OCRDMA_MCQE_ESTATUS_MASK) + >> OCRDMA_MCQE_ESTATUS_SHIFT; + dev->mqe_ctx.cmd_done = true; + wake_up(&dev->mqe_ctx.cmd_wait); + } else + pr_err("%s() cqe for invalid tag0x%x.expected=0x%x\n", + __func__, cqe->tag_lo, dev->mqe_ctx.tag); +} + +static int ocrdma_mq_cq_handler(struct ocrdma_dev *dev, u16 cq_id) +{ + u16 cqe_popped = 0; + struct ocrdma_mcqe *cqe; + + while (1) { + cqe = ocrdma_get_mcqe(dev); + if (cqe == NULL) + break; + ocrdma_le32_to_cpu(cqe, sizeof(*cqe)); + cqe_popped += 1; + if (cqe->valid_ae_cmpl_cons & OCRDMA_MCQE_AE_MASK) + ocrdma_process_acqe(dev, cqe); + else if (cqe->valid_ae_cmpl_cons & OCRDMA_MCQE_CMPL_MASK) + ocrdma_process_mcqe(dev, cqe); + memset(cqe, 0, sizeof(struct ocrdma_mcqe)); + ocrdma_mcq_inc_tail(dev); + } + ocrdma_ring_cq_db(dev, dev->mq.cq.id, true, false, cqe_popped); + return 0; +} + +static struct ocrdma_cq *_ocrdma_qp_buddy_cq_handler(struct ocrdma_dev *dev, + struct ocrdma_cq *cq, bool sq) +{ + struct ocrdma_qp *qp; + struct list_head *cur; + struct ocrdma_cq *bcq = NULL; + struct list_head *head = sq?(&cq->sq_head):(&cq->rq_head); + + list_for_each(cur, head) { + if (sq) + qp = list_entry(cur, struct ocrdma_qp, sq_entry); + else + qp = list_entry(cur, struct ocrdma_qp, rq_entry); + + if (qp->srq) + continue; + /* if wq and rq share the same cq, than comp_handler + * is already invoked. + */ + if (qp->sq_cq == qp->rq_cq) + continue; + /* if completion came on sq, rq's cq is buddy cq. + * if completion came on rq, sq's cq is buddy cq. 
+ */ + if (qp->sq_cq == cq) + bcq = qp->rq_cq; + else + bcq = qp->sq_cq; + return bcq; + } + return NULL; +} + +static void ocrdma_qp_buddy_cq_handler(struct ocrdma_dev *dev, + struct ocrdma_cq *cq) +{ + unsigned long flags; + struct ocrdma_cq *bcq = NULL; + + /* Go through list of QPs in error state which are using this CQ + * and invoke its callback handler to trigger CQE processing for + * error/flushed CQE. It is rare to find more than few entries in + * this list as most consumers stops after getting error CQE. + * List is traversed only once when a matching buddy cq found for a QP. + */ + spin_lock_irqsave(&dev->flush_q_lock, flags); + /* Check if buddy CQ is present. + * true - Check for SQ CQ + * false - Check for RQ CQ + */ + bcq = _ocrdma_qp_buddy_cq_handler(dev, cq, true); + if (bcq == NULL) + bcq = _ocrdma_qp_buddy_cq_handler(dev, cq, false); + spin_unlock_irqrestore(&dev->flush_q_lock, flags); + + /* if there is valid buddy cq, look for its completion handler */ + if (bcq && bcq->ibcq.comp_handler) { + spin_lock_irqsave(&bcq->comp_handler_lock, flags); + (*bcq->ibcq.comp_handler) (&bcq->ibcq, bcq->ibcq.cq_context); + spin_unlock_irqrestore(&bcq->comp_handler_lock, flags); + } +} + +static void ocrdma_qp_cq_handler(struct ocrdma_dev *dev, u16 cq_idx) +{ + unsigned long flags; + struct ocrdma_cq *cq; + + if (cq_idx >= OCRDMA_MAX_CQ) + BUG(); + + cq = dev->cq_tbl[cq_idx]; + if (cq == NULL) + return; + + if (cq->ibcq.comp_handler) { + spin_lock_irqsave(&cq->comp_handler_lock, flags); + (*cq->ibcq.comp_handler) (&cq->ibcq, cq->ibcq.cq_context); + spin_unlock_irqrestore(&cq->comp_handler_lock, flags); + } + ocrdma_qp_buddy_cq_handler(dev, cq); +} + +static void ocrdma_cq_handler(struct ocrdma_dev *dev, u16 cq_id) +{ + /* process the MQ-CQE. */ + if (cq_id == dev->mq.cq.id) + ocrdma_mq_cq_handler(dev, cq_id); + else + ocrdma_qp_cq_handler(dev, cq_id); +} + +static irqreturn_t ocrdma_irq_handler(int irq, void *handle) +{ + struct ocrdma_eq *eq = handle; + struct ocrdma_dev *dev = eq->dev; + struct ocrdma_eqe eqe; + struct ocrdma_eqe *ptr; + u16 cq_id; + u8 mcode; + int budget = eq->cq_cnt; + + do { + ptr = ocrdma_get_eqe(eq); + eqe = *ptr; + ocrdma_le32_to_cpu(&eqe, sizeof(eqe)); + mcode = (eqe.id_valid & OCRDMA_EQE_MAJOR_CODE_MASK) + >> OCRDMA_EQE_MAJOR_CODE_SHIFT; + if (mcode == OCRDMA_MAJOR_CODE_SENTINAL) + pr_err("EQ full on eqid = 0x%x, eqe = 0x%x\n", + eq->q.id, eqe.id_valid); + if ((eqe.id_valid & OCRDMA_EQE_VALID_MASK) == 0) + break; + + ptr->id_valid = 0; + /* ring eq doorbell as soon as its consumed. */ + ocrdma_ring_eq_db(dev, eq->q.id, false, true, 1); + /* check whether its CQE or not. */ + if ((eqe.id_valid & OCRDMA_EQE_FOR_CQE_MASK) == 0) { + cq_id = eqe.id_valid >> OCRDMA_EQE_RESOURCE_ID_SHIFT; + ocrdma_cq_handler(dev, cq_id); + } + ocrdma_eq_inc_tail(eq); + + /* There can be a stale EQE after the last bound CQ is + * destroyed. EQE valid and budget == 0 implies this. 
+ */ + if (budget) + budget--; + + } while (budget); + + eq->aic_obj.eq_intr_cnt++; + ocrdma_ring_eq_db(dev, eq->q.id, true, true, 0); + return IRQ_HANDLED; +} + +static void ocrdma_post_mqe(struct ocrdma_dev *dev, struct ocrdma_mqe *cmd) +{ + struct ocrdma_mqe *mqe; + + dev->mqe_ctx.tag = dev->mq.sq.head; + dev->mqe_ctx.cmd_done = false; + mqe = ocrdma_get_mqe(dev); + cmd->hdr.tag_lo = dev->mq.sq.head; + ocrdma_copy_cpu_to_le32(mqe, cmd, sizeof(*mqe)); + /* make sure descriptor is written before ringing doorbell */ + wmb(); + ocrdma_mq_inc_head(dev); + ocrdma_ring_mq_db(dev); +} + +static int ocrdma_wait_mqe_cmpl(struct ocrdma_dev *dev) +{ + long status; + /* 30 sec timeout */ + status = wait_event_timeout(dev->mqe_ctx.cmd_wait, + (dev->mqe_ctx.cmd_done != false), + msecs_to_jiffies(30000)); + if (status) + return 0; + else { + dev->mqe_ctx.fw_error_state = true; + pr_err("%s(%d) mailbox timeout: fw not responding\n", + __func__, dev->id); + return -1; + } +} + +/* issue a mailbox command on the MQ */ +static int ocrdma_mbx_cmd(struct ocrdma_dev *dev, struct ocrdma_mqe *mqe) +{ + int status = 0; + u16 cqe_status, ext_status; + struct ocrdma_mqe *rsp_mqe; + struct ocrdma_mbx_rsp *rsp = NULL; + + mutex_lock(&dev->mqe_ctx.lock); + if (dev->mqe_ctx.fw_error_state) + goto mbx_err; + ocrdma_post_mqe(dev, mqe); + status = ocrdma_wait_mqe_cmpl(dev); + if (status) + goto mbx_err; + cqe_status = dev->mqe_ctx.cqe_status; + ext_status = dev->mqe_ctx.ext_status; + rsp_mqe = ocrdma_get_mqe_rsp(dev); + ocrdma_copy_le32_to_cpu(mqe, rsp_mqe, (sizeof(*mqe))); + if ((mqe->hdr.spcl_sge_cnt_emb & OCRDMA_MQE_HDR_EMB_MASK) >> + OCRDMA_MQE_HDR_EMB_SHIFT) + rsp = &mqe->u.rsp; + + if (cqe_status || ext_status) { + pr_err("%s() cqe_status=0x%x, ext_status=0x%x,", + __func__, cqe_status, ext_status); + if (rsp) { + /* This is for embedded cmds. */ + pr_err("opcode=0x%x, subsystem=0x%x\n", + (rsp->subsys_op & OCRDMA_MBX_RSP_OPCODE_MASK) >> + OCRDMA_MBX_RSP_OPCODE_SHIFT, + (rsp->subsys_op & OCRDMA_MBX_RSP_SUBSYS_MASK) >> + OCRDMA_MBX_RSP_SUBSYS_SHIFT); + } + status = ocrdma_get_mbx_cqe_errno(cqe_status); + goto mbx_err; + } + /* For non embedded, rsp errors are handled in ocrdma_nonemb_mbx_cmd */ + if (rsp && (mqe->u.rsp.status & OCRDMA_MBX_RSP_STATUS_MASK)) + status = ocrdma_get_mbx_errno(mqe->u.rsp.status); +mbx_err: + mutex_unlock(&dev->mqe_ctx.lock); + return status; +} + +static int ocrdma_nonemb_mbx_cmd(struct ocrdma_dev *dev, struct ocrdma_mqe *mqe, + void *payload_va) +{ + int status = 0; + struct ocrdma_mbx_rsp *rsp = payload_va; + + if ((mqe->hdr.spcl_sge_cnt_emb & OCRDMA_MQE_HDR_EMB_MASK) >> + OCRDMA_MQE_HDR_EMB_SHIFT) + BUG(); + + status = ocrdma_mbx_cmd(dev, mqe); + if (!status) + /* For non embedded, only CQE failures are handled in + * ocrdma_mbx_cmd. We need to check for RSP errors. 
+ */ + if (rsp->status & OCRDMA_MBX_RSP_STATUS_MASK) + status = ocrdma_get_mbx_errno(rsp->status); + + if (status) + pr_err("opcode=0x%x, subsystem=0x%x\n", + (rsp->subsys_op & OCRDMA_MBX_RSP_OPCODE_MASK) >> + OCRDMA_MBX_RSP_OPCODE_SHIFT, + (rsp->subsys_op & OCRDMA_MBX_RSP_SUBSYS_MASK) >> + OCRDMA_MBX_RSP_SUBSYS_SHIFT); + return status; +} + +static void ocrdma_get_attr(struct ocrdma_dev *dev, + struct ocrdma_dev_attr *attr, + struct ocrdma_mbx_query_config *rsp) +{ + attr->max_pd = + (rsp->max_pd_ca_ack_delay & OCRDMA_MBX_QUERY_CFG_MAX_PD_MASK) >> + OCRDMA_MBX_QUERY_CFG_MAX_PD_SHIFT; + attr->max_dpp_pds = + (rsp->max_dpp_pds_credits & OCRDMA_MBX_QUERY_CFG_MAX_DPP_PDS_MASK) >> + OCRDMA_MBX_QUERY_CFG_MAX_DPP_PDS_OFFSET; + attr->max_qp = + (rsp->qp_srq_cq_ird_ord & OCRDMA_MBX_QUERY_CFG_MAX_QP_MASK) >> + OCRDMA_MBX_QUERY_CFG_MAX_QP_SHIFT; + attr->max_srq = + (rsp->max_srq_rpir_qps & OCRDMA_MBX_QUERY_CFG_MAX_SRQ_MASK) >> + OCRDMA_MBX_QUERY_CFG_MAX_SRQ_OFFSET; + attr->max_send_sge = ((rsp->max_write_send_sge & + OCRDMA_MBX_QUERY_CFG_MAX_SEND_SGE_MASK) >> + OCRDMA_MBX_QUERY_CFG_MAX_SEND_SGE_SHIFT); + attr->max_recv_sge = (rsp->max_write_send_sge & + OCRDMA_MBX_QUERY_CFG_MAX_SEND_SGE_MASK) >> + OCRDMA_MBX_QUERY_CFG_MAX_SEND_SGE_SHIFT; + attr->max_srq_sge = (rsp->max_srq_rqe_sge & + OCRDMA_MBX_QUERY_CFG_MAX_SRQ_SGE_MASK) >> + OCRDMA_MBX_QUERY_CFG_MAX_SRQ_SGE_OFFSET; + attr->max_rdma_sge = (rsp->max_write_send_sge & + OCRDMA_MBX_QUERY_CFG_MAX_WRITE_SGE_MASK) >> + OCRDMA_MBX_QUERY_CFG_MAX_WRITE_SGE_SHIFT; + attr->max_ord_per_qp = (rsp->max_ird_ord_per_qp & + OCRDMA_MBX_QUERY_CFG_MAX_ORD_PER_QP_MASK) >> + OCRDMA_MBX_QUERY_CFG_MAX_ORD_PER_QP_SHIFT; + attr->max_ird_per_qp = (rsp->max_ird_ord_per_qp & + OCRDMA_MBX_QUERY_CFG_MAX_IRD_PER_QP_MASK) >> + OCRDMA_MBX_QUERY_CFG_MAX_IRD_PER_QP_SHIFT; + attr->cq_overflow_detect = (rsp->qp_srq_cq_ird_ord & + OCRDMA_MBX_QUERY_CFG_CQ_OVERFLOW_MASK) >> + OCRDMA_MBX_QUERY_CFG_CQ_OVERFLOW_SHIFT; + attr->srq_supported = (rsp->qp_srq_cq_ird_ord & + OCRDMA_MBX_QUERY_CFG_SRQ_SUPPORTED_MASK) >> + OCRDMA_MBX_QUERY_CFG_SRQ_SUPPORTED_SHIFT; + attr->local_ca_ack_delay = (rsp->max_pd_ca_ack_delay & + OCRDMA_MBX_QUERY_CFG_CA_ACK_DELAY_MASK) >> + OCRDMA_MBX_QUERY_CFG_CA_ACK_DELAY_SHIFT; + attr->max_mw = rsp->max_mw; + attr->max_mr = rsp->max_mr; + attr->max_mr_size = ((u64)rsp->max_mr_size_hi << 32) | + rsp->max_mr_size_lo; + attr->max_fmr = 0; + attr->max_pages_per_frmr = rsp->max_pages_per_frmr; + attr->max_num_mr_pbl = rsp->max_num_mr_pbl; + attr->max_cqe = rsp->max_cq_cqes_per_cq & + OCRDMA_MBX_QUERY_CFG_MAX_CQES_PER_CQ_MASK; + attr->max_cq = (rsp->max_cq_cqes_per_cq & + OCRDMA_MBX_QUERY_CFG_MAX_CQ_MASK) >> + OCRDMA_MBX_QUERY_CFG_MAX_CQ_OFFSET; + attr->wqe_size = ((rsp->wqe_rqe_stride_max_dpp_cqs & + OCRDMA_MBX_QUERY_CFG_MAX_WQE_SIZE_MASK) >> + OCRDMA_MBX_QUERY_CFG_MAX_WQE_SIZE_OFFSET) * + OCRDMA_WQE_STRIDE; + attr->rqe_size = ((rsp->wqe_rqe_stride_max_dpp_cqs & + OCRDMA_MBX_QUERY_CFG_MAX_RQE_SIZE_MASK) >> + OCRDMA_MBX_QUERY_CFG_MAX_RQE_SIZE_OFFSET) * + OCRDMA_WQE_STRIDE; + attr->max_inline_data = + attr->wqe_size - (sizeof(struct ocrdma_hdr_wqe) + + sizeof(struct ocrdma_sge)); + if (ocrdma_get_asic_type(dev) == OCRDMA_ASIC_GEN_SKH_R) { + attr->ird = 1; + attr->ird_page_size = OCRDMA_MIN_Q_PAGE_SIZE; + attr->num_ird_pages = MAX_OCRDMA_IRD_PAGES; + } + dev->attr.max_wqe = rsp->max_wqes_rqes_per_q >> + OCRDMA_MBX_QUERY_CFG_MAX_WQES_PER_WQ_OFFSET; + dev->attr.max_rqe = rsp->max_wqes_rqes_per_q & + OCRDMA_MBX_QUERY_CFG_MAX_RQES_PER_RQ_MASK; +} + +static int 
ocrdma_check_fw_config(struct ocrdma_dev *dev, + struct ocrdma_fw_conf_rsp *conf) +{ + u32 fn_mode; + + fn_mode = conf->fn_mode & OCRDMA_FN_MODE_RDMA; + if (fn_mode != OCRDMA_FN_MODE_RDMA) + return -EINVAL; + dev->base_eqid = conf->base_eqid; + dev->max_eq = conf->max_eq; + return 0; +} + +/* can be issued only during init time. */ +static int ocrdma_mbx_query_fw_ver(struct ocrdma_dev *dev) +{ + int status = -ENOMEM; + struct ocrdma_mqe *cmd; + struct ocrdma_fw_ver_rsp *rsp; + + cmd = ocrdma_init_emb_mqe(OCRDMA_CMD_GET_FW_VER, sizeof(*cmd)); + if (!cmd) + return -ENOMEM; + ocrdma_init_mch((struct ocrdma_mbx_hdr *)&cmd->u.cmd[0], + OCRDMA_CMD_GET_FW_VER, + OCRDMA_SUBSYS_COMMON, sizeof(*cmd)); + + status = ocrdma_mbx_cmd(dev, (struct ocrdma_mqe *)cmd); + if (status) + goto mbx_err; + rsp = (struct ocrdma_fw_ver_rsp *)cmd; + memset(&dev->attr.fw_ver[0], 0, sizeof(dev->attr.fw_ver)); + memcpy(&dev->attr.fw_ver[0], &rsp->running_ver[0], + sizeof(rsp->running_ver)); + ocrdma_le32_to_cpu(dev->attr.fw_ver, sizeof(rsp->running_ver)); +mbx_err: + kfree(cmd); + return status; +} + +/* can be issued only during init time. */ +static int ocrdma_mbx_query_fw_config(struct ocrdma_dev *dev) +{ + int status = -ENOMEM; + struct ocrdma_mqe *cmd; + struct ocrdma_fw_conf_rsp *rsp; + + cmd = ocrdma_init_emb_mqe(OCRDMA_CMD_GET_FW_CONFIG, sizeof(*cmd)); + if (!cmd) + return -ENOMEM; + ocrdma_init_mch((struct ocrdma_mbx_hdr *)&cmd->u.cmd[0], + OCRDMA_CMD_GET_FW_CONFIG, + OCRDMA_SUBSYS_COMMON, sizeof(*cmd)); + status = ocrdma_mbx_cmd(dev, (struct ocrdma_mqe *)cmd); + if (status) + goto mbx_err; + rsp = (struct ocrdma_fw_conf_rsp *)cmd; + status = ocrdma_check_fw_config(dev, rsp); +mbx_err: + kfree(cmd); + return status; +} + +int ocrdma_mbx_rdma_stats(struct ocrdma_dev *dev, bool reset) +{ + struct ocrdma_rdma_stats_req *req = dev->stats_mem.va; + struct ocrdma_mqe *mqe = &dev->stats_mem.mqe; + struct ocrdma_rdma_stats_resp *old_stats; + int status; + + old_stats = kmalloc(sizeof(*old_stats), GFP_KERNEL); + if (old_stats == NULL) + return -ENOMEM; + + memset(mqe, 0, sizeof(*mqe)); + mqe->hdr.pyld_len = dev->stats_mem.size; + mqe->hdr.spcl_sge_cnt_emb |= + (1 << OCRDMA_MQE_HDR_SGE_CNT_SHIFT) & + OCRDMA_MQE_HDR_SGE_CNT_MASK; + mqe->u.nonemb_req.sge[0].pa_lo = (u32) (dev->stats_mem.pa & 0xffffffff); + mqe->u.nonemb_req.sge[0].pa_hi = (u32) upper_32_bits(dev->stats_mem.pa); + mqe->u.nonemb_req.sge[0].len = dev->stats_mem.size; + + /* Cache the old stats */ + memcpy(old_stats, req, sizeof(struct ocrdma_rdma_stats_resp)); + memset(req, 0, dev->stats_mem.size); + + ocrdma_init_mch((struct ocrdma_mbx_hdr *)req, + OCRDMA_CMD_GET_RDMA_STATS, + OCRDMA_SUBSYS_ROCE, + dev->stats_mem.size); + if (reset) + req->reset_stats = reset; + + status = ocrdma_nonemb_mbx_cmd(dev, mqe, dev->stats_mem.va); + if (status) + /* Copy from cache, if mbox fails */ + memcpy(req, old_stats, sizeof(struct ocrdma_rdma_stats_resp)); + else + ocrdma_le32_to_cpu(req, dev->stats_mem.size); + + kfree(old_stats); + return status; +} + +static int ocrdma_mbx_get_ctrl_attribs(struct ocrdma_dev *dev) +{ + int status = -ENOMEM; + struct ocrdma_dma_mem dma; + struct ocrdma_mqe *mqe; + struct ocrdma_get_ctrl_attribs_rsp *ctrl_attr_rsp; + struct mgmt_hba_attribs *hba_attribs; + + mqe = kzalloc(sizeof(struct ocrdma_mqe), GFP_KERNEL); + if (!mqe) + return status; + + dma.size = sizeof(struct ocrdma_get_ctrl_attribs_rsp); + dma.va = dma_alloc_coherent(&dev->nic_info.pdev->dev, + dma.size, &dma.pa, GFP_KERNEL); + if (!dma.va) + goto free_mqe; + + mqe->hdr.pyld_len = 
dma.size; + mqe->hdr.spcl_sge_cnt_emb |= + (1 << OCRDMA_MQE_HDR_SGE_CNT_SHIFT) & + OCRDMA_MQE_HDR_SGE_CNT_MASK; + mqe->u.nonemb_req.sge[0].pa_lo = (u32) (dma.pa & 0xffffffff); + mqe->u.nonemb_req.sge[0].pa_hi = (u32) upper_32_bits(dma.pa); + mqe->u.nonemb_req.sge[0].len = dma.size; + + memset(dma.va, 0, dma.size); + ocrdma_init_mch((struct ocrdma_mbx_hdr *)dma.va, + OCRDMA_CMD_GET_CTRL_ATTRIBUTES, + OCRDMA_SUBSYS_COMMON, + dma.size); + + status = ocrdma_nonemb_mbx_cmd(dev, mqe, dma.va); + if (!status) { + ctrl_attr_rsp = (struct ocrdma_get_ctrl_attribs_rsp *)dma.va; + hba_attribs = &ctrl_attr_rsp->ctrl_attribs.hba_attribs; + + dev->hba_port_num = (hba_attribs->ptpnum_maxdoms_hbast_cv & + OCRDMA_HBA_ATTRB_PTNUM_MASK) + >> OCRDMA_HBA_ATTRB_PTNUM_SHIFT; + strncpy(dev->model_number, + hba_attribs->controller_model_number, 31); + } + dma_free_coherent(&dev->nic_info.pdev->dev, dma.size, dma.va, dma.pa); +free_mqe: + kfree(mqe); + return status; +} + +static int ocrdma_mbx_query_dev(struct ocrdma_dev *dev) +{ + int status = -ENOMEM; + struct ocrdma_mbx_query_config *rsp; + struct ocrdma_mqe *cmd; + + cmd = ocrdma_init_emb_mqe(OCRDMA_CMD_QUERY_CONFIG, sizeof(*cmd)); + if (!cmd) + return status; + status = ocrdma_mbx_cmd(dev, (struct ocrdma_mqe *)cmd); + if (status) + goto mbx_err; + rsp = (struct ocrdma_mbx_query_config *)cmd; + ocrdma_get_attr(dev, &dev->attr, rsp); +mbx_err: + kfree(cmd); + return status; +} + +int ocrdma_mbx_get_link_speed(struct ocrdma_dev *dev, u8 *lnk_speed) +{ + int status = -ENOMEM; + struct ocrdma_get_link_speed_rsp *rsp; + struct ocrdma_mqe *cmd; + + cmd = ocrdma_init_emb_mqe(OCRDMA_CMD_QUERY_NTWK_LINK_CONFIG_V1, + sizeof(*cmd)); + if (!cmd) + return status; + ocrdma_init_mch((struct ocrdma_mbx_hdr *)&cmd->u.cmd[0], + OCRDMA_CMD_QUERY_NTWK_LINK_CONFIG_V1, + OCRDMA_SUBSYS_COMMON, sizeof(*cmd)); + + ((struct ocrdma_mbx_hdr *)cmd->u.cmd)->rsvd_version = 0x1; + + status = ocrdma_mbx_cmd(dev, (struct ocrdma_mqe *)cmd); + if (status) + goto mbx_err; + + rsp = (struct ocrdma_get_link_speed_rsp *)cmd; + *lnk_speed = (rsp->pflt_pps_ld_pnum & OCRDMA_PHY_PS_MASK) + >> OCRDMA_PHY_PS_SHIFT; + +mbx_err: + kfree(cmd); + return status; +} + +static int ocrdma_mbx_get_phy_info(struct ocrdma_dev *dev) +{ + int status = -ENOMEM; + struct ocrdma_mqe *cmd; + struct ocrdma_get_phy_info_rsp *rsp; + + cmd = ocrdma_init_emb_mqe(OCRDMA_CMD_PHY_DETAILS, sizeof(*cmd)); + if (!cmd) + return status; + + ocrdma_init_mch((struct ocrdma_mbx_hdr *)&cmd->u.cmd[0], + OCRDMA_CMD_PHY_DETAILS, OCRDMA_SUBSYS_COMMON, + sizeof(*cmd)); + + status = ocrdma_mbx_cmd(dev, (struct ocrdma_mqe *)cmd); + if (status) + goto mbx_err; + + rsp = (struct ocrdma_get_phy_info_rsp *)cmd; + dev->phy.phy_type = + (rsp->ityp_ptyp & OCRDMA_PHY_TYPE_MASK); + dev->phy.interface_type = + (rsp->ityp_ptyp & OCRDMA_IF_TYPE_MASK) + >> OCRDMA_IF_TYPE_SHIFT; + dev->phy.auto_speeds_supported = + (rsp->fspeed_aspeed & OCRDMA_ASPEED_SUPP_MASK); + dev->phy.fixed_speeds_supported = + (rsp->fspeed_aspeed & OCRDMA_FSPEED_SUPP_MASK) + >> OCRDMA_FSPEED_SUPP_SHIFT; +mbx_err: + kfree(cmd); + return status; +} + +int ocrdma_mbx_alloc_pd(struct ocrdma_dev *dev, struct ocrdma_pd *pd) +{ + int status = -ENOMEM; + struct ocrdma_alloc_pd *cmd; + struct ocrdma_alloc_pd_rsp *rsp; + + cmd = ocrdma_init_emb_mqe(OCRDMA_CMD_ALLOC_PD, sizeof(*cmd)); + if (!cmd) + return status; + if (pd->dpp_enabled) + cmd->enable_dpp_rsvd |= OCRDMA_ALLOC_PD_ENABLE_DPP; + status = ocrdma_mbx_cmd(dev, (struct ocrdma_mqe *)cmd); + if (status) + goto mbx_err; + rsp = (struct 
ocrdma_alloc_pd_rsp *)cmd; + pd->id = rsp->dpp_page_pdid & OCRDMA_ALLOC_PD_RSP_PDID_MASK; + if (rsp->dpp_page_pdid & OCRDMA_ALLOC_PD_RSP_DPP) { + pd->dpp_enabled = true; + pd->dpp_page = rsp->dpp_page_pdid >> + OCRDMA_ALLOC_PD_RSP_DPP_PAGE_SHIFT; + } else { + pd->dpp_enabled = false; + pd->num_dpp_qp = 0; + } +mbx_err: + kfree(cmd); + return status; +} + +int ocrdma_mbx_dealloc_pd(struct ocrdma_dev *dev, struct ocrdma_pd *pd) +{ + int status = -ENOMEM; + struct ocrdma_dealloc_pd *cmd; + + cmd = ocrdma_init_emb_mqe(OCRDMA_CMD_DEALLOC_PD, sizeof(*cmd)); + if (!cmd) + return status; + cmd->id = pd->id; + status = ocrdma_mbx_cmd(dev, (struct ocrdma_mqe *)cmd); + kfree(cmd); + return status; +} + + +static int ocrdma_mbx_alloc_pd_range(struct ocrdma_dev *dev) +{ + int status = -ENOMEM; + size_t pd_bitmap_size; + struct ocrdma_alloc_pd_range *cmd; + struct ocrdma_alloc_pd_range_rsp *rsp; + + /* Pre allocate the DPP PDs */ + if (dev->attr.max_dpp_pds) { + cmd = ocrdma_init_emb_mqe(OCRDMA_CMD_ALLOC_PD_RANGE, + sizeof(*cmd)); + if (!cmd) + return -ENOMEM; + cmd->pd_count = dev->attr.max_dpp_pds; + cmd->enable_dpp_rsvd |= OCRDMA_ALLOC_PD_ENABLE_DPP; + status = ocrdma_mbx_cmd(dev, (struct ocrdma_mqe *)cmd); + rsp = (struct ocrdma_alloc_pd_range_rsp *)cmd; + + if (!status && (rsp->dpp_page_pdid & OCRDMA_ALLOC_PD_RSP_DPP) && + rsp->pd_count) { + dev->pd_mgr->dpp_page_index = rsp->dpp_page_pdid >> + OCRDMA_ALLOC_PD_RSP_DPP_PAGE_SHIFT; + dev->pd_mgr->pd_dpp_start = rsp->dpp_page_pdid & + OCRDMA_ALLOC_PD_RNG_RSP_START_PDID_MASK; + dev->pd_mgr->max_dpp_pd = rsp->pd_count; + pd_bitmap_size = + BITS_TO_LONGS(rsp->pd_count) * sizeof(long); + dev->pd_mgr->pd_dpp_bitmap = kzalloc(pd_bitmap_size, + GFP_KERNEL); + } + kfree(cmd); + } + + cmd = ocrdma_init_emb_mqe(OCRDMA_CMD_ALLOC_PD_RANGE, sizeof(*cmd)); + if (!cmd) + return -ENOMEM; + + cmd->pd_count = dev->attr.max_pd - dev->attr.max_dpp_pds; + status = ocrdma_mbx_cmd(dev, (struct ocrdma_mqe *)cmd); + rsp = (struct ocrdma_alloc_pd_range_rsp *)cmd; + if (!status && rsp->pd_count) { + dev->pd_mgr->pd_norm_start = rsp->dpp_page_pdid & + OCRDMA_ALLOC_PD_RNG_RSP_START_PDID_MASK; + dev->pd_mgr->max_normal_pd = rsp->pd_count; + pd_bitmap_size = BITS_TO_LONGS(rsp->pd_count) * sizeof(long); + dev->pd_mgr->pd_norm_bitmap = kzalloc(pd_bitmap_size, + GFP_KERNEL); + } + kfree(cmd); + + if (dev->pd_mgr->pd_norm_bitmap || dev->pd_mgr->pd_dpp_bitmap) { + /* Enable PD resource manager */ + dev->pd_mgr->pd_prealloc_valid = true; + return 0; + } + return status; +} + +static void ocrdma_mbx_dealloc_pd_range(struct ocrdma_dev *dev) +{ + struct ocrdma_dealloc_pd_range *cmd; + + /* return normal PDs to firmware */ + cmd = ocrdma_init_emb_mqe(OCRDMA_CMD_DEALLOC_PD_RANGE, sizeof(*cmd)); + if (!cmd) + goto mbx_err; + + if (dev->pd_mgr->max_normal_pd) { + cmd->start_pd_id = dev->pd_mgr->pd_norm_start; + cmd->pd_count = dev->pd_mgr->max_normal_pd; + ocrdma_mbx_cmd(dev, (struct ocrdma_mqe *)cmd); + } + + if (dev->pd_mgr->max_dpp_pd) { + kfree(cmd); + /* return DPP PDs to firmware */ + cmd = ocrdma_init_emb_mqe(OCRDMA_CMD_DEALLOC_PD_RANGE, + sizeof(*cmd)); + if (!cmd) + goto mbx_err; + + cmd->start_pd_id = dev->pd_mgr->pd_dpp_start; + cmd->pd_count = dev->pd_mgr->max_dpp_pd; + ocrdma_mbx_cmd(dev, (struct ocrdma_mqe *)cmd); + } +mbx_err: + kfree(cmd); +} + +void ocrdma_alloc_pd_pool(struct ocrdma_dev *dev) +{ + int status; + + dev->pd_mgr = kzalloc(sizeof(struct ocrdma_pd_resource_mgr), + GFP_KERNEL); + if (!dev->pd_mgr) { + pr_err("%s(%d)Memory allocation failure.\n", __func__, dev->id); + 
return; + } + status = ocrdma_mbx_alloc_pd_range(dev); + if (status) { + pr_err("%s(%d) Unable to initialize PD pool, using default.\n", + __func__, dev->id); + } +} + +static void ocrdma_free_pd_pool(struct ocrdma_dev *dev) +{ + ocrdma_mbx_dealloc_pd_range(dev); + kfree(dev->pd_mgr->pd_norm_bitmap); + kfree(dev->pd_mgr->pd_dpp_bitmap); + kfree(dev->pd_mgr); +} + +static int ocrdma_build_q_conf(u32 *num_entries, int entry_size, + int *num_pages, int *page_size) +{ + int i; + int mem_size; + + *num_entries = roundup_pow_of_two(*num_entries); + mem_size = *num_entries * entry_size; + /* find the possible lowest possible multiplier */ + for (i = 0; i < OCRDMA_MAX_Q_PAGE_SIZE_CNT; i++) { + if (mem_size <= (OCRDMA_Q_PAGE_BASE_SIZE << i)) + break; + } + if (i >= OCRDMA_MAX_Q_PAGE_SIZE_CNT) + return -EINVAL; + mem_size = roundup(mem_size, + ((OCRDMA_Q_PAGE_BASE_SIZE << i) / OCRDMA_MAX_Q_PAGES)); + *num_pages = + mem_size / ((OCRDMA_Q_PAGE_BASE_SIZE << i) / OCRDMA_MAX_Q_PAGES); + *page_size = ((OCRDMA_Q_PAGE_BASE_SIZE << i) / OCRDMA_MAX_Q_PAGES); + *num_entries = mem_size / entry_size; + return 0; +} + +static int ocrdma_mbx_create_ah_tbl(struct ocrdma_dev *dev) +{ + int i; + int status = 0; + int max_ah; + struct ocrdma_create_ah_tbl *cmd; + struct ocrdma_create_ah_tbl_rsp *rsp; + struct pci_dev *pdev = dev->nic_info.pdev; + dma_addr_t pa; + struct ocrdma_pbe *pbes; + + cmd = ocrdma_init_emb_mqe(OCRDMA_CMD_CREATE_AH_TBL, sizeof(*cmd)); + if (!cmd) + return status; + + max_ah = OCRDMA_MAX_AH; + dev->av_tbl.size = sizeof(struct ocrdma_av) * max_ah; + + /* number of PBEs in PBL */ + cmd->ah_conf = (OCRDMA_AH_TBL_PAGES << + OCRDMA_CREATE_AH_NUM_PAGES_SHIFT) & + OCRDMA_CREATE_AH_NUM_PAGES_MASK; + + /* page size */ + for (i = 0; i < OCRDMA_MAX_Q_PAGE_SIZE_CNT; i++) { + if (PAGE_SIZE == (OCRDMA_MIN_Q_PAGE_SIZE << i)) + break; + } + cmd->ah_conf |= (i << OCRDMA_CREATE_AH_PAGE_SIZE_SHIFT) & + OCRDMA_CREATE_AH_PAGE_SIZE_MASK; + + /* ah_entry size */ + cmd->ah_conf |= (sizeof(struct ocrdma_av) << + OCRDMA_CREATE_AH_ENTRY_SIZE_SHIFT) & + OCRDMA_CREATE_AH_ENTRY_SIZE_MASK; + + dev->av_tbl.pbl.va = dma_alloc_coherent(&pdev->dev, PAGE_SIZE, + &dev->av_tbl.pbl.pa, + GFP_KERNEL); + if (dev->av_tbl.pbl.va == NULL) + goto mem_err; + + dev->av_tbl.va = dma_alloc_coherent(&pdev->dev, dev->av_tbl.size, + &pa, GFP_KERNEL); + if (dev->av_tbl.va == NULL) + goto mem_err_ah; + dev->av_tbl.pa = pa; + dev->av_tbl.num_ah = max_ah; + memset(dev->av_tbl.va, 0, dev->av_tbl.size); + + pbes = (struct ocrdma_pbe *)dev->av_tbl.pbl.va; + for (i = 0; i < dev->av_tbl.size / OCRDMA_MIN_Q_PAGE_SIZE; i++) { + pbes[i].pa_lo = (u32)cpu_to_le32(pa & 0xffffffff); + pbes[i].pa_hi = (u32)cpu_to_le32(upper_32_bits(pa)); + pa += PAGE_SIZE; + } + cmd->tbl_addr[0].lo = (u32)(dev->av_tbl.pbl.pa & 0xFFFFFFFF); + cmd->tbl_addr[0].hi = (u32)upper_32_bits(dev->av_tbl.pbl.pa); + status = ocrdma_mbx_cmd(dev, (struct ocrdma_mqe *)cmd); + if (status) + goto mbx_err; + rsp = (struct ocrdma_create_ah_tbl_rsp *)cmd; + dev->av_tbl.ahid = rsp->ahid & 0xFFFF; + kfree(cmd); + return 0; + +mbx_err: + dma_free_coherent(&pdev->dev, dev->av_tbl.size, dev->av_tbl.va, + dev->av_tbl.pa); + dev->av_tbl.va = NULL; +mem_err_ah: + dma_free_coherent(&pdev->dev, PAGE_SIZE, dev->av_tbl.pbl.va, + dev->av_tbl.pbl.pa); + dev->av_tbl.pbl.va = NULL; + dev->av_tbl.size = 0; +mem_err: + kfree(cmd); + return status; +} + +static void ocrdma_mbx_delete_ah_tbl(struct ocrdma_dev *dev) +{ + struct ocrdma_delete_ah_tbl *cmd; + struct pci_dev *pdev = dev->nic_info.pdev; + + if (dev->av_tbl.va 
== NULL) + return; + + cmd = ocrdma_init_emb_mqe(OCRDMA_CMD_DELETE_AH_TBL, sizeof(*cmd)); + if (!cmd) + return; + cmd->ahid = dev->av_tbl.ahid; + + ocrdma_mbx_cmd(dev, (struct ocrdma_mqe *)cmd); + dma_free_coherent(&pdev->dev, dev->av_tbl.size, dev->av_tbl.va, + dev->av_tbl.pa); + dev->av_tbl.va = NULL; + dma_free_coherent(&pdev->dev, PAGE_SIZE, dev->av_tbl.pbl.va, + dev->av_tbl.pbl.pa); + kfree(cmd); +} + +/* Multiple CQs uses the EQ. This routine returns least used + * EQ to associate with CQ. This will distributes the interrupt + * processing and CPU load to associated EQ, vector and so to that CPU. + */ +static u16 ocrdma_bind_eq(struct ocrdma_dev *dev) +{ + int i, selected_eq = 0, cq_cnt = 0; + u16 eq_id; + + mutex_lock(&dev->dev_lock); + cq_cnt = dev->eq_tbl[0].cq_cnt; + eq_id = dev->eq_tbl[0].q.id; + /* find the EQ which is has the least number of + * CQs associated with it. + */ + for (i = 0; i < dev->eq_cnt; i++) { + if (dev->eq_tbl[i].cq_cnt < cq_cnt) { + cq_cnt = dev->eq_tbl[i].cq_cnt; + eq_id = dev->eq_tbl[i].q.id; + selected_eq = i; + } + } + dev->eq_tbl[selected_eq].cq_cnt += 1; + mutex_unlock(&dev->dev_lock); + return eq_id; +} + +static void ocrdma_unbind_eq(struct ocrdma_dev *dev, u16 eq_id) +{ + int i; + + mutex_lock(&dev->dev_lock); + i = ocrdma_get_eq_table_index(dev, eq_id); + if (i == -EINVAL) + BUG(); + dev->eq_tbl[i].cq_cnt -= 1; + mutex_unlock(&dev->dev_lock); +} + +int ocrdma_mbx_create_cq(struct ocrdma_dev *dev, struct ocrdma_cq *cq, + int entries, int dpp_cq, u16 pd_id) +{ + int status = -ENOMEM; int max_hw_cqe; + struct pci_dev *pdev = dev->nic_info.pdev; + struct ocrdma_create_cq *cmd; + struct ocrdma_create_cq_rsp *rsp; + u32 hw_pages, cqe_size, page_size, cqe_count; + + if (entries > dev->attr.max_cqe) { + pr_err("%s(%d) max_cqe=0x%x, requester_cqe=0x%x\n", + __func__, dev->id, dev->attr.max_cqe, entries); + return -EINVAL; + } + if (dpp_cq && (ocrdma_get_asic_type(dev) != OCRDMA_ASIC_GEN_SKH_R)) + return -EINVAL; + + if (dpp_cq) { + cq->max_hw_cqe = 1; + max_hw_cqe = 1; + cqe_size = OCRDMA_DPP_CQE_SIZE; + hw_pages = 1; + } else { + cq->max_hw_cqe = dev->attr.max_cqe; + max_hw_cqe = dev->attr.max_cqe; + cqe_size = sizeof(struct ocrdma_cqe); + hw_pages = OCRDMA_CREATE_CQ_MAX_PAGES; + } + + cq->len = roundup(max_hw_cqe * cqe_size, OCRDMA_MIN_Q_PAGE_SIZE); + + cmd = ocrdma_init_emb_mqe(OCRDMA_CMD_CREATE_CQ, sizeof(*cmd)); + if (!cmd) + return -ENOMEM; + ocrdma_init_mch(&cmd->cmd.req, OCRDMA_CMD_CREATE_CQ, + OCRDMA_SUBSYS_COMMON, sizeof(*cmd)); + cq->va = dma_alloc_coherent(&pdev->dev, cq->len, &cq->pa, GFP_KERNEL); + if (!cq->va) { + status = -ENOMEM; + goto mem_err; + } + memset(cq->va, 0, cq->len); + page_size = cq->len / hw_pages; + cmd->cmd.pgsz_pgcnt = (page_size / OCRDMA_MIN_Q_PAGE_SIZE) << + OCRDMA_CREATE_CQ_PAGE_SIZE_SHIFT; + cmd->cmd.pgsz_pgcnt |= hw_pages; + cmd->cmd.ev_cnt_flags = OCRDMA_CREATE_CQ_DEF_FLAGS; + + cq->eqn = ocrdma_bind_eq(dev); + cmd->cmd.req.rsvd_version = OCRDMA_CREATE_CQ_VER3; + cqe_count = cq->len / cqe_size; + cq->cqe_cnt = cqe_count; + if (cqe_count > 1024) { + /* Set cnt to 3 to indicate more than 1024 cq entries */ + cmd->cmd.ev_cnt_flags |= (0x3 << OCRDMA_CREATE_CQ_CNT_SHIFT); + } else { + u8 count = 0; + switch (cqe_count) { + case 256: + count = 0; + break; + case 512: + count = 1; + break; + case 1024: + count = 2; + break; + default: + goto mbx_err; + } + cmd->cmd.ev_cnt_flags |= (count << OCRDMA_CREATE_CQ_CNT_SHIFT); + } + /* shared eq between all the consumer cqs. 
*/ + cmd->cmd.eqn = cq->eqn; + if (ocrdma_get_asic_type(dev) == OCRDMA_ASIC_GEN_SKH_R) { + if (dpp_cq) + cmd->cmd.pgsz_pgcnt |= OCRDMA_CREATE_CQ_DPP << + OCRDMA_CREATE_CQ_TYPE_SHIFT; + cq->phase_change = false; + cmd->cmd.pdid_cqecnt = (cq->len / cqe_size); + } else { + cmd->cmd.pdid_cqecnt = (cq->len / cqe_size) - 1; + cmd->cmd.ev_cnt_flags |= OCRDMA_CREATE_CQ_FLAGS_AUTO_VALID; + cq->phase_change = true; + } + + /* pd_id valid only for v3 */ + cmd->cmd.pdid_cqecnt |= (pd_id << + OCRDMA_CREATE_CQ_CMD_PDID_SHIFT); + ocrdma_build_q_pages(&cmd->cmd.pa[0], hw_pages, cq->pa, page_size); + status = ocrdma_mbx_cmd(dev, (struct ocrdma_mqe *)cmd); + if (status) + goto mbx_err; + + rsp = (struct ocrdma_create_cq_rsp *)cmd; + cq->id = (u16) (rsp->rsp.cq_id & OCRDMA_CREATE_CQ_RSP_CQ_ID_MASK); + kfree(cmd); + return 0; +mbx_err: + ocrdma_unbind_eq(dev, cq->eqn); + dma_free_coherent(&pdev->dev, cq->len, cq->va, cq->pa); +mem_err: + kfree(cmd); + return status; +} + +int ocrdma_mbx_destroy_cq(struct ocrdma_dev *dev, struct ocrdma_cq *cq) +{ + int status = -ENOMEM; + struct ocrdma_destroy_cq *cmd; + + cmd = ocrdma_init_emb_mqe(OCRDMA_CMD_DELETE_CQ, sizeof(*cmd)); + if (!cmd) + return status; + ocrdma_init_mch(&cmd->req, OCRDMA_CMD_DELETE_CQ, + OCRDMA_SUBSYS_COMMON, sizeof(*cmd)); + + cmd->bypass_flush_qid |= + (cq->id << OCRDMA_DESTROY_CQ_QID_SHIFT) & + OCRDMA_DESTROY_CQ_QID_MASK; + + status = ocrdma_mbx_cmd(dev, (struct ocrdma_mqe *)cmd); + ocrdma_unbind_eq(dev, cq->eqn); + dma_free_coherent(&dev->nic_info.pdev->dev, cq->len, cq->va, cq->pa); + kfree(cmd); + return status; +} + +int ocrdma_mbx_alloc_lkey(struct ocrdma_dev *dev, struct ocrdma_hw_mr *hwmr, + u32 pdid, int addr_check) +{ + int status = -ENOMEM; + struct ocrdma_alloc_lkey *cmd; + struct ocrdma_alloc_lkey_rsp *rsp; + + cmd = ocrdma_init_emb_mqe(OCRDMA_CMD_ALLOC_LKEY, sizeof(*cmd)); + if (!cmd) + return status; + cmd->pdid = pdid; + cmd->pbl_sz_flags |= addr_check; + cmd->pbl_sz_flags |= (hwmr->fr_mr << OCRDMA_ALLOC_LKEY_FMR_SHIFT); + cmd->pbl_sz_flags |= + (hwmr->remote_wr << OCRDMA_ALLOC_LKEY_REMOTE_WR_SHIFT); + cmd->pbl_sz_flags |= + (hwmr->remote_rd << OCRDMA_ALLOC_LKEY_REMOTE_RD_SHIFT); + cmd->pbl_sz_flags |= + (hwmr->local_wr << OCRDMA_ALLOC_LKEY_LOCAL_WR_SHIFT); + cmd->pbl_sz_flags |= + (hwmr->remote_atomic << OCRDMA_ALLOC_LKEY_REMOTE_ATOMIC_SHIFT); + cmd->pbl_sz_flags |= + (hwmr->num_pbls << OCRDMA_ALLOC_LKEY_PBL_SIZE_SHIFT); + + status = ocrdma_mbx_cmd(dev, (struct ocrdma_mqe *)cmd); + if (status) + goto mbx_err; + rsp = (struct ocrdma_alloc_lkey_rsp *)cmd; + hwmr->lkey = rsp->lrkey; +mbx_err: + kfree(cmd); + return status; +} + +int ocrdma_mbx_dealloc_lkey(struct ocrdma_dev *dev, int fr_mr, u32 lkey) +{ + int status = -ENOMEM; + struct ocrdma_dealloc_lkey *cmd; + + cmd = ocrdma_init_emb_mqe(OCRDMA_CMD_DEALLOC_LKEY, sizeof(*cmd)); + if (!cmd) + return -ENOMEM; + cmd->lkey = lkey; + cmd->rsvd_frmr = fr_mr ? 
1 : 0; + status = ocrdma_mbx_cmd(dev, (struct ocrdma_mqe *)cmd); + if (status) + goto mbx_err; +mbx_err: + kfree(cmd); + return status; +} + +static int ocrdma_mbx_reg_mr(struct ocrdma_dev *dev, struct ocrdma_hw_mr *hwmr, + u32 pdid, u32 pbl_cnt, u32 pbe_size, u32 last) +{ + int status = -ENOMEM; + int i; + struct ocrdma_reg_nsmr *cmd; + struct ocrdma_reg_nsmr_rsp *rsp; + + cmd = ocrdma_init_emb_mqe(OCRDMA_CMD_REGISTER_NSMR, sizeof(*cmd)); + if (!cmd) + return -ENOMEM; + cmd->num_pbl_pdid = + pdid | (hwmr->num_pbls << OCRDMA_REG_NSMR_NUM_PBL_SHIFT); + cmd->fr_mr = hwmr->fr_mr; + + cmd->flags_hpage_pbe_sz |= (hwmr->remote_wr << + OCRDMA_REG_NSMR_REMOTE_WR_SHIFT); + cmd->flags_hpage_pbe_sz |= (hwmr->remote_rd << + OCRDMA_REG_NSMR_REMOTE_RD_SHIFT); + cmd->flags_hpage_pbe_sz |= (hwmr->local_wr << + OCRDMA_REG_NSMR_LOCAL_WR_SHIFT); + cmd->flags_hpage_pbe_sz |= (hwmr->remote_atomic << + OCRDMA_REG_NSMR_REMOTE_ATOMIC_SHIFT); + cmd->flags_hpage_pbe_sz |= (hwmr->mw_bind << + OCRDMA_REG_NSMR_BIND_MEMWIN_SHIFT); + cmd->flags_hpage_pbe_sz |= (last << OCRDMA_REG_NSMR_LAST_SHIFT); + + cmd->flags_hpage_pbe_sz |= (hwmr->pbe_size / OCRDMA_MIN_HPAGE_SIZE); + cmd->flags_hpage_pbe_sz |= (hwmr->pbl_size / OCRDMA_MIN_HPAGE_SIZE) << + OCRDMA_REG_NSMR_HPAGE_SIZE_SHIFT; + cmd->totlen_low = hwmr->len; + cmd->totlen_high = upper_32_bits(hwmr->len); + cmd->fbo_low = (u32) (hwmr->fbo & 0xffffffff); + cmd->fbo_high = (u32) upper_32_bits(hwmr->fbo); + cmd->va_loaddr = (u32) hwmr->va; + cmd->va_hiaddr = (u32) upper_32_bits(hwmr->va); + + for (i = 0; i < pbl_cnt; i++) { + cmd->pbl[i].lo = (u32) (hwmr->pbl_table[i].pa & 0xffffffff); + cmd->pbl[i].hi = upper_32_bits(hwmr->pbl_table[i].pa); + } + status = ocrdma_mbx_cmd(dev, (struct ocrdma_mqe *)cmd); + if (status) + goto mbx_err; + rsp = (struct ocrdma_reg_nsmr_rsp *)cmd; + hwmr->lkey = rsp->lrkey; +mbx_err: + kfree(cmd); + return status; +} + +static int ocrdma_mbx_reg_mr_cont(struct ocrdma_dev *dev, + struct ocrdma_hw_mr *hwmr, u32 pbl_cnt, + u32 pbl_offset, u32 last) +{ + int status = -ENOMEM; + int i; + struct ocrdma_reg_nsmr_cont *cmd; + + cmd = ocrdma_init_emb_mqe(OCRDMA_CMD_REGISTER_NSMR_CONT, sizeof(*cmd)); + if (!cmd) + return -ENOMEM; + cmd->lrkey = hwmr->lkey; + cmd->num_pbl_offset = (pbl_cnt << OCRDMA_REG_NSMR_CONT_NUM_PBL_SHIFT) | + (pbl_offset & OCRDMA_REG_NSMR_CONT_PBL_SHIFT_MASK); + cmd->last = last << OCRDMA_REG_NSMR_CONT_LAST_SHIFT; + + for (i = 0; i < pbl_cnt; i++) { + cmd->pbl[i].lo = + (u32) (hwmr->pbl_table[i + pbl_offset].pa & 0xffffffff); + cmd->pbl[i].hi = + upper_32_bits(hwmr->pbl_table[i + pbl_offset].pa); + } + status = ocrdma_mbx_cmd(dev, (struct ocrdma_mqe *)cmd); + if (status) + goto mbx_err; +mbx_err: + kfree(cmd); + return status; +} + +int ocrdma_reg_mr(struct ocrdma_dev *dev, + struct ocrdma_hw_mr *hwmr, u32 pdid, int acc) +{ + int status; + u32 last = 0; + u32 cur_pbl_cnt, pbl_offset; + u32 pending_pbl_cnt = hwmr->num_pbls; + + pbl_offset = 0; + cur_pbl_cnt = min(pending_pbl_cnt, MAX_OCRDMA_NSMR_PBL); + if (cur_pbl_cnt == pending_pbl_cnt) + last = 1; + + status = ocrdma_mbx_reg_mr(dev, hwmr, pdid, + cur_pbl_cnt, hwmr->pbe_size, last); + if (status) { + pr_err("%s() status=%d\n", __func__, status); + return status; + } + /* if there is no more pbls to register then exit. 
*/ + if (last) + return 0; + + while (!last) { + pbl_offset += cur_pbl_cnt; + pending_pbl_cnt -= cur_pbl_cnt; + cur_pbl_cnt = min(pending_pbl_cnt, MAX_OCRDMA_NSMR_PBL); + /* if we reach the end of the pbls, then need to set the last + * bit, indicating no more pbls to register for this memory key. + */ + if (cur_pbl_cnt == pending_pbl_cnt) + last = 1; + + status = ocrdma_mbx_reg_mr_cont(dev, hwmr, cur_pbl_cnt, + pbl_offset, last); + if (status) + break; + } + if (status) + pr_err("%s() err. status=%d\n", __func__, status); + + return status; +} + +bool ocrdma_is_qp_in_sq_flushlist(struct ocrdma_cq *cq, struct ocrdma_qp *qp) +{ + struct ocrdma_qp *tmp; + bool found = false; + list_for_each_entry(tmp, &cq->sq_head, sq_entry) { + if (qp == tmp) { + found = true; + break; + } + } + return found; +} + +bool ocrdma_is_qp_in_rq_flushlist(struct ocrdma_cq *cq, struct ocrdma_qp *qp) +{ + struct ocrdma_qp *tmp; + bool found = false; + list_for_each_entry(tmp, &cq->rq_head, rq_entry) { + if (qp == tmp) { + found = true; + break; + } + } + return found; +} + +void ocrdma_flush_qp(struct ocrdma_qp *qp) +{ + bool found; + unsigned long flags; + struct ocrdma_dev *dev = get_ocrdma_dev(qp->ibqp.device); + + spin_lock_irqsave(&dev->flush_q_lock, flags); + found = ocrdma_is_qp_in_sq_flushlist(qp->sq_cq, qp); + if (!found) + list_add_tail(&qp->sq_entry, &qp->sq_cq->sq_head); + if (!qp->srq) { + found = ocrdma_is_qp_in_rq_flushlist(qp->rq_cq, qp); + if (!found) + list_add_tail(&qp->rq_entry, &qp->rq_cq->rq_head); + } + spin_unlock_irqrestore(&dev->flush_q_lock, flags); +} + +static void ocrdma_init_hwq_ptr(struct ocrdma_qp *qp) +{ + qp->sq.head = 0; + qp->sq.tail = 0; + qp->rq.head = 0; + qp->rq.tail = 0; +} + +int ocrdma_qp_state_change(struct ocrdma_qp *qp, enum ib_qp_state new_ib_state, + enum ib_qp_state *old_ib_state) +{ + unsigned long flags; + int status = 0; + enum ocrdma_qp_state new_state; + new_state = get_ocrdma_qp_state(new_ib_state); + + /* sync with wqe and rqe posting */ + spin_lock_irqsave(&qp->q_lock, flags); + + if (old_ib_state) + *old_ib_state = get_ibqp_state(qp->state); + if (new_state == qp->state) { + spin_unlock_irqrestore(&qp->q_lock, flags); + return 1; + } + + + if (new_state == OCRDMA_QPS_INIT) { + ocrdma_init_hwq_ptr(qp); + ocrdma_del_flush_qp(qp); + } else if (new_state == OCRDMA_QPS_ERR) { + ocrdma_flush_qp(qp); + } + + qp->state = new_state; + + spin_unlock_irqrestore(&qp->q_lock, flags); + return status; +} + +static u32 ocrdma_set_create_qp_mbx_access_flags(struct ocrdma_qp *qp) +{ + u32 flags = 0; + if (qp->cap_flags & OCRDMA_QP_INB_RD) + flags |= OCRDMA_CREATE_QP_REQ_INB_RDEN_MASK; + if (qp->cap_flags & OCRDMA_QP_INB_WR) + flags |= OCRDMA_CREATE_QP_REQ_INB_WREN_MASK; + if (qp->cap_flags & OCRDMA_QP_MW_BIND) + flags |= OCRDMA_CREATE_QP_REQ_BIND_MEMWIN_MASK; + if (qp->cap_flags & OCRDMA_QP_LKEY0) + flags |= OCRDMA_CREATE_QP_REQ_ZERO_LKEYEN_MASK; + if (qp->cap_flags & OCRDMA_QP_FAST_REG) + flags |= OCRDMA_CREATE_QP_REQ_FMR_EN_MASK; + return flags; +} + +static int ocrdma_set_create_qp_sq_cmd(struct ocrdma_create_qp_req *cmd, + struct ib_qp_init_attr *attrs, + struct ocrdma_qp *qp) +{ + int status; + u32 len, hw_pages, hw_page_size; + dma_addr_t pa; + struct ocrdma_pd *pd = qp->pd; + struct ocrdma_dev *dev = get_ocrdma_dev(pd->ibpd.device); + struct pci_dev *pdev = dev->nic_info.pdev; + u32 max_wqe_allocated; + u32 max_sges = attrs->cap.max_send_sge; + + /* QP1 may exceed 127 */ + max_wqe_allocated = min_t(u32, attrs->cap.max_send_wr + 1, + dev->attr.max_wqe); + + status = 
ocrdma_build_q_conf(&max_wqe_allocated, + dev->attr.wqe_size, &hw_pages, &hw_page_size); + if (status) { + pr_err("%s() req. max_send_wr=0x%x\n", __func__, + max_wqe_allocated); + return -EINVAL; + } + qp->sq.max_cnt = max_wqe_allocated; + len = (hw_pages * hw_page_size); + + qp->sq.va = dma_alloc_coherent(&pdev->dev, len, &pa, GFP_KERNEL); + if (!qp->sq.va) + return -EINVAL; + memset(qp->sq.va, 0, len); + qp->sq.len = len; + qp->sq.pa = pa; + qp->sq.entry_size = dev->attr.wqe_size; + ocrdma_build_q_pages(&cmd->wq_addr[0], hw_pages, pa, hw_page_size); + + cmd->type_pgsz_pdn |= (ilog2(hw_page_size / OCRDMA_MIN_Q_PAGE_SIZE) + << OCRDMA_CREATE_QP_REQ_SQ_PAGE_SIZE_SHIFT); + cmd->num_wq_rq_pages |= (hw_pages << + OCRDMA_CREATE_QP_REQ_NUM_WQ_PAGES_SHIFT) & + OCRDMA_CREATE_QP_REQ_NUM_WQ_PAGES_MASK; + cmd->max_sge_send_write |= (max_sges << + OCRDMA_CREATE_QP_REQ_MAX_SGE_SEND_SHIFT) & + OCRDMA_CREATE_QP_REQ_MAX_SGE_SEND_MASK; + cmd->max_sge_send_write |= (max_sges << + OCRDMA_CREATE_QP_REQ_MAX_SGE_WRITE_SHIFT) & + OCRDMA_CREATE_QP_REQ_MAX_SGE_WRITE_MASK; + cmd->max_wqe_rqe |= (ilog2(qp->sq.max_cnt) << + OCRDMA_CREATE_QP_REQ_MAX_WQE_SHIFT) & + OCRDMA_CREATE_QP_REQ_MAX_WQE_MASK; + cmd->wqe_rqe_size |= (dev->attr.wqe_size << + OCRDMA_CREATE_QP_REQ_WQE_SIZE_SHIFT) & + OCRDMA_CREATE_QP_REQ_WQE_SIZE_MASK; + return 0; +} + +static int ocrdma_set_create_qp_rq_cmd(struct ocrdma_create_qp_req *cmd, + struct ib_qp_init_attr *attrs, + struct ocrdma_qp *qp) +{ + int status; + u32 len, hw_pages, hw_page_size; + dma_addr_t pa = 0; + struct ocrdma_pd *pd = qp->pd; + struct ocrdma_dev *dev = get_ocrdma_dev(pd->ibpd.device); + struct pci_dev *pdev = dev->nic_info.pdev; + u32 max_rqe_allocated = attrs->cap.max_recv_wr + 1; + + status = ocrdma_build_q_conf(&max_rqe_allocated, dev->attr.rqe_size, + &hw_pages, &hw_page_size); + if (status) { + pr_err("%s() req. 
max_recv_wr=0x%x\n", __func__, + attrs->cap.max_recv_wr + 1); + return status; + } + qp->rq.max_cnt = max_rqe_allocated; + len = (hw_pages * hw_page_size); + + qp->rq.va = dma_alloc_coherent(&pdev->dev, len, &pa, GFP_KERNEL); + if (!qp->rq.va) + return -ENOMEM; + memset(qp->rq.va, 0, len); + qp->rq.pa = pa; + qp->rq.len = len; + qp->rq.entry_size = dev->attr.rqe_size; + + ocrdma_build_q_pages(&cmd->rq_addr[0], hw_pages, pa, hw_page_size); + cmd->type_pgsz_pdn |= (ilog2(hw_page_size / OCRDMA_MIN_Q_PAGE_SIZE) << + OCRDMA_CREATE_QP_REQ_RQ_PAGE_SIZE_SHIFT); + cmd->num_wq_rq_pages |= + (hw_pages << OCRDMA_CREATE_QP_REQ_NUM_RQ_PAGES_SHIFT) & + OCRDMA_CREATE_QP_REQ_NUM_RQ_PAGES_MASK; + cmd->max_sge_recv_flags |= (attrs->cap.max_recv_sge << + OCRDMA_CREATE_QP_REQ_MAX_SGE_RECV_SHIFT) & + OCRDMA_CREATE_QP_REQ_MAX_SGE_RECV_MASK; + cmd->max_wqe_rqe |= (ilog2(qp->rq.max_cnt) << + OCRDMA_CREATE_QP_REQ_MAX_RQE_SHIFT) & + OCRDMA_CREATE_QP_REQ_MAX_RQE_MASK; + cmd->wqe_rqe_size |= (dev->attr.rqe_size << + OCRDMA_CREATE_QP_REQ_RQE_SIZE_SHIFT) & + OCRDMA_CREATE_QP_REQ_RQE_SIZE_MASK; + return 0; +} + +static void ocrdma_set_create_qp_dpp_cmd(struct ocrdma_create_qp_req *cmd, + struct ocrdma_pd *pd, + struct ocrdma_qp *qp, + u8 enable_dpp_cq, u16 dpp_cq_id) +{ + pd->num_dpp_qp--; + qp->dpp_enabled = true; + cmd->max_sge_recv_flags |= OCRDMA_CREATE_QP_REQ_ENABLE_DPP_MASK; + if (!enable_dpp_cq) + return; + cmd->max_sge_recv_flags |= OCRDMA_CREATE_QP_REQ_ENABLE_DPP_MASK; + cmd->dpp_credits_cqid = dpp_cq_id; + cmd->dpp_credits_cqid |= OCRDMA_CREATE_QP_REQ_DPP_CREDIT_LIMIT << + OCRDMA_CREATE_QP_REQ_DPP_CREDIT_SHIFT; +} + +static int ocrdma_set_create_qp_ird_cmd(struct ocrdma_create_qp_req *cmd, + struct ocrdma_qp *qp) +{ + struct ocrdma_pd *pd = qp->pd; + struct ocrdma_dev *dev = get_ocrdma_dev(pd->ibpd.device); + struct pci_dev *pdev = dev->nic_info.pdev; + dma_addr_t pa = 0; + int ird_page_size = dev->attr.ird_page_size; + int ird_q_len = dev->attr.num_ird_pages * ird_page_size; + struct ocrdma_hdr_wqe *rqe; + int i = 0; + + if (dev->attr.ird == 0) + return 0; + + qp->ird_q_va = dma_alloc_coherent(&pdev->dev, ird_q_len, + &pa, GFP_KERNEL); + if (!qp->ird_q_va) + return -ENOMEM; + memset(qp->ird_q_va, 0, ird_q_len); + ocrdma_build_q_pages(&cmd->ird_addr[0], dev->attr.num_ird_pages, + pa, ird_page_size); + for (; i < ird_q_len / dev->attr.rqe_size; i++) { + rqe = (struct ocrdma_hdr_wqe *)(qp->ird_q_va + + (i * dev->attr.rqe_size)); + rqe->cw = 0; + rqe->cw |= 2; + rqe->cw |= (OCRDMA_TYPE_LKEY << OCRDMA_WQE_TYPE_SHIFT); + rqe->cw |= (8 << OCRDMA_WQE_SIZE_SHIFT); + rqe->cw |= (8 << OCRDMA_WQE_NXT_WQE_SIZE_SHIFT); + } + return 0; +} + +static void ocrdma_get_create_qp_rsp(struct ocrdma_create_qp_rsp *rsp, + struct ocrdma_qp *qp, + struct ib_qp_init_attr *attrs, + u16 *dpp_offset, u16 *dpp_credit_lmt) +{ + u32 max_wqe_allocated, max_rqe_allocated; + qp->id = rsp->qp_id & OCRDMA_CREATE_QP_RSP_QP_ID_MASK; + qp->rq.dbid = rsp->sq_rq_id & OCRDMA_CREATE_QP_RSP_RQ_ID_MASK; + qp->sq.dbid = rsp->sq_rq_id >> OCRDMA_CREATE_QP_RSP_SQ_ID_SHIFT; + qp->max_ird = rsp->max_ord_ird & OCRDMA_CREATE_QP_RSP_MAX_IRD_MASK; + qp->max_ord = (rsp->max_ord_ird >> OCRDMA_CREATE_QP_RSP_MAX_ORD_SHIFT); + qp->dpp_enabled = false; + if (rsp->dpp_response & OCRDMA_CREATE_QP_RSP_DPP_ENABLED_MASK) { + qp->dpp_enabled = true; + *dpp_credit_lmt = (rsp->dpp_response & + OCRDMA_CREATE_QP_RSP_DPP_CREDITS_MASK) >> + OCRDMA_CREATE_QP_RSP_DPP_CREDITS_SHIFT; + *dpp_offset = (rsp->dpp_response & + OCRDMA_CREATE_QP_RSP_DPP_PAGE_OFFSET_MASK) >> + 
OCRDMA_CREATE_QP_RSP_DPP_PAGE_OFFSET_SHIFT; + } + max_wqe_allocated = + rsp->max_wqe_rqe >> OCRDMA_CREATE_QP_RSP_MAX_WQE_SHIFT; + max_wqe_allocated = 1 << max_wqe_allocated; + max_rqe_allocated = 1 << ((u16)rsp->max_wqe_rqe); + + qp->sq.max_cnt = max_wqe_allocated; + qp->sq.max_wqe_idx = max_wqe_allocated - 1; + + if (!attrs->srq) { + qp->rq.max_cnt = max_rqe_allocated; + qp->rq.max_wqe_idx = max_rqe_allocated - 1; + } +} + +int ocrdma_mbx_create_qp(struct ocrdma_qp *qp, struct ib_qp_init_attr *attrs, + u8 enable_dpp_cq, u16 dpp_cq_id, u16 *dpp_offset, + u16 *dpp_credit_lmt) +{ + int status = -ENOMEM; + u32 flags = 0; + struct ocrdma_pd *pd = qp->pd; + struct ocrdma_dev *dev = get_ocrdma_dev(pd->ibpd.device); + struct pci_dev *pdev = dev->nic_info.pdev; + struct ocrdma_cq *cq; + struct ocrdma_create_qp_req *cmd; + struct ocrdma_create_qp_rsp *rsp; + int qptype; + + switch (attrs->qp_type) { + case IB_QPT_GSI: + qptype = OCRDMA_QPT_GSI; + break; + case IB_QPT_RC: + qptype = OCRDMA_QPT_RC; + break; + case IB_QPT_UD: + qptype = OCRDMA_QPT_UD; + break; + default: + return -EINVAL; + } + + cmd = ocrdma_init_emb_mqe(OCRDMA_CMD_CREATE_QP, sizeof(*cmd)); + if (!cmd) + return status; + cmd->type_pgsz_pdn |= (qptype << OCRDMA_CREATE_QP_REQ_QPT_SHIFT) & + OCRDMA_CREATE_QP_REQ_QPT_MASK; + status = ocrdma_set_create_qp_sq_cmd(cmd, attrs, qp); + if (status) + goto sq_err; + + if (attrs->srq) { + struct ocrdma_srq *srq = get_ocrdma_srq(attrs->srq); + cmd->max_sge_recv_flags |= OCRDMA_CREATE_QP_REQ_USE_SRQ_MASK; + cmd->rq_addr[0].lo = srq->id; + qp->srq = srq; + } else { + status = ocrdma_set_create_qp_rq_cmd(cmd, attrs, qp); + if (status) + goto rq_err; + } + + status = ocrdma_set_create_qp_ird_cmd(cmd, qp); + if (status) + goto mbx_err; + + cmd->type_pgsz_pdn |= (pd->id << OCRDMA_CREATE_QP_REQ_PD_ID_SHIFT) & + OCRDMA_CREATE_QP_REQ_PD_ID_MASK; + + flags = ocrdma_set_create_qp_mbx_access_flags(qp); + + cmd->max_sge_recv_flags |= flags; + cmd->max_ord_ird |= (dev->attr.max_ord_per_qp << + OCRDMA_CREATE_QP_REQ_MAX_ORD_SHIFT) & + OCRDMA_CREATE_QP_REQ_MAX_ORD_MASK; + cmd->max_ord_ird |= (dev->attr.max_ird_per_qp << + OCRDMA_CREATE_QP_REQ_MAX_IRD_SHIFT) & + OCRDMA_CREATE_QP_REQ_MAX_IRD_MASK; + cq = get_ocrdma_cq(attrs->send_cq); + cmd->wq_rq_cqid |= (cq->id << OCRDMA_CREATE_QP_REQ_WQ_CQID_SHIFT) & + OCRDMA_CREATE_QP_REQ_WQ_CQID_MASK; + qp->sq_cq = cq; + cq = get_ocrdma_cq(attrs->recv_cq); + cmd->wq_rq_cqid |= (cq->id << OCRDMA_CREATE_QP_REQ_RQ_CQID_SHIFT) & + OCRDMA_CREATE_QP_REQ_RQ_CQID_MASK; + qp->rq_cq = cq; + + if (pd->dpp_enabled && attrs->cap.max_inline_data && pd->num_dpp_qp && + (attrs->cap.max_inline_data <= dev->attr.max_inline_data)) { + ocrdma_set_create_qp_dpp_cmd(cmd, pd, qp, enable_dpp_cq, + dpp_cq_id); + } + + status = ocrdma_mbx_cmd(dev, (struct ocrdma_mqe *)cmd); + if (status) + goto mbx_err; + rsp = (struct ocrdma_create_qp_rsp *)cmd; + ocrdma_get_create_qp_rsp(rsp, qp, attrs, dpp_offset, dpp_credit_lmt); + qp->state = OCRDMA_QPS_RST; + kfree(cmd); + return 0; +mbx_err: + if (qp->rq.va) + dma_free_coherent(&pdev->dev, qp->rq.len, qp->rq.va, qp->rq.pa); +rq_err: + pr_err("%s(%d) rq_err\n", __func__, dev->id); + dma_free_coherent(&pdev->dev, qp->sq.len, qp->sq.va, qp->sq.pa); +sq_err: + pr_err("%s(%d) sq_err\n", __func__, dev->id); + kfree(cmd); + return status; +} + +int ocrdma_mbx_query_qp(struct ocrdma_dev *dev, struct ocrdma_qp *qp, + struct ocrdma_qp_params *param) +{ + int status = -ENOMEM; + struct ocrdma_query_qp *cmd; + struct ocrdma_query_qp_rsp *rsp; + + cmd = 
ocrdma_init_emb_mqe(OCRDMA_CMD_QUERY_QP, sizeof(*rsp)); + if (!cmd) + return status; + cmd->qp_id = qp->id; + status = ocrdma_mbx_cmd(dev, (struct ocrdma_mqe *)cmd); + if (status) + goto mbx_err; + rsp = (struct ocrdma_query_qp_rsp *)cmd; + memcpy(param, &rsp->params, sizeof(struct ocrdma_qp_params)); +mbx_err: + kfree(cmd); + return status; +} + +static int ocrdma_set_av_params(struct ocrdma_qp *qp, + struct ocrdma_modify_qp *cmd, + struct ib_qp_attr *attrs, + int attr_mask) +{ + int status; + struct ib_ah_attr *ah_attr = &attrs->ah_attr; + union ib_gid sgid, zgid; + u32 vlan_id = 0xFFFF; + u8 mac_addr[6]; + struct ocrdma_dev *dev = get_ocrdma_dev(qp->ibqp.device); + + if ((ah_attr->ah_flags & IB_AH_GRH) == 0) + return -EINVAL; + if (atomic_cmpxchg(&dev->update_sl, 1, 0)) + ocrdma_init_service_level(dev); + cmd->params.tclass_sq_psn |= + (ah_attr->grh.traffic_class << OCRDMA_QP_PARAMS_TCLASS_SHIFT); + cmd->params.rnt_rc_sl_fl |= + (ah_attr->grh.flow_label & OCRDMA_QP_PARAMS_FLOW_LABEL_MASK); + cmd->params.rnt_rc_sl_fl |= (ah_attr->sl << OCRDMA_QP_PARAMS_SL_SHIFT); + cmd->params.hop_lmt_rq_psn |= + (ah_attr->grh.hop_limit << OCRDMA_QP_PARAMS_HOP_LMT_SHIFT); + cmd->flags |= OCRDMA_QP_PARA_FLOW_LBL_VALID; + memcpy(&cmd->params.dgid[0], &ah_attr->grh.dgid.raw[0], + sizeof(cmd->params.dgid)); + status = ocrdma_query_gid(&dev->ibdev, 1, + ah_attr->grh.sgid_index, &sgid); + if (status) + return status; + + memset(&zgid, 0, sizeof(zgid)); + if (!memcmp(&sgid, &zgid, sizeof(zgid))) + return -EINVAL; + + qp->sgid_idx = ah_attr->grh.sgid_index; + memcpy(&cmd->params.sgid[0], &sgid.raw[0], sizeof(cmd->params.sgid)); + status = ocrdma_resolve_dmac(dev, ah_attr, &mac_addr[0]); + if (status) + return status; + cmd->params.dmac_b0_to_b3 = mac_addr[0] | (mac_addr[1] << 8) | + (mac_addr[2] << 16) | (mac_addr[3] << 24); + /* convert them to LE format. 
*/ + ocrdma_cpu_to_le32(&cmd->params.dgid[0], sizeof(cmd->params.dgid)); + ocrdma_cpu_to_le32(&cmd->params.sgid[0], sizeof(cmd->params.sgid)); + cmd->params.vlan_dmac_b4_to_b5 = mac_addr[4] | (mac_addr[5] << 8); + if (attr_mask & IB_QP_VID) { + vlan_id = attrs->vlan_id; + } else if (dev->pfc_state) { + vlan_id = 0; + pr_err("ocrdma%d:Using VLAN with PFC is recommended\n", + dev->id); + pr_err("ocrdma%d:Using VLAN 0 for this connection\n", + dev->id); + } + + if (vlan_id < 0x1000) { + cmd->params.vlan_dmac_b4_to_b5 |= + vlan_id << OCRDMA_QP_PARAMS_VLAN_SHIFT; + cmd->flags |= OCRDMA_QP_PARA_VLAN_EN_VALID; + cmd->params.rnt_rc_sl_fl |= + (dev->sl & 0x07) << OCRDMA_QP_PARAMS_SL_SHIFT; + } + + return 0; +} + +static int ocrdma_set_qp_params(struct ocrdma_qp *qp, + struct ocrdma_modify_qp *cmd, + struct ib_qp_attr *attrs, int attr_mask) +{ + int status = 0; + struct ocrdma_dev *dev = get_ocrdma_dev(qp->ibqp.device); + + if (attr_mask & IB_QP_PKEY_INDEX) { + cmd->params.path_mtu_pkey_indx |= (attrs->pkey_index & + OCRDMA_QP_PARAMS_PKEY_INDEX_MASK); + cmd->flags |= OCRDMA_QP_PARA_PKEY_VALID; + } + if (attr_mask & IB_QP_QKEY) { + qp->qkey = attrs->qkey; + cmd->params.qkey = attrs->qkey; + cmd->flags |= OCRDMA_QP_PARA_QKEY_VALID; + } + if (attr_mask & IB_QP_AV) { + status = ocrdma_set_av_params(qp, cmd, attrs, attr_mask); + if (status) + return status; + } else if (qp->qp_type == IB_QPT_GSI || qp->qp_type == IB_QPT_UD) { + /* set the default mac address for UD, GSI QPs */ + cmd->params.dmac_b0_to_b3 = dev->nic_info.mac_addr[0] | + (dev->nic_info.mac_addr[1] << 8) | + (dev->nic_info.mac_addr[2] << 16) | + (dev->nic_info.mac_addr[3] << 24); + cmd->params.vlan_dmac_b4_to_b5 = dev->nic_info.mac_addr[4] | + (dev->nic_info.mac_addr[5] << 8); + } + if ((attr_mask & IB_QP_EN_SQD_ASYNC_NOTIFY) && + attrs->en_sqd_async_notify) { + cmd->params.max_sge_recv_flags |= + OCRDMA_QP_PARAMS_FLAGS_SQD_ASYNC; + cmd->flags |= OCRDMA_QP_PARA_DST_QPN_VALID; + } + if (attr_mask & IB_QP_DEST_QPN) { + cmd->params.ack_to_rnr_rtc_dest_qpn |= (attrs->dest_qp_num & + OCRDMA_QP_PARAMS_DEST_QPN_MASK); + cmd->flags |= OCRDMA_QP_PARA_DST_QPN_VALID; + } + if (attr_mask & IB_QP_PATH_MTU) { + if (attrs->path_mtu < IB_MTU_512 || + attrs->path_mtu > IB_MTU_4096) { + pr_err("ocrdma%d: IB MTU %d is not supported\n", + dev->id, ib_mtu_enum_to_int(attrs->path_mtu)); + status = -EINVAL; + goto pmtu_err; + } + cmd->params.path_mtu_pkey_indx |= + (ib_mtu_enum_to_int(attrs->path_mtu) << + OCRDMA_QP_PARAMS_PATH_MTU_SHIFT) & + OCRDMA_QP_PARAMS_PATH_MTU_MASK; + cmd->flags |= OCRDMA_QP_PARA_PMTU_VALID; + } + if (attr_mask & IB_QP_TIMEOUT) { + cmd->params.ack_to_rnr_rtc_dest_qpn |= attrs->timeout << + OCRDMA_QP_PARAMS_ACK_TIMEOUT_SHIFT; + cmd->flags |= OCRDMA_QP_PARA_ACK_TO_VALID; + } + if (attr_mask & IB_QP_RETRY_CNT) { + cmd->params.rnt_rc_sl_fl |= (attrs->retry_cnt << + OCRDMA_QP_PARAMS_RETRY_CNT_SHIFT) & + OCRDMA_QP_PARAMS_RETRY_CNT_MASK; + cmd->flags |= OCRDMA_QP_PARA_RETRY_CNT_VALID; + } + if (attr_mask & IB_QP_MIN_RNR_TIMER) { + cmd->params.rnt_rc_sl_fl |= (attrs->min_rnr_timer << + OCRDMA_QP_PARAMS_RNR_NAK_TIMER_SHIFT) & + OCRDMA_QP_PARAMS_RNR_NAK_TIMER_MASK; + cmd->flags |= OCRDMA_QP_PARA_RNT_VALID; + } + if (attr_mask & IB_QP_RNR_RETRY) { + cmd->params.ack_to_rnr_rtc_dest_qpn |= (attrs->rnr_retry << + OCRDMA_QP_PARAMS_RNR_RETRY_CNT_SHIFT) + & OCRDMA_QP_PARAMS_RNR_RETRY_CNT_MASK; + cmd->flags |= OCRDMA_QP_PARA_RRC_VALID; + } + if (attr_mask & IB_QP_SQ_PSN) { + cmd->params.tclass_sq_psn |= (attrs->sq_psn & 0x00ffffff); + cmd->flags |= 
OCRDMA_QP_PARA_SQPSN_VALID; + } + if (attr_mask & IB_QP_RQ_PSN) { + cmd->params.hop_lmt_rq_psn |= (attrs->rq_psn & 0x00ffffff); + cmd->flags |= OCRDMA_QP_PARA_RQPSN_VALID; + } + if (attr_mask & IB_QP_MAX_QP_RD_ATOMIC) { + if (attrs->max_rd_atomic > dev->attr.max_ord_per_qp) { + status = -EINVAL; + goto pmtu_err; + } + qp->max_ord = attrs->max_rd_atomic; + cmd->flags |= OCRDMA_QP_PARA_MAX_ORD_VALID; + } + if (attr_mask & IB_QP_MAX_DEST_RD_ATOMIC) { + if (attrs->max_dest_rd_atomic > dev->attr.max_ird_per_qp) { + status = -EINVAL; + goto pmtu_err; + } + qp->max_ird = attrs->max_dest_rd_atomic; + cmd->flags |= OCRDMA_QP_PARA_MAX_IRD_VALID; + } + cmd->params.max_ord_ird = (qp->max_ord << + OCRDMA_QP_PARAMS_MAX_ORD_SHIFT) | + (qp->max_ird & OCRDMA_QP_PARAMS_MAX_IRD_MASK); +pmtu_err: + return status; +} + +int ocrdma_mbx_modify_qp(struct ocrdma_dev *dev, struct ocrdma_qp *qp, + struct ib_qp_attr *attrs, int attr_mask) +{ + int status = -ENOMEM; + struct ocrdma_modify_qp *cmd; + + cmd = ocrdma_init_emb_mqe(OCRDMA_CMD_MODIFY_QP, sizeof(*cmd)); + if (!cmd) + return status; + + cmd->params.id = qp->id; + cmd->flags = 0; + if (attr_mask & IB_QP_STATE) { + cmd->params.max_sge_recv_flags |= + (get_ocrdma_qp_state(attrs->qp_state) << + OCRDMA_QP_PARAMS_STATE_SHIFT) & + OCRDMA_QP_PARAMS_STATE_MASK; + cmd->flags |= OCRDMA_QP_PARA_QPS_VALID; + } else { + cmd->params.max_sge_recv_flags |= + (qp->state << OCRDMA_QP_PARAMS_STATE_SHIFT) & + OCRDMA_QP_PARAMS_STATE_MASK; + } + + status = ocrdma_set_qp_params(qp, cmd, attrs, attr_mask); + if (status) + goto mbx_err; + status = ocrdma_mbx_cmd(dev, (struct ocrdma_mqe *)cmd); + if (status) + goto mbx_err; + +mbx_err: + kfree(cmd); + return status; +} + +int ocrdma_mbx_destroy_qp(struct ocrdma_dev *dev, struct ocrdma_qp *qp) +{ + int status = -ENOMEM; + struct ocrdma_destroy_qp *cmd; + struct pci_dev *pdev = dev->nic_info.pdev; + + cmd = ocrdma_init_emb_mqe(OCRDMA_CMD_DELETE_QP, sizeof(*cmd)); + if (!cmd) + return status; + cmd->qp_id = qp->id; + status = ocrdma_mbx_cmd(dev, (struct ocrdma_mqe *)cmd); + if (status) + goto mbx_err; + +mbx_err: + kfree(cmd); + if (qp->sq.va) + dma_free_coherent(&pdev->dev, qp->sq.len, qp->sq.va, qp->sq.pa); + if (!qp->srq && qp->rq.va) + dma_free_coherent(&pdev->dev, qp->rq.len, qp->rq.va, qp->rq.pa); + if (qp->dpp_enabled) + qp->pd->num_dpp_qp++; + return status; +} + +int ocrdma_mbx_create_srq(struct ocrdma_dev *dev, struct ocrdma_srq *srq, + struct ib_srq_init_attr *srq_attr, + struct ocrdma_pd *pd) +{ + int status = -ENOMEM; + int hw_pages, hw_page_size; + int len; + struct ocrdma_create_srq_rsp *rsp; + struct ocrdma_create_srq *cmd; + dma_addr_t pa; + struct pci_dev *pdev = dev->nic_info.pdev; + u32 max_rqe_allocated; + + cmd = ocrdma_init_emb_mqe(OCRDMA_CMD_CREATE_SRQ, sizeof(*cmd)); + if (!cmd) + return status; + + cmd->pgsz_pdid = pd->id & OCRDMA_CREATE_SRQ_PD_ID_MASK; + max_rqe_allocated = srq_attr->attr.max_wr + 1; + status = ocrdma_build_q_conf(&max_rqe_allocated, + dev->attr.rqe_size, + &hw_pages, &hw_page_size); + if (status) { + pr_err("%s() req. 
max_wr=0x%x\n", __func__, + srq_attr->attr.max_wr); + status = -EINVAL; + goto ret; + } + len = hw_pages * hw_page_size; + srq->rq.va = dma_alloc_coherent(&pdev->dev, len, &pa, GFP_KERNEL); + if (!srq->rq.va) { + status = -ENOMEM; + goto ret; + } + ocrdma_build_q_pages(&cmd->rq_addr[0], hw_pages, pa, hw_page_size); + + srq->rq.entry_size = dev->attr.rqe_size; + srq->rq.pa = pa; + srq->rq.len = len; + srq->rq.max_cnt = max_rqe_allocated; + + cmd->max_sge_rqe = ilog2(max_rqe_allocated); + cmd->max_sge_rqe |= srq_attr->attr.max_sge << + OCRDMA_CREATE_SRQ_MAX_SGE_RECV_SHIFT; + + cmd->pgsz_pdid |= (ilog2(hw_page_size / OCRDMA_MIN_Q_PAGE_SIZE) + << OCRDMA_CREATE_SRQ_PG_SZ_SHIFT); + cmd->pages_rqe_sz |= (dev->attr.rqe_size + << OCRDMA_CREATE_SRQ_RQE_SIZE_SHIFT) + & OCRDMA_CREATE_SRQ_RQE_SIZE_MASK; + cmd->pages_rqe_sz |= hw_pages << OCRDMA_CREATE_SRQ_NUM_RQ_PAGES_SHIFT; + + status = ocrdma_mbx_cmd(dev, (struct ocrdma_mqe *)cmd); + if (status) + goto mbx_err; + rsp = (struct ocrdma_create_srq_rsp *)cmd; + srq->id = rsp->id; + srq->rq.dbid = rsp->id; + max_rqe_allocated = ((rsp->max_sge_rqe_allocated & + OCRDMA_CREATE_SRQ_RSP_MAX_RQE_ALLOCATED_MASK) >> + OCRDMA_CREATE_SRQ_RSP_MAX_RQE_ALLOCATED_SHIFT); + max_rqe_allocated = (1 << max_rqe_allocated); + srq->rq.max_cnt = max_rqe_allocated; + srq->rq.max_wqe_idx = max_rqe_allocated - 1; + srq->rq.max_sges = (rsp->max_sge_rqe_allocated & + OCRDMA_CREATE_SRQ_RSP_MAX_SGE_RECV_ALLOCATED_MASK) >> + OCRDMA_CREATE_SRQ_RSP_MAX_SGE_RECV_ALLOCATED_SHIFT; + goto ret; +mbx_err: + dma_free_coherent(&pdev->dev, srq->rq.len, srq->rq.va, pa); +ret: + kfree(cmd); + return status; +} + +int ocrdma_mbx_modify_srq(struct ocrdma_srq *srq, struct ib_srq_attr *srq_attr) +{ + int status = -ENOMEM; + struct ocrdma_modify_srq *cmd; + struct ocrdma_pd *pd = srq->pd; + struct ocrdma_dev *dev = get_ocrdma_dev(pd->ibpd.device); + + cmd = ocrdma_init_emb_mqe(OCRDMA_CMD_MODIFY_SRQ, sizeof(*cmd)); + if (!cmd) + return status; + cmd->id = srq->id; + cmd->limit_max_rqe |= srq_attr->srq_limit << + OCRDMA_MODIFY_SRQ_LIMIT_SHIFT; + status = ocrdma_mbx_cmd(dev, (struct ocrdma_mqe *)cmd); + kfree(cmd); + return status; +} + +int ocrdma_mbx_query_srq(struct ocrdma_srq *srq, struct ib_srq_attr *srq_attr) +{ + int status = -ENOMEM; + struct ocrdma_query_srq *cmd; + struct ocrdma_dev *dev = get_ocrdma_dev(srq->ibsrq.device); + + cmd = ocrdma_init_emb_mqe(OCRDMA_CMD_QUERY_SRQ, sizeof(*cmd)); + if (!cmd) + return status; + cmd->id = srq->rq.dbid; + status = ocrdma_mbx_cmd(dev, (struct ocrdma_mqe *)cmd); + if (status == 0) { + struct ocrdma_query_srq_rsp *rsp = + (struct ocrdma_query_srq_rsp *)cmd; + srq_attr->max_sge = + rsp->srq_lmt_max_sge & + OCRDMA_QUERY_SRQ_RSP_MAX_SGE_RECV_MASK; + srq_attr->max_wr = + rsp->max_rqe_pdid >> OCRDMA_QUERY_SRQ_RSP_MAX_RQE_SHIFT; + srq_attr->srq_limit = rsp->srq_lmt_max_sge >> + OCRDMA_QUERY_SRQ_RSP_SRQ_LIMIT_SHIFT; + } + kfree(cmd); + return status; +} + +int ocrdma_mbx_destroy_srq(struct ocrdma_dev *dev, struct ocrdma_srq *srq) +{ + int status = -ENOMEM; + struct ocrdma_destroy_srq *cmd; + struct pci_dev *pdev = dev->nic_info.pdev; + cmd = ocrdma_init_emb_mqe(OCRDMA_CMD_DELETE_SRQ, sizeof(*cmd)); + if (!cmd) + return status; + cmd->id = srq->id; + status = ocrdma_mbx_cmd(dev, (struct ocrdma_mqe *)cmd); + if (srq->rq.va) + dma_free_coherent(&pdev->dev, srq->rq.len, + srq->rq.va, srq->rq.pa); + kfree(cmd); + return status; +} + +static int ocrdma_mbx_get_dcbx_config(struct ocrdma_dev *dev, u32 ptype, + struct ocrdma_dcbx_cfg *dcbxcfg) +{ + int status = 0; + 
dma_addr_t pa; + struct ocrdma_mqe cmd; + + struct ocrdma_get_dcbx_cfg_req *req = NULL; + struct ocrdma_get_dcbx_cfg_rsp *rsp = NULL; + struct pci_dev *pdev = dev->nic_info.pdev; + struct ocrdma_mqe_sge *mqe_sge = cmd.u.nonemb_req.sge; + + memset(&cmd, 0, sizeof(struct ocrdma_mqe)); + cmd.hdr.pyld_len = max_t (u32, sizeof(struct ocrdma_get_dcbx_cfg_rsp), + sizeof(struct ocrdma_get_dcbx_cfg_req)); + req = dma_alloc_coherent(&pdev->dev, cmd.hdr.pyld_len, &pa, GFP_KERNEL); + if (!req) { + status = -ENOMEM; + goto mem_err; + } + + cmd.hdr.spcl_sge_cnt_emb |= (1 << OCRDMA_MQE_HDR_SGE_CNT_SHIFT) & + OCRDMA_MQE_HDR_SGE_CNT_MASK; + mqe_sge->pa_lo = (u32) (pa & 0xFFFFFFFFUL); + mqe_sge->pa_hi = (u32) upper_32_bits(pa); + mqe_sge->len = cmd.hdr.pyld_len; + + memset(req, 0, sizeof(struct ocrdma_get_dcbx_cfg_req)); + ocrdma_init_mch(&req->hdr, OCRDMA_CMD_GET_DCBX_CONFIG, + OCRDMA_SUBSYS_DCBX, cmd.hdr.pyld_len); + req->param_type = ptype; + + status = ocrdma_mbx_cmd(dev, &cmd); + if (status) + goto mbx_err; + + rsp = (struct ocrdma_get_dcbx_cfg_rsp *)req; + ocrdma_le32_to_cpu(rsp, sizeof(struct ocrdma_get_dcbx_cfg_rsp)); + memcpy(dcbxcfg, &rsp->cfg, sizeof(struct ocrdma_dcbx_cfg)); + +mbx_err: + dma_free_coherent(&pdev->dev, cmd.hdr.pyld_len, req, pa); +mem_err: + return status; +} + +#define OCRDMA_MAX_SERVICE_LEVEL_INDEX 0x08 +#define OCRDMA_DEFAULT_SERVICE_LEVEL 0x05 + +static int ocrdma_parse_dcbxcfg_rsp(struct ocrdma_dev *dev, int ptype, + struct ocrdma_dcbx_cfg *dcbxcfg, + u8 *srvc_lvl) +{ + int status = -EINVAL, indx, slindx; + int ventry_cnt; + struct ocrdma_app_parameter *app_param; + u8 valid, proto_sel; + u8 app_prio, pfc_prio; + u16 proto; + + if (!(dcbxcfg->tcv_aev_opv_st & OCRDMA_DCBX_STATE_MASK)) { + pr_info("%s ocrdma%d DCBX is disabled\n", + dev_name(&dev->nic_info.pdev->dev), dev->id); + goto out; + } + + if (!ocrdma_is_enabled_and_synced(dcbxcfg->pfc_state)) { + pr_info("%s ocrdma%d priority flow control(%s) is %s%s\n", + dev_name(&dev->nic_info.pdev->dev), dev->id, + (ptype > 0 ? "operational" : "admin"), + (dcbxcfg->pfc_state & OCRDMA_STATE_FLAG_ENABLED) ? + "enabled" : "disabled", + (dcbxcfg->pfc_state & OCRDMA_STATE_FLAG_SYNC) ? 
+ "" : ", not sync'ed"); + goto out; + } else { + pr_info("%s ocrdma%d priority flow control is enabled and sync'ed\n", + dev_name(&dev->nic_info.pdev->dev), dev->id); + } + + ventry_cnt = (dcbxcfg->tcv_aev_opv_st >> + OCRDMA_DCBX_APP_ENTRY_SHIFT) + & OCRDMA_DCBX_STATE_MASK; + + for (indx = 0; indx < ventry_cnt; indx++) { + app_param = &dcbxcfg->app_param[indx]; + valid = (app_param->valid_proto_app >> + OCRDMA_APP_PARAM_VALID_SHIFT) + & OCRDMA_APP_PARAM_VALID_MASK; + proto_sel = (app_param->valid_proto_app + >> OCRDMA_APP_PARAM_PROTO_SEL_SHIFT) + & OCRDMA_APP_PARAM_PROTO_SEL_MASK; + proto = app_param->valid_proto_app & + OCRDMA_APP_PARAM_APP_PROTO_MASK; + + if ( + valid && proto == OCRDMA_APP_PROTO_ROCE && + proto_sel == OCRDMA_PROTO_SELECT_L2) { + for (slindx = 0; slindx < + OCRDMA_MAX_SERVICE_LEVEL_INDEX; slindx++) { + app_prio = ocrdma_get_app_prio( + (u8 *)app_param->app_prio, + slindx); + pfc_prio = ocrdma_get_pfc_prio( + (u8 *)dcbxcfg->pfc_prio, + slindx); + + if (app_prio && pfc_prio) { + *srvc_lvl = slindx; + status = 0; + goto out; + } + } + if (slindx == OCRDMA_MAX_SERVICE_LEVEL_INDEX) { + pr_info("%s ocrdma%d application priority not set for 0x%x protocol\n", + dev_name(&dev->nic_info.pdev->dev), + dev->id, proto); + } + } + } + +out: + return status; +} + +void ocrdma_init_service_level(struct ocrdma_dev *dev) +{ + int status = 0, indx; + struct ocrdma_dcbx_cfg dcbxcfg; + u8 srvc_lvl = OCRDMA_DEFAULT_SERVICE_LEVEL; + int ptype = OCRDMA_PARAMETER_TYPE_OPER; + + for (indx = 0; indx < 2; indx++) { + status = ocrdma_mbx_get_dcbx_config(dev, ptype, &dcbxcfg); + if (status) { + pr_err("%s(): status=%d\n", __func__, status); + ptype = OCRDMA_PARAMETER_TYPE_ADMIN; + continue; + } + + status = ocrdma_parse_dcbxcfg_rsp(dev, ptype, + &dcbxcfg, &srvc_lvl); + if (status) { + ptype = OCRDMA_PARAMETER_TYPE_ADMIN; + continue; + } + + break; + } + + if (status) + pr_info("%s ocrdma%d service level default\n", + dev_name(&dev->nic_info.pdev->dev), dev->id); + else + pr_info("%s ocrdma%d service level %d\n", + dev_name(&dev->nic_info.pdev->dev), dev->id, + srvc_lvl); + + dev->pfc_state = ocrdma_is_enabled_and_synced(dcbxcfg.pfc_state); + dev->sl = srvc_lvl; +} + +int ocrdma_alloc_av(struct ocrdma_dev *dev, struct ocrdma_ah *ah) +{ + int i; + int status = -EINVAL; + struct ocrdma_av *av; + unsigned long flags; + + av = dev->av_tbl.va; + spin_lock_irqsave(&dev->av_tbl.lock, flags); + for (i = 0; i < dev->av_tbl.num_ah; i++) { + if (av->valid == 0) { + av->valid = OCRDMA_AV_VALID; + ah->av = av; + ah->id = i; + status = 0; + break; + } + av++; + } + if (i == dev->av_tbl.num_ah) + status = -EAGAIN; + spin_unlock_irqrestore(&dev->av_tbl.lock, flags); + return status; +} + +int ocrdma_free_av(struct ocrdma_dev *dev, struct ocrdma_ah *ah) +{ + unsigned long flags; + spin_lock_irqsave(&dev->av_tbl.lock, flags); + ah->av->valid = 0; + spin_unlock_irqrestore(&dev->av_tbl.lock, flags); + return 0; +} + +static int ocrdma_create_eqs(struct ocrdma_dev *dev) +{ + int num_eq, i, status = 0; + int irq; + unsigned long flags = 0; + + num_eq = dev->nic_info.msix.num_vectors - + dev->nic_info.msix.start_vector; + if (dev->nic_info.intr_mode == BE_INTERRUPT_MODE_INTX) { + num_eq = 1; + flags = IRQF_SHARED; + } else { + num_eq = min_t(u32, num_eq, num_online_cpus()); + } + + if (!num_eq) + return -EINVAL; + + dev->eq_tbl = kzalloc(sizeof(struct ocrdma_eq) * num_eq, GFP_KERNEL); + if (!dev->eq_tbl) + return -ENOMEM; + + for (i = 0; i < num_eq; i++) { + status = ocrdma_create_eq(dev, &dev->eq_tbl[i], + OCRDMA_EQ_LEN); 
+ if (status) { + status = -EINVAL; + break; + } + sprintf(dev->eq_tbl[i].irq_name, "ocrdma%d-%d", + dev->id, i); + irq = ocrdma_get_irq(dev, &dev->eq_tbl[i]); + status = request_irq(irq, ocrdma_irq_handler, flags, + dev->eq_tbl[i].irq_name, + &dev->eq_tbl[i]); + if (status) + goto done; + dev->eq_cnt += 1; + } + /* one eq is sufficient for data path to work */ + return 0; +done: + ocrdma_destroy_eqs(dev); + return status; +} + +static int ocrdma_mbx_modify_eqd(struct ocrdma_dev *dev, struct ocrdma_eq *eq, + int num) +{ + int i, status = -ENOMEM; + struct ocrdma_modify_eqd_req *cmd; + + cmd = ocrdma_init_emb_mqe(OCRDMA_CMD_MODIFY_EQ_DELAY, sizeof(*cmd)); + if (!cmd) + return status; + + ocrdma_init_mch(&cmd->cmd.req, OCRDMA_CMD_MODIFY_EQ_DELAY, + OCRDMA_SUBSYS_COMMON, sizeof(*cmd)); + + cmd->cmd.num_eq = num; + for (i = 0; i < num; i++) { + cmd->cmd.set_eqd[i].eq_id = eq[i].q.id; + cmd->cmd.set_eqd[i].phase = 0; + cmd->cmd.set_eqd[i].delay_multiplier = + (eq[i].aic_obj.prev_eqd * 65)/100; + } + status = ocrdma_mbx_cmd(dev, (struct ocrdma_mqe *)cmd); + if (status) + goto mbx_err; +mbx_err: + kfree(cmd); + return status; +} + +static int ocrdma_modify_eqd(struct ocrdma_dev *dev, struct ocrdma_eq *eq, + int num) +{ + int num_eqs, i = 0; + if (num > 8) { + while (num) { + num_eqs = min(num, 8); + ocrdma_mbx_modify_eqd(dev, &eq[i], num_eqs); + i += num_eqs; + num -= num_eqs; + } + } else { + ocrdma_mbx_modify_eqd(dev, eq, num); + } + return 0; +} + +void ocrdma_eqd_set_task(struct work_struct *work) +{ + struct ocrdma_dev *dev = + container_of(work, struct ocrdma_dev, eqd_work.work); + struct ocrdma_eq *eq = 0; + int i, num = 0, status = -EINVAL; + u64 eq_intr; + + for (i = 0; i < dev->eq_cnt; i++) { + eq = &dev->eq_tbl[i]; + if (eq->aic_obj.eq_intr_cnt > eq->aic_obj.prev_eq_intr_cnt) { + eq_intr = eq->aic_obj.eq_intr_cnt - + eq->aic_obj.prev_eq_intr_cnt; + if ((eq_intr > EQ_INTR_PER_SEC_THRSH_HI) && + (eq->aic_obj.prev_eqd == EQ_AIC_MIN_EQD)) { + eq->aic_obj.prev_eqd = EQ_AIC_MAX_EQD; + num++; + } else if ((eq_intr < EQ_INTR_PER_SEC_THRSH_LOW) && + (eq->aic_obj.prev_eqd == EQ_AIC_MAX_EQD)) { + eq->aic_obj.prev_eqd = EQ_AIC_MIN_EQD; + num++; + } + } + eq->aic_obj.prev_eq_intr_cnt = eq->aic_obj.eq_intr_cnt; + } + + if (num) + status = ocrdma_modify_eqd(dev, &dev->eq_tbl[0], num); + schedule_delayed_work(&dev->eqd_work, msecs_to_jiffies(1000)); +} + +int ocrdma_init_hw(struct ocrdma_dev *dev) +{ + int status; + + /* create the eqs */ + status = ocrdma_create_eqs(dev); + if (status) + goto qpeq_err; + status = ocrdma_create_mq(dev); + if (status) + goto mq_err; + status = ocrdma_mbx_query_fw_config(dev); + if (status) + goto conf_err; + status = ocrdma_mbx_query_dev(dev); + if (status) + goto conf_err; + status = ocrdma_mbx_query_fw_ver(dev); + if (status) + goto conf_err; + status = ocrdma_mbx_create_ah_tbl(dev); + if (status) + goto conf_err; + status = ocrdma_mbx_get_phy_info(dev); + if (status) + goto info_attrb_err; + status = ocrdma_mbx_get_ctrl_attribs(dev); + if (status) + goto info_attrb_err; + + return 0; + +info_attrb_err: + ocrdma_mbx_delete_ah_tbl(dev); +conf_err: + ocrdma_destroy_mq(dev); +mq_err: + ocrdma_destroy_eqs(dev); +qpeq_err: + pr_err("%s() status=%d\n", __func__, status); + return status; +} + +void ocrdma_cleanup_hw(struct ocrdma_dev *dev) +{ + ocrdma_free_pd_pool(dev); + ocrdma_mbx_delete_ah_tbl(dev); + + /* cleanup the control path */ + ocrdma_destroy_mq(dev); + + /* cleanup the eqs */ + ocrdma_destroy_eqs(dev); +} diff --git 
a/kernel/drivers/infiniband/hw/ocrdma/ocrdma_hw.h b/kernel/drivers/infiniband/hw/ocrdma/ocrdma_hw.h new file mode 100644 index 000000000..e905972fc --- /dev/null +++ b/kernel/drivers/infiniband/hw/ocrdma/ocrdma_hw.h @@ -0,0 +1,142 @@ +/******************************************************************* + * This file is part of the Emulex RoCE Device Driver for * + * RoCE (RDMA over Converged Ethernet) CNA Adapters. * + * Copyright (C) 2008-2012 Emulex. All rights reserved. * + * EMULEX and SLI are trademarks of Emulex. * + * www.emulex.com * + * * + * This program is free software; you can redistribute it and/or * + * modify it under the terms of version 2 of the GNU General * + * Public License as published by the Free Software Foundation. * + * This program is distributed in the hope that it will be useful. * + * ALL EXPRESS OR IMPLIED CONDITIONS, REPRESENTATIONS AND * + * WARRANTIES, INCLUDING ANY IMPLIED WARRANTY OF MERCHANTABILITY, * + * FITNESS FOR A PARTICULAR PURPOSE, OR NON-INFRINGEMENT, ARE * + * DISCLAIMED, EXCEPT TO THE EXTENT THAT SUCH DISCLAIMERS ARE HELD * + * TO BE LEGALLY INVALID. See the GNU General Public License for * + * more details, a copy of which can be found in the file COPYING * + * included with this package. * + * + * Contact Information: + * linux-drivers@emulex.com + * + * Emulex + * 3333 Susan Street + * Costa Mesa, CA 92626 + *******************************************************************/ + +#ifndef __OCRDMA_HW_H__ +#define __OCRDMA_HW_H__ + +#include "ocrdma_sli.h" + +static inline void ocrdma_cpu_to_le32(void *dst, u32 len) +{ +#ifdef __BIG_ENDIAN + int i = 0; + u32 *src_ptr = dst; + u32 *dst_ptr = dst; + for (; i < (len / 4); i++) + *(dst_ptr + i) = cpu_to_le32p(src_ptr + i); +#endif +} + +static inline void ocrdma_le32_to_cpu(void *dst, u32 len) +{ +#ifdef __BIG_ENDIAN + int i = 0; + u32 *src_ptr = dst; + u32 *dst_ptr = dst; + for (; i < (len / sizeof(u32)); i++) + *(dst_ptr + i) = le32_to_cpu(*(src_ptr + i)); +#endif +} + +static inline void ocrdma_copy_cpu_to_le32(void *dst, void *src, u32 len) +{ +#ifdef __BIG_ENDIAN + int i = 0; + u32 *src_ptr = src; + u32 *dst_ptr = dst; + for (; i < (len / sizeof(u32)); i++) + *(dst_ptr + i) = cpu_to_le32p(src_ptr + i); +#else + memcpy(dst, src, len); +#endif +} + +static inline void ocrdma_copy_le32_to_cpu(void *dst, void *src, u32 len) +{ +#ifdef __BIG_ENDIAN + int i = 0; + u32 *src_ptr = src; + u32 *dst_ptr = dst; + for (; i < len / sizeof(u32); i++) + *(dst_ptr + i) = le32_to_cpu(*(src_ptr + i)); +#else + memcpy(dst, src, len); +#endif +} + +static inline u64 ocrdma_get_db_addr(struct ocrdma_dev *dev, u32 pdid) +{ + return dev->nic_info.unmapped_db + (pdid * dev->nic_info.db_page_size); +} + +int ocrdma_init_hw(struct ocrdma_dev *); +void ocrdma_cleanup_hw(struct ocrdma_dev *); + +enum ib_qp_state get_ibqp_state(enum ocrdma_qp_state qps); +void ocrdma_ring_cq_db(struct ocrdma_dev *, u16 cq_id, bool armed, + bool solicited, u16 cqe_popped); + +/* verbs specific mailbox commands */ +int ocrdma_mbx_get_link_speed(struct ocrdma_dev *dev, u8 *lnk_speed); +int ocrdma_query_config(struct ocrdma_dev *, + struct ocrdma_mbx_query_config *config); + +int ocrdma_mbx_alloc_pd(struct ocrdma_dev *, struct ocrdma_pd *); +int ocrdma_mbx_dealloc_pd(struct ocrdma_dev *, struct ocrdma_pd *); + +int ocrdma_mbx_alloc_lkey(struct ocrdma_dev *, struct ocrdma_hw_mr *hwmr, + u32 pd_id, int addr_check); +int ocrdma_mbx_dealloc_lkey(struct ocrdma_dev *, int fmr, u32 lkey); + +int ocrdma_reg_mr(struct ocrdma_dev *, struct 
ocrdma_hw_mr *hwmr, + u32 pd_id, int acc); +int ocrdma_mbx_create_cq(struct ocrdma_dev *, struct ocrdma_cq *, + int entries, int dpp_cq, u16 pd_id); +int ocrdma_mbx_destroy_cq(struct ocrdma_dev *, struct ocrdma_cq *); + +int ocrdma_mbx_create_qp(struct ocrdma_qp *, struct ib_qp_init_attr *attrs, + u8 enable_dpp_cq, u16 dpp_cq_id, u16 *dpp_offset, + u16 *dpp_credit_lmt); +int ocrdma_mbx_modify_qp(struct ocrdma_dev *, struct ocrdma_qp *, + struct ib_qp_attr *attrs, int attr_mask); +int ocrdma_mbx_query_qp(struct ocrdma_dev *, struct ocrdma_qp *, + struct ocrdma_qp_params *param); +int ocrdma_mbx_destroy_qp(struct ocrdma_dev *, struct ocrdma_qp *); +int ocrdma_mbx_create_srq(struct ocrdma_dev *, struct ocrdma_srq *, + struct ib_srq_init_attr *, + struct ocrdma_pd *); +int ocrdma_mbx_modify_srq(struct ocrdma_srq *, struct ib_srq_attr *); +int ocrdma_mbx_query_srq(struct ocrdma_srq *, struct ib_srq_attr *); +int ocrdma_mbx_destroy_srq(struct ocrdma_dev *, struct ocrdma_srq *); + +int ocrdma_alloc_av(struct ocrdma_dev *, struct ocrdma_ah *); +int ocrdma_free_av(struct ocrdma_dev *, struct ocrdma_ah *); + +int ocrdma_qp_state_change(struct ocrdma_qp *, enum ib_qp_state new_state, + enum ib_qp_state *old_ib_state); +bool ocrdma_is_qp_in_sq_flushlist(struct ocrdma_cq *, struct ocrdma_qp *); +bool ocrdma_is_qp_in_rq_flushlist(struct ocrdma_cq *, struct ocrdma_qp *); +void ocrdma_flush_qp(struct ocrdma_qp *); +int ocrdma_get_irq(struct ocrdma_dev *dev, struct ocrdma_eq *eq); + +int ocrdma_mbx_rdma_stats(struct ocrdma_dev *, bool reset); +char *port_speed_string(struct ocrdma_dev *dev); +void ocrdma_init_service_level(struct ocrdma_dev *); +void ocrdma_alloc_pd_pool(struct ocrdma_dev *dev); +void ocrdma_free_pd_range(struct ocrdma_dev *dev); + +#endif /* __OCRDMA_HW_H__ */ diff --git a/kernel/drivers/infiniband/hw/ocrdma/ocrdma_main.c b/kernel/drivers/infiniband/hw/ocrdma/ocrdma_main.c new file mode 100644 index 000000000..7a2b59aca --- /dev/null +++ b/kernel/drivers/infiniband/hw/ocrdma/ocrdma_main.c @@ -0,0 +1,682 @@ +/******************************************************************* + * This file is part of the Emulex RoCE Device Driver for * + * RoCE (RDMA over Converged Ethernet) adapters. * + * Copyright (C) 2008-2012 Emulex. All rights reserved. * + * EMULEX and SLI are trademarks of Emulex. * + * www.emulex.com * + * * + * This program is free software; you can redistribute it and/or * + * modify it under the terms of version 2 of the GNU General * + * Public License as published by the Free Software Foundation. * + * This program is distributed in the hope that it will be useful. * + * ALL EXPRESS OR IMPLIED CONDITIONS, REPRESENTATIONS AND * + * WARRANTIES, INCLUDING ANY IMPLIED WARRANTY OF MERCHANTABILITY, * + * FITNESS FOR A PARTICULAR PURPOSE, OR NON-INFRINGEMENT, ARE * + * DISCLAIMED, EXCEPT TO THE EXTENT THAT SUCH DISCLAIMERS ARE HELD * + * TO BE LEGALLY INVALID. See the GNU General Public License for * + * more details, a copy of which can be found in the file COPYING * + * included with this package. 
* + * + * Contact Information: + * linux-drivers@emulex.com + * + * Emulex + * 3333 Susan Street + * Costa Mesa, CA 92626 + *******************************************************************/ + +#include <linux/module.h> +#include <linux/idr.h> +#include <rdma/ib_verbs.h> +#include <rdma/ib_user_verbs.h> +#include <rdma/ib_addr.h> + +#include <linux/netdevice.h> +#include <net/addrconf.h> + +#include "ocrdma.h" +#include "ocrdma_verbs.h" +#include "ocrdma_ah.h" +#include "be_roce.h" +#include "ocrdma_hw.h" +#include "ocrdma_stats.h" +#include "ocrdma_abi.h" + +MODULE_VERSION(OCRDMA_ROCE_DRV_VERSION); +MODULE_DESCRIPTION(OCRDMA_ROCE_DRV_DESC " " OCRDMA_ROCE_DRV_VERSION); +MODULE_AUTHOR("Emulex Corporation"); +MODULE_LICENSE("GPL"); + +static LIST_HEAD(ocrdma_dev_list); +static DEFINE_SPINLOCK(ocrdma_devlist_lock); +static DEFINE_IDR(ocrdma_dev_id); + +static union ib_gid ocrdma_zero_sgid; + +void ocrdma_get_guid(struct ocrdma_dev *dev, u8 *guid) +{ + u8 mac_addr[6]; + + memcpy(&mac_addr[0], &dev->nic_info.mac_addr[0], ETH_ALEN); + guid[0] = mac_addr[0] ^ 2; + guid[1] = mac_addr[1]; + guid[2] = mac_addr[2]; + guid[3] = 0xff; + guid[4] = 0xfe; + guid[5] = mac_addr[3]; + guid[6] = mac_addr[4]; + guid[7] = mac_addr[5]; +} + +static bool ocrdma_add_sgid(struct ocrdma_dev *dev, union ib_gid *new_sgid) +{ + int i; + unsigned long flags; + + memset(&ocrdma_zero_sgid, 0, sizeof(union ib_gid)); + + + spin_lock_irqsave(&dev->sgid_lock, flags); + for (i = 0; i < OCRDMA_MAX_SGID; i++) { + if (!memcmp(&dev->sgid_tbl[i], &ocrdma_zero_sgid, + sizeof(union ib_gid))) { + /* found free entry */ + memcpy(&dev->sgid_tbl[i], new_sgid, + sizeof(union ib_gid)); + spin_unlock_irqrestore(&dev->sgid_lock, flags); + return true; + } else if (!memcmp(&dev->sgid_tbl[i], new_sgid, + sizeof(union ib_gid))) { + /* entry already present, no addition is required. */ + spin_unlock_irqrestore(&dev->sgid_lock, flags); + return false; + } + } + spin_unlock_irqrestore(&dev->sgid_lock, flags); + return false; +} + +static bool ocrdma_del_sgid(struct ocrdma_dev *dev, union ib_gid *sgid) +{ + int found = false; + int i; + unsigned long flags; + + + spin_lock_irqsave(&dev->sgid_lock, flags); + /* first is default sgid, which cannot be deleted.
*/ + for (i = 1; i < OCRDMA_MAX_SGID; i++) { + if (!memcmp(&dev->sgid_tbl[i], sgid, sizeof(union ib_gid))) { + /* found matching entry */ + memset(&dev->sgid_tbl[i], 0, sizeof(union ib_gid)); + found = true; + break; + } + } + spin_unlock_irqrestore(&dev->sgid_lock, flags); + return found; +} + +static int ocrdma_addr_event(unsigned long event, struct net_device *netdev, + union ib_gid *gid) +{ + struct ib_event gid_event; + struct ocrdma_dev *dev; + bool found = false; + bool updated = false; + bool is_vlan = false; + + is_vlan = netdev->priv_flags & IFF_802_1Q_VLAN; + if (is_vlan) + netdev = rdma_vlan_dev_real_dev(netdev); + + rcu_read_lock(); + list_for_each_entry_rcu(dev, &ocrdma_dev_list, entry) { + if (dev->nic_info.netdev == netdev) { + found = true; + break; + } + } + rcu_read_unlock(); + + if (!found) + return NOTIFY_DONE; + + mutex_lock(&dev->dev_lock); + switch (event) { + case NETDEV_UP: + updated = ocrdma_add_sgid(dev, gid); + break; + case NETDEV_DOWN: + updated = ocrdma_del_sgid(dev, gid); + break; + default: + break; + } + if (updated) { + /* GID table updated, notify the consumers about it */ + gid_event.device = &dev->ibdev; + gid_event.element.port_num = 1; + gid_event.event = IB_EVENT_GID_CHANGE; + ib_dispatch_event(&gid_event); + } + mutex_unlock(&dev->dev_lock); + return NOTIFY_OK; +} + +static int ocrdma_inetaddr_event(struct notifier_block *notifier, + unsigned long event, void *ptr) +{ + struct in_ifaddr *ifa = ptr; + union ib_gid gid; + struct net_device *netdev = ifa->ifa_dev->dev; + + ipv6_addr_set_v4mapped(ifa->ifa_address, (struct in6_addr *)&gid); + return ocrdma_addr_event(event, netdev, &gid); +} + +static struct notifier_block ocrdma_inetaddr_notifier = { + .notifier_call = ocrdma_inetaddr_event +}; + +#if IS_ENABLED(CONFIG_IPV6) + +static int ocrdma_inet6addr_event(struct notifier_block *notifier, + unsigned long event, void *ptr) +{ + struct inet6_ifaddr *ifa = (struct inet6_ifaddr *)ptr; + union ib_gid *gid = (union ib_gid *)&ifa->addr; + struct net_device *netdev = ifa->idev->dev; + return ocrdma_addr_event(event, netdev, gid); +} + +static struct notifier_block ocrdma_inet6addr_notifier = { + .notifier_call = ocrdma_inet6addr_event +}; + +#endif /* IPV6 and VLAN */ + +static enum rdma_link_layer ocrdma_link_layer(struct ib_device *device, + u8 port_num) +{ + return IB_LINK_LAYER_ETHERNET; +} + +static int ocrdma_register_device(struct ocrdma_dev *dev) +{ + strlcpy(dev->ibdev.name, "ocrdma%d", IB_DEVICE_NAME_MAX); + ocrdma_get_guid(dev, (u8 *)&dev->ibdev.node_guid); + memcpy(dev->ibdev.node_desc, OCRDMA_NODE_DESC, + sizeof(OCRDMA_NODE_DESC)); + dev->ibdev.owner = THIS_MODULE; + dev->ibdev.uverbs_abi_ver = OCRDMA_ABI_VERSION; + dev->ibdev.uverbs_cmd_mask = + OCRDMA_UVERBS(GET_CONTEXT) | + OCRDMA_UVERBS(QUERY_DEVICE) | + OCRDMA_UVERBS(QUERY_PORT) | + OCRDMA_UVERBS(ALLOC_PD) | + OCRDMA_UVERBS(DEALLOC_PD) | + OCRDMA_UVERBS(REG_MR) | + OCRDMA_UVERBS(DEREG_MR) | + OCRDMA_UVERBS(CREATE_COMP_CHANNEL) | + OCRDMA_UVERBS(CREATE_CQ) | + OCRDMA_UVERBS(RESIZE_CQ) | + OCRDMA_UVERBS(DESTROY_CQ) | + OCRDMA_UVERBS(REQ_NOTIFY_CQ) | + OCRDMA_UVERBS(CREATE_QP) | + OCRDMA_UVERBS(MODIFY_QP) | + OCRDMA_UVERBS(QUERY_QP) | + OCRDMA_UVERBS(DESTROY_QP) | + OCRDMA_UVERBS(POLL_CQ) | + OCRDMA_UVERBS(POST_SEND) | + OCRDMA_UVERBS(POST_RECV); + + dev->ibdev.uverbs_cmd_mask |= + OCRDMA_UVERBS(CREATE_AH) | + OCRDMA_UVERBS(MODIFY_AH) | + OCRDMA_UVERBS(QUERY_AH) | + OCRDMA_UVERBS(DESTROY_AH); + + dev->ibdev.node_type = RDMA_NODE_IB_CA; + dev->ibdev.phys_port_cnt = 1; + 
dev->ibdev.num_comp_vectors = dev->eq_cnt; + + /* mandatory verbs. */ + dev->ibdev.query_device = ocrdma_query_device; + dev->ibdev.query_port = ocrdma_query_port; + dev->ibdev.modify_port = ocrdma_modify_port; + dev->ibdev.query_gid = ocrdma_query_gid; + dev->ibdev.get_link_layer = ocrdma_link_layer; + dev->ibdev.alloc_pd = ocrdma_alloc_pd; + dev->ibdev.dealloc_pd = ocrdma_dealloc_pd; + + dev->ibdev.create_cq = ocrdma_create_cq; + dev->ibdev.destroy_cq = ocrdma_destroy_cq; + dev->ibdev.resize_cq = ocrdma_resize_cq; + + dev->ibdev.create_qp = ocrdma_create_qp; + dev->ibdev.modify_qp = ocrdma_modify_qp; + dev->ibdev.query_qp = ocrdma_query_qp; + dev->ibdev.destroy_qp = ocrdma_destroy_qp; + + dev->ibdev.query_pkey = ocrdma_query_pkey; + dev->ibdev.create_ah = ocrdma_create_ah; + dev->ibdev.destroy_ah = ocrdma_destroy_ah; + dev->ibdev.query_ah = ocrdma_query_ah; + dev->ibdev.modify_ah = ocrdma_modify_ah; + + dev->ibdev.poll_cq = ocrdma_poll_cq; + dev->ibdev.post_send = ocrdma_post_send; + dev->ibdev.post_recv = ocrdma_post_recv; + dev->ibdev.req_notify_cq = ocrdma_arm_cq; + + dev->ibdev.get_dma_mr = ocrdma_get_dma_mr; + dev->ibdev.reg_phys_mr = ocrdma_reg_kernel_mr; + dev->ibdev.dereg_mr = ocrdma_dereg_mr; + dev->ibdev.reg_user_mr = ocrdma_reg_user_mr; + + dev->ibdev.alloc_fast_reg_mr = ocrdma_alloc_frmr; + dev->ibdev.alloc_fast_reg_page_list = ocrdma_alloc_frmr_page_list; + dev->ibdev.free_fast_reg_page_list = ocrdma_free_frmr_page_list; + + /* mandatory to support user space verbs consumer. */ + dev->ibdev.alloc_ucontext = ocrdma_alloc_ucontext; + dev->ibdev.dealloc_ucontext = ocrdma_dealloc_ucontext; + dev->ibdev.mmap = ocrdma_mmap; + dev->ibdev.dma_device = &dev->nic_info.pdev->dev; + + dev->ibdev.process_mad = ocrdma_process_mad; + + if (ocrdma_get_asic_type(dev) == OCRDMA_ASIC_GEN_SKH_R) { + dev->ibdev.uverbs_cmd_mask |= + OCRDMA_UVERBS(CREATE_SRQ) | + OCRDMA_UVERBS(MODIFY_SRQ) | + OCRDMA_UVERBS(QUERY_SRQ) | + OCRDMA_UVERBS(DESTROY_SRQ) | + OCRDMA_UVERBS(POST_SRQ_RECV); + + dev->ibdev.create_srq = ocrdma_create_srq; + dev->ibdev.modify_srq = ocrdma_modify_srq; + dev->ibdev.query_srq = ocrdma_query_srq; + dev->ibdev.destroy_srq = ocrdma_destroy_srq; + dev->ibdev.post_srq_recv = ocrdma_post_srq_recv; + } + return ib_register_device(&dev->ibdev, NULL); +} + +static int ocrdma_alloc_resources(struct ocrdma_dev *dev) +{ + mutex_init(&dev->dev_lock); + dev->sgid_tbl = kzalloc(sizeof(union ib_gid) * + OCRDMA_MAX_SGID, GFP_KERNEL); + if (!dev->sgid_tbl) + goto alloc_err; + spin_lock_init(&dev->sgid_lock); + + dev->cq_tbl = kzalloc(sizeof(struct ocrdma_cq *) * + OCRDMA_MAX_CQ, GFP_KERNEL); + if (!dev->cq_tbl) + goto alloc_err; + + if (dev->attr.max_qp) { + dev->qp_tbl = kzalloc(sizeof(struct ocrdma_qp *) * + OCRDMA_MAX_QP, GFP_KERNEL); + if (!dev->qp_tbl) + goto alloc_err; + } + + dev->stag_arr = kzalloc(sizeof(u64) * OCRDMA_MAX_STAG, GFP_KERNEL); + if (dev->stag_arr == NULL) + goto alloc_err; + + ocrdma_alloc_pd_pool(dev); + + spin_lock_init(&dev->av_tbl.lock); + spin_lock_init(&dev->flush_q_lock); + return 0; +alloc_err: + pr_err("%s(%d) error.\n", __func__, dev->id); + return -ENOMEM; +} + +static void ocrdma_free_resources(struct ocrdma_dev *dev) +{ + kfree(dev->stag_arr); + kfree(dev->qp_tbl); + kfree(dev->cq_tbl); + kfree(dev->sgid_tbl); +} + +/* OCRDMA sysfs interface */ +static ssize_t show_rev(struct device *device, struct device_attribute *attr, + char *buf) +{ + struct ocrdma_dev *dev = dev_get_drvdata(device); + + return scnprintf(buf, PAGE_SIZE, "0x%x\n", 
dev->nic_info.pdev->vendor); +} + +static ssize_t show_fw_ver(struct device *device, struct device_attribute *attr, + char *buf) +{ + struct ocrdma_dev *dev = dev_get_drvdata(device); + + return scnprintf(buf, PAGE_SIZE, "%s\n", &dev->attr.fw_ver[0]); +} + +static ssize_t show_hca_type(struct device *device, + struct device_attribute *attr, char *buf) +{ + struct ocrdma_dev *dev = dev_get_drvdata(device); + + return scnprintf(buf, PAGE_SIZE, "%s\n", &dev->model_number[0]); +} + +static DEVICE_ATTR(hw_rev, S_IRUGO, show_rev, NULL); +static DEVICE_ATTR(fw_ver, S_IRUGO, show_fw_ver, NULL); +static DEVICE_ATTR(hca_type, S_IRUGO, show_hca_type, NULL); + +static struct device_attribute *ocrdma_attributes[] = { + &dev_attr_hw_rev, + &dev_attr_fw_ver, + &dev_attr_hca_type +}; + +static void ocrdma_remove_sysfiles(struct ocrdma_dev *dev) +{ + int i; + + for (i = 0; i < ARRAY_SIZE(ocrdma_attributes); i++) + device_remove_file(&dev->ibdev.dev, ocrdma_attributes[i]); +} + +static void ocrdma_add_default_sgid(struct ocrdma_dev *dev) +{ + /* GID Index 0 - Invariant manufacturer-assigned EUI-64 */ + union ib_gid *sgid = &dev->sgid_tbl[0]; + + sgid->global.subnet_prefix = cpu_to_be64(0xfe80000000000000LL); + ocrdma_get_guid(dev, &sgid->raw[8]); +} + +static void ocrdma_init_ipv4_gids(struct ocrdma_dev *dev, + struct net_device *net) +{ + struct in_device *in_dev; + union ib_gid gid; + in_dev = in_dev_get(net); + if (in_dev) { + for_ifa(in_dev) { + ipv6_addr_set_v4mapped(ifa->ifa_address, + (struct in6_addr *)&gid); + ocrdma_add_sgid(dev, &gid); + } + endfor_ifa(in_dev); + in_dev_put(in_dev); + } +} + +static void ocrdma_init_ipv6_gids(struct ocrdma_dev *dev, + struct net_device *net) +{ +#if IS_ENABLED(CONFIG_IPV6) + struct inet6_dev *in6_dev; + union ib_gid *pgid; + struct inet6_ifaddr *ifp; + in6_dev = in6_dev_get(net); + if (in6_dev) { + read_lock_bh(&in6_dev->lock); + list_for_each_entry(ifp, &in6_dev->addr_list, if_list) { + pgid = (union ib_gid *)&ifp->addr; + ocrdma_add_sgid(dev, pgid); + } + read_unlock_bh(&in6_dev->lock); + in6_dev_put(in6_dev); + } +#endif +} + +static void ocrdma_init_gid_table(struct ocrdma_dev *dev) +{ + struct net_device *net_dev; + + for_each_netdev(&init_net, net_dev) { + struct net_device *real_dev = rdma_vlan_dev_real_dev(net_dev) ? 
+ rdma_vlan_dev_real_dev(net_dev) : net_dev; + + if (real_dev == dev->nic_info.netdev) { + ocrdma_add_default_sgid(dev); + ocrdma_init_ipv4_gids(dev, net_dev); + ocrdma_init_ipv6_gids(dev, net_dev); + } + } +} + +static struct ocrdma_dev *ocrdma_add(struct be_dev_info *dev_info) +{ + int status = 0, i; + struct ocrdma_dev *dev; + + dev = (struct ocrdma_dev *)ib_alloc_device(sizeof(struct ocrdma_dev)); + if (!dev) { + pr_err("Unable to allocate ib device\n"); + return NULL; + } + dev->mbx_cmd = kzalloc(sizeof(struct ocrdma_mqe_emb_cmd), GFP_KERNEL); + if (!dev->mbx_cmd) + goto idr_err; + + memcpy(&dev->nic_info, dev_info, sizeof(*dev_info)); + dev->id = idr_alloc(&ocrdma_dev_id, NULL, 0, 0, GFP_KERNEL); + if (dev->id < 0) + goto idr_err; + + status = ocrdma_init_hw(dev); + if (status) + goto init_err; + + status = ocrdma_alloc_resources(dev); + if (status) + goto alloc_err; + + ocrdma_init_service_level(dev); + ocrdma_init_gid_table(dev); + status = ocrdma_register_device(dev); + if (status) + goto alloc_err; + + for (i = 0; i < ARRAY_SIZE(ocrdma_attributes); i++) + if (device_create_file(&dev->ibdev.dev, ocrdma_attributes[i])) + goto sysfs_err; + spin_lock(&ocrdma_devlist_lock); + list_add_tail_rcu(&dev->entry, &ocrdma_dev_list); + spin_unlock(&ocrdma_devlist_lock); + /* Init stats */ + ocrdma_add_port_stats(dev); + /* Interrupt Moderation */ + INIT_DELAYED_WORK(&dev->eqd_work, ocrdma_eqd_set_task); + schedule_delayed_work(&dev->eqd_work, msecs_to_jiffies(1000)); + + pr_info("%s %s: %s \"%s\" port %d\n", + dev_name(&dev->nic_info.pdev->dev), hca_name(dev), + port_speed_string(dev), dev->model_number, + dev->hba_port_num); + pr_info("%s ocrdma%d driver loaded successfully\n", + dev_name(&dev->nic_info.pdev->dev), dev->id); + return dev; + +sysfs_err: + ocrdma_remove_sysfiles(dev); +alloc_err: + ocrdma_free_resources(dev); + ocrdma_cleanup_hw(dev); +init_err: + idr_remove(&ocrdma_dev_id, dev->id); +idr_err: + kfree(dev->mbx_cmd); + ib_dealloc_device(&dev->ibdev); + pr_err("%s() leaving. ret=%d\n", __func__, status); + return NULL; +} + +static void ocrdma_remove_free(struct rcu_head *rcu) +{ + struct ocrdma_dev *dev = container_of(rcu, struct ocrdma_dev, rcu); + + idr_remove(&ocrdma_dev_id, dev->id); + kfree(dev->mbx_cmd); + ib_dealloc_device(&dev->ibdev); +} + +static void ocrdma_remove(struct ocrdma_dev *dev) +{ + /* first unregister with stack to stop all the active traffic + * of the registered clients. 
+ */ + cancel_delayed_work_sync(&dev->eqd_work); + ocrdma_remove_sysfiles(dev); + ib_unregister_device(&dev->ibdev); + + ocrdma_rem_port_stats(dev); + + spin_lock(&ocrdma_devlist_lock); + list_del_rcu(&dev->entry); + spin_unlock(&ocrdma_devlist_lock); + + ocrdma_free_resources(dev); + ocrdma_cleanup_hw(dev); + + call_rcu(&dev->rcu, ocrdma_remove_free); +} + +static int ocrdma_open(struct ocrdma_dev *dev) +{ + struct ib_event port_event; + + port_event.event = IB_EVENT_PORT_ACTIVE; + port_event.element.port_num = 1; + port_event.device = &dev->ibdev; + ib_dispatch_event(&port_event); + return 0; +} + +static int ocrdma_close(struct ocrdma_dev *dev) +{ + int i; + struct ocrdma_qp *qp, **cur_qp; + struct ib_event err_event; + struct ib_qp_attr attrs; + int attr_mask = IB_QP_STATE; + + attrs.qp_state = IB_QPS_ERR; + mutex_lock(&dev->dev_lock); + if (dev->qp_tbl) { + cur_qp = dev->qp_tbl; + for (i = 0; i < OCRDMA_MAX_QP; i++) { + qp = cur_qp[i]; + if (qp && qp->ibqp.qp_type != IB_QPT_GSI) { + /* change the QP state to ERROR */ + _ocrdma_modify_qp(&qp->ibqp, &attrs, attr_mask); + + err_event.event = IB_EVENT_QP_FATAL; + err_event.element.qp = &qp->ibqp; + err_event.device = &dev->ibdev; + ib_dispatch_event(&err_event); + } + } + } + mutex_unlock(&dev->dev_lock); + + err_event.event = IB_EVENT_PORT_ERR; + err_event.element.port_num = 1; + err_event.device = &dev->ibdev; + ib_dispatch_event(&err_event); + return 0; +} + +static void ocrdma_shutdown(struct ocrdma_dev *dev) +{ + ocrdma_close(dev); + ocrdma_remove(dev); +} + +/* event handling via NIC driver ensures that all the NIC specific + * initialization done before RoCE driver notifies + * event to stack. + */ +static void ocrdma_event_handler(struct ocrdma_dev *dev, u32 event) +{ + switch (event) { + case BE_DEV_UP: + ocrdma_open(dev); + break; + case BE_DEV_DOWN: + ocrdma_close(dev); + break; + case BE_DEV_SHUTDOWN: + ocrdma_shutdown(dev); + break; + } +} + +static struct ocrdma_driver ocrdma_drv = { + .name = "ocrdma_driver", + .add = ocrdma_add, + .remove = ocrdma_remove, + .state_change_handler = ocrdma_event_handler, + .be_abi_version = OCRDMA_BE_ROCE_ABI_VERSION, +}; + +static void ocrdma_unregister_inet6addr_notifier(void) +{ +#if IS_ENABLED(CONFIG_IPV6) + unregister_inet6addr_notifier(&ocrdma_inet6addr_notifier); +#endif +} + +static void ocrdma_unregister_inetaddr_notifier(void) +{ + unregister_inetaddr_notifier(&ocrdma_inetaddr_notifier); +} + +static int __init ocrdma_init_module(void) +{ + int status; + + ocrdma_init_debugfs(); + + status = register_inetaddr_notifier(&ocrdma_inetaddr_notifier); + if (status) + return status; + +#if IS_ENABLED(CONFIG_IPV6) + status = register_inet6addr_notifier(&ocrdma_inet6addr_notifier); + if (status) + goto err_notifier6; +#endif + + status = be_roce_register_driver(&ocrdma_drv); + if (status) + goto err_be_reg; + + return 0; + +err_be_reg: +#if IS_ENABLED(CONFIG_IPV6) + ocrdma_unregister_inet6addr_notifier(); +err_notifier6: +#endif + ocrdma_unregister_inetaddr_notifier(); + return status; +} + +static void __exit ocrdma_exit_module(void) +{ + be_roce_unregister_driver(&ocrdma_drv); + ocrdma_unregister_inet6addr_notifier(); + ocrdma_unregister_inetaddr_notifier(); + ocrdma_rem_debugfs(); +} + +module_init(ocrdma_init_module); +module_exit(ocrdma_exit_module); diff --git a/kernel/drivers/infiniband/hw/ocrdma/ocrdma_sli.h b/kernel/drivers/infiniband/hw/ocrdma/ocrdma_sli.h new file mode 100644 index 000000000..02ad0aee9 --- /dev/null +++ b/kernel/drivers/infiniband/hw/ocrdma/ocrdma_sli.h @@ -0,0 
+1,2173 @@ +/******************************************************************* + * This file is part of the Emulex RoCE Device Driver for * + * RoCE (RDMA over Converged Ethernet) adapters. * + * Copyright (C) 2008-2012 Emulex. All rights reserved. * + * EMULEX and SLI are trademarks of Emulex. * + * www.emulex.com * + * * + * This program is free software; you can redistribute it and/or * + * modify it under the terms of version 2 of the GNU General * + * Public License as published by the Free Software Foundation. * + * This program is distributed in the hope that it will be useful. * + * ALL EXPRESS OR IMPLIED CONDITIONS, REPRESENTATIONS AND * + * WARRANTIES, INCLUDING ANY IMPLIED WARRANTY OF MERCHANTABILITY, * + * FITNESS FOR A PARTICULAR PURPOSE, OR NON-INFRINGEMENT, ARE * + * DISCLAIMED, EXCEPT TO THE EXTENT THAT SUCH DISCLAIMERS ARE HELD * + * TO BE LEGALLY INVALID. See the GNU General Public License for * + * more details, a copy of which can be found in the file COPYING * + * included with this package. * + * + * Contact Information: + * linux-drivers@emulex.com + * + * Emulex + * 3333 Susan Street + * Costa Mesa, CA 92626 + *******************************************************************/ + +#ifndef __OCRDMA_SLI_H__ +#define __OCRDMA_SLI_H__ + +enum { + OCRDMA_ASIC_GEN_SKH_R = 0x04, + OCRDMA_ASIC_GEN_LANCER = 0x0B +}; + +enum { + OCRDMA_ASIC_REV_A0 = 0x00, + OCRDMA_ASIC_REV_B0 = 0x10, + OCRDMA_ASIC_REV_C0 = 0x20 +}; + +#define OCRDMA_SUBSYS_ROCE 10 +enum { + OCRDMA_CMD_QUERY_CONFIG = 1, + OCRDMA_CMD_ALLOC_PD = 2, + OCRDMA_CMD_DEALLOC_PD = 3, + + OCRDMA_CMD_CREATE_AH_TBL = 4, + OCRDMA_CMD_DELETE_AH_TBL = 5, + + OCRDMA_CMD_CREATE_QP = 6, + OCRDMA_CMD_QUERY_QP = 7, + OCRDMA_CMD_MODIFY_QP = 8 , + OCRDMA_CMD_DELETE_QP = 9, + + OCRDMA_CMD_RSVD1 = 10, + OCRDMA_CMD_ALLOC_LKEY = 11, + OCRDMA_CMD_DEALLOC_LKEY = 12, + OCRDMA_CMD_REGISTER_NSMR = 13, + OCRDMA_CMD_REREGISTER_NSMR = 14, + OCRDMA_CMD_REGISTER_NSMR_CONT = 15, + OCRDMA_CMD_QUERY_NSMR = 16, + OCRDMA_CMD_ALLOC_MW = 17, + OCRDMA_CMD_QUERY_MW = 18, + + OCRDMA_CMD_CREATE_SRQ = 19, + OCRDMA_CMD_QUERY_SRQ = 20, + OCRDMA_CMD_MODIFY_SRQ = 21, + OCRDMA_CMD_DELETE_SRQ = 22, + + OCRDMA_CMD_ATTACH_MCAST = 23, + OCRDMA_CMD_DETACH_MCAST = 24, + + OCRDMA_CMD_CREATE_RBQ = 25, + OCRDMA_CMD_DESTROY_RBQ = 26, + + OCRDMA_CMD_GET_RDMA_STATS = 27, + OCRDMA_CMD_ALLOC_PD_RANGE = 28, + OCRDMA_CMD_DEALLOC_PD_RANGE = 29, + + OCRDMA_CMD_MAX +}; + +#define OCRDMA_SUBSYS_COMMON 1 +enum { + OCRDMA_CMD_QUERY_NTWK_LINK_CONFIG_V1 = 5, + OCRDMA_CMD_CREATE_CQ = 12, + OCRDMA_CMD_CREATE_EQ = 13, + OCRDMA_CMD_CREATE_MQ = 21, + OCRDMA_CMD_GET_CTRL_ATTRIBUTES = 32, + OCRDMA_CMD_GET_FW_VER = 35, + OCRDMA_CMD_MODIFY_EQ_DELAY = 41, + OCRDMA_CMD_DELETE_MQ = 53, + OCRDMA_CMD_DELETE_CQ = 54, + OCRDMA_CMD_DELETE_EQ = 55, + OCRDMA_CMD_GET_FW_CONFIG = 58, + OCRDMA_CMD_CREATE_MQ_EXT = 90, + OCRDMA_CMD_PHY_DETAILS = 102 +}; + +enum { + QTYPE_EQ = 1, + QTYPE_CQ = 2, + QTYPE_MCCQ = 3 +}; + +#define OCRDMA_MAX_SGID 16 + +#define OCRDMA_MAX_QP 2048 +#define OCRDMA_MAX_CQ 2048 +#define OCRDMA_MAX_STAG 16384 + +enum { + OCRDMA_DB_RQ_OFFSET = 0xE0, + OCRDMA_DB_GEN2_RQ_OFFSET = 0x100, + OCRDMA_DB_SQ_OFFSET = 0x60, + OCRDMA_DB_GEN2_SQ_OFFSET = 0x1C0, + OCRDMA_DB_SRQ_OFFSET = OCRDMA_DB_RQ_OFFSET, + OCRDMA_DB_GEN2_SRQ_OFFSET = OCRDMA_DB_GEN2_RQ_OFFSET, + OCRDMA_DB_CQ_OFFSET = 0x120, + OCRDMA_DB_EQ_OFFSET = OCRDMA_DB_CQ_OFFSET, + OCRDMA_DB_MQ_OFFSET = 0x140, + + OCRDMA_DB_SQ_SHIFT = 16, + OCRDMA_DB_RQ_SHIFT = 24 +}; + +#define OCRDMA_DB_CQ_RING_ID_MASK 0x3FF /* bits 0 - 9 */ +#define 
OCRDMA_DB_CQ_RING_ID_EXT_MASK 0x0C00 /* bits 10-11 of qid at 12-11 */ +/* qid #2 msbits at 12-11 */ +#define OCRDMA_DB_CQ_RING_ID_EXT_MASK_SHIFT 0x1 +#define OCRDMA_DB_CQ_NUM_POPPED_SHIFT 16 /* bits 16 - 28 */ +/* Rearm bit */ +#define OCRDMA_DB_CQ_REARM_SHIFT 29 /* bit 29 */ +/* solicited bit */ +#define OCRDMA_DB_CQ_SOLICIT_SHIFT 31 /* bit 31 */ + +#define OCRDMA_EQ_ID_MASK 0x1FF /* bits 0 - 8 */ +#define OCRDMA_EQ_ID_EXT_MASK 0x3e00 /* bits 9-13 */ +#define OCRDMA_EQ_ID_EXT_MASK_SHIFT 2 /* qid bits 9-13 at 11-15 */ + +/* Clear the interrupt for this eq */ +#define OCRDMA_EQ_CLR_SHIFT 9 /* bit 9 */ +/* Must be 1 */ +#define OCRDMA_EQ_TYPE_SHIFT 10 /* bit 10 */ +/* Number of event entries processed */ +#define OCRDMA_NUM_EQE_SHIFT 16 /* bits 16 - 28 */ +/* Rearm bit */ +#define OCRDMA_REARM_SHIFT 29 /* bit 29 */ + +#define OCRDMA_MQ_ID_MASK 0x7FF /* bits 0 - 10 */ +/* Number of entries posted */ +#define OCRDMA_MQ_NUM_MQE_SHIFT 16 /* bits 16 - 29 */ + +#define OCRDMA_MIN_HPAGE_SIZE 4096 + +#define OCRDMA_MIN_Q_PAGE_SIZE 4096 +#define OCRDMA_MAX_Q_PAGES 8 + +#define OCRDMA_SLI_ASIC_ID_OFFSET 0x9C +#define OCRDMA_SLI_ASIC_REV_MASK 0x000000FF +#define OCRDMA_SLI_ASIC_GEN_NUM_MASK 0x0000FF00 +#define OCRDMA_SLI_ASIC_GEN_NUM_SHIFT 0x08 +/* +# 0: 4K Bytes +# 1: 8K Bytes +# 2: 16K Bytes +# 3: 32K Bytes +# 4: 64K Bytes +# 5: 128K Bytes +# 6: 256K Bytes +# 7: 512K Bytes +*/ +#define OCRDMA_MAX_Q_PAGE_SIZE_CNT 8 +#define OCRDMA_Q_PAGE_BASE_SIZE (OCRDMA_MIN_Q_PAGE_SIZE * OCRDMA_MAX_Q_PAGES) + +#define MAX_OCRDMA_QP_PAGES 8 +#define OCRDMA_MAX_WQE_MEM_SIZE (MAX_OCRDMA_QP_PAGES * OCRDMA_MIN_HQ_PAGE_SIZE) + +#define OCRDMA_CREATE_CQ_MAX_PAGES 4 +#define OCRDMA_DPP_CQE_SIZE 4 + +#define OCRDMA_GEN2_MAX_CQE 1024 +#define OCRDMA_GEN2_CQ_PAGE_SIZE 4096 +#define OCRDMA_GEN2_WQE_SIZE 256 +#define OCRDMA_MAX_CQE 4095 +#define OCRDMA_CQ_PAGE_SIZE 16384 +#define OCRDMA_WQE_SIZE 128 +#define OCRDMA_WQE_STRIDE 8 +#define OCRDMA_WQE_ALIGN_BYTES 16 + +#define MAX_OCRDMA_SRQ_PAGES MAX_OCRDMA_QP_PAGES + +enum { + OCRDMA_MCH_OPCODE_SHIFT = 0, + OCRDMA_MCH_OPCODE_MASK = 0xFF, + OCRDMA_MCH_SUBSYS_SHIFT = 8, + OCRDMA_MCH_SUBSYS_MASK = 0xFF00 +}; + +/* mailbox cmd header */ +struct ocrdma_mbx_hdr { + u32 subsys_op; + u32 timeout; /* in seconds */ + u32 cmd_len; + u32 rsvd_version; +}; + +enum { + OCRDMA_MBX_RSP_OPCODE_SHIFT = 0, + OCRDMA_MBX_RSP_OPCODE_MASK = 0xFF, + OCRDMA_MBX_RSP_SUBSYS_SHIFT = 8, + OCRDMA_MBX_RSP_SUBSYS_MASK = 0xFF << OCRDMA_MBX_RSP_SUBSYS_SHIFT, + + OCRDMA_MBX_RSP_STATUS_SHIFT = 0, + OCRDMA_MBX_RSP_STATUS_MASK = 0xFF, + OCRDMA_MBX_RSP_ASTATUS_SHIFT = 8, + OCRDMA_MBX_RSP_ASTATUS_MASK = 0xFF << OCRDMA_MBX_RSP_ASTATUS_SHIFT +}; + +/* mailbox cmd response */ +struct ocrdma_mbx_rsp { + u32 subsys_op; + u32 status; + u32 rsp_len; + u32 add_rsp_len; +}; + +enum { + OCRDMA_MQE_EMBEDDED = 1, + OCRDMA_MQE_NONEMBEDDED = 0 +}; + +struct ocrdma_mqe_sge { + u32 pa_lo; + u32 pa_hi; + u32 len; +}; + +enum { + OCRDMA_MQE_HDR_EMB_SHIFT = 0, + OCRDMA_MQE_HDR_EMB_MASK = BIT(0), + OCRDMA_MQE_HDR_SGE_CNT_SHIFT = 3, + OCRDMA_MQE_HDR_SGE_CNT_MASK = 0x1F << OCRDMA_MQE_HDR_SGE_CNT_SHIFT, + OCRDMA_MQE_HDR_SPECIAL_SHIFT = 24, + OCRDMA_MQE_HDR_SPECIAL_MASK = 0xFF << OCRDMA_MQE_HDR_SPECIAL_SHIFT +}; + +struct ocrdma_mqe_hdr { + u32 spcl_sge_cnt_emb; + u32 pyld_len; + u32 tag_lo; + u32 tag_hi; + u32 rsvd3; +}; + +struct ocrdma_mqe_emb_cmd { + struct ocrdma_mbx_hdr mch; + u8 pyld[220]; +}; + +struct ocrdma_mqe { + struct ocrdma_mqe_hdr hdr; + union { + struct ocrdma_mqe_emb_cmd emb_req; + struct { + struct ocrdma_mqe_sge sge[19]; + 
} nonemb_req; + u8 cmd[236]; + struct ocrdma_mbx_rsp rsp; + } u; +}; + +#define OCRDMA_EQ_LEN 4096 +#define OCRDMA_MQ_CQ_LEN 256 +#define OCRDMA_MQ_LEN 128 + +#define PAGE_SHIFT_4K 12 +#define PAGE_SIZE_4K (1 << PAGE_SHIFT_4K) + +/* Returns number of pages spanned by the data starting at the given addr */ +#define PAGES_4K_SPANNED(_address, size) \ + ((u32)((((size_t)(_address) & (PAGE_SIZE_4K - 1)) + \ + (size) + (PAGE_SIZE_4K - 1)) >> PAGE_SHIFT_4K)) + +struct ocrdma_delete_q_req { + struct ocrdma_mbx_hdr req; + u32 id; +}; + +struct ocrdma_pa { + u32 lo; + u32 hi; +}; + +#define MAX_OCRDMA_EQ_PAGES 8 +struct ocrdma_create_eq_req { + struct ocrdma_mbx_hdr req; + u32 num_pages; + u32 valid; + u32 cnt; + u32 delay; + u32 rsvd; + struct ocrdma_pa pa[MAX_OCRDMA_EQ_PAGES]; +}; + +enum { + OCRDMA_CREATE_EQ_VALID = BIT(29), + OCRDMA_CREATE_EQ_CNT_SHIFT = 26, + OCRDMA_CREATE_CQ_DELAY_SHIFT = 13, +}; + +struct ocrdma_create_eq_rsp { + struct ocrdma_mbx_rsp rsp; + u32 vector_eqid; +}; + +#define OCRDMA_EQ_MINOR_OTHER 0x1 + +struct ocrmda_set_eqd { + u32 eq_id; + u32 phase; + u32 delay_multiplier; +}; + +struct ocrdma_modify_eqd_cmd { + struct ocrdma_mbx_hdr req; + u32 num_eq; + struct ocrmda_set_eqd set_eqd[8]; +} __packed; + +struct ocrdma_modify_eqd_req { + struct ocrdma_mqe_hdr hdr; + struct ocrdma_modify_eqd_cmd cmd; +}; + + +struct ocrdma_modify_eq_delay_rsp { + struct ocrdma_mbx_rsp hdr; + u32 rsvd0; +} __packed; + +enum { + OCRDMA_MCQE_STATUS_SHIFT = 0, + OCRDMA_MCQE_STATUS_MASK = 0xFFFF, + OCRDMA_MCQE_ESTATUS_SHIFT = 16, + OCRDMA_MCQE_ESTATUS_MASK = 0xFFFF << OCRDMA_MCQE_ESTATUS_SHIFT, + OCRDMA_MCQE_CONS_SHIFT = 27, + OCRDMA_MCQE_CONS_MASK = BIT(27), + OCRDMA_MCQE_CMPL_SHIFT = 28, + OCRDMA_MCQE_CMPL_MASK = BIT(28), + OCRDMA_MCQE_AE_SHIFT = 30, + OCRDMA_MCQE_AE_MASK = BIT(30), + OCRDMA_MCQE_VALID_SHIFT = 31, + OCRDMA_MCQE_VALID_MASK = BIT(31) +}; + +struct ocrdma_mcqe { + u32 status; + u32 tag_lo; + u32 tag_hi; + u32 valid_ae_cmpl_cons; +}; + +enum { + OCRDMA_AE_MCQE_QPVALID = BIT(31), + OCRDMA_AE_MCQE_QPID_MASK = 0xFFFF, + + OCRDMA_AE_MCQE_CQVALID = BIT(31), + OCRDMA_AE_MCQE_CQID_MASK = 0xFFFF, + OCRDMA_AE_MCQE_VALID = BIT(31), + OCRDMA_AE_MCQE_AE = BIT(30), + OCRDMA_AE_MCQE_EVENT_TYPE_SHIFT = 16, + OCRDMA_AE_MCQE_EVENT_TYPE_MASK = + 0xFF << OCRDMA_AE_MCQE_EVENT_TYPE_SHIFT, + OCRDMA_AE_MCQE_EVENT_CODE_SHIFT = 8, + OCRDMA_AE_MCQE_EVENT_CODE_MASK = + 0xFF << OCRDMA_AE_MCQE_EVENT_CODE_SHIFT +}; +struct ocrdma_ae_mcqe { + u32 qpvalid_qpid; + u32 cqvalid_cqid; + u32 evt_tag; + u32 valid_ae_event; +}; + +enum { + OCRDMA_AE_PVID_MCQE_ENABLED_SHIFT = 0, + OCRDMA_AE_PVID_MCQE_ENABLED_MASK = 0xFF, + OCRDMA_AE_PVID_MCQE_TAG_SHIFT = 16, + OCRDMA_AE_PVID_MCQE_TAG_MASK = 0xFFFF << OCRDMA_AE_PVID_MCQE_TAG_SHIFT +}; + +struct ocrdma_ae_pvid_mcqe { + u32 tag_enabled; + u32 event_tag; + u32 rsvd1; + u32 rsvd2; +}; + +enum { + OCRDMA_AE_MPA_MCQE_REQ_ID_SHIFT = 16, + OCRDMA_AE_MPA_MCQE_REQ_ID_MASK = 0xFFFF << + OCRDMA_AE_MPA_MCQE_REQ_ID_SHIFT, + + OCRDMA_AE_MPA_MCQE_EVENT_CODE_SHIFT = 8, + OCRDMA_AE_MPA_MCQE_EVENT_CODE_MASK = 0xFF << + OCRDMA_AE_MPA_MCQE_EVENT_CODE_SHIFT, + OCRDMA_AE_MPA_MCQE_EVENT_TYPE_SHIFT = 16, + OCRDMA_AE_MPA_MCQE_EVENT_TYPE_MASK = 0xFF << + OCRDMA_AE_MPA_MCQE_EVENT_TYPE_SHIFT, + OCRDMA_AE_MPA_MCQE_EVENT_AE_SHIFT = 30, + OCRDMA_AE_MPA_MCQE_EVENT_AE_MASK = BIT(30), + OCRDMA_AE_MPA_MCQE_EVENT_VALID_SHIFT = 31, + OCRDMA_AE_MPA_MCQE_EVENT_VALID_MASK = BIT(31) +}; + +struct ocrdma_ae_mpa_mcqe { + u32 req_id; + u32 w1; + u32 w2; + u32 valid_ae_event; +}; + +enum { + 
OCRDMA_AE_QP_MCQE_NEW_QP_STATE_SHIFT = 0, + OCRDMA_AE_QP_MCQE_NEW_QP_STATE_MASK = 0xFFFF, + OCRDMA_AE_QP_MCQE_QP_ID_SHIFT = 16, + OCRDMA_AE_QP_MCQE_QP_ID_MASK = 0xFFFF << + OCRDMA_AE_QP_MCQE_QP_ID_SHIFT, + + OCRDMA_AE_QP_MCQE_EVENT_CODE_SHIFT = 8, + OCRDMA_AE_QP_MCQE_EVENT_CODE_MASK = 0xFF << + OCRDMA_AE_QP_MCQE_EVENT_CODE_SHIFT, + OCRDMA_AE_QP_MCQE_EVENT_TYPE_SHIFT = 16, + OCRDMA_AE_QP_MCQE_EVENT_TYPE_MASK = 0xFF << + OCRDMA_AE_QP_MCQE_EVENT_TYPE_SHIFT, + OCRDMA_AE_QP_MCQE_EVENT_AE_SHIFT = 30, + OCRDMA_AE_QP_MCQE_EVENT_AE_MASK = BIT(30), + OCRDMA_AE_QP_MCQE_EVENT_VALID_SHIFT = 31, + OCRDMA_AE_QP_MCQE_EVENT_VALID_MASK = BIT(31) +}; + +struct ocrdma_ae_qp_mcqe { + u32 qp_id_state; + u32 w1; + u32 w2; + u32 valid_ae_event; +}; + +#define OCRDMA_ASYNC_RDMA_EVE_CODE 0x14 +#define OCRDMA_ASYNC_GRP5_EVE_CODE 0x5 + +enum ocrdma_async_grp5_events { + OCRDMA_ASYNC_EVENT_QOS_VALUE = 0x01, + OCRDMA_ASYNC_EVENT_COS_VALUE = 0x02, + OCRDMA_ASYNC_EVENT_PVID_STATE = 0x03 +}; + +enum OCRDMA_ASYNC_EVENT_TYPE { + OCRDMA_CQ_ERROR = 0x00, + OCRDMA_CQ_OVERRUN_ERROR = 0x01, + OCRDMA_CQ_QPCAT_ERROR = 0x02, + OCRDMA_QP_ACCESS_ERROR = 0x03, + OCRDMA_QP_COMM_EST_EVENT = 0x04, + OCRDMA_SQ_DRAINED_EVENT = 0x05, + OCRDMA_DEVICE_FATAL_EVENT = 0x08, + OCRDMA_SRQCAT_ERROR = 0x0E, + OCRDMA_SRQ_LIMIT_EVENT = 0x0F, + OCRDMA_QP_LAST_WQE_EVENT = 0x10, + + OCRDMA_MAX_ASYNC_ERRORS +}; + +/* mailbox command request and responses */ +enum { + OCRDMA_MBX_QUERY_CFG_CQ_OVERFLOW_SHIFT = 2, + OCRDMA_MBX_QUERY_CFG_CQ_OVERFLOW_MASK = BIT(2), + OCRDMA_MBX_QUERY_CFG_SRQ_SUPPORTED_SHIFT = 3, + OCRDMA_MBX_QUERY_CFG_SRQ_SUPPORTED_MASK = BIT(3), + OCRDMA_MBX_QUERY_CFG_MAX_QP_SHIFT = 8, + OCRDMA_MBX_QUERY_CFG_MAX_QP_MASK = 0xFFFFFF << + OCRDMA_MBX_QUERY_CFG_MAX_QP_SHIFT, + + OCRDMA_MBX_QUERY_CFG_MAX_PD_SHIFT = 16, + OCRDMA_MBX_QUERY_CFG_MAX_PD_MASK = 0xFFFF << + OCRDMA_MBX_QUERY_CFG_MAX_PD_SHIFT, + OCRDMA_MBX_QUERY_CFG_CA_ACK_DELAY_SHIFT = 8, + OCRDMA_MBX_QUERY_CFG_CA_ACK_DELAY_MASK = 0xFF << + OCRDMA_MBX_QUERY_CFG_CA_ACK_DELAY_SHIFT, + + OCRDMA_MBX_QUERY_CFG_MAX_SEND_SGE_SHIFT = 0, + OCRDMA_MBX_QUERY_CFG_MAX_SEND_SGE_MASK = 0xFFFF, + OCRDMA_MBX_QUERY_CFG_MAX_WRITE_SGE_SHIFT = 16, + OCRDMA_MBX_QUERY_CFG_MAX_WRITE_SGE_MASK = 0xFFFF << + OCRDMA_MBX_QUERY_CFG_MAX_WRITE_SGE_SHIFT, + + OCRDMA_MBX_QUERY_CFG_MAX_ORD_PER_QP_SHIFT = 0, + OCRDMA_MBX_QUERY_CFG_MAX_ORD_PER_QP_MASK = 0xFFFF, + OCRDMA_MBX_QUERY_CFG_MAX_IRD_PER_QP_SHIFT = 16, + OCRDMA_MBX_QUERY_CFG_MAX_IRD_PER_QP_MASK = 0xFFFF << + OCRDMA_MBX_QUERY_CFG_MAX_IRD_PER_QP_SHIFT, + + OCRDMA_MBX_QUERY_CFG_MAX_WQE_SIZE_OFFSET = 24, + OCRDMA_MBX_QUERY_CFG_MAX_WQE_SIZE_MASK = 0xFF << + OCRDMA_MBX_QUERY_CFG_MAX_WQE_SIZE_OFFSET, + OCRDMA_MBX_QUERY_CFG_MAX_RQE_SIZE_OFFSET = 16, + OCRDMA_MBX_QUERY_CFG_MAX_RQE_SIZE_MASK = 0xFF << + OCRDMA_MBX_QUERY_CFG_MAX_RQE_SIZE_OFFSET, + OCRDMA_MBX_QUERY_CFG_MAX_DPP_CQES_OFFSET = 0, + OCRDMA_MBX_QUERY_CFG_MAX_DPP_CQES_MASK = 0xFFFF << + OCRDMA_MBX_QUERY_CFG_MAX_DPP_CQES_OFFSET, + + OCRDMA_MBX_QUERY_CFG_MAX_SRQ_OFFSET = 16, + OCRDMA_MBX_QUERY_CFG_MAX_SRQ_MASK = 0xFFFF << + OCRDMA_MBX_QUERY_CFG_MAX_SRQ_OFFSET, + OCRDMA_MBX_QUERY_CFG_MAX_RPIR_QPS_OFFSET = 0, + OCRDMA_MBX_QUERY_CFG_MAX_RPIR_QPS_MASK = 0xFFFF << + OCRDMA_MBX_QUERY_CFG_MAX_RPIR_QPS_OFFSET, + + OCRDMA_MBX_QUERY_CFG_MAX_DPP_PDS_OFFSET = 16, + OCRDMA_MBX_QUERY_CFG_MAX_DPP_PDS_MASK = 0xFFFF << + OCRDMA_MBX_QUERY_CFG_MAX_DPP_PDS_OFFSET, + OCRDMA_MBX_QUERY_CFG_MAX_DPP_CREDITS_OFFSET = 0, + OCRDMA_MBX_QUERY_CFG_MAX_DPP_CREDITS_MASK = 0xFFFF << + OCRDMA_MBX_QUERY_CFG_MAX_DPP_CREDITS_OFFSET, + + 
OCRDMA_MBX_QUERY_CFG_MAX_DPP_QPS_OFFSET = 0, + OCRDMA_MBX_QUERY_CFG_MAX_DPP_QPS_MASK = 0xFFFF << + OCRDMA_MBX_QUERY_CFG_MAX_DPP_QPS_OFFSET, + + OCRDMA_MBX_QUERY_CFG_MAX_WQES_PER_WQ_OFFSET = 16, + OCRDMA_MBX_QUERY_CFG_MAX_WQES_PER_WQ_MASK = 0xFFFF << + OCRDMA_MBX_QUERY_CFG_MAX_WQES_PER_WQ_OFFSET, + OCRDMA_MBX_QUERY_CFG_MAX_RQES_PER_RQ_OFFSET = 0, + OCRDMA_MBX_QUERY_CFG_MAX_RQES_PER_RQ_MASK = 0xFFFF << + OCRDMA_MBX_QUERY_CFG_MAX_RQES_PER_RQ_OFFSET, + + OCRDMA_MBX_QUERY_CFG_MAX_CQ_OFFSET = 16, + OCRDMA_MBX_QUERY_CFG_MAX_CQ_MASK = 0xFFFF << + OCRDMA_MBX_QUERY_CFG_MAX_CQ_OFFSET, + OCRDMA_MBX_QUERY_CFG_MAX_CQES_PER_CQ_OFFSET = 0, + OCRDMA_MBX_QUERY_CFG_MAX_CQES_PER_CQ_MASK = 0xFFFF << + OCRDMA_MBX_QUERY_CFG_MAX_CQES_PER_CQ_OFFSET, + + OCRDMA_MBX_QUERY_CFG_MAX_SRQ_RQE_OFFSET = 16, + OCRDMA_MBX_QUERY_CFG_MAX_SRQ_RQE_MASK = 0xFFFF << + OCRDMA_MBX_QUERY_CFG_MAX_SRQ_RQE_OFFSET, + OCRDMA_MBX_QUERY_CFG_MAX_SRQ_SGE_OFFSET = 0, + OCRDMA_MBX_QUERY_CFG_MAX_SRQ_SGE_MASK = 0xFFFF << + OCRDMA_MBX_QUERY_CFG_MAX_SRQ_SGE_OFFSET, +}; + +struct ocrdma_mbx_query_config { + struct ocrdma_mqe_hdr hdr; + struct ocrdma_mbx_rsp rsp; + u32 qp_srq_cq_ird_ord; + u32 max_pd_ca_ack_delay; + u32 max_write_send_sge; + u32 max_ird_ord_per_qp; + u32 max_shared_ird_ord; + u32 max_mr; + u32 max_mr_size_hi; + u32 max_mr_size_lo; + u32 max_num_mr_pbl; + u32 max_mw; + u32 max_fmr; + u32 max_pages_per_frmr; + u32 max_mcast_group; + u32 max_mcast_qp_attach; + u32 max_total_mcast_qp_attach; + u32 wqe_rqe_stride_max_dpp_cqs; + u32 max_srq_rpir_qps; + u32 max_dpp_pds_credits; + u32 max_dpp_credits_pds_per_pd; + u32 max_wqes_rqes_per_q; + u32 max_cq_cqes_per_cq; + u32 max_srq_rqe_sge; +}; + +struct ocrdma_fw_ver_rsp { + struct ocrdma_mqe_hdr hdr; + struct ocrdma_mbx_rsp rsp; + + u8 running_ver[32]; +}; + +struct ocrdma_fw_conf_rsp { + struct ocrdma_mqe_hdr hdr; + struct ocrdma_mbx_rsp rsp; + + u32 config_num; + u32 asic_revision; + u32 phy_port; + u32 fn_mode; + struct { + u32 mode; + u32 nic_wqid_base; + u32 nic_wq_tot; + u32 prot_wqid_base; + u32 prot_wq_tot; + u32 prot_rqid_base; + u32 prot_rqid_tot; + u32 rsvd[6]; + } ulp[2]; + u32 fn_capabilities; + u32 rsvd1; + u32 rsvd2; + u32 base_eqid; + u32 max_eq; + +}; + +enum { + OCRDMA_FN_MODE_RDMA = 0x4 +}; + +enum { + OCRDMA_IF_TYPE_MASK = 0xFFFF0000, + OCRDMA_IF_TYPE_SHIFT = 0x10, + OCRDMA_PHY_TYPE_MASK = 0x0000FFFF, + OCRDMA_FUTURE_DETAILS_MASK = 0xFFFF0000, + OCRDMA_FUTURE_DETAILS_SHIFT = 0x10, + OCRDMA_EX_PHY_DETAILS_MASK = 0x0000FFFF, + OCRDMA_FSPEED_SUPP_MASK = 0xFFFF0000, + OCRDMA_FSPEED_SUPP_SHIFT = 0x10, + OCRDMA_ASPEED_SUPP_MASK = 0x0000FFFF +}; + +struct ocrdma_get_phy_info_rsp { + struct ocrdma_mqe_hdr hdr; + struct ocrdma_mbx_rsp rsp; + + u32 ityp_ptyp; + u32 misc_params; + u32 ftrdtl_exphydtl; + u32 fspeed_aspeed; + u32 future_use[2]; +}; + +enum { + OCRDMA_PHY_SPEED_ZERO = 0x0, + OCRDMA_PHY_SPEED_10MBPS = 0x1, + OCRDMA_PHY_SPEED_100MBPS = 0x2, + OCRDMA_PHY_SPEED_1GBPS = 0x4, + OCRDMA_PHY_SPEED_10GBPS = 0x8, + OCRDMA_PHY_SPEED_40GBPS = 0x20 +}; + +enum { + OCRDMA_PORT_NUM_MASK = 0x3F, + OCRDMA_PT_MASK = 0xC0, + OCRDMA_PT_SHIFT = 0x6, + OCRDMA_LINK_DUP_MASK = 0x0000FF00, + OCRDMA_LINK_DUP_SHIFT = 0x8, + OCRDMA_PHY_PS_MASK = 0x00FF0000, + OCRDMA_PHY_PS_SHIFT = 0x10, + OCRDMA_PHY_PFLT_MASK = 0xFF000000, + OCRDMA_PHY_PFLT_SHIFT = 0x18, + OCRDMA_QOS_LNKSP_MASK = 0xFFFF0000, + OCRDMA_QOS_LNKSP_SHIFT = 0x10, + OCRDMA_LLST_MASK = 0xFF, + OCRDMA_PLFC_MASK = 0x00000400, + OCRDMA_PLFC_SHIFT = 0x8, + OCRDMA_PLRFC_MASK = 0x00000200, + OCRDMA_PLRFC_SHIFT = 0x8, + OCRDMA_PLTFC_MASK = 
0x00000100, + OCRDMA_PLTFC_SHIFT = 0x8 +}; + +struct ocrdma_get_link_speed_rsp { + struct ocrdma_mqe_hdr hdr; + struct ocrdma_mbx_rsp rsp; + + u32 pflt_pps_ld_pnum; + u32 qos_lsp; + u32 res_lls; +}; + +enum { + OCRDMA_PHYS_LINK_SPEED_ZERO = 0x0, + OCRDMA_PHYS_LINK_SPEED_10MBPS = 0x1, + OCRDMA_PHYS_LINK_SPEED_100MBPS = 0x2, + OCRDMA_PHYS_LINK_SPEED_1GBPS = 0x3, + OCRDMA_PHYS_LINK_SPEED_10GBPS = 0x4, + OCRDMA_PHYS_LINK_SPEED_20GBPS = 0x5, + OCRDMA_PHYS_LINK_SPEED_25GBPS = 0x6, + OCRDMA_PHYS_LINK_SPEED_40GBPS = 0x7, + OCRDMA_PHYS_LINK_SPEED_100GBPS = 0x8 +}; + +enum { + OCRDMA_CREATE_CQ_VER2 = 2, + OCRDMA_CREATE_CQ_VER3 = 3, + + OCRDMA_CREATE_CQ_PAGE_CNT_MASK = 0xFFFF, + OCRDMA_CREATE_CQ_PAGE_SIZE_SHIFT = 16, + OCRDMA_CREATE_CQ_PAGE_SIZE_MASK = 0xFF, + + OCRDMA_CREATE_CQ_COALESCWM_SHIFT = 12, + OCRDMA_CREATE_CQ_COALESCWM_MASK = BIT(13) | BIT(12), + OCRDMA_CREATE_CQ_FLAGS_NODELAY = BIT(14), + OCRDMA_CREATE_CQ_FLAGS_AUTO_VALID = BIT(15), + + OCRDMA_CREATE_CQ_EQ_ID_MASK = 0xFFFF, + OCRDMA_CREATE_CQ_CQE_COUNT_MASK = 0xFFFF +}; + +enum { + OCRDMA_CREATE_CQ_VER0 = 0, + OCRDMA_CREATE_CQ_DPP = 1, + OCRDMA_CREATE_CQ_TYPE_SHIFT = 24, + OCRDMA_CREATE_CQ_EQID_SHIFT = 22, + + OCRDMA_CREATE_CQ_CNT_SHIFT = 27, + OCRDMA_CREATE_CQ_FLAGS_VALID = BIT(29), + OCRDMA_CREATE_CQ_FLAGS_EVENTABLE = BIT(31), + OCRDMA_CREATE_CQ_DEF_FLAGS = OCRDMA_CREATE_CQ_FLAGS_VALID | + OCRDMA_CREATE_CQ_FLAGS_EVENTABLE | + OCRDMA_CREATE_CQ_FLAGS_NODELAY +}; + +struct ocrdma_create_cq_cmd { + struct ocrdma_mbx_hdr req; + u32 pgsz_pgcnt; + u32 ev_cnt_flags; + u32 eqn; + u32 pdid_cqecnt; + u32 rsvd6; + struct ocrdma_pa pa[OCRDMA_CREATE_CQ_MAX_PAGES]; +}; + +struct ocrdma_create_cq { + struct ocrdma_mqe_hdr hdr; + struct ocrdma_create_cq_cmd cmd; +}; + +enum { + OCRDMA_CREATE_CQ_CMD_PDID_SHIFT = 0x10 +}; + +enum { + OCRDMA_CREATE_CQ_RSP_CQ_ID_MASK = 0xFFFF +}; + +struct ocrdma_create_cq_cmd_rsp { + struct ocrdma_mbx_rsp rsp; + u32 cq_id; +}; + +struct ocrdma_create_cq_rsp { + struct ocrdma_mqe_hdr hdr; + struct ocrdma_create_cq_cmd_rsp rsp; +}; + +enum { + OCRDMA_CREATE_MQ_V0_CQ_ID_SHIFT = 22, + OCRDMA_CREATE_MQ_CQ_ID_SHIFT = 16, + OCRDMA_CREATE_MQ_RING_SIZE_SHIFT = 16, + OCRDMA_CREATE_MQ_VALID = BIT(31), + OCRDMA_CREATE_MQ_ASYNC_CQ_VALID = BIT(0) +}; + +struct ocrdma_create_mq_req { + struct ocrdma_mbx_hdr req; + u32 cqid_pages; + u32 async_event_bitmap; + u32 async_cqid_ringsize; + u32 valid; + u32 async_cqid_valid; + u32 rsvd; + struct ocrdma_pa pa[8]; +}; + +struct ocrdma_create_mq_rsp { + struct ocrdma_mbx_rsp rsp; + u32 id; +}; + +enum { + OCRDMA_DESTROY_CQ_QID_SHIFT = 0, + OCRDMA_DESTROY_CQ_QID_MASK = 0xFFFF, + OCRDMA_DESTROY_CQ_QID_BYPASS_FLUSH_SHIFT = 16, + OCRDMA_DESTROY_CQ_QID_BYPASS_FLUSH_MASK = 0xFFFF << + OCRDMA_DESTROY_CQ_QID_BYPASS_FLUSH_SHIFT +}; + +struct ocrdma_destroy_cq { + struct ocrdma_mqe_hdr hdr; + struct ocrdma_mbx_hdr req; + + u32 bypass_flush_qid; +}; + +struct ocrdma_destroy_cq_rsp { + struct ocrdma_mqe_hdr hdr; + struct ocrdma_mbx_rsp rsp; +}; + +enum { + OCRDMA_QPT_GSI = 1, + OCRDMA_QPT_RC = 2, + OCRDMA_QPT_UD = 4, +}; + +enum { + OCRDMA_CREATE_QP_REQ_PD_ID_SHIFT = 0, + OCRDMA_CREATE_QP_REQ_PD_ID_MASK = 0xFFFF, + OCRDMA_CREATE_QP_REQ_SQ_PAGE_SIZE_SHIFT = 16, + OCRDMA_CREATE_QP_REQ_RQ_PAGE_SIZE_SHIFT = 19, + OCRDMA_CREATE_QP_REQ_QPT_SHIFT = 29, + OCRDMA_CREATE_QP_REQ_QPT_MASK = BIT(31) | BIT(30) | BIT(29), + + OCRDMA_CREATE_QP_REQ_MAX_RQE_SHIFT = 0, + OCRDMA_CREATE_QP_REQ_MAX_RQE_MASK = 0xFFFF, + OCRDMA_CREATE_QP_REQ_MAX_WQE_SHIFT = 16, + OCRDMA_CREATE_QP_REQ_MAX_WQE_MASK = 0xFFFF << + 
OCRDMA_CREATE_QP_REQ_MAX_WQE_SHIFT, + + OCRDMA_CREATE_QP_REQ_MAX_SGE_WRITE_SHIFT = 0, + OCRDMA_CREATE_QP_REQ_MAX_SGE_WRITE_MASK = 0xFFFF, + OCRDMA_CREATE_QP_REQ_MAX_SGE_SEND_SHIFT = 16, + OCRDMA_CREATE_QP_REQ_MAX_SGE_SEND_MASK = 0xFFFF << + OCRDMA_CREATE_QP_REQ_MAX_SGE_SEND_SHIFT, + + OCRDMA_CREATE_QP_REQ_FMR_EN_SHIFT = 0, + OCRDMA_CREATE_QP_REQ_FMR_EN_MASK = BIT(0), + OCRDMA_CREATE_QP_REQ_ZERO_LKEYEN_SHIFT = 1, + OCRDMA_CREATE_QP_REQ_ZERO_LKEYEN_MASK = BIT(1), + OCRDMA_CREATE_QP_REQ_BIND_MEMWIN_SHIFT = 2, + OCRDMA_CREATE_QP_REQ_BIND_MEMWIN_MASK = BIT(2), + OCRDMA_CREATE_QP_REQ_INB_WREN_SHIFT = 3, + OCRDMA_CREATE_QP_REQ_INB_WREN_MASK = BIT(3), + OCRDMA_CREATE_QP_REQ_INB_RDEN_SHIFT = 4, + OCRDMA_CREATE_QP_REQ_INB_RDEN_MASK = BIT(4), + OCRDMA_CREATE_QP_REQ_USE_SRQ_SHIFT = 5, + OCRDMA_CREATE_QP_REQ_USE_SRQ_MASK = BIT(5), + OCRDMA_CREATE_QP_REQ_ENABLE_RPIR_SHIFT = 6, + OCRDMA_CREATE_QP_REQ_ENABLE_RPIR_MASK = BIT(6), + OCRDMA_CREATE_QP_REQ_ENABLE_DPP_SHIFT = 7, + OCRDMA_CREATE_QP_REQ_ENABLE_DPP_MASK = BIT(7), + OCRDMA_CREATE_QP_REQ_ENABLE_DPP_CQ_SHIFT = 8, + OCRDMA_CREATE_QP_REQ_ENABLE_DPP_CQ_MASK = BIT(8), + OCRDMA_CREATE_QP_REQ_MAX_SGE_RECV_SHIFT = 16, + OCRDMA_CREATE_QP_REQ_MAX_SGE_RECV_MASK = 0xFFFF << + OCRDMA_CREATE_QP_REQ_MAX_SGE_RECV_SHIFT, + + OCRDMA_CREATE_QP_REQ_MAX_IRD_SHIFT = 0, + OCRDMA_CREATE_QP_REQ_MAX_IRD_MASK = 0xFFFF, + OCRDMA_CREATE_QP_REQ_MAX_ORD_SHIFT = 16, + OCRDMA_CREATE_QP_REQ_MAX_ORD_MASK = 0xFFFF << + OCRDMA_CREATE_QP_REQ_MAX_ORD_SHIFT, + + OCRDMA_CREATE_QP_REQ_NUM_RQ_PAGES_SHIFT = 0, + OCRDMA_CREATE_QP_REQ_NUM_RQ_PAGES_MASK = 0xFFFF, + OCRDMA_CREATE_QP_REQ_NUM_WQ_PAGES_SHIFT = 16, + OCRDMA_CREATE_QP_REQ_NUM_WQ_PAGES_MASK = 0xFFFF << + OCRDMA_CREATE_QP_REQ_NUM_WQ_PAGES_SHIFT, + + OCRDMA_CREATE_QP_REQ_RQE_SIZE_SHIFT = 0, + OCRDMA_CREATE_QP_REQ_RQE_SIZE_MASK = 0xFFFF, + OCRDMA_CREATE_QP_REQ_WQE_SIZE_SHIFT = 16, + OCRDMA_CREATE_QP_REQ_WQE_SIZE_MASK = 0xFFFF << + OCRDMA_CREATE_QP_REQ_WQE_SIZE_SHIFT, + + OCRDMA_CREATE_QP_REQ_RQ_CQID_SHIFT = 0, + OCRDMA_CREATE_QP_REQ_RQ_CQID_MASK = 0xFFFF, + OCRDMA_CREATE_QP_REQ_WQ_CQID_SHIFT = 16, + OCRDMA_CREATE_QP_REQ_WQ_CQID_MASK = 0xFFFF << + OCRDMA_CREATE_QP_REQ_WQ_CQID_SHIFT, + + OCRDMA_CREATE_QP_REQ_DPP_CQPID_SHIFT = 0, + OCRDMA_CREATE_QP_REQ_DPP_CQPID_MASK = 0xFFFF, + OCRDMA_CREATE_QP_REQ_DPP_CREDIT_SHIFT = 16, + OCRDMA_CREATE_QP_REQ_DPP_CREDIT_MASK = 0xFFFF << + OCRDMA_CREATE_QP_REQ_DPP_CREDIT_SHIFT +}; + +enum { + OCRDMA_CREATE_QP_REQ_DPP_CREDIT_LIMIT = 16, + OCRDMA_CREATE_QP_RSP_DPP_PAGE_SHIFT = 1 +}; + +#define MAX_OCRDMA_IRD_PAGES 4 + +enum ocrdma_qp_flags { + OCRDMA_QP_MW_BIND = 1, + OCRDMA_QP_LKEY0 = (1 << 1), + OCRDMA_QP_FAST_REG = (1 << 2), + OCRDMA_QP_INB_RD = (1 << 6), + OCRDMA_QP_INB_WR = (1 << 7), +}; + +enum ocrdma_qp_state { + OCRDMA_QPS_RST = 0, + OCRDMA_QPS_INIT = 1, + OCRDMA_QPS_RTR = 2, + OCRDMA_QPS_RTS = 3, + OCRDMA_QPS_SQE = 4, + OCRDMA_QPS_SQ_DRAINING = 5, + OCRDMA_QPS_ERR = 6, + OCRDMA_QPS_SQD = 7 +}; + +struct ocrdma_create_qp_req { + struct ocrdma_mqe_hdr hdr; + struct ocrdma_mbx_hdr req; + + u32 type_pgsz_pdn; + u32 max_wqe_rqe; + u32 max_sge_send_write; + u32 max_sge_recv_flags; + u32 max_ord_ird; + u32 num_wq_rq_pages; + u32 wqe_rqe_size; + u32 wq_rq_cqid; + struct ocrdma_pa wq_addr[MAX_OCRDMA_QP_PAGES]; + struct ocrdma_pa rq_addr[MAX_OCRDMA_QP_PAGES]; + u32 dpp_credits_cqid; + u32 rpir_lkey; + struct ocrdma_pa ird_addr[MAX_OCRDMA_IRD_PAGES]; +}; + +enum { + OCRDMA_CREATE_QP_RSP_QP_ID_SHIFT = 0, + OCRDMA_CREATE_QP_RSP_QP_ID_MASK = 0xFFFF, + + OCRDMA_CREATE_QP_RSP_MAX_RQE_SHIFT = 0, + 
OCRDMA_CREATE_QP_RSP_MAX_RQE_MASK = 0xFFFF, + OCRDMA_CREATE_QP_RSP_MAX_WQE_SHIFT = 16, + OCRDMA_CREATE_QP_RSP_MAX_WQE_MASK = 0xFFFF << + OCRDMA_CREATE_QP_RSP_MAX_WQE_SHIFT, + + OCRDMA_CREATE_QP_RSP_MAX_SGE_WRITE_SHIFT = 0, + OCRDMA_CREATE_QP_RSP_MAX_SGE_WRITE_MASK = 0xFFFF, + OCRDMA_CREATE_QP_RSP_MAX_SGE_SEND_SHIFT = 16, + OCRDMA_CREATE_QP_RSP_MAX_SGE_SEND_MASK = 0xFFFF << + OCRDMA_CREATE_QP_RSP_MAX_SGE_SEND_SHIFT, + + OCRDMA_CREATE_QP_RSP_MAX_SGE_RECV_SHIFT = 16, + OCRDMA_CREATE_QP_RSP_MAX_SGE_RECV_MASK = 0xFFFF << + OCRDMA_CREATE_QP_RSP_MAX_SGE_RECV_SHIFT, + + OCRDMA_CREATE_QP_RSP_MAX_IRD_SHIFT = 0, + OCRDMA_CREATE_QP_RSP_MAX_IRD_MASK = 0xFFFF, + OCRDMA_CREATE_QP_RSP_MAX_ORD_SHIFT = 16, + OCRDMA_CREATE_QP_RSP_MAX_ORD_MASK = 0xFFFF << + OCRDMA_CREATE_QP_RSP_MAX_ORD_SHIFT, + + OCRDMA_CREATE_QP_RSP_RQ_ID_SHIFT = 0, + OCRDMA_CREATE_QP_RSP_RQ_ID_MASK = 0xFFFF, + OCRDMA_CREATE_QP_RSP_SQ_ID_SHIFT = 16, + OCRDMA_CREATE_QP_RSP_SQ_ID_MASK = 0xFFFF << + OCRDMA_CREATE_QP_RSP_SQ_ID_SHIFT, + + OCRDMA_CREATE_QP_RSP_DPP_ENABLED_MASK = BIT(0), + OCRDMA_CREATE_QP_RSP_DPP_PAGE_OFFSET_SHIFT = 1, + OCRDMA_CREATE_QP_RSP_DPP_PAGE_OFFSET_MASK = 0x7FFF << + OCRDMA_CREATE_QP_RSP_DPP_PAGE_OFFSET_SHIFT, + OCRDMA_CREATE_QP_RSP_DPP_CREDITS_SHIFT = 16, + OCRDMA_CREATE_QP_RSP_DPP_CREDITS_MASK = 0xFFFF << + OCRDMA_CREATE_QP_RSP_DPP_CREDITS_SHIFT, +}; + +struct ocrdma_create_qp_rsp { + struct ocrdma_mqe_hdr hdr; + struct ocrdma_mbx_rsp rsp; + + u32 qp_id; + u32 max_wqe_rqe; + u32 max_sge_send_write; + u32 max_sge_recv; + u32 max_ord_ird; + u32 sq_rq_id; + u32 dpp_response; +}; + +struct ocrdma_destroy_qp { + struct ocrdma_mqe_hdr hdr; + struct ocrdma_mbx_hdr req; + u32 qp_id; +}; + +struct ocrdma_destroy_qp_rsp { + struct ocrdma_mqe_hdr hdr; + struct ocrdma_mbx_rsp rsp; +}; + +enum { + OCRDMA_MODIFY_QP_ID_SHIFT = 0, + OCRDMA_MODIFY_QP_ID_MASK = 0xFFFF, + + OCRDMA_QP_PARA_QPS_VALID = BIT(0), + OCRDMA_QP_PARA_SQD_ASYNC_VALID = BIT(1), + OCRDMA_QP_PARA_PKEY_VALID = BIT(2), + OCRDMA_QP_PARA_QKEY_VALID = BIT(3), + OCRDMA_QP_PARA_PMTU_VALID = BIT(4), + OCRDMA_QP_PARA_ACK_TO_VALID = BIT(5), + OCRDMA_QP_PARA_RETRY_CNT_VALID = BIT(6), + OCRDMA_QP_PARA_RRC_VALID = BIT(7), + OCRDMA_QP_PARA_RQPSN_VALID = BIT(8), + OCRDMA_QP_PARA_MAX_IRD_VALID = BIT(9), + OCRDMA_QP_PARA_MAX_ORD_VALID = BIT(10), + OCRDMA_QP_PARA_RNT_VALID = BIT(11), + OCRDMA_QP_PARA_SQPSN_VALID = BIT(12), + OCRDMA_QP_PARA_DST_QPN_VALID = BIT(13), + OCRDMA_QP_PARA_MAX_WQE_VALID = BIT(14), + OCRDMA_QP_PARA_MAX_RQE_VALID = BIT(15), + OCRDMA_QP_PARA_SGE_SEND_VALID = BIT(16), + OCRDMA_QP_PARA_SGE_RECV_VALID = BIT(17), + OCRDMA_QP_PARA_SGE_WR_VALID = BIT(18), + OCRDMA_QP_PARA_INB_RDEN_VALID = BIT(19), + OCRDMA_QP_PARA_INB_WREN_VALID = BIT(20), + OCRDMA_QP_PARA_FLOW_LBL_VALID = BIT(21), + OCRDMA_QP_PARA_BIND_EN_VALID = BIT(22), + OCRDMA_QP_PARA_ZLKEY_EN_VALID = BIT(23), + OCRDMA_QP_PARA_FMR_EN_VALID = BIT(24), + OCRDMA_QP_PARA_INBAT_EN_VALID = BIT(25), + OCRDMA_QP_PARA_VLAN_EN_VALID = BIT(26), + + OCRDMA_MODIFY_QP_FLAGS_RD = BIT(0), + OCRDMA_MODIFY_QP_FLAGS_WR = BIT(1), + OCRDMA_MODIFY_QP_FLAGS_SEND = BIT(2), + OCRDMA_MODIFY_QP_FLAGS_ATOMIC = BIT(3) +}; + +enum { + OCRDMA_QP_PARAMS_SRQ_ID_SHIFT = 0, + OCRDMA_QP_PARAMS_SRQ_ID_MASK = 0xFFFF, + + OCRDMA_QP_PARAMS_MAX_RQE_SHIFT = 0, + OCRDMA_QP_PARAMS_MAX_RQE_MASK = 0xFFFF, + OCRDMA_QP_PARAMS_MAX_WQE_SHIFT = 16, + OCRDMA_QP_PARAMS_MAX_WQE_MASK = 0xFFFF << + OCRDMA_QP_PARAMS_MAX_WQE_SHIFT, + + OCRDMA_QP_PARAMS_MAX_SGE_WRITE_SHIFT = 0, + OCRDMA_QP_PARAMS_MAX_SGE_WRITE_MASK = 0xFFFF, + OCRDMA_QP_PARAMS_MAX_SGE_SEND_SHIFT = 16, + 
OCRDMA_QP_PARAMS_MAX_SGE_SEND_MASK = 0xFFFF << + OCRDMA_QP_PARAMS_MAX_SGE_SEND_SHIFT, + + OCRDMA_QP_PARAMS_FLAGS_FMR_EN = BIT(0), + OCRDMA_QP_PARAMS_FLAGS_LKEY_0_EN = BIT(1), + OCRDMA_QP_PARAMS_FLAGS_BIND_MW_EN = BIT(2), + OCRDMA_QP_PARAMS_FLAGS_INBWR_EN = BIT(3), + OCRDMA_QP_PARAMS_FLAGS_INBRD_EN = BIT(4), + OCRDMA_QP_PARAMS_STATE_SHIFT = 5, + OCRDMA_QP_PARAMS_STATE_MASK = BIT(5) | BIT(6) | BIT(7), + OCRDMA_QP_PARAMS_FLAGS_SQD_ASYNC = BIT(8), + OCRDMA_QP_PARAMS_FLAGS_INB_ATEN = BIT(9), + OCRDMA_QP_PARAMS_MAX_SGE_RECV_SHIFT = 16, + OCRDMA_QP_PARAMS_MAX_SGE_RECV_MASK = 0xFFFF << + OCRDMA_QP_PARAMS_MAX_SGE_RECV_SHIFT, + + OCRDMA_QP_PARAMS_MAX_IRD_SHIFT = 0, + OCRDMA_QP_PARAMS_MAX_IRD_MASK = 0xFFFF, + OCRDMA_QP_PARAMS_MAX_ORD_SHIFT = 16, + OCRDMA_QP_PARAMS_MAX_ORD_MASK = 0xFFFF << + OCRDMA_QP_PARAMS_MAX_ORD_SHIFT, + + OCRDMA_QP_PARAMS_RQ_CQID_SHIFT = 0, + OCRDMA_QP_PARAMS_RQ_CQID_MASK = 0xFFFF, + OCRDMA_QP_PARAMS_WQ_CQID_SHIFT = 16, + OCRDMA_QP_PARAMS_WQ_CQID_MASK = 0xFFFF << + OCRDMA_QP_PARAMS_WQ_CQID_SHIFT, + + OCRDMA_QP_PARAMS_RQ_PSN_SHIFT = 0, + OCRDMA_QP_PARAMS_RQ_PSN_MASK = 0xFFFFFF, + OCRDMA_QP_PARAMS_HOP_LMT_SHIFT = 24, + OCRDMA_QP_PARAMS_HOP_LMT_MASK = 0xFF << + OCRDMA_QP_PARAMS_HOP_LMT_SHIFT, + + OCRDMA_QP_PARAMS_SQ_PSN_SHIFT = 0, + OCRDMA_QP_PARAMS_SQ_PSN_MASK = 0xFFFFFF, + OCRDMA_QP_PARAMS_TCLASS_SHIFT = 24, + OCRDMA_QP_PARAMS_TCLASS_MASK = 0xFF << + OCRDMA_QP_PARAMS_TCLASS_SHIFT, + + OCRDMA_QP_PARAMS_DEST_QPN_SHIFT = 0, + OCRDMA_QP_PARAMS_DEST_QPN_MASK = 0xFFFFFF, + OCRDMA_QP_PARAMS_RNR_RETRY_CNT_SHIFT = 24, + OCRDMA_QP_PARAMS_RNR_RETRY_CNT_MASK = 0x7 << + OCRDMA_QP_PARAMS_RNR_RETRY_CNT_SHIFT, + OCRDMA_QP_PARAMS_ACK_TIMEOUT_SHIFT = 27, + OCRDMA_QP_PARAMS_ACK_TIMEOUT_MASK = 0x1F << + OCRDMA_QP_PARAMS_ACK_TIMEOUT_SHIFT, + + OCRDMA_QP_PARAMS_PKEY_IDNEX_SHIFT = 0, + OCRDMA_QP_PARAMS_PKEY_INDEX_MASK = 0xFFFF, + OCRDMA_QP_PARAMS_PATH_MTU_SHIFT = 18, + OCRDMA_QP_PARAMS_PATH_MTU_MASK = 0x3FFF << + OCRDMA_QP_PARAMS_PATH_MTU_SHIFT, + + OCRDMA_QP_PARAMS_FLOW_LABEL_SHIFT = 0, + OCRDMA_QP_PARAMS_FLOW_LABEL_MASK = 0xFFFFF, + OCRDMA_QP_PARAMS_SL_SHIFT = 20, + OCRDMA_QP_PARAMS_SL_MASK = 0xF << + OCRDMA_QP_PARAMS_SL_SHIFT, + OCRDMA_QP_PARAMS_RETRY_CNT_SHIFT = 24, + OCRDMA_QP_PARAMS_RETRY_CNT_MASK = 0x7 << + OCRDMA_QP_PARAMS_RETRY_CNT_SHIFT, + OCRDMA_QP_PARAMS_RNR_NAK_TIMER_SHIFT = 27, + OCRDMA_QP_PARAMS_RNR_NAK_TIMER_MASK = 0x1F << + OCRDMA_QP_PARAMS_RNR_NAK_TIMER_SHIFT, + + OCRDMA_QP_PARAMS_DMAC_B4_TO_B5_SHIFT = 0, + OCRDMA_QP_PARAMS_DMAC_B4_TO_B5_MASK = 0xFFFF, + OCRDMA_QP_PARAMS_VLAN_SHIFT = 16, + OCRDMA_QP_PARAMS_VLAN_MASK = 0xFFFF << + OCRDMA_QP_PARAMS_VLAN_SHIFT +}; + +struct ocrdma_qp_params { + u32 id; + u32 max_wqe_rqe; + u32 max_sge_send_write; + u32 max_sge_recv_flags; + u32 max_ord_ird; + u32 wq_rq_cqid; + u32 hop_lmt_rq_psn; + u32 tclass_sq_psn; + u32 ack_to_rnr_rtc_dest_qpn; + u32 path_mtu_pkey_indx; + u32 rnt_rc_sl_fl; + u8 sgid[16]; + u8 dgid[16]; + u32 dmac_b0_to_b3; + u32 vlan_dmac_b4_to_b5; + u32 qkey; +}; + + +struct ocrdma_modify_qp { + struct ocrdma_mqe_hdr hdr; + struct ocrdma_mbx_hdr req; + + struct ocrdma_qp_params params; + u32 flags; + u32 rdma_flags; + u32 num_outstanding_atomic_rd; +}; + +enum { + OCRDMA_MODIFY_QP_RSP_MAX_RQE_SHIFT = 0, + OCRDMA_MODIFY_QP_RSP_MAX_RQE_MASK = 0xFFFF, + OCRDMA_MODIFY_QP_RSP_MAX_WQE_SHIFT = 16, + OCRDMA_MODIFY_QP_RSP_MAX_WQE_MASK = 0xFFFF << + OCRDMA_MODIFY_QP_RSP_MAX_WQE_SHIFT, + + OCRDMA_MODIFY_QP_RSP_MAX_IRD_SHIFT = 0, + OCRDMA_MODIFY_QP_RSP_MAX_IRD_MASK = 0xFFFF, + OCRDMA_MODIFY_QP_RSP_MAX_ORD_SHIFT = 16, + 
OCRDMA_MODIFY_QP_RSP_MAX_ORD_MASK = 0xFFFF << + OCRDMA_MODIFY_QP_RSP_MAX_ORD_SHIFT +}; + +struct ocrdma_modify_qp_rsp { + struct ocrdma_mqe_hdr hdr; + struct ocrdma_mbx_rsp rsp; + + u32 max_wqe_rqe; + u32 max_ord_ird; +}; + +struct ocrdma_query_qp { + struct ocrdma_mqe_hdr hdr; + struct ocrdma_mbx_hdr req; + +#define OCRDMA_QUERY_UP_QP_ID_SHIFT 0 +#define OCRDMA_QUERY_UP_QP_ID_MASK 0xFFFFFF + u32 qp_id; +}; + +struct ocrdma_query_qp_rsp { + struct ocrdma_mqe_hdr hdr; + struct ocrdma_mbx_rsp rsp; + struct ocrdma_qp_params params; + u32 dpp_credits_cqid; + u32 rbq_id; +}; + +enum { + OCRDMA_CREATE_SRQ_PD_ID_SHIFT = 0, + OCRDMA_CREATE_SRQ_PD_ID_MASK = 0xFFFF, + OCRDMA_CREATE_SRQ_PG_SZ_SHIFT = 16, + OCRDMA_CREATE_SRQ_PG_SZ_MASK = 0x3 << + OCRDMA_CREATE_SRQ_PG_SZ_SHIFT, + + OCRDMA_CREATE_SRQ_MAX_RQE_SHIFT = 0, + OCRDMA_CREATE_SRQ_MAX_SGE_RECV_SHIFT = 16, + OCRDMA_CREATE_SRQ_MAX_SGE_RECV_MASK = 0xFFFF << + OCRDMA_CREATE_SRQ_MAX_SGE_RECV_SHIFT, + + OCRDMA_CREATE_SRQ_RQE_SIZE_SHIFT = 0, + OCRDMA_CREATE_SRQ_RQE_SIZE_MASK = 0xFFFF, + OCRDMA_CREATE_SRQ_NUM_RQ_PAGES_SHIFT = 16, + OCRDMA_CREATE_SRQ_NUM_RQ_PAGES_MASK = 0xFFFF << + OCRDMA_CREATE_SRQ_NUM_RQ_PAGES_SHIFT +}; + +struct ocrdma_create_srq { + struct ocrdma_mqe_hdr hdr; + struct ocrdma_mbx_hdr req; + + u32 pgsz_pdid; + u32 max_sge_rqe; + u32 pages_rqe_sz; + struct ocrdma_pa rq_addr[MAX_OCRDMA_SRQ_PAGES]; +}; + +enum { + OCRDMA_CREATE_SRQ_RSP_SRQ_ID_SHIFT = 0, + OCRDMA_CREATE_SRQ_RSP_SRQ_ID_MASK = 0xFFFFFF, + + OCRDMA_CREATE_SRQ_RSP_MAX_RQE_ALLOCATED_SHIFT = 0, + OCRDMA_CREATE_SRQ_RSP_MAX_RQE_ALLOCATED_MASK = 0xFFFF, + OCRDMA_CREATE_SRQ_RSP_MAX_SGE_RECV_ALLOCATED_SHIFT = 16, + OCRDMA_CREATE_SRQ_RSP_MAX_SGE_RECV_ALLOCATED_MASK = 0xFFFF << + OCRDMA_CREATE_SRQ_RSP_MAX_SGE_RECV_ALLOCATED_SHIFT +}; + +struct ocrdma_create_srq_rsp { + struct ocrdma_mqe_hdr hdr; + struct ocrdma_mbx_rsp rsp; + + u32 id; + u32 max_sge_rqe_allocated; +}; + +enum { + OCRDMA_MODIFY_SRQ_ID_SHIFT = 0, + OCRDMA_MODIFY_SRQ_ID_MASK = 0xFFFFFF, + + OCRDMA_MODIFY_SRQ_MAX_RQE_SHIFT = 0, + OCRDMA_MODIFY_SRQ_MAX_RQE_MASK = 0xFFFF, + OCRDMA_MODIFY_SRQ_LIMIT_SHIFT = 16, + OCRDMA_MODIFY_SRQ__LIMIT_MASK = 0xFFFF << + OCRDMA_MODIFY_SRQ_LIMIT_SHIFT +}; + +struct ocrdma_modify_srq { + struct ocrdma_mqe_hdr hdr; + struct ocrdma_mbx_rsp rep; + + u32 id; + u32 limit_max_rqe; +}; + +enum { + OCRDMA_QUERY_SRQ_ID_SHIFT = 0, + OCRDMA_QUERY_SRQ_ID_MASK = 0xFFFFFF +}; + +struct ocrdma_query_srq { + struct ocrdma_mqe_hdr hdr; + struct ocrdma_mbx_rsp req; + + u32 id; +}; + +enum { + OCRDMA_QUERY_SRQ_RSP_PD_ID_SHIFT = 0, + OCRDMA_QUERY_SRQ_RSP_PD_ID_MASK = 0xFFFF, + OCRDMA_QUERY_SRQ_RSP_MAX_RQE_SHIFT = 16, + OCRDMA_QUERY_SRQ_RSP_MAX_RQE_MASK = 0xFFFF << + OCRDMA_QUERY_SRQ_RSP_MAX_RQE_SHIFT, + + OCRDMA_QUERY_SRQ_RSP_MAX_SGE_RECV_SHIFT = 0, + OCRDMA_QUERY_SRQ_RSP_MAX_SGE_RECV_MASK = 0xFFFF, + OCRDMA_QUERY_SRQ_RSP_SRQ_LIMIT_SHIFT = 16, + OCRDMA_QUERY_SRQ_RSP_SRQ_LIMIT_MASK = 0xFFFF << + OCRDMA_QUERY_SRQ_RSP_SRQ_LIMIT_SHIFT +}; + +struct ocrdma_query_srq_rsp { + struct ocrdma_mqe_hdr hdr; + struct ocrdma_mbx_rsp req; + + u32 max_rqe_pdid; + u32 srq_lmt_max_sge; +}; + +enum { + OCRDMA_DESTROY_SRQ_ID_SHIFT = 0, + OCRDMA_DESTROY_SRQ_ID_MASK = 0xFFFFFF +}; + +struct ocrdma_destroy_srq { + struct ocrdma_mqe_hdr hdr; + struct ocrdma_mbx_rsp req; + + u32 id; +}; + +enum { + OCRDMA_ALLOC_PD_ENABLE_DPP = BIT(16), + OCRDMA_DPP_PAGE_SIZE = 4096 +}; + +struct ocrdma_alloc_pd { + struct ocrdma_mqe_hdr hdr; + struct ocrdma_mbx_hdr req; + u32 enable_dpp_rsvd; +}; + +enum { + OCRDMA_ALLOC_PD_RSP_DPP = BIT(16), + 
OCRDMA_ALLOC_PD_RSP_DPP_PAGE_SHIFT = 20, + OCRDMA_ALLOC_PD_RSP_PDID_MASK = 0xFFFF, +}; + +struct ocrdma_alloc_pd_rsp { + struct ocrdma_mqe_hdr hdr; + struct ocrdma_mbx_rsp rsp; + u32 dpp_page_pdid; +}; + +struct ocrdma_dealloc_pd { + struct ocrdma_mqe_hdr hdr; + struct ocrdma_mbx_hdr req; + u32 id; +}; + +struct ocrdma_dealloc_pd_rsp { + struct ocrdma_mqe_hdr hdr; + struct ocrdma_mbx_rsp rsp; +}; + +struct ocrdma_alloc_pd_range { + struct ocrdma_mqe_hdr hdr; + struct ocrdma_mbx_hdr req; + u32 enable_dpp_rsvd; + u32 pd_count; +}; + +struct ocrdma_alloc_pd_range_rsp { + struct ocrdma_mqe_hdr hdr; + struct ocrdma_mbx_rsp rsp; + u32 dpp_page_pdid; + u32 pd_count; +}; + +enum { + OCRDMA_ALLOC_PD_RNG_RSP_START_PDID_MASK = 0xFFFF, +}; + +struct ocrdma_dealloc_pd_range { + struct ocrdma_mqe_hdr hdr; + struct ocrdma_mbx_hdr req; + u32 start_pd_id; + u32 pd_count; +}; + +struct ocrdma_dealloc_pd_range_rsp { + struct ocrdma_mqe_hdr hdr; + struct ocrdma_mbx_hdr req; + u32 rsvd; +}; + +enum { + OCRDMA_ADDR_CHECK_ENABLE = 1, + OCRDMA_ADDR_CHECK_DISABLE = 0 +}; + +enum { + OCRDMA_ALLOC_LKEY_PD_ID_SHIFT = 0, + OCRDMA_ALLOC_LKEY_PD_ID_MASK = 0xFFFF, + + OCRDMA_ALLOC_LKEY_ADDR_CHECK_SHIFT = 0, + OCRDMA_ALLOC_LKEY_ADDR_CHECK_MASK = BIT(0), + OCRDMA_ALLOC_LKEY_FMR_SHIFT = 1, + OCRDMA_ALLOC_LKEY_FMR_MASK = BIT(1), + OCRDMA_ALLOC_LKEY_REMOTE_INV_SHIFT = 2, + OCRDMA_ALLOC_LKEY_REMOTE_INV_MASK = BIT(2), + OCRDMA_ALLOC_LKEY_REMOTE_WR_SHIFT = 3, + OCRDMA_ALLOC_LKEY_REMOTE_WR_MASK = BIT(3), + OCRDMA_ALLOC_LKEY_REMOTE_RD_SHIFT = 4, + OCRDMA_ALLOC_LKEY_REMOTE_RD_MASK = BIT(4), + OCRDMA_ALLOC_LKEY_LOCAL_WR_SHIFT = 5, + OCRDMA_ALLOC_LKEY_LOCAL_WR_MASK = BIT(5), + OCRDMA_ALLOC_LKEY_REMOTE_ATOMIC_MASK = BIT(6), + OCRDMA_ALLOC_LKEY_REMOTE_ATOMIC_SHIFT = 6, + OCRDMA_ALLOC_LKEY_PBL_SIZE_SHIFT = 16, + OCRDMA_ALLOC_LKEY_PBL_SIZE_MASK = 0xFFFF << + OCRDMA_ALLOC_LKEY_PBL_SIZE_SHIFT +}; + +struct ocrdma_alloc_lkey { + struct ocrdma_mqe_hdr hdr; + struct ocrdma_mbx_hdr req; + + u32 pdid; + u32 pbl_sz_flags; +}; + +struct ocrdma_alloc_lkey_rsp { + struct ocrdma_mqe_hdr hdr; + struct ocrdma_mbx_rsp rsp; + + u32 lrkey; + u32 num_pbl_rsvd; +}; + +struct ocrdma_dealloc_lkey { + struct ocrdma_mqe_hdr hdr; + struct ocrdma_mbx_hdr req; + + u32 lkey; + u32 rsvd_frmr; +}; + +struct ocrdma_dealloc_lkey_rsp { + struct ocrdma_mqe_hdr hdr; + struct ocrdma_mbx_rsp rsp; +}; + +#define MAX_OCRDMA_NSMR_PBL (u32)22 +#define MAX_OCRDMA_PBL_SIZE 65536 +#define MAX_OCRDMA_PBL_PER_LKEY 32767 + +enum { + OCRDMA_REG_NSMR_LRKEY_INDEX_SHIFT = 0, + OCRDMA_REG_NSMR_LRKEY_INDEX_MASK = 0xFFFFFF, + OCRDMA_REG_NSMR_LRKEY_SHIFT = 24, + OCRDMA_REG_NSMR_LRKEY_MASK = 0xFF << + OCRDMA_REG_NSMR_LRKEY_SHIFT, + + OCRDMA_REG_NSMR_PD_ID_SHIFT = 0, + OCRDMA_REG_NSMR_PD_ID_MASK = 0xFFFF, + OCRDMA_REG_NSMR_NUM_PBL_SHIFT = 16, + OCRDMA_REG_NSMR_NUM_PBL_MASK = 0xFFFF << + OCRDMA_REG_NSMR_NUM_PBL_SHIFT, + + OCRDMA_REG_NSMR_PBE_SIZE_SHIFT = 0, + OCRDMA_REG_NSMR_PBE_SIZE_MASK = 0xFFFF, + OCRDMA_REG_NSMR_HPAGE_SIZE_SHIFT = 16, + OCRDMA_REG_NSMR_HPAGE_SIZE_MASK = 0xFF << + OCRDMA_REG_NSMR_HPAGE_SIZE_SHIFT, + OCRDMA_REG_NSMR_BIND_MEMWIN_SHIFT = 24, + OCRDMA_REG_NSMR_BIND_MEMWIN_MASK = BIT(24), + OCRDMA_REG_NSMR_ZB_SHIFT = 25, + OCRDMA_REG_NSMR_ZB_SHIFT_MASK = BIT(25), + OCRDMA_REG_NSMR_REMOTE_INV_SHIFT = 26, + OCRDMA_REG_NSMR_REMOTE_INV_MASK = BIT(26), + OCRDMA_REG_NSMR_REMOTE_WR_SHIFT = 27, + OCRDMA_REG_NSMR_REMOTE_WR_MASK = BIT(27), + OCRDMA_REG_NSMR_REMOTE_RD_SHIFT = 28, + OCRDMA_REG_NSMR_REMOTE_RD_MASK = BIT(28), + OCRDMA_REG_NSMR_LOCAL_WR_SHIFT = 29, + 
OCRDMA_REG_NSMR_LOCAL_WR_MASK = BIT(29), + OCRDMA_REG_NSMR_REMOTE_ATOMIC_SHIFT = 30, + OCRDMA_REG_NSMR_REMOTE_ATOMIC_MASK = BIT(30), + OCRDMA_REG_NSMR_LAST_SHIFT = 31, + OCRDMA_REG_NSMR_LAST_MASK = BIT(31) +}; + +struct ocrdma_reg_nsmr { + struct ocrdma_mqe_hdr hdr; + struct ocrdma_mbx_hdr cmd; + + u32 fr_mr; + u32 num_pbl_pdid; + u32 flags_hpage_pbe_sz; + u32 totlen_low; + u32 totlen_high; + u32 fbo_low; + u32 fbo_high; + u32 va_loaddr; + u32 va_hiaddr; + struct ocrdma_pa pbl[MAX_OCRDMA_NSMR_PBL]; +}; + +enum { + OCRDMA_REG_NSMR_CONT_PBL_SHIFT = 0, + OCRDMA_REG_NSMR_CONT_PBL_SHIFT_MASK = 0xFFFF, + OCRDMA_REG_NSMR_CONT_NUM_PBL_SHIFT = 16, + OCRDMA_REG_NSMR_CONT_NUM_PBL_MASK = 0xFFFF << + OCRDMA_REG_NSMR_CONT_NUM_PBL_SHIFT, + + OCRDMA_REG_NSMR_CONT_LAST_SHIFT = 31, + OCRDMA_REG_NSMR_CONT_LAST_MASK = BIT(31) +}; + +struct ocrdma_reg_nsmr_cont { + struct ocrdma_mqe_hdr hdr; + struct ocrdma_mbx_hdr cmd; + + u32 lrkey; + u32 num_pbl_offset; + u32 last; + + struct ocrdma_pa pbl[MAX_OCRDMA_NSMR_PBL]; +}; + +struct ocrdma_pbe { + u32 pa_hi; + u32 pa_lo; +}; + +enum { + OCRDMA_REG_NSMR_RSP_NUM_PBL_SHIFT = 16, + OCRDMA_REG_NSMR_RSP_NUM_PBL_MASK = 0xFFFF0000 +}; +struct ocrdma_reg_nsmr_rsp { + struct ocrdma_mqe_hdr hdr; + struct ocrdma_mbx_rsp rsp; + + u32 lrkey; + u32 num_pbl; +}; + +enum { + OCRDMA_REG_NSMR_CONT_RSP_LRKEY_INDEX_SHIFT = 0, + OCRDMA_REG_NSMR_CONT_RSP_LRKEY_INDEX_MASK = 0xFFFFFF, + OCRDMA_REG_NSMR_CONT_RSP_LRKEY_SHIFT = 24, + OCRDMA_REG_NSMR_CONT_RSP_LRKEY_MASK = 0xFF << + OCRDMA_REG_NSMR_CONT_RSP_LRKEY_SHIFT, + + OCRDMA_REG_NSMR_CONT_RSP_NUM_PBL_SHIFT = 16, + OCRDMA_REG_NSMR_CONT_RSP_NUM_PBL_MASK = 0xFFFF << + OCRDMA_REG_NSMR_CONT_RSP_NUM_PBL_SHIFT +}; + +struct ocrdma_reg_nsmr_cont_rsp { + struct ocrdma_mqe_hdr hdr; + struct ocrdma_mbx_rsp rsp; + + u32 lrkey_key_index; + u32 num_pbl; +}; + +enum { + OCRDMA_ALLOC_MW_PD_ID_SHIFT = 0, + OCRDMA_ALLOC_MW_PD_ID_MASK = 0xFFFF +}; + +struct ocrdma_alloc_mw { + struct ocrdma_mqe_hdr hdr; + struct ocrdma_mbx_hdr req; + + u32 pdid; +}; + +enum { + OCRDMA_ALLOC_MW_RSP_LRKEY_INDEX_SHIFT = 0, + OCRDMA_ALLOC_MW_RSP_LRKEY_INDEX_MASK = 0xFFFFFF +}; + +struct ocrdma_alloc_mw_rsp { + struct ocrdma_mqe_hdr hdr; + struct ocrdma_mbx_rsp rsp; + + u32 lrkey_index; +}; + +struct ocrdma_attach_mcast { + struct ocrdma_mqe_hdr hdr; + struct ocrdma_mbx_hdr req; + u32 qp_id; + u8 mgid[16]; + u32 mac_b0_to_b3; + u32 vlan_mac_b4_to_b5; +}; + +struct ocrdma_attach_mcast_rsp { + struct ocrdma_mqe_hdr hdr; + struct ocrdma_mbx_rsp rsp; +}; + +struct ocrdma_detach_mcast { + struct ocrdma_mqe_hdr hdr; + struct ocrdma_mbx_hdr req; + u32 qp_id; + u8 mgid[16]; + u32 mac_b0_to_b3; + u32 vlan_mac_b4_to_b5; +}; + +struct ocrdma_detach_mcast_rsp { + struct ocrdma_mqe_hdr hdr; + struct ocrdma_mbx_rsp rsp; +}; + +enum { + OCRDMA_CREATE_AH_NUM_PAGES_SHIFT = 19, + OCRDMA_CREATE_AH_NUM_PAGES_MASK = 0xF << + OCRDMA_CREATE_AH_NUM_PAGES_SHIFT, + + OCRDMA_CREATE_AH_PAGE_SIZE_SHIFT = 16, + OCRDMA_CREATE_AH_PAGE_SIZE_MASK = 0x7 << + OCRDMA_CREATE_AH_PAGE_SIZE_SHIFT, + + OCRDMA_CREATE_AH_ENTRY_SIZE_SHIFT = 23, + OCRDMA_CREATE_AH_ENTRY_SIZE_MASK = 0x1FF << + OCRDMA_CREATE_AH_ENTRY_SIZE_SHIFT, +}; + +#define OCRDMA_AH_TBL_PAGES 8 + +struct ocrdma_create_ah_tbl { + struct ocrdma_mqe_hdr hdr; + struct ocrdma_mbx_hdr req; + + u32 ah_conf; + struct ocrdma_pa tbl_addr[8]; +}; + +struct ocrdma_create_ah_tbl_rsp { + struct ocrdma_mqe_hdr hdr; + struct ocrdma_mbx_rsp rsp; + u32 ahid; +}; + +struct ocrdma_delete_ah_tbl { + struct ocrdma_mqe_hdr hdr; + struct ocrdma_mbx_hdr req; + u32 ahid; +}; + 
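/*
 * Illustrative sketch only -- not part of the imported driver source.
 * Throughout this header the firmware mailbox words pack two 16-bit
 * fields into a single u32 via paired *_SHIFT/*_MASK constants (low
 * field in bits 0-15, high field in bits 16-31).  The hypothetical
 * helpers below show how such a word would typically be composed and
 * decoded, reusing the OCRDMA_CREATE_QP_RSP_MAX_* values defined above;
 * the function names themselves do not exist in the driver.
 */
static inline u32 ocrdma_example_pack_wqe_rqe(u16 max_wqe, u16 max_rqe)
{
	/* high half carries MAX_WQE, low half carries MAX_RQE */
	return ((u32)max_wqe << OCRDMA_CREATE_QP_RSP_MAX_WQE_SHIFT) |
	       ((u32)max_rqe & OCRDMA_CREATE_QP_RSP_MAX_RQE_MASK);
}

static inline u16 ocrdma_example_unpack_max_wqe(u32 max_wqe_rqe)
{
	/* isolate bits 16-31, then shift them back down to a 16-bit value */
	return (max_wqe_rqe & OCRDMA_CREATE_QP_RSP_MAX_WQE_MASK) >>
		OCRDMA_CREATE_QP_RSP_MAX_WQE_SHIFT;
}
/*
 * The same pattern applies to the other paired fields in this file
 * (max_sge_send_write, max_ord_ird, wq_rq_cqid, and so on): mask first,
 * then shift, so stale bits from the other half never leak through.
 */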
+struct ocrdma_delete_ah_tbl_rsp { + struct ocrdma_mqe_hdr hdr; + struct ocrdma_mbx_rsp rsp; +}; + +enum { + OCRDMA_EQE_VALID_SHIFT = 0, + OCRDMA_EQE_VALID_MASK = BIT(0), + OCRDMA_EQE_MAJOR_CODE_MASK = 0x0E, + OCRDMA_EQE_MAJOR_CODE_SHIFT = 0x01, + OCRDMA_EQE_FOR_CQE_MASK = 0xFFFE, + OCRDMA_EQE_RESOURCE_ID_SHIFT = 16, + OCRDMA_EQE_RESOURCE_ID_MASK = 0xFFFF << + OCRDMA_EQE_RESOURCE_ID_SHIFT, +}; + +enum major_code { + OCRDMA_MAJOR_CODE_COMPLETION = 0x00, + OCRDMA_MAJOR_CODE_SENTINAL = 0x01 +}; + +struct ocrdma_eqe { + u32 id_valid; +}; + +enum OCRDMA_CQE_STATUS { + OCRDMA_CQE_SUCCESS = 0, + OCRDMA_CQE_LOC_LEN_ERR, + OCRDMA_CQE_LOC_QP_OP_ERR, + OCRDMA_CQE_LOC_EEC_OP_ERR, + OCRDMA_CQE_LOC_PROT_ERR, + OCRDMA_CQE_WR_FLUSH_ERR, + OCRDMA_CQE_MW_BIND_ERR, + OCRDMA_CQE_BAD_RESP_ERR, + OCRDMA_CQE_LOC_ACCESS_ERR, + OCRDMA_CQE_REM_INV_REQ_ERR, + OCRDMA_CQE_REM_ACCESS_ERR, + OCRDMA_CQE_REM_OP_ERR, + OCRDMA_CQE_RETRY_EXC_ERR, + OCRDMA_CQE_RNR_RETRY_EXC_ERR, + OCRDMA_CQE_LOC_RDD_VIOL_ERR, + OCRDMA_CQE_REM_INV_RD_REQ_ERR, + OCRDMA_CQE_REM_ABORT_ERR, + OCRDMA_CQE_INV_EECN_ERR, + OCRDMA_CQE_INV_EEC_STATE_ERR, + OCRDMA_CQE_FATAL_ERR, + OCRDMA_CQE_RESP_TIMEOUT_ERR, + OCRDMA_CQE_GENERAL_ERR, + + OCRDMA_MAX_CQE_ERR +}; + +enum { + /* w0 */ + OCRDMA_CQE_WQEIDX_SHIFT = 0, + OCRDMA_CQE_WQEIDX_MASK = 0xFFFF, + + /* w1 */ + OCRDMA_CQE_UD_XFER_LEN_SHIFT = 16, + OCRDMA_CQE_PKEY_SHIFT = 0, + OCRDMA_CQE_PKEY_MASK = 0xFFFF, + + /* w2 */ + OCRDMA_CQE_QPN_SHIFT = 0, + OCRDMA_CQE_QPN_MASK = 0x0000FFFF, + + OCRDMA_CQE_BUFTAG_SHIFT = 16, + OCRDMA_CQE_BUFTAG_MASK = 0xFFFF << OCRDMA_CQE_BUFTAG_SHIFT, + + /* w3 */ + OCRDMA_CQE_UD_STATUS_SHIFT = 24, + OCRDMA_CQE_UD_STATUS_MASK = 0x7 << OCRDMA_CQE_UD_STATUS_SHIFT, + OCRDMA_CQE_STATUS_SHIFT = 16, + OCRDMA_CQE_STATUS_MASK = 0xFF << OCRDMA_CQE_STATUS_SHIFT, + OCRDMA_CQE_VALID = BIT(31), + OCRDMA_CQE_INVALIDATE = BIT(30), + OCRDMA_CQE_QTYPE = BIT(29), + OCRDMA_CQE_IMM = BIT(28), + OCRDMA_CQE_WRITE_IMM = BIT(27), + OCRDMA_CQE_QTYPE_SQ = 0, + OCRDMA_CQE_QTYPE_RQ = 1, + OCRDMA_CQE_SRCQP_MASK = 0xFFFFFF +}; + +struct ocrdma_cqe { + union { + /* w0 to w2 */ + struct { + u32 wqeidx; + u32 bytes_xfered; + u32 qpn; + } wq; + struct { + u32 lkey_immdt; + u32 rxlen; + u32 buftag_qpn; + } rq; + struct { + u32 lkey_immdt; + u32 rxlen_pkey; + u32 buftag_qpn; + } ud; + struct { + u32 word_0; + u32 word_1; + u32 qpn; + } cmn; + }; + u32 flags_status_srcqpn; /* w3 */ +}; + +struct ocrdma_sge { + u32 addr_hi; + u32 addr_lo; + u32 lrkey; + u32 len; +}; + +enum { + OCRDMA_FLAG_SIG = 0x1, + OCRDMA_FLAG_INV = 0x2, + OCRDMA_FLAG_FENCE_L = 0x4, + OCRDMA_FLAG_FENCE_R = 0x8, + OCRDMA_FLAG_SOLICIT = 0x10, + OCRDMA_FLAG_IMM = 0x20, + OCRDMA_FLAG_AH_VLAN_PR = 0x40, + + /* Stag flags */ + OCRDMA_LKEY_FLAG_LOCAL_WR = 0x1, + OCRDMA_LKEY_FLAG_REMOTE_RD = 0x2, + OCRDMA_LKEY_FLAG_REMOTE_WR = 0x4, + OCRDMA_LKEY_FLAG_VATO = 0x8, +}; + +enum OCRDMA_WQE_OPCODE { + OCRDMA_WRITE = 0x06, + OCRDMA_READ = 0x0C, + OCRDMA_RESV0 = 0x02, + OCRDMA_SEND = 0x00, + OCRDMA_CMP_SWP = 0x14, + OCRDMA_BIND_MW = 0x10, + OCRDMA_FR_MR = 0x11, + OCRDMA_RESV1 = 0x0A, + OCRDMA_LKEY_INV = 0x15, + OCRDMA_FETCH_ADD = 0x13, + OCRDMA_POST_RQ = 0x12 +}; + +enum { + OCRDMA_TYPE_INLINE = 0x0, + OCRDMA_TYPE_LKEY = 0x1, +}; + +enum { + OCRDMA_WQE_OPCODE_SHIFT = 0, + OCRDMA_WQE_OPCODE_MASK = 0x0000001F, + OCRDMA_WQE_FLAGS_SHIFT = 5, + OCRDMA_WQE_TYPE_SHIFT = 16, + OCRDMA_WQE_TYPE_MASK = 0x00030000, + OCRDMA_WQE_SIZE_SHIFT = 18, + OCRDMA_WQE_SIZE_MASK = 0xFF, + OCRDMA_WQE_NXT_WQE_SIZE_SHIFT = 25, + + OCRDMA_WQE_LKEY_FLAGS_SHIFT = 0, + 
OCRDMA_WQE_LKEY_FLAGS_MASK = 0xF +}; + +/* header WQE for all the SQ and RQ operations */ +struct ocrdma_hdr_wqe { + u32 cw; + union { + u32 rsvd_tag; + u32 rsvd_lkey_flags; + }; + union { + u32 immdt; + u32 lkey; + }; + u32 total_len; +}; + +struct ocrdma_ewqe_ud_hdr { + u32 rsvd_dest_qpn; + u32 qkey; + u32 rsvd_ahid; + u32 rsvd; +}; + +/* extended wqe followed by hdr_wqe for Fast Memory register */ +struct ocrdma_ewqe_fr { + u32 va_hi; + u32 va_lo; + u32 fbo_hi; + u32 fbo_lo; + u32 size_sge; + u32 num_sges; + u32 rsvd; + u32 rsvd2; +}; + +struct ocrdma_eth_basic { + u8 dmac[6]; + u8 smac[6]; + __be16 eth_type; +} __packed; + +struct ocrdma_eth_vlan { + u8 dmac[6]; + u8 smac[6]; + __be16 eth_type; + __be16 vlan_tag; +#define OCRDMA_ROCE_ETH_TYPE 0x8915 + __be16 roce_eth_type; +} __packed; + +struct ocrdma_grh { + __be32 tclass_flow; + __be32 pdid_hoplimit; + u8 sgid[16]; + u8 dgid[16]; + u16 rsvd; +} __packed; + +#define OCRDMA_AV_VALID BIT(7) +#define OCRDMA_AV_VLAN_VALID BIT(1) + +struct ocrdma_av { + struct ocrdma_eth_vlan eth_hdr; + struct ocrdma_grh grh; + u32 valid; +} __packed; + +struct ocrdma_rsrc_stats { + u32 dpp_pds; + u32 non_dpp_pds; + u32 rc_dpp_qps; + u32 uc_dpp_qps; + u32 ud_dpp_qps; + u32 rc_non_dpp_qps; + u32 rsvd; + u32 uc_non_dpp_qps; + u32 ud_non_dpp_qps; + u32 rsvd1; + u32 srqs; + u32 rbqs; + u32 r64K_nsmr; + u32 r64K_to_2M_nsmr; + u32 r2M_to_44M_nsmr; + u32 r44M_to_1G_nsmr; + u32 r1G_to_4G_nsmr; + u32 nsmr_count_4G_to_32G; + u32 r32G_to_64G_nsmr; + u32 r64G_to_128G_nsmr; + u32 r128G_to_higher_nsmr; + u32 embedded_nsmr; + u32 frmr; + u32 prefetch_qps; + u32 ondemand_qps; + u32 phy_mr; + u32 mw; + u32 rsvd2[7]; +}; + +struct ocrdma_db_err_stats { + u32 sq_doorbell_errors; + u32 cq_doorbell_errors; + u32 rq_srq_doorbell_errors; + u32 cq_overflow_errors; + u32 rsvd[4]; +}; + +struct ocrdma_wqe_stats { + u32 large_send_rc_wqes_lo; + u32 large_send_rc_wqes_hi; + u32 large_write_rc_wqes_lo; + u32 large_write_rc_wqes_hi; + u32 rsvd[4]; + u32 read_wqes_lo; + u32 read_wqes_hi; + u32 frmr_wqes_lo; + u32 frmr_wqes_hi; + u32 mw_bind_wqes_lo; + u32 mw_bind_wqes_hi; + u32 invalidate_wqes_lo; + u32 invalidate_wqes_hi; + u32 rsvd1[2]; + u32 dpp_wqe_drops; + u32 rsvd2[5]; +}; + +struct ocrdma_tx_stats { + u32 send_pkts_lo; + u32 send_pkts_hi; + u32 write_pkts_lo; + u32 write_pkts_hi; + u32 read_pkts_lo; + u32 read_pkts_hi; + u32 read_rsp_pkts_lo; + u32 read_rsp_pkts_hi; + u32 ack_pkts_lo; + u32 ack_pkts_hi; + u32 send_bytes_lo; + u32 send_bytes_hi; + u32 write_bytes_lo; + u32 write_bytes_hi; + u32 read_req_bytes_lo; + u32 read_req_bytes_hi; + u32 read_rsp_bytes_lo; + u32 read_rsp_bytes_hi; + u32 ack_timeouts; + u32 rsvd[5]; +}; + + +struct ocrdma_tx_qp_err_stats { + u32 local_length_errors; + u32 local_protection_errors; + u32 local_qp_operation_errors; + u32 retry_count_exceeded_errors; + u32 rnr_retry_count_exceeded_errors; + u32 rsvd[3]; +}; + +struct ocrdma_rx_stats { + u32 roce_frame_bytes_lo; + u32 roce_frame_bytes_hi; + u32 roce_frame_icrc_drops; + u32 roce_frame_payload_len_drops; + u32 ud_drops; + u32 qp1_drops; + u32 psn_error_request_packets; + u32 psn_error_resp_packets; + u32 rnr_nak_timeouts; + u32 rnr_nak_receives; + u32 roce_frame_rxmt_drops; + u32 nak_count_psn_sequence_errors; + u32 rc_drop_count_lookup_errors; + u32 rq_rnr_naks; + u32 srq_rnr_naks; + u32 roce_frames_lo; + u32 roce_frames_hi; + u32 rsvd; +}; + +struct ocrdma_rx_qp_err_stats { + u32 nak_invalid_requst_errors; + u32 nak_remote_operation_errors; + u32 nak_count_remote_access_errors; + u32 
local_length_errors; + u32 local_protection_errors; + u32 local_qp_operation_errors; + u32 rsvd[2]; +}; + +struct ocrdma_tx_dbg_stats { + u32 data[100]; +}; + +struct ocrdma_rx_dbg_stats { + u32 data[200]; +}; + +struct ocrdma_rdma_stats_req { + struct ocrdma_mbx_hdr hdr; + u8 reset_stats; + u8 rsvd[3]; +} __packed; + +struct ocrdma_rdma_stats_resp { + struct ocrdma_mbx_hdr hdr; + struct ocrdma_rsrc_stats act_rsrc_stats; + struct ocrdma_rsrc_stats th_rsrc_stats; + struct ocrdma_db_err_stats db_err_stats; + struct ocrdma_wqe_stats wqe_stats; + struct ocrdma_tx_stats tx_stats; + struct ocrdma_tx_qp_err_stats tx_qp_err_stats; + struct ocrdma_rx_stats rx_stats; + struct ocrdma_rx_qp_err_stats rx_qp_err_stats; + struct ocrdma_tx_dbg_stats tx_dbg_stats; + struct ocrdma_rx_dbg_stats rx_dbg_stats; +} __packed; + +enum { + OCRDMA_HBA_ATTRB_EPROM_VER_LO_MASK = 0xFF, + OCRDMA_HBA_ATTRB_EPROM_VER_HI_MASK = 0xFF00, + OCRDMA_HBA_ATTRB_EPROM_VER_HI_SHIFT = 0x08, + OCRDMA_HBA_ATTRB_CDBLEN_MASK = 0xFFFF, + OCRDMA_HBA_ATTRB_ASIC_REV_MASK = 0xFF0000, + OCRDMA_HBA_ATTRB_ASIC_REV_SHIFT = 0x10, + OCRDMA_HBA_ATTRB_GUID0_MASK = 0xFF000000, + OCRDMA_HBA_ATTRB_GUID0_SHIFT = 0x18, + OCRDMA_HBA_ATTRB_GUID13_MASK = 0xFF, + OCRDMA_HBA_ATTRB_GUID14_MASK = 0xFF00, + OCRDMA_HBA_ATTRB_GUID14_SHIFT = 0x08, + OCRDMA_HBA_ATTRB_GUID15_MASK = 0xFF0000, + OCRDMA_HBA_ATTRB_GUID15_SHIFT = 0x10, + OCRDMA_HBA_ATTRB_PCNT_MASK = 0xFF000000, + OCRDMA_HBA_ATTRB_PCNT_SHIFT = 0x18, + OCRDMA_HBA_ATTRB_LDTOUT_MASK = 0xFFFF, + OCRDMA_HBA_ATTRB_ISCSI_VER_MASK = 0xFF0000, + OCRDMA_HBA_ATTRB_ISCSI_VER_SHIFT = 0x10, + OCRDMA_HBA_ATTRB_MFUNC_DEV_MASK = 0xFF000000, + OCRDMA_HBA_ATTRB_MFUNC_DEV_SHIFT = 0x18, + OCRDMA_HBA_ATTRB_CV_MASK = 0xFF, + OCRDMA_HBA_ATTRB_HBA_ST_MASK = 0xFF00, + OCRDMA_HBA_ATTRB_HBA_ST_SHIFT = 0x08, + OCRDMA_HBA_ATTRB_MAX_DOMS_MASK = 0xFF0000, + OCRDMA_HBA_ATTRB_MAX_DOMS_SHIFT = 0x10, + OCRDMA_HBA_ATTRB_PTNUM_MASK = 0x3F000000, + OCRDMA_HBA_ATTRB_PTNUM_SHIFT = 0x18, + OCRDMA_HBA_ATTRB_PT_MASK = 0xC0000000, + OCRDMA_HBA_ATTRB_PT_SHIFT = 0x1E, + OCRDMA_HBA_ATTRB_ISCSI_FET_MASK = 0xFF, + OCRDMA_HBA_ATTRB_ASIC_GEN_MASK = 0xFF00, + OCRDMA_HBA_ATTRB_ASIC_GEN_SHIFT = 0x08, + OCRDMA_HBA_ATTRB_PCI_VID_MASK = 0xFFFF, + OCRDMA_HBA_ATTRB_PCI_DID_MASK = 0xFFFF0000, + OCRDMA_HBA_ATTRB_PCI_DID_SHIFT = 0x10, + OCRDMA_HBA_ATTRB_PCI_SVID_MASK = 0xFFFF, + OCRDMA_HBA_ATTRB_PCI_SSID_MASK = 0xFFFF0000, + OCRDMA_HBA_ATTRB_PCI_SSID_SHIFT = 0x10, + OCRDMA_HBA_ATTRB_PCI_BUSNUM_MASK = 0xFF, + OCRDMA_HBA_ATTRB_PCI_DEVNUM_MASK = 0xFF00, + OCRDMA_HBA_ATTRB_PCI_DEVNUM_SHIFT = 0x08, + OCRDMA_HBA_ATTRB_PCI_FUNCNUM_MASK = 0xFF0000, + OCRDMA_HBA_ATTRB_PCI_FUNCNUM_SHIFT = 0x10, + OCRDMA_HBA_ATTRB_IF_TYPE_MASK = 0xFF000000, + OCRDMA_HBA_ATTRB_IF_TYPE_SHIFT = 0x18, + OCRDMA_HBA_ATTRB_NETFIL_MASK =0xFF +}; + +struct mgmt_hba_attribs { + u8 flashrom_version_string[32]; + u8 manufacturer_name[32]; + u32 supported_modes; + u32 rsvd_eprom_verhi_verlo; + u32 mbx_ds_ver; + u32 epfw_ds_ver; + u8 ncsi_ver_string[12]; + u32 default_extended_timeout; + u8 controller_model_number[32]; + u8 controller_description[64]; + u8 controller_serial_number[32]; + u8 ip_version_string[32]; + u8 firmware_version_string[32]; + u8 bios_version_string[32]; + u8 redboot_version_string[32]; + u8 driver_version_string[32]; + u8 fw_on_flash_version_string[32]; + u32 functionalities_supported; + u32 guid0_asicrev_cdblen; + u8 generational_guid[12]; + u32 portcnt_guid15; + u32 mfuncdev_iscsi_ldtout; + u32 ptpnum_maxdoms_hbast_cv; + u32 firmware_post_status; + u32 hba_mtu[8]; + u32 
res_asicgen_iscsi_feaures; + u32 rsvd1[3]; +}; + +struct mgmt_controller_attrib { + struct mgmt_hba_attribs hba_attribs; + u32 pci_did_vid; + u32 pci_ssid_svid; + u32 ityp_fnum_devnum_bnum; + u32 uid_hi; + u32 uid_lo; + u32 res_nnetfil; + u32 rsvd0[4]; +}; + +struct ocrdma_get_ctrl_attribs_rsp { + struct ocrdma_mbx_hdr hdr; + struct mgmt_controller_attrib ctrl_attribs; +}; + +#define OCRDMA_SUBSYS_DCBX 0x10 + +enum OCRDMA_DCBX_OPCODE { + OCRDMA_CMD_GET_DCBX_CONFIG = 0x01 +}; + +enum OCRDMA_DCBX_PARAM_TYPE { + OCRDMA_PARAMETER_TYPE_ADMIN = 0x00, + OCRDMA_PARAMETER_TYPE_OPER = 0x01, + OCRDMA_PARAMETER_TYPE_PEER = 0x02 +}; + +enum OCRDMA_DCBX_APP_PROTO { + OCRDMA_APP_PROTO_ROCE = 0x8915 +}; + +enum OCRDMA_DCBX_PROTO { + OCRDMA_PROTO_SELECT_L2 = 0x00, + OCRDMA_PROTO_SELECT_L4 = 0x01 +}; + +enum OCRDMA_DCBX_APP_PARAM { + OCRDMA_APP_PARAM_APP_PROTO_MASK = 0xFFFF, + OCRDMA_APP_PARAM_PROTO_SEL_MASK = 0xFF, + OCRDMA_APP_PARAM_PROTO_SEL_SHIFT = 0x10, + OCRDMA_APP_PARAM_VALID_MASK = 0xFF, + OCRDMA_APP_PARAM_VALID_SHIFT = 0x18 +}; + +enum OCRDMA_DCBX_STATE_FLAGS { + OCRDMA_STATE_FLAG_ENABLED = 0x01, + OCRDMA_STATE_FLAG_ADDVERTISED = 0x02, + OCRDMA_STATE_FLAG_WILLING = 0x04, + OCRDMA_STATE_FLAG_SYNC = 0x08, + OCRDMA_STATE_FLAG_UNSUPPORTED = 0x40000000, + OCRDMA_STATE_FLAG_NEG_FAILD = 0x80000000 +}; + +enum OCRDMA_TCV_AEV_OPV_ST { + OCRDMA_DCBX_TC_SUPPORT_MASK = 0xFF, + OCRDMA_DCBX_TC_SUPPORT_SHIFT = 0x18, + OCRDMA_DCBX_APP_ENTRY_SHIFT = 0x10, + OCRDMA_DCBX_OP_PARAM_SHIFT = 0x08, + OCRDMA_DCBX_STATE_MASK = 0xFF +}; + +struct ocrdma_app_parameter { + u32 valid_proto_app; + u32 oui; + u32 app_prio[2]; +}; + +struct ocrdma_dcbx_cfg { + u32 tcv_aev_opv_st; + u32 tc_state; + u32 pfc_state; + u32 qcn_state; + u32 appl_state; + u32 ll_state; + u32 tc_bw[2]; + u32 tc_prio[8]; + u32 pfc_prio[2]; + struct ocrdma_app_parameter app_param[15]; +}; + +struct ocrdma_get_dcbx_cfg_req { + struct ocrdma_mbx_hdr hdr; + u32 param_type; +} __packed; + +struct ocrdma_get_dcbx_cfg_rsp { + struct ocrdma_mbx_rsp hdr; + struct ocrdma_dcbx_cfg cfg; +} __packed; + +#endif /* __OCRDMA_SLI_H__ */ diff --git a/kernel/drivers/infiniband/hw/ocrdma/ocrdma_stats.c b/kernel/drivers/infiniband/hw/ocrdma/ocrdma_stats.c new file mode 100644 index 000000000..48d7ef51a --- /dev/null +++ b/kernel/drivers/infiniband/hw/ocrdma/ocrdma_stats.c @@ -0,0 +1,857 @@ +/******************************************************************* + * This file is part of the Emulex RoCE Device Driver for * + * RoCE (RDMA over Converged Ethernet) adapters. * + * Copyright (C) 2008-2014 Emulex. All rights reserved. * + * EMULEX and SLI are trademarks of Emulex. * + * www.emulex.com * + * * + * This program is free software; you can redistribute it and/or * + * modify it under the terms of version 2 of the GNU General * + * Public License as published by the Free Software Foundation. * + * This program is distributed in the hope that it will be useful. * + * ALL EXPRESS OR IMPLIED CONDITIONS, REPRESENTATIONS AND * + * WARRANTIES, INCLUDING ANY IMPLIED WARRANTY OF MERCHANTABILITY, * + * FITNESS FOR A PARTICULAR PURPOSE, OR NON-INFRINGEMENT, ARE * + * DISCLAIMED, EXCEPT TO THE EXTENT THAT SUCH DISCLAIMERS ARE HELD * + * TO BE LEGALLY INVALID. See the GNU General Public License for * + * more details, a copy of which can be found in the file COPYING * + * included with this package. 
* + * + * Contact Information: + * linux-drivers@emulex.com + * + * Emulex + * 3333 Susan Street + * Costa Mesa, CA 92626 + *******************************************************************/ + +#include +#include +#include "ocrdma_stats.h" + +static struct dentry *ocrdma_dbgfs_dir; + +static int ocrdma_add_stat(char *start, char *pcur, + char *name, u64 count) +{ + char buff[128] = {0}; + int cpy_len = 0; + + snprintf(buff, 128, "%s: %llu\n", name, count); + cpy_len = strlen(buff); + + if (pcur + cpy_len > start + OCRDMA_MAX_DBGFS_MEM) { + pr_err("%s: No space in stats buff\n", __func__); + return 0; + } + + memcpy(pcur, buff, cpy_len); + return cpy_len; +} + +static bool ocrdma_alloc_stats_mem(struct ocrdma_dev *dev) +{ + struct stats_mem *mem = &dev->stats_mem; + + /* Alloc mbox command mem*/ + mem->size = max_t(u32, sizeof(struct ocrdma_rdma_stats_req), + sizeof(struct ocrdma_rdma_stats_resp)); + + mem->va = dma_alloc_coherent(&dev->nic_info.pdev->dev, mem->size, + &mem->pa, GFP_KERNEL); + if (!mem->va) { + pr_err("%s: stats mbox allocation failed\n", __func__); + return false; + } + + memset(mem->va, 0, mem->size); + + /* Alloc debugfs mem */ + mem->debugfs_mem = kzalloc(OCRDMA_MAX_DBGFS_MEM, GFP_KERNEL); + if (!mem->debugfs_mem) { + pr_err("%s: stats debugfs mem allocation failed\n", __func__); + return false; + } + + return true; +} + +static void ocrdma_release_stats_mem(struct ocrdma_dev *dev) +{ + struct stats_mem *mem = &dev->stats_mem; + + if (mem->va) + dma_free_coherent(&dev->nic_info.pdev->dev, mem->size, + mem->va, mem->pa); + kfree(mem->debugfs_mem); +} + +static char *ocrdma_resource_stats(struct ocrdma_dev *dev) +{ + char *stats = dev->stats_mem.debugfs_mem, *pcur; + struct ocrdma_rdma_stats_resp *rdma_stats = + (struct ocrdma_rdma_stats_resp *)dev->stats_mem.va; + struct ocrdma_rsrc_stats *rsrc_stats = &rdma_stats->act_rsrc_stats; + + memset(stats, 0, (OCRDMA_MAX_DBGFS_MEM)); + + pcur = stats; + pcur += ocrdma_add_stat(stats, pcur, "active_dpp_pds", + (u64)rsrc_stats->dpp_pds); + pcur += ocrdma_add_stat(stats, pcur, "active_non_dpp_pds", + (u64)rsrc_stats->non_dpp_pds); + pcur += ocrdma_add_stat(stats, pcur, "active_rc_dpp_qps", + (u64)rsrc_stats->rc_dpp_qps); + pcur += ocrdma_add_stat(stats, pcur, "active_uc_dpp_qps", + (u64)rsrc_stats->uc_dpp_qps); + pcur += ocrdma_add_stat(stats, pcur, "active_ud_dpp_qps", + (u64)rsrc_stats->ud_dpp_qps); + pcur += ocrdma_add_stat(stats, pcur, "active_rc_non_dpp_qps", + (u64)rsrc_stats->rc_non_dpp_qps); + pcur += ocrdma_add_stat(stats, pcur, "active_uc_non_dpp_qps", + (u64)rsrc_stats->uc_non_dpp_qps); + pcur += ocrdma_add_stat(stats, pcur, "active_ud_non_dpp_qps", + (u64)rsrc_stats->ud_non_dpp_qps); + pcur += ocrdma_add_stat(stats, pcur, "active_srqs", + (u64)rsrc_stats->srqs); + pcur += ocrdma_add_stat(stats, pcur, "active_rbqs", + (u64)rsrc_stats->rbqs); + pcur += ocrdma_add_stat(stats, pcur, "active_64K_nsmr", + (u64)rsrc_stats->r64K_nsmr); + pcur += ocrdma_add_stat(stats, pcur, "active_64K_to_2M_nsmr", + (u64)rsrc_stats->r64K_to_2M_nsmr); + pcur += ocrdma_add_stat(stats, pcur, "active_2M_to_44M_nsmr", + (u64)rsrc_stats->r2M_to_44M_nsmr); + pcur += ocrdma_add_stat(stats, pcur, "active_44M_to_1G_nsmr", + (u64)rsrc_stats->r44M_to_1G_nsmr); + pcur += ocrdma_add_stat(stats, pcur, "active_1G_to_4G_nsmr", + (u64)rsrc_stats->r1G_to_4G_nsmr); + pcur += ocrdma_add_stat(stats, pcur, "active_nsmr_count_4G_to_32G", + (u64)rsrc_stats->nsmr_count_4G_to_32G); + pcur += ocrdma_add_stat(stats, pcur, "active_32G_to_64G_nsmr", + 
(u64)rsrc_stats->r32G_to_64G_nsmr); + pcur += ocrdma_add_stat(stats, pcur, "active_64G_to_128G_nsmr", + (u64)rsrc_stats->r64G_to_128G_nsmr); + pcur += ocrdma_add_stat(stats, pcur, "active_128G_to_higher_nsmr", + (u64)rsrc_stats->r128G_to_higher_nsmr); + pcur += ocrdma_add_stat(stats, pcur, "active_embedded_nsmr", + (u64)rsrc_stats->embedded_nsmr); + pcur += ocrdma_add_stat(stats, pcur, "active_frmr", + (u64)rsrc_stats->frmr); + pcur += ocrdma_add_stat(stats, pcur, "active_prefetch_qps", + (u64)rsrc_stats->prefetch_qps); + pcur += ocrdma_add_stat(stats, pcur, "active_ondemand_qps", + (u64)rsrc_stats->ondemand_qps); + pcur += ocrdma_add_stat(stats, pcur, "active_phy_mr", + (u64)rsrc_stats->phy_mr); + pcur += ocrdma_add_stat(stats, pcur, "active_mw", + (u64)rsrc_stats->mw); + + /* Print the threshold stats */ + rsrc_stats = &rdma_stats->th_rsrc_stats; + + pcur += ocrdma_add_stat(stats, pcur, "threshold_dpp_pds", + (u64)rsrc_stats->dpp_pds); + pcur += ocrdma_add_stat(stats, pcur, "threshold_non_dpp_pds", + (u64)rsrc_stats->non_dpp_pds); + pcur += ocrdma_add_stat(stats, pcur, "threshold_rc_dpp_qps", + (u64)rsrc_stats->rc_dpp_qps); + pcur += ocrdma_add_stat(stats, pcur, "threshold_uc_dpp_qps", + (u64)rsrc_stats->uc_dpp_qps); + pcur += ocrdma_add_stat(stats, pcur, "threshold_ud_dpp_qps", + (u64)rsrc_stats->ud_dpp_qps); + pcur += ocrdma_add_stat(stats, pcur, "threshold_rc_non_dpp_qps", + (u64)rsrc_stats->rc_non_dpp_qps); + pcur += ocrdma_add_stat(stats, pcur, "threshold_uc_non_dpp_qps", + (u64)rsrc_stats->uc_non_dpp_qps); + pcur += ocrdma_add_stat(stats, pcur, "threshold_ud_non_dpp_qps", + (u64)rsrc_stats->ud_non_dpp_qps); + pcur += ocrdma_add_stat(stats, pcur, "threshold_srqs", + (u64)rsrc_stats->srqs); + pcur += ocrdma_add_stat(stats, pcur, "threshold_rbqs", + (u64)rsrc_stats->rbqs); + pcur += ocrdma_add_stat(stats, pcur, "threshold_64K_nsmr", + (u64)rsrc_stats->r64K_nsmr); + pcur += ocrdma_add_stat(stats, pcur, "threshold_64K_to_2M_nsmr", + (u64)rsrc_stats->r64K_to_2M_nsmr); + pcur += ocrdma_add_stat(stats, pcur, "threshold_2M_to_44M_nsmr", + (u64)rsrc_stats->r2M_to_44M_nsmr); + pcur += ocrdma_add_stat(stats, pcur, "threshold_44M_to_1G_nsmr", + (u64)rsrc_stats->r44M_to_1G_nsmr); + pcur += ocrdma_add_stat(stats, pcur, "threshold_1G_to_4G_nsmr", + (u64)rsrc_stats->r1G_to_4G_nsmr); + pcur += ocrdma_add_stat(stats, pcur, "threshold_nsmr_count_4G_to_32G", + (u64)rsrc_stats->nsmr_count_4G_to_32G); + pcur += ocrdma_add_stat(stats, pcur, "threshold_32G_to_64G_nsmr", + (u64)rsrc_stats->r32G_to_64G_nsmr); + pcur += ocrdma_add_stat(stats, pcur, "threshold_64G_to_128G_nsmr", + (u64)rsrc_stats->r64G_to_128G_nsmr); + pcur += ocrdma_add_stat(stats, pcur, "threshold_128G_to_higher_nsmr", + (u64)rsrc_stats->r128G_to_higher_nsmr); + pcur += ocrdma_add_stat(stats, pcur, "threshold_embedded_nsmr", + (u64)rsrc_stats->embedded_nsmr); + pcur += ocrdma_add_stat(stats, pcur, "threshold_frmr", + (u64)rsrc_stats->frmr); + pcur += ocrdma_add_stat(stats, pcur, "threshold_prefetch_qps", + (u64)rsrc_stats->prefetch_qps); + pcur += ocrdma_add_stat(stats, pcur, "threshold_ondemand_qps", + (u64)rsrc_stats->ondemand_qps); + pcur += ocrdma_add_stat(stats, pcur, "threshold_phy_mr", + (u64)rsrc_stats->phy_mr); + pcur += ocrdma_add_stat(stats, pcur, "threshold_mw", + (u64)rsrc_stats->mw); + return stats; +} + +static char *ocrdma_rx_stats(struct ocrdma_dev *dev) +{ + char *stats = dev->stats_mem.debugfs_mem, *pcur; + struct ocrdma_rdma_stats_resp *rdma_stats = + (struct ocrdma_rdma_stats_resp *)dev->stats_mem.va; + struct 
ocrdma_rx_stats *rx_stats = &rdma_stats->rx_stats; + + memset(stats, 0, (OCRDMA_MAX_DBGFS_MEM)); + + pcur = stats; + pcur += ocrdma_add_stat + (stats, pcur, "roce_frame_bytes", + convert_to_64bit(rx_stats->roce_frame_bytes_lo, + rx_stats->roce_frame_bytes_hi)); + pcur += ocrdma_add_stat(stats, pcur, "roce_frame_icrc_drops", + (u64)rx_stats->roce_frame_icrc_drops); + pcur += ocrdma_add_stat(stats, pcur, "roce_frame_payload_len_drops", + (u64)rx_stats->roce_frame_payload_len_drops); + pcur += ocrdma_add_stat(stats, pcur, "ud_drops", + (u64)rx_stats->ud_drops); + pcur += ocrdma_add_stat(stats, pcur, "qp1_drops", + (u64)rx_stats->qp1_drops); + pcur += ocrdma_add_stat(stats, pcur, "psn_error_request_packets", + (u64)rx_stats->psn_error_request_packets); + pcur += ocrdma_add_stat(stats, pcur, "psn_error_resp_packets", + (u64)rx_stats->psn_error_resp_packets); + pcur += ocrdma_add_stat(stats, pcur, "rnr_nak_timeouts", + (u64)rx_stats->rnr_nak_timeouts); + pcur += ocrdma_add_stat(stats, pcur, "rnr_nak_receives", + (u64)rx_stats->rnr_nak_receives); + pcur += ocrdma_add_stat(stats, pcur, "roce_frame_rxmt_drops", + (u64)rx_stats->roce_frame_rxmt_drops); + pcur += ocrdma_add_stat(stats, pcur, "nak_count_psn_sequence_errors", + (u64)rx_stats->nak_count_psn_sequence_errors); + pcur += ocrdma_add_stat(stats, pcur, "rc_drop_count_lookup_errors", + (u64)rx_stats->rc_drop_count_lookup_errors); + pcur += ocrdma_add_stat(stats, pcur, "rq_rnr_naks", + (u64)rx_stats->rq_rnr_naks); + pcur += ocrdma_add_stat(stats, pcur, "srq_rnr_naks", + (u64)rx_stats->srq_rnr_naks); + pcur += ocrdma_add_stat(stats, pcur, "roce_frames", + convert_to_64bit(rx_stats->roce_frames_lo, + rx_stats->roce_frames_hi)); + + return stats; +} + +static u64 ocrdma_sysfs_rcv_pkts(struct ocrdma_dev *dev) +{ + struct ocrdma_rdma_stats_resp *rdma_stats = + (struct ocrdma_rdma_stats_resp *)dev->stats_mem.va; + struct ocrdma_rx_stats *rx_stats = &rdma_stats->rx_stats; + + return convert_to_64bit(rx_stats->roce_frames_lo, + rx_stats->roce_frames_hi) + (u64)rx_stats->roce_frame_icrc_drops + + (u64)rx_stats->roce_frame_payload_len_drops; +} + +static u64 ocrdma_sysfs_rcv_data(struct ocrdma_dev *dev) +{ + struct ocrdma_rdma_stats_resp *rdma_stats = + (struct ocrdma_rdma_stats_resp *)dev->stats_mem.va; + struct ocrdma_rx_stats *rx_stats = &rdma_stats->rx_stats; + + return (convert_to_64bit(rx_stats->roce_frame_bytes_lo, + rx_stats->roce_frame_bytes_hi))/4; +} + +static char *ocrdma_tx_stats(struct ocrdma_dev *dev) +{ + char *stats = dev->stats_mem.debugfs_mem, *pcur; + struct ocrdma_rdma_stats_resp *rdma_stats = + (struct ocrdma_rdma_stats_resp *)dev->stats_mem.va; + struct ocrdma_tx_stats *tx_stats = &rdma_stats->tx_stats; + + memset(stats, 0, (OCRDMA_MAX_DBGFS_MEM)); + + pcur = stats; + pcur += ocrdma_add_stat(stats, pcur, "send_pkts", + convert_to_64bit(tx_stats->send_pkts_lo, + tx_stats->send_pkts_hi)); + pcur += ocrdma_add_stat(stats, pcur, "write_pkts", + convert_to_64bit(tx_stats->write_pkts_lo, + tx_stats->write_pkts_hi)); + pcur += ocrdma_add_stat(stats, pcur, "read_pkts", + convert_to_64bit(tx_stats->read_pkts_lo, + tx_stats->read_pkts_hi)); + pcur += ocrdma_add_stat(stats, pcur, "read_rsp_pkts", + convert_to_64bit(tx_stats->read_rsp_pkts_lo, + tx_stats->read_rsp_pkts_hi)); + pcur += ocrdma_add_stat(stats, pcur, "ack_pkts", + convert_to_64bit(tx_stats->ack_pkts_lo, + tx_stats->ack_pkts_hi)); + pcur += ocrdma_add_stat(stats, pcur, "send_bytes", + convert_to_64bit(tx_stats->send_bytes_lo, + tx_stats->send_bytes_hi)); + pcur += 
ocrdma_add_stat(stats, pcur, "write_bytes", + convert_to_64bit(tx_stats->write_bytes_lo, + tx_stats->write_bytes_hi)); + pcur += ocrdma_add_stat(stats, pcur, "read_req_bytes", + convert_to_64bit(tx_stats->read_req_bytes_lo, + tx_stats->read_req_bytes_hi)); + pcur += ocrdma_add_stat(stats, pcur, "read_rsp_bytes", + convert_to_64bit(tx_stats->read_rsp_bytes_lo, + tx_stats->read_rsp_bytes_hi)); + pcur += ocrdma_add_stat(stats, pcur, "ack_timeouts", + (u64)tx_stats->ack_timeouts); + + return stats; +} + +static u64 ocrdma_sysfs_xmit_pkts(struct ocrdma_dev *dev) +{ + struct ocrdma_rdma_stats_resp *rdma_stats = + (struct ocrdma_rdma_stats_resp *)dev->stats_mem.va; + struct ocrdma_tx_stats *tx_stats = &rdma_stats->tx_stats; + + return (convert_to_64bit(tx_stats->send_pkts_lo, + tx_stats->send_pkts_hi) + + convert_to_64bit(tx_stats->write_pkts_lo, tx_stats->write_pkts_hi) + + convert_to_64bit(tx_stats->read_pkts_lo, tx_stats->read_pkts_hi) + + convert_to_64bit(tx_stats->read_rsp_pkts_lo, + tx_stats->read_rsp_pkts_hi) + + convert_to_64bit(tx_stats->ack_pkts_lo, tx_stats->ack_pkts_hi)); +} + +static u64 ocrdma_sysfs_xmit_data(struct ocrdma_dev *dev) +{ + struct ocrdma_rdma_stats_resp *rdma_stats = + (struct ocrdma_rdma_stats_resp *)dev->stats_mem.va; + struct ocrdma_tx_stats *tx_stats = &rdma_stats->tx_stats; + + return (convert_to_64bit(tx_stats->send_bytes_lo, + tx_stats->send_bytes_hi) + + convert_to_64bit(tx_stats->write_bytes_lo, + tx_stats->write_bytes_hi) + + convert_to_64bit(tx_stats->read_req_bytes_lo, + tx_stats->read_req_bytes_hi) + + convert_to_64bit(tx_stats->read_rsp_bytes_lo, + tx_stats->read_rsp_bytes_hi))/4; +} + +static char *ocrdma_wqe_stats(struct ocrdma_dev *dev) +{ + char *stats = dev->stats_mem.debugfs_mem, *pcur; + struct ocrdma_rdma_stats_resp *rdma_stats = + (struct ocrdma_rdma_stats_resp *)dev->stats_mem.va; + struct ocrdma_wqe_stats *wqe_stats = &rdma_stats->wqe_stats; + + memset(stats, 0, (OCRDMA_MAX_DBGFS_MEM)); + + pcur = stats; + pcur += ocrdma_add_stat(stats, pcur, "large_send_rc_wqes", + convert_to_64bit(wqe_stats->large_send_rc_wqes_lo, + wqe_stats->large_send_rc_wqes_hi)); + pcur += ocrdma_add_stat(stats, pcur, "large_write_rc_wqes", + convert_to_64bit(wqe_stats->large_write_rc_wqes_lo, + wqe_stats->large_write_rc_wqes_hi)); + pcur += ocrdma_add_stat(stats, pcur, "read_wqes", + convert_to_64bit(wqe_stats->read_wqes_lo, + wqe_stats->read_wqes_hi)); + pcur += ocrdma_add_stat(stats, pcur, "frmr_wqes", + convert_to_64bit(wqe_stats->frmr_wqes_lo, + wqe_stats->frmr_wqes_hi)); + pcur += ocrdma_add_stat(stats, pcur, "mw_bind_wqes", + convert_to_64bit(wqe_stats->mw_bind_wqes_lo, + wqe_stats->mw_bind_wqes_hi)); + pcur += ocrdma_add_stat(stats, pcur, "invalidate_wqes", + convert_to_64bit(wqe_stats->invalidate_wqes_lo, + wqe_stats->invalidate_wqes_hi)); + pcur += ocrdma_add_stat(stats, pcur, "dpp_wqe_drops", + (u64)wqe_stats->dpp_wqe_drops); + return stats; +} + +static char *ocrdma_db_errstats(struct ocrdma_dev *dev) +{ + char *stats = dev->stats_mem.debugfs_mem, *pcur; + struct ocrdma_rdma_stats_resp *rdma_stats = + (struct ocrdma_rdma_stats_resp *)dev->stats_mem.va; + struct ocrdma_db_err_stats *db_err_stats = &rdma_stats->db_err_stats; + + memset(stats, 0, (OCRDMA_MAX_DBGFS_MEM)); + + pcur = stats; + pcur += ocrdma_add_stat(stats, pcur, "sq_doorbell_errors", + (u64)db_err_stats->sq_doorbell_errors); + pcur += ocrdma_add_stat(stats, pcur, "cq_doorbell_errors", + (u64)db_err_stats->cq_doorbell_errors); + pcur += ocrdma_add_stat(stats, pcur, "rq_srq_doorbell_errors", + 
(u64)db_err_stats->rq_srq_doorbell_errors); + pcur += ocrdma_add_stat(stats, pcur, "cq_overflow_errors", + (u64)db_err_stats->cq_overflow_errors); + return stats; +} + +static char *ocrdma_rxqp_errstats(struct ocrdma_dev *dev) +{ + char *stats = dev->stats_mem.debugfs_mem, *pcur; + struct ocrdma_rdma_stats_resp *rdma_stats = + (struct ocrdma_rdma_stats_resp *)dev->stats_mem.va; + struct ocrdma_rx_qp_err_stats *rx_qp_err_stats = + &rdma_stats->rx_qp_err_stats; + + memset(stats, 0, (OCRDMA_MAX_DBGFS_MEM)); + + pcur = stats; + pcur += ocrdma_add_stat(stats, pcur, "nak_invalid_requst_errors", + (u64)rx_qp_err_stats->nak_invalid_requst_errors); + pcur += ocrdma_add_stat(stats, pcur, "nak_remote_operation_errors", + (u64)rx_qp_err_stats->nak_remote_operation_errors); + pcur += ocrdma_add_stat(stats, pcur, "nak_count_remote_access_errors", + (u64)rx_qp_err_stats->nak_count_remote_access_errors); + pcur += ocrdma_add_stat(stats, pcur, "local_length_errors", + (u64)rx_qp_err_stats->local_length_errors); + pcur += ocrdma_add_stat(stats, pcur, "local_protection_errors", + (u64)rx_qp_err_stats->local_protection_errors); + pcur += ocrdma_add_stat(stats, pcur, "local_qp_operation_errors", + (u64)rx_qp_err_stats->local_qp_operation_errors); + return stats; +} + +static char *ocrdma_txqp_errstats(struct ocrdma_dev *dev) +{ + char *stats = dev->stats_mem.debugfs_mem, *pcur; + struct ocrdma_rdma_stats_resp *rdma_stats = + (struct ocrdma_rdma_stats_resp *)dev->stats_mem.va; + struct ocrdma_tx_qp_err_stats *tx_qp_err_stats = + &rdma_stats->tx_qp_err_stats; + + memset(stats, 0, (OCRDMA_MAX_DBGFS_MEM)); + + pcur = stats; + pcur += ocrdma_add_stat(stats, pcur, "local_length_errors", + (u64)tx_qp_err_stats->local_length_errors); + pcur += ocrdma_add_stat(stats, pcur, "local_protection_errors", + (u64)tx_qp_err_stats->local_protection_errors); + pcur += ocrdma_add_stat(stats, pcur, "local_qp_operation_errors", + (u64)tx_qp_err_stats->local_qp_operation_errors); + pcur += ocrdma_add_stat(stats, pcur, "retry_count_exceeded_errors", + (u64)tx_qp_err_stats->retry_count_exceeded_errors); + pcur += ocrdma_add_stat(stats, pcur, "rnr_retry_count_exceeded_errors", + (u64)tx_qp_err_stats->rnr_retry_count_exceeded_errors); + return stats; +} + +static char *ocrdma_tx_dbg_stats(struct ocrdma_dev *dev) +{ + int i; + char *pstats = dev->stats_mem.debugfs_mem; + struct ocrdma_rdma_stats_resp *rdma_stats = + (struct ocrdma_rdma_stats_resp *)dev->stats_mem.va; + struct ocrdma_tx_dbg_stats *tx_dbg_stats = + &rdma_stats->tx_dbg_stats; + + memset(pstats, 0, (OCRDMA_MAX_DBGFS_MEM)); + + for (i = 0; i < 100; i++) + pstats += snprintf(pstats, 80, "DW[%d] = 0x%x\n", i, + tx_dbg_stats->data[i]); + + return dev->stats_mem.debugfs_mem; +} + +static char *ocrdma_rx_dbg_stats(struct ocrdma_dev *dev) +{ + int i; + char *pstats = dev->stats_mem.debugfs_mem; + struct ocrdma_rdma_stats_resp *rdma_stats = + (struct ocrdma_rdma_stats_resp *)dev->stats_mem.va; + struct ocrdma_rx_dbg_stats *rx_dbg_stats = + &rdma_stats->rx_dbg_stats; + + memset(pstats, 0, (OCRDMA_MAX_DBGFS_MEM)); + + for (i = 0; i < 200; i++) + pstats += snprintf(pstats, 80, "DW[%d] = 0x%x\n", i, + rx_dbg_stats->data[i]); + + return dev->stats_mem.debugfs_mem; +} + +static char *ocrdma_driver_dbg_stats(struct ocrdma_dev *dev) +{ + char *stats = dev->stats_mem.debugfs_mem, *pcur; + + + memset(stats, 0, (OCRDMA_MAX_DBGFS_MEM)); + + pcur = stats; + pcur += ocrdma_add_stat(stats, pcur, "async_cq_err", + (u64)(dev->async_err_stats + [OCRDMA_CQ_ERROR].counter)); + pcur += 
ocrdma_add_stat(stats, pcur, "async_cq_overrun_err", + (u64)dev->async_err_stats + [OCRDMA_CQ_OVERRUN_ERROR].counter); + pcur += ocrdma_add_stat(stats, pcur, "async_cq_qpcat_err", + (u64)dev->async_err_stats + [OCRDMA_CQ_QPCAT_ERROR].counter); + pcur += ocrdma_add_stat(stats, pcur, "async_qp_access_err", + (u64)dev->async_err_stats + [OCRDMA_QP_ACCESS_ERROR].counter); + pcur += ocrdma_add_stat(stats, pcur, "async_qp_commm_est_evt", + (u64)dev->async_err_stats + [OCRDMA_QP_COMM_EST_EVENT].counter); + pcur += ocrdma_add_stat(stats, pcur, "async_sq_drained_evt", + (u64)dev->async_err_stats + [OCRDMA_SQ_DRAINED_EVENT].counter); + pcur += ocrdma_add_stat(stats, pcur, "async_dev_fatal_evt", + (u64)dev->async_err_stats + [OCRDMA_DEVICE_FATAL_EVENT].counter); + pcur += ocrdma_add_stat(stats, pcur, "async_srqcat_err", + (u64)dev->async_err_stats + [OCRDMA_SRQCAT_ERROR].counter); + pcur += ocrdma_add_stat(stats, pcur, "async_srq_limit_evt", + (u64)dev->async_err_stats + [OCRDMA_SRQ_LIMIT_EVENT].counter); + pcur += ocrdma_add_stat(stats, pcur, "async_qp_last_wqe_evt", + (u64)dev->async_err_stats + [OCRDMA_QP_LAST_WQE_EVENT].counter); + + pcur += ocrdma_add_stat(stats, pcur, "cqe_loc_len_err", + (u64)dev->cqe_err_stats + [OCRDMA_CQE_LOC_LEN_ERR].counter); + pcur += ocrdma_add_stat(stats, pcur, "cqe_loc_qp_op_err", + (u64)dev->cqe_err_stats + [OCRDMA_CQE_LOC_QP_OP_ERR].counter); + pcur += ocrdma_add_stat(stats, pcur, "cqe_loc_eec_op_err", + (u64)dev->cqe_err_stats + [OCRDMA_CQE_LOC_EEC_OP_ERR].counter); + pcur += ocrdma_add_stat(stats, pcur, "cqe_loc_prot_err", + (u64)dev->cqe_err_stats + [OCRDMA_CQE_LOC_PROT_ERR].counter); + pcur += ocrdma_add_stat(stats, pcur, "cqe_wr_flush_err", + (u64)dev->cqe_err_stats + [OCRDMA_CQE_WR_FLUSH_ERR].counter); + pcur += ocrdma_add_stat(stats, pcur, "cqe_mw_bind_err", + (u64)dev->cqe_err_stats + [OCRDMA_CQE_MW_BIND_ERR].counter); + pcur += ocrdma_add_stat(stats, pcur, "cqe_bad_resp_err", + (u64)dev->cqe_err_stats + [OCRDMA_CQE_BAD_RESP_ERR].counter); + pcur += ocrdma_add_stat(stats, pcur, "cqe_loc_access_err", + (u64)dev->cqe_err_stats + [OCRDMA_CQE_LOC_ACCESS_ERR].counter); + pcur += ocrdma_add_stat(stats, pcur, "cqe_rem_inv_req_err", + (u64)dev->cqe_err_stats + [OCRDMA_CQE_REM_INV_REQ_ERR].counter); + pcur += ocrdma_add_stat(stats, pcur, "cqe_rem_access_err", + (u64)dev->cqe_err_stats + [OCRDMA_CQE_REM_ACCESS_ERR].counter); + pcur += ocrdma_add_stat(stats, pcur, "cqe_rem_op_err", + (u64)dev->cqe_err_stats + [OCRDMA_CQE_REM_OP_ERR].counter); + pcur += ocrdma_add_stat(stats, pcur, "cqe_retry_exc_err", + (u64)dev->cqe_err_stats + [OCRDMA_CQE_RETRY_EXC_ERR].counter); + pcur += ocrdma_add_stat(stats, pcur, "cqe_rnr_retry_exc_err", + (u64)dev->cqe_err_stats + [OCRDMA_CQE_RNR_RETRY_EXC_ERR].counter); + pcur += ocrdma_add_stat(stats, pcur, "cqe_loc_rdd_viol_err", + (u64)dev->cqe_err_stats + [OCRDMA_CQE_LOC_RDD_VIOL_ERR].counter); + pcur += ocrdma_add_stat(stats, pcur, "cqe_rem_inv_rd_req_err", + (u64)dev->cqe_err_stats + [OCRDMA_CQE_REM_INV_RD_REQ_ERR].counter); + pcur += ocrdma_add_stat(stats, pcur, "cqe_rem_abort_err", + (u64)dev->cqe_err_stats + [OCRDMA_CQE_REM_ABORT_ERR].counter); + pcur += ocrdma_add_stat(stats, pcur, "cqe_inv_eecn_err", + (u64)dev->cqe_err_stats + [OCRDMA_CQE_INV_EECN_ERR].counter); + pcur += ocrdma_add_stat(stats, pcur, "cqe_inv_eec_state_err", + (u64)dev->cqe_err_stats + [OCRDMA_CQE_INV_EEC_STATE_ERR].counter); + pcur += ocrdma_add_stat(stats, pcur, "cqe_fatal_err", + (u64)dev->cqe_err_stats + [OCRDMA_CQE_FATAL_ERR].counter); + pcur += 
ocrdma_add_stat(stats, pcur, "cqe_resp_timeout_err", + (u64)dev->cqe_err_stats + [OCRDMA_CQE_RESP_TIMEOUT_ERR].counter); + pcur += ocrdma_add_stat(stats, pcur, "cqe_general_err", + (u64)dev->cqe_err_stats + [OCRDMA_CQE_GENERAL_ERR].counter); + return stats; +} + +static void ocrdma_update_stats(struct ocrdma_dev *dev) +{ + ulong now = jiffies, secs; + int status = 0; + struct ocrdma_rdma_stats_resp *rdma_stats = + (struct ocrdma_rdma_stats_resp *)dev->stats_mem.va; + struct ocrdma_rsrc_stats *rsrc_stats = &rdma_stats->act_rsrc_stats; + + secs = jiffies_to_msecs(now - dev->last_stats_time) / 1000U; + if (secs) { + /* update */ + status = ocrdma_mbx_rdma_stats(dev, false); + if (status) + pr_err("%s: stats mbox failed with status = %d\n", + __func__, status); + /* Update PD counters from PD resource manager */ + if (dev->pd_mgr->pd_prealloc_valid) { + rsrc_stats->dpp_pds = dev->pd_mgr->pd_dpp_count; + rsrc_stats->non_dpp_pds = dev->pd_mgr->pd_norm_count; + /* Threshold stata*/ + rsrc_stats = &rdma_stats->th_rsrc_stats; + rsrc_stats->dpp_pds = dev->pd_mgr->pd_dpp_thrsh; + rsrc_stats->non_dpp_pds = dev->pd_mgr->pd_norm_thrsh; + } + dev->last_stats_time = jiffies; + } +} + +static ssize_t ocrdma_dbgfs_ops_write(struct file *filp, + const char __user *buffer, + size_t count, loff_t *ppos) +{ + char tmp_str[32]; + long reset; + int status = 0; + struct ocrdma_stats *pstats = filp->private_data; + struct ocrdma_dev *dev = pstats->dev; + + if (count > 32) + goto err; + + if (copy_from_user(tmp_str, buffer, count)) + goto err; + + tmp_str[count-1] = '\0'; + if (kstrtol(tmp_str, 10, &reset)) + goto err; + + switch (pstats->type) { + case OCRDMA_RESET_STATS: + if (reset) { + status = ocrdma_mbx_rdma_stats(dev, true); + if (status) { + pr_err("Failed to reset stats = %d", status); + goto err; + } + } + break; + default: + goto err; + } + + return count; +err: + return -EFAULT; +} + +int ocrdma_pma_counters(struct ocrdma_dev *dev, + struct ib_mad *out_mad) +{ + struct ib_pma_portcounters *pma_cnt; + + memset(out_mad->data, 0, sizeof out_mad->data); + pma_cnt = (void *)(out_mad->data + 40); + ocrdma_update_stats(dev); + + pma_cnt->port_xmit_data = cpu_to_be32(ocrdma_sysfs_xmit_data(dev)); + pma_cnt->port_rcv_data = cpu_to_be32(ocrdma_sysfs_rcv_data(dev)); + pma_cnt->port_xmit_packets = cpu_to_be32(ocrdma_sysfs_xmit_pkts(dev)); + pma_cnt->port_rcv_packets = cpu_to_be32(ocrdma_sysfs_rcv_pkts(dev)); + return 0; +} + +static ssize_t ocrdma_dbgfs_ops_read(struct file *filp, char __user *buffer, + size_t usr_buf_len, loff_t *ppos) +{ + struct ocrdma_stats *pstats = filp->private_data; + struct ocrdma_dev *dev = pstats->dev; + ssize_t status = 0; + char *data = NULL; + + /* No partial reads */ + if (*ppos != 0) + return 0; + + mutex_lock(&dev->stats_lock); + + ocrdma_update_stats(dev); + + switch (pstats->type) { + case OCRDMA_RSRC_STATS: + data = ocrdma_resource_stats(dev); + break; + case OCRDMA_RXSTATS: + data = ocrdma_rx_stats(dev); + break; + case OCRDMA_WQESTATS: + data = ocrdma_wqe_stats(dev); + break; + case OCRDMA_TXSTATS: + data = ocrdma_tx_stats(dev); + break; + case OCRDMA_DB_ERRSTATS: + data = ocrdma_db_errstats(dev); + break; + case OCRDMA_RXQP_ERRSTATS: + data = ocrdma_rxqp_errstats(dev); + break; + case OCRDMA_TXQP_ERRSTATS: + data = ocrdma_txqp_errstats(dev); + break; + case OCRDMA_TX_DBG_STATS: + data = ocrdma_tx_dbg_stats(dev); + break; + case OCRDMA_RX_DBG_STATS: + data = ocrdma_rx_dbg_stats(dev); + break; + case OCRDMA_DRV_STATS: + data = ocrdma_driver_dbg_stats(dev); + break; + + default: 
+ status = -EFAULT; + goto exit; + } + + if (usr_buf_len < strlen(data)) { + status = -ENOSPC; + goto exit; + } + + status = simple_read_from_buffer(buffer, usr_buf_len, ppos, data, + strlen(data)); +exit: + mutex_unlock(&dev->stats_lock); + return status; +} + +static const struct file_operations ocrdma_dbg_ops = { + .owner = THIS_MODULE, + .open = simple_open, + .read = ocrdma_dbgfs_ops_read, + .write = ocrdma_dbgfs_ops_write, +}; + +void ocrdma_add_port_stats(struct ocrdma_dev *dev) +{ + if (!ocrdma_dbgfs_dir) + return; + + /* Create post stats base dir */ + dev->dir = debugfs_create_dir(dev->ibdev.name, ocrdma_dbgfs_dir); + if (!dev->dir) + goto err; + + dev->rsrc_stats.type = OCRDMA_RSRC_STATS; + dev->rsrc_stats.dev = dev; + if (!debugfs_create_file("resource_stats", S_IRUSR, dev->dir, + &dev->rsrc_stats, &ocrdma_dbg_ops)) + goto err; + + dev->rx_stats.type = OCRDMA_RXSTATS; + dev->rx_stats.dev = dev; + if (!debugfs_create_file("rx_stats", S_IRUSR, dev->dir, + &dev->rx_stats, &ocrdma_dbg_ops)) + goto err; + + dev->wqe_stats.type = OCRDMA_WQESTATS; + dev->wqe_stats.dev = dev; + if (!debugfs_create_file("wqe_stats", S_IRUSR, dev->dir, + &dev->wqe_stats, &ocrdma_dbg_ops)) + goto err; + + dev->tx_stats.type = OCRDMA_TXSTATS; + dev->tx_stats.dev = dev; + if (!debugfs_create_file("tx_stats", S_IRUSR, dev->dir, + &dev->tx_stats, &ocrdma_dbg_ops)) + goto err; + + dev->db_err_stats.type = OCRDMA_DB_ERRSTATS; + dev->db_err_stats.dev = dev; + if (!debugfs_create_file("db_err_stats", S_IRUSR, dev->dir, + &dev->db_err_stats, &ocrdma_dbg_ops)) + goto err; + + + dev->tx_qp_err_stats.type = OCRDMA_TXQP_ERRSTATS; + dev->tx_qp_err_stats.dev = dev; + if (!debugfs_create_file("tx_qp_err_stats", S_IRUSR, dev->dir, + &dev->tx_qp_err_stats, &ocrdma_dbg_ops)) + goto err; + + dev->rx_qp_err_stats.type = OCRDMA_RXQP_ERRSTATS; + dev->rx_qp_err_stats.dev = dev; + if (!debugfs_create_file("rx_qp_err_stats", S_IRUSR, dev->dir, + &dev->rx_qp_err_stats, &ocrdma_dbg_ops)) + goto err; + + + dev->tx_dbg_stats.type = OCRDMA_TX_DBG_STATS; + dev->tx_dbg_stats.dev = dev; + if (!debugfs_create_file("tx_dbg_stats", S_IRUSR, dev->dir, + &dev->tx_dbg_stats, &ocrdma_dbg_ops)) + goto err; + + dev->rx_dbg_stats.type = OCRDMA_RX_DBG_STATS; + dev->rx_dbg_stats.dev = dev; + if (!debugfs_create_file("rx_dbg_stats", S_IRUSR, dev->dir, + &dev->rx_dbg_stats, &ocrdma_dbg_ops)) + goto err; + + dev->driver_stats.type = OCRDMA_DRV_STATS; + dev->driver_stats.dev = dev; + if (!debugfs_create_file("driver_dbg_stats", S_IRUSR, dev->dir, + &dev->driver_stats, &ocrdma_dbg_ops)) + goto err; + + dev->reset_stats.type = OCRDMA_RESET_STATS; + dev->reset_stats.dev = dev; + if (!debugfs_create_file("reset_stats", S_IRUSR, dev->dir, + &dev->reset_stats, &ocrdma_dbg_ops)) + goto err; + + /* Now create dma_mem for stats mbx command */ + if (!ocrdma_alloc_stats_mem(dev)) + goto err; + + mutex_init(&dev->stats_lock); + + return; +err: + ocrdma_release_stats_mem(dev); + debugfs_remove_recursive(dev->dir); + dev->dir = NULL; +} + +void ocrdma_rem_port_stats(struct ocrdma_dev *dev) +{ + if (!dev->dir) + return; + mutex_destroy(&dev->stats_lock); + ocrdma_release_stats_mem(dev); + debugfs_remove(dev->dir); +} + +void ocrdma_init_debugfs(void) +{ + /* Create base dir in debugfs root dir */ + ocrdma_dbgfs_dir = debugfs_create_dir("ocrdma", NULL); +} + +void ocrdma_rem_debugfs(void) +{ + debugfs_remove_recursive(ocrdma_dbgfs_dir); +} diff --git a/kernel/drivers/infiniband/hw/ocrdma/ocrdma_stats.h b/kernel/drivers/infiniband/hw/ocrdma/ocrdma_stats.h new file mode 
100644 index 000000000..091edd68a --- /dev/null +++ b/kernel/drivers/infiniband/hw/ocrdma/ocrdma_stats.h @@ -0,0 +1,58 @@ +/******************************************************************* + * This file is part of the Emulex RoCE Device Driver for * + * RoCE (RDMA over Converged Ethernet) adapters. * + * Copyright (C) 2008-2014 Emulex. All rights reserved. * + * EMULEX and SLI are trademarks of Emulex. * + * www.emulex.com * + * * + * This program is free software; you can redistribute it and/or * + * modify it under the terms of version 2 of the GNU General * + * Public License as published by the Free Software Foundation. * + * This program is distributed in the hope that it will be useful. * + * ALL EXPRESS OR IMPLIED CONDITIONS, REPRESENTATIONS AND * + * WARRANTIES, INCLUDING ANY IMPLIED WARRANTY OF MERCHANTABILITY, * + * FITNESS FOR A PARTICULAR PURPOSE, OR NON-INFRINGEMENT, ARE * + * DISCLAIMED, EXCEPT TO THE EXTENT THAT SUCH DISCLAIMERS ARE HELD * + * TO BE LEGALLY INVALID. See the GNU General Public License for * + * more details, a copy of which can be found in the file COPYING * + * included with this package. * + * + * Contact Information: + * linux-drivers@emulex.com + * + * Emulex + * 3333 Susan Street + * Costa Mesa, CA 92626 + *******************************************************************/ + +#ifndef __OCRDMA_STATS_H__ +#define __OCRDMA_STATS_H__ + +#include +#include "ocrdma.h" +#include "ocrdma_hw.h" + +#define OCRDMA_MAX_DBGFS_MEM 4096 + +enum OCRDMA_STATS_TYPE { + OCRDMA_RSRC_STATS, + OCRDMA_RXSTATS, + OCRDMA_WQESTATS, + OCRDMA_TXSTATS, + OCRDMA_DB_ERRSTATS, + OCRDMA_RXQP_ERRSTATS, + OCRDMA_TXQP_ERRSTATS, + OCRDMA_TX_DBG_STATS, + OCRDMA_RX_DBG_STATS, + OCRDMA_DRV_STATS, + OCRDMA_RESET_STATS +}; + +void ocrdma_rem_debugfs(void); +void ocrdma_init_debugfs(void); +void ocrdma_rem_port_stats(struct ocrdma_dev *dev); +void ocrdma_add_port_stats(struct ocrdma_dev *dev); +int ocrdma_pma_counters(struct ocrdma_dev *dev, + struct ib_mad *out_mad); + +#endif /* __OCRDMA_STATS_H__ */ diff --git a/kernel/drivers/infiniband/hw/ocrdma/ocrdma_verbs.c b/kernel/drivers/infiniband/hw/ocrdma/ocrdma_verbs.c new file mode 100644 index 000000000..9dcb66077 --- /dev/null +++ b/kernel/drivers/infiniband/hw/ocrdma/ocrdma_verbs.c @@ -0,0 +1,3195 @@ +/******************************************************************* + * This file is part of the Emulex RoCE Device Driver for * + * RoCE (RDMA over Converged Ethernet) adapters. * + * Copyright (C) 2008-2012 Emulex. All rights reserved. * + * EMULEX and SLI are trademarks of Emulex. * + * www.emulex.com * + * * + * This program is free software; you can redistribute it and/or * + * modify it under the terms of version 2 of the GNU General * + * Public License as published by the Free Software Foundation. * + * This program is distributed in the hope that it will be useful. * + * ALL EXPRESS OR IMPLIED CONDITIONS, REPRESENTATIONS AND * + * WARRANTIES, INCLUDING ANY IMPLIED WARRANTY OF MERCHANTABILITY, * + * FITNESS FOR A PARTICULAR PURPOSE, OR NON-INFRINGEMENT, ARE * + * DISCLAIMED, EXCEPT TO THE EXTENT THAT SUCH DISCLAIMERS ARE HELD * + * TO BE LEGALLY INVALID. See the GNU General Public License for * + * more details, a copy of which can be found in the file COPYING * + * included with this package. 
* + * + * Contact Information: + * linux-drivers@emulex.com + * + * Emulex + * 3333 Susan Street + * Costa Mesa, CA 92626 + *******************************************************************/ + +#include +#include +#include +#include +#include +#include + +#include "ocrdma.h" +#include "ocrdma_hw.h" +#include "ocrdma_verbs.h" +#include "ocrdma_abi.h" + +int ocrdma_query_pkey(struct ib_device *ibdev, u8 port, u16 index, u16 *pkey) +{ + if (index > 1) + return -EINVAL; + + *pkey = 0xffff; + return 0; +} + +int ocrdma_query_gid(struct ib_device *ibdev, u8 port, + int index, union ib_gid *sgid) +{ + struct ocrdma_dev *dev; + + dev = get_ocrdma_dev(ibdev); + memset(sgid, 0, sizeof(*sgid)); + if (index >= OCRDMA_MAX_SGID) + return -EINVAL; + + memcpy(sgid, &dev->sgid_tbl[index], sizeof(*sgid)); + + return 0; +} + +int ocrdma_query_device(struct ib_device *ibdev, struct ib_device_attr *attr) +{ + struct ocrdma_dev *dev = get_ocrdma_dev(ibdev); + + memset(attr, 0, sizeof *attr); + memcpy(&attr->fw_ver, &dev->attr.fw_ver[0], + min(sizeof(dev->attr.fw_ver), sizeof(attr->fw_ver))); + ocrdma_get_guid(dev, (u8 *)&attr->sys_image_guid); + attr->max_mr_size = dev->attr.max_mr_size; + attr->page_size_cap = 0xffff000; + attr->vendor_id = dev->nic_info.pdev->vendor; + attr->vendor_part_id = dev->nic_info.pdev->device; + attr->hw_ver = dev->asic_id; + attr->max_qp = dev->attr.max_qp; + attr->max_ah = OCRDMA_MAX_AH; + attr->max_qp_wr = dev->attr.max_wqe; + + attr->device_cap_flags = IB_DEVICE_CURR_QP_STATE_MOD | + IB_DEVICE_RC_RNR_NAK_GEN | + IB_DEVICE_SHUTDOWN_PORT | + IB_DEVICE_SYS_IMAGE_GUID | + IB_DEVICE_LOCAL_DMA_LKEY | + IB_DEVICE_MEM_MGT_EXTENSIONS; + attr->max_sge = min(dev->attr.max_send_sge, dev->attr.max_srq_sge); + attr->max_sge_rd = 0; + attr->max_cq = dev->attr.max_cq; + attr->max_cqe = dev->attr.max_cqe; + attr->max_mr = dev->attr.max_mr; + attr->max_mw = dev->attr.max_mw; + attr->max_pd = dev->attr.max_pd; + attr->atomic_cap = 0; + attr->max_fmr = 0; + attr->max_map_per_fmr = 0; + attr->max_qp_rd_atom = + min(dev->attr.max_ord_per_qp, dev->attr.max_ird_per_qp); + attr->max_qp_init_rd_atom = dev->attr.max_ord_per_qp; + attr->max_srq = dev->attr.max_srq; + attr->max_srq_sge = dev->attr.max_srq_sge; + attr->max_srq_wr = dev->attr.max_rqe; + attr->local_ca_ack_delay = dev->attr.local_ca_ack_delay; + attr->max_fast_reg_page_list_len = dev->attr.max_pages_per_frmr; + attr->max_pkeys = 1; + return 0; +} + +static inline void get_link_speed_and_width(struct ocrdma_dev *dev, + u8 *ib_speed, u8 *ib_width) +{ + int status; + u8 speed; + + status = ocrdma_mbx_get_link_speed(dev, &speed); + if (status) + speed = OCRDMA_PHYS_LINK_SPEED_ZERO; + + switch (speed) { + case OCRDMA_PHYS_LINK_SPEED_1GBPS: + *ib_speed = IB_SPEED_SDR; + *ib_width = IB_WIDTH_1X; + break; + + case OCRDMA_PHYS_LINK_SPEED_10GBPS: + *ib_speed = IB_SPEED_QDR; + *ib_width = IB_WIDTH_1X; + break; + + case OCRDMA_PHYS_LINK_SPEED_20GBPS: + *ib_speed = IB_SPEED_DDR; + *ib_width = IB_WIDTH_4X; + break; + + case OCRDMA_PHYS_LINK_SPEED_40GBPS: + *ib_speed = IB_SPEED_QDR; + *ib_width = IB_WIDTH_4X; + break; + + default: + /* Unsupported */ + *ib_speed = IB_SPEED_SDR; + *ib_width = IB_WIDTH_1X; + } +} + +int ocrdma_query_port(struct ib_device *ibdev, + u8 port, struct ib_port_attr *props) +{ + enum ib_port_state port_state; + struct ocrdma_dev *dev; + struct net_device *netdev; + + dev = get_ocrdma_dev(ibdev); + if (port > 1) { + pr_err("%s(%d) invalid_port=0x%x\n", __func__, + dev->id, port); + return -EINVAL; + } + netdev = dev->nic_info.netdev; 
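+ /* map the netdev's running/operational state to the IB port state and physical state reported below */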
+ if (netif_running(netdev) && netif_oper_up(netdev)) { + port_state = IB_PORT_ACTIVE; + props->phys_state = 5; + } else { + port_state = IB_PORT_DOWN; + props->phys_state = 3; + } + props->max_mtu = IB_MTU_4096; + props->active_mtu = iboe_get_mtu(netdev->mtu); + props->lid = 0; + props->lmc = 0; + props->sm_lid = 0; + props->sm_sl = 0; + props->state = port_state; + props->port_cap_flags = + IB_PORT_CM_SUP | + IB_PORT_REINIT_SUP | + IB_PORT_DEVICE_MGMT_SUP | IB_PORT_VENDOR_CLASS_SUP | IB_PORT_IP_BASED_GIDS; + props->gid_tbl_len = OCRDMA_MAX_SGID; + props->pkey_tbl_len = 1; + props->bad_pkey_cntr = 0; + props->qkey_viol_cntr = 0; + get_link_speed_and_width(dev, &props->active_speed, + &props->active_width); + props->max_msg_sz = 0x80000000; + props->max_vl_num = 4; + return 0; +} + +int ocrdma_modify_port(struct ib_device *ibdev, u8 port, int mask, + struct ib_port_modify *props) +{ + struct ocrdma_dev *dev; + + dev = get_ocrdma_dev(ibdev); + if (port > 1) { + pr_err("%s(%d) invalid_port=0x%x\n", __func__, dev->id, port); + return -EINVAL; + } + return 0; +} + +static int ocrdma_add_mmap(struct ocrdma_ucontext *uctx, u64 phy_addr, + unsigned long len) +{ + struct ocrdma_mm *mm; + + mm = kzalloc(sizeof(*mm), GFP_KERNEL); + if (mm == NULL) + return -ENOMEM; + mm->key.phy_addr = phy_addr; + mm->key.len = len; + INIT_LIST_HEAD(&mm->entry); + + mutex_lock(&uctx->mm_list_lock); + list_add_tail(&mm->entry, &uctx->mm_head); + mutex_unlock(&uctx->mm_list_lock); + return 0; +} + +static void ocrdma_del_mmap(struct ocrdma_ucontext *uctx, u64 phy_addr, + unsigned long len) +{ + struct ocrdma_mm *mm, *tmp; + + mutex_lock(&uctx->mm_list_lock); + list_for_each_entry_safe(mm, tmp, &uctx->mm_head, entry) { + if (len != mm->key.len && phy_addr != mm->key.phy_addr) + continue; + + list_del(&mm->entry); + kfree(mm); + break; + } + mutex_unlock(&uctx->mm_list_lock); +} + +static bool ocrdma_search_mmap(struct ocrdma_ucontext *uctx, u64 phy_addr, + unsigned long len) +{ + bool found = false; + struct ocrdma_mm *mm; + + mutex_lock(&uctx->mm_list_lock); + list_for_each_entry(mm, &uctx->mm_head, entry) { + if (len != mm->key.len && phy_addr != mm->key.phy_addr) + continue; + + found = true; + break; + } + mutex_unlock(&uctx->mm_list_lock); + return found; +} + + +static u16 _ocrdma_pd_mgr_get_bitmap(struct ocrdma_dev *dev, bool dpp_pool) +{ + u16 pd_bitmap_idx = 0; + const unsigned long *pd_bitmap; + + if (dpp_pool) { + pd_bitmap = dev->pd_mgr->pd_dpp_bitmap; + pd_bitmap_idx = find_first_zero_bit(pd_bitmap, + dev->pd_mgr->max_dpp_pd); + __set_bit(pd_bitmap_idx, dev->pd_mgr->pd_dpp_bitmap); + dev->pd_mgr->pd_dpp_count++; + if (dev->pd_mgr->pd_dpp_count > dev->pd_mgr->pd_dpp_thrsh) + dev->pd_mgr->pd_dpp_thrsh = dev->pd_mgr->pd_dpp_count; + } else { + pd_bitmap = dev->pd_mgr->pd_norm_bitmap; + pd_bitmap_idx = find_first_zero_bit(pd_bitmap, + dev->pd_mgr->max_normal_pd); + __set_bit(pd_bitmap_idx, dev->pd_mgr->pd_norm_bitmap); + dev->pd_mgr->pd_norm_count++; + if (dev->pd_mgr->pd_norm_count > dev->pd_mgr->pd_norm_thrsh) + dev->pd_mgr->pd_norm_thrsh = dev->pd_mgr->pd_norm_count; + } + return pd_bitmap_idx; +} + +static int _ocrdma_pd_mgr_put_bitmap(struct ocrdma_dev *dev, u16 pd_id, + bool dpp_pool) +{ + u16 pd_count; + u16 pd_bit_index; + + pd_count = dpp_pool ? 
dev->pd_mgr->pd_dpp_count : + dev->pd_mgr->pd_norm_count; + if (pd_count == 0) + return -EINVAL; + + if (dpp_pool) { + pd_bit_index = pd_id - dev->pd_mgr->pd_dpp_start; + if (pd_bit_index >= dev->pd_mgr->max_dpp_pd) { + return -EINVAL; + } else { + __clear_bit(pd_bit_index, dev->pd_mgr->pd_dpp_bitmap); + dev->pd_mgr->pd_dpp_count--; + } + } else { + pd_bit_index = pd_id - dev->pd_mgr->pd_norm_start; + if (pd_bit_index >= dev->pd_mgr->max_normal_pd) { + return -EINVAL; + } else { + __clear_bit(pd_bit_index, dev->pd_mgr->pd_norm_bitmap); + dev->pd_mgr->pd_norm_count--; + } + } + + return 0; +} + +static u8 ocrdma_put_pd_num(struct ocrdma_dev *dev, u16 pd_id, + bool dpp_pool) +{ + int status; + + mutex_lock(&dev->dev_lock); + status = _ocrdma_pd_mgr_put_bitmap(dev, pd_id, dpp_pool); + mutex_unlock(&dev->dev_lock); + return status; +} + +static int ocrdma_get_pd_num(struct ocrdma_dev *dev, struct ocrdma_pd *pd) +{ + u16 pd_idx = 0; + int status = 0; + + mutex_lock(&dev->dev_lock); + if (pd->dpp_enabled) { + /* try allocating DPP PD, if not available then normal PD */ + if (dev->pd_mgr->pd_dpp_count < dev->pd_mgr->max_dpp_pd) { + pd_idx = _ocrdma_pd_mgr_get_bitmap(dev, true); + pd->id = dev->pd_mgr->pd_dpp_start + pd_idx; + pd->dpp_page = dev->pd_mgr->dpp_page_index + pd_idx; + } else if (dev->pd_mgr->pd_norm_count < + dev->pd_mgr->max_normal_pd) { + pd_idx = _ocrdma_pd_mgr_get_bitmap(dev, false); + pd->id = dev->pd_mgr->pd_norm_start + pd_idx; + pd->dpp_enabled = false; + } else { + status = -EINVAL; + } + } else { + if (dev->pd_mgr->pd_norm_count < dev->pd_mgr->max_normal_pd) { + pd_idx = _ocrdma_pd_mgr_get_bitmap(dev, false); + pd->id = dev->pd_mgr->pd_norm_start + pd_idx; + } else { + status = -EINVAL; + } + } + mutex_unlock(&dev->dev_lock); + return status; +} + +static struct ocrdma_pd *_ocrdma_alloc_pd(struct ocrdma_dev *dev, + struct ocrdma_ucontext *uctx, + struct ib_udata *udata) +{ + struct ocrdma_pd *pd = NULL; + int status = 0; + + pd = kzalloc(sizeof(*pd), GFP_KERNEL); + if (!pd) + return ERR_PTR(-ENOMEM); + + if (udata && uctx && dev->attr.max_dpp_pds) { + pd->dpp_enabled = + ocrdma_get_asic_type(dev) == OCRDMA_ASIC_GEN_SKH_R; + pd->num_dpp_qp = + pd->dpp_enabled ? (dev->nic_info.db_page_size / + dev->attr.wqe_size) : 0; + } + + if (dev->pd_mgr->pd_prealloc_valid) { + status = ocrdma_get_pd_num(dev, pd); + return (status == 0) ? pd : ERR_PTR(status); + } + +retry: + status = ocrdma_mbx_alloc_pd(dev, pd); + if (status) { + if (pd->dpp_enabled) { + pd->dpp_enabled = false; + pd->num_dpp_qp = 0; + goto retry; + } else { + kfree(pd); + return ERR_PTR(status); + } + } + + return pd; +} + +static inline int is_ucontext_pd(struct ocrdma_ucontext *uctx, + struct ocrdma_pd *pd) +{ + return (uctx->cntxt_pd == pd ? 
true : false); +} + +static int _ocrdma_dealloc_pd(struct ocrdma_dev *dev, + struct ocrdma_pd *pd) +{ + int status = 0; + + if (dev->pd_mgr->pd_prealloc_valid) + status = ocrdma_put_pd_num(dev, pd->id, pd->dpp_enabled); + else + status = ocrdma_mbx_dealloc_pd(dev, pd); + + kfree(pd); + return status; +} + +static int ocrdma_alloc_ucontext_pd(struct ocrdma_dev *dev, + struct ocrdma_ucontext *uctx, + struct ib_udata *udata) +{ + int status = 0; + + uctx->cntxt_pd = _ocrdma_alloc_pd(dev, uctx, udata); + if (IS_ERR(uctx->cntxt_pd)) { + status = PTR_ERR(uctx->cntxt_pd); + uctx->cntxt_pd = NULL; + goto err; + } + + uctx->cntxt_pd->uctx = uctx; + uctx->cntxt_pd->ibpd.device = &dev->ibdev; +err: + return status; +} + +static int ocrdma_dealloc_ucontext_pd(struct ocrdma_ucontext *uctx) +{ + struct ocrdma_pd *pd = uctx->cntxt_pd; + struct ocrdma_dev *dev = get_ocrdma_dev(pd->ibpd.device); + + if (uctx->pd_in_use) { + pr_err("%s(%d) Freeing in use pdid=0x%x.\n", + __func__, dev->id, pd->id); + } + uctx->cntxt_pd = NULL; + (void)_ocrdma_dealloc_pd(dev, pd); + return 0; +} + +static struct ocrdma_pd *ocrdma_get_ucontext_pd(struct ocrdma_ucontext *uctx) +{ + struct ocrdma_pd *pd = NULL; + + mutex_lock(&uctx->mm_list_lock); + if (!uctx->pd_in_use) { + uctx->pd_in_use = true; + pd = uctx->cntxt_pd; + } + mutex_unlock(&uctx->mm_list_lock); + + return pd; +} + +static void ocrdma_release_ucontext_pd(struct ocrdma_ucontext *uctx) +{ + mutex_lock(&uctx->mm_list_lock); + uctx->pd_in_use = false; + mutex_unlock(&uctx->mm_list_lock); +} + +struct ib_ucontext *ocrdma_alloc_ucontext(struct ib_device *ibdev, + struct ib_udata *udata) +{ + int status; + struct ocrdma_ucontext *ctx; + struct ocrdma_alloc_ucontext_resp resp; + struct ocrdma_dev *dev = get_ocrdma_dev(ibdev); + struct pci_dev *pdev = dev->nic_info.pdev; + u32 map_len = roundup(sizeof(u32) * 2048, PAGE_SIZE); + + if (!udata) + return ERR_PTR(-EFAULT); + ctx = kzalloc(sizeof(*ctx), GFP_KERNEL); + if (!ctx) + return ERR_PTR(-ENOMEM); + INIT_LIST_HEAD(&ctx->mm_head); + mutex_init(&ctx->mm_list_lock); + + ctx->ah_tbl.va = dma_alloc_coherent(&pdev->dev, map_len, + &ctx->ah_tbl.pa, GFP_KERNEL); + if (!ctx->ah_tbl.va) { + kfree(ctx); + return ERR_PTR(-ENOMEM); + } + memset(ctx->ah_tbl.va, 0, map_len); + ctx->ah_tbl.len = map_len; + + memset(&resp, 0, sizeof(resp)); + resp.ah_tbl_len = ctx->ah_tbl.len; + resp.ah_tbl_page = virt_to_phys(ctx->ah_tbl.va); + + status = ocrdma_add_mmap(ctx, resp.ah_tbl_page, resp.ah_tbl_len); + if (status) + goto map_err; + + status = ocrdma_alloc_ucontext_pd(dev, ctx, udata); + if (status) + goto pd_err; + + resp.dev_id = dev->id; + resp.max_inline_data = dev->attr.max_inline_data; + resp.wqe_size = dev->attr.wqe_size; + resp.rqe_size = dev->attr.rqe_size; + resp.dpp_wqe_size = dev->attr.wqe_size; + + memcpy(resp.fw_ver, dev->attr.fw_ver, sizeof(resp.fw_ver)); + status = ib_copy_to_udata(udata, &resp, sizeof(resp)); + if (status) + goto cpy_err; + return &ctx->ibucontext; + +cpy_err: +pd_err: + ocrdma_del_mmap(ctx, ctx->ah_tbl.pa, ctx->ah_tbl.len); +map_err: + dma_free_coherent(&pdev->dev, ctx->ah_tbl.len, ctx->ah_tbl.va, + ctx->ah_tbl.pa); + kfree(ctx); + return ERR_PTR(status); +} + +int ocrdma_dealloc_ucontext(struct ib_ucontext *ibctx) +{ + int status = 0; + struct ocrdma_mm *mm, *tmp; + struct ocrdma_ucontext *uctx = get_ocrdma_ucontext(ibctx); + struct ocrdma_dev *dev = get_ocrdma_dev(ibctx->device); + struct pci_dev *pdev = dev->nic_info.pdev; + + status = ocrdma_dealloc_ucontext_pd(uctx); + + ocrdma_del_mmap(uctx, 
uctx->ah_tbl.pa, uctx->ah_tbl.len); + dma_free_coherent(&pdev->dev, uctx->ah_tbl.len, uctx->ah_tbl.va, + uctx->ah_tbl.pa); + + list_for_each_entry_safe(mm, tmp, &uctx->mm_head, entry) { + list_del(&mm->entry); + kfree(mm); + } + kfree(uctx); + return status; +} + +int ocrdma_mmap(struct ib_ucontext *context, struct vm_area_struct *vma) +{ + struct ocrdma_ucontext *ucontext = get_ocrdma_ucontext(context); + struct ocrdma_dev *dev = get_ocrdma_dev(context->device); + unsigned long vm_page = vma->vm_pgoff << PAGE_SHIFT; + u64 unmapped_db = (u64) dev->nic_info.unmapped_db; + unsigned long len = (vma->vm_end - vma->vm_start); + int status = 0; + bool found; + + if (vma->vm_start & (PAGE_SIZE - 1)) + return -EINVAL; + found = ocrdma_search_mmap(ucontext, vma->vm_pgoff << PAGE_SHIFT, len); + if (!found) + return -EINVAL; + + if ((vm_page >= unmapped_db) && (vm_page <= (unmapped_db + + dev->nic_info.db_total_size)) && + (len <= dev->nic_info.db_page_size)) { + if (vma->vm_flags & VM_READ) + return -EPERM; + + vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot); + status = io_remap_pfn_range(vma, vma->vm_start, vma->vm_pgoff, + len, vma->vm_page_prot); + } else if (dev->nic_info.dpp_unmapped_len && + (vm_page >= (u64) dev->nic_info.dpp_unmapped_addr) && + (vm_page <= (u64) (dev->nic_info.dpp_unmapped_addr + + dev->nic_info.dpp_unmapped_len)) && + (len <= dev->nic_info.dpp_unmapped_len)) { + if (vma->vm_flags & VM_READ) + return -EPERM; + + vma->vm_page_prot = pgprot_writecombine(vma->vm_page_prot); + status = io_remap_pfn_range(vma, vma->vm_start, vma->vm_pgoff, + len, vma->vm_page_prot); + } else { + status = remap_pfn_range(vma, vma->vm_start, + vma->vm_pgoff, len, vma->vm_page_prot); + } + return status; +} + +static int ocrdma_copy_pd_uresp(struct ocrdma_dev *dev, struct ocrdma_pd *pd, + struct ib_ucontext *ib_ctx, + struct ib_udata *udata) +{ + int status; + u64 db_page_addr; + u64 dpp_page_addr = 0; + u32 db_page_size; + struct ocrdma_alloc_pd_uresp rsp; + struct ocrdma_ucontext *uctx = get_ocrdma_ucontext(ib_ctx); + + memset(&rsp, 0, sizeof(rsp)); + rsp.id = pd->id; + rsp.dpp_enabled = pd->dpp_enabled; + db_page_addr = ocrdma_get_db_addr(dev, pd->id); + db_page_size = dev->nic_info.db_page_size; + + status = ocrdma_add_mmap(uctx, db_page_addr, db_page_size); + if (status) + return status; + + if (pd->dpp_enabled) { + dpp_page_addr = dev->nic_info.dpp_unmapped_addr + + (pd->id * PAGE_SIZE); + status = ocrdma_add_mmap(uctx, dpp_page_addr, + PAGE_SIZE); + if (status) + goto dpp_map_err; + rsp.dpp_page_addr_hi = upper_32_bits(dpp_page_addr); + rsp.dpp_page_addr_lo = dpp_page_addr; + } + + status = ib_copy_to_udata(udata, &rsp, sizeof(rsp)); + if (status) + goto ucopy_err; + + pd->uctx = uctx; + return 0; + +ucopy_err: + if (pd->dpp_enabled) + ocrdma_del_mmap(pd->uctx, dpp_page_addr, PAGE_SIZE); +dpp_map_err: + ocrdma_del_mmap(pd->uctx, db_page_addr, db_page_size); + return status; +} + +struct ib_pd *ocrdma_alloc_pd(struct ib_device *ibdev, + struct ib_ucontext *context, + struct ib_udata *udata) +{ + struct ocrdma_dev *dev = get_ocrdma_dev(ibdev); + struct ocrdma_pd *pd; + struct ocrdma_ucontext *uctx = NULL; + int status; + u8 is_uctx_pd = false; + + if (udata && context) { + uctx = get_ocrdma_ucontext(context); + pd = ocrdma_get_ucontext_pd(uctx); + if (pd) { + is_uctx_pd = true; + goto pd_mapping; + } + } + + pd = _ocrdma_alloc_pd(dev, uctx, udata); + if (IS_ERR(pd)) { + status = PTR_ERR(pd); + goto exit; + } + +pd_mapping: + if (udata && context) { + status = ocrdma_copy_pd_uresp(dev, 
pd, context, udata); + if (status) + goto err; + } + return &pd->ibpd; + +err: + if (is_uctx_pd) { + ocrdma_release_ucontext_pd(uctx); + } else { + status = _ocrdma_dealloc_pd(dev, pd); + kfree(pd); + } +exit: + return ERR_PTR(status); +} + +int ocrdma_dealloc_pd(struct ib_pd *ibpd) +{ + struct ocrdma_pd *pd = get_ocrdma_pd(ibpd); + struct ocrdma_dev *dev = get_ocrdma_dev(ibpd->device); + struct ocrdma_ucontext *uctx = NULL; + int status = 0; + u64 usr_db; + + uctx = pd->uctx; + if (uctx) { + u64 dpp_db = dev->nic_info.dpp_unmapped_addr + + (pd->id * PAGE_SIZE); + if (pd->dpp_enabled) + ocrdma_del_mmap(pd->uctx, dpp_db, PAGE_SIZE); + usr_db = ocrdma_get_db_addr(dev, pd->id); + ocrdma_del_mmap(pd->uctx, usr_db, dev->nic_info.db_page_size); + + if (is_ucontext_pd(uctx, pd)) { + ocrdma_release_ucontext_pd(uctx); + return status; + } + } + status = _ocrdma_dealloc_pd(dev, pd); + return status; +} + +static int ocrdma_alloc_lkey(struct ocrdma_dev *dev, struct ocrdma_mr *mr, + u32 pdid, int acc, u32 num_pbls, u32 addr_check) +{ + int status; + + mr->hwmr.fr_mr = 0; + mr->hwmr.local_rd = 1; + mr->hwmr.remote_rd = (acc & IB_ACCESS_REMOTE_READ) ? 1 : 0; + mr->hwmr.remote_wr = (acc & IB_ACCESS_REMOTE_WRITE) ? 1 : 0; + mr->hwmr.local_wr = (acc & IB_ACCESS_LOCAL_WRITE) ? 1 : 0; + mr->hwmr.mw_bind = (acc & IB_ACCESS_MW_BIND) ? 1 : 0; + mr->hwmr.remote_atomic = (acc & IB_ACCESS_REMOTE_ATOMIC) ? 1 : 0; + mr->hwmr.num_pbls = num_pbls; + + status = ocrdma_mbx_alloc_lkey(dev, &mr->hwmr, pdid, addr_check); + if (status) + return status; + + mr->ibmr.lkey = mr->hwmr.lkey; + if (mr->hwmr.remote_wr || mr->hwmr.remote_rd) + mr->ibmr.rkey = mr->hwmr.lkey; + return 0; +} + +struct ib_mr *ocrdma_get_dma_mr(struct ib_pd *ibpd, int acc) +{ + int status; + struct ocrdma_mr *mr; + struct ocrdma_pd *pd = get_ocrdma_pd(ibpd); + struct ocrdma_dev *dev = get_ocrdma_dev(ibpd->device); + + if (acc & IB_ACCESS_REMOTE_WRITE && !(acc & IB_ACCESS_LOCAL_WRITE)) { + pr_err("%s err, invalid access rights\n", __func__); + return ERR_PTR(-EINVAL); + } + + mr = kzalloc(sizeof(*mr), GFP_KERNEL); + if (!mr) + return ERR_PTR(-ENOMEM); + + status = ocrdma_alloc_lkey(dev, mr, pd->id, acc, 0, + OCRDMA_ADDR_CHECK_DISABLE); + if (status) { + kfree(mr); + return ERR_PTR(status); + } + + return &mr->ibmr; +} + +static void ocrdma_free_mr_pbl_tbl(struct ocrdma_dev *dev, + struct ocrdma_hw_mr *mr) +{ + struct pci_dev *pdev = dev->nic_info.pdev; + int i = 0; + + if (mr->pbl_table) { + for (i = 0; i < mr->num_pbls; i++) { + if (!mr->pbl_table[i].va) + continue; + dma_free_coherent(&pdev->dev, mr->pbl_size, + mr->pbl_table[i].va, + mr->pbl_table[i].pa); + } + kfree(mr->pbl_table); + mr->pbl_table = NULL; + } +} + +static int ocrdma_get_pbl_info(struct ocrdma_dev *dev, struct ocrdma_mr *mr, + u32 num_pbes) +{ + u32 num_pbls = 0; + u32 idx = 0; + int status = 0; + u32 pbl_size; + + do { + pbl_size = OCRDMA_MIN_HPAGE_SIZE * (1 << idx); + if (pbl_size > MAX_OCRDMA_PBL_SIZE) { + status = -EFAULT; + break; + } + num_pbls = roundup(num_pbes, (pbl_size / sizeof(u64))); + num_pbls = num_pbls / (pbl_size / sizeof(u64)); + idx++; + } while (num_pbls >= dev->attr.max_num_mr_pbl); + + mr->hwmr.num_pbes = num_pbes; + mr->hwmr.num_pbls = num_pbls; + mr->hwmr.pbl_size = pbl_size; + return status; +} + +static int ocrdma_build_pbl_tbl(struct ocrdma_dev *dev, struct ocrdma_hw_mr *mr) +{ + int status = 0; + int i; + u32 dma_len = mr->pbl_size; + struct pci_dev *pdev = dev->nic_info.pdev; + void *va; + dma_addr_t pa; + + mr->pbl_table = kzalloc(sizeof(struct 
ocrdma_pbl) * + mr->num_pbls, GFP_KERNEL); + + if (!mr->pbl_table) + return -ENOMEM; + + for (i = 0; i < mr->num_pbls; i++) { + va = dma_alloc_coherent(&pdev->dev, dma_len, &pa, GFP_KERNEL); + if (!va) { + ocrdma_free_mr_pbl_tbl(dev, mr); + status = -ENOMEM; + break; + } + memset(va, 0, dma_len); + mr->pbl_table[i].va = va; + mr->pbl_table[i].pa = pa; + } + return status; +} + +static void build_user_pbes(struct ocrdma_dev *dev, struct ocrdma_mr *mr, + u32 num_pbes) +{ + struct ocrdma_pbe *pbe; + struct scatterlist *sg; + struct ocrdma_pbl *pbl_tbl = mr->hwmr.pbl_table; + struct ib_umem *umem = mr->umem; + int shift, pg_cnt, pages, pbe_cnt, entry, total_num_pbes = 0; + + if (!mr->hwmr.num_pbes) + return; + + pbe = (struct ocrdma_pbe *)pbl_tbl->va; + pbe_cnt = 0; + + shift = ilog2(umem->page_size); + + for_each_sg(umem->sg_head.sgl, sg, umem->nmap, entry) { + pages = sg_dma_len(sg) >> shift; + for (pg_cnt = 0; pg_cnt < pages; pg_cnt++) { + /* store the page address in pbe */ + pbe->pa_lo = + cpu_to_le32(sg_dma_address + (sg) + + (umem->page_size * pg_cnt)); + pbe->pa_hi = + cpu_to_le32(upper_32_bits + ((sg_dma_address + (sg) + + umem->page_size * pg_cnt))); + pbe_cnt += 1; + total_num_pbes += 1; + pbe++; + + /* if done building pbes, issue the mbx cmd. */ + if (total_num_pbes == num_pbes) + return; + + /* if the given pbl is full storing the pbes, + * move to next pbl. + */ + if (pbe_cnt == + (mr->hwmr.pbl_size / sizeof(u64))) { + pbl_tbl++; + pbe = (struct ocrdma_pbe *)pbl_tbl->va; + pbe_cnt = 0; + } + + } + } +} + +struct ib_mr *ocrdma_reg_user_mr(struct ib_pd *ibpd, u64 start, u64 len, + u64 usr_addr, int acc, struct ib_udata *udata) +{ + int status = -ENOMEM; + struct ocrdma_dev *dev = get_ocrdma_dev(ibpd->device); + struct ocrdma_mr *mr; + struct ocrdma_pd *pd; + u32 num_pbes; + + pd = get_ocrdma_pd(ibpd); + + if (acc & IB_ACCESS_REMOTE_WRITE && !(acc & IB_ACCESS_LOCAL_WRITE)) + return ERR_PTR(-EINVAL); + + mr = kzalloc(sizeof(*mr), GFP_KERNEL); + if (!mr) + return ERR_PTR(status); + mr->umem = ib_umem_get(ibpd->uobject->context, start, len, acc, 0); + if (IS_ERR(mr->umem)) { + status = -EFAULT; + goto umem_err; + } + num_pbes = ib_umem_page_count(mr->umem); + status = ocrdma_get_pbl_info(dev, mr, num_pbes); + if (status) + goto umem_err; + + mr->hwmr.pbe_size = mr->umem->page_size; + mr->hwmr.fbo = ib_umem_offset(mr->umem); + mr->hwmr.va = usr_addr; + mr->hwmr.len = len; + mr->hwmr.remote_wr = (acc & IB_ACCESS_REMOTE_WRITE) ? 1 : 0; + mr->hwmr.remote_rd = (acc & IB_ACCESS_REMOTE_READ) ? 1 : 0; + mr->hwmr.local_wr = (acc & IB_ACCESS_LOCAL_WRITE) ? 1 : 0; + mr->hwmr.local_rd = 1; + mr->hwmr.remote_atomic = (acc & IB_ACCESS_REMOTE_ATOMIC) ? 1 : 0; + status = ocrdma_build_pbl_tbl(dev, &mr->hwmr); + if (status) + goto umem_err; + build_user_pbes(dev, mr, num_pbes); + status = ocrdma_reg_mr(dev, &mr->hwmr, pd->id, acc); + if (status) + goto mbx_err; + mr->ibmr.lkey = mr->hwmr.lkey; + if (mr->hwmr.remote_wr || mr->hwmr.remote_rd) + mr->ibmr.rkey = mr->hwmr.lkey; + + return &mr->ibmr; + +mbx_err: + ocrdma_free_mr_pbl_tbl(dev, &mr->hwmr); +umem_err: + kfree(mr); + return ERR_PTR(status); +} + +int ocrdma_dereg_mr(struct ib_mr *ib_mr) +{ + struct ocrdma_mr *mr = get_ocrdma_mr(ib_mr); + struct ocrdma_dev *dev = get_ocrdma_dev(ib_mr->device); + + (void) ocrdma_mbx_dealloc_lkey(dev, mr->hwmr.fr_mr, mr->hwmr.lkey); + + ocrdma_free_mr_pbl_tbl(dev, &mr->hwmr); + + /* it could be user registered memory. 
*/ + if (mr->umem) + ib_umem_release(mr->umem); + kfree(mr); + + /* Don't stop cleanup, in case FW is unresponsive */ + if (dev->mqe_ctx.fw_error_state) { + pr_err("%s(%d) fw not responding.\n", + __func__, dev->id); + } + return 0; +} + +static int ocrdma_copy_cq_uresp(struct ocrdma_dev *dev, struct ocrdma_cq *cq, + struct ib_udata *udata, + struct ib_ucontext *ib_ctx) +{ + int status; + struct ocrdma_ucontext *uctx = get_ocrdma_ucontext(ib_ctx); + struct ocrdma_create_cq_uresp uresp; + + memset(&uresp, 0, sizeof(uresp)); + uresp.cq_id = cq->id; + uresp.page_size = PAGE_ALIGN(cq->len); + uresp.num_pages = 1; + uresp.max_hw_cqe = cq->max_hw_cqe; + uresp.page_addr[0] = virt_to_phys(cq->va); + uresp.db_page_addr = ocrdma_get_db_addr(dev, uctx->cntxt_pd->id); + uresp.db_page_size = dev->nic_info.db_page_size; + uresp.phase_change = cq->phase_change ? 1 : 0; + status = ib_copy_to_udata(udata, &uresp, sizeof(uresp)); + if (status) { + pr_err("%s(%d) copy error cqid=0x%x.\n", + __func__, dev->id, cq->id); + goto err; + } + status = ocrdma_add_mmap(uctx, uresp.db_page_addr, uresp.db_page_size); + if (status) + goto err; + status = ocrdma_add_mmap(uctx, uresp.page_addr[0], uresp.page_size); + if (status) { + ocrdma_del_mmap(uctx, uresp.db_page_addr, uresp.db_page_size); + goto err; + } + cq->ucontext = uctx; +err: + return status; +} + +struct ib_cq *ocrdma_create_cq(struct ib_device *ibdev, int entries, int vector, + struct ib_ucontext *ib_ctx, + struct ib_udata *udata) +{ + struct ocrdma_cq *cq; + struct ocrdma_dev *dev = get_ocrdma_dev(ibdev); + struct ocrdma_ucontext *uctx = NULL; + u16 pd_id = 0; + int status; + struct ocrdma_create_cq_ureq ureq; + + if (udata) { + if (ib_copy_from_udata(&ureq, udata, sizeof(ureq))) + return ERR_PTR(-EFAULT); + } else + ureq.dpp_cq = 0; + cq = kzalloc(sizeof(*cq), GFP_KERNEL); + if (!cq) + return ERR_PTR(-ENOMEM); + + spin_lock_init(&cq->cq_lock); + spin_lock_init(&cq->comp_handler_lock); + INIT_LIST_HEAD(&cq->sq_head); + INIT_LIST_HEAD(&cq->rq_head); + cq->first_arm = true; + + if (ib_ctx) { + uctx = get_ocrdma_ucontext(ib_ctx); + pd_id = uctx->cntxt_pd->id; + } + + status = ocrdma_mbx_create_cq(dev, cq, entries, ureq.dpp_cq, pd_id); + if (status) { + kfree(cq); + return ERR_PTR(status); + } + if (ib_ctx) { + status = ocrdma_copy_cq_uresp(dev, cq, udata, ib_ctx); + if (status) + goto ctx_err; + } + cq->phase = OCRDMA_CQE_VALID; + dev->cq_tbl[cq->id] = cq; + return &cq->ibcq; + +ctx_err: + ocrdma_mbx_destroy_cq(dev, cq); + kfree(cq); + return ERR_PTR(status); +} + +int ocrdma_resize_cq(struct ib_cq *ibcq, int new_cnt, + struct ib_udata *udata) +{ + int status = 0; + struct ocrdma_cq *cq = get_ocrdma_cq(ibcq); + + if (new_cnt < 1 || new_cnt > cq->max_hw_cqe) { + status = -EINVAL; + return status; + } + ibcq->cqe = new_cnt; + return status; +} + +static void ocrdma_flush_cq(struct ocrdma_cq *cq) +{ + int cqe_cnt; + int valid_count = 0; + unsigned long flags; + + struct ocrdma_dev *dev = get_ocrdma_dev(cq->ibcq.device); + struct ocrdma_cqe *cqe = NULL; + + cqe = cq->va; + cqe_cnt = cq->cqe_cnt; + + /* Last irq might have scheduled a polling thread + * sync-up with it before hard flushing. 
+ */ + spin_lock_irqsave(&cq->cq_lock, flags); + while (cqe_cnt) { + if (is_cqe_valid(cq, cqe)) + valid_count++; + cqe++; + cqe_cnt--; + } + ocrdma_ring_cq_db(dev, cq->id, false, false, valid_count); + spin_unlock_irqrestore(&cq->cq_lock, flags); +} + +int ocrdma_destroy_cq(struct ib_cq *ibcq) +{ + struct ocrdma_cq *cq = get_ocrdma_cq(ibcq); + struct ocrdma_eq *eq = NULL; + struct ocrdma_dev *dev = get_ocrdma_dev(ibcq->device); + int pdid = 0; + u32 irq, indx; + + dev->cq_tbl[cq->id] = NULL; + indx = ocrdma_get_eq_table_index(dev, cq->eqn); + if (indx == -EINVAL) + BUG(); + + eq = &dev->eq_tbl[indx]; + irq = ocrdma_get_irq(dev, eq); + synchronize_irq(irq); + ocrdma_flush_cq(cq); + + (void)ocrdma_mbx_destroy_cq(dev, cq); + if (cq->ucontext) { + pdid = cq->ucontext->cntxt_pd->id; + ocrdma_del_mmap(cq->ucontext, (u64) cq->pa, + PAGE_ALIGN(cq->len)); + ocrdma_del_mmap(cq->ucontext, + ocrdma_get_db_addr(dev, pdid), + dev->nic_info.db_page_size); + } + + kfree(cq); + return 0; +} + +static int ocrdma_add_qpn_map(struct ocrdma_dev *dev, struct ocrdma_qp *qp) +{ + int status = -EINVAL; + + if (qp->id < OCRDMA_MAX_QP && dev->qp_tbl[qp->id] == NULL) { + dev->qp_tbl[qp->id] = qp; + status = 0; + } + return status; +} + +static void ocrdma_del_qpn_map(struct ocrdma_dev *dev, struct ocrdma_qp *qp) +{ + dev->qp_tbl[qp->id] = NULL; +} + +static int ocrdma_check_qp_params(struct ib_pd *ibpd, struct ocrdma_dev *dev, + struct ib_qp_init_attr *attrs) +{ + if ((attrs->qp_type != IB_QPT_GSI) && + (attrs->qp_type != IB_QPT_RC) && + (attrs->qp_type != IB_QPT_UC) && + (attrs->qp_type != IB_QPT_UD)) { + pr_err("%s(%d) unsupported qp type=0x%x requested\n", + __func__, dev->id, attrs->qp_type); + return -EINVAL; + } + /* Skip the check for QP1 to support CM size of 128 */ + if ((attrs->qp_type != IB_QPT_GSI) && + (attrs->cap.max_send_wr > dev->attr.max_wqe)) { + pr_err("%s(%d) unsupported send_wr=0x%x requested\n", + __func__, dev->id, attrs->cap.max_send_wr); + pr_err("%s(%d) supported send_wr=0x%x\n", + __func__, dev->id, dev->attr.max_wqe); + return -EINVAL; + } + if (!attrs->srq && (attrs->cap.max_recv_wr > dev->attr.max_rqe)) { + pr_err("%s(%d) unsupported recv_wr=0x%x requested\n", + __func__, dev->id, attrs->cap.max_recv_wr); + pr_err("%s(%d) supported recv_wr=0x%x\n", + __func__, dev->id, dev->attr.max_rqe); + return -EINVAL; + } + if (attrs->cap.max_inline_data > dev->attr.max_inline_data) { + pr_err("%s(%d) unsupported inline data size=0x%x requested\n", + __func__, dev->id, attrs->cap.max_inline_data); + pr_err("%s(%d) supported inline data size=0x%x\n", + __func__, dev->id, dev->attr.max_inline_data); + return -EINVAL; + } + if (attrs->cap.max_send_sge > dev->attr.max_send_sge) { + pr_err("%s(%d) unsupported send_sge=0x%x requested\n", + __func__, dev->id, attrs->cap.max_send_sge); + pr_err("%s(%d) supported send_sge=0x%x\n", + __func__, dev->id, dev->attr.max_send_sge); + return -EINVAL; + } + if (attrs->cap.max_recv_sge > dev->attr.max_recv_sge) { + pr_err("%s(%d) unsupported recv_sge=0x%x requested\n", + __func__, dev->id, attrs->cap.max_recv_sge); + pr_err("%s(%d) supported recv_sge=0x%x\n", + __func__, dev->id, dev->attr.max_recv_sge); + return -EINVAL; + } + /* unprivileged user space cannot create special QP */ + if (ibpd->uobject && attrs->qp_type == IB_QPT_GSI) { + pr_err + ("%s(%d) Userspace can't create special QPs of type=0x%x\n", + __func__, dev->id, attrs->qp_type); + return -EINVAL; + } + /* allow creating only one GSI type of QP */ + if (attrs->qp_type == IB_QPT_GSI && 
dev->gsi_qp_created) { + pr_err("%s(%d) GSI special QPs already created.\n", + __func__, dev->id); + return -EINVAL; + } + /* verify consumer QPs are not trying to use GSI QP's CQ */ + if ((attrs->qp_type != IB_QPT_GSI) && (dev->gsi_qp_created)) { + if ((dev->gsi_sqcq == get_ocrdma_cq(attrs->send_cq)) || + (dev->gsi_rqcq == get_ocrdma_cq(attrs->recv_cq))) { + pr_err("%s(%d) Consumer QP cannot use GSI CQs.\n", + __func__, dev->id); + return -EINVAL; + } + } + return 0; +} + +static int ocrdma_copy_qp_uresp(struct ocrdma_qp *qp, + struct ib_udata *udata, int dpp_offset, + int dpp_credit_lmt, int srq) +{ + int status = 0; + u64 usr_db; + struct ocrdma_create_qp_uresp uresp; + struct ocrdma_pd *pd = qp->pd; + struct ocrdma_dev *dev = get_ocrdma_dev(pd->ibpd.device); + + memset(&uresp, 0, sizeof(uresp)); + usr_db = dev->nic_info.unmapped_db + + (pd->id * dev->nic_info.db_page_size); + uresp.qp_id = qp->id; + uresp.sq_dbid = qp->sq.dbid; + uresp.num_sq_pages = 1; + uresp.sq_page_size = PAGE_ALIGN(qp->sq.len); + uresp.sq_page_addr[0] = virt_to_phys(qp->sq.va); + uresp.num_wqe_allocated = qp->sq.max_cnt; + if (!srq) { + uresp.rq_dbid = qp->rq.dbid; + uresp.num_rq_pages = 1; + uresp.rq_page_size = PAGE_ALIGN(qp->rq.len); + uresp.rq_page_addr[0] = virt_to_phys(qp->rq.va); + uresp.num_rqe_allocated = qp->rq.max_cnt; + } + uresp.db_page_addr = usr_db; + uresp.db_page_size = dev->nic_info.db_page_size; + uresp.db_sq_offset = OCRDMA_DB_GEN2_SQ_OFFSET; + uresp.db_rq_offset = OCRDMA_DB_GEN2_RQ_OFFSET; + uresp.db_shift = OCRDMA_DB_RQ_SHIFT; + + if (qp->dpp_enabled) { + uresp.dpp_credit = dpp_credit_lmt; + uresp.dpp_offset = dpp_offset; + } + status = ib_copy_to_udata(udata, &uresp, sizeof(uresp)); + if (status) { + pr_err("%s(%d) user copy error.\n", __func__, dev->id); + goto err; + } + status = ocrdma_add_mmap(pd->uctx, uresp.sq_page_addr[0], + uresp.sq_page_size); + if (status) + goto err; + + if (!srq) { + status = ocrdma_add_mmap(pd->uctx, uresp.rq_page_addr[0], + uresp.rq_page_size); + if (status) + goto rq_map_err; + } + return status; +rq_map_err: + ocrdma_del_mmap(pd->uctx, uresp.sq_page_addr[0], uresp.sq_page_size); +err: + return status; +} + +static void ocrdma_set_qp_db(struct ocrdma_dev *dev, struct ocrdma_qp *qp, + struct ocrdma_pd *pd) +{ + if (ocrdma_get_asic_type(dev) == OCRDMA_ASIC_GEN_SKH_R) { + qp->sq_db = dev->nic_info.db + + (pd->id * dev->nic_info.db_page_size) + + OCRDMA_DB_GEN2_SQ_OFFSET; + qp->rq_db = dev->nic_info.db + + (pd->id * dev->nic_info.db_page_size) + + OCRDMA_DB_GEN2_RQ_OFFSET; + } else { + qp->sq_db = dev->nic_info.db + + (pd->id * dev->nic_info.db_page_size) + + OCRDMA_DB_SQ_OFFSET; + qp->rq_db = dev->nic_info.db + + (pd->id * dev->nic_info.db_page_size) + + OCRDMA_DB_RQ_OFFSET; + } +} + +static int ocrdma_alloc_wr_id_tbl(struct ocrdma_qp *qp) +{ + qp->wqe_wr_id_tbl = + kzalloc(sizeof(*(qp->wqe_wr_id_tbl)) * qp->sq.max_cnt, + GFP_KERNEL); + if (qp->wqe_wr_id_tbl == NULL) + return -ENOMEM; + qp->rqe_wr_id_tbl = + kzalloc(sizeof(u64) * qp->rq.max_cnt, GFP_KERNEL); + if (qp->rqe_wr_id_tbl == NULL) + return -ENOMEM; + + return 0; +} + +static void ocrdma_set_qp_init_params(struct ocrdma_qp *qp, + struct ocrdma_pd *pd, + struct ib_qp_init_attr *attrs) +{ + qp->pd = pd; + spin_lock_init(&qp->q_lock); + INIT_LIST_HEAD(&qp->sq_entry); + INIT_LIST_HEAD(&qp->rq_entry); + + qp->qp_type = attrs->qp_type; + qp->cap_flags = OCRDMA_QP_INB_RD | OCRDMA_QP_INB_WR; + qp->max_inline_data = attrs->cap.max_inline_data; + qp->sq.max_sges = attrs->cap.max_send_sge; + qp->rq.max_sges = 
attrs->cap.max_recv_sge; + qp->state = OCRDMA_QPS_RST; + qp->signaled = (attrs->sq_sig_type == IB_SIGNAL_ALL_WR) ? true : false; +} + +static void ocrdma_store_gsi_qp_cq(struct ocrdma_dev *dev, + struct ib_qp_init_attr *attrs) +{ + if (attrs->qp_type == IB_QPT_GSI) { + dev->gsi_qp_created = 1; + dev->gsi_sqcq = get_ocrdma_cq(attrs->send_cq); + dev->gsi_rqcq = get_ocrdma_cq(attrs->recv_cq); + } +} + +struct ib_qp *ocrdma_create_qp(struct ib_pd *ibpd, + struct ib_qp_init_attr *attrs, + struct ib_udata *udata) +{ + int status; + struct ocrdma_pd *pd = get_ocrdma_pd(ibpd); + struct ocrdma_qp *qp; + struct ocrdma_dev *dev = get_ocrdma_dev(ibpd->device); + struct ocrdma_create_qp_ureq ureq; + u16 dpp_credit_lmt, dpp_offset; + + status = ocrdma_check_qp_params(ibpd, dev, attrs); + if (status) + goto gen_err; + + memset(&ureq, 0, sizeof(ureq)); + if (udata) { + if (ib_copy_from_udata(&ureq, udata, sizeof(ureq))) + return ERR_PTR(-EFAULT); + } + qp = kzalloc(sizeof(*qp), GFP_KERNEL); + if (!qp) { + status = -ENOMEM; + goto gen_err; + } + ocrdma_set_qp_init_params(qp, pd, attrs); + if (udata == NULL) + qp->cap_flags |= (OCRDMA_QP_MW_BIND | OCRDMA_QP_LKEY0 | + OCRDMA_QP_FAST_REG); + + mutex_lock(&dev->dev_lock); + status = ocrdma_mbx_create_qp(qp, attrs, ureq.enable_dpp_cq, + ureq.dpp_cq_id, + &dpp_offset, &dpp_credit_lmt); + if (status) + goto mbx_err; + + /* user space QP's wr_id table are managed in library */ + if (udata == NULL) { + status = ocrdma_alloc_wr_id_tbl(qp); + if (status) + goto map_err; + } + + status = ocrdma_add_qpn_map(dev, qp); + if (status) + goto map_err; + ocrdma_set_qp_db(dev, qp, pd); + if (udata) { + status = ocrdma_copy_qp_uresp(qp, udata, dpp_offset, + dpp_credit_lmt, + (attrs->srq != NULL)); + if (status) + goto cpy_err; + } + ocrdma_store_gsi_qp_cq(dev, attrs); + qp->ibqp.qp_num = qp->id; + mutex_unlock(&dev->dev_lock); + return &qp->ibqp; + +cpy_err: + ocrdma_del_qpn_map(dev, qp); +map_err: + ocrdma_mbx_destroy_qp(dev, qp); +mbx_err: + mutex_unlock(&dev->dev_lock); + kfree(qp->wqe_wr_id_tbl); + kfree(qp->rqe_wr_id_tbl); + kfree(qp); + pr_err("%s(%d) error=%d\n", __func__, dev->id, status); +gen_err: + return ERR_PTR(status); +} + +int _ocrdma_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, + int attr_mask) +{ + int status = 0; + struct ocrdma_qp *qp; + struct ocrdma_dev *dev; + enum ib_qp_state old_qps; + + qp = get_ocrdma_qp(ibqp); + dev = get_ocrdma_dev(ibqp->device); + if (attr_mask & IB_QP_STATE) + status = ocrdma_qp_state_change(qp, attr->qp_state, &old_qps); + /* if new and previous states are same hw doesn't need to + * know about it. 
+ */ + if (status < 0) + return status; + status = ocrdma_mbx_modify_qp(dev, qp, attr, attr_mask); + + return status; +} + +int ocrdma_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, + int attr_mask, struct ib_udata *udata) +{ + unsigned long flags; + int status = -EINVAL; + struct ocrdma_qp *qp; + struct ocrdma_dev *dev; + enum ib_qp_state old_qps, new_qps; + + qp = get_ocrdma_qp(ibqp); + dev = get_ocrdma_dev(ibqp->device); + + /* syncronize with multiple context trying to change, retrive qps */ + mutex_lock(&dev->dev_lock); + /* syncronize with wqe, rqe posting and cqe processing contexts */ + spin_lock_irqsave(&qp->q_lock, flags); + old_qps = get_ibqp_state(qp->state); + if (attr_mask & IB_QP_STATE) + new_qps = attr->qp_state; + else + new_qps = old_qps; + spin_unlock_irqrestore(&qp->q_lock, flags); + + if (!ib_modify_qp_is_ok(old_qps, new_qps, ibqp->qp_type, attr_mask, + IB_LINK_LAYER_ETHERNET)) { + pr_err("%s(%d) invalid attribute mask=0x%x specified for\n" + "qpn=0x%x of type=0x%x old_qps=0x%x, new_qps=0x%x\n", + __func__, dev->id, attr_mask, qp->id, ibqp->qp_type, + old_qps, new_qps); + goto param_err; + } + + status = _ocrdma_modify_qp(ibqp, attr, attr_mask); + if (status > 0) + status = 0; +param_err: + mutex_unlock(&dev->dev_lock); + return status; +} + +static enum ib_mtu ocrdma_mtu_int_to_enum(u16 mtu) +{ + switch (mtu) { + case 256: + return IB_MTU_256; + case 512: + return IB_MTU_512; + case 1024: + return IB_MTU_1024; + case 2048: + return IB_MTU_2048; + case 4096: + return IB_MTU_4096; + default: + return IB_MTU_1024; + } +} + +static int ocrdma_to_ib_qp_acc_flags(int qp_cap_flags) +{ + int ib_qp_acc_flags = 0; + + if (qp_cap_flags & OCRDMA_QP_INB_WR) + ib_qp_acc_flags |= IB_ACCESS_REMOTE_WRITE; + if (qp_cap_flags & OCRDMA_QP_INB_RD) + ib_qp_acc_flags |= IB_ACCESS_LOCAL_WRITE; + return ib_qp_acc_flags; +} + +int ocrdma_query_qp(struct ib_qp *ibqp, + struct ib_qp_attr *qp_attr, + int attr_mask, struct ib_qp_init_attr *qp_init_attr) +{ + int status; + u32 qp_state; + struct ocrdma_qp_params params; + struct ocrdma_qp *qp = get_ocrdma_qp(ibqp); + struct ocrdma_dev *dev = get_ocrdma_dev(ibqp->device); + + memset(¶ms, 0, sizeof(params)); + mutex_lock(&dev->dev_lock); + status = ocrdma_mbx_query_qp(dev, qp, ¶ms); + mutex_unlock(&dev->dev_lock); + if (status) + goto mbx_err; + if (qp->qp_type == IB_QPT_UD) + qp_attr->qkey = params.qkey; + qp_attr->path_mtu = + ocrdma_mtu_int_to_enum(params.path_mtu_pkey_indx & + OCRDMA_QP_PARAMS_PATH_MTU_MASK) >> + OCRDMA_QP_PARAMS_PATH_MTU_SHIFT; + qp_attr->path_mig_state = IB_MIG_MIGRATED; + qp_attr->rq_psn = params.hop_lmt_rq_psn & OCRDMA_QP_PARAMS_RQ_PSN_MASK; + qp_attr->sq_psn = params.tclass_sq_psn & OCRDMA_QP_PARAMS_SQ_PSN_MASK; + qp_attr->dest_qp_num = + params.ack_to_rnr_rtc_dest_qpn & OCRDMA_QP_PARAMS_DEST_QPN_MASK; + + qp_attr->qp_access_flags = ocrdma_to_ib_qp_acc_flags(qp->cap_flags); + qp_attr->cap.max_send_wr = qp->sq.max_cnt - 1; + qp_attr->cap.max_recv_wr = qp->rq.max_cnt - 1; + qp_attr->cap.max_send_sge = qp->sq.max_sges; + qp_attr->cap.max_recv_sge = qp->rq.max_sges; + qp_attr->cap.max_inline_data = qp->max_inline_data; + qp_init_attr->cap = qp_attr->cap; + memcpy(&qp_attr->ah_attr.grh.dgid, ¶ms.dgid[0], + sizeof(params.dgid)); + qp_attr->ah_attr.grh.flow_label = params.rnt_rc_sl_fl & + OCRDMA_QP_PARAMS_FLOW_LABEL_MASK; + qp_attr->ah_attr.grh.sgid_index = qp->sgid_idx; + qp_attr->ah_attr.grh.hop_limit = (params.hop_lmt_rq_psn & + OCRDMA_QP_PARAMS_HOP_LMT_MASK) >> + OCRDMA_QP_PARAMS_HOP_LMT_SHIFT; + 
qp_attr->ah_attr.grh.traffic_class = (params.tclass_sq_psn & + OCRDMA_QP_PARAMS_TCLASS_MASK) >> + OCRDMA_QP_PARAMS_TCLASS_SHIFT; + + qp_attr->ah_attr.ah_flags = IB_AH_GRH; + qp_attr->ah_attr.port_num = 1; + qp_attr->ah_attr.sl = (params.rnt_rc_sl_fl & + OCRDMA_QP_PARAMS_SL_MASK) >> + OCRDMA_QP_PARAMS_SL_SHIFT; + qp_attr->timeout = (params.ack_to_rnr_rtc_dest_qpn & + OCRDMA_QP_PARAMS_ACK_TIMEOUT_MASK) >> + OCRDMA_QP_PARAMS_ACK_TIMEOUT_SHIFT; + qp_attr->rnr_retry = (params.ack_to_rnr_rtc_dest_qpn & + OCRDMA_QP_PARAMS_RNR_RETRY_CNT_MASK) >> + OCRDMA_QP_PARAMS_RNR_RETRY_CNT_SHIFT; + qp_attr->retry_cnt = + (params.rnt_rc_sl_fl & OCRDMA_QP_PARAMS_RETRY_CNT_MASK) >> + OCRDMA_QP_PARAMS_RETRY_CNT_SHIFT; + qp_attr->min_rnr_timer = 0; + qp_attr->pkey_index = 0; + qp_attr->port_num = 1; + qp_attr->ah_attr.src_path_bits = 0; + qp_attr->ah_attr.static_rate = 0; + qp_attr->alt_pkey_index = 0; + qp_attr->alt_port_num = 0; + qp_attr->alt_timeout = 0; + memset(&qp_attr->alt_ah_attr, 0, sizeof(qp_attr->alt_ah_attr)); + qp_state = (params.max_sge_recv_flags & OCRDMA_QP_PARAMS_STATE_MASK) >> + OCRDMA_QP_PARAMS_STATE_SHIFT; + qp_attr->qp_state = get_ibqp_state(qp_state); + qp_attr->cur_qp_state = qp_attr->qp_state; + qp_attr->sq_draining = (qp_state == OCRDMA_QPS_SQ_DRAINING) ? 1 : 0; + qp_attr->max_dest_rd_atomic = + params.max_ord_ird >> OCRDMA_QP_PARAMS_MAX_ORD_SHIFT; + qp_attr->max_rd_atomic = + params.max_ord_ird & OCRDMA_QP_PARAMS_MAX_IRD_MASK; + qp_attr->en_sqd_async_notify = (params.max_sge_recv_flags & + OCRDMA_QP_PARAMS_FLAGS_SQD_ASYNC) ? 1 : 0; + /* Sync driver QP state with FW */ + ocrdma_qp_state_change(qp, qp_attr->qp_state, NULL); +mbx_err: + return status; +} + +static void ocrdma_srq_toggle_bit(struct ocrdma_srq *srq, unsigned int idx) +{ + unsigned int i = idx / 32; + u32 mask = (1U << (idx % 32)); + + srq->idx_bit_fields[i] ^= mask; +} + +static int ocrdma_hwq_free_cnt(struct ocrdma_qp_hwq_info *q) +{ + return ((q->max_wqe_idx - q->head) + q->tail) % q->max_cnt; +} + +static int is_hw_sq_empty(struct ocrdma_qp *qp) +{ + return (qp->sq.tail == qp->sq.head); +} + +static int is_hw_rq_empty(struct ocrdma_qp *qp) +{ + return (qp->rq.tail == qp->rq.head); +} + +static void *ocrdma_hwq_head(struct ocrdma_qp_hwq_info *q) +{ + return q->va + (q->head * q->entry_size); +} + +static void *ocrdma_hwq_head_from_idx(struct ocrdma_qp_hwq_info *q, + u32 idx) +{ + return q->va + (idx * q->entry_size); +} + +static void ocrdma_hwq_inc_head(struct ocrdma_qp_hwq_info *q) +{ + q->head = (q->head + 1) & q->max_wqe_idx; +} + +static void ocrdma_hwq_inc_tail(struct ocrdma_qp_hwq_info *q) +{ + q->tail = (q->tail + 1) & q->max_wqe_idx; +} + +/* discard the cqe for a given QP */ +static void ocrdma_discard_cqes(struct ocrdma_qp *qp, struct ocrdma_cq *cq) +{ + unsigned long cq_flags; + unsigned long flags; + int discard_cnt = 0; + u32 cur_getp, stop_getp; + struct ocrdma_cqe *cqe; + u32 qpn = 0, wqe_idx = 0; + + spin_lock_irqsave(&cq->cq_lock, cq_flags); + + /* traverse through the CQEs in the hw CQ, + * find the matching CQE for a given qp, + * mark the matching one discarded by clearing qpn. + * ring the doorbell in the poll_cq() as + * we don't complete out of order cqe. + */ + + cur_getp = cq->getp; + /* find upto when do we reap the cq. */ + stop_getp = cur_getp; + do { + if (is_hw_sq_empty(qp) && (!qp->srq && is_hw_rq_empty(qp))) + break; + + cqe = cq->va + cur_getp; + /* if (a) done reaping whole hw cq, or + * (b) qp_xq becomes empty. 
+ * then exit + */ + qpn = cqe->cmn.qpn & OCRDMA_CQE_QPN_MASK; + /* if previously discarded cqe found, skip that too. */ + /* check for matching qp */ + if (qpn == 0 || qpn != qp->id) + goto skip_cqe; + + if (is_cqe_for_sq(cqe)) { + ocrdma_hwq_inc_tail(&qp->sq); + } else { + if (qp->srq) { + wqe_idx = (le32_to_cpu(cqe->rq.buftag_qpn) >> + OCRDMA_CQE_BUFTAG_SHIFT) & + qp->srq->rq.max_wqe_idx; + if (wqe_idx < 1) + BUG(); + spin_lock_irqsave(&qp->srq->q_lock, flags); + ocrdma_hwq_inc_tail(&qp->srq->rq); + ocrdma_srq_toggle_bit(qp->srq, wqe_idx - 1); + spin_unlock_irqrestore(&qp->srq->q_lock, flags); + + } else { + ocrdma_hwq_inc_tail(&qp->rq); + } + } + /* mark cqe discarded so that it is not picked up later + * in the poll_cq(). + */ + discard_cnt += 1; + cqe->cmn.qpn = 0; +skip_cqe: + cur_getp = (cur_getp + 1) % cq->max_hw_cqe; + } while (cur_getp != stop_getp); + spin_unlock_irqrestore(&cq->cq_lock, cq_flags); +} + +void ocrdma_del_flush_qp(struct ocrdma_qp *qp) +{ + int found = false; + unsigned long flags; + struct ocrdma_dev *dev = get_ocrdma_dev(qp->ibqp.device); + /* sync with any active CQ poll */ + + spin_lock_irqsave(&dev->flush_q_lock, flags); + found = ocrdma_is_qp_in_sq_flushlist(qp->sq_cq, qp); + if (found) + list_del(&qp->sq_entry); + if (!qp->srq) { + found = ocrdma_is_qp_in_rq_flushlist(qp->rq_cq, qp); + if (found) + list_del(&qp->rq_entry); + } + spin_unlock_irqrestore(&dev->flush_q_lock, flags); +} + +int ocrdma_destroy_qp(struct ib_qp *ibqp) +{ + struct ocrdma_pd *pd; + struct ocrdma_qp *qp; + struct ocrdma_dev *dev; + struct ib_qp_attr attrs; + int attr_mask; + unsigned long flags; + + qp = get_ocrdma_qp(ibqp); + dev = get_ocrdma_dev(ibqp->device); + + pd = qp->pd; + + /* change the QP state to ERROR */ + if (qp->state != OCRDMA_QPS_RST) { + attrs.qp_state = IB_QPS_ERR; + attr_mask = IB_QP_STATE; + _ocrdma_modify_qp(ibqp, &attrs, attr_mask); + } + /* ensure that CQEs for newly created QP (whose id may be same with + * one which just getting destroyed are same), dont get + * discarded until the old CQEs are discarded. + */ + mutex_lock(&dev->dev_lock); + (void) ocrdma_mbx_destroy_qp(dev, qp); + + /* + * acquire CQ lock while destroy is in progress, in order to + * protect against proessing in-flight CQEs for this QP. 
+ */ + spin_lock_irqsave(&qp->sq_cq->cq_lock, flags); + if (qp->rq_cq && (qp->rq_cq != qp->sq_cq)) + spin_lock(&qp->rq_cq->cq_lock); + + ocrdma_del_qpn_map(dev, qp); + + if (qp->rq_cq && (qp->rq_cq != qp->sq_cq)) + spin_unlock(&qp->rq_cq->cq_lock); + spin_unlock_irqrestore(&qp->sq_cq->cq_lock, flags); + + if (!pd->uctx) { + ocrdma_discard_cqes(qp, qp->sq_cq); + ocrdma_discard_cqes(qp, qp->rq_cq); + } + mutex_unlock(&dev->dev_lock); + + if (pd->uctx) { + ocrdma_del_mmap(pd->uctx, (u64) qp->sq.pa, + PAGE_ALIGN(qp->sq.len)); + if (!qp->srq) + ocrdma_del_mmap(pd->uctx, (u64) qp->rq.pa, + PAGE_ALIGN(qp->rq.len)); + } + + ocrdma_del_flush_qp(qp); + + kfree(qp->wqe_wr_id_tbl); + kfree(qp->rqe_wr_id_tbl); + kfree(qp); + return 0; +} + +static int ocrdma_copy_srq_uresp(struct ocrdma_dev *dev, struct ocrdma_srq *srq, + struct ib_udata *udata) +{ + int status; + struct ocrdma_create_srq_uresp uresp; + + memset(&uresp, 0, sizeof(uresp)); + uresp.rq_dbid = srq->rq.dbid; + uresp.num_rq_pages = 1; + uresp.rq_page_addr[0] = virt_to_phys(srq->rq.va); + uresp.rq_page_size = srq->rq.len; + uresp.db_page_addr = dev->nic_info.unmapped_db + + (srq->pd->id * dev->nic_info.db_page_size); + uresp.db_page_size = dev->nic_info.db_page_size; + uresp.num_rqe_allocated = srq->rq.max_cnt; + if (ocrdma_get_asic_type(dev) == OCRDMA_ASIC_GEN_SKH_R) { + uresp.db_rq_offset = OCRDMA_DB_GEN2_RQ_OFFSET; + uresp.db_shift = 24; + } else { + uresp.db_rq_offset = OCRDMA_DB_RQ_OFFSET; + uresp.db_shift = 16; + } + + status = ib_copy_to_udata(udata, &uresp, sizeof(uresp)); + if (status) + return status; + status = ocrdma_add_mmap(srq->pd->uctx, uresp.rq_page_addr[0], + uresp.rq_page_size); + if (status) + return status; + return status; +} + +struct ib_srq *ocrdma_create_srq(struct ib_pd *ibpd, + struct ib_srq_init_attr *init_attr, + struct ib_udata *udata) +{ + int status = -ENOMEM; + struct ocrdma_pd *pd = get_ocrdma_pd(ibpd); + struct ocrdma_dev *dev = get_ocrdma_dev(ibpd->device); + struct ocrdma_srq *srq; + + if (init_attr->attr.max_sge > dev->attr.max_recv_sge) + return ERR_PTR(-EINVAL); + if (init_attr->attr.max_wr > dev->attr.max_rqe) + return ERR_PTR(-EINVAL); + + srq = kzalloc(sizeof(*srq), GFP_KERNEL); + if (!srq) + return ERR_PTR(status); + + spin_lock_init(&srq->q_lock); + srq->pd = pd; + srq->db = dev->nic_info.db + (pd->id * dev->nic_info.db_page_size); + status = ocrdma_mbx_create_srq(dev, srq, init_attr, pd); + if (status) + goto err; + + if (udata == NULL) { + srq->rqe_wr_id_tbl = kzalloc(sizeof(u64) * srq->rq.max_cnt, + GFP_KERNEL); + if (srq->rqe_wr_id_tbl == NULL) + goto arm_err; + + srq->bit_fields_len = (srq->rq.max_cnt / 32) + + (srq->rq.max_cnt % 32 ? 
1 : 0); + srq->idx_bit_fields = + kmalloc(srq->bit_fields_len * sizeof(u32), GFP_KERNEL); + if (srq->idx_bit_fields == NULL) + goto arm_err; + memset(srq->idx_bit_fields, 0xff, + srq->bit_fields_len * sizeof(u32)); + } + + if (init_attr->attr.srq_limit) { + status = ocrdma_mbx_modify_srq(srq, &init_attr->attr); + if (status) + goto arm_err; + } + + if (udata) { + status = ocrdma_copy_srq_uresp(dev, srq, udata); + if (status) + goto arm_err; + } + + return &srq->ibsrq; + +arm_err: + ocrdma_mbx_destroy_srq(dev, srq); +err: + kfree(srq->rqe_wr_id_tbl); + kfree(srq->idx_bit_fields); + kfree(srq); + return ERR_PTR(status); +} + +int ocrdma_modify_srq(struct ib_srq *ibsrq, + struct ib_srq_attr *srq_attr, + enum ib_srq_attr_mask srq_attr_mask, + struct ib_udata *udata) +{ + int status = 0; + struct ocrdma_srq *srq; + + srq = get_ocrdma_srq(ibsrq); + if (srq_attr_mask & IB_SRQ_MAX_WR) + status = -EINVAL; + else + status = ocrdma_mbx_modify_srq(srq, srq_attr); + return status; +} + +int ocrdma_query_srq(struct ib_srq *ibsrq, struct ib_srq_attr *srq_attr) +{ + int status; + struct ocrdma_srq *srq; + + srq = get_ocrdma_srq(ibsrq); + status = ocrdma_mbx_query_srq(srq, srq_attr); + return status; +} + +int ocrdma_destroy_srq(struct ib_srq *ibsrq) +{ + int status; + struct ocrdma_srq *srq; + struct ocrdma_dev *dev = get_ocrdma_dev(ibsrq->device); + + srq = get_ocrdma_srq(ibsrq); + + status = ocrdma_mbx_destroy_srq(dev, srq); + + if (srq->pd->uctx) + ocrdma_del_mmap(srq->pd->uctx, (u64) srq->rq.pa, + PAGE_ALIGN(srq->rq.len)); + + kfree(srq->idx_bit_fields); + kfree(srq->rqe_wr_id_tbl); + kfree(srq); + return status; +} + +/* unprivileged verbs and their support functions. */ +static void ocrdma_build_ud_hdr(struct ocrdma_qp *qp, + struct ocrdma_hdr_wqe *hdr, + struct ib_send_wr *wr) +{ + struct ocrdma_ewqe_ud_hdr *ud_hdr = + (struct ocrdma_ewqe_ud_hdr *)(hdr + 1); + struct ocrdma_ah *ah = get_ocrdma_ah(wr->wr.ud.ah); + + ud_hdr->rsvd_dest_qpn = wr->wr.ud.remote_qpn; + if (qp->qp_type == IB_QPT_GSI) + ud_hdr->qkey = qp->qkey; + else + ud_hdr->qkey = wr->wr.ud.remote_qkey; + ud_hdr->rsvd_ahid = ah->id; + if (ah->av->valid & OCRDMA_AV_VLAN_VALID) + hdr->cw |= (OCRDMA_FLAG_AH_VLAN_PR << OCRDMA_WQE_FLAGS_SHIFT); +} + +static void ocrdma_build_sges(struct ocrdma_hdr_wqe *hdr, + struct ocrdma_sge *sge, int num_sge, + struct ib_sge *sg_list) +{ + int i; + + for (i = 0; i < num_sge; i++) { + sge[i].lrkey = sg_list[i].lkey; + sge[i].addr_lo = sg_list[i].addr; + sge[i].addr_hi = upper_32_bits(sg_list[i].addr); + sge[i].len = sg_list[i].length; + hdr->total_len += sg_list[i].length; + } + if (num_sge == 0) + memset(sge, 0, sizeof(*sge)); +} + +static inline uint32_t ocrdma_sglist_len(struct ib_sge *sg_list, int num_sge) +{ + uint32_t total_len = 0, i; + + for (i = 0; i < num_sge; i++) + total_len += sg_list[i].length; + return total_len; +} + + +static int ocrdma_build_inline_sges(struct ocrdma_qp *qp, + struct ocrdma_hdr_wqe *hdr, + struct ocrdma_sge *sge, + struct ib_send_wr *wr, u32 wqe_size) +{ + int i; + char *dpp_addr; + + if (wr->send_flags & IB_SEND_INLINE && qp->qp_type != IB_QPT_UD) { + hdr->total_len = ocrdma_sglist_len(wr->sg_list, wr->num_sge); + if (unlikely(hdr->total_len > qp->max_inline_data)) { + pr_err("%s() supported_len=0x%x,\n" + " unsupported len req=0x%x\n", __func__, + qp->max_inline_data, hdr->total_len); + return -EINVAL; + } + dpp_addr = (char *)sge; + for (i = 0; i < wr->num_sge; i++) { + memcpy(dpp_addr, + (void *)(unsigned long)wr->sg_list[i].addr, + wr->sg_list[i].length); + dpp_addr 
+= wr->sg_list[i].length; + } + + wqe_size += roundup(hdr->total_len, OCRDMA_WQE_ALIGN_BYTES); + if (0 == hdr->total_len) + wqe_size += sizeof(struct ocrdma_sge); + hdr->cw |= (OCRDMA_TYPE_INLINE << OCRDMA_WQE_TYPE_SHIFT); + } else { + ocrdma_build_sges(hdr, sge, wr->num_sge, wr->sg_list); + if (wr->num_sge) + wqe_size += (wr->num_sge * sizeof(struct ocrdma_sge)); + else + wqe_size += sizeof(struct ocrdma_sge); + hdr->cw |= (OCRDMA_TYPE_LKEY << OCRDMA_WQE_TYPE_SHIFT); + } + hdr->cw |= ((wqe_size / OCRDMA_WQE_STRIDE) << OCRDMA_WQE_SIZE_SHIFT); + return 0; +} + +static int ocrdma_build_send(struct ocrdma_qp *qp, struct ocrdma_hdr_wqe *hdr, + struct ib_send_wr *wr) +{ + int status; + struct ocrdma_sge *sge; + u32 wqe_size = sizeof(*hdr); + + if (qp->qp_type == IB_QPT_UD || qp->qp_type == IB_QPT_GSI) { + ocrdma_build_ud_hdr(qp, hdr, wr); + sge = (struct ocrdma_sge *)(hdr + 2); + wqe_size += sizeof(struct ocrdma_ewqe_ud_hdr); + } else { + sge = (struct ocrdma_sge *)(hdr + 1); + } + + status = ocrdma_build_inline_sges(qp, hdr, sge, wr, wqe_size); + return status; +} + +static int ocrdma_build_write(struct ocrdma_qp *qp, struct ocrdma_hdr_wqe *hdr, + struct ib_send_wr *wr) +{ + int status; + struct ocrdma_sge *ext_rw = (struct ocrdma_sge *)(hdr + 1); + struct ocrdma_sge *sge = ext_rw + 1; + u32 wqe_size = sizeof(*hdr) + sizeof(*ext_rw); + + status = ocrdma_build_inline_sges(qp, hdr, sge, wr, wqe_size); + if (status) + return status; + ext_rw->addr_lo = wr->wr.rdma.remote_addr; + ext_rw->addr_hi = upper_32_bits(wr->wr.rdma.remote_addr); + ext_rw->lrkey = wr->wr.rdma.rkey; + ext_rw->len = hdr->total_len; + return 0; +} + +static void ocrdma_build_read(struct ocrdma_qp *qp, struct ocrdma_hdr_wqe *hdr, + struct ib_send_wr *wr) +{ + struct ocrdma_sge *ext_rw = (struct ocrdma_sge *)(hdr + 1); + struct ocrdma_sge *sge = ext_rw + 1; + u32 wqe_size = ((wr->num_sge + 1) * sizeof(struct ocrdma_sge)) + + sizeof(struct ocrdma_hdr_wqe); + + ocrdma_build_sges(hdr, sge, wr->num_sge, wr->sg_list); + hdr->cw |= ((wqe_size / OCRDMA_WQE_STRIDE) << OCRDMA_WQE_SIZE_SHIFT); + hdr->cw |= (OCRDMA_READ << OCRDMA_WQE_OPCODE_SHIFT); + hdr->cw |= (OCRDMA_TYPE_LKEY << OCRDMA_WQE_TYPE_SHIFT); + + ext_rw->addr_lo = wr->wr.rdma.remote_addr; + ext_rw->addr_hi = upper_32_bits(wr->wr.rdma.remote_addr); + ext_rw->lrkey = wr->wr.rdma.rkey; + ext_rw->len = hdr->total_len; +} + +static void build_frmr_pbes(struct ib_send_wr *wr, struct ocrdma_pbl *pbl_tbl, + struct ocrdma_hw_mr *hwmr) +{ + int i; + u64 buf_addr = 0; + int num_pbes; + struct ocrdma_pbe *pbe; + + pbe = (struct ocrdma_pbe *)pbl_tbl->va; + num_pbes = 0; + + /* go through the OS phy regions & fill hw pbe entries into pbls. */ + for (i = 0; i < wr->wr.fast_reg.page_list_len; i++) { + /* number of pbes can be more for one OS buf, when + * buffers are of different sizes. + * split the ib_buf to one or more pbes. + */ + buf_addr = wr->wr.fast_reg.page_list->page_list[i]; + pbe->pa_lo = cpu_to_le32((u32) (buf_addr & PAGE_MASK)); + pbe->pa_hi = cpu_to_le32((u32) upper_32_bits(buf_addr)); + num_pbes += 1; + pbe++; + + /* if the pbl is full storing the pbes, + * move to next pbl. 
+ */ + if (num_pbes == (hwmr->pbl_size/sizeof(u64))) { + pbl_tbl++; + pbe = (struct ocrdma_pbe *)pbl_tbl->va; + } + } + return; +} + +static int get_encoded_page_size(int pg_sz) +{ + /* Max size is 256M 4096 << 16 */ + int i = 0; + for (; i < 17; i++) + if (pg_sz == (4096 << i)) + break; + return i; +} + + +static int ocrdma_build_fr(struct ocrdma_qp *qp, struct ocrdma_hdr_wqe *hdr, + struct ib_send_wr *wr) +{ + u64 fbo; + struct ocrdma_ewqe_fr *fast_reg = (struct ocrdma_ewqe_fr *)(hdr + 1); + struct ocrdma_mr *mr; + struct ocrdma_dev *dev = get_ocrdma_dev(qp->ibqp.device); + u32 wqe_size = sizeof(*fast_reg) + sizeof(*hdr); + + wqe_size = roundup(wqe_size, OCRDMA_WQE_ALIGN_BYTES); + + if (wr->wr.fast_reg.page_list_len > dev->attr.max_pages_per_frmr) + return -EINVAL; + + hdr->cw |= (OCRDMA_FR_MR << OCRDMA_WQE_OPCODE_SHIFT); + hdr->cw |= ((wqe_size / OCRDMA_WQE_STRIDE) << OCRDMA_WQE_SIZE_SHIFT); + + if (wr->wr.fast_reg.page_list_len == 0) + BUG(); + if (wr->wr.fast_reg.access_flags & IB_ACCESS_LOCAL_WRITE) + hdr->rsvd_lkey_flags |= OCRDMA_LKEY_FLAG_LOCAL_WR; + if (wr->wr.fast_reg.access_flags & IB_ACCESS_REMOTE_WRITE) + hdr->rsvd_lkey_flags |= OCRDMA_LKEY_FLAG_REMOTE_WR; + if (wr->wr.fast_reg.access_flags & IB_ACCESS_REMOTE_READ) + hdr->rsvd_lkey_flags |= OCRDMA_LKEY_FLAG_REMOTE_RD; + hdr->lkey = wr->wr.fast_reg.rkey; + hdr->total_len = wr->wr.fast_reg.length; + + fbo = wr->wr.fast_reg.iova_start - + (wr->wr.fast_reg.page_list->page_list[0] & PAGE_MASK); + + fast_reg->va_hi = upper_32_bits(wr->wr.fast_reg.iova_start); + fast_reg->va_lo = (u32) (wr->wr.fast_reg.iova_start & 0xffffffff); + fast_reg->fbo_hi = upper_32_bits(fbo); + fast_reg->fbo_lo = (u32) fbo & 0xffffffff; + fast_reg->num_sges = wr->wr.fast_reg.page_list_len; + fast_reg->size_sge = + get_encoded_page_size(1 << wr->wr.fast_reg.page_shift); + mr = (struct ocrdma_mr *) (unsigned long) + dev->stag_arr[(hdr->lkey >> 8) & (OCRDMA_MAX_STAG - 1)]; + build_frmr_pbes(wr, mr->hwmr.pbl_table, &mr->hwmr); + return 0; +} + +static void ocrdma_ring_sq_db(struct ocrdma_qp *qp) +{ + u32 val = qp->sq.dbid | (1 << OCRDMA_DB_SQ_SHIFT); + + iowrite32(val, qp->sq_db); +} + +int ocrdma_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr, + struct ib_send_wr **bad_wr) +{ + int status = 0; + struct ocrdma_qp *qp = get_ocrdma_qp(ibqp); + struct ocrdma_hdr_wqe *hdr; + unsigned long flags; + + spin_lock_irqsave(&qp->q_lock, flags); + if (qp->state != OCRDMA_QPS_RTS && qp->state != OCRDMA_QPS_SQD) { + spin_unlock_irqrestore(&qp->q_lock, flags); + *bad_wr = wr; + return -EINVAL; + } + + while (wr) { + if (qp->qp_type == IB_QPT_UD && + (wr->opcode != IB_WR_SEND && + wr->opcode != IB_WR_SEND_WITH_IMM)) { + *bad_wr = wr; + status = -EINVAL; + break; + } + if (ocrdma_hwq_free_cnt(&qp->sq) == 0 || + wr->num_sge > qp->sq.max_sges) { + *bad_wr = wr; + status = -ENOMEM; + break; + } + hdr = ocrdma_hwq_head(&qp->sq); + hdr->cw = 0; + if (wr->send_flags & IB_SEND_SIGNALED || qp->signaled) + hdr->cw |= (OCRDMA_FLAG_SIG << OCRDMA_WQE_FLAGS_SHIFT); + if (wr->send_flags & IB_SEND_FENCE) + hdr->cw |= + (OCRDMA_FLAG_FENCE_L << OCRDMA_WQE_FLAGS_SHIFT); + if (wr->send_flags & IB_SEND_SOLICITED) + hdr->cw |= + (OCRDMA_FLAG_SOLICIT << OCRDMA_WQE_FLAGS_SHIFT); + hdr->total_len = 0; + switch (wr->opcode) { + case IB_WR_SEND_WITH_IMM: + hdr->cw |= (OCRDMA_FLAG_IMM << OCRDMA_WQE_FLAGS_SHIFT); + hdr->immdt = ntohl(wr->ex.imm_data); + case IB_WR_SEND: + hdr->cw |= (OCRDMA_SEND << OCRDMA_WQE_OPCODE_SHIFT); + ocrdma_build_send(qp, hdr, wr); + break; + case IB_WR_SEND_WITH_INV: + 
hdr->cw |= (OCRDMA_FLAG_INV << OCRDMA_WQE_FLAGS_SHIFT); + hdr->cw |= (OCRDMA_SEND << OCRDMA_WQE_OPCODE_SHIFT); + hdr->lkey = wr->ex.invalidate_rkey; + status = ocrdma_build_send(qp, hdr, wr); + break; + case IB_WR_RDMA_WRITE_WITH_IMM: + hdr->cw |= (OCRDMA_FLAG_IMM << OCRDMA_WQE_FLAGS_SHIFT); + hdr->immdt = ntohl(wr->ex.imm_data); + case IB_WR_RDMA_WRITE: + hdr->cw |= (OCRDMA_WRITE << OCRDMA_WQE_OPCODE_SHIFT); + status = ocrdma_build_write(qp, hdr, wr); + break; + case IB_WR_RDMA_READ: + ocrdma_build_read(qp, hdr, wr); + break; + case IB_WR_LOCAL_INV: + hdr->cw |= + (OCRDMA_LKEY_INV << OCRDMA_WQE_OPCODE_SHIFT); + hdr->cw |= ((sizeof(struct ocrdma_hdr_wqe) + + sizeof(struct ocrdma_sge)) / + OCRDMA_WQE_STRIDE) << OCRDMA_WQE_SIZE_SHIFT; + hdr->lkey = wr->ex.invalidate_rkey; + break; + case IB_WR_FAST_REG_MR: + status = ocrdma_build_fr(qp, hdr, wr); + break; + default: + status = -EINVAL; + break; + } + if (status) { + *bad_wr = wr; + break; + } + if (wr->send_flags & IB_SEND_SIGNALED || qp->signaled) + qp->wqe_wr_id_tbl[qp->sq.head].signaled = 1; + else + qp->wqe_wr_id_tbl[qp->sq.head].signaled = 0; + qp->wqe_wr_id_tbl[qp->sq.head].wrid = wr->wr_id; + ocrdma_cpu_to_le32(hdr, ((hdr->cw >> OCRDMA_WQE_SIZE_SHIFT) & + OCRDMA_WQE_SIZE_MASK) * OCRDMA_WQE_STRIDE); + /* make sure wqe is written before adapter can access it */ + wmb(); + /* inform hw to start processing it */ + ocrdma_ring_sq_db(qp); + + /* update pointer, counter for next wr */ + ocrdma_hwq_inc_head(&qp->sq); + wr = wr->next; + } + spin_unlock_irqrestore(&qp->q_lock, flags); + return status; +} + +static void ocrdma_ring_rq_db(struct ocrdma_qp *qp) +{ + u32 val = qp->rq.dbid | (1 << OCRDMA_DB_RQ_SHIFT); + + iowrite32(val, qp->rq_db); +} + +static void ocrdma_build_rqe(struct ocrdma_hdr_wqe *rqe, struct ib_recv_wr *wr, + u16 tag) +{ + u32 wqe_size = 0; + struct ocrdma_sge *sge; + if (wr->num_sge) + wqe_size = (wr->num_sge * sizeof(*sge)) + sizeof(*rqe); + else + wqe_size = sizeof(*sge) + sizeof(*rqe); + + rqe->cw = ((wqe_size / OCRDMA_WQE_STRIDE) << + OCRDMA_WQE_SIZE_SHIFT); + rqe->cw |= (OCRDMA_FLAG_SIG << OCRDMA_WQE_FLAGS_SHIFT); + rqe->cw |= (OCRDMA_TYPE_LKEY << OCRDMA_WQE_TYPE_SHIFT); + rqe->total_len = 0; + rqe->rsvd_tag = tag; + sge = (struct ocrdma_sge *)(rqe + 1); + ocrdma_build_sges(rqe, sge, wr->num_sge, wr->sg_list); + ocrdma_cpu_to_le32(rqe, wqe_size); +} + +int ocrdma_post_recv(struct ib_qp *ibqp, struct ib_recv_wr *wr, + struct ib_recv_wr **bad_wr) +{ + int status = 0; + unsigned long flags; + struct ocrdma_qp *qp = get_ocrdma_qp(ibqp); + struct ocrdma_hdr_wqe *rqe; + + spin_lock_irqsave(&qp->q_lock, flags); + if (qp->state == OCRDMA_QPS_RST || qp->state == OCRDMA_QPS_ERR) { + spin_unlock_irqrestore(&qp->q_lock, flags); + *bad_wr = wr; + return -EINVAL; + } + while (wr) { + if (ocrdma_hwq_free_cnt(&qp->rq) == 0 || + wr->num_sge > qp->rq.max_sges) { + *bad_wr = wr; + status = -ENOMEM; + break; + } + rqe = ocrdma_hwq_head(&qp->rq); + ocrdma_build_rqe(rqe, wr, 0); + + qp->rqe_wr_id_tbl[qp->rq.head] = wr->wr_id; + /* make sure rqe is written before adapter can access it */ + wmb(); + + /* inform hw to start processing it */ + ocrdma_ring_rq_db(qp); + + /* update pointer, counter for next wr */ + ocrdma_hwq_inc_head(&qp->rq); + wr = wr->next; + } + spin_unlock_irqrestore(&qp->q_lock, flags); + return status; +} + +/* cqe for srq's rqe can potentially arrive out of order. + * index gives the entry in the shadow table where to store + * the wr_id. tag/index is returned in cqe to reference back + * for a given rqe. 
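+ * A set bit in srq->idx_bit_fields marks a free shadow-table slot;
+ * ocrdma_srq_get_idx() claims the first free slot and returns
+ * index + 1, so a tag of 0 never refers to a valid entry.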
+ */ +static int ocrdma_srq_get_idx(struct ocrdma_srq *srq) +{ + int row = 0; + int indx = 0; + + for (row = 0; row < srq->bit_fields_len; row++) { + if (srq->idx_bit_fields[row]) { + indx = ffs(srq->idx_bit_fields[row]); + indx = (row * 32) + (indx - 1); + if (indx >= srq->rq.max_cnt) + BUG(); + ocrdma_srq_toggle_bit(srq, indx); + break; + } + } + + if (row == srq->bit_fields_len) + BUG(); + return indx + 1; /* Use from index 1 */ +} + +static void ocrdma_ring_srq_db(struct ocrdma_srq *srq) +{ + u32 val = srq->rq.dbid | (1 << 16); + + iowrite32(val, srq->db + OCRDMA_DB_GEN2_SRQ_OFFSET); +} + +int ocrdma_post_srq_recv(struct ib_srq *ibsrq, struct ib_recv_wr *wr, + struct ib_recv_wr **bad_wr) +{ + int status = 0; + unsigned long flags; + struct ocrdma_srq *srq; + struct ocrdma_hdr_wqe *rqe; + u16 tag; + + srq = get_ocrdma_srq(ibsrq); + + spin_lock_irqsave(&srq->q_lock, flags); + while (wr) { + if (ocrdma_hwq_free_cnt(&srq->rq) == 0 || + wr->num_sge > srq->rq.max_sges) { + status = -ENOMEM; + *bad_wr = wr; + break; + } + tag = ocrdma_srq_get_idx(srq); + rqe = ocrdma_hwq_head(&srq->rq); + ocrdma_build_rqe(rqe, wr, tag); + + srq->rqe_wr_id_tbl[tag] = wr->wr_id; + /* make sure rqe is written before adapter can perform DMA */ + wmb(); + /* inform hw to start processing it */ + ocrdma_ring_srq_db(srq); + /* update pointer, counter for next wr */ + ocrdma_hwq_inc_head(&srq->rq); + wr = wr->next; + } + spin_unlock_irqrestore(&srq->q_lock, flags); + return status; +} + +static enum ib_wc_status ocrdma_to_ibwc_err(u16 status) +{ + enum ib_wc_status ibwc_status; + + switch (status) { + case OCRDMA_CQE_GENERAL_ERR: + ibwc_status = IB_WC_GENERAL_ERR; + break; + case OCRDMA_CQE_LOC_LEN_ERR: + ibwc_status = IB_WC_LOC_LEN_ERR; + break; + case OCRDMA_CQE_LOC_QP_OP_ERR: + ibwc_status = IB_WC_LOC_QP_OP_ERR; + break; + case OCRDMA_CQE_LOC_EEC_OP_ERR: + ibwc_status = IB_WC_LOC_EEC_OP_ERR; + break; + case OCRDMA_CQE_LOC_PROT_ERR: + ibwc_status = IB_WC_LOC_PROT_ERR; + break; + case OCRDMA_CQE_WR_FLUSH_ERR: + ibwc_status = IB_WC_WR_FLUSH_ERR; + break; + case OCRDMA_CQE_MW_BIND_ERR: + ibwc_status = IB_WC_MW_BIND_ERR; + break; + case OCRDMA_CQE_BAD_RESP_ERR: + ibwc_status = IB_WC_BAD_RESP_ERR; + break; + case OCRDMA_CQE_LOC_ACCESS_ERR: + ibwc_status = IB_WC_LOC_ACCESS_ERR; + break; + case OCRDMA_CQE_REM_INV_REQ_ERR: + ibwc_status = IB_WC_REM_INV_REQ_ERR; + break; + case OCRDMA_CQE_REM_ACCESS_ERR: + ibwc_status = IB_WC_REM_ACCESS_ERR; + break; + case OCRDMA_CQE_REM_OP_ERR: + ibwc_status = IB_WC_REM_OP_ERR; + break; + case OCRDMA_CQE_RETRY_EXC_ERR: + ibwc_status = IB_WC_RETRY_EXC_ERR; + break; + case OCRDMA_CQE_RNR_RETRY_EXC_ERR: + ibwc_status = IB_WC_RNR_RETRY_EXC_ERR; + break; + case OCRDMA_CQE_LOC_RDD_VIOL_ERR: + ibwc_status = IB_WC_LOC_RDD_VIOL_ERR; + break; + case OCRDMA_CQE_REM_INV_RD_REQ_ERR: + ibwc_status = IB_WC_REM_INV_RD_REQ_ERR; + break; + case OCRDMA_CQE_REM_ABORT_ERR: + ibwc_status = IB_WC_REM_ABORT_ERR; + break; + case OCRDMA_CQE_INV_EECN_ERR: + ibwc_status = IB_WC_INV_EECN_ERR; + break; + case OCRDMA_CQE_INV_EEC_STATE_ERR: + ibwc_status = IB_WC_INV_EEC_STATE_ERR; + break; + case OCRDMA_CQE_FATAL_ERR: + ibwc_status = IB_WC_FATAL_ERR; + break; + case OCRDMA_CQE_RESP_TIMEOUT_ERR: + ibwc_status = IB_WC_RESP_TIMEOUT_ERR; + break; + default: + ibwc_status = IB_WC_GENERAL_ERR; + break; + } + return ibwc_status; +} + +static void ocrdma_update_wc(struct ocrdma_qp *qp, struct ib_wc *ibwc, + u32 wqe_idx) +{ + struct ocrdma_hdr_wqe *hdr; + struct ocrdma_sge *rw; + int opcode; + + hdr = 
ocrdma_hwq_head_from_idx(&qp->sq, wqe_idx); + + ibwc->wr_id = qp->wqe_wr_id_tbl[wqe_idx].wrid; + /* Undo the hdr->cw swap */ + opcode = le32_to_cpu(hdr->cw) & OCRDMA_WQE_OPCODE_MASK; + switch (opcode) { + case OCRDMA_WRITE: + ibwc->opcode = IB_WC_RDMA_WRITE; + break; + case OCRDMA_READ: + rw = (struct ocrdma_sge *)(hdr + 1); + ibwc->opcode = IB_WC_RDMA_READ; + ibwc->byte_len = rw->len; + break; + case OCRDMA_SEND: + ibwc->opcode = IB_WC_SEND; + break; + case OCRDMA_FR_MR: + ibwc->opcode = IB_WC_FAST_REG_MR; + break; + case OCRDMA_LKEY_INV: + ibwc->opcode = IB_WC_LOCAL_INV; + break; + default: + ibwc->status = IB_WC_GENERAL_ERR; + pr_err("%s() invalid opcode received = 0x%x\n", + __func__, hdr->cw & OCRDMA_WQE_OPCODE_MASK); + break; + } +} + +static void ocrdma_set_cqe_status_flushed(struct ocrdma_qp *qp, + struct ocrdma_cqe *cqe) +{ + if (is_cqe_for_sq(cqe)) { + cqe->flags_status_srcqpn = cpu_to_le32(le32_to_cpu( + cqe->flags_status_srcqpn) & + ~OCRDMA_CQE_STATUS_MASK); + cqe->flags_status_srcqpn = cpu_to_le32(le32_to_cpu( + cqe->flags_status_srcqpn) | + (OCRDMA_CQE_WR_FLUSH_ERR << + OCRDMA_CQE_STATUS_SHIFT)); + } else { + if (qp->qp_type == IB_QPT_UD || qp->qp_type == IB_QPT_GSI) { + cqe->flags_status_srcqpn = cpu_to_le32(le32_to_cpu( + cqe->flags_status_srcqpn) & + ~OCRDMA_CQE_UD_STATUS_MASK); + cqe->flags_status_srcqpn = cpu_to_le32(le32_to_cpu( + cqe->flags_status_srcqpn) | + (OCRDMA_CQE_WR_FLUSH_ERR << + OCRDMA_CQE_UD_STATUS_SHIFT)); + } else { + cqe->flags_status_srcqpn = cpu_to_le32(le32_to_cpu( + cqe->flags_status_srcqpn) & + ~OCRDMA_CQE_STATUS_MASK); + cqe->flags_status_srcqpn = cpu_to_le32(le32_to_cpu( + cqe->flags_status_srcqpn) | + (OCRDMA_CQE_WR_FLUSH_ERR << + OCRDMA_CQE_STATUS_SHIFT)); + } + } +} + +static bool ocrdma_update_err_cqe(struct ib_wc *ibwc, struct ocrdma_cqe *cqe, + struct ocrdma_qp *qp, int status) +{ + bool expand = false; + + ibwc->byte_len = 0; + ibwc->qp = &qp->ibqp; + ibwc->status = ocrdma_to_ibwc_err(status); + + ocrdma_flush_qp(qp); + ocrdma_qp_state_change(qp, IB_QPS_ERR, NULL); + + /* if wqe/rqe pending for which cqe needs to be returned, + * trigger inflating it. + */ + if (!is_hw_rq_empty(qp) || !is_hw_sq_empty(qp)) { + expand = true; + ocrdma_set_cqe_status_flushed(qp, cqe); + } + return expand; +} + +static int ocrdma_update_err_rcqe(struct ib_wc *ibwc, struct ocrdma_cqe *cqe, + struct ocrdma_qp *qp, int status) +{ + ibwc->opcode = IB_WC_RECV; + ibwc->wr_id = qp->rqe_wr_id_tbl[qp->rq.tail]; + ocrdma_hwq_inc_tail(&qp->rq); + + return ocrdma_update_err_cqe(ibwc, cqe, qp, status); +} + +static int ocrdma_update_err_scqe(struct ib_wc *ibwc, struct ocrdma_cqe *cqe, + struct ocrdma_qp *qp, int status) +{ + ocrdma_update_wc(qp, ibwc, qp->sq.tail); + ocrdma_hwq_inc_tail(&qp->sq); + + return ocrdma_update_err_cqe(ibwc, cqe, qp, status); +} + + +static bool ocrdma_poll_err_scqe(struct ocrdma_qp *qp, + struct ocrdma_cqe *cqe, struct ib_wc *ibwc, + bool *polled, bool *stop) +{ + bool expand; + struct ocrdma_dev *dev = get_ocrdma_dev(qp->ibqp.device); + int status = (le32_to_cpu(cqe->flags_status_srcqpn) & + OCRDMA_CQE_STATUS_MASK) >> OCRDMA_CQE_STATUS_SHIFT; + if (status < OCRDMA_MAX_CQE_ERR) + atomic_inc(&dev->cqe_err_stats[status]); + + /* when hw sq is empty, but rq is not empty, so we continue + * to keep the cqe in order to get the cq event again. + */ + if (is_hw_sq_empty(qp) && !is_hw_rq_empty(qp)) { + /* when cq for rq and sq is same, it is safe to return + * flush cqe for RQEs. 
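+		 * Otherwise (an SRQ is attached, or the RQ completes on a
+		 * different CQ) processing stops and the cqe is left in
+		 * place to wake the buddy CQ; see the else branch below.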
+ */ + if (!qp->srq && (qp->sq_cq == qp->rq_cq)) { + *polled = true; + status = OCRDMA_CQE_WR_FLUSH_ERR; + expand = ocrdma_update_err_rcqe(ibwc, cqe, qp, status); + } else { + /* stop processing further cqe as this cqe is used for + * triggering cq event on buddy cq of RQ. + * When QP is destroyed, this cqe will be removed + * from the cq's hardware q. + */ + *polled = false; + *stop = true; + expand = false; + } + } else if (is_hw_sq_empty(qp)) { + /* Do nothing */ + expand = false; + *polled = false; + *stop = false; + } else { + *polled = true; + expand = ocrdma_update_err_scqe(ibwc, cqe, qp, status); + } + return expand; +} + +static bool ocrdma_poll_success_scqe(struct ocrdma_qp *qp, + struct ocrdma_cqe *cqe, + struct ib_wc *ibwc, bool *polled) +{ + bool expand = false; + int tail = qp->sq.tail; + u32 wqe_idx; + + if (!qp->wqe_wr_id_tbl[tail].signaled) { + *polled = false; /* WC cannot be consumed yet */ + } else { + ibwc->status = IB_WC_SUCCESS; + ibwc->wc_flags = 0; + ibwc->qp = &qp->ibqp; + ocrdma_update_wc(qp, ibwc, tail); + *polled = true; + } + wqe_idx = (le32_to_cpu(cqe->wq.wqeidx) & + OCRDMA_CQE_WQEIDX_MASK) & qp->sq.max_wqe_idx; + if (tail != wqe_idx) + expand = true; /* Coalesced CQE can't be consumed yet */ + + ocrdma_hwq_inc_tail(&qp->sq); + return expand; +} + +static bool ocrdma_poll_scqe(struct ocrdma_qp *qp, struct ocrdma_cqe *cqe, + struct ib_wc *ibwc, bool *polled, bool *stop) +{ + int status; + bool expand; + + status = (le32_to_cpu(cqe->flags_status_srcqpn) & + OCRDMA_CQE_STATUS_MASK) >> OCRDMA_CQE_STATUS_SHIFT; + + if (status == OCRDMA_CQE_SUCCESS) + expand = ocrdma_poll_success_scqe(qp, cqe, ibwc, polled); + else + expand = ocrdma_poll_err_scqe(qp, cqe, ibwc, polled, stop); + return expand; +} + +static int ocrdma_update_ud_rcqe(struct ib_wc *ibwc, struct ocrdma_cqe *cqe) +{ + int status; + + status = (le32_to_cpu(cqe->flags_status_srcqpn) & + OCRDMA_CQE_UD_STATUS_MASK) >> OCRDMA_CQE_UD_STATUS_SHIFT; + ibwc->src_qp = le32_to_cpu(cqe->flags_status_srcqpn) & + OCRDMA_CQE_SRCQP_MASK; + ibwc->pkey_index = le32_to_cpu(cqe->ud.rxlen_pkey) & + OCRDMA_CQE_PKEY_MASK; + ibwc->wc_flags = IB_WC_GRH; + ibwc->byte_len = (le32_to_cpu(cqe->ud.rxlen_pkey) >> + OCRDMA_CQE_UD_XFER_LEN_SHIFT); + return status; +} + +static void ocrdma_update_free_srq_cqe(struct ib_wc *ibwc, + struct ocrdma_cqe *cqe, + struct ocrdma_qp *qp) +{ + unsigned long flags; + struct ocrdma_srq *srq; + u32 wqe_idx; + + srq = get_ocrdma_srq(qp->ibqp.srq); + wqe_idx = (le32_to_cpu(cqe->rq.buftag_qpn) >> + OCRDMA_CQE_BUFTAG_SHIFT) & srq->rq.max_wqe_idx; + if (wqe_idx < 1) + BUG(); + + ibwc->wr_id = srq->rqe_wr_id_tbl[wqe_idx]; + spin_lock_irqsave(&srq->q_lock, flags); + ocrdma_srq_toggle_bit(srq, wqe_idx - 1); + spin_unlock_irqrestore(&srq->q_lock, flags); + ocrdma_hwq_inc_tail(&srq->rq); +} + +static bool ocrdma_poll_err_rcqe(struct ocrdma_qp *qp, struct ocrdma_cqe *cqe, + struct ib_wc *ibwc, bool *polled, bool *stop, + int status) +{ + bool expand; + struct ocrdma_dev *dev = get_ocrdma_dev(qp->ibqp.device); + + if (status < OCRDMA_MAX_CQE_ERR) + atomic_inc(&dev->cqe_err_stats[status]); + + /* when hw_rq is empty, but wq is not empty, so continue + * to keep the cqe to get the cq event again. 
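+	 * (This mirrors ocrdma_poll_err_scqe() above, with the roles of
+	 * the send and receive queues swapped.)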
+ */ + if (is_hw_rq_empty(qp) && !is_hw_sq_empty(qp)) { + if (!qp->srq && (qp->sq_cq == qp->rq_cq)) { + *polled = true; + status = OCRDMA_CQE_WR_FLUSH_ERR; + expand = ocrdma_update_err_scqe(ibwc, cqe, qp, status); + } else { + *polled = false; + *stop = true; + expand = false; + } + } else if (is_hw_rq_empty(qp)) { + /* Do nothing */ + expand = false; + *polled = false; + *stop = false; + } else { + *polled = true; + expand = ocrdma_update_err_rcqe(ibwc, cqe, qp, status); + } + return expand; +} + +static void ocrdma_poll_success_rcqe(struct ocrdma_qp *qp, + struct ocrdma_cqe *cqe, struct ib_wc *ibwc) +{ + ibwc->opcode = IB_WC_RECV; + ibwc->qp = &qp->ibqp; + ibwc->status = IB_WC_SUCCESS; + + if (qp->qp_type == IB_QPT_UD || qp->qp_type == IB_QPT_GSI) + ocrdma_update_ud_rcqe(ibwc, cqe); + else + ibwc->byte_len = le32_to_cpu(cqe->rq.rxlen); + + if (is_cqe_imm(cqe)) { + ibwc->ex.imm_data = htonl(le32_to_cpu(cqe->rq.lkey_immdt)); + ibwc->wc_flags |= IB_WC_WITH_IMM; + } else if (is_cqe_wr_imm(cqe)) { + ibwc->opcode = IB_WC_RECV_RDMA_WITH_IMM; + ibwc->ex.imm_data = htonl(le32_to_cpu(cqe->rq.lkey_immdt)); + ibwc->wc_flags |= IB_WC_WITH_IMM; + } else if (is_cqe_invalidated(cqe)) { + ibwc->ex.invalidate_rkey = le32_to_cpu(cqe->rq.lkey_immdt); + ibwc->wc_flags |= IB_WC_WITH_INVALIDATE; + } + if (qp->ibqp.srq) { + ocrdma_update_free_srq_cqe(ibwc, cqe, qp); + } else { + ibwc->wr_id = qp->rqe_wr_id_tbl[qp->rq.tail]; + ocrdma_hwq_inc_tail(&qp->rq); + } +} + +static bool ocrdma_poll_rcqe(struct ocrdma_qp *qp, struct ocrdma_cqe *cqe, + struct ib_wc *ibwc, bool *polled, bool *stop) +{ + int status; + bool expand = false; + + ibwc->wc_flags = 0; + if (qp->qp_type == IB_QPT_UD || qp->qp_type == IB_QPT_GSI) { + status = (le32_to_cpu(cqe->flags_status_srcqpn) & + OCRDMA_CQE_UD_STATUS_MASK) >> + OCRDMA_CQE_UD_STATUS_SHIFT; + } else { + status = (le32_to_cpu(cqe->flags_status_srcqpn) & + OCRDMA_CQE_STATUS_MASK) >> OCRDMA_CQE_STATUS_SHIFT; + } + + if (status == OCRDMA_CQE_SUCCESS) { + *polled = true; + ocrdma_poll_success_rcqe(qp, cqe, ibwc); + } else { + expand = ocrdma_poll_err_rcqe(qp, cqe, ibwc, polled, stop, + status); + } + return expand; +} + +static void ocrdma_change_cq_phase(struct ocrdma_cq *cq, struct ocrdma_cqe *cqe, + u16 cur_getp) +{ + if (cq->phase_change) { + if (cur_getp == 0) + cq->phase = (~cq->phase & OCRDMA_CQE_VALID); + } else { + /* clear valid bit */ + cqe->flags_status_srcqpn = 0; + } +} + +static int ocrdma_poll_hwcq(struct ocrdma_cq *cq, int num_entries, + struct ib_wc *ibwc) +{ + u16 qpn = 0; + int i = 0; + bool expand = false; + int polled_hw_cqes = 0; + struct ocrdma_qp *qp = NULL; + struct ocrdma_dev *dev = get_ocrdma_dev(cq->ibcq.device); + struct ocrdma_cqe *cqe; + u16 cur_getp; bool polled = false; bool stop = false; + + cur_getp = cq->getp; + while (num_entries) { + cqe = cq->va + cur_getp; + /* check whether valid cqe or not */ + if (!is_cqe_valid(cq, cqe)) + break; + qpn = (le32_to_cpu(cqe->cmn.qpn) & OCRDMA_CQE_QPN_MASK); + /* ignore discarded cqe */ + if (qpn == 0) + goto skip_cqe; + qp = dev->qp_tbl[qpn]; + BUG_ON(qp == NULL); + + if (is_cqe_for_sq(cqe)) { + expand = ocrdma_poll_scqe(qp, cqe, ibwc, &polled, + &stop); + } else { + expand = ocrdma_poll_rcqe(qp, cqe, ibwc, &polled, + &stop); + } + if (expand) + goto expand_cqe; + if (stop) + goto stop_cqe; + /* clear qpn to avoid duplicate processing by discard_cqe() */ + cqe->cmn.qpn = 0; +skip_cqe: + polled_hw_cqes += 1; + cur_getp = (cur_getp + 1) % cq->max_hw_cqe; + ocrdma_change_cq_phase(cq, cqe, cur_getp); +expand_cqe: + 
if (polled) { + num_entries -= 1; + i += 1; + ibwc = ibwc + 1; + polled = false; + } + } +stop_cqe: + cq->getp = cur_getp; + if (cq->deferred_arm) { + ocrdma_ring_cq_db(dev, cq->id, true, cq->deferred_sol, + polled_hw_cqes); + cq->deferred_arm = false; + cq->deferred_sol = false; + } else { + /* We need to pop the CQE. No need to arm */ + ocrdma_ring_cq_db(dev, cq->id, false, cq->deferred_sol, + polled_hw_cqes); + cq->deferred_sol = false; + } + + return i; +} + +/* insert error cqe if the QP's SQ or RQ's CQ matches the CQ under poll. */ +static int ocrdma_add_err_cqe(struct ocrdma_cq *cq, int num_entries, + struct ocrdma_qp *qp, struct ib_wc *ibwc) +{ + int err_cqes = 0; + + while (num_entries) { + if (is_hw_sq_empty(qp) && is_hw_rq_empty(qp)) + break; + if (!is_hw_sq_empty(qp) && qp->sq_cq == cq) { + ocrdma_update_wc(qp, ibwc, qp->sq.tail); + ocrdma_hwq_inc_tail(&qp->sq); + } else if (!is_hw_rq_empty(qp) && qp->rq_cq == cq) { + ibwc->wr_id = qp->rqe_wr_id_tbl[qp->rq.tail]; + ocrdma_hwq_inc_tail(&qp->rq); + } else { + return err_cqes; + } + ibwc->byte_len = 0; + ibwc->status = IB_WC_WR_FLUSH_ERR; + ibwc = ibwc + 1; + err_cqes += 1; + num_entries -= 1; + } + return err_cqes; +} + +int ocrdma_poll_cq(struct ib_cq *ibcq, int num_entries, struct ib_wc *wc) +{ + int cqes_to_poll = num_entries; + struct ocrdma_cq *cq = get_ocrdma_cq(ibcq); + struct ocrdma_dev *dev = get_ocrdma_dev(ibcq->device); + int num_os_cqe = 0, err_cqes = 0; + struct ocrdma_qp *qp; + unsigned long flags; + + /* poll cqes from adapter CQ */ + spin_lock_irqsave(&cq->cq_lock, flags); + num_os_cqe = ocrdma_poll_hwcq(cq, cqes_to_poll, wc); + spin_unlock_irqrestore(&cq->cq_lock, flags); + cqes_to_poll -= num_os_cqe; + + if (cqes_to_poll) { + wc = wc + num_os_cqe; + /* adapter returns single error cqe when qp moves to + * error state. So insert error cqes with wc_status as + * FLUSHED for pending WQEs and RQEs of QP's SQ and RQ + * respectively which uses this CQ. 
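+		 * The loop below calls ocrdma_add_err_cqe() for each QP on
+		 * this CQ's flush list to synthesize those completions.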
+ */ + spin_lock_irqsave(&dev->flush_q_lock, flags); + list_for_each_entry(qp, &cq->sq_head, sq_entry) { + if (cqes_to_poll == 0) + break; + err_cqes = ocrdma_add_err_cqe(cq, cqes_to_poll, qp, wc); + cqes_to_poll -= err_cqes; + num_os_cqe += err_cqes; + wc = wc + err_cqes; + } + spin_unlock_irqrestore(&dev->flush_q_lock, flags); + } + return num_os_cqe; +} + +int ocrdma_arm_cq(struct ib_cq *ibcq, enum ib_cq_notify_flags cq_flags) +{ + struct ocrdma_cq *cq = get_ocrdma_cq(ibcq); + struct ocrdma_dev *dev = get_ocrdma_dev(ibcq->device); + u16 cq_id; + unsigned long flags; + bool arm_needed = false, sol_needed = false; + + cq_id = cq->id; + + spin_lock_irqsave(&cq->cq_lock, flags); + if (cq_flags & IB_CQ_NEXT_COMP || cq_flags & IB_CQ_SOLICITED) + arm_needed = true; + if (cq_flags & IB_CQ_SOLICITED) + sol_needed = true; + + if (cq->first_arm) { + ocrdma_ring_cq_db(dev, cq_id, arm_needed, sol_needed, 0); + cq->first_arm = false; + } + + cq->deferred_arm = true; + cq->deferred_sol = sol_needed; + spin_unlock_irqrestore(&cq->cq_lock, flags); + + return 0; +} + +struct ib_mr *ocrdma_alloc_frmr(struct ib_pd *ibpd, int max_page_list_len) +{ + int status; + struct ocrdma_mr *mr; + struct ocrdma_pd *pd = get_ocrdma_pd(ibpd); + struct ocrdma_dev *dev = get_ocrdma_dev(ibpd->device); + + if (max_page_list_len > dev->attr.max_pages_per_frmr) + return ERR_PTR(-EINVAL); + + mr = kzalloc(sizeof(*mr), GFP_KERNEL); + if (!mr) + return ERR_PTR(-ENOMEM); + + status = ocrdma_get_pbl_info(dev, mr, max_page_list_len); + if (status) + goto pbl_err; + mr->hwmr.fr_mr = 1; + mr->hwmr.remote_rd = 0; + mr->hwmr.remote_wr = 0; + mr->hwmr.local_rd = 0; + mr->hwmr.local_wr = 0; + mr->hwmr.mw_bind = 0; + status = ocrdma_build_pbl_tbl(dev, &mr->hwmr); + if (status) + goto pbl_err; + status = ocrdma_reg_mr(dev, &mr->hwmr, pd->id, 0); + if (status) + goto mbx_err; + mr->ibmr.rkey = mr->hwmr.lkey; + mr->ibmr.lkey = mr->hwmr.lkey; + dev->stag_arr[(mr->hwmr.lkey >> 8) & (OCRDMA_MAX_STAG - 1)] = + (unsigned long) mr; + return &mr->ibmr; +mbx_err: + ocrdma_free_mr_pbl_tbl(dev, &mr->hwmr); +pbl_err: + kfree(mr); + return ERR_PTR(-ENOMEM); +} + +struct ib_fast_reg_page_list *ocrdma_alloc_frmr_page_list(struct ib_device + *ibdev, + int page_list_len) +{ + struct ib_fast_reg_page_list *frmr_list; + int size; + + size = sizeof(*frmr_list) + (page_list_len * sizeof(u64)); + frmr_list = kzalloc(size, GFP_KERNEL); + if (!frmr_list) + return ERR_PTR(-ENOMEM); + frmr_list->page_list = (u64 *)(frmr_list + 1); + return frmr_list; +} + +void ocrdma_free_frmr_page_list(struct ib_fast_reg_page_list *page_list) +{ + kfree(page_list); +} + +#define MAX_KERNEL_PBE_SIZE 65536 +static inline int count_kernel_pbes(struct ib_phys_buf *buf_list, + int buf_cnt, u32 *pbe_size) +{ + u64 total_size = 0; + u64 buf_size = 0; + int i; + *pbe_size = roundup(buf_list[0].size, PAGE_SIZE); + *pbe_size = roundup_pow_of_two(*pbe_size); + + /* find the smallest PBE size that we can have */ + for (i = 0; i < buf_cnt; i++) { + /* first addr may not be page aligned, so ignore checking */ + if ((i != 0) && ((buf_list[i].addr & ~PAGE_MASK) || + (buf_list[i].size & ~PAGE_MASK))) { + return 0; + } + + /* if configured PBE size is greater then the chosen one, + * reduce the PBE size. + */ + buf_size = roundup(buf_list[i].size, PAGE_SIZE); + /* pbe_size has to be even multiple of 4K 1,2,4,8...*/ + buf_size = roundup_pow_of_two(buf_size); + if (*pbe_size > buf_size) + *pbe_size = buf_size; + + total_size += buf_size; + } + *pbe_size = *pbe_size > MAX_KERNEL_PBE_SIZE ? 
+ (MAX_KERNEL_PBE_SIZE) : (*pbe_size); + + /* num_pbes = total_size / (*pbe_size); this is implemented below. */ + + return total_size >> ilog2(*pbe_size); +} + +static void build_kernel_pbes(struct ib_phys_buf *buf_list, int ib_buf_cnt, + u32 pbe_size, struct ocrdma_pbl *pbl_tbl, + struct ocrdma_hw_mr *hwmr) +{ + int i; + int idx; + int pbes_per_buf = 0; + u64 buf_addr = 0; + int num_pbes; + struct ocrdma_pbe *pbe; + int total_num_pbes = 0; + + if (!hwmr->num_pbes) + return; + + pbe = (struct ocrdma_pbe *)pbl_tbl->va; + num_pbes = 0; + + /* go through the OS phy regions & fill hw pbe entries into pbls. */ + for (i = 0; i < ib_buf_cnt; i++) { + buf_addr = buf_list[i].addr; + pbes_per_buf = + roundup_pow_of_two(roundup(buf_list[i].size, PAGE_SIZE)) / + pbe_size; + hwmr->len += buf_list[i].size; + /* number of pbes can be more for one OS buf, when + * buffers are of different sizes. + * split the ib_buf to one or more pbes. + */ + for (idx = 0; idx < pbes_per_buf; idx++) { + /* we program always page aligned addresses, + * first unaligned address is taken care by fbo. + */ + if (i == 0) { + /* for non zero fbo, assign the + * start of the page. + */ + pbe->pa_lo = + cpu_to_le32((u32) (buf_addr & PAGE_MASK)); + pbe->pa_hi = + cpu_to_le32((u32) upper_32_bits(buf_addr)); + } else { + pbe->pa_lo = + cpu_to_le32((u32) (buf_addr & 0xffffffff)); + pbe->pa_hi = + cpu_to_le32((u32) upper_32_bits(buf_addr)); + } + buf_addr += pbe_size; + num_pbes += 1; + total_num_pbes += 1; + pbe++; + + if (total_num_pbes == hwmr->num_pbes) + goto mr_tbl_done; + /* if the pbl is full storing the pbes, + * move to next pbl. + */ + if (num_pbes == (hwmr->pbl_size/sizeof(u64))) { + pbl_tbl++; + pbe = (struct ocrdma_pbe *)pbl_tbl->va; + num_pbes = 0; + } + } + } +mr_tbl_done: + return; +} + +struct ib_mr *ocrdma_reg_kernel_mr(struct ib_pd *ibpd, + struct ib_phys_buf *buf_list, + int buf_cnt, int acc, u64 *iova_start) +{ + int status = -ENOMEM; + struct ocrdma_mr *mr; + struct ocrdma_pd *pd = get_ocrdma_pd(ibpd); + struct ocrdma_dev *dev = get_ocrdma_dev(ibpd->device); + u32 num_pbes; + u32 pbe_size = 0; + + if ((acc & IB_ACCESS_REMOTE_WRITE) && !(acc & IB_ACCESS_LOCAL_WRITE)) + return ERR_PTR(-EINVAL); + + mr = kzalloc(sizeof(*mr), GFP_KERNEL); + if (!mr) + return ERR_PTR(status); + + num_pbes = count_kernel_pbes(buf_list, buf_cnt, &pbe_size); + if (num_pbes == 0) { + status = -EINVAL; + goto pbl_err; + } + status = ocrdma_get_pbl_info(dev, mr, num_pbes); + if (status) + goto pbl_err; + + mr->hwmr.pbe_size = pbe_size; + mr->hwmr.fbo = *iova_start - (buf_list[0].addr & PAGE_MASK); + mr->hwmr.va = *iova_start; + mr->hwmr.local_rd = 1; + mr->hwmr.remote_wr = (acc & IB_ACCESS_REMOTE_WRITE) ? 1 : 0; + mr->hwmr.remote_rd = (acc & IB_ACCESS_REMOTE_READ) ? 1 : 0; + mr->hwmr.local_wr = (acc & IB_ACCESS_LOCAL_WRITE) ? 1 : 0; + mr->hwmr.remote_atomic = (acc & IB_ACCESS_REMOTE_ATOMIC) ? 1 : 0; + mr->hwmr.mw_bind = (acc & IB_ACCESS_MW_BIND) ? 
1 : 0; + + status = ocrdma_build_pbl_tbl(dev, &mr->hwmr); + if (status) + goto pbl_err; + build_kernel_pbes(buf_list, buf_cnt, pbe_size, mr->hwmr.pbl_table, + &mr->hwmr); + status = ocrdma_reg_mr(dev, &mr->hwmr, pd->id, acc); + if (status) + goto mbx_err; + + mr->ibmr.lkey = mr->hwmr.lkey; + if (mr->hwmr.remote_wr || mr->hwmr.remote_rd) + mr->ibmr.rkey = mr->hwmr.lkey; + return &mr->ibmr; + +mbx_err: + ocrdma_free_mr_pbl_tbl(dev, &mr->hwmr); +pbl_err: + kfree(mr); + return ERR_PTR(status); +} diff --git a/kernel/drivers/infiniband/hw/ocrdma/ocrdma_verbs.h b/kernel/drivers/infiniband/hw/ocrdma/ocrdma_verbs.h new file mode 100644 index 000000000..b8f7853fd --- /dev/null +++ b/kernel/drivers/infiniband/hw/ocrdma/ocrdma_verbs.h @@ -0,0 +1,99 @@ +/******************************************************************* + * This file is part of the Emulex RoCE Device Driver for * + * RoCE (RDMA over Converged Ethernet) adapters. * + * Copyright (C) 2008-2012 Emulex. All rights reserved. * + * EMULEX and SLI are trademarks of Emulex. * + * www.emulex.com * + * * + * This program is free software; you can redistribute it and/or * + * modify it under the terms of version 2 of the GNU General * + * Public License as published by the Free Software Foundation. * + * This program is distributed in the hope that it will be useful. * + * ALL EXPRESS OR IMPLIED CONDITIONS, REPRESENTATIONS AND * + * WARRANTIES, INCLUDING ANY IMPLIED WARRANTY OF MERCHANTABILITY, * + * FITNESS FOR A PARTICULAR PURPOSE, OR NON-INFRINGEMENT, ARE * + * DISCLAIMED, EXCEPT TO THE EXTENT THAT SUCH DISCLAIMERS ARE HELD * + * TO BE LEGALLY INVALID. See the GNU General Public License for * + * more details, a copy of which can be found in the file COPYING * + * included with this package. * + * + * Contact Information: + * linux-drivers@emulex.com + * + * Emulex + * 3333 Susan Street + * Costa Mesa, CA 92626 + *******************************************************************/ + +#ifndef __OCRDMA_VERBS_H__ +#define __OCRDMA_VERBS_H__ + +int ocrdma_post_send(struct ib_qp *, struct ib_send_wr *, + struct ib_send_wr **bad_wr); +int ocrdma_post_recv(struct ib_qp *, struct ib_recv_wr *, + struct ib_recv_wr **bad_wr); + +int ocrdma_poll_cq(struct ib_cq *, int num_entries, struct ib_wc *wc); +int ocrdma_arm_cq(struct ib_cq *, enum ib_cq_notify_flags flags); + +int ocrdma_query_device(struct ib_device *, struct ib_device_attr *props); +int ocrdma_query_port(struct ib_device *, u8 port, struct ib_port_attr *props); +int ocrdma_modify_port(struct ib_device *, u8 port, int mask, + struct ib_port_modify *props); + +void ocrdma_get_guid(struct ocrdma_dev *, u8 *guid); +int ocrdma_query_gid(struct ib_device *, u8 port, + int index, union ib_gid *gid); +int ocrdma_query_pkey(struct ib_device *, u8 port, u16 index, u16 *pkey); + +struct ib_ucontext *ocrdma_alloc_ucontext(struct ib_device *, + struct ib_udata *); +int ocrdma_dealloc_ucontext(struct ib_ucontext *); + +int ocrdma_mmap(struct ib_ucontext *, struct vm_area_struct *vma); + +struct ib_pd *ocrdma_alloc_pd(struct ib_device *, + struct ib_ucontext *, struct ib_udata *); +int ocrdma_dealloc_pd(struct ib_pd *pd); + +struct ib_cq *ocrdma_create_cq(struct ib_device *, int entries, int vector, + struct ib_ucontext *, struct ib_udata *); +int ocrdma_resize_cq(struct ib_cq *, int cqe, struct ib_udata *); +int ocrdma_destroy_cq(struct ib_cq *); + +struct ib_qp *ocrdma_create_qp(struct ib_pd *, + struct ib_qp_init_attr *attrs, + struct ib_udata *); +int _ocrdma_modify_qp(struct ib_qp *, struct ib_qp_attr 
*attr, + int attr_mask); +int ocrdma_modify_qp(struct ib_qp *, struct ib_qp_attr *attr, + int attr_mask, struct ib_udata *udata); +int ocrdma_query_qp(struct ib_qp *, + struct ib_qp_attr *qp_attr, + int qp_attr_mask, struct ib_qp_init_attr *); +int ocrdma_destroy_qp(struct ib_qp *); +void ocrdma_del_flush_qp(struct ocrdma_qp *qp); + +struct ib_srq *ocrdma_create_srq(struct ib_pd *, struct ib_srq_init_attr *, + struct ib_udata *); +int ocrdma_modify_srq(struct ib_srq *, struct ib_srq_attr *, + enum ib_srq_attr_mask, struct ib_udata *); +int ocrdma_query_srq(struct ib_srq *, struct ib_srq_attr *); +int ocrdma_destroy_srq(struct ib_srq *); +int ocrdma_post_srq_recv(struct ib_srq *, struct ib_recv_wr *, + struct ib_recv_wr **bad_recv_wr); + +int ocrdma_dereg_mr(struct ib_mr *); +struct ib_mr *ocrdma_get_dma_mr(struct ib_pd *, int acc); +struct ib_mr *ocrdma_reg_kernel_mr(struct ib_pd *, + struct ib_phys_buf *buffer_list, + int num_phys_buf, int acc, u64 *iova_start); +struct ib_mr *ocrdma_reg_user_mr(struct ib_pd *, u64 start, u64 length, + u64 virt, int acc, struct ib_udata *); +struct ib_mr *ocrdma_alloc_frmr(struct ib_pd *pd, int max_page_list_len); +struct ib_fast_reg_page_list *ocrdma_alloc_frmr_page_list(struct ib_device + *ibdev, + int page_list_len); +void ocrdma_free_frmr_page_list(struct ib_fast_reg_page_list *page_list); + +#endif /* __OCRDMA_VERBS_H__ */ diff --git a/kernel/drivers/infiniband/hw/qib/Kconfig b/kernel/drivers/infiniband/hw/qib/Kconfig new file mode 100644 index 000000000..495be0978 --- /dev/null +++ b/kernel/drivers/infiniband/hw/qib/Kconfig @@ -0,0 +1,15 @@ +config INFINIBAND_QIB + tristate "Intel PCIe HCA support" + depends on 64BIT + ---help--- + This is a low-level driver for Intel PCIe QLE InfiniBand host + channel adapters. This driver does not support the Intel + HyperTransport card (model QHT7140). + +config INFINIBAND_QIB_DCA + bool "QIB DCA support" + depends on INFINIBAND_QIB && DCA && SMP && !(INFINIBAND_QIB=y && DCA=m) + default y + ---help--- + Setting this enables DCA support on some Intel chip sets + with the iba7322 HCA. diff --git a/kernel/drivers/infiniband/hw/qib/Makefile b/kernel/drivers/infiniband/hw/qib/Makefile new file mode 100644 index 000000000..57f8103e5 --- /dev/null +++ b/kernel/drivers/infiniband/hw/qib/Makefile @@ -0,0 +1,16 @@ +obj-$(CONFIG_INFINIBAND_QIB) += ib_qib.o + +ib_qib-y := qib_cq.o qib_diag.o qib_dma.o qib_driver.o qib_eeprom.o \ + qib_file_ops.o qib_fs.o qib_init.o qib_intr.o qib_keys.o \ + qib_mad.o qib_mmap.o qib_mr.o qib_pcie.o qib_pio_copy.o \ + qib_qp.o qib_qsfp.o qib_rc.o qib_ruc.o qib_sdma.o qib_srq.o \ + qib_sysfs.o qib_twsi.o qib_tx.o qib_uc.o qib_ud.o \ + qib_user_pages.o qib_user_sdma.o qib_verbs_mcast.o qib_iba7220.o \ + qib_sd7220.o qib_iba7322.o qib_verbs.o + +# 6120 has no fallback if no MSI interrupts, others can do INTx +ib_qib-$(CONFIG_PCI_MSI) += qib_iba6120.o + +ib_qib-$(CONFIG_X86_64) += qib_wc_x86_64.o +ib_qib-$(CONFIG_PPC64) += qib_wc_ppc64.o +ib_qib-$(CONFIG_DEBUG_FS) += qib_debugfs.o diff --git a/kernel/drivers/infiniband/hw/qib/qib.h b/kernel/drivers/infiniband/hw/qib/qib.h new file mode 100644 index 000000000..7df16f74b --- /dev/null +++ b/kernel/drivers/infiniband/hw/qib/qib.h @@ -0,0 +1,1543 @@ +#ifndef _QIB_KERNEL_H +#define _QIB_KERNEL_H +/* + * Copyright (c) 2012, 2013 Intel Corporation. All rights reserved. + * Copyright (c) 2006 - 2012 QLogic Corporation. All rights reserved. + * Copyright (c) 2003, 2004, 2005, 2006 PathScale, Inc. All rights reserved. 
+ * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +/* + * This header file is the base header file for qlogic_ib kernel code + * qib_user.h serves a similar purpose for user code. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "qib_common.h" +#include "qib_verbs.h" + +/* only s/w major version of QLogic_IB we can handle */ +#define QIB_CHIP_VERS_MAJ 2U + +/* don't care about this except printing */ +#define QIB_CHIP_VERS_MIN 0U + +/* The Organization Unique Identifier (Mfg code), and its position in GUID */ +#define QIB_OUI 0x001175 +#define QIB_OUI_LSB 40 + +/* + * per driver stats, either not device nor port-specific, or + * summed over all of the devices and ports. + * They are described by name via ipathfs filesystem, so layout + * and number of elements can change without breaking compatibility. + * If members are added or deleted qib_statnames[] in qib_fs.c must + * change to match. + */ +struct qlogic_ib_stats { + __u64 sps_ints; /* number of interrupts handled */ + __u64 sps_errints; /* number of error interrupts */ + __u64 sps_txerrs; /* tx-related packet errors */ + __u64 sps_rcverrs; /* non-crc rcv packet errors */ + __u64 sps_hwerrs; /* hardware errors reported (parity, etc.) */ + __u64 sps_nopiobufs; /* no pio bufs avail from kernel */ + __u64 sps_ctxts; /* number of contexts currently open */ + __u64 sps_lenerrs; /* number of kernel packets where RHF != LRH len */ + __u64 sps_buffull; + __u64 sps_hdrfull; +}; + +extern struct qlogic_ib_stats qib_stats; +extern const struct pci_error_handlers qib_pci_err_handler; + +#define QIB_CHIP_SWVERSION QIB_CHIP_VERS_MAJ +/* + * First-cut critierion for "device is active" is + * two thousand dwords combined Tx, Rx traffic per + * 5-second interval. SMA packets are 64 dwords, + * and occur "a few per second", presumably each way. + */ +#define QIB_TRAFFIC_ACTIVE_THRESHOLD (2000) + +/* + * Struct used to indicate which errors are logged in each of the + * error-counters that are logged to EEPROM. A counter is incremented + * _once_ (saturating at 255) for each event with any bits set in + * the error or hwerror register masks below. 
+ */ +#define QIB_EEP_LOG_CNT (4) +struct qib_eep_log_mask { + u64 errs_to_log; + u64 hwerrs_to_log; +}; + +/* + * Below contains all data related to a single context (formerly called port). + */ + +#ifdef CONFIG_DEBUG_FS +struct qib_opcode_stats_perctx; +#endif + +struct qib_ctxtdata { + void **rcvegrbuf; + dma_addr_t *rcvegrbuf_phys; + /* rcvhdrq base, needs mmap before useful */ + void *rcvhdrq; + /* kernel virtual address where hdrqtail is updated */ + void *rcvhdrtail_kvaddr; + /* + * temp buffer for expected send setup, allocated at open, instead + * of each setup call + */ + void *tid_pg_list; + /* + * Shared page for kernel to signal user processes that send buffers + * need disarming. The process should call QIB_CMD_DISARM_BUFS + * or QIB_CMD_ACK_EVENT with IPATH_EVENT_DISARM_BUFS set. + */ + unsigned long *user_event_mask; + /* when waiting for rcv or pioavail */ + wait_queue_head_t wait; + /* + * rcvegr bufs base, physical, must fit + * in 44 bits so 32 bit programs mmap64 44 bit works) + */ + dma_addr_t rcvegr_phys; + /* mmap of hdrq, must fit in 44 bits */ + dma_addr_t rcvhdrq_phys; + dma_addr_t rcvhdrqtailaddr_phys; + + /* + * number of opens (including slave sub-contexts) on this instance + * (ignoring forks, dup, etc. for now) + */ + int cnt; + /* + * how much space to leave at start of eager TID entries for + * protocol use, on each TID + */ + /* instead of calculating it */ + unsigned ctxt; + /* local node of context */ + int node_id; + /* non-zero if ctxt is being shared. */ + u16 subctxt_cnt; + /* non-zero if ctxt is being shared. */ + u16 subctxt_id; + /* number of eager TID entries. */ + u16 rcvegrcnt; + /* index of first eager TID entry. */ + u16 rcvegr_tid_base; + /* number of pio bufs for this ctxt (all procs, if shared) */ + u32 piocnt; + /* first pio buffer for this ctxt */ + u32 pio_base; + /* chip offset of PIO buffers for this ctxt */ + u32 piobufs; + /* how many alloc_pages() chunks in rcvegrbuf_pages */ + u32 rcvegrbuf_chunks; + /* how many egrbufs per chunk */ + u16 rcvegrbufs_perchunk; + /* ilog2 of above */ + u16 rcvegrbufs_perchunk_shift; + /* order for rcvegrbuf_pages */ + size_t rcvegrbuf_size; + /* rcvhdrq size (for freeing) */ + size_t rcvhdrq_size; + /* per-context flags for fileops/intr communication */ + unsigned long flag; + /* next expected TID to check when looking for free */ + u32 tidcursor; + /* WAIT_RCV that timed out, no interrupt */ + u32 rcvwait_to; + /* WAIT_PIO that timed out, no interrupt */ + u32 piowait_to; + /* WAIT_RCV already happened, no wait */ + u32 rcvnowait; + /* WAIT_PIO already happened, no wait */ + u32 pionowait; + /* total number of polled urgent packets */ + u32 urgent; + /* saved total number of polled urgent packets for poll edge trigger */ + u32 urgent_poll; + /* pid of process using this ctxt */ + pid_t pid; + pid_t subpid[QLOGIC_IB_MAX_SUBCTXT]; + /* same size as task_struct .comm[], command that opened context */ + char comm[16]; + /* pkeys set by this use of this ctxt */ + u16 pkeys[4]; + /* so file ops can get at unit */ + struct qib_devdata *dd; + /* so funcs that need physical port can get it easily */ + struct qib_pportdata *ppd; + /* A page of memory for rcvhdrhead, rcvegrhead, rcvegrtail * N */ + void *subctxt_uregbase; + /* An array of pages for the eager receive buffers * N */ + void *subctxt_rcvegrbuf; + /* An array of pages for the eager header queue entries * N */ + void *subctxt_rcvhdr_base; + /* The version of the library which opened this ctxt */ + u32 userversion; + /* Bitmask of active slaves */ 
+ u32 active_slaves; + /* Type of packets or conditions we want to poll for */ + u16 poll_type; + /* receive packet sequence counter */ + u8 seq_cnt; + u8 redirect_seq_cnt; + /* ctxt rcvhdrq head offset */ + u32 head; + /* lookaside fields */ + struct qib_qp *lookaside_qp; + u32 lookaside_qpn; + /* QPs waiting for context processing */ + struct list_head qp_wait_list; +#ifdef CONFIG_DEBUG_FS + /* verbs stats per CTX */ + struct qib_opcode_stats_perctx *opstats; +#endif +}; + +struct qib_sge_state; + +struct qib_sdma_txreq { + int flags; + int sg_count; + dma_addr_t addr; + void (*callback)(struct qib_sdma_txreq *, int); + u16 start_idx; /* sdma private */ + u16 next_descq_idx; /* sdma private */ + struct list_head list; /* sdma private */ +}; + +struct qib_sdma_desc { + __le64 qw[2]; +}; + +struct qib_verbs_txreq { + struct qib_sdma_txreq txreq; + struct qib_qp *qp; + struct qib_swqe *wqe; + u32 dwords; + u16 hdr_dwords; + u16 hdr_inx; + struct qib_pio_header *align_buf; + struct qib_mregion *mr; + struct qib_sge_state *ss; +}; + +#define QIB_SDMA_TXREQ_F_USELARGEBUF 0x1 +#define QIB_SDMA_TXREQ_F_HEADTOHOST 0x2 +#define QIB_SDMA_TXREQ_F_INTREQ 0x4 +#define QIB_SDMA_TXREQ_F_FREEBUF 0x8 +#define QIB_SDMA_TXREQ_F_FREEDESC 0x10 + +#define QIB_SDMA_TXREQ_S_OK 0 +#define QIB_SDMA_TXREQ_S_SENDERROR 1 +#define QIB_SDMA_TXREQ_S_ABORTED 2 +#define QIB_SDMA_TXREQ_S_SHUTDOWN 3 + +/* + * Get/Set IB link-level config parameters for f_get/set_ib_cfg() + * Mostly for MADs that set or query link parameters, also ipath + * config interfaces + */ +#define QIB_IB_CFG_LIDLMC 0 /* LID (LS16b) and Mask (MS16b) */ +#define QIB_IB_CFG_LWID_ENB 2 /* allowed Link-width */ +#define QIB_IB_CFG_LWID 3 /* currently active Link-width */ +#define QIB_IB_CFG_SPD_ENB 4 /* allowed Link speeds */ +#define QIB_IB_CFG_SPD 5 /* current Link spd */ +#define QIB_IB_CFG_RXPOL_ENB 6 /* Auto-RX-polarity enable */ +#define QIB_IB_CFG_LREV_ENB 7 /* Auto-Lane-reversal enable */ +#define QIB_IB_CFG_LINKLATENCY 8 /* Link Latency (IB1.2 only) */ +#define QIB_IB_CFG_HRTBT 9 /* IB heartbeat off/enable/auto; DDR/QDR only */ +#define QIB_IB_CFG_OP_VLS 10 /* operational VLs */ +#define QIB_IB_CFG_VL_HIGH_CAP 11 /* num of VL high priority weights */ +#define QIB_IB_CFG_VL_LOW_CAP 12 /* num of VL low priority weights */ +#define QIB_IB_CFG_OVERRUN_THRESH 13 /* IB overrun threshold */ +#define QIB_IB_CFG_PHYERR_THRESH 14 /* IB PHY error threshold */ +#define QIB_IB_CFG_LINKDEFAULT 15 /* IB link default (sleep/poll) */ +#define QIB_IB_CFG_PKEYS 16 /* update partition keys */ +#define QIB_IB_CFG_MTU 17 /* update MTU in IBC */ +#define QIB_IB_CFG_LSTATE 18 /* update linkcmd and linkinitcmd in IBC */ +#define QIB_IB_CFG_VL_HIGH_LIMIT 19 +#define QIB_IB_CFG_PMA_TICKS 20 /* PMA sample tick resolution */ +#define QIB_IB_CFG_PORT 21 /* switch port we are connected to */ + +/* + * for CFG_LSTATE: LINKCMD in upper 16 bits, LINKINITCMD in lower 16 + * IB_LINKINITCMD_POLL and SLEEP are also used as set/get values for + * QIB_IB_CFG_LINKDEFAULT cmd + */ +#define IB_LINKCMD_DOWN (0 << 16) +#define IB_LINKCMD_ARMED (1 << 16) +#define IB_LINKCMD_ACTIVE (2 << 16) +#define IB_LINKINITCMD_NOP 0 +#define IB_LINKINITCMD_POLL 1 +#define IB_LINKINITCMD_SLEEP 2 +#define IB_LINKINITCMD_DISABLE 3 + +/* + * valid states passed to qib_set_linkstate() user call + */ +#define QIB_IB_LINKDOWN 0 +#define QIB_IB_LINKARM 1 +#define QIB_IB_LINKACTIVE 2 +#define QIB_IB_LINKDOWN_ONLY 3 +#define QIB_IB_LINKDOWN_SLEEP 4 +#define QIB_IB_LINKDOWN_DISABLE 5 + +/* + * These 7 values (SDR, 
DDR, and QDR may be ORed for auto-speed + * negotiation) are used for the 3rd argument to path_f_set_ib_cfg + * with cmd QIB_IB_CFG_SPD_ENB, by direct calls or via sysfs. They + * are also the the possible values for qib_link_speed_enabled and active + * The values were chosen to match values used within the IB spec. + */ +#define QIB_IB_SDR 1 +#define QIB_IB_DDR 2 +#define QIB_IB_QDR 4 + +#define QIB_DEFAULT_MTU 4096 + +/* max number of IB ports supported per HCA */ +#define QIB_MAX_IB_PORTS 2 + +/* + * Possible IB config parameters for f_get/set_ib_table() + */ +#define QIB_IB_TBL_VL_HIGH_ARB 1 /* Get/set VL high priority weights */ +#define QIB_IB_TBL_VL_LOW_ARB 2 /* Get/set VL low priority weights */ + +/* + * Possible "operations" for f_rcvctrl(ppd, op, ctxt) + * these are bits so they can be combined, e.g. + * QIB_RCVCTRL_INTRAVAIL_ENB | QIB_RCVCTRL_CTXT_ENB + */ +#define QIB_RCVCTRL_TAILUPD_ENB 0x01 +#define QIB_RCVCTRL_TAILUPD_DIS 0x02 +#define QIB_RCVCTRL_CTXT_ENB 0x04 +#define QIB_RCVCTRL_CTXT_DIS 0x08 +#define QIB_RCVCTRL_INTRAVAIL_ENB 0x10 +#define QIB_RCVCTRL_INTRAVAIL_DIS 0x20 +#define QIB_RCVCTRL_PKEY_ENB 0x40 /* Note, default is enabled */ +#define QIB_RCVCTRL_PKEY_DIS 0x80 +#define QIB_RCVCTRL_BP_ENB 0x0100 +#define QIB_RCVCTRL_BP_DIS 0x0200 +#define QIB_RCVCTRL_TIDFLOW_ENB 0x0400 +#define QIB_RCVCTRL_TIDFLOW_DIS 0x0800 + +/* + * Possible "operations" for f_sendctrl(ppd, op, var) + * these are bits so they can be combined, e.g. + * QIB_SENDCTRL_BUFAVAIL_ENB | QIB_SENDCTRL_ENB + * Some operations (e.g. DISARM, ABORT) are known to + * be "one-shot", so do not modify shadow. + */ +#define QIB_SENDCTRL_DISARM (0x1000) +#define QIB_SENDCTRL_DISARM_BUF(bufn) ((bufn) | QIB_SENDCTRL_DISARM) + /* available (0x2000) */ +#define QIB_SENDCTRL_AVAIL_DIS (0x4000) +#define QIB_SENDCTRL_AVAIL_ENB (0x8000) +#define QIB_SENDCTRL_AVAIL_BLIP (0x10000) +#define QIB_SENDCTRL_SEND_DIS (0x20000) +#define QIB_SENDCTRL_SEND_ENB (0x40000) +#define QIB_SENDCTRL_FLUSH (0x80000) +#define QIB_SENDCTRL_CLEAR (0x100000) +#define QIB_SENDCTRL_DISARM_ALL (0x200000) + +/* + * These are the generic indices for requesting per-port + * counter values via the f_portcntr function. They + * are always returned as 64 bit values, although most + * are 32 bit counters. 
+ */ +/* send-related counters */ +#define QIBPORTCNTR_PKTSEND 0U +#define QIBPORTCNTR_WORDSEND 1U +#define QIBPORTCNTR_PSXMITDATA 2U +#define QIBPORTCNTR_PSXMITPKTS 3U +#define QIBPORTCNTR_PSXMITWAIT 4U +#define QIBPORTCNTR_SENDSTALL 5U +/* receive-related counters */ +#define QIBPORTCNTR_PKTRCV 6U +#define QIBPORTCNTR_PSRCVDATA 7U +#define QIBPORTCNTR_PSRCVPKTS 8U +#define QIBPORTCNTR_RCVEBP 9U +#define QIBPORTCNTR_RCVOVFL 10U +#define QIBPORTCNTR_WORDRCV 11U +/* IB link related error counters */ +#define QIBPORTCNTR_RXLOCALPHYERR 12U +#define QIBPORTCNTR_RXVLERR 13U +#define QIBPORTCNTR_ERRICRC 14U +#define QIBPORTCNTR_ERRVCRC 15U +#define QIBPORTCNTR_ERRLPCRC 16U +#define QIBPORTCNTR_BADFORMAT 17U +#define QIBPORTCNTR_ERR_RLEN 18U +#define QIBPORTCNTR_IBSYMBOLERR 19U +#define QIBPORTCNTR_INVALIDRLEN 20U +#define QIBPORTCNTR_UNSUPVL 21U +#define QIBPORTCNTR_EXCESSBUFOVFL 22U +#define QIBPORTCNTR_ERRLINK 23U +#define QIBPORTCNTR_IBLINKDOWN 24U +#define QIBPORTCNTR_IBLINKERRRECOV 25U +#define QIBPORTCNTR_LLI 26U +/* other error counters */ +#define QIBPORTCNTR_RXDROPPKT 27U +#define QIBPORTCNTR_VL15PKTDROP 28U +#define QIBPORTCNTR_ERRPKEY 29U +#define QIBPORTCNTR_KHDROVFL 30U +/* sampling counters (these are actually control registers) */ +#define QIBPORTCNTR_PSINTERVAL 31U +#define QIBPORTCNTR_PSSTART 32U +#define QIBPORTCNTR_PSSTAT 33U + +/* how often we check for packet activity for "power on hours (in seconds) */ +#define ACTIVITY_TIMER 5 + +#define MAX_NAME_SIZE 64 + +#ifdef CONFIG_INFINIBAND_QIB_DCA +struct qib_irq_notify; +#endif + +struct qib_msix_entry { + struct msix_entry msix; + void *arg; +#ifdef CONFIG_INFINIBAND_QIB_DCA + int dca; + int rcv; + struct qib_irq_notify *notifier; +#endif + char name[MAX_NAME_SIZE]; + cpumask_var_t mask; +}; + +/* Below is an opaque struct. Each chip (device) can maintain + * private data needed for its operation, but not germane to the + * rest of the driver. 
For convenience, we define another that + * is chip-specific, per-port + */ +struct qib_chip_specific; +struct qib_chipport_specific; + +enum qib_sdma_states { + qib_sdma_state_s00_hw_down, + qib_sdma_state_s10_hw_start_up_wait, + qib_sdma_state_s20_idle, + qib_sdma_state_s30_sw_clean_up_wait, + qib_sdma_state_s40_hw_clean_up_wait, + qib_sdma_state_s50_hw_halt_wait, + qib_sdma_state_s99_running, +}; + +enum qib_sdma_events { + qib_sdma_event_e00_go_hw_down, + qib_sdma_event_e10_go_hw_start, + qib_sdma_event_e20_hw_started, + qib_sdma_event_e30_go_running, + qib_sdma_event_e40_sw_cleaned, + qib_sdma_event_e50_hw_cleaned, + qib_sdma_event_e60_hw_halted, + qib_sdma_event_e70_go_idle, + qib_sdma_event_e7220_err_halted, + qib_sdma_event_e7322_err_halted, + qib_sdma_event_e90_timer_tick, +}; + +extern char *qib_sdma_state_names[]; +extern char *qib_sdma_event_names[]; + +struct sdma_set_state_action { + unsigned op_enable:1; + unsigned op_intenable:1; + unsigned op_halt:1; + unsigned op_drain:1; + unsigned go_s99_running_tofalse:1; + unsigned go_s99_running_totrue:1; +}; + +struct qib_sdma_state { + struct kref kref; + struct completion comp; + enum qib_sdma_states current_state; + struct sdma_set_state_action *set_state_action; + unsigned current_op; + unsigned go_s99_running; + unsigned first_sendbuf; + unsigned last_sendbuf; /* really last +1 */ + /* debugging/devel */ + enum qib_sdma_states previous_state; + unsigned previous_op; + enum qib_sdma_events last_event; +}; + +struct xmit_wait { + struct timer_list timer; + u64 counter; + u8 flags; + struct cache { + u64 psxmitdata; + u64 psrcvdata; + u64 psxmitpkts; + u64 psrcvpkts; + u64 psxmitwait; + } counter_cache; +}; + +/* + * The structure below encapsulates data relevant to a physical IB Port. + * Current chips support only one such port, but the separation + * clarifies things a bit. Note that to conform to IB conventions, + * port-numbers are one-based. The first or only port is port1. + */ +struct qib_pportdata { + struct qib_ibport ibport_data; + + struct qib_devdata *dd; + struct qib_chippport_specific *cpspec; /* chip-specific per-port */ + struct kobject pport_kobj; + struct kobject pport_cc_kobj; + struct kobject sl2vl_kobj; + struct kobject diagc_kobj; + + /* GUID for this interface, in network order */ + __be64 guid; + + /* QIB_POLL, etc. link-state specific flags, per port */ + u32 lflags; + /* qib_lflags driver is waiting for */ + u32 state_wanted; + spinlock_t lflags_lock; + + /* ref count for each pkey */ + atomic_t pkeyrefs[4]; + + /* + * this address is mapped readonly into user processes so they can + * get status cheaply, whenever they want. 
One qword of status per port + */ + u64 *statusp; + + /* SendDMA related entries */ + + /* read mostly */ + struct qib_sdma_desc *sdma_descq; + struct workqueue_struct *qib_wq; + struct qib_sdma_state sdma_state; + dma_addr_t sdma_descq_phys; + volatile __le64 *sdma_head_dma; /* DMA'ed by chip */ + dma_addr_t sdma_head_phys; + u16 sdma_descq_cnt; + + /* read/write using lock */ + spinlock_t sdma_lock ____cacheline_aligned_in_smp; + struct list_head sdma_activelist; + struct list_head sdma_userpending; + u64 sdma_descq_added; + u64 sdma_descq_removed; + u16 sdma_descq_tail; + u16 sdma_descq_head; + u8 sdma_generation; + u8 sdma_intrequest; + + struct tasklet_struct sdma_sw_clean_up_task + ____cacheline_aligned_in_smp; + + wait_queue_head_t state_wait; /* for state_wanted */ + + /* HoL blocking for SMP replies */ + unsigned hol_state; + struct timer_list hol_timer; + + /* + * Shadow copies of registers; size indicates read access size. + * Most of them are readonly, but some are write-only register, + * where we manipulate the bits in the shadow copy, and then write + * the shadow copy to qlogic_ib. + * + * We deliberately make most of these 32 bits, since they have + * restricted range. For any that we read, we won't to generate 32 + * bit accesses, since Opteron will generate 2 separate 32 bit HT + * transactions for a 64 bit read, and we want to avoid unnecessary + * bus transactions. + */ + + /* This is the 64 bit group */ + /* last ibcstatus. opaque outside chip-specific code */ + u64 lastibcstat; + + /* these are the "32 bit" regs */ + + /* + * the following two are 32-bit bitmasks, but {test,clear,set}_bit + * all expect bit fields to be "unsigned long" + */ + unsigned long p_rcvctrl; /* shadow per-port rcvctrl */ + unsigned long p_sendctrl; /* shadow per-port sendctrl */ + + u32 ibmtu; /* The MTU programmed for this unit */ + /* + * Current max size IB packet (in bytes) including IB headers, that + * we can send. Changes when ibmtu changes. + */ + u32 ibmaxlen; + /* + * ibmaxlen at init time, limited by chip and by receive buffer + * size. Not changed after init. 
+ */ + u32 init_ibmaxlen; + /* LID programmed for this instance */ + u16 lid; + /* list of pkeys programmed; 0 if not set */ + u16 pkeys[4]; + /* LID mask control */ + u8 lmc; + u8 link_width_supported; + u8 link_speed_supported; + u8 link_width_enabled; + u8 link_speed_enabled; + u8 link_width_active; + u8 link_speed_active; + u8 vls_supported; + u8 vls_operational; + /* Rx Polarity inversion (compensate for ~tx on partner) */ + u8 rx_pol_inv; + + u8 hw_pidx; /* physical port index */ + u8 port; /* IB port number and index into dd->pports - 1 */ + + u8 delay_mult; + + /* used to override LED behavior */ + u8 led_override; /* Substituted for normal value, if non-zero */ + u16 led_override_timeoff; /* delta to next timer event */ + u8 led_override_vals[2]; /* Alternates per blink-frame */ + u8 led_override_phase; /* Just counts, LSB picks from vals[] */ + atomic_t led_override_timer_active; + /* Used to flash LEDs in override mode */ + struct timer_list led_override_timer; + struct xmit_wait cong_stats; + struct timer_list symerr_clear_timer; + + /* Synchronize access between driver writes and sysfs reads */ + spinlock_t cc_shadow_lock + ____cacheline_aligned_in_smp; + + /* Shadow copy of the congestion control table */ + struct cc_table_shadow *ccti_entries_shadow; + + /* Shadow copy of the congestion control entries */ + struct ib_cc_congestion_setting_attr_shadow *congestion_entries_shadow; + + /* List of congestion control table entries */ + struct ib_cc_table_entry_shadow *ccti_entries; + + /* 16 congestion entries with each entry corresponding to a SL */ + struct ib_cc_congestion_entry_shadow *congestion_entries; + + /* Maximum number of congestion control entries that the agent expects + * the manager to send. + */ + u16 cc_supported_table_entries; + + /* Total number of congestion control table entries */ + u16 total_cct_entry; + + /* Bit map identifying service level */ + u16 cc_sl_control_map; + + /* maximum congestion control table index */ + u16 ccti_limit; + + /* CA's max number of 64 entry units in the congestion control table */ + u8 cc_max_table_entries; +}; + +/* Observers. Not to be taken lightly, possibly not to ship. */ +/* + * If a diag read or write is to (bottom <= offset <= top), + * the "hoook" is called, allowing, e.g. shadows to be + * updated in sync with the driver. struct diag_observer + * is the "visible" part. + */ +struct diag_observer; + +typedef int (*diag_hook) (struct qib_devdata *dd, + const struct diag_observer *op, + u32 offs, u64 *data, u64 mask, int only_32); + +struct diag_observer { + diag_hook hook; + u32 bottom; + u32 top; +}; + +extern int qib_register_observer(struct qib_devdata *dd, + const struct diag_observer *op); + +/* Only declared here, not defined. Private to diags */ +struct diag_observer_list_elt; + +/* device data struct now contains only "general per-device" info. + * fields related to a physical IB port are in a qib_pportdata struct, + * described above) while fields only used by a particular chip-type are in + * a qib_chipdata struct, whose contents are opaque to this file. 
+ */ +struct qib_devdata { + struct qib_ibdev verbs_dev; /* must be first */ + struct list_head list; + /* pointers to related structs for this device */ + /* pci access data structure */ + struct pci_dev *pcidev; + struct cdev *user_cdev; + struct cdev *diag_cdev; + struct device *user_device; + struct device *diag_device; + + /* mem-mapped pointer to base of chip regs */ + u64 __iomem *kregbase; + /* end of mem-mapped chip space excluding sendbuf and user regs */ + u64 __iomem *kregend; + /* physical address of chip for io_remap, etc. */ + resource_size_t physaddr; + /* qib_cfgctxts pointers */ + struct qib_ctxtdata **rcd; /* Receive Context Data */ + + /* qib_pportdata, points to array of (physical) port-specific + * data structs, indexed by pidx (0..n-1) + */ + struct qib_pportdata *pport; + struct qib_chip_specific *cspec; /* chip-specific */ + + /* kvirt address of 1st 2k pio buffer */ + void __iomem *pio2kbase; + /* kvirt address of 1st 4k pio buffer */ + void __iomem *pio4kbase; + /* mem-mapped pointer to base of PIO buffers (if using WC PAT) */ + void __iomem *piobase; + /* mem-mapped pointer to base of user chip regs (if using WC PAT) */ + u64 __iomem *userbase; + void __iomem *piovl15base; /* base of VL15 buffers, if not WC */ + /* + * points to area where PIOavail registers will be DMA'ed. + * Has to be on a page of it's own, because the page will be + * mapped into user program space. This copy is *ONLY* ever + * written by DMA, not by the driver! Need a copy per device + * when we get to multiple devices + */ + volatile __le64 *pioavailregs_dma; /* DMA'ed by chip */ + /* physical address where updates occur */ + dma_addr_t pioavailregs_phys; + + /* device-specific implementations of functions needed by + * common code. Contrary to previous consensus, we can't + * really just point to a device-specific table, because we + * may need to "bend", e.g. 
*_f_put_tid + */ + /* fallback to alternate interrupt type if possible */ + int (*f_intr_fallback)(struct qib_devdata *); + /* hard reset chip */ + int (*f_reset)(struct qib_devdata *); + void (*f_quiet_serdes)(struct qib_pportdata *); + int (*f_bringup_serdes)(struct qib_pportdata *); + int (*f_early_init)(struct qib_devdata *); + void (*f_clear_tids)(struct qib_devdata *, struct qib_ctxtdata *); + void (*f_put_tid)(struct qib_devdata *, u64 __iomem*, + u32, unsigned long); + void (*f_cleanup)(struct qib_devdata *); + void (*f_setextled)(struct qib_pportdata *, u32); + /* fill out chip-specific fields */ + int (*f_get_base_info)(struct qib_ctxtdata *, struct qib_base_info *); + /* free irq */ + void (*f_free_irq)(struct qib_devdata *); + struct qib_message_header *(*f_get_msgheader) + (struct qib_devdata *, __le32 *); + void (*f_config_ctxts)(struct qib_devdata *); + int (*f_get_ib_cfg)(struct qib_pportdata *, int); + int (*f_set_ib_cfg)(struct qib_pportdata *, int, u32); + int (*f_set_ib_loopback)(struct qib_pportdata *, const char *); + int (*f_get_ib_table)(struct qib_pportdata *, int, void *); + int (*f_set_ib_table)(struct qib_pportdata *, int, void *); + u32 (*f_iblink_state)(u64); + u8 (*f_ibphys_portstate)(u64); + void (*f_xgxs_reset)(struct qib_pportdata *); + /* per chip actions needed for IB Link up/down changes */ + int (*f_ib_updown)(struct qib_pportdata *, int, u64); + u32 __iomem *(*f_getsendbuf)(struct qib_pportdata *, u64, u32 *); + /* Read/modify/write of GPIO pins (potentially chip-specific) */ + int (*f_gpio_mod)(struct qib_devdata *dd, u32 out, u32 dir, + u32 mask); + /* Enable writes to config EEPROM (if supported) */ + int (*f_eeprom_wen)(struct qib_devdata *dd, int wen); + /* + * modify rcvctrl shadow[s] and write to appropriate chip-regs. + * see above QIB_RCVCTRL_xxx_ENB/DIS for operations. + * (ctxt == -1) means "all contexts", only meaningful for + * clearing. Could remove if chip_spec shutdown properly done. + */ + void (*f_rcvctrl)(struct qib_pportdata *, unsigned int op, + int ctxt); + /* Read/modify/write sendctrl appropriately for op and port.
*/ + void (*f_sendctrl)(struct qib_pportdata *, u32 op); + void (*f_set_intr_state)(struct qib_devdata *, u32); + void (*f_set_armlaunch)(struct qib_devdata *, u32); + void (*f_wantpiobuf_intr)(struct qib_devdata *, u32); + int (*f_late_initreg)(struct qib_devdata *); + int (*f_init_sdma_regs)(struct qib_pportdata *); + u16 (*f_sdma_gethead)(struct qib_pportdata *); + int (*f_sdma_busy)(struct qib_pportdata *); + void (*f_sdma_update_tail)(struct qib_pportdata *, u16); + void (*f_sdma_set_desc_cnt)(struct qib_pportdata *, unsigned); + void (*f_sdma_sendctrl)(struct qib_pportdata *, unsigned); + void (*f_sdma_hw_clean_up)(struct qib_pportdata *); + void (*f_sdma_hw_start_up)(struct qib_pportdata *); + void (*f_sdma_init_early)(struct qib_pportdata *); + void (*f_set_cntr_sample)(struct qib_pportdata *, u32, u32); + void (*f_update_usrhead)(struct qib_ctxtdata *, u64, u32, u32, u32); + u32 (*f_hdrqempty)(struct qib_ctxtdata *); + u64 (*f_portcntr)(struct qib_pportdata *, u32); + u32 (*f_read_cntrs)(struct qib_devdata *, loff_t, char **, + u64 **); + u32 (*f_read_portcntrs)(struct qib_devdata *, loff_t, u32, + char **, u64 **); + u32 (*f_setpbc_control)(struct qib_pportdata *, u32, u8, u8); + void (*f_initvl15_bufs)(struct qib_devdata *); + void (*f_init_ctxt)(struct qib_ctxtdata *); + void (*f_txchk_change)(struct qib_devdata *, u32, u32, u32, + struct qib_ctxtdata *); + void (*f_writescratch)(struct qib_devdata *, u32); + int (*f_tempsense_rd)(struct qib_devdata *, int regnum); +#ifdef CONFIG_INFINIBAND_QIB_DCA + int (*f_notify_dca)(struct qib_devdata *, unsigned long event); +#endif + + char *boardname; /* human readable board info */ + + /* template for writing TIDs */ + u64 tidtemplate; + /* value to write to free TIDs */ + u64 tidinvalid; + + /* number of registers used for pioavail */ + u32 pioavregs; + /* device (not port) flags, basically device capabilities */ + u32 flags; + /* last buffer for user use */ + u32 lastctxt_piobuf; + + /* reset value */ + u64 z_int_counter; + /* percpu intcounter */ + u64 __percpu *int_counter; + + /* pio bufs allocated per ctxt */ + u32 pbufsctxt; + /* if remainder on bufs/ctxt, ctxts < extrabuf get 1 extra */ + u32 ctxts_extrabuf; + /* + * number of ctxts configured as max; zero is set to number chip + * supports, less gives more pio bufs/ctxt, etc. 
+ */ + u32 cfgctxts; + /* + * number of ctxts available for PSM open + */ + u32 freectxts; + + /* + * hint that we should update pioavailshadow before + * looking for a PIO buffer + */ + u32 upd_pio_shadow; + + /* internal debugging stats */ + u32 maxpkts_call; + u32 avgpkts_call; + u64 nopiobufs; + + /* PCI Vendor ID (here for NodeInfo) */ + u16 vendorid; + /* PCI Device ID (here for NodeInfo) */ + u16 deviceid; + /* for write combining settings */ + int wc_cookie; + unsigned long wc_base; + unsigned long wc_len; + + /* shadow copy of struct page *'s for exp tid pages */ + struct page **pageshadow; + /* shadow copy of dma handles for exp tid pages */ + dma_addr_t *physshadow; + u64 __iomem *egrtidbase; + spinlock_t sendctrl_lock; /* protect changes to sendctrl shadow */ + /* around rcd and (user ctxts) ctxt_cnt use (intr vs free) */ + spinlock_t uctxt_lock; /* rcd and user context changes */ + /* + * per unit status, see also portdata statusp + * mapped readonly into user processes so they can get unit and + * IB link status cheaply + */ + u64 *devstatusp; + char *freezemsg; /* freeze msg if hw error put chip in freeze */ + u32 freezelen; /* max length of freezemsg */ + /* timer used to prevent stats overflow, error throttling, etc. */ + struct timer_list stats_timer; + + /* timer to verify interrupts work, and fallback if possible */ + struct timer_list intrchk_timer; + unsigned long ureg_align; /* user register alignment */ + + /* + * Protects pioavailshadow, pioavailkernel, pio_need_disarm, and + * pio_writing. + */ + spinlock_t pioavail_lock; + /* + * index of last buffer to optimize search for next + */ + u32 last_pio; + /* + * min kernel pio buffer to optimize search + */ + u32 min_kernel_pio; + /* + * Shadow copies of registers; size indicates read access size. + * Most of them are readonly, but some are write-only registers, + * where we manipulate the bits in the shadow copy, and then write + * the shadow copy to qlogic_ib. + * + * We deliberately make most of these 32 bits, since they have + * restricted range. For any that we read, we want to generate 32 + * bit accesses, since Opteron will generate 2 separate 32 bit HT + * transactions for a 64 bit read, and we want to avoid unnecessary + * bus transactions. + */ + + /* This is the 64 bit group */ + + unsigned long pioavailshadow[6]; + /* bitmap of send buffers available for the kernel to use with PIO. */ + unsigned long pioavailkernel[6]; + /* bitmap of send buffers which need to be disarmed. */ + unsigned long pio_need_disarm[3]; + /* bitmap of send buffers which are being written to.
*/ + unsigned long pio_writing[3]; + /* kr_revision shadow */ + u64 revision; + /* Base GUID for device (from eeprom, network order) */ + __be64 base_guid; + + /* + * kr_sendpiobufbase value (chip offset of pio buffers), and the + * base of the 2KB buffers (user processes only use 2K) + */ + u64 piobufbase; + u32 pio2k_bufbase; + + /* these are the "32 bit" regs */ + + /* number of GUIDs in the flash for this interface */ + u32 nguid; + /* + * the following two are 32-bit bitmasks, but {test,clear,set}_bit + * all expect bit fields to be "unsigned long" + */ + unsigned long rcvctrl; /* shadow per device rcvctrl */ + unsigned long sendctrl; /* shadow per device sendctrl */ + + /* value we put in kr_rcvhdrcnt */ + u32 rcvhdrcnt; + /* value we put in kr_rcvhdrsize */ + u32 rcvhdrsize; + /* value we put in kr_rcvhdrentsize */ + u32 rcvhdrentsize; + /* kr_ctxtcnt value */ + u32 ctxtcnt; + /* kr_pagealign value */ + u32 palign; + /* number of "2KB" PIO buffers */ + u32 piobcnt2k; + /* size in bytes of "2KB" PIO buffers */ + u32 piosize2k; + /* max usable size in dwords of a "2KB" PIO buffer before going "4KB" */ + u32 piosize2kmax_dwords; + /* number of "4KB" PIO buffers */ + u32 piobcnt4k; + /* size in bytes of "4KB" PIO buffers */ + u32 piosize4k; + /* kr_rcvegrbase value */ + u32 rcvegrbase; + /* kr_rcvtidbase value */ + u32 rcvtidbase; + /* kr_rcvtidcnt value */ + u32 rcvtidcnt; + /* kr_userregbase */ + u32 uregbase; + /* shadow the control register contents */ + u32 control; + + /* chip address space used by 4k pio buffers */ + u32 align4k; + /* size of each rcvegrbuffer */ + u16 rcvegrbufsize; + /* log2 of above */ + u16 rcvegrbufsize_shift; + /* localbus width (1, 2, 4, 8, 16, 32) from config space */ + u32 lbus_width; + /* localbus speed in MHz */ + u32 lbus_speed; + int unit; /* unit # of this chip */ + + /* start of CHIP_SPEC move to chipspec, but need code changes */ + /* low and high portions of MSI capability/vector */ + u32 msi_lo; + /* saved after PCIe init for restore after reset */ + u32 msi_hi; + /* MSI data (vector) saved for restore */ + u16 msi_data; + /* so we can rewrite it after a chip reset */ + u32 pcibar0; + /* so we can rewrite it after a chip reset */ + u32 pcibar1; + u64 rhdrhead_intr_off; + + /* + * ASCII serial number, from flash, large enough for original + * all digit strings, and longer QLogic serial number format + */ + u8 serial[16]; + /* human readable board version */ + u8 boardversion[96]; + u8 lbus_info[32]; /* human readable localbus info */ + /* chip major rev, from qib_revision */ + u8 majrev; + /* chip minor rev, from qib_revision */ + u8 minrev; + + /* Misc small ints */ + /* Number of physical ports available */ + u8 num_pports; + /* Lowest context number which can be used by user processes */ + u8 first_user_ctxt; + u8 n_krcv_queues; + u8 qpn_mask; + u8 skip_kctxt_mask; + + u16 rhf_offset; /* offset of RHF within receive header entry */ + + /* + * GPIO pins for twsi-connected devices, and device code for eeprom + */ + u8 gpio_sda_num; + u8 gpio_scl_num; + u8 twsi_eeprom_dev; + u8 board_atten; + + /* Support (including locks) for EEPROM logging of errors and time */ + /* control access to actual counters, timer */ + spinlock_t eep_st_lock; + /* control high-level access to EEPROM */ + struct mutex eep_lock; + uint64_t traffic_wds; + /* + * masks for which bits of errs, hwerrs that cause + * each of the counters to increment.
+ */ + struct qib_eep_log_mask eep_st_masks[QIB_EEP_LOG_CNT]; + struct qib_diag_client *diag_client; + spinlock_t qib_diag_trans_lock; /* protect diag observer ops */ + struct diag_observer_list_elt *diag_observer_list; + + u8 psxmitwait_supported; + /* cycle length of PS* counters in HW (in picoseconds) */ + u16 psxmitwait_check_rate; + /* high volume overflow errors deferred to tasklet */ + struct tasklet_struct error_tasklet; + /* per device cq worker */ + struct kthread_worker *worker; + + int assigned_node_id; /* NUMA node closest to HCA */ +}; + +/* hol_state values */ +#define QIB_HOL_UP 0 +#define QIB_HOL_INIT 1 + +#define QIB_SDMA_SENDCTRL_OP_ENABLE (1U << 0) +#define QIB_SDMA_SENDCTRL_OP_INTENABLE (1U << 1) +#define QIB_SDMA_SENDCTRL_OP_HALT (1U << 2) +#define QIB_SDMA_SENDCTRL_OP_CLEANUP (1U << 3) +#define QIB_SDMA_SENDCTRL_OP_DRAIN (1U << 4) + +/* operation types for f_txchk_change() */ +#define TXCHK_CHG_TYPE_DIS1 3 +#define TXCHK_CHG_TYPE_ENAB1 2 +#define TXCHK_CHG_TYPE_KERN 1 +#define TXCHK_CHG_TYPE_USER 0 + +#define QIB_CHASE_TIME msecs_to_jiffies(145) +#define QIB_CHASE_DIS_TIME msecs_to_jiffies(160) + +/* Private data for file operations */ +struct qib_filedata { + struct qib_ctxtdata *rcd; + unsigned subctxt; + unsigned tidcursor; + struct qib_user_sdma_queue *pq; + int rec_cpu_num; /* for cpu affinity; -1 if none */ +}; + +extern struct list_head qib_dev_list; +extern spinlock_t qib_devs_lock; +extern struct qib_devdata *qib_lookup(int unit); +extern u32 qib_cpulist_count; +extern unsigned long *qib_cpulist; + +extern unsigned qib_cc_table_size; +int qib_init(struct qib_devdata *, int); +int init_chip_wc_pat(struct qib_devdata *dd, u32); +int qib_enable_wc(struct qib_devdata *dd); +void qib_disable_wc(struct qib_devdata *dd); +int qib_count_units(int *npresentp, int *nupp); +int qib_count_active_units(void); + +int qib_cdev_init(int minor, const char *name, + const struct file_operations *fops, + struct cdev **cdevp, struct device **devp); +void qib_cdev_cleanup(struct cdev **cdevp, struct device **devp); +int qib_dev_init(void); +void qib_dev_cleanup(void); + +int qib_diag_add(struct qib_devdata *); +void qib_diag_remove(struct qib_devdata *); +void qib_handle_e_ibstatuschanged(struct qib_pportdata *, u64); +void qib_sdma_update_tail(struct qib_pportdata *, u16); /* hold sdma_lock */ + +int qib_decode_err(struct qib_devdata *dd, char *buf, size_t blen, u64 err); +void qib_bad_intrstatus(struct qib_devdata *); +void qib_handle_urcv(struct qib_devdata *, u64); + +/* clean up any per-chip chip-specific stuff */ +void qib_chip_cleanup(struct qib_devdata *); +/* clean up any chip type-specific stuff */ +void qib_chip_done(void); + +/* check to see if we have to force ordering for write combining */ +int qib_unordered_wc(void); +void qib_pio_copy(void __iomem *to, const void *from, size_t count); + +void qib_disarm_piobufs(struct qib_devdata *, unsigned, unsigned); +int qib_disarm_piobufs_ifneeded(struct qib_ctxtdata *); +void qib_disarm_piobufs_set(struct qib_devdata *, unsigned long *, unsigned); +void qib_cancel_sends(struct qib_pportdata *); + +int qib_create_rcvhdrq(struct qib_devdata *, struct qib_ctxtdata *); +int qib_setup_eagerbufs(struct qib_ctxtdata *); +void qib_set_ctxtcnt(struct qib_devdata *); +int qib_create_ctxts(struct qib_devdata *dd); +struct qib_ctxtdata *qib_create_ctxtdata(struct qib_pportdata *, u32, int); +int qib_init_pportdata(struct qib_pportdata *, struct qib_devdata *, u8, u8); +void qib_free_ctxtdata(struct qib_devdata *, struct qib_ctxtdata *);
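As a minimal sketch of how the diag observer interface declared earlier might be used, the following registers a hook over a hypothetical register window. It is illustrative only and not part of the imported driver source: struct diag_observer, the diag_hook typedef and qib_register_observer() come from this header, while example_reg_shadow and the 0x100-0x108 range are made up for the example.

/*
 * Illustrative sketch -- not part of the qib driver. Assumes the qib
 * header above is included; the register window and the shadow
 * variable below are hypothetical.
 */
static u64 example_reg_shadow;	/* hypothetical shadow kept in sync */

/* Called for diag reads/writes with op->bottom <= offs <= op->top */
static int example_diag_hook(struct qib_devdata *dd,
			     const struct diag_observer *op,
			     u32 offs, u64 *data, u64 mask, int only_32)
{
	/* one possible use: keep a private shadow in sync with diags */
	example_reg_shadow = (example_reg_shadow & ~mask) | (*data & mask);
	return 0;	/* placeholder; real return semantics are set by the diag code */
}

static const struct diag_observer example_observer = {
	.hook	= example_diag_hook,
	.bottom	= 0x100,	/* hypothetical register window */
	.top	= 0x108,
};

/* typically registered once during chip-specific init: */
/*	qib_register_observer(dd, &example_observer); */

The return-value convention and the exact meaning of mask and only_32 are defined by the driver's diag support code, so a real observer would follow whatever that code expects; this sketch only shows the registration plumbing.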
+ +u32 qib_kreceive(struct qib_ctxtdata *, u32 *, u32 *); +int qib_reset_device(int); +int qib_wait_linkstate(struct qib_pportdata *, u32, int); +int qib_set_linkstate(struct qib_pportdata *, u8); +int qib_set_mtu(struct qib_pportdata *, u16); +int qib_set_lid(struct qib_pportdata *, u32, u8); +void qib_hol_down(struct qib_pportdata *); +void qib_hol_init(struct qib_pportdata *); +void qib_hol_up(struct qib_pportdata *); +void qib_hol_event(unsigned long); +void qib_disable_after_error(struct qib_devdata *); +int qib_set_uevent_bits(struct qib_pportdata *, const int); + +/* for use in system calls, where we want to know device type, etc. */ +#define ctxt_fp(fp) \ + (((struct qib_filedata *)(fp)->private_data)->rcd) +#define subctxt_fp(fp) \ + (((struct qib_filedata *)(fp)->private_data)->subctxt) +#define tidcursor_fp(fp) \ + (((struct qib_filedata *)(fp)->private_data)->tidcursor) +#define user_sdma_queue_fp(fp) \ + (((struct qib_filedata *)(fp)->private_data)->pq) + +static inline struct qib_devdata *dd_from_ppd(struct qib_pportdata *ppd) +{ + return ppd->dd; +} + +static inline struct qib_devdata *dd_from_dev(struct qib_ibdev *dev) +{ + return container_of(dev, struct qib_devdata, verbs_dev); +} + +static inline struct qib_devdata *dd_from_ibdev(struct ib_device *ibdev) +{ + return dd_from_dev(to_idev(ibdev)); +} + +static inline struct qib_pportdata *ppd_from_ibp(struct qib_ibport *ibp) +{ + return container_of(ibp, struct qib_pportdata, ibport_data); +} + +static inline struct qib_ibport *to_iport(struct ib_device *ibdev, u8 port) +{ + struct qib_devdata *dd = dd_from_ibdev(ibdev); + unsigned pidx = port - 1; /* IB number port from 1, hdw from 0 */ + + WARN_ON(pidx >= dd->num_pports); + return &dd->pport[pidx].ibport_data; +} + +/* + * values for dd->flags (_device_ related flags) + */ +#define QIB_HAS_LINK_LATENCY 0x1 /* supports link latency (IB 1.2) */ +#define QIB_INITTED 0x2 /* chip and driver up and initted */ +#define QIB_DOING_RESET 0x4 /* in the middle of doing chip reset */ +#define QIB_PRESENT 0x8 /* chip accesses can be done */ +#define QIB_PIO_FLUSH_WC 0x10 /* Needs Write combining flush for PIO */ +#define QIB_HAS_THRESH_UPDATE 0x40 +#define QIB_HAS_SDMA_TIMEOUT 0x80 +#define QIB_USE_SPCL_TRIG 0x100 /* SpecialTrigger launch enabled */ +#define QIB_NODMA_RTAIL 0x200 /* rcvhdrtail register DMA enabled */ +#define QIB_HAS_INTX 0x800 /* Supports INTx interrupts */ +#define QIB_HAS_SEND_DMA 0x1000 /* Supports Send DMA */ +#define QIB_HAS_VLSUPP 0x2000 /* Supports multiple VLs; PBC different */ +#define QIB_HAS_HDRSUPP 0x4000 /* Supports header suppression */ +#define QIB_BADINTR 0x8000 /* severe interrupt problems */ +#define QIB_DCA_ENABLED 0x10000 /* Direct Cache Access enabled */ +#define QIB_HAS_QSFP 0x20000 /* device (card instance) has QSFP */ + +/* + * values for ppd->lflags (_ib_port_ related flags) + */ +#define QIBL_LINKV 0x1 /* IB link state valid */ +#define QIBL_LINKDOWN 0x8 /* IB link is down */ +#define QIBL_LINKINIT 0x10 /* IB link level is up */ +#define QIBL_LINKARMED 0x20 /* IB link is ARMED */ +#define QIBL_LINKACTIVE 0x40 /* IB link is ACTIVE */ +/* leave a gap for more IB-link state */ +#define QIBL_IB_AUTONEG_INPROG 0x1000 /* non-IBTA DDR/QDR neg active */ +#define QIBL_IB_AUTONEG_FAILED 0x2000 /* non-IBTA DDR/QDR neg failed */ +#define QIBL_IB_LINK_DISABLED 0x4000 /* Linkdown-disable forced, + * Do not try to bring up */ +#define QIBL_IB_FORCE_NOTIFY 0x8000 /* force notify on next ib change */ + +/* IB dword length mask in PBC (lower 11 bits); same
for all chips */ +#define QIB_PBC_LENGTH_MASK ((1 << 11) - 1) + + +/* ctxt_flag bit offsets */ + /* waiting for a packet to arrive */ +#define QIB_CTXT_WAITING_RCV 2 + /* master has not finished initializing */ +#define QIB_CTXT_MASTER_UNINIT 4 + /* waiting for an urgent packet to arrive */ +#define QIB_CTXT_WAITING_URG 5 + +/* free up any allocated data at closes */ +void qib_free_data(struct qib_ctxtdata *dd); +void qib_chg_pioavailkernel(struct qib_devdata *, unsigned, unsigned, + u32, struct qib_ctxtdata *); +struct qib_devdata *qib_init_iba7322_funcs(struct pci_dev *, + const struct pci_device_id *); +struct qib_devdata *qib_init_iba7220_funcs(struct pci_dev *, + const struct pci_device_id *); +struct qib_devdata *qib_init_iba6120_funcs(struct pci_dev *, + const struct pci_device_id *); +void qib_free_devdata(struct qib_devdata *); +struct qib_devdata *qib_alloc_devdata(struct pci_dev *pdev, size_t extra); + +#define QIB_TWSI_NO_DEV 0xFF +/* Below qib_twsi_ functions must be called with eep_lock held */ +int qib_twsi_reset(struct qib_devdata *dd); +int qib_twsi_blk_rd(struct qib_devdata *dd, int dev, int addr, void *buffer, + int len); +int qib_twsi_blk_wr(struct qib_devdata *dd, int dev, int addr, + const void *buffer, int len); +void qib_get_eeprom_info(struct qib_devdata *); +#define qib_inc_eeprom_err(dd, eidx, incr) +void qib_dump_lookup_output_queue(struct qib_devdata *); +void qib_force_pio_avail_update(struct qib_devdata *); +void qib_clear_symerror_on_linkup(unsigned long opaque); + +/* + * Set LED override, only the two LSBs have "public" meaning, but + * any non-zero value substitutes them for the Link and LinkTrain + * LED states. + */ +#define QIB_LED_PHYS 1 /* Physical (linktraining) GREEN LED */ +#define QIB_LED_LOG 2 /* Logical (link) YELLOW LED */ +void qib_set_led_override(struct qib_pportdata *ppd, unsigned int val); + +/* send dma routines */ +int qib_setup_sdma(struct qib_pportdata *); +void qib_teardown_sdma(struct qib_pportdata *); +void __qib_sdma_intr(struct qib_pportdata *); +void qib_sdma_intr(struct qib_pportdata *); +void qib_user_sdma_send_desc(struct qib_pportdata *dd, + struct list_head *pktlist); +int qib_sdma_verbs_send(struct qib_pportdata *, struct qib_sge_state *, + u32, struct qib_verbs_txreq *); +/* ppd->sdma_lock should be locked before calling this. */ +int qib_sdma_make_progress(struct qib_pportdata *dd); + +static inline int qib_sdma_empty(const struct qib_pportdata *ppd) +{ + return ppd->sdma_descq_added == ppd->sdma_descq_removed; +} + +/* must be called under qib_sdma_lock */ +static inline u16 qib_sdma_descq_freecnt(const struct qib_pportdata *ppd) +{ + return ppd->sdma_descq_cnt - + (ppd->sdma_descq_added - ppd->sdma_descq_removed) - 1; +} + +static inline int __qib_sdma_running(struct qib_pportdata *ppd) +{ + return ppd->sdma_state.current_state == qib_sdma_state_s99_running; +} +int qib_sdma_running(struct qib_pportdata *); +void dump_sdma_state(struct qib_pportdata *ppd); +void __qib_sdma_process_event(struct qib_pportdata *, enum qib_sdma_events); +void qib_sdma_process_event(struct qib_pportdata *, enum qib_sdma_events); + +/* + * number of words used for protocol header if not set by qib_userinit(); + */ +#define QIB_DFLT_RCVHDRSIZE 9 + +/* + * We need to be able to handle an IB header of at least 24 dwords. + * We need the rcvhdrq large enough to handle largest IB header, but + * still have room for a 2KB MTU standard IB packet. 
+ * Additionally, some processor/memory controller combinations + * benefit quite strongly from having the DMA'ed data be cacheline + * aligned and a cacheline multiple, so we set the size to 32 dwords + * (2 64-byte primary cachelines for pretty much all processors of + * interest). The alignment hurts nothing, other than using somewhat + * more memory. + */ +#define QIB_RCVHDR_ENTSIZE 32 + +int qib_get_user_pages(unsigned long, size_t, struct page **); +void qib_release_user_pages(struct page **, size_t); +int qib_eeprom_read(struct qib_devdata *, u8, void *, int); +int qib_eeprom_write(struct qib_devdata *, u8, const void *, int); +u32 __iomem *qib_getsendbuf_range(struct qib_devdata *, u32 *, u32, u32); +void qib_sendbuf_done(struct qib_devdata *, unsigned); + +static inline void qib_clear_rcvhdrtail(const struct qib_ctxtdata *rcd) +{ + *((u64 *) rcd->rcvhdrtail_kvaddr) = 0ULL; +} + +static inline u32 qib_get_rcvhdrtail(const struct qib_ctxtdata *rcd) +{ + /* + * volatile because it's a DMA target from the chip, routine is + * inlined, and don't want register caching or reordering. + */ + return (u32) le64_to_cpu( + *((volatile __le64 *)rcd->rcvhdrtail_kvaddr)); /* DMA'ed */ +} + +static inline u32 qib_get_hdrqtail(const struct qib_ctxtdata *rcd) +{ + const struct qib_devdata *dd = rcd->dd; + u32 hdrqtail; + + if (dd->flags & QIB_NODMA_RTAIL) { + __le32 *rhf_addr; + u32 seq; + + rhf_addr = (__le32 *) rcd->rcvhdrq + + rcd->head + dd->rhf_offset; + seq = qib_hdrget_seq(rhf_addr); + hdrqtail = rcd->head; + if (seq == rcd->seq_cnt) + hdrqtail++; + } else + hdrqtail = qib_get_rcvhdrtail(rcd); + + return hdrqtail; +} + +/* + * sysfs interface. + */ + +extern const char ib_qib_version[]; + +int qib_device_create(struct qib_devdata *); +void qib_device_remove(struct qib_devdata *); + +int qib_create_port_files(struct ib_device *ibdev, u8 port_num, + struct kobject *kobj); +int qib_verbs_register_sysfs(struct qib_devdata *); +void qib_verbs_unregister_sysfs(struct qib_devdata *); +/* Hook for sysfs read of QSFP */ +extern int qib_qsfp_dump(struct qib_pportdata *ppd, char *buf, int len); + +int __init qib_init_qibfs(void); +int __exit qib_exit_qibfs(void); + +int qibfs_add(struct qib_devdata *); +int qibfs_remove(struct qib_devdata *); + +int qib_pcie_init(struct pci_dev *, const struct pci_device_id *); +int qib_pcie_ddinit(struct qib_devdata *, struct pci_dev *, + const struct pci_device_id *); +void qib_pcie_ddcleanup(struct qib_devdata *); +int qib_pcie_params(struct qib_devdata *, u32, u32 *, struct qib_msix_entry *); +int qib_reinit_intr(struct qib_devdata *); +void qib_enable_intx(struct pci_dev *); +void qib_nomsi(struct qib_devdata *); +void qib_nomsix(struct qib_devdata *); +void qib_pcie_getcmd(struct qib_devdata *, u16 *, u8 *, u8 *); +void qib_pcie_reenable(struct qib_devdata *, u16, u8, u8); +/* interrupts for device */ +u64 qib_int_counter(struct qib_devdata *); +/* interrupt for all devices */ +u64 qib_sps_ints(void); + +/* + * dma_addr wrappers - all 0's invalid for hw + */ +dma_addr_t qib_map_page(struct pci_dev *, struct page *, unsigned long, + size_t, int); +const char *qib_get_unit_name(int unit); + +/* + * Flush write combining store buffers (if present) and perform a write + * barrier. 
+ */ +static inline void qib_flush_wc(void) +{ +#if defined(CONFIG_X86_64) + asm volatile("sfence" : : : "memory"); +#else + wmb(); /* no reorder around wc flush */ +#endif +} + +/* global module parameter variables */ +extern unsigned qib_ibmtu; +extern ushort qib_cfgctxts; +extern ushort qib_num_cfg_vls; +extern ushort qib_mini_init; /* If set, do few (ideally 0) writes to chip */ +extern unsigned qib_n_krcv_queues; +extern unsigned qib_sdma_fetch_arb; +extern unsigned qib_compat_ddr_negotiate; +extern int qib_special_trigger; +extern unsigned qib_numa_aware; + +extern struct mutex qib_mutex; + +/* Number of seconds before our card status check... */ +#define STATUS_TIMEOUT 60 + +#define QIB_DRV_NAME "ib_qib" +#define QIB_USER_MINOR_BASE 0 +#define QIB_TRACE_MINOR 127 +#define QIB_DIAGPKT_MINOR 128 +#define QIB_DIAG_MINOR_BASE 129 +#define QIB_NMINORS 255 + +#define PCI_VENDOR_ID_PATHSCALE 0x1fc1 +#define PCI_VENDOR_ID_QLOGIC 0x1077 +#define PCI_DEVICE_ID_QLOGIC_IB_6120 0x10 +#define PCI_DEVICE_ID_QLOGIC_IB_7220 0x7220 +#define PCI_DEVICE_ID_QLOGIC_IB_7322 0x7322 + +/* + * qib_early_err is used (only!) to print early errors before devdata is + * allocated, or when dd->pcidev may not be valid, and at the tail end of + * cleanup when devdata may have been freed, etc. qib_dev_porterr is + * the same as qib_dev_err, but is used when the message really needs + * the IB port# to be definitive as to what's happening.. + * All of these go to the trace log, and the trace log entry is done + * first to avoid possible serial port delays from printk. + */ +#define qib_early_err(dev, fmt, ...) \ + dev_err(dev, fmt, ##__VA_ARGS__) + +#define qib_dev_err(dd, fmt, ...) \ + dev_err(&(dd)->pcidev->dev, "%s: " fmt, \ + qib_get_unit_name((dd)->unit), ##__VA_ARGS__) + +#define qib_dev_warn(dd, fmt, ...) \ + dev_warn(&(dd)->pcidev->dev, "%s: " fmt, \ + qib_get_unit_name((dd)->unit), ##__VA_ARGS__) + +#define qib_dev_porterr(dd, port, fmt, ...) \ + dev_err(&(dd)->pcidev->dev, "%s: IB%u:%u " fmt, \ + qib_get_unit_name((dd)->unit), (dd)->unit, (port), \ + ##__VA_ARGS__) + +#define qib_devinfo(pcidev, fmt, ...) \ + dev_info(&(pcidev)->dev, fmt, ##__VA_ARGS__) + +/* + * this is used for formatting hw error messages... + */ +struct qib_hwerror_msgs { + u64 mask; + const char *msg; + size_t sz; +}; + +#define QLOGIC_IB_HWE_MSG(a, b) { .mask = a, .msg = b } + +/* in qib_intr.c... */ +void qib_format_hwerrors(u64 hwerrs, + const struct qib_hwerror_msgs *hwerrmsgs, + size_t nhwerrmsgs, char *msg, size_t lmsg); +#endif /* _QIB_KERNEL_H */ diff --git a/kernel/drivers/infiniband/hw/qib/qib_6120_regs.h b/kernel/drivers/infiniband/hw/qib/qib_6120_regs.h new file mode 100644 index 000000000..e16cb6f7d --- /dev/null +++ b/kernel/drivers/infiniband/hw/qib/qib_6120_regs.h @@ -0,0 +1,977 @@ +/* + * Copyright (c) 2008, 2009, 2010 QLogic Corporation. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. 
+ * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +/* This file is mechanically generated from RTL. Any hand-edits will be lost! */ + +#define QIB_6120_Revision_OFFS 0x0 +#define QIB_6120_Revision_R_Simulator_LSB 0x3F +#define QIB_6120_Revision_R_Simulator_RMASK 0x1 +#define QIB_6120_Revision_Reserved_LSB 0x28 +#define QIB_6120_Revision_Reserved_RMASK 0x7FFFFF +#define QIB_6120_Revision_BoardID_LSB 0x20 +#define QIB_6120_Revision_BoardID_RMASK 0xFF +#define QIB_6120_Revision_R_SW_LSB 0x18 +#define QIB_6120_Revision_R_SW_RMASK 0xFF +#define QIB_6120_Revision_R_Arch_LSB 0x10 +#define QIB_6120_Revision_R_Arch_RMASK 0xFF +#define QIB_6120_Revision_R_ChipRevMajor_LSB 0x8 +#define QIB_6120_Revision_R_ChipRevMajor_RMASK 0xFF +#define QIB_6120_Revision_R_ChipRevMinor_LSB 0x0 +#define QIB_6120_Revision_R_ChipRevMinor_RMASK 0xFF + +#define QIB_6120_Control_OFFS 0x8 +#define QIB_6120_Control_TxLatency_LSB 0x4 +#define QIB_6120_Control_TxLatency_RMASK 0x1 +#define QIB_6120_Control_PCIERetryBufDiagEn_LSB 0x3 +#define QIB_6120_Control_PCIERetryBufDiagEn_RMASK 0x1 +#define QIB_6120_Control_LinkEn_LSB 0x2 +#define QIB_6120_Control_LinkEn_RMASK 0x1 +#define QIB_6120_Control_FreezeMode_LSB 0x1 +#define QIB_6120_Control_FreezeMode_RMASK 0x1 +#define QIB_6120_Control_SyncReset_LSB 0x0 +#define QIB_6120_Control_SyncReset_RMASK 0x1 + +#define QIB_6120_PageAlign_OFFS 0x10 + +#define QIB_6120_PortCnt_OFFS 0x18 + +#define QIB_6120_SendRegBase_OFFS 0x30 + +#define QIB_6120_UserRegBase_OFFS 0x38 + +#define QIB_6120_CntrRegBase_OFFS 0x40 + +#define QIB_6120_Scratch_OFFS 0x48 +#define QIB_6120_Scratch_TopHalf_LSB 0x20 +#define QIB_6120_Scratch_TopHalf_RMASK 0xFFFFFFFF +#define QIB_6120_Scratch_BottomHalf_LSB 0x0 +#define QIB_6120_Scratch_BottomHalf_RMASK 0xFFFFFFFF + +#define QIB_6120_IntBlocked_OFFS 0x60 +#define QIB_6120_IntBlocked_ErrorIntBlocked_LSB 0x1F +#define QIB_6120_IntBlocked_ErrorIntBlocked_RMASK 0x1 +#define QIB_6120_IntBlocked_PioSetIntBlocked_LSB 0x1E +#define QIB_6120_IntBlocked_PioSetIntBlocked_RMASK 0x1 +#define QIB_6120_IntBlocked_PioBufAvailIntBlocked_LSB 0x1D +#define QIB_6120_IntBlocked_PioBufAvailIntBlocked_RMASK 0x1 +#define QIB_6120_IntBlocked_assertGPIOIntBlocked_LSB 0x1C +#define QIB_6120_IntBlocked_assertGPIOIntBlocked_RMASK 0x1 +#define QIB_6120_IntBlocked_Reserved_LSB 0xF +#define QIB_6120_IntBlocked_Reserved_RMASK 0x1FFF +#define QIB_6120_IntBlocked_RcvAvail4IntBlocked_LSB 0x10 +#define QIB_6120_IntBlocked_RcvAvail4IntBlocked_RMASK 0x1 +#define QIB_6120_IntBlocked_RcvAvail3IntBlocked_LSB 0xF +#define QIB_6120_IntBlocked_RcvAvail3IntBlocked_RMASK 0x1 +#define QIB_6120_IntBlocked_RcvAvail2IntBlocked_LSB 0xE +#define QIB_6120_IntBlocked_RcvAvail2IntBlocked_RMASK 0x1 +#define QIB_6120_IntBlocked_RcvAvail1IntBlocked_LSB 0xD +#define QIB_6120_IntBlocked_RcvAvail1IntBlocked_RMASK 0x1 +#define 
QIB_6120_IntBlocked_RcvAvail0IntBlocked_LSB 0xC +#define QIB_6120_IntBlocked_RcvAvail0IntBlocked_RMASK 0x1 +#define QIB_6120_IntBlocked_Reserved1_LSB 0x5 +#define QIB_6120_IntBlocked_Reserved1_RMASK 0x7F +#define QIB_6120_IntBlocked_RcvUrg4IntBlocked_LSB 0x4 +#define QIB_6120_IntBlocked_RcvUrg4IntBlocked_RMASK 0x1 +#define QIB_6120_IntBlocked_RcvUrg3IntBlocked_LSB 0x3 +#define QIB_6120_IntBlocked_RcvUrg3IntBlocked_RMASK 0x1 +#define QIB_6120_IntBlocked_RcvUrg2IntBlocked_LSB 0x2 +#define QIB_6120_IntBlocked_RcvUrg2IntBlocked_RMASK 0x1 +#define QIB_6120_IntBlocked_RcvUrg1IntBlocked_LSB 0x1 +#define QIB_6120_IntBlocked_RcvUrg1IntBlocked_RMASK 0x1 +#define QIB_6120_IntBlocked_RcvUrg0IntBlocked_LSB 0x0 +#define QIB_6120_IntBlocked_RcvUrg0IntBlocked_RMASK 0x1 + +#define QIB_6120_IntMask_OFFS 0x68 +#define QIB_6120_IntMask_ErrorIntMask_LSB 0x1F +#define QIB_6120_IntMask_ErrorIntMask_RMASK 0x1 +#define QIB_6120_IntMask_PioSetIntMask_LSB 0x1E +#define QIB_6120_IntMask_PioSetIntMask_RMASK 0x1 +#define QIB_6120_IntMask_PioBufAvailIntMask_LSB 0x1D +#define QIB_6120_IntMask_PioBufAvailIntMask_RMASK 0x1 +#define QIB_6120_IntMask_assertGPIOIntMask_LSB 0x1C +#define QIB_6120_IntMask_assertGPIOIntMask_RMASK 0x1 +#define QIB_6120_IntMask_Reserved_LSB 0x11 +#define QIB_6120_IntMask_Reserved_RMASK 0x7FF +#define QIB_6120_IntMask_RcvAvail4IntMask_LSB 0x10 +#define QIB_6120_IntMask_RcvAvail4IntMask_RMASK 0x1 +#define QIB_6120_IntMask_RcvAvail3IntMask_LSB 0xF +#define QIB_6120_IntMask_RcvAvail3IntMask_RMASK 0x1 +#define QIB_6120_IntMask_RcvAvail2IntMask_LSB 0xE +#define QIB_6120_IntMask_RcvAvail2IntMask_RMASK 0x1 +#define QIB_6120_IntMask_RcvAvail1IntMask_LSB 0xD +#define QIB_6120_IntMask_RcvAvail1IntMask_RMASK 0x1 +#define QIB_6120_IntMask_RcvAvail0IntMask_LSB 0xC +#define QIB_6120_IntMask_RcvAvail0IntMask_RMASK 0x1 +#define QIB_6120_IntMask_Reserved1_LSB 0x5 +#define QIB_6120_IntMask_Reserved1_RMASK 0x7F +#define QIB_6120_IntMask_RcvUrg4IntMask_LSB 0x4 +#define QIB_6120_IntMask_RcvUrg4IntMask_RMASK 0x1 +#define QIB_6120_IntMask_RcvUrg3IntMask_LSB 0x3 +#define QIB_6120_IntMask_RcvUrg3IntMask_RMASK 0x1 +#define QIB_6120_IntMask_RcvUrg2IntMask_LSB 0x2 +#define QIB_6120_IntMask_RcvUrg2IntMask_RMASK 0x1 +#define QIB_6120_IntMask_RcvUrg1IntMask_LSB 0x1 +#define QIB_6120_IntMask_RcvUrg1IntMask_RMASK 0x1 +#define QIB_6120_IntMask_RcvUrg0IntMask_LSB 0x0 +#define QIB_6120_IntMask_RcvUrg0IntMask_RMASK 0x1 + +#define QIB_6120_IntStatus_OFFS 0x70 +#define QIB_6120_IntStatus_Error_LSB 0x1F +#define QIB_6120_IntStatus_Error_RMASK 0x1 +#define QIB_6120_IntStatus_PioSent_LSB 0x1E +#define QIB_6120_IntStatus_PioSent_RMASK 0x1 +#define QIB_6120_IntStatus_PioBufAvail_LSB 0x1D +#define QIB_6120_IntStatus_PioBufAvail_RMASK 0x1 +#define QIB_6120_IntStatus_assertGPIO_LSB 0x1C +#define QIB_6120_IntStatus_assertGPIO_RMASK 0x1 +#define QIB_6120_IntStatus_Reserved_LSB 0xF +#define QIB_6120_IntStatus_Reserved_RMASK 0x1FFF +#define QIB_6120_IntStatus_RcvAvail4_LSB 0x10 +#define QIB_6120_IntStatus_RcvAvail4_RMASK 0x1 +#define QIB_6120_IntStatus_RcvAvail3_LSB 0xF +#define QIB_6120_IntStatus_RcvAvail3_RMASK 0x1 +#define QIB_6120_IntStatus_RcvAvail2_LSB 0xE +#define QIB_6120_IntStatus_RcvAvail2_RMASK 0x1 +#define QIB_6120_IntStatus_RcvAvail1_LSB 0xD +#define QIB_6120_IntStatus_RcvAvail1_RMASK 0x1 +#define QIB_6120_IntStatus_RcvAvail0_LSB 0xC +#define QIB_6120_IntStatus_RcvAvail0_RMASK 0x1 +#define QIB_6120_IntStatus_Reserved1_LSB 0x5 +#define QIB_6120_IntStatus_Reserved1_RMASK 0x7F +#define QIB_6120_IntStatus_RcvUrg4_LSB 0x4 +#define 
QIB_6120_IntStatus_RcvUrg4_RMASK 0x1 +#define QIB_6120_IntStatus_RcvUrg3_LSB 0x3 +#define QIB_6120_IntStatus_RcvUrg3_RMASK 0x1 +#define QIB_6120_IntStatus_RcvUrg2_LSB 0x2 +#define QIB_6120_IntStatus_RcvUrg2_RMASK 0x1 +#define QIB_6120_IntStatus_RcvUrg1_LSB 0x1 +#define QIB_6120_IntStatus_RcvUrg1_RMASK 0x1 +#define QIB_6120_IntStatus_RcvUrg0_LSB 0x0 +#define QIB_6120_IntStatus_RcvUrg0_RMASK 0x1 + +#define QIB_6120_IntClear_OFFS 0x78 +#define QIB_6120_IntClear_ErrorIntClear_LSB 0x1F +#define QIB_6120_IntClear_ErrorIntClear_RMASK 0x1 +#define QIB_6120_IntClear_PioSetIntClear_LSB 0x1E +#define QIB_6120_IntClear_PioSetIntClear_RMASK 0x1 +#define QIB_6120_IntClear_PioBufAvailIntClear_LSB 0x1D +#define QIB_6120_IntClear_PioBufAvailIntClear_RMASK 0x1 +#define QIB_6120_IntClear_assertGPIOIntClear_LSB 0x1C +#define QIB_6120_IntClear_assertGPIOIntClear_RMASK 0x1 +#define QIB_6120_IntClear_Reserved_LSB 0xF +#define QIB_6120_IntClear_Reserved_RMASK 0x1FFF +#define QIB_6120_IntClear_RcvAvail4IntClear_LSB 0x10 +#define QIB_6120_IntClear_RcvAvail4IntClear_RMASK 0x1 +#define QIB_6120_IntClear_RcvAvail3IntClear_LSB 0xF +#define QIB_6120_IntClear_RcvAvail3IntClear_RMASK 0x1 +#define QIB_6120_IntClear_RcvAvail2IntClear_LSB 0xE +#define QIB_6120_IntClear_RcvAvail2IntClear_RMASK 0x1 +#define QIB_6120_IntClear_RcvAvail1IntClear_LSB 0xD +#define QIB_6120_IntClear_RcvAvail1IntClear_RMASK 0x1 +#define QIB_6120_IntClear_RcvAvail0IntClear_LSB 0xC +#define QIB_6120_IntClear_RcvAvail0IntClear_RMASK 0x1 +#define QIB_6120_IntClear_Reserved1_LSB 0x5 +#define QIB_6120_IntClear_Reserved1_RMASK 0x7F +#define QIB_6120_IntClear_RcvUrg4IntClear_LSB 0x4 +#define QIB_6120_IntClear_RcvUrg4IntClear_RMASK 0x1 +#define QIB_6120_IntClear_RcvUrg3IntClear_LSB 0x3 +#define QIB_6120_IntClear_RcvUrg3IntClear_RMASK 0x1 +#define QIB_6120_IntClear_RcvUrg2IntClear_LSB 0x2 +#define QIB_6120_IntClear_RcvUrg2IntClear_RMASK 0x1 +#define QIB_6120_IntClear_RcvUrg1IntClear_LSB 0x1 +#define QIB_6120_IntClear_RcvUrg1IntClear_RMASK 0x1 +#define QIB_6120_IntClear_RcvUrg0IntClear_LSB 0x0 +#define QIB_6120_IntClear_RcvUrg0IntClear_RMASK 0x1 + +#define QIB_6120_ErrMask_OFFS 0x80 +#define QIB_6120_ErrMask_Reserved_LSB 0x34 +#define QIB_6120_ErrMask_Reserved_RMASK 0xFFF +#define QIB_6120_ErrMask_HardwareErrMask_LSB 0x33 +#define QIB_6120_ErrMask_HardwareErrMask_RMASK 0x1 +#define QIB_6120_ErrMask_ResetNegatedMask_LSB 0x32 +#define QIB_6120_ErrMask_ResetNegatedMask_RMASK 0x1 +#define QIB_6120_ErrMask_InvalidAddrErrMask_LSB 0x31 +#define QIB_6120_ErrMask_InvalidAddrErrMask_RMASK 0x1 +#define QIB_6120_ErrMask_IBStatusChangedMask_LSB 0x30 +#define QIB_6120_ErrMask_IBStatusChangedMask_RMASK 0x1 +#define QIB_6120_ErrMask_Reserved1_LSB 0x26 +#define QIB_6120_ErrMask_Reserved1_RMASK 0x3FF +#define QIB_6120_ErrMask_SendUnsupportedVLErrMask_LSB 0x25 +#define QIB_6120_ErrMask_SendUnsupportedVLErrMask_RMASK 0x1 +#define QIB_6120_ErrMask_SendUnexpectedPktNumErrMask_LSB 0x24 +#define QIB_6120_ErrMask_SendUnexpectedPktNumErrMask_RMASK 0x1 +#define QIB_6120_ErrMask_SendPioArmLaunchErrMask_LSB 0x23 +#define QIB_6120_ErrMask_SendPioArmLaunchErrMask_RMASK 0x1 +#define QIB_6120_ErrMask_SendDroppedDataPktErrMask_LSB 0x22 +#define QIB_6120_ErrMask_SendDroppedDataPktErrMask_RMASK 0x1 +#define QIB_6120_ErrMask_SendDroppedSmpPktErrMask_LSB 0x21 +#define QIB_6120_ErrMask_SendDroppedSmpPktErrMask_RMASK 0x1 +#define QIB_6120_ErrMask_SendPktLenErrMask_LSB 0x20 +#define QIB_6120_ErrMask_SendPktLenErrMask_RMASK 0x1 +#define QIB_6120_ErrMask_SendUnderRunErrMask_LSB 0x1F +#define 
QIB_6120_ErrMask_SendUnderRunErrMask_RMASK 0x1 +#define QIB_6120_ErrMask_SendMaxPktLenErrMask_LSB 0x1E +#define QIB_6120_ErrMask_SendMaxPktLenErrMask_RMASK 0x1 +#define QIB_6120_ErrMask_SendMinPktLenErrMask_LSB 0x1D +#define QIB_6120_ErrMask_SendMinPktLenErrMask_RMASK 0x1 +#define QIB_6120_ErrMask_Reserved2_LSB 0x12 +#define QIB_6120_ErrMask_Reserved2_RMASK 0x7FF +#define QIB_6120_ErrMask_RcvIBLostLinkErrMask_LSB 0x11 +#define QIB_6120_ErrMask_RcvIBLostLinkErrMask_RMASK 0x1 +#define QIB_6120_ErrMask_RcvHdrErrMask_LSB 0x10 +#define QIB_6120_ErrMask_RcvHdrErrMask_RMASK 0x1 +#define QIB_6120_ErrMask_RcvHdrLenErrMask_LSB 0xF +#define QIB_6120_ErrMask_RcvHdrLenErrMask_RMASK 0x1 +#define QIB_6120_ErrMask_RcvBadTidErrMask_LSB 0xE +#define QIB_6120_ErrMask_RcvBadTidErrMask_RMASK 0x1 +#define QIB_6120_ErrMask_RcvHdrFullErrMask_LSB 0xD +#define QIB_6120_ErrMask_RcvHdrFullErrMask_RMASK 0x1 +#define QIB_6120_ErrMask_RcvEgrFullErrMask_LSB 0xC +#define QIB_6120_ErrMask_RcvEgrFullErrMask_RMASK 0x1 +#define QIB_6120_ErrMask_RcvBadVersionErrMask_LSB 0xB +#define QIB_6120_ErrMask_RcvBadVersionErrMask_RMASK 0x1 +#define QIB_6120_ErrMask_RcvIBFlowErrMask_LSB 0xA +#define QIB_6120_ErrMask_RcvIBFlowErrMask_RMASK 0x1 +#define QIB_6120_ErrMask_RcvEBPErrMask_LSB 0x9 +#define QIB_6120_ErrMask_RcvEBPErrMask_RMASK 0x1 +#define QIB_6120_ErrMask_RcvUnsupportedVLErrMask_LSB 0x8 +#define QIB_6120_ErrMask_RcvUnsupportedVLErrMask_RMASK 0x1 +#define QIB_6120_ErrMask_RcvUnexpectedCharErrMask_LSB 0x7 +#define QIB_6120_ErrMask_RcvUnexpectedCharErrMask_RMASK 0x1 +#define QIB_6120_ErrMask_RcvShortPktLenErrMask_LSB 0x6 +#define QIB_6120_ErrMask_RcvShortPktLenErrMask_RMASK 0x1 +#define QIB_6120_ErrMask_RcvLongPktLenErrMask_LSB 0x5 +#define QIB_6120_ErrMask_RcvLongPktLenErrMask_RMASK 0x1 +#define QIB_6120_ErrMask_RcvMaxPktLenErrMask_LSB 0x4 +#define QIB_6120_ErrMask_RcvMaxPktLenErrMask_RMASK 0x1 +#define QIB_6120_ErrMask_RcvMinPktLenErrMask_LSB 0x3 +#define QIB_6120_ErrMask_RcvMinPktLenErrMask_RMASK 0x1 +#define QIB_6120_ErrMask_RcvICRCErrMask_LSB 0x2 +#define QIB_6120_ErrMask_RcvICRCErrMask_RMASK 0x1 +#define QIB_6120_ErrMask_RcvVCRCErrMask_LSB 0x1 +#define QIB_6120_ErrMask_RcvVCRCErrMask_RMASK 0x1 +#define QIB_6120_ErrMask_RcvFormatErrMask_LSB 0x0 +#define QIB_6120_ErrMask_RcvFormatErrMask_RMASK 0x1 + +#define QIB_6120_ErrStatus_OFFS 0x88 +#define QIB_6120_ErrStatus_Reserved_LSB 0x34 +#define QIB_6120_ErrStatus_Reserved_RMASK 0xFFF +#define QIB_6120_ErrStatus_HardwareErr_LSB 0x33 +#define QIB_6120_ErrStatus_HardwareErr_RMASK 0x1 +#define QIB_6120_ErrStatus_ResetNegated_LSB 0x32 +#define QIB_6120_ErrStatus_ResetNegated_RMASK 0x1 +#define QIB_6120_ErrStatus_InvalidAddrErr_LSB 0x31 +#define QIB_6120_ErrStatus_InvalidAddrErr_RMASK 0x1 +#define QIB_6120_ErrStatus_IBStatusChanged_LSB 0x30 +#define QIB_6120_ErrStatus_IBStatusChanged_RMASK 0x1 +#define QIB_6120_ErrStatus_Reserved1_LSB 0x26 +#define QIB_6120_ErrStatus_Reserved1_RMASK 0x3FF +#define QIB_6120_ErrStatus_SendUnsupportedVLErr_LSB 0x25 +#define QIB_6120_ErrStatus_SendUnsupportedVLErr_RMASK 0x1 +#define QIB_6120_ErrStatus_SendUnexpectedPktNumErr_LSB 0x24 +#define QIB_6120_ErrStatus_SendUnexpectedPktNumErr_RMASK 0x1 +#define QIB_6120_ErrStatus_SendPioArmLaunchErr_LSB 0x23 +#define QIB_6120_ErrStatus_SendPioArmLaunchErr_RMASK 0x1 +#define QIB_6120_ErrStatus_SendDroppedDataPktErr_LSB 0x22 +#define QIB_6120_ErrStatus_SendDroppedDataPktErr_RMASK 0x1 +#define QIB_6120_ErrStatus_SendDroppedSmpPktErr_LSB 0x21 +#define QIB_6120_ErrStatus_SendDroppedSmpPktErr_RMASK 0x1 +#define 
QIB_6120_ErrStatus_SendPktLenErr_LSB 0x20 +#define QIB_6120_ErrStatus_SendPktLenErr_RMASK 0x1 +#define QIB_6120_ErrStatus_SendUnderRunErr_LSB 0x1F +#define QIB_6120_ErrStatus_SendUnderRunErr_RMASK 0x1 +#define QIB_6120_ErrStatus_SendMaxPktLenErr_LSB 0x1E +#define QIB_6120_ErrStatus_SendMaxPktLenErr_RMASK 0x1 +#define QIB_6120_ErrStatus_SendMinPktLenErr_LSB 0x1D +#define QIB_6120_ErrStatus_SendMinPktLenErr_RMASK 0x1 +#define QIB_6120_ErrStatus_Reserved2_LSB 0x12 +#define QIB_6120_ErrStatus_Reserved2_RMASK 0x7FF +#define QIB_6120_ErrStatus_RcvIBLostLinkErr_LSB 0x11 +#define QIB_6120_ErrStatus_RcvIBLostLinkErr_RMASK 0x1 +#define QIB_6120_ErrStatus_RcvHdrErr_LSB 0x10 +#define QIB_6120_ErrStatus_RcvHdrErr_RMASK 0x1 +#define QIB_6120_ErrStatus_RcvHdrLenErr_LSB 0xF +#define QIB_6120_ErrStatus_RcvHdrLenErr_RMASK 0x1 +#define QIB_6120_ErrStatus_RcvBadTidErr_LSB 0xE +#define QIB_6120_ErrStatus_RcvBadTidErr_RMASK 0x1 +#define QIB_6120_ErrStatus_RcvHdrFullErr_LSB 0xD +#define QIB_6120_ErrStatus_RcvHdrFullErr_RMASK 0x1 +#define QIB_6120_ErrStatus_RcvEgrFullErr_LSB 0xC +#define QIB_6120_ErrStatus_RcvEgrFullErr_RMASK 0x1 +#define QIB_6120_ErrStatus_RcvBadVersionErr_LSB 0xB +#define QIB_6120_ErrStatus_RcvBadVersionErr_RMASK 0x1 +#define QIB_6120_ErrStatus_RcvIBFlowErr_LSB 0xA +#define QIB_6120_ErrStatus_RcvIBFlowErr_RMASK 0x1 +#define QIB_6120_ErrStatus_RcvEBPErr_LSB 0x9 +#define QIB_6120_ErrStatus_RcvEBPErr_RMASK 0x1 +#define QIB_6120_ErrStatus_RcvUnsupportedVLErr_LSB 0x8 +#define QIB_6120_ErrStatus_RcvUnsupportedVLErr_RMASK 0x1 +#define QIB_6120_ErrStatus_RcvUnexpectedCharErr_LSB 0x7 +#define QIB_6120_ErrStatus_RcvUnexpectedCharErr_RMASK 0x1 +#define QIB_6120_ErrStatus_RcvShortPktLenErr_LSB 0x6 +#define QIB_6120_ErrStatus_RcvShortPktLenErr_RMASK 0x1 +#define QIB_6120_ErrStatus_RcvLongPktLenErr_LSB 0x5 +#define QIB_6120_ErrStatus_RcvLongPktLenErr_RMASK 0x1 +#define QIB_6120_ErrStatus_RcvMaxPktLenErr_LSB 0x4 +#define QIB_6120_ErrStatus_RcvMaxPktLenErr_RMASK 0x1 +#define QIB_6120_ErrStatus_RcvMinPktLenErr_LSB 0x3 +#define QIB_6120_ErrStatus_RcvMinPktLenErr_RMASK 0x1 +#define QIB_6120_ErrStatus_RcvICRCErr_LSB 0x2 +#define QIB_6120_ErrStatus_RcvICRCErr_RMASK 0x1 +#define QIB_6120_ErrStatus_RcvVCRCErr_LSB 0x1 +#define QIB_6120_ErrStatus_RcvVCRCErr_RMASK 0x1 +#define QIB_6120_ErrStatus_RcvFormatErr_LSB 0x0 +#define QIB_6120_ErrStatus_RcvFormatErr_RMASK 0x1 + +#define QIB_6120_ErrClear_OFFS 0x90 +#define QIB_6120_ErrClear_Reserved_LSB 0x34 +#define QIB_6120_ErrClear_Reserved_RMASK 0xFFF +#define QIB_6120_ErrClear_HardwareErrClear_LSB 0x33 +#define QIB_6120_ErrClear_HardwareErrClear_RMASK 0x1 +#define QIB_6120_ErrClear_ResetNegatedClear_LSB 0x32 +#define QIB_6120_ErrClear_ResetNegatedClear_RMASK 0x1 +#define QIB_6120_ErrClear_InvalidAddrErrClear_LSB 0x31 +#define QIB_6120_ErrClear_InvalidAddrErrClear_RMASK 0x1 +#define QIB_6120_ErrClear_IBStatusChangedClear_LSB 0x30 +#define QIB_6120_ErrClear_IBStatusChangedClear_RMASK 0x1 +#define QIB_6120_ErrClear_Reserved1_LSB 0x26 +#define QIB_6120_ErrClear_Reserved1_RMASK 0x3FF +#define QIB_6120_ErrClear_SendUnsupportedVLErrClear_LSB 0x25 +#define QIB_6120_ErrClear_SendUnsupportedVLErrClear_RMASK 0x1 +#define QIB_6120_ErrClear_SendUnexpectedPktNumErrClear_LSB 0x24 +#define QIB_6120_ErrClear_SendUnexpectedPktNumErrClear_RMASK 0x1 +#define QIB_6120_ErrClear_SendPioArmLaunchErrClear_LSB 0x23 +#define QIB_6120_ErrClear_SendPioArmLaunchErrClear_RMASK 0x1 +#define QIB_6120_ErrClear_SendDroppedDataPktErrClear_LSB 0x22 +#define QIB_6120_ErrClear_SendDroppedDataPktErrClear_RMASK 0x1 
+#define QIB_6120_ErrClear_SendDroppedSmpPktErrClear_LSB 0x21 +#define QIB_6120_ErrClear_SendDroppedSmpPktErrClear_RMASK 0x1 +#define QIB_6120_ErrClear_SendPktLenErrClear_LSB 0x20 +#define QIB_6120_ErrClear_SendPktLenErrClear_RMASK 0x1 +#define QIB_6120_ErrClear_SendUnderRunErrClear_LSB 0x1F +#define QIB_6120_ErrClear_SendUnderRunErrClear_RMASK 0x1 +#define QIB_6120_ErrClear_SendMaxPktLenErrClear_LSB 0x1E +#define QIB_6120_ErrClear_SendMaxPktLenErrClear_RMASK 0x1 +#define QIB_6120_ErrClear_SendMinPktLenErrClear_LSB 0x1D +#define QIB_6120_ErrClear_SendMinPktLenErrClear_RMASK 0x1 +#define QIB_6120_ErrClear_Reserved2_LSB 0x12 +#define QIB_6120_ErrClear_Reserved2_RMASK 0x7FF +#define QIB_6120_ErrClear_RcvIBLostLinkErrClear_LSB 0x11 +#define QIB_6120_ErrClear_RcvIBLostLinkErrClear_RMASK 0x1 +#define QIB_6120_ErrClear_RcvHdrErrClear_LSB 0x10 +#define QIB_6120_ErrClear_RcvHdrErrClear_RMASK 0x1 +#define QIB_6120_ErrClear_RcvHdrLenErrClear_LSB 0xF +#define QIB_6120_ErrClear_RcvHdrLenErrClear_RMASK 0x1 +#define QIB_6120_ErrClear_RcvBadTidErrClear_LSB 0xE +#define QIB_6120_ErrClear_RcvBadTidErrClear_RMASK 0x1 +#define QIB_6120_ErrClear_RcvHdrFullErrClear_LSB 0xD +#define QIB_6120_ErrClear_RcvHdrFullErrClear_RMASK 0x1 +#define QIB_6120_ErrClear_RcvEgrFullErrClear_LSB 0xC +#define QIB_6120_ErrClear_RcvEgrFullErrClear_RMASK 0x1 +#define QIB_6120_ErrClear_RcvBadVersionErrClear_LSB 0xB +#define QIB_6120_ErrClear_RcvBadVersionErrClear_RMASK 0x1 +#define QIB_6120_ErrClear_RcvIBFlowErrClear_LSB 0xA +#define QIB_6120_ErrClear_RcvIBFlowErrClear_RMASK 0x1 +#define QIB_6120_ErrClear_RcvEBPErrClear_LSB 0x9 +#define QIB_6120_ErrClear_RcvEBPErrClear_RMASK 0x1 +#define QIB_6120_ErrClear_RcvUnsupportedVLErrClear_LSB 0x8 +#define QIB_6120_ErrClear_RcvUnsupportedVLErrClear_RMASK 0x1 +#define QIB_6120_ErrClear_RcvUnexpectedCharErrClear_LSB 0x7 +#define QIB_6120_ErrClear_RcvUnexpectedCharErrClear_RMASK 0x1 +#define QIB_6120_ErrClear_RcvShortPktLenErrClear_LSB 0x6 +#define QIB_6120_ErrClear_RcvShortPktLenErrClear_RMASK 0x1 +#define QIB_6120_ErrClear_RcvLongPktLenErrClear_LSB 0x5 +#define QIB_6120_ErrClear_RcvLongPktLenErrClear_RMASK 0x1 +#define QIB_6120_ErrClear_RcvMaxPktLenErrClear_LSB 0x4 +#define QIB_6120_ErrClear_RcvMaxPktLenErrClear_RMASK 0x1 +#define QIB_6120_ErrClear_RcvMinPktLenErrClear_LSB 0x3 +#define QIB_6120_ErrClear_RcvMinPktLenErrClear_RMASK 0x1 +#define QIB_6120_ErrClear_RcvICRCErrClear_LSB 0x2 +#define QIB_6120_ErrClear_RcvICRCErrClear_RMASK 0x1 +#define QIB_6120_ErrClear_RcvVCRCErrClear_LSB 0x1 +#define QIB_6120_ErrClear_RcvVCRCErrClear_RMASK 0x1 +#define QIB_6120_ErrClear_RcvFormatErrClear_LSB 0x0 +#define QIB_6120_ErrClear_RcvFormatErrClear_RMASK 0x1 + +#define QIB_6120_HwErrMask_OFFS 0x98 +#define QIB_6120_HwErrMask_IBCBusFromSPCParityErrMask_LSB 0x3F +#define QIB_6120_HwErrMask_IBCBusFromSPCParityErrMask_RMASK 0x1 +#define QIB_6120_HwErrMask_IBCBusToSPCParityErrMask_LSB 0x3E +#define QIB_6120_HwErrMask_IBCBusToSPCParityErrMask_RMASK 0x1 +#define QIB_6120_HwErrMask_Reserved_LSB 0x3D +#define QIB_6120_HwErrMask_Reserved_RMASK 0x1 +#define QIB_6120_HwErrMask_IBSerdesPClkNotDetectMask_LSB 0x3C +#define QIB_6120_HwErrMask_IBSerdesPClkNotDetectMask_RMASK 0x1 +#define QIB_6120_HwErrMask_PCIESerdesQ0PClkNotDetectMask_LSB 0x3B +#define QIB_6120_HwErrMask_PCIESerdesQ0PClkNotDetectMask_RMASK 0x1 +#define QIB_6120_HwErrMask_PCIESerdesQ1PClkNotDetectMask_LSB 0x3A +#define QIB_6120_HwErrMask_PCIESerdesQ1PClkNotDetectMask_RMASK 0x1 +#define QIB_6120_HwErrMask_Reserved1_LSB 0x39 +#define 
QIB_6120_HwErrMask_Reserved1_RMASK 0x1 +#define QIB_6120_HwErrMask_IBPLLrfSlipMask_LSB 0x38 +#define QIB_6120_HwErrMask_IBPLLrfSlipMask_RMASK 0x1 +#define QIB_6120_HwErrMask_IBPLLfbSlipMask_LSB 0x37 +#define QIB_6120_HwErrMask_IBPLLfbSlipMask_RMASK 0x1 +#define QIB_6120_HwErrMask_PowerOnBISTFailedMask_LSB 0x36 +#define QIB_6120_HwErrMask_PowerOnBISTFailedMask_RMASK 0x1 +#define QIB_6120_HwErrMask_Reserved2_LSB 0x33 +#define QIB_6120_HwErrMask_Reserved2_RMASK 0x7 +#define QIB_6120_HwErrMask_RXEMemParityErrMask_LSB 0x2C +#define QIB_6120_HwErrMask_RXEMemParityErrMask_RMASK 0x7F +#define QIB_6120_HwErrMask_TXEMemParityErrMask_LSB 0x28 +#define QIB_6120_HwErrMask_TXEMemParityErrMask_RMASK 0xF +#define QIB_6120_HwErrMask_Reserved3_LSB 0x22 +#define QIB_6120_HwErrMask_Reserved3_RMASK 0x3F +#define QIB_6120_HwErrMask_PCIeBusParityErrMask_LSB 0x1F +#define QIB_6120_HwErrMask_PCIeBusParityErrMask_RMASK 0x7 +#define QIB_6120_HwErrMask_PcieCplTimeoutMask_LSB 0x1E +#define QIB_6120_HwErrMask_PcieCplTimeoutMask_RMASK 0x1 +#define QIB_6120_HwErrMask_PoisonedTLPMask_LSB 0x1D +#define QIB_6120_HwErrMask_PoisonedTLPMask_RMASK 0x1 +#define QIB_6120_HwErrMask_Reserved4_LSB 0x6 +#define QIB_6120_HwErrMask_Reserved4_RMASK 0x7FFFFF +#define QIB_6120_HwErrMask_PCIeMemParityErrMask_LSB 0x0 +#define QIB_6120_HwErrMask_PCIeMemParityErrMask_RMASK 0x3F + +#define QIB_6120_HwErrStatus_OFFS 0xA0 +#define QIB_6120_HwErrStatus_IBCBusFromSPCParityErr_LSB 0x3F +#define QIB_6120_HwErrStatus_IBCBusFromSPCParityErr_RMASK 0x1 +#define QIB_6120_HwErrStatus_IBCBusToSPCParityErr_LSB 0x3E +#define QIB_6120_HwErrStatus_IBCBusToSPCParityErr_RMASK 0x1 +#define QIB_6120_HwErrStatus_Reserved_LSB 0x3D +#define QIB_6120_HwErrStatus_Reserved_RMASK 0x1 +#define QIB_6120_HwErrStatus_IBSerdesPClkNotDetect_LSB 0x3C +#define QIB_6120_HwErrStatus_IBSerdesPClkNotDetect_RMASK 0x1 +#define QIB_6120_HwErrStatus_PCIESerdesQ0PClkNotDetect_LSB 0x3B +#define QIB_6120_HwErrStatus_PCIESerdesQ0PClkNotDetect_RMASK 0x1 +#define QIB_6120_HwErrStatus_PCIESerdesQ1PClkNotDetect_LSB 0x3A +#define QIB_6120_HwErrStatus_PCIESerdesQ1PClkNotDetect_RMASK 0x1 +#define QIB_6120_HwErrStatus_Reserved1_LSB 0x39 +#define QIB_6120_HwErrStatus_Reserved1_RMASK 0x1 +#define QIB_6120_HwErrStatus_IBPLLrfSlip_LSB 0x38 +#define QIB_6120_HwErrStatus_IBPLLrfSlip_RMASK 0x1 +#define QIB_6120_HwErrStatus_IBPLLfbSlip_LSB 0x37 +#define QIB_6120_HwErrStatus_IBPLLfbSlip_RMASK 0x1 +#define QIB_6120_HwErrStatus_PowerOnBISTFailed_LSB 0x36 +#define QIB_6120_HwErrStatus_PowerOnBISTFailed_RMASK 0x1 +#define QIB_6120_HwErrStatus_Reserved2_LSB 0x33 +#define QIB_6120_HwErrStatus_Reserved2_RMASK 0x7 +#define QIB_6120_HwErrStatus_RXEMemParity_LSB 0x2C +#define QIB_6120_HwErrStatus_RXEMemParity_RMASK 0x7F +#define QIB_6120_HwErrStatus_TXEMemParity_LSB 0x28 +#define QIB_6120_HwErrStatus_TXEMemParity_RMASK 0xF +#define QIB_6120_HwErrStatus_Reserved3_LSB 0x22 +#define QIB_6120_HwErrStatus_Reserved3_RMASK 0x3F +#define QIB_6120_HwErrStatus_PCIeBusParity_LSB 0x1F +#define QIB_6120_HwErrStatus_PCIeBusParity_RMASK 0x7 +#define QIB_6120_HwErrStatus_PcieCplTimeout_LSB 0x1E +#define QIB_6120_HwErrStatus_PcieCplTimeout_RMASK 0x1 +#define QIB_6120_HwErrStatus_PoisenedTLP_LSB 0x1D +#define QIB_6120_HwErrStatus_PoisenedTLP_RMASK 0x1 +#define QIB_6120_HwErrStatus_Reserved4_LSB 0x6 +#define QIB_6120_HwErrStatus_Reserved4_RMASK 0x7FFFFF +#define QIB_6120_HwErrStatus_PCIeMemParity_LSB 0x0 +#define QIB_6120_HwErrStatus_PCIeMemParity_RMASK 0x3F + +#define QIB_6120_HwErrClear_OFFS 0xA8 +#define 
QIB_6120_HwErrClear_IBCBusFromSPCParityErrClear_LSB 0x3F +#define QIB_6120_HwErrClear_IBCBusFromSPCParityErrClear_RMASK 0x1 +#define QIB_6120_HwErrClear_IBCBusToSPCparityErrClear_LSB 0x3E +#define QIB_6120_HwErrClear_IBCBusToSPCparityErrClear_RMASK 0x1 +#define QIB_6120_HwErrClear_Reserved_LSB 0x3D +#define QIB_6120_HwErrClear_Reserved_RMASK 0x1 +#define QIB_6120_HwErrClear_IBSerdesPClkNotDetectClear_LSB 0x3C +#define QIB_6120_HwErrClear_IBSerdesPClkNotDetectClear_RMASK 0x1 +#define QIB_6120_HwErrClear_PCIESerdesQ0PClkNotDetectClear_LSB 0x3B +#define QIB_6120_HwErrClear_PCIESerdesQ0PClkNotDetectClear_RMASK 0x1 +#define QIB_6120_HwErrClear_PCIESerdesQ1PClkNotDetectClear_LSB 0x3A +#define QIB_6120_HwErrClear_PCIESerdesQ1PClkNotDetectClear_RMASK 0x1 +#define QIB_6120_HwErrClear_Reserved1_LSB 0x39 +#define QIB_6120_HwErrClear_Reserved1_RMASK 0x1 +#define QIB_6120_HwErrClear_IBPLLrfSlipClear_LSB 0x38 +#define QIB_6120_HwErrClear_IBPLLrfSlipClear_RMASK 0x1 +#define QIB_6120_HwErrClear_IBPLLfbSlipClear_LSB 0x37 +#define QIB_6120_HwErrClear_IBPLLfbSlipClear_RMASK 0x1 +#define QIB_6120_HwErrClear_PowerOnBISTFailedClear_LSB 0x36 +#define QIB_6120_HwErrClear_PowerOnBISTFailedClear_RMASK 0x1 +#define QIB_6120_HwErrClear_Reserved2_LSB 0x33 +#define QIB_6120_HwErrClear_Reserved2_RMASK 0x7 +#define QIB_6120_HwErrClear_RXEMemParityClear_LSB 0x2C +#define QIB_6120_HwErrClear_RXEMemParityClear_RMASK 0x7F +#define QIB_6120_HwErrClear_TXEMemParityClear_LSB 0x28 +#define QIB_6120_HwErrClear_TXEMemParityClear_RMASK 0xF +#define QIB_6120_HwErrClear_Reserved3_LSB 0x22 +#define QIB_6120_HwErrClear_Reserved3_RMASK 0x3F +#define QIB_6120_HwErrClear_PCIeBusParityClr_LSB 0x1F +#define QIB_6120_HwErrClear_PCIeBusParityClr_RMASK 0x7 +#define QIB_6120_HwErrClear_PcieCplTimeoutClear_LSB 0x1E +#define QIB_6120_HwErrClear_PcieCplTimeoutClear_RMASK 0x1 +#define QIB_6120_HwErrClear_PoisonedTLPClear_LSB 0x1D +#define QIB_6120_HwErrClear_PoisonedTLPClear_RMASK 0x1 +#define QIB_6120_HwErrClear_Reserved4_LSB 0x6 +#define QIB_6120_HwErrClear_Reserved4_RMASK 0x7FFFFF +#define QIB_6120_HwErrClear_PCIeMemParityClr_LSB 0x0 +#define QIB_6120_HwErrClear_PCIeMemParityClr_RMASK 0x3F + +#define QIB_6120_HwDiagCtrl_OFFS 0xB0 +#define QIB_6120_HwDiagCtrl_ForceIBCBusFromSPCParityErr_LSB 0x3F +#define QIB_6120_HwDiagCtrl_ForceIBCBusFromSPCParityErr_RMASK 0x1 +#define QIB_6120_HwDiagCtrl_ForceIBCBusToSPCParityErr_LSB 0x3E +#define QIB_6120_HwDiagCtrl_ForceIBCBusToSPCParityErr_RMASK 0x1 +#define QIB_6120_HwDiagCtrl_CounterWrEnable_LSB 0x3D +#define QIB_6120_HwDiagCtrl_CounterWrEnable_RMASK 0x1 +#define QIB_6120_HwDiagCtrl_CounterDisable_LSB 0x3C +#define QIB_6120_HwDiagCtrl_CounterDisable_RMASK 0x1 +#define QIB_6120_HwDiagCtrl_Reserved_LSB 0x33 +#define QIB_6120_HwDiagCtrl_Reserved_RMASK 0x1FF +#define QIB_6120_HwDiagCtrl_ForceRxMemParityErr_LSB 0x2C +#define QIB_6120_HwDiagCtrl_ForceRxMemParityErr_RMASK 0x7F +#define QIB_6120_HwDiagCtrl_ForceTxMemparityErr_LSB 0x28 +#define QIB_6120_HwDiagCtrl_ForceTxMemparityErr_RMASK 0xF +#define QIB_6120_HwDiagCtrl_Reserved1_LSB 0x23 +#define QIB_6120_HwDiagCtrl_Reserved1_RMASK 0x1F +#define QIB_6120_HwDiagCtrl_forcePCIeBusParity_LSB 0x1F +#define QIB_6120_HwDiagCtrl_forcePCIeBusParity_RMASK 0xF +#define QIB_6120_HwDiagCtrl_Reserved2_LSB 0x6 +#define QIB_6120_HwDiagCtrl_Reserved2_RMASK 0x1FFFFFF +#define QIB_6120_HwDiagCtrl_forcePCIeMemParity_LSB 0x0 +#define QIB_6120_HwDiagCtrl_forcePCIeMemParity_RMASK 0x3F + +#define QIB_6120_IBCStatus_OFFS 0xC0 +#define QIB_6120_IBCStatus_TxCreditOk_LSB 0x1F +#define 
QIB_6120_IBCStatus_TxCreditOk_RMASK 0x1 +#define QIB_6120_IBCStatus_TxReady_LSB 0x1E +#define QIB_6120_IBCStatus_TxReady_RMASK 0x1 +#define QIB_6120_IBCStatus_Reserved_LSB 0x7 +#define QIB_6120_IBCStatus_Reserved_RMASK 0x7FFFFF +#define QIB_6120_IBCStatus_LinkState_LSB 0x4 +#define QIB_6120_IBCStatus_LinkState_RMASK 0x7 +#define QIB_6120_IBCStatus_LinkTrainingState_LSB 0x0 +#define QIB_6120_IBCStatus_LinkTrainingState_RMASK 0xF + +#define QIB_6120_IBCCtrl_OFFS 0xC8 +#define QIB_6120_IBCCtrl_Loopback_LSB 0x3F +#define QIB_6120_IBCCtrl_Loopback_RMASK 0x1 +#define QIB_6120_IBCCtrl_LinkDownDefaultState_LSB 0x3E +#define QIB_6120_IBCCtrl_LinkDownDefaultState_RMASK 0x1 +#define QIB_6120_IBCCtrl_Reserved_LSB 0x2B +#define QIB_6120_IBCCtrl_Reserved_RMASK 0x7FFFF +#define QIB_6120_IBCCtrl_CreditScale_LSB 0x28 +#define QIB_6120_IBCCtrl_CreditScale_RMASK 0x7 +#define QIB_6120_IBCCtrl_OverrunThreshold_LSB 0x24 +#define QIB_6120_IBCCtrl_OverrunThreshold_RMASK 0xF +#define QIB_6120_IBCCtrl_PhyerrThreshold_LSB 0x20 +#define QIB_6120_IBCCtrl_PhyerrThreshold_RMASK 0xF +#define QIB_6120_IBCCtrl_Reserved1_LSB 0x1F +#define QIB_6120_IBCCtrl_Reserved1_RMASK 0x1 +#define QIB_6120_IBCCtrl_MaxPktLen_LSB 0x14 +#define QIB_6120_IBCCtrl_MaxPktLen_RMASK 0x7FF +#define QIB_6120_IBCCtrl_LinkCmd_LSB 0x12 +#define QIB_6120_IBCCtrl_LinkCmd_RMASK 0x3 +#define QIB_6120_IBCCtrl_LinkInitCmd_LSB 0x10 +#define QIB_6120_IBCCtrl_LinkInitCmd_RMASK 0x3 +#define QIB_6120_IBCCtrl_FlowCtrlWaterMark_LSB 0x8 +#define QIB_6120_IBCCtrl_FlowCtrlWaterMark_RMASK 0xFF +#define QIB_6120_IBCCtrl_FlowCtrlPeriod_LSB 0x0 +#define QIB_6120_IBCCtrl_FlowCtrlPeriod_RMASK 0xFF + +#define QIB_6120_EXTStatus_OFFS 0xD0 +#define QIB_6120_EXTStatus_GPIOIn_LSB 0x30 +#define QIB_6120_EXTStatus_GPIOIn_RMASK 0xFFFF +#define QIB_6120_EXTStatus_Reserved_LSB 0x20 +#define QIB_6120_EXTStatus_Reserved_RMASK 0xFFFF +#define QIB_6120_EXTStatus_Reserved1_LSB 0x10 +#define QIB_6120_EXTStatus_Reserved1_RMASK 0xFFFF +#define QIB_6120_EXTStatus_MemBISTFoundErr_LSB 0xF +#define QIB_6120_EXTStatus_MemBISTFoundErr_RMASK 0x1 +#define QIB_6120_EXTStatus_MemBISTEndTest_LSB 0xE +#define QIB_6120_EXTStatus_MemBISTEndTest_RMASK 0x1 +#define QIB_6120_EXTStatus_Reserved2_LSB 0x0 +#define QIB_6120_EXTStatus_Reserved2_RMASK 0x3FFF + +#define QIB_6120_EXTCtrl_OFFS 0xD8 +#define QIB_6120_EXTCtrl_GPIOOe_LSB 0x30 +#define QIB_6120_EXTCtrl_GPIOOe_RMASK 0xFFFF +#define QIB_6120_EXTCtrl_GPIOInvert_LSB 0x20 +#define QIB_6120_EXTCtrl_GPIOInvert_RMASK 0xFFFF +#define QIB_6120_EXTCtrl_Reserved_LSB 0x4 +#define QIB_6120_EXTCtrl_Reserved_RMASK 0xFFFFFFF +#define QIB_6120_EXTCtrl_LEDPriPortGreenOn_LSB 0x3 +#define QIB_6120_EXTCtrl_LEDPriPortGreenOn_RMASK 0x1 +#define QIB_6120_EXTCtrl_LEDPriPortYellowOn_LSB 0x2 +#define QIB_6120_EXTCtrl_LEDPriPortYellowOn_RMASK 0x1 +#define QIB_6120_EXTCtrl_LEDGblOkGreenOn_LSB 0x1 +#define QIB_6120_EXTCtrl_LEDGblOkGreenOn_RMASK 0x1 +#define QIB_6120_EXTCtrl_LEDGblErrRedOff_LSB 0x0 +#define QIB_6120_EXTCtrl_LEDGblErrRedOff_RMASK 0x1 + +#define QIB_6120_GPIOOut_OFFS 0xE0 + +#define QIB_6120_GPIOMask_OFFS 0xE8 + +#define QIB_6120_GPIOStatus_OFFS 0xF0 + +#define QIB_6120_GPIOClear_OFFS 0xF8 + +#define QIB_6120_RcvCtrl_OFFS 0x100 +#define QIB_6120_RcvCtrl_TailUpd_LSB 0x1F +#define QIB_6120_RcvCtrl_TailUpd_RMASK 0x1 +#define QIB_6120_RcvCtrl_RcvPartitionKeyDisable_LSB 0x1E +#define QIB_6120_RcvCtrl_RcvPartitionKeyDisable_RMASK 0x1 +#define QIB_6120_RcvCtrl_Reserved_LSB 0x15 +#define QIB_6120_RcvCtrl_Reserved_RMASK 0x1FF +#define QIB_6120_RcvCtrl_IntrAvail_LSB 0x10 +#define 
QIB_6120_RcvCtrl_IntrAvail_RMASK 0x1F +#define QIB_6120_RcvCtrl_Reserved1_LSB 0x9 +#define QIB_6120_RcvCtrl_Reserved1_RMASK 0x7F +#define QIB_6120_RcvCtrl_Reserved2_LSB 0x5 +#define QIB_6120_RcvCtrl_Reserved2_RMASK 0xF +#define QIB_6120_RcvCtrl_PortEnable_LSB 0x0 +#define QIB_6120_RcvCtrl_PortEnable_RMASK 0x1F + +#define QIB_6120_RcvBTHQP_OFFS 0x108 +#define QIB_6120_RcvBTHQP_BTHQP_Mask_LSB 0x1E +#define QIB_6120_RcvBTHQP_BTHQP_Mask_RMASK 0x3 +#define QIB_6120_RcvBTHQP_Reserved_LSB 0x18 +#define QIB_6120_RcvBTHQP_Reserved_RMASK 0x3F +#define QIB_6120_RcvBTHQP_RcvBTHQP_LSB 0x0 +#define QIB_6120_RcvBTHQP_RcvBTHQP_RMASK 0xFFFFFF + +#define QIB_6120_RcvHdrSize_OFFS 0x110 + +#define QIB_6120_RcvHdrCnt_OFFS 0x118 + +#define QIB_6120_RcvHdrEntSize_OFFS 0x120 + +#define QIB_6120_RcvTIDBase_OFFS 0x128 + +#define QIB_6120_RcvTIDCnt_OFFS 0x130 + +#define QIB_6120_RcvEgrBase_OFFS 0x138 + +#define QIB_6120_RcvEgrCnt_OFFS 0x140 + +#define QIB_6120_RcvBufBase_OFFS 0x148 + +#define QIB_6120_RcvBufSize_OFFS 0x150 + +#define QIB_6120_RxIntMemBase_OFFS 0x158 + +#define QIB_6120_RxIntMemSize_OFFS 0x160 + +#define QIB_6120_RcvPartitionKey_OFFS 0x168 + +#define QIB_6120_RcvPktLEDCnt_OFFS 0x178 +#define QIB_6120_RcvPktLEDCnt_ONperiod_LSB 0x20 +#define QIB_6120_RcvPktLEDCnt_ONperiod_RMASK 0xFFFFFFFF +#define QIB_6120_RcvPktLEDCnt_OFFperiod_LSB 0x0 +#define QIB_6120_RcvPktLEDCnt_OFFperiod_RMASK 0xFFFFFFFF + +#define QIB_6120_SendCtrl_OFFS 0x1C0 +#define QIB_6120_SendCtrl_Disarm_LSB 0x1F +#define QIB_6120_SendCtrl_Disarm_RMASK 0x1 +#define QIB_6120_SendCtrl_Reserved_LSB 0x17 +#define QIB_6120_SendCtrl_Reserved_RMASK 0xFF +#define QIB_6120_SendCtrl_DisarmPIOBuf_LSB 0x10 +#define QIB_6120_SendCtrl_DisarmPIOBuf_RMASK 0x7F +#define QIB_6120_SendCtrl_Reserved1_LSB 0x4 +#define QIB_6120_SendCtrl_Reserved1_RMASK 0xFFF +#define QIB_6120_SendCtrl_PIOEnable_LSB 0x3 +#define QIB_6120_SendCtrl_PIOEnable_RMASK 0x1 +#define QIB_6120_SendCtrl_PIOBufAvailUpd_LSB 0x2 +#define QIB_6120_SendCtrl_PIOBufAvailUpd_RMASK 0x1 +#define QIB_6120_SendCtrl_PIOIntBufAvail_LSB 0x1 +#define QIB_6120_SendCtrl_PIOIntBufAvail_RMASK 0x1 +#define QIB_6120_SendCtrl_Abort_LSB 0x0 +#define QIB_6120_SendCtrl_Abort_RMASK 0x1 + +#define QIB_6120_SendPIOBufBase_OFFS 0x1C8 +#define QIB_6120_SendPIOBufBase_Reserved_LSB 0x35 +#define QIB_6120_SendPIOBufBase_Reserved_RMASK 0x7FF +#define QIB_6120_SendPIOBufBase_BaseAddr_LargePIO_LSB 0x20 +#define QIB_6120_SendPIOBufBase_BaseAddr_LargePIO_RMASK 0x1FFFFF +#define QIB_6120_SendPIOBufBase_Reserved1_LSB 0x15 +#define QIB_6120_SendPIOBufBase_Reserved1_RMASK 0x7FF +#define QIB_6120_SendPIOBufBase_BaseAddr_SmallPIO_LSB 0x0 +#define QIB_6120_SendPIOBufBase_BaseAddr_SmallPIO_RMASK 0x1FFFFF + +#define QIB_6120_SendPIOSize_OFFS 0x1D0 +#define QIB_6120_SendPIOSize_Reserved_LSB 0x2D +#define QIB_6120_SendPIOSize_Reserved_RMASK 0xFFFFF +#define QIB_6120_SendPIOSize_Size_LargePIO_LSB 0x20 +#define QIB_6120_SendPIOSize_Size_LargePIO_RMASK 0x1FFF +#define QIB_6120_SendPIOSize_Reserved1_LSB 0xC +#define QIB_6120_SendPIOSize_Reserved1_RMASK 0xFFFFF +#define QIB_6120_SendPIOSize_Size_SmallPIO_LSB 0x0 +#define QIB_6120_SendPIOSize_Size_SmallPIO_RMASK 0xFFF + +#define QIB_6120_SendPIOBufCnt_OFFS 0x1D8 +#define QIB_6120_SendPIOBufCnt_Reserved_LSB 0x24 +#define QIB_6120_SendPIOBufCnt_Reserved_RMASK 0xFFFFFFF +#define QIB_6120_SendPIOBufCnt_Num_LargePIO_LSB 0x20 +#define QIB_6120_SendPIOBufCnt_Num_LargePIO_RMASK 0xF +#define QIB_6120_SendPIOBufCnt_Reserved1_LSB 0x9 +#define QIB_6120_SendPIOBufCnt_Reserved1_RMASK 0x7FFFFF +#define 
QIB_6120_SendPIOBufCnt_Num_SmallPIO_LSB 0x0 +#define QIB_6120_SendPIOBufCnt_Num_SmallPIO_RMASK 0x1FF + +#define QIB_6120_SendPIOAvailAddr_OFFS 0x1E0 +#define QIB_6120_SendPIOAvailAddr_SendPIOAvailAddr_LSB 0x6 +#define QIB_6120_SendPIOAvailAddr_SendPIOAvailAddr_RMASK 0x3FFFFFFFF +#define QIB_6120_SendPIOAvailAddr_Reserved_LSB 0x0 +#define QIB_6120_SendPIOAvailAddr_Reserved_RMASK 0x3F + +#define QIB_6120_SendBufErr0_OFFS 0x240 +#define QIB_6120_SendBufErr0_SendBufErrPIO_63_0_LSB 0x0 +#define QIB_6120_SendBufErr0_SendBufErrPIO_63_0_RMASK 0x0 + +#define QIB_6120_RcvHdrAddr0_OFFS 0x280 +#define QIB_6120_RcvHdrAddr0_RcvHdrAddr0_LSB 0x2 +#define QIB_6120_RcvHdrAddr0_RcvHdrAddr0_RMASK 0x3FFFFFFFFF +#define QIB_6120_RcvHdrAddr0_Reserved_LSB 0x0 +#define QIB_6120_RcvHdrAddr0_Reserved_RMASK 0x3 + +#define QIB_6120_RcvHdrTailAddr0_OFFS 0x300 +#define QIB_6120_RcvHdrTailAddr0_RcvHdrTailAddr0_LSB 0x2 +#define QIB_6120_RcvHdrTailAddr0_RcvHdrTailAddr0_RMASK 0x3FFFFFFFFF +#define QIB_6120_RcvHdrTailAddr0_Reserved_LSB 0x0 +#define QIB_6120_RcvHdrTailAddr0_Reserved_RMASK 0x3 + +#define QIB_6120_SerdesCfg0_OFFS 0x3C0 +#define QIB_6120_SerdesCfg0_DisableIBTxIdleDetect_LSB 0x3F +#define QIB_6120_SerdesCfg0_DisableIBTxIdleDetect_RMASK 0x1 +#define QIB_6120_SerdesCfg0_Reserved_LSB 0x38 +#define QIB_6120_SerdesCfg0_Reserved_RMASK 0x7F +#define QIB_6120_SerdesCfg0_RxEqCtl_LSB 0x36 +#define QIB_6120_SerdesCfg0_RxEqCtl_RMASK 0x3 +#define QIB_6120_SerdesCfg0_TxTermAdj_LSB 0x34 +#define QIB_6120_SerdesCfg0_TxTermAdj_RMASK 0x3 +#define QIB_6120_SerdesCfg0_RxTermAdj_LSB 0x32 +#define QIB_6120_SerdesCfg0_RxTermAdj_RMASK 0x3 +#define QIB_6120_SerdesCfg0_TermAdj1_LSB 0x31 +#define QIB_6120_SerdesCfg0_TermAdj1_RMASK 0x1 +#define QIB_6120_SerdesCfg0_TermAdj0_LSB 0x30 +#define QIB_6120_SerdesCfg0_TermAdj0_RMASK 0x1 +#define QIB_6120_SerdesCfg0_LPBKA_LSB 0x2F +#define QIB_6120_SerdesCfg0_LPBKA_RMASK 0x1 +#define QIB_6120_SerdesCfg0_LPBKB_LSB 0x2E +#define QIB_6120_SerdesCfg0_LPBKB_RMASK 0x1 +#define QIB_6120_SerdesCfg0_LPBKC_LSB 0x2D +#define QIB_6120_SerdesCfg0_LPBKC_RMASK 0x1 +#define QIB_6120_SerdesCfg0_LPBKD_LSB 0x2C +#define QIB_6120_SerdesCfg0_LPBKD_RMASK 0x1 +#define QIB_6120_SerdesCfg0_PW_LSB 0x2B +#define QIB_6120_SerdesCfg0_PW_RMASK 0x1 +#define QIB_6120_SerdesCfg0_RefSel_LSB 0x29 +#define QIB_6120_SerdesCfg0_RefSel_RMASK 0x3 +#define QIB_6120_SerdesCfg0_ParReset_LSB 0x28 +#define QIB_6120_SerdesCfg0_ParReset_RMASK 0x1 +#define QIB_6120_SerdesCfg0_ParLPBK_LSB 0x27 +#define QIB_6120_SerdesCfg0_ParLPBK_RMASK 0x1 +#define QIB_6120_SerdesCfg0_OffsetEn_LSB 0x26 +#define QIB_6120_SerdesCfg0_OffsetEn_RMASK 0x1 +#define QIB_6120_SerdesCfg0_Offset_LSB 0x1E +#define QIB_6120_SerdesCfg0_Offset_RMASK 0xFF +#define QIB_6120_SerdesCfg0_L2PwrDn_LSB 0x1D +#define QIB_6120_SerdesCfg0_L2PwrDn_RMASK 0x1 +#define QIB_6120_SerdesCfg0_ResetPLL_LSB 0x1C +#define QIB_6120_SerdesCfg0_ResetPLL_RMASK 0x1 +#define QIB_6120_SerdesCfg0_RxTermEnX_LSB 0x18 +#define QIB_6120_SerdesCfg0_RxTermEnX_RMASK 0xF +#define QIB_6120_SerdesCfg0_BeaconTxEnX_LSB 0x14 +#define QIB_6120_SerdesCfg0_BeaconTxEnX_RMASK 0xF +#define QIB_6120_SerdesCfg0_RxDetEnX_LSB 0x10 +#define QIB_6120_SerdesCfg0_RxDetEnX_RMASK 0xF +#define QIB_6120_SerdesCfg0_TxIdeEnX_LSB 0xC +#define QIB_6120_SerdesCfg0_TxIdeEnX_RMASK 0xF +#define QIB_6120_SerdesCfg0_RxIdleEnX_LSB 0x8 +#define QIB_6120_SerdesCfg0_RxIdleEnX_RMASK 0xF +#define QIB_6120_SerdesCfg0_L1PwrDnA_LSB 0x7 +#define QIB_6120_SerdesCfg0_L1PwrDnA_RMASK 0x1 +#define QIB_6120_SerdesCfg0_L1PwrDnB_LSB 0x6 +#define 
QIB_6120_SerdesCfg0_L1PwrDnB_RMASK 0x1 +#define QIB_6120_SerdesCfg0_L1PwrDnC_LSB 0x5 +#define QIB_6120_SerdesCfg0_L1PwrDnC_RMASK 0x1 +#define QIB_6120_SerdesCfg0_L1PwrDnD_LSB 0x4 +#define QIB_6120_SerdesCfg0_L1PwrDnD_RMASK 0x1 +#define QIB_6120_SerdesCfg0_ResetA_LSB 0x3 +#define QIB_6120_SerdesCfg0_ResetA_RMASK 0x1 +#define QIB_6120_SerdesCfg0_ResetB_LSB 0x2 +#define QIB_6120_SerdesCfg0_ResetB_RMASK 0x1 +#define QIB_6120_SerdesCfg0_ResetC_LSB 0x1 +#define QIB_6120_SerdesCfg0_ResetC_RMASK 0x1 +#define QIB_6120_SerdesCfg0_ResetD_LSB 0x0 +#define QIB_6120_SerdesCfg0_ResetD_RMASK 0x1 + +#define QIB_6120_SerdesStat_OFFS 0x3D0 +#define QIB_6120_SerdesStat_Reserved_LSB 0xC +#define QIB_6120_SerdesStat_Reserved_RMASK 0xFFFFFFFFFFFFF +#define QIB_6120_SerdesStat_BeaconDetA_LSB 0xB +#define QIB_6120_SerdesStat_BeaconDetA_RMASK 0x1 +#define QIB_6120_SerdesStat_BeaconDetB_LSB 0xA +#define QIB_6120_SerdesStat_BeaconDetB_RMASK 0x1 +#define QIB_6120_SerdesStat_BeaconDetC_LSB 0x9 +#define QIB_6120_SerdesStat_BeaconDetC_RMASK 0x1 +#define QIB_6120_SerdesStat_BeaconDetD_LSB 0x8 +#define QIB_6120_SerdesStat_BeaconDetD_RMASK 0x1 +#define QIB_6120_SerdesStat_RxDetA_LSB 0x7 +#define QIB_6120_SerdesStat_RxDetA_RMASK 0x1 +#define QIB_6120_SerdesStat_RxDetB_LSB 0x6 +#define QIB_6120_SerdesStat_RxDetB_RMASK 0x1 +#define QIB_6120_SerdesStat_RxDetC_LSB 0x5 +#define QIB_6120_SerdesStat_RxDetC_RMASK 0x1 +#define QIB_6120_SerdesStat_RxDetD_LSB 0x4 +#define QIB_6120_SerdesStat_RxDetD_RMASK 0x1 +#define QIB_6120_SerdesStat_TxIdleDetA_LSB 0x3 +#define QIB_6120_SerdesStat_TxIdleDetA_RMASK 0x1 +#define QIB_6120_SerdesStat_TxIdleDetB_LSB 0x2 +#define QIB_6120_SerdesStat_TxIdleDetB_RMASK 0x1 +#define QIB_6120_SerdesStat_TxIdleDetC_LSB 0x1 +#define QIB_6120_SerdesStat_TxIdleDetC_RMASK 0x1 +#define QIB_6120_SerdesStat_TxIdleDetD_LSB 0x0 +#define QIB_6120_SerdesStat_TxIdleDetD_RMASK 0x1 + +#define QIB_6120_XGXSCfg_OFFS 0x3D8 +#define QIB_6120_XGXSCfg_ArmLaunchErrorDisable_LSB 0x3F +#define QIB_6120_XGXSCfg_ArmLaunchErrorDisable_RMASK 0x1 +#define QIB_6120_XGXSCfg_Reserved_LSB 0x17 +#define QIB_6120_XGXSCfg_Reserved_RMASK 0xFFFFFFFFFF +#define QIB_6120_XGXSCfg_polarity_inv_LSB 0x13 +#define QIB_6120_XGXSCfg_polarity_inv_RMASK 0xF +#define QIB_6120_XGXSCfg_link_sync_mask_LSB 0x9 +#define QIB_6120_XGXSCfg_link_sync_mask_RMASK 0x3FF +#define QIB_6120_XGXSCfg_port_addr_LSB 0x4 +#define QIB_6120_XGXSCfg_port_addr_RMASK 0x1F +#define QIB_6120_XGXSCfg_mdd_30_LSB 0x3 +#define QIB_6120_XGXSCfg_mdd_30_RMASK 0x1 +#define QIB_6120_XGXSCfg_xcv_resetn_LSB 0x2 +#define QIB_6120_XGXSCfg_xcv_resetn_RMASK 0x1 +#define QIB_6120_XGXSCfg_Reserved1_LSB 0x1 +#define QIB_6120_XGXSCfg_Reserved1_RMASK 0x1 +#define QIB_6120_XGXSCfg_tx_rx_resetn_LSB 0x0 +#define QIB_6120_XGXSCfg_tx_rx_resetn_RMASK 0x1 + +#define QIB_6120_LBIntCnt_OFFS 0x12000 + +#define QIB_6120_LBFlowStallCnt_OFFS 0x12008 + +#define QIB_6120_TxUnsupVLErrCnt_OFFS 0x12018 + +#define QIB_6120_TxDataPktCnt_OFFS 0x12020 + +#define QIB_6120_TxFlowPktCnt_OFFS 0x12028 + +#define QIB_6120_TxDwordCnt_OFFS 0x12030 + +#define QIB_6120_TxLenErrCnt_OFFS 0x12038 + +#define QIB_6120_TxMaxMinLenErrCnt_OFFS 0x12040 + +#define QIB_6120_TxUnderrunCnt_OFFS 0x12048 + +#define QIB_6120_TxFlowStallCnt_OFFS 0x12050 + +#define QIB_6120_TxDroppedPktCnt_OFFS 0x12058 + +#define QIB_6120_RxDroppedPktCnt_OFFS 0x12060 + +#define QIB_6120_RxDataPktCnt_OFFS 0x12068 + +#define QIB_6120_RxFlowPktCnt_OFFS 0x12070 + +#define QIB_6120_RxDwordCnt_OFFS 0x12078 + +#define QIB_6120_RxLenErrCnt_OFFS 0x12080 + +#define 
QIB_6120_RxMaxMinLenErrCnt_OFFS 0x12088 + +#define QIB_6120_RxICRCErrCnt_OFFS 0x12090 + +#define QIB_6120_RxVCRCErrCnt_OFFS 0x12098 + +#define QIB_6120_RxFlowCtrlErrCnt_OFFS 0x120A0 + +#define QIB_6120_RxBadFormatCnt_OFFS 0x120A8 + +#define QIB_6120_RxLinkProblemCnt_OFFS 0x120B0 + +#define QIB_6120_RxEBPCnt_OFFS 0x120B8 + +#define QIB_6120_RxLPCRCErrCnt_OFFS 0x120C0 + +#define QIB_6120_RxBufOvflCnt_OFFS 0x120C8 + +#define QIB_6120_RxTIDFullErrCnt_OFFS 0x120D0 + +#define QIB_6120_RxTIDValidErrCnt_OFFS 0x120D8 + +#define QIB_6120_RxPKeyMismatchCnt_OFFS 0x120E0 + +#define QIB_6120_RxP0HdrEgrOvflCnt_OFFS 0x120E8 + +#define QIB_6120_IBStatusChangeCnt_OFFS 0x12140 + +#define QIB_6120_IBLinkErrRecoveryCnt_OFFS 0x12148 + +#define QIB_6120_IBLinkDownedCnt_OFFS 0x12150 + +#define QIB_6120_IBSymbolErrCnt_OFFS 0x12158 + +#define QIB_6120_PcieRetryBufDiagQwordCnt_OFFS 0x12170 + +#define QIB_6120_RcvEgrArray0_OFFS 0x14000 + +#define QIB_6120_RcvTIDArray0_OFFS 0x54000 + +#define QIB_6120_PIOLaunchFIFO_OFFS 0x64000 + +#define QIB_6120_SendPIOpbcCache_OFFS 0x64800 + +#define QIB_6120_RcvBuf1_OFFS 0x72000 + +#define QIB_6120_RcvBuf2_OFFS 0x75000 + +#define QIB_6120_RcvFlags_OFFS 0x77000 + +#define QIB_6120_RcvLookupBuf1_OFFS 0x79000 + +#define QIB_6120_RcvDMABuf_OFFS 0x7B000 + +#define QIB_6120_MiscRXEIntMem_OFFS 0x7C000 + +#define QIB_6120_PCIERcvBuf_OFFS 0x80000 + +#define QIB_6120_PCIERetryBuf_OFFS 0x82000 + +#define QIB_6120_PCIERcvBufRdToWrAddr_OFFS 0x84000 + +#define QIB_6120_PIOBuf0_MA_OFFS 0x100000 diff --git a/kernel/drivers/infiniband/hw/qib/qib_7220.h b/kernel/drivers/infiniband/hw/qib/qib_7220.h new file mode 100644 index 000000000..a5356cb42 --- /dev/null +++ b/kernel/drivers/infiniband/hw/qib/qib_7220.h @@ -0,0 +1,149 @@ +#ifndef _QIB_7220_H +#define _QIB_7220_H +/* + * Copyright (c) 2007, 2009, 2010 QLogic Corporation. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +/* grab register-defs auto-generated by HW */ +#include "qib_7220_regs.h" + +/* The number of eager receive TIDs for context zero. 
+ */
+#define IBA7220_KRCVEGRCNT 2048U
+
+#define IB_7220_LT_STATE_CFGRCVFCFG 0x09
+#define IB_7220_LT_STATE_CFGWAITRMT 0x0a
+#define IB_7220_LT_STATE_TXREVLANES 0x0d
+#define IB_7220_LT_STATE_CFGENH 0x10
+
+struct qib_chip_specific {
+ u64 __iomem *cregbase;
+ u64 *cntrs;
+ u64 *portcntrs;
+ spinlock_t sdepb_lock; /* serdes EPB bus */
+ spinlock_t rcvmod_lock; /* protect rcvctrl shadow changes */
+ spinlock_t gpio_lock; /* RMW of shadows/regs for ExtCtrl and GPIO */
+ u64 hwerrmask;
+ u64 errormask;
+ u64 gpio_out; /* shadow of kr_gpio_out, for rmw ops */
+ u64 gpio_mask; /* shadow the gpio mask register */
+ u64 extctrl; /* shadow the gpio output enable, etc... */
+ u32 ncntrs;
+ u32 nportcntrs;
+ u32 cntrnamelen;
+ u32 portcntrnamelen;
+ u32 numctxts;
+ u32 rcvegrcnt;
+ u32 autoneg_tries;
+ u32 serdes_first_init_done;
+ u32 sdmabufcnt;
+ u32 lastbuf_for_pio;
+ u32 updthresh; /* current AvailUpdThld */
+ u32 updthresh_dflt; /* default AvailUpdThld */
+ int irq;
+ u8 presets_needed;
+ u8 relock_timer_active;
+ char emsgbuf[128];
+ char sdmamsgbuf[192];
+ char bitsmsgbuf[64];
+ struct timer_list relock_timer;
+ unsigned int relock_interval; /* in jiffies */
+};
+
+struct qib_chippport_specific {
+ struct qib_pportdata pportdata;
+ wait_queue_head_t autoneg_wait;
+ struct delayed_work autoneg_work;
+ struct timer_list chase_timer;
+ /*
+ * these 5 fields are used to establish deltas for IB symbol
+ * errors and linkrecovery errors. They can be reported on
+ * some chips during link negotiation prior to INIT, and with
+ * DDR when faking DDR negotiations with non-IBTA switches.
+ * The chip counters are adjusted at driver unload if there is
+ * a non-zero delta.
+ */
+ u64 ibdeltainprog;
+ u64 ibsymdelta;
+ u64 ibsymsnap;
+ u64 iblnkerrdelta;
+ u64 iblnkerrsnap;
+ u64 ibcctrl; /* kr_ibcctrl shadow */
+ u64 ibcddrctrl; /* kr_ibcddrctrl shadow */
+ unsigned long chase_end;
+ u32 last_delay_mult;
+};
+
+/*
+ * This header file provides the declarations and common definitions
+ * for (mostly) manipulation of the SerDes blocks within the IBA7220.
+ * the functions declared should only be called from within other
+ * 7220-related files such as qib_iba7220.c or qib_sd7220.c.
+ */
+int qib_sd7220_presets(struct qib_devdata *dd);
+int qib_sd7220_init(struct qib_devdata *dd);
+void qib_sd7220_clr_ibpar(struct qib_devdata *);
+/*
+ * Below used for sdnum parameter, selecting one of the two sections
+ * used for PCIe, or the single SerDes used for IB, which is the
+ * only one currently used
+ */
+#define IB_7220_SERDES 2
+
+static inline u32 qib_read_kreg32(const struct qib_devdata *dd,
+ const u16 regno)
+{
+ if (!dd->kregbase || !(dd->flags & QIB_PRESENT))
+ return -1;
+ return readl((u32 __iomem *)&dd->kregbase[regno]);
+}
+
+static inline u64 qib_read_kreg64(const struct qib_devdata *dd,
+ const u16 regno)
+{
+ if (!dd->kregbase || !(dd->flags & QIB_PRESENT))
+ return -1;
+
+ return readq(&dd->kregbase[regno]);
+}
+
+static inline void qib_write_kreg(const struct qib_devdata *dd,
+ const u16 regno, u64 value)
+{
+ if (dd->kregbase)
+ writeq(value, &dd->kregbase[regno]);
+}
+
+void set_7220_relock_poll(struct qib_devdata *, int);
+void shutdown_7220_relock_poll(struct qib_devdata *);
+void toggle_7220_rclkrls(struct qib_devdata *);
+
+
+#endif /* _QIB_7220_H */
diff --git a/kernel/drivers/infiniband/hw/qib/qib_7220_regs.h b/kernel/drivers/infiniband/hw/qib/qib_7220_regs.h
new file mode 100644
index 000000000..0da5bb750
--- /dev/null
+++ b/kernel/drivers/infiniband/hw/qib/qib_7220_regs.h
@@ -0,0 +1,1496 @@
+/*
+ * Copyright (c) 2008, 2009, 2010 QLogic Corporation. All rights reserved.
+ *
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+
+/* This file is mechanically generated from RTL. Any hand-edits will be lost!
*/ + +#define QIB_7220_Revision_OFFS 0x0 +#define QIB_7220_Revision_R_Simulator_LSB 0x3F +#define QIB_7220_Revision_R_Simulator_RMASK 0x1 +#define QIB_7220_Revision_R_Emulation_LSB 0x3E +#define QIB_7220_Revision_R_Emulation_RMASK 0x1 +#define QIB_7220_Revision_R_Emulation_Revcode_LSB 0x28 +#define QIB_7220_Revision_R_Emulation_Revcode_RMASK 0x3FFFFF +#define QIB_7220_Revision_BoardID_LSB 0x20 +#define QIB_7220_Revision_BoardID_RMASK 0xFF +#define QIB_7220_Revision_R_SW_LSB 0x18 +#define QIB_7220_Revision_R_SW_RMASK 0xFF +#define QIB_7220_Revision_R_Arch_LSB 0x10 +#define QIB_7220_Revision_R_Arch_RMASK 0xFF +#define QIB_7220_Revision_R_ChipRevMajor_LSB 0x8 +#define QIB_7220_Revision_R_ChipRevMajor_RMASK 0xFF +#define QIB_7220_Revision_R_ChipRevMinor_LSB 0x0 +#define QIB_7220_Revision_R_ChipRevMinor_RMASK 0xFF + +#define QIB_7220_Control_OFFS 0x8 +#define QIB_7220_Control_SyncResetExceptPcieIRAMRST_LSB 0x7 +#define QIB_7220_Control_SyncResetExceptPcieIRAMRST_RMASK 0x1 +#define QIB_7220_Control_PCIECplQDiagEn_LSB 0x6 +#define QIB_7220_Control_PCIECplQDiagEn_RMASK 0x1 +#define QIB_7220_Control_Reserved_LSB 0x5 +#define QIB_7220_Control_Reserved_RMASK 0x1 +#define QIB_7220_Control_TxLatency_LSB 0x4 +#define QIB_7220_Control_TxLatency_RMASK 0x1 +#define QIB_7220_Control_PCIERetryBufDiagEn_LSB 0x3 +#define QIB_7220_Control_PCIERetryBufDiagEn_RMASK 0x1 +#define QIB_7220_Control_LinkEn_LSB 0x2 +#define QIB_7220_Control_LinkEn_RMASK 0x1 +#define QIB_7220_Control_FreezeMode_LSB 0x1 +#define QIB_7220_Control_FreezeMode_RMASK 0x1 +#define QIB_7220_Control_SyncReset_LSB 0x0 +#define QIB_7220_Control_SyncReset_RMASK 0x1 + +#define QIB_7220_PageAlign_OFFS 0x10 + +#define QIB_7220_PortCnt_OFFS 0x18 + +#define QIB_7220_SendRegBase_OFFS 0x30 + +#define QIB_7220_UserRegBase_OFFS 0x38 + +#define QIB_7220_CntrRegBase_OFFS 0x40 + +#define QIB_7220_Scratch_OFFS 0x48 + +#define QIB_7220_IntMask_OFFS 0x68 +#define QIB_7220_IntMask_SDmaIntMask_LSB 0x3F +#define QIB_7220_IntMask_SDmaIntMask_RMASK 0x1 +#define QIB_7220_IntMask_SDmaDisabledMasked_LSB 0x3E +#define QIB_7220_IntMask_SDmaDisabledMasked_RMASK 0x1 +#define QIB_7220_IntMask_Reserved_LSB 0x31 +#define QIB_7220_IntMask_Reserved_RMASK 0x1FFF +#define QIB_7220_IntMask_RcvUrg16IntMask_LSB 0x30 +#define QIB_7220_IntMask_RcvUrg16IntMask_RMASK 0x1 +#define QIB_7220_IntMask_RcvUrg15IntMask_LSB 0x2F +#define QIB_7220_IntMask_RcvUrg15IntMask_RMASK 0x1 +#define QIB_7220_IntMask_RcvUrg14IntMask_LSB 0x2E +#define QIB_7220_IntMask_RcvUrg14IntMask_RMASK 0x1 +#define QIB_7220_IntMask_RcvUrg13IntMask_LSB 0x2D +#define QIB_7220_IntMask_RcvUrg13IntMask_RMASK 0x1 +#define QIB_7220_IntMask_RcvUrg12IntMask_LSB 0x2C +#define QIB_7220_IntMask_RcvUrg12IntMask_RMASK 0x1 +#define QIB_7220_IntMask_RcvUrg11IntMask_LSB 0x2B +#define QIB_7220_IntMask_RcvUrg11IntMask_RMASK 0x1 +#define QIB_7220_IntMask_RcvUrg10IntMask_LSB 0x2A +#define QIB_7220_IntMask_RcvUrg10IntMask_RMASK 0x1 +#define QIB_7220_IntMask_RcvUrg9IntMask_LSB 0x29 +#define QIB_7220_IntMask_RcvUrg9IntMask_RMASK 0x1 +#define QIB_7220_IntMask_RcvUrg8IntMask_LSB 0x28 +#define QIB_7220_IntMask_RcvUrg8IntMask_RMASK 0x1 +#define QIB_7220_IntMask_RcvUrg7IntMask_LSB 0x27 +#define QIB_7220_IntMask_RcvUrg7IntMask_RMASK 0x1 +#define QIB_7220_IntMask_RcvUrg6IntMask_LSB 0x26 +#define QIB_7220_IntMask_RcvUrg6IntMask_RMASK 0x1 +#define QIB_7220_IntMask_RcvUrg5IntMask_LSB 0x25 +#define QIB_7220_IntMask_RcvUrg5IntMask_RMASK 0x1 +#define QIB_7220_IntMask_RcvUrg4IntMask_LSB 0x24 +#define QIB_7220_IntMask_RcvUrg4IntMask_RMASK 0x1 +#define 
QIB_7220_IntMask_RcvUrg3IntMask_LSB 0x23 +#define QIB_7220_IntMask_RcvUrg3IntMask_RMASK 0x1 +#define QIB_7220_IntMask_RcvUrg2IntMask_LSB 0x22 +#define QIB_7220_IntMask_RcvUrg2IntMask_RMASK 0x1 +#define QIB_7220_IntMask_RcvUrg1IntMask_LSB 0x21 +#define QIB_7220_IntMask_RcvUrg1IntMask_RMASK 0x1 +#define QIB_7220_IntMask_RcvUrg0IntMask_LSB 0x20 +#define QIB_7220_IntMask_RcvUrg0IntMask_RMASK 0x1 +#define QIB_7220_IntMask_ErrorIntMask_LSB 0x1F +#define QIB_7220_IntMask_ErrorIntMask_RMASK 0x1 +#define QIB_7220_IntMask_PioSetIntMask_LSB 0x1E +#define QIB_7220_IntMask_PioSetIntMask_RMASK 0x1 +#define QIB_7220_IntMask_PioBufAvailIntMask_LSB 0x1D +#define QIB_7220_IntMask_PioBufAvailIntMask_RMASK 0x1 +#define QIB_7220_IntMask_assertGPIOIntMask_LSB 0x1C +#define QIB_7220_IntMask_assertGPIOIntMask_RMASK 0x1 +#define QIB_7220_IntMask_IBSerdesTrimDoneIntMask_LSB 0x1B +#define QIB_7220_IntMask_IBSerdesTrimDoneIntMask_RMASK 0x1 +#define QIB_7220_IntMask_JIntMask_LSB 0x1A +#define QIB_7220_IntMask_JIntMask_RMASK 0x1 +#define QIB_7220_IntMask_Reserved1_LSB 0x11 +#define QIB_7220_IntMask_Reserved1_RMASK 0x1FF +#define QIB_7220_IntMask_RcvAvail16IntMask_LSB 0x10 +#define QIB_7220_IntMask_RcvAvail16IntMask_RMASK 0x1 +#define QIB_7220_IntMask_RcvAvail15IntMask_LSB 0xF +#define QIB_7220_IntMask_RcvAvail15IntMask_RMASK 0x1 +#define QIB_7220_IntMask_RcvAvail14IntMask_LSB 0xE +#define QIB_7220_IntMask_RcvAvail14IntMask_RMASK 0x1 +#define QIB_7220_IntMask_RcvAvail13IntMask_LSB 0xD +#define QIB_7220_IntMask_RcvAvail13IntMask_RMASK 0x1 +#define QIB_7220_IntMask_RcvAvail12IntMask_LSB 0xC +#define QIB_7220_IntMask_RcvAvail12IntMask_RMASK 0x1 +#define QIB_7220_IntMask_RcvAvail11IntMask_LSB 0xB +#define QIB_7220_IntMask_RcvAvail11IntMask_RMASK 0x1 +#define QIB_7220_IntMask_RcvAvail10IntMask_LSB 0xA +#define QIB_7220_IntMask_RcvAvail10IntMask_RMASK 0x1 +#define QIB_7220_IntMask_RcvAvail9IntMask_LSB 0x9 +#define QIB_7220_IntMask_RcvAvail9IntMask_RMASK 0x1 +#define QIB_7220_IntMask_RcvAvail8IntMask_LSB 0x8 +#define QIB_7220_IntMask_RcvAvail8IntMask_RMASK 0x1 +#define QIB_7220_IntMask_RcvAvail7IntMask_LSB 0x7 +#define QIB_7220_IntMask_RcvAvail7IntMask_RMASK 0x1 +#define QIB_7220_IntMask_RcvAvail6IntMask_LSB 0x6 +#define QIB_7220_IntMask_RcvAvail6IntMask_RMASK 0x1 +#define QIB_7220_IntMask_RcvAvail5IntMask_LSB 0x5 +#define QIB_7220_IntMask_RcvAvail5IntMask_RMASK 0x1 +#define QIB_7220_IntMask_RcvAvail4IntMask_LSB 0x4 +#define QIB_7220_IntMask_RcvAvail4IntMask_RMASK 0x1 +#define QIB_7220_IntMask_RcvAvail3IntMask_LSB 0x3 +#define QIB_7220_IntMask_RcvAvail3IntMask_RMASK 0x1 +#define QIB_7220_IntMask_RcvAvail2IntMask_LSB 0x2 +#define QIB_7220_IntMask_RcvAvail2IntMask_RMASK 0x1 +#define QIB_7220_IntMask_RcvAvail1IntMask_LSB 0x1 +#define QIB_7220_IntMask_RcvAvail1IntMask_RMASK 0x1 +#define QIB_7220_IntMask_RcvAvail0IntMask_LSB 0x0 +#define QIB_7220_IntMask_RcvAvail0IntMask_RMASK 0x1 + +#define QIB_7220_IntStatus_OFFS 0x70 +#define QIB_7220_IntStatus_SDmaInt_LSB 0x3F +#define QIB_7220_IntStatus_SDmaInt_RMASK 0x1 +#define QIB_7220_IntStatus_SDmaDisabled_LSB 0x3E +#define QIB_7220_IntStatus_SDmaDisabled_RMASK 0x1 +#define QIB_7220_IntStatus_Reserved_LSB 0x31 +#define QIB_7220_IntStatus_Reserved_RMASK 0x1FFF +#define QIB_7220_IntStatus_RcvUrg16_LSB 0x30 +#define QIB_7220_IntStatus_RcvUrg16_RMASK 0x1 +#define QIB_7220_IntStatus_RcvUrg15_LSB 0x2F +#define QIB_7220_IntStatus_RcvUrg15_RMASK 0x1 +#define QIB_7220_IntStatus_RcvUrg14_LSB 0x2E +#define QIB_7220_IntStatus_RcvUrg14_RMASK 0x1 +#define QIB_7220_IntStatus_RcvUrg13_LSB 0x2D +#define 
QIB_7220_IntStatus_RcvUrg13_RMASK 0x1 +#define QIB_7220_IntStatus_RcvUrg12_LSB 0x2C +#define QIB_7220_IntStatus_RcvUrg12_RMASK 0x1 +#define QIB_7220_IntStatus_RcvUrg11_LSB 0x2B +#define QIB_7220_IntStatus_RcvUrg11_RMASK 0x1 +#define QIB_7220_IntStatus_RcvUrg10_LSB 0x2A +#define QIB_7220_IntStatus_RcvUrg10_RMASK 0x1 +#define QIB_7220_IntStatus_RcvUrg9_LSB 0x29 +#define QIB_7220_IntStatus_RcvUrg9_RMASK 0x1 +#define QIB_7220_IntStatus_RcvUrg8_LSB 0x28 +#define QIB_7220_IntStatus_RcvUrg8_RMASK 0x1 +#define QIB_7220_IntStatus_RcvUrg7_LSB 0x27 +#define QIB_7220_IntStatus_RcvUrg7_RMASK 0x1 +#define QIB_7220_IntStatus_RcvUrg6_LSB 0x26 +#define QIB_7220_IntStatus_RcvUrg6_RMASK 0x1 +#define QIB_7220_IntStatus_RcvUrg5_LSB 0x25 +#define QIB_7220_IntStatus_RcvUrg5_RMASK 0x1 +#define QIB_7220_IntStatus_RcvUrg4_LSB 0x24 +#define QIB_7220_IntStatus_RcvUrg4_RMASK 0x1 +#define QIB_7220_IntStatus_RcvUrg3_LSB 0x23 +#define QIB_7220_IntStatus_RcvUrg3_RMASK 0x1 +#define QIB_7220_IntStatus_RcvUrg2_LSB 0x22 +#define QIB_7220_IntStatus_RcvUrg2_RMASK 0x1 +#define QIB_7220_IntStatus_RcvUrg1_LSB 0x21 +#define QIB_7220_IntStatus_RcvUrg1_RMASK 0x1 +#define QIB_7220_IntStatus_RcvUrg0_LSB 0x20 +#define QIB_7220_IntStatus_RcvUrg0_RMASK 0x1 +#define QIB_7220_IntStatus_Error_LSB 0x1F +#define QIB_7220_IntStatus_Error_RMASK 0x1 +#define QIB_7220_IntStatus_PioSent_LSB 0x1E +#define QIB_7220_IntStatus_PioSent_RMASK 0x1 +#define QIB_7220_IntStatus_PioBufAvail_LSB 0x1D +#define QIB_7220_IntStatus_PioBufAvail_RMASK 0x1 +#define QIB_7220_IntStatus_assertGPIO_LSB 0x1C +#define QIB_7220_IntStatus_assertGPIO_RMASK 0x1 +#define QIB_7220_IntStatus_IBSerdesTrimDone_LSB 0x1B +#define QIB_7220_IntStatus_IBSerdesTrimDone_RMASK 0x1 +#define QIB_7220_IntStatus_JInt_LSB 0x1A +#define QIB_7220_IntStatus_JInt_RMASK 0x1 +#define QIB_7220_IntStatus_Reserved1_LSB 0x11 +#define QIB_7220_IntStatus_Reserved1_RMASK 0x1FF +#define QIB_7220_IntStatus_RcvAvail16_LSB 0x10 +#define QIB_7220_IntStatus_RcvAvail16_RMASK 0x1 +#define QIB_7220_IntStatus_RcvAvail15_LSB 0xF +#define QIB_7220_IntStatus_RcvAvail15_RMASK 0x1 +#define QIB_7220_IntStatus_RcvAvail14_LSB 0xE +#define QIB_7220_IntStatus_RcvAvail14_RMASK 0x1 +#define QIB_7220_IntStatus_RcvAvail13_LSB 0xD +#define QIB_7220_IntStatus_RcvAvail13_RMASK 0x1 +#define QIB_7220_IntStatus_RcvAvail12_LSB 0xC +#define QIB_7220_IntStatus_RcvAvail12_RMASK 0x1 +#define QIB_7220_IntStatus_RcvAvail11_LSB 0xB +#define QIB_7220_IntStatus_RcvAvail11_RMASK 0x1 +#define QIB_7220_IntStatus_RcvAvail10_LSB 0xA +#define QIB_7220_IntStatus_RcvAvail10_RMASK 0x1 +#define QIB_7220_IntStatus_RcvAvail9_LSB 0x9 +#define QIB_7220_IntStatus_RcvAvail9_RMASK 0x1 +#define QIB_7220_IntStatus_RcvAvail8_LSB 0x8 +#define QIB_7220_IntStatus_RcvAvail8_RMASK 0x1 +#define QIB_7220_IntStatus_RcvAvail7_LSB 0x7 +#define QIB_7220_IntStatus_RcvAvail7_RMASK 0x1 +#define QIB_7220_IntStatus_RcvAvail6_LSB 0x6 +#define QIB_7220_IntStatus_RcvAvail6_RMASK 0x1 +#define QIB_7220_IntStatus_RcvAvail5_LSB 0x5 +#define QIB_7220_IntStatus_RcvAvail5_RMASK 0x1 +#define QIB_7220_IntStatus_RcvAvail4_LSB 0x4 +#define QIB_7220_IntStatus_RcvAvail4_RMASK 0x1 +#define QIB_7220_IntStatus_RcvAvail3_LSB 0x3 +#define QIB_7220_IntStatus_RcvAvail3_RMASK 0x1 +#define QIB_7220_IntStatus_RcvAvail2_LSB 0x2 +#define QIB_7220_IntStatus_RcvAvail2_RMASK 0x1 +#define QIB_7220_IntStatus_RcvAvail1_LSB 0x1 +#define QIB_7220_IntStatus_RcvAvail1_RMASK 0x1 +#define QIB_7220_IntStatus_RcvAvail0_LSB 0x0 +#define QIB_7220_IntStatus_RcvAvail0_RMASK 0x1 + +#define QIB_7220_IntClear_OFFS 0x78 +#define 
QIB_7220_IntClear_SDmaIntClear_LSB 0x3F +#define QIB_7220_IntClear_SDmaIntClear_RMASK 0x1 +#define QIB_7220_IntClear_SDmaDisabledClear_LSB 0x3E +#define QIB_7220_IntClear_SDmaDisabledClear_RMASK 0x1 +#define QIB_7220_IntClear_Reserved_LSB 0x31 +#define QIB_7220_IntClear_Reserved_RMASK 0x1FFF +#define QIB_7220_IntClear_RcvUrg16IntClear_LSB 0x30 +#define QIB_7220_IntClear_RcvUrg16IntClear_RMASK 0x1 +#define QIB_7220_IntClear_RcvUrg15IntClear_LSB 0x2F +#define QIB_7220_IntClear_RcvUrg15IntClear_RMASK 0x1 +#define QIB_7220_IntClear_RcvUrg14IntClear_LSB 0x2E +#define QIB_7220_IntClear_RcvUrg14IntClear_RMASK 0x1 +#define QIB_7220_IntClear_RcvUrg13IntClear_LSB 0x2D +#define QIB_7220_IntClear_RcvUrg13IntClear_RMASK 0x1 +#define QIB_7220_IntClear_RcvUrg12IntClear_LSB 0x2C +#define QIB_7220_IntClear_RcvUrg12IntClear_RMASK 0x1 +#define QIB_7220_IntClear_RcvUrg11IntClear_LSB 0x2B +#define QIB_7220_IntClear_RcvUrg11IntClear_RMASK 0x1 +#define QIB_7220_IntClear_RcvUrg10IntClear_LSB 0x2A +#define QIB_7220_IntClear_RcvUrg10IntClear_RMASK 0x1 +#define QIB_7220_IntClear_RcvUrg9IntClear_LSB 0x29 +#define QIB_7220_IntClear_RcvUrg9IntClear_RMASK 0x1 +#define QIB_7220_IntClear_RcvUrg8IntClear_LSB 0x28 +#define QIB_7220_IntClear_RcvUrg8IntClear_RMASK 0x1 +#define QIB_7220_IntClear_RcvUrg7IntClear_LSB 0x27 +#define QIB_7220_IntClear_RcvUrg7IntClear_RMASK 0x1 +#define QIB_7220_IntClear_RcvUrg6IntClear_LSB 0x26 +#define QIB_7220_IntClear_RcvUrg6IntClear_RMASK 0x1 +#define QIB_7220_IntClear_RcvUrg5IntClear_LSB 0x25 +#define QIB_7220_IntClear_RcvUrg5IntClear_RMASK 0x1 +#define QIB_7220_IntClear_RcvUrg4IntClear_LSB 0x24 +#define QIB_7220_IntClear_RcvUrg4IntClear_RMASK 0x1 +#define QIB_7220_IntClear_RcvUrg3IntClear_LSB 0x23 +#define QIB_7220_IntClear_RcvUrg3IntClear_RMASK 0x1 +#define QIB_7220_IntClear_RcvUrg2IntClear_LSB 0x22 +#define QIB_7220_IntClear_RcvUrg2IntClear_RMASK 0x1 +#define QIB_7220_IntClear_RcvUrg1IntClear_LSB 0x21 +#define QIB_7220_IntClear_RcvUrg1IntClear_RMASK 0x1 +#define QIB_7220_IntClear_RcvUrg0IntClear_LSB 0x20 +#define QIB_7220_IntClear_RcvUrg0IntClear_RMASK 0x1 +#define QIB_7220_IntClear_ErrorIntClear_LSB 0x1F +#define QIB_7220_IntClear_ErrorIntClear_RMASK 0x1 +#define QIB_7220_IntClear_PioSetIntClear_LSB 0x1E +#define QIB_7220_IntClear_PioSetIntClear_RMASK 0x1 +#define QIB_7220_IntClear_PioBufAvailIntClear_LSB 0x1D +#define QIB_7220_IntClear_PioBufAvailIntClear_RMASK 0x1 +#define QIB_7220_IntClear_assertGPIOIntClear_LSB 0x1C +#define QIB_7220_IntClear_assertGPIOIntClear_RMASK 0x1 +#define QIB_7220_IntClear_IBSerdesTrimDoneClear_LSB 0x1B +#define QIB_7220_IntClear_IBSerdesTrimDoneClear_RMASK 0x1 +#define QIB_7220_IntClear_JIntClear_LSB 0x1A +#define QIB_7220_IntClear_JIntClear_RMASK 0x1 +#define QIB_7220_IntClear_Reserved1_LSB 0x11 +#define QIB_7220_IntClear_Reserved1_RMASK 0x1FF +#define QIB_7220_IntClear_RcvAvail16IntClear_LSB 0x10 +#define QIB_7220_IntClear_RcvAvail16IntClear_RMASK 0x1 +#define QIB_7220_IntClear_RcvAvail15IntClear_LSB 0xF +#define QIB_7220_IntClear_RcvAvail15IntClear_RMASK 0x1 +#define QIB_7220_IntClear_RcvAvail14IntClear_LSB 0xE +#define QIB_7220_IntClear_RcvAvail14IntClear_RMASK 0x1 +#define QIB_7220_IntClear_RcvAvail13IntClear_LSB 0xD +#define QIB_7220_IntClear_RcvAvail13IntClear_RMASK 0x1 +#define QIB_7220_IntClear_RcvAvail12IntClear_LSB 0xC +#define QIB_7220_IntClear_RcvAvail12IntClear_RMASK 0x1 +#define QIB_7220_IntClear_RcvAvail11IntClear_LSB 0xB +#define QIB_7220_IntClear_RcvAvail11IntClear_RMASK 0x1 +#define QIB_7220_IntClear_RcvAvail10IntClear_LSB 0xA +#define 
QIB_7220_IntClear_RcvAvail10IntClear_RMASK 0x1 +#define QIB_7220_IntClear_RcvAvail9IntClear_LSB 0x9 +#define QIB_7220_IntClear_RcvAvail9IntClear_RMASK 0x1 +#define QIB_7220_IntClear_RcvAvail8IntClear_LSB 0x8 +#define QIB_7220_IntClear_RcvAvail8IntClear_RMASK 0x1 +#define QIB_7220_IntClear_RcvAvail7IntClear_LSB 0x7 +#define QIB_7220_IntClear_RcvAvail7IntClear_RMASK 0x1 +#define QIB_7220_IntClear_RcvAvail6IntClear_LSB 0x6 +#define QIB_7220_IntClear_RcvAvail6IntClear_RMASK 0x1 +#define QIB_7220_IntClear_RcvAvail5IntClear_LSB 0x5 +#define QIB_7220_IntClear_RcvAvail5IntClear_RMASK 0x1 +#define QIB_7220_IntClear_RcvAvail4IntClear_LSB 0x4 +#define QIB_7220_IntClear_RcvAvail4IntClear_RMASK 0x1 +#define QIB_7220_IntClear_RcvAvail3IntClear_LSB 0x3 +#define QIB_7220_IntClear_RcvAvail3IntClear_RMASK 0x1 +#define QIB_7220_IntClear_RcvAvail2IntClear_LSB 0x2 +#define QIB_7220_IntClear_RcvAvail2IntClear_RMASK 0x1 +#define QIB_7220_IntClear_RcvAvail1IntClear_LSB 0x1 +#define QIB_7220_IntClear_RcvAvail1IntClear_RMASK 0x1 +#define QIB_7220_IntClear_RcvAvail0IntClear_LSB 0x0 +#define QIB_7220_IntClear_RcvAvail0IntClear_RMASK 0x1 + +#define QIB_7220_ErrMask_OFFS 0x80 +#define QIB_7220_ErrMask_Reserved_LSB 0x36 +#define QIB_7220_ErrMask_Reserved_RMASK 0x3FF +#define QIB_7220_ErrMask_InvalidEEPCmdMask_LSB 0x35 +#define QIB_7220_ErrMask_InvalidEEPCmdMask_RMASK 0x1 +#define QIB_7220_ErrMask_SDmaDescAddrMisalignErrMask_LSB 0x34 +#define QIB_7220_ErrMask_SDmaDescAddrMisalignErrMask_RMASK 0x1 +#define QIB_7220_ErrMask_HardwareErrMask_LSB 0x33 +#define QIB_7220_ErrMask_HardwareErrMask_RMASK 0x1 +#define QIB_7220_ErrMask_ResetNegatedMask_LSB 0x32 +#define QIB_7220_ErrMask_ResetNegatedMask_RMASK 0x1 +#define QIB_7220_ErrMask_InvalidAddrErrMask_LSB 0x31 +#define QIB_7220_ErrMask_InvalidAddrErrMask_RMASK 0x1 +#define QIB_7220_ErrMask_IBStatusChangedMask_LSB 0x30 +#define QIB_7220_ErrMask_IBStatusChangedMask_RMASK 0x1 +#define QIB_7220_ErrMask_SDmaUnexpDataErrMask_LSB 0x2F +#define QIB_7220_ErrMask_SDmaUnexpDataErrMask_RMASK 0x1 +#define QIB_7220_ErrMask_SDmaMissingDwErrMask_LSB 0x2E +#define QIB_7220_ErrMask_SDmaMissingDwErrMask_RMASK 0x1 +#define QIB_7220_ErrMask_SDmaDwEnErrMask_LSB 0x2D +#define QIB_7220_ErrMask_SDmaDwEnErrMask_RMASK 0x1 +#define QIB_7220_ErrMask_SDmaRpyTagErrMask_LSB 0x2C +#define QIB_7220_ErrMask_SDmaRpyTagErrMask_RMASK 0x1 +#define QIB_7220_ErrMask_SDma1stDescErrMask_LSB 0x2B +#define QIB_7220_ErrMask_SDma1stDescErrMask_RMASK 0x1 +#define QIB_7220_ErrMask_SDmaBaseErrMask_LSB 0x2A +#define QIB_7220_ErrMask_SDmaBaseErrMask_RMASK 0x1 +#define QIB_7220_ErrMask_SDmaTailOutOfBoundErrMask_LSB 0x29 +#define QIB_7220_ErrMask_SDmaTailOutOfBoundErrMask_RMASK 0x1 +#define QIB_7220_ErrMask_SDmaOutOfBoundErrMask_LSB 0x28 +#define QIB_7220_ErrMask_SDmaOutOfBoundErrMask_RMASK 0x1 +#define QIB_7220_ErrMask_SDmaGenMismatchErrMask_LSB 0x27 +#define QIB_7220_ErrMask_SDmaGenMismatchErrMask_RMASK 0x1 +#define QIB_7220_ErrMask_SendBufMisuseErrMask_LSB 0x26 +#define QIB_7220_ErrMask_SendBufMisuseErrMask_RMASK 0x1 +#define QIB_7220_ErrMask_SendUnsupportedVLErrMask_LSB 0x25 +#define QIB_7220_ErrMask_SendUnsupportedVLErrMask_RMASK 0x1 +#define QIB_7220_ErrMask_SendUnexpectedPktNumErrMask_LSB 0x24 +#define QIB_7220_ErrMask_SendUnexpectedPktNumErrMask_RMASK 0x1 +#define QIB_7220_ErrMask_SendPioArmLaunchErrMask_LSB 0x23 +#define QIB_7220_ErrMask_SendPioArmLaunchErrMask_RMASK 0x1 +#define QIB_7220_ErrMask_SendDroppedDataPktErrMask_LSB 0x22 +#define QIB_7220_ErrMask_SendDroppedDataPktErrMask_RMASK 0x1 +#define 
QIB_7220_ErrMask_SendDroppedSmpPktErrMask_LSB 0x21 +#define QIB_7220_ErrMask_SendDroppedSmpPktErrMask_RMASK 0x1 +#define QIB_7220_ErrMask_SendPktLenErrMask_LSB 0x20 +#define QIB_7220_ErrMask_SendPktLenErrMask_RMASK 0x1 +#define QIB_7220_ErrMask_SendUnderRunErrMask_LSB 0x1F +#define QIB_7220_ErrMask_SendUnderRunErrMask_RMASK 0x1 +#define QIB_7220_ErrMask_SendMaxPktLenErrMask_LSB 0x1E +#define QIB_7220_ErrMask_SendMaxPktLenErrMask_RMASK 0x1 +#define QIB_7220_ErrMask_SendMinPktLenErrMask_LSB 0x1D +#define QIB_7220_ErrMask_SendMinPktLenErrMask_RMASK 0x1 +#define QIB_7220_ErrMask_SDmaDisabledErrMask_LSB 0x1C +#define QIB_7220_ErrMask_SDmaDisabledErrMask_RMASK 0x1 +#define QIB_7220_ErrMask_SendSpecialTriggerErrMask_LSB 0x1B +#define QIB_7220_ErrMask_SendSpecialTriggerErrMask_RMASK 0x1 +#define QIB_7220_ErrMask_Reserved1_LSB 0x12 +#define QIB_7220_ErrMask_Reserved1_RMASK 0x1FF +#define QIB_7220_ErrMask_RcvIBLostLinkErrMask_LSB 0x11 +#define QIB_7220_ErrMask_RcvIBLostLinkErrMask_RMASK 0x1 +#define QIB_7220_ErrMask_RcvHdrErrMask_LSB 0x10 +#define QIB_7220_ErrMask_RcvHdrErrMask_RMASK 0x1 +#define QIB_7220_ErrMask_RcvHdrLenErrMask_LSB 0xF +#define QIB_7220_ErrMask_RcvHdrLenErrMask_RMASK 0x1 +#define QIB_7220_ErrMask_RcvBadTidErrMask_LSB 0xE +#define QIB_7220_ErrMask_RcvBadTidErrMask_RMASK 0x1 +#define QIB_7220_ErrMask_RcvHdrFullErrMask_LSB 0xD +#define QIB_7220_ErrMask_RcvHdrFullErrMask_RMASK 0x1 +#define QIB_7220_ErrMask_RcvEgrFullErrMask_LSB 0xC +#define QIB_7220_ErrMask_RcvEgrFullErrMask_RMASK 0x1 +#define QIB_7220_ErrMask_RcvBadVersionErrMask_LSB 0xB +#define QIB_7220_ErrMask_RcvBadVersionErrMask_RMASK 0x1 +#define QIB_7220_ErrMask_RcvIBFlowErrMask_LSB 0xA +#define QIB_7220_ErrMask_RcvIBFlowErrMask_RMASK 0x1 +#define QIB_7220_ErrMask_RcvEBPErrMask_LSB 0x9 +#define QIB_7220_ErrMask_RcvEBPErrMask_RMASK 0x1 +#define QIB_7220_ErrMask_RcvUnsupportedVLErrMask_LSB 0x8 +#define QIB_7220_ErrMask_RcvUnsupportedVLErrMask_RMASK 0x1 +#define QIB_7220_ErrMask_RcvUnexpectedCharErrMask_LSB 0x7 +#define QIB_7220_ErrMask_RcvUnexpectedCharErrMask_RMASK 0x1 +#define QIB_7220_ErrMask_RcvShortPktLenErrMask_LSB 0x6 +#define QIB_7220_ErrMask_RcvShortPktLenErrMask_RMASK 0x1 +#define QIB_7220_ErrMask_RcvLongPktLenErrMask_LSB 0x5 +#define QIB_7220_ErrMask_RcvLongPktLenErrMask_RMASK 0x1 +#define QIB_7220_ErrMask_RcvMaxPktLenErrMask_LSB 0x4 +#define QIB_7220_ErrMask_RcvMaxPktLenErrMask_RMASK 0x1 +#define QIB_7220_ErrMask_RcvMinPktLenErrMask_LSB 0x3 +#define QIB_7220_ErrMask_RcvMinPktLenErrMask_RMASK 0x1 +#define QIB_7220_ErrMask_RcvICRCErrMask_LSB 0x2 +#define QIB_7220_ErrMask_RcvICRCErrMask_RMASK 0x1 +#define QIB_7220_ErrMask_RcvVCRCErrMask_LSB 0x1 +#define QIB_7220_ErrMask_RcvVCRCErrMask_RMASK 0x1 +#define QIB_7220_ErrMask_RcvFormatErrMask_LSB 0x0 +#define QIB_7220_ErrMask_RcvFormatErrMask_RMASK 0x1 + +#define QIB_7220_ErrStatus_OFFS 0x88 +#define QIB_7220_ErrStatus_Reserved_LSB 0x36 +#define QIB_7220_ErrStatus_Reserved_RMASK 0x3FF +#define QIB_7220_ErrStatus_InvalidEEPCmdErr_LSB 0x35 +#define QIB_7220_ErrStatus_InvalidEEPCmdErr_RMASK 0x1 +#define QIB_7220_ErrStatus_SDmaDescAddrMisalignErr_LSB 0x34 +#define QIB_7220_ErrStatus_SDmaDescAddrMisalignErr_RMASK 0x1 +#define QIB_7220_ErrStatus_HardwareErr_LSB 0x33 +#define QIB_7220_ErrStatus_HardwareErr_RMASK 0x1 +#define QIB_7220_ErrStatus_ResetNegated_LSB 0x32 +#define QIB_7220_ErrStatus_ResetNegated_RMASK 0x1 +#define QIB_7220_ErrStatus_InvalidAddrErr_LSB 0x31 +#define QIB_7220_ErrStatus_InvalidAddrErr_RMASK 0x1 +#define QIB_7220_ErrStatus_IBStatusChanged_LSB 0x30 +#define 
QIB_7220_ErrStatus_IBStatusChanged_RMASK 0x1 +#define QIB_7220_ErrStatus_SDmaUnexpDataErr_LSB 0x2F +#define QIB_7220_ErrStatus_SDmaUnexpDataErr_RMASK 0x1 +#define QIB_7220_ErrStatus_SDmaMissingDwErr_LSB 0x2E +#define QIB_7220_ErrStatus_SDmaMissingDwErr_RMASK 0x1 +#define QIB_7220_ErrStatus_SDmaDwEnErr_LSB 0x2D +#define QIB_7220_ErrStatus_SDmaDwEnErr_RMASK 0x1 +#define QIB_7220_ErrStatus_SDmaRpyTagErr_LSB 0x2C +#define QIB_7220_ErrStatus_SDmaRpyTagErr_RMASK 0x1 +#define QIB_7220_ErrStatus_SDma1stDescErr_LSB 0x2B +#define QIB_7220_ErrStatus_SDma1stDescErr_RMASK 0x1 +#define QIB_7220_ErrStatus_SDmaBaseErr_LSB 0x2A +#define QIB_7220_ErrStatus_SDmaBaseErr_RMASK 0x1 +#define QIB_7220_ErrStatus_SDmaTailOutOfBoundErr_LSB 0x29 +#define QIB_7220_ErrStatus_SDmaTailOutOfBoundErr_RMASK 0x1 +#define QIB_7220_ErrStatus_SDmaOutOfBoundErr_LSB 0x28 +#define QIB_7220_ErrStatus_SDmaOutOfBoundErr_RMASK 0x1 +#define QIB_7220_ErrStatus_SDmaGenMismatchErr_LSB 0x27 +#define QIB_7220_ErrStatus_SDmaGenMismatchErr_RMASK 0x1 +#define QIB_7220_ErrStatus_SendBufMisuseErr_LSB 0x26 +#define QIB_7220_ErrStatus_SendBufMisuseErr_RMASK 0x1 +#define QIB_7220_ErrStatus_SendUnsupportedVLErr_LSB 0x25 +#define QIB_7220_ErrStatus_SendUnsupportedVLErr_RMASK 0x1 +#define QIB_7220_ErrStatus_SendUnexpectedPktNumErr_LSB 0x24 +#define QIB_7220_ErrStatus_SendUnexpectedPktNumErr_RMASK 0x1 +#define QIB_7220_ErrStatus_SendPioArmLaunchErr_LSB 0x23 +#define QIB_7220_ErrStatus_SendPioArmLaunchErr_RMASK 0x1 +#define QIB_7220_ErrStatus_SendDroppedDataPktErr_LSB 0x22 +#define QIB_7220_ErrStatus_SendDroppedDataPktErr_RMASK 0x1 +#define QIB_7220_ErrStatus_SendDroppedSmpPktErr_LSB 0x21 +#define QIB_7220_ErrStatus_SendDroppedSmpPktErr_RMASK 0x1 +#define QIB_7220_ErrStatus_SendPktLenErr_LSB 0x20 +#define QIB_7220_ErrStatus_SendPktLenErr_RMASK 0x1 +#define QIB_7220_ErrStatus_SendUnderRunErr_LSB 0x1F +#define QIB_7220_ErrStatus_SendUnderRunErr_RMASK 0x1 +#define QIB_7220_ErrStatus_SendMaxPktLenErr_LSB 0x1E +#define QIB_7220_ErrStatus_SendMaxPktLenErr_RMASK 0x1 +#define QIB_7220_ErrStatus_SendMinPktLenErr_LSB 0x1D +#define QIB_7220_ErrStatus_SendMinPktLenErr_RMASK 0x1 +#define QIB_7220_ErrStatus_SDmaDisabledErr_LSB 0x1C +#define QIB_7220_ErrStatus_SDmaDisabledErr_RMASK 0x1 +#define QIB_7220_ErrStatus_SendSpecialTriggerErr_LSB 0x1B +#define QIB_7220_ErrStatus_SendSpecialTriggerErr_RMASK 0x1 +#define QIB_7220_ErrStatus_Reserved1_LSB 0x12 +#define QIB_7220_ErrStatus_Reserved1_RMASK 0x1FF +#define QIB_7220_ErrStatus_RcvIBLostLinkErr_LSB 0x11 +#define QIB_7220_ErrStatus_RcvIBLostLinkErr_RMASK 0x1 +#define QIB_7220_ErrStatus_RcvHdrErr_LSB 0x10 +#define QIB_7220_ErrStatus_RcvHdrErr_RMASK 0x1 +#define QIB_7220_ErrStatus_RcvHdrLenErr_LSB 0xF +#define QIB_7220_ErrStatus_RcvHdrLenErr_RMASK 0x1 +#define QIB_7220_ErrStatus_RcvBadTidErr_LSB 0xE +#define QIB_7220_ErrStatus_RcvBadTidErr_RMASK 0x1 +#define QIB_7220_ErrStatus_RcvHdrFullErr_LSB 0xD +#define QIB_7220_ErrStatus_RcvHdrFullErr_RMASK 0x1 +#define QIB_7220_ErrStatus_RcvEgrFullErr_LSB 0xC +#define QIB_7220_ErrStatus_RcvEgrFullErr_RMASK 0x1 +#define QIB_7220_ErrStatus_RcvBadVersionErr_LSB 0xB +#define QIB_7220_ErrStatus_RcvBadVersionErr_RMASK 0x1 +#define QIB_7220_ErrStatus_RcvIBFlowErr_LSB 0xA +#define QIB_7220_ErrStatus_RcvIBFlowErr_RMASK 0x1 +#define QIB_7220_ErrStatus_RcvEBPErr_LSB 0x9 +#define QIB_7220_ErrStatus_RcvEBPErr_RMASK 0x1 +#define QIB_7220_ErrStatus_RcvUnsupportedVLErr_LSB 0x8 +#define QIB_7220_ErrStatus_RcvUnsupportedVLErr_RMASK 0x1 +#define QIB_7220_ErrStatus_RcvUnexpectedCharErr_LSB 0x7 +#define 
QIB_7220_ErrStatus_RcvUnexpectedCharErr_RMASK 0x1 +#define QIB_7220_ErrStatus_RcvShortPktLenErr_LSB 0x6 +#define QIB_7220_ErrStatus_RcvShortPktLenErr_RMASK 0x1 +#define QIB_7220_ErrStatus_RcvLongPktLenErr_LSB 0x5 +#define QIB_7220_ErrStatus_RcvLongPktLenErr_RMASK 0x1 +#define QIB_7220_ErrStatus_RcvMaxPktLenErr_LSB 0x4 +#define QIB_7220_ErrStatus_RcvMaxPktLenErr_RMASK 0x1 +#define QIB_7220_ErrStatus_RcvMinPktLenErr_LSB 0x3 +#define QIB_7220_ErrStatus_RcvMinPktLenErr_RMASK 0x1 +#define QIB_7220_ErrStatus_RcvICRCErr_LSB 0x2 +#define QIB_7220_ErrStatus_RcvICRCErr_RMASK 0x1 +#define QIB_7220_ErrStatus_RcvVCRCErr_LSB 0x1 +#define QIB_7220_ErrStatus_RcvVCRCErr_RMASK 0x1 +#define QIB_7220_ErrStatus_RcvFormatErr_LSB 0x0 +#define QIB_7220_ErrStatus_RcvFormatErr_RMASK 0x1 + +#define QIB_7220_ErrClear_OFFS 0x90 +#define QIB_7220_ErrClear_Reserved_LSB 0x36 +#define QIB_7220_ErrClear_Reserved_RMASK 0x3FF +#define QIB_7220_ErrClear_InvalidEEPCmdErrClear_LSB 0x35 +#define QIB_7220_ErrClear_InvalidEEPCmdErrClear_RMASK 0x1 +#define QIB_7220_ErrClear_SDmaDescAddrMisalignErrClear_LSB 0x34 +#define QIB_7220_ErrClear_SDmaDescAddrMisalignErrClear_RMASK 0x1 +#define QIB_7220_ErrClear_HardwareErrClear_LSB 0x33 +#define QIB_7220_ErrClear_HardwareErrClear_RMASK 0x1 +#define QIB_7220_ErrClear_ResetNegatedClear_LSB 0x32 +#define QIB_7220_ErrClear_ResetNegatedClear_RMASK 0x1 +#define QIB_7220_ErrClear_InvalidAddrErrClear_LSB 0x31 +#define QIB_7220_ErrClear_InvalidAddrErrClear_RMASK 0x1 +#define QIB_7220_ErrClear_IBStatusChangedClear_LSB 0x30 +#define QIB_7220_ErrClear_IBStatusChangedClear_RMASK 0x1 +#define QIB_7220_ErrClear_SDmaUnexpDataErrClear_LSB 0x2F +#define QIB_7220_ErrClear_SDmaUnexpDataErrClear_RMASK 0x1 +#define QIB_7220_ErrClear_SDmaMissingDwErrClear_LSB 0x2E +#define QIB_7220_ErrClear_SDmaMissingDwErrClear_RMASK 0x1 +#define QIB_7220_ErrClear_SDmaDwEnErrClear_LSB 0x2D +#define QIB_7220_ErrClear_SDmaDwEnErrClear_RMASK 0x1 +#define QIB_7220_ErrClear_SDmaRpyTagErrClear_LSB 0x2C +#define QIB_7220_ErrClear_SDmaRpyTagErrClear_RMASK 0x1 +#define QIB_7220_ErrClear_SDma1stDescErrClear_LSB 0x2B +#define QIB_7220_ErrClear_SDma1stDescErrClear_RMASK 0x1 +#define QIB_7220_ErrClear_SDmaBaseErrClear_LSB 0x2A +#define QIB_7220_ErrClear_SDmaBaseErrClear_RMASK 0x1 +#define QIB_7220_ErrClear_SDmaTailOutOfBoundErrClear_LSB 0x29 +#define QIB_7220_ErrClear_SDmaTailOutOfBoundErrClear_RMASK 0x1 +#define QIB_7220_ErrClear_SDmaOutOfBoundErrClear_LSB 0x28 +#define QIB_7220_ErrClear_SDmaOutOfBoundErrClear_RMASK 0x1 +#define QIB_7220_ErrClear_SDmaGenMismatchErrClear_LSB 0x27 +#define QIB_7220_ErrClear_SDmaGenMismatchErrClear_RMASK 0x1 +#define QIB_7220_ErrClear_SendBufMisuseErrClear_LSB 0x26 +#define QIB_7220_ErrClear_SendBufMisuseErrClear_RMASK 0x1 +#define QIB_7220_ErrClear_SendUnsupportedVLErrClear_LSB 0x25 +#define QIB_7220_ErrClear_SendUnsupportedVLErrClear_RMASK 0x1 +#define QIB_7220_ErrClear_SendUnexpectedPktNumErrClear_LSB 0x24 +#define QIB_7220_ErrClear_SendUnexpectedPktNumErrClear_RMASK 0x1 +#define QIB_7220_ErrClear_SendPioArmLaunchErrClear_LSB 0x23 +#define QIB_7220_ErrClear_SendPioArmLaunchErrClear_RMASK 0x1 +#define QIB_7220_ErrClear_SendDroppedDataPktErrClear_LSB 0x22 +#define QIB_7220_ErrClear_SendDroppedDataPktErrClear_RMASK 0x1 +#define QIB_7220_ErrClear_SendDroppedSmpPktErrClear_LSB 0x21 +#define QIB_7220_ErrClear_SendDroppedSmpPktErrClear_RMASK 0x1 +#define QIB_7220_ErrClear_SendPktLenErrClear_LSB 0x20 +#define QIB_7220_ErrClear_SendPktLenErrClear_RMASK 0x1 +#define QIB_7220_ErrClear_SendUnderRunErrClear_LSB 0x1F 
+#define QIB_7220_ErrClear_SendUnderRunErrClear_RMASK 0x1 +#define QIB_7220_ErrClear_SendMaxPktLenErrClear_LSB 0x1E +#define QIB_7220_ErrClear_SendMaxPktLenErrClear_RMASK 0x1 +#define QIB_7220_ErrClear_SendMinPktLenErrClear_LSB 0x1D +#define QIB_7220_ErrClear_SendMinPktLenErrClear_RMASK 0x1 +#define QIB_7220_ErrClear_SDmaDisabledErrClear_LSB 0x1C +#define QIB_7220_ErrClear_SDmaDisabledErrClear_RMASK 0x1 +#define QIB_7220_ErrClear_SendSpecialTriggerErrClear_LSB 0x1B +#define QIB_7220_ErrClear_SendSpecialTriggerErrClear_RMASK 0x1 +#define QIB_7220_ErrClear_Reserved1_LSB 0x12 +#define QIB_7220_ErrClear_Reserved1_RMASK 0x1FF +#define QIB_7220_ErrClear_RcvIBLostLinkErrClear_LSB 0x11 +#define QIB_7220_ErrClear_RcvIBLostLinkErrClear_RMASK 0x1 +#define QIB_7220_ErrClear_RcvHdrErrClear_LSB 0x10 +#define QIB_7220_ErrClear_RcvHdrErrClear_RMASK 0x1 +#define QIB_7220_ErrClear_RcvHdrLenErrClear_LSB 0xF +#define QIB_7220_ErrClear_RcvHdrLenErrClear_RMASK 0x1 +#define QIB_7220_ErrClear_RcvBadTidErrClear_LSB 0xE +#define QIB_7220_ErrClear_RcvBadTidErrClear_RMASK 0x1 +#define QIB_7220_ErrClear_RcvHdrFullErrClear_LSB 0xD +#define QIB_7220_ErrClear_RcvHdrFullErrClear_RMASK 0x1 +#define QIB_7220_ErrClear_RcvEgrFullErrClear_LSB 0xC +#define QIB_7220_ErrClear_RcvEgrFullErrClear_RMASK 0x1 +#define QIB_7220_ErrClear_RcvBadVersionErrClear_LSB 0xB +#define QIB_7220_ErrClear_RcvBadVersionErrClear_RMASK 0x1 +#define QIB_7220_ErrClear_RcvIBFlowErrClear_LSB 0xA +#define QIB_7220_ErrClear_RcvIBFlowErrClear_RMASK 0x1 +#define QIB_7220_ErrClear_RcvEBPErrClear_LSB 0x9 +#define QIB_7220_ErrClear_RcvEBPErrClear_RMASK 0x1 +#define QIB_7220_ErrClear_RcvUnsupportedVLErrClear_LSB 0x8 +#define QIB_7220_ErrClear_RcvUnsupportedVLErrClear_RMASK 0x1 +#define QIB_7220_ErrClear_RcvUnexpectedCharErrClear_LSB 0x7 +#define QIB_7220_ErrClear_RcvUnexpectedCharErrClear_RMASK 0x1 +#define QIB_7220_ErrClear_RcvShortPktLenErrClear_LSB 0x6 +#define QIB_7220_ErrClear_RcvShortPktLenErrClear_RMASK 0x1 +#define QIB_7220_ErrClear_RcvLongPktLenErrClear_LSB 0x5 +#define QIB_7220_ErrClear_RcvLongPktLenErrClear_RMASK 0x1 +#define QIB_7220_ErrClear_RcvMaxPktLenErrClear_LSB 0x4 +#define QIB_7220_ErrClear_RcvMaxPktLenErrClear_RMASK 0x1 +#define QIB_7220_ErrClear_RcvMinPktLenErrClear_LSB 0x3 +#define QIB_7220_ErrClear_RcvMinPktLenErrClear_RMASK 0x1 +#define QIB_7220_ErrClear_RcvICRCErrClear_LSB 0x2 +#define QIB_7220_ErrClear_RcvICRCErrClear_RMASK 0x1 +#define QIB_7220_ErrClear_RcvVCRCErrClear_LSB 0x1 +#define QIB_7220_ErrClear_RcvVCRCErrClear_RMASK 0x1 +#define QIB_7220_ErrClear_RcvFormatErrClear_LSB 0x0 +#define QIB_7220_ErrClear_RcvFormatErrClear_RMASK 0x1 + +#define QIB_7220_HwErrMask_OFFS 0x98 +#define QIB_7220_HwErrMask_IBCBusFromSPCParityErrMask_LSB 0x3F +#define QIB_7220_HwErrMask_IBCBusFromSPCParityErrMask_RMASK 0x1 +#define QIB_7220_HwErrMask_IBCBusToSPCParityErrMask_LSB 0x3E +#define QIB_7220_HwErrMask_IBCBusToSPCParityErrMask_RMASK 0x1 +#define QIB_7220_HwErrMask_Clk_uC_PLLNotLockedMask_LSB 0x3D +#define QIB_7220_HwErrMask_Clk_uC_PLLNotLockedMask_RMASK 0x1 +#define QIB_7220_HwErrMask_IBSerdesPClkNotDetectMask_LSB 0x3C +#define QIB_7220_HwErrMask_IBSerdesPClkNotDetectMask_RMASK 0x1 +#define QIB_7220_HwErrMask_PCIESerdesQ3PClkNotDetectMask_LSB 0x3B +#define QIB_7220_HwErrMask_PCIESerdesQ3PClkNotDetectMask_RMASK 0x1 +#define QIB_7220_HwErrMask_PCIESerdesQ2PClkNotDetectMask_LSB 0x3A +#define QIB_7220_HwErrMask_PCIESerdesQ2PClkNotDetectMask_RMASK 0x1 +#define QIB_7220_HwErrMask_PCIESerdesQ1PClkNotDetectMask_LSB 0x39 +#define 
QIB_7220_HwErrMask_PCIESerdesQ1PClkNotDetectMask_RMASK 0x1 +#define QIB_7220_HwErrMask_PCIESerdesQ0PClkNotDetectMask_LSB 0x38 +#define QIB_7220_HwErrMask_PCIESerdesQ0PClkNotDetectMask_RMASK 0x1 +#define QIB_7220_HwErrMask_Reserved_LSB 0x37 +#define QIB_7220_HwErrMask_Reserved_RMASK 0x1 +#define QIB_7220_HwErrMask_PowerOnBISTFailedMask_LSB 0x36 +#define QIB_7220_HwErrMask_PowerOnBISTFailedMask_RMASK 0x1 +#define QIB_7220_HwErrMask_Reserved1_LSB 0x33 +#define QIB_7220_HwErrMask_Reserved1_RMASK 0x7 +#define QIB_7220_HwErrMask_RXEMemParityErrMask_LSB 0x2C +#define QIB_7220_HwErrMask_RXEMemParityErrMask_RMASK 0x7F +#define QIB_7220_HwErrMask_TXEMemParityErrMask_LSB 0x28 +#define QIB_7220_HwErrMask_TXEMemParityErrMask_RMASK 0xF +#define QIB_7220_HwErrMask_DDSRXEQMemoryParityErrMask_LSB 0x27 +#define QIB_7220_HwErrMask_DDSRXEQMemoryParityErrMask_RMASK 0x1 +#define QIB_7220_HwErrMask_IB_uC_MemoryParityErrMask_LSB 0x26 +#define QIB_7220_HwErrMask_IB_uC_MemoryParityErrMask_RMASK 0x1 +#define QIB_7220_HwErrMask_PCIEOct1_uC_MemoryParityErrMask_LSB 0x25 +#define QIB_7220_HwErrMask_PCIEOct1_uC_MemoryParityErrMask_RMASK 0x1 +#define QIB_7220_HwErrMask_PCIEOct0_uC_MemoryParityErrMask_LSB 0x24 +#define QIB_7220_HwErrMask_PCIEOct0_uC_MemoryParityErrMask_RMASK 0x1 +#define QIB_7220_HwErrMask_Reserved2_LSB 0x22 +#define QIB_7220_HwErrMask_Reserved2_RMASK 0x3 +#define QIB_7220_HwErrMask_PCIeBusParityErrMask_LSB 0x1F +#define QIB_7220_HwErrMask_PCIeBusParityErrMask_RMASK 0x7 +#define QIB_7220_HwErrMask_PcieCplTimeoutMask_LSB 0x1E +#define QIB_7220_HwErrMask_PcieCplTimeoutMask_RMASK 0x1 +#define QIB_7220_HwErrMask_PoisonedTLPMask_LSB 0x1D +#define QIB_7220_HwErrMask_PoisonedTLPMask_RMASK 0x1 +#define QIB_7220_HwErrMask_SDmaMemReadErrMask_LSB 0x1C +#define QIB_7220_HwErrMask_SDmaMemReadErrMask_RMASK 0x1 +#define QIB_7220_HwErrMask_Reserved3_LSB 0x8 +#define QIB_7220_HwErrMask_Reserved3_RMASK 0xFFFFF +#define QIB_7220_HwErrMask_PCIeMemParityErrMask_LSB 0x0 +#define QIB_7220_HwErrMask_PCIeMemParityErrMask_RMASK 0xFF + +#define QIB_7220_HwErrStatus_OFFS 0xA0 +#define QIB_7220_HwErrStatus_IBCBusFromSPCParityErr_LSB 0x3F +#define QIB_7220_HwErrStatus_IBCBusFromSPCParityErr_RMASK 0x1 +#define QIB_7220_HwErrStatus_IBCBusToSPCParityErr_LSB 0x3E +#define QIB_7220_HwErrStatus_IBCBusToSPCParityErr_RMASK 0x1 +#define QIB_7220_HwErrStatus_Clk_uC_PLLNotLocked_LSB 0x3D +#define QIB_7220_HwErrStatus_Clk_uC_PLLNotLocked_RMASK 0x1 +#define QIB_7220_HwErrStatus_IBSerdesPClkNotDetect_LSB 0x3C +#define QIB_7220_HwErrStatus_IBSerdesPClkNotDetect_RMASK 0x1 +#define QIB_7220_HwErrStatus_PCIESerdesQ3PClkNotDetect_LSB 0x3B +#define QIB_7220_HwErrStatus_PCIESerdesQ3PClkNotDetect_RMASK 0x1 +#define QIB_7220_HwErrStatus_PCIESerdesQ2PClkNotDetect_LSB 0x3A +#define QIB_7220_HwErrStatus_PCIESerdesQ2PClkNotDetect_RMASK 0x1 +#define QIB_7220_HwErrStatus_PCIESerdesQ1PClkNotDetect_LSB 0x39 +#define QIB_7220_HwErrStatus_PCIESerdesQ1PClkNotDetect_RMASK 0x1 +#define QIB_7220_HwErrStatus_PCIESerdesQ0PClkNotDetect_LSB 0x38 +#define QIB_7220_HwErrStatus_PCIESerdesQ0PClkNotDetect_RMASK 0x1 +#define QIB_7220_HwErrStatus_Reserved_LSB 0x37 +#define QIB_7220_HwErrStatus_Reserved_RMASK 0x1 +#define QIB_7220_HwErrStatus_PowerOnBISTFailed_LSB 0x36 +#define QIB_7220_HwErrStatus_PowerOnBISTFailed_RMASK 0x1 +#define QIB_7220_HwErrStatus_Reserved1_LSB 0x33 +#define QIB_7220_HwErrStatus_Reserved1_RMASK 0x7 +#define QIB_7220_HwErrStatus_RXEMemParity_LSB 0x2C +#define QIB_7220_HwErrStatus_RXEMemParity_RMASK 0x7F +#define QIB_7220_HwErrStatus_TXEMemParity_LSB 0x28 
+#define QIB_7220_HwErrStatus_TXEMemParity_RMASK 0xF +#define QIB_7220_HwErrStatus_DDSRXEQMemoryParityErr_LSB 0x27 +#define QIB_7220_HwErrStatus_DDSRXEQMemoryParityErr_RMASK 0x1 +#define QIB_7220_HwErrStatus_IB_uC_MemoryParityErr_LSB 0x26 +#define QIB_7220_HwErrStatus_IB_uC_MemoryParityErr_RMASK 0x1 +#define QIB_7220_HwErrStatus_PCIE_uC_Oct1MemoryParityErr_LSB 0x25 +#define QIB_7220_HwErrStatus_PCIE_uC_Oct1MemoryParityErr_RMASK 0x1 +#define QIB_7220_HwErrStatus_PCIE_uC_Oct0MemoryParityErr_LSB 0x24 +#define QIB_7220_HwErrStatus_PCIE_uC_Oct0MemoryParityErr_RMASK 0x1 +#define QIB_7220_HwErrStatus_Reserved2_LSB 0x22 +#define QIB_7220_HwErrStatus_Reserved2_RMASK 0x3 +#define QIB_7220_HwErrStatus_PCIeBusParity_LSB 0x1F +#define QIB_7220_HwErrStatus_PCIeBusParity_RMASK 0x7 +#define QIB_7220_HwErrStatus_PcieCplTimeout_LSB 0x1E +#define QIB_7220_HwErrStatus_PcieCplTimeout_RMASK 0x1 +#define QIB_7220_HwErrStatus_PoisenedTLP_LSB 0x1D +#define QIB_7220_HwErrStatus_PoisenedTLP_RMASK 0x1 +#define QIB_7220_HwErrStatus_SDmaMemReadErr_LSB 0x1C +#define QIB_7220_HwErrStatus_SDmaMemReadErr_RMASK 0x1 +#define QIB_7220_HwErrStatus_Reserved3_LSB 0x8 +#define QIB_7220_HwErrStatus_Reserved3_RMASK 0xFFFFF +#define QIB_7220_HwErrStatus_PCIeMemParity_LSB 0x0 +#define QIB_7220_HwErrStatus_PCIeMemParity_RMASK 0xFF + +#define QIB_7220_HwErrClear_OFFS 0xA8 +#define QIB_7220_HwErrClear_IBCBusFromSPCParityErrClear_LSB 0x3F +#define QIB_7220_HwErrClear_IBCBusFromSPCParityErrClear_RMASK 0x1 +#define QIB_7220_HwErrClear_IBCBusToSPCparityErrClear_LSB 0x3E +#define QIB_7220_HwErrClear_IBCBusToSPCparityErrClear_RMASK 0x1 +#define QIB_7220_HwErrClear_Clk_uC_PLLNotLockedClear_LSB 0x3D +#define QIB_7220_HwErrClear_Clk_uC_PLLNotLockedClear_RMASK 0x1 +#define QIB_7220_HwErrClear_IBSerdesPClkNotDetectClear_LSB 0x3C +#define QIB_7220_HwErrClear_IBSerdesPClkNotDetectClear_RMASK 0x1 +#define QIB_7220_HwErrClear_PCIESerdesQ3PClkNotDetectClear_LSB 0x3B +#define QIB_7220_HwErrClear_PCIESerdesQ3PClkNotDetectClear_RMASK 0x1 +#define QIB_7220_HwErrClear_PCIESerdesQ2PClkNotDetectClear_LSB 0x3A +#define QIB_7220_HwErrClear_PCIESerdesQ2PClkNotDetectClear_RMASK 0x1 +#define QIB_7220_HwErrClear_PCIESerdesQ1PClkNotDetectClear_LSB 0x39 +#define QIB_7220_HwErrClear_PCIESerdesQ1PClkNotDetectClear_RMASK 0x1 +#define QIB_7220_HwErrClear_PCIESerdesQ0PClkNotDetectClear_LSB 0x38 +#define QIB_7220_HwErrClear_PCIESerdesQ0PClkNotDetectClear_RMASK 0x1 +#define QIB_7220_HwErrClear_Reserved_LSB 0x37 +#define QIB_7220_HwErrClear_Reserved_RMASK 0x1 +#define QIB_7220_HwErrClear_PowerOnBISTFailedClear_LSB 0x36 +#define QIB_7220_HwErrClear_PowerOnBISTFailedClear_RMASK 0x1 +#define QIB_7220_HwErrClear_Reserved1_LSB 0x33 +#define QIB_7220_HwErrClear_Reserved1_RMASK 0x7 +#define QIB_7220_HwErrClear_RXEMemParityClear_LSB 0x2C +#define QIB_7220_HwErrClear_RXEMemParityClear_RMASK 0x7F +#define QIB_7220_HwErrClear_TXEMemParityClear_LSB 0x28 +#define QIB_7220_HwErrClear_TXEMemParityClear_RMASK 0xF +#define QIB_7220_HwErrClear_DDSRXEQMemoryParityErrClear_LSB 0x27 +#define QIB_7220_HwErrClear_DDSRXEQMemoryParityErrClear_RMASK 0x1 +#define QIB_7220_HwErrClear_IB_uC_MemoryParityErrClear_LSB 0x26 +#define QIB_7220_HwErrClear_IB_uC_MemoryParityErrClear_RMASK 0x1 +#define QIB_7220_HwErrClear_PCIE_uC_Oct1MemoryParityErrClear_LSB 0x25 +#define QIB_7220_HwErrClear_PCIE_uC_Oct1MemoryParityErrClear_RMASK 0x1 +#define QIB_7220_HwErrClear_PCIE_uC_Oct0MemoryParityErrClear_LSB 0x24 +#define QIB_7220_HwErrClear_PCIE_uC_Oct0MemoryParityErrClear_RMASK 0x1 +#define 
QIB_7220_HwErrClear_Reserved2_LSB 0x22 +#define QIB_7220_HwErrClear_Reserved2_RMASK 0x3 +#define QIB_7220_HwErrClear_PCIeBusParityClr_LSB 0x1F +#define QIB_7220_HwErrClear_PCIeBusParityClr_RMASK 0x7 +#define QIB_7220_HwErrClear_PcieCplTimeoutClear_LSB 0x1E +#define QIB_7220_HwErrClear_PcieCplTimeoutClear_RMASK 0x1 +#define QIB_7220_HwErrClear_PoisonedTLPClear_LSB 0x1D +#define QIB_7220_HwErrClear_PoisonedTLPClear_RMASK 0x1 +#define QIB_7220_HwErrClear_SDmaMemReadErrClear_LSB 0x1C +#define QIB_7220_HwErrClear_SDmaMemReadErrClear_RMASK 0x1 +#define QIB_7220_HwErrClear_Reserved3_LSB 0x8 +#define QIB_7220_HwErrClear_Reserved3_RMASK 0xFFFFF +#define QIB_7220_HwErrClear_PCIeMemParityClr_LSB 0x0 +#define QIB_7220_HwErrClear_PCIeMemParityClr_RMASK 0xFF + +#define QIB_7220_HwDiagCtrl_OFFS 0xB0 +#define QIB_7220_HwDiagCtrl_ForceIBCBusFromSPCParityErr_LSB 0x3F +#define QIB_7220_HwDiagCtrl_ForceIBCBusFromSPCParityErr_RMASK 0x1 +#define QIB_7220_HwDiagCtrl_ForceIBCBusToSPCParityErr_LSB 0x3E +#define QIB_7220_HwDiagCtrl_ForceIBCBusToSPCParityErr_RMASK 0x1 +#define QIB_7220_HwDiagCtrl_CounterWrEnable_LSB 0x3D +#define QIB_7220_HwDiagCtrl_CounterWrEnable_RMASK 0x1 +#define QIB_7220_HwDiagCtrl_CounterDisable_LSB 0x3C +#define QIB_7220_HwDiagCtrl_CounterDisable_RMASK 0x1 +#define QIB_7220_HwDiagCtrl_Reserved_LSB 0x33 +#define QIB_7220_HwDiagCtrl_Reserved_RMASK 0x1FF +#define QIB_7220_HwDiagCtrl_ForceRxMemParityErr_LSB 0x2C +#define QIB_7220_HwDiagCtrl_ForceRxMemParityErr_RMASK 0x7F +#define QIB_7220_HwDiagCtrl_ForceTxMemparityErr_LSB 0x28 +#define QIB_7220_HwDiagCtrl_ForceTxMemparityErr_RMASK 0xF +#define QIB_7220_HwDiagCtrl_ForceDDSRXEQMemoryParityErr_LSB 0x27 +#define QIB_7220_HwDiagCtrl_ForceDDSRXEQMemoryParityErr_RMASK 0x1 +#define QIB_7220_HwDiagCtrl_ForceIB_uC_MemoryParityErr_LSB 0x26 +#define QIB_7220_HwDiagCtrl_ForceIB_uC_MemoryParityErr_RMASK 0x1 +#define QIB_7220_HwDiagCtrl_ForcePCIE_uC_Oct1MemoryParityErr_LSB 0x25 +#define QIB_7220_HwDiagCtrl_ForcePCIE_uC_Oct1MemoryParityErr_RMASK 0x1 +#define QIB_7220_HwDiagCtrl_ForcePCIE_uC_Oct0MemoryParityErr_LSB 0x24 +#define QIB_7220_HwDiagCtrl_ForcePCIE_uC_Oct0MemoryParityErr_RMASK 0x1 +#define QIB_7220_HwDiagCtrl_Reserved1_LSB 0x23 +#define QIB_7220_HwDiagCtrl_Reserved1_RMASK 0x1 +#define QIB_7220_HwDiagCtrl_forcePCIeBusParity_LSB 0x1F +#define QIB_7220_HwDiagCtrl_forcePCIeBusParity_RMASK 0xF +#define QIB_7220_HwDiagCtrl_Reserved2_LSB 0x8 +#define QIB_7220_HwDiagCtrl_Reserved2_RMASK 0x7FFFFF +#define QIB_7220_HwDiagCtrl_forcePCIeMemParity_LSB 0x0 +#define QIB_7220_HwDiagCtrl_forcePCIeMemParity_RMASK 0xFF + +#define QIB_7220_REG_0000B8_OFFS 0xB8 + +#define QIB_7220_IBCStatus_OFFS 0xC0 +#define QIB_7220_IBCStatus_TxCreditOk_LSB 0x1F +#define QIB_7220_IBCStatus_TxCreditOk_RMASK 0x1 +#define QIB_7220_IBCStatus_TxReady_LSB 0x1E +#define QIB_7220_IBCStatus_TxReady_RMASK 0x1 +#define QIB_7220_IBCStatus_Reserved_LSB 0xE +#define QIB_7220_IBCStatus_Reserved_RMASK 0xFFFF +#define QIB_7220_IBCStatus_IBTxLaneReversed_LSB 0xD +#define QIB_7220_IBCStatus_IBTxLaneReversed_RMASK 0x1 +#define QIB_7220_IBCStatus_IBRxLaneReversed_LSB 0xC +#define QIB_7220_IBCStatus_IBRxLaneReversed_RMASK 0x1 +#define QIB_7220_IBCStatus_IB_SERDES_TRIM_DONE_LSB 0xB +#define QIB_7220_IBCStatus_IB_SERDES_TRIM_DONE_RMASK 0x1 +#define QIB_7220_IBCStatus_DDS_RXEQ_FAIL_LSB 0xA +#define QIB_7220_IBCStatus_DDS_RXEQ_FAIL_RMASK 0x1 +#define QIB_7220_IBCStatus_LinkWidthActive_LSB 0x9 +#define QIB_7220_IBCStatus_LinkWidthActive_RMASK 0x1 +#define QIB_7220_IBCStatus_LinkSpeedActive_LSB 0x8 +#define 
QIB_7220_IBCStatus_LinkSpeedActive_RMASK 0x1 +#define QIB_7220_IBCStatus_LinkState_LSB 0x5 +#define QIB_7220_IBCStatus_LinkState_RMASK 0x7 +#define QIB_7220_IBCStatus_LinkTrainingState_LSB 0x0 +#define QIB_7220_IBCStatus_LinkTrainingState_RMASK 0x1F + +#define QIB_7220_IBCCtrl_OFFS 0xC8 +#define QIB_7220_IBCCtrl_Loopback_LSB 0x3F +#define QIB_7220_IBCCtrl_Loopback_RMASK 0x1 +#define QIB_7220_IBCCtrl_LinkDownDefaultState_LSB 0x3E +#define QIB_7220_IBCCtrl_LinkDownDefaultState_RMASK 0x1 +#define QIB_7220_IBCCtrl_Reserved_LSB 0x2B +#define QIB_7220_IBCCtrl_Reserved_RMASK 0x7FFFF +#define QIB_7220_IBCCtrl_CreditScale_LSB 0x28 +#define QIB_7220_IBCCtrl_CreditScale_RMASK 0x7 +#define QIB_7220_IBCCtrl_OverrunThreshold_LSB 0x24 +#define QIB_7220_IBCCtrl_OverrunThreshold_RMASK 0xF +#define QIB_7220_IBCCtrl_PhyerrThreshold_LSB 0x20 +#define QIB_7220_IBCCtrl_PhyerrThreshold_RMASK 0xF +#define QIB_7220_IBCCtrl_MaxPktLen_LSB 0x15 +#define QIB_7220_IBCCtrl_MaxPktLen_RMASK 0x7FF +#define QIB_7220_IBCCtrl_LinkCmd_LSB 0x13 +#define QIB_7220_IBCCtrl_LinkCmd_RMASK 0x3 +#define QIB_7220_IBCCtrl_LinkInitCmd_LSB 0x10 +#define QIB_7220_IBCCtrl_LinkInitCmd_RMASK 0x7 +#define QIB_7220_IBCCtrl_FlowCtrlWaterMark_LSB 0x8 +#define QIB_7220_IBCCtrl_FlowCtrlWaterMark_RMASK 0xFF +#define QIB_7220_IBCCtrl_FlowCtrlPeriod_LSB 0x0 +#define QIB_7220_IBCCtrl_FlowCtrlPeriod_RMASK 0xFF + +#define QIB_7220_EXTStatus_OFFS 0xD0 +#define QIB_7220_EXTStatus_GPIOIn_LSB 0x30 +#define QIB_7220_EXTStatus_GPIOIn_RMASK 0xFFFF +#define QIB_7220_EXTStatus_Reserved_LSB 0x20 +#define QIB_7220_EXTStatus_Reserved_RMASK 0xFFFF +#define QIB_7220_EXTStatus_Reserved1_LSB 0x10 +#define QIB_7220_EXTStatus_Reserved1_RMASK 0xFFFF +#define QIB_7220_EXTStatus_MemBISTDisabled_LSB 0xF +#define QIB_7220_EXTStatus_MemBISTDisabled_RMASK 0x1 +#define QIB_7220_EXTStatus_MemBISTEndTest_LSB 0xE +#define QIB_7220_EXTStatus_MemBISTEndTest_RMASK 0x1 +#define QIB_7220_EXTStatus_Reserved2_LSB 0x0 +#define QIB_7220_EXTStatus_Reserved2_RMASK 0x3FFF + +#define QIB_7220_EXTCtrl_OFFS 0xD8 +#define QIB_7220_EXTCtrl_GPIOOe_LSB 0x30 +#define QIB_7220_EXTCtrl_GPIOOe_RMASK 0xFFFF +#define QIB_7220_EXTCtrl_GPIOInvert_LSB 0x20 +#define QIB_7220_EXTCtrl_GPIOInvert_RMASK 0xFFFF +#define QIB_7220_EXTCtrl_Reserved_LSB 0x4 +#define QIB_7220_EXTCtrl_Reserved_RMASK 0xFFFFFFF +#define QIB_7220_EXTCtrl_LEDPriPortGreenOn_LSB 0x3 +#define QIB_7220_EXTCtrl_LEDPriPortGreenOn_RMASK 0x1 +#define QIB_7220_EXTCtrl_LEDPriPortYellowOn_LSB 0x2 +#define QIB_7220_EXTCtrl_LEDPriPortYellowOn_RMASK 0x1 +#define QIB_7220_EXTCtrl_LEDGblOkGreenOn_LSB 0x1 +#define QIB_7220_EXTCtrl_LEDGblOkGreenOn_RMASK 0x1 +#define QIB_7220_EXTCtrl_LEDGblErrRedOff_LSB 0x0 +#define QIB_7220_EXTCtrl_LEDGblErrRedOff_RMASK 0x1 + +#define QIB_7220_GPIOOut_OFFS 0xE0 + +#define QIB_7220_GPIOMask_OFFS 0xE8 + +#define QIB_7220_GPIOStatus_OFFS 0xF0 + +#define QIB_7220_GPIOClear_OFFS 0xF8 + +#define QIB_7220_RcvCtrl_OFFS 0x100 +#define QIB_7220_RcvCtrl_Reserved_LSB 0x27 +#define QIB_7220_RcvCtrl_Reserved_RMASK 0x1FFFFFF +#define QIB_7220_RcvCtrl_RcvQPMapEnable_LSB 0x26 +#define QIB_7220_RcvCtrl_RcvQPMapEnable_RMASK 0x1 +#define QIB_7220_RcvCtrl_PortCfg_LSB 0x24 +#define QIB_7220_RcvCtrl_PortCfg_RMASK 0x3 +#define QIB_7220_RcvCtrl_TailUpd_LSB 0x23 +#define QIB_7220_RcvCtrl_TailUpd_RMASK 0x1 +#define QIB_7220_RcvCtrl_RcvPartitionKeyDisable_LSB 0x22 +#define QIB_7220_RcvCtrl_RcvPartitionKeyDisable_RMASK 0x1 +#define QIB_7220_RcvCtrl_IntrAvail_LSB 0x11 +#define QIB_7220_RcvCtrl_IntrAvail_RMASK 0x1FFFF +#define 
QIB_7220_RcvCtrl_PortEnable_LSB 0x0 +#define QIB_7220_RcvCtrl_PortEnable_RMASK 0x1FFFF + +#define QIB_7220_RcvBTHQP_OFFS 0x108 +#define QIB_7220_RcvBTHQP_Reserved_LSB 0x18 +#define QIB_7220_RcvBTHQP_Reserved_RMASK 0xFF +#define QIB_7220_RcvBTHQP_RcvBTHQP_LSB 0x0 +#define QIB_7220_RcvBTHQP_RcvBTHQP_RMASK 0xFFFFFF + +#define QIB_7220_RcvHdrSize_OFFS 0x110 + +#define QIB_7220_RcvHdrCnt_OFFS 0x118 + +#define QIB_7220_RcvHdrEntSize_OFFS 0x120 + +#define QIB_7220_RcvTIDBase_OFFS 0x128 + +#define QIB_7220_RcvTIDCnt_OFFS 0x130 + +#define QIB_7220_RcvEgrBase_OFFS 0x138 + +#define QIB_7220_RcvEgrCnt_OFFS 0x140 + +#define QIB_7220_RcvBufBase_OFFS 0x148 + +#define QIB_7220_RcvBufSize_OFFS 0x150 + +#define QIB_7220_RxIntMemBase_OFFS 0x158 + +#define QIB_7220_RxIntMemSize_OFFS 0x160 + +#define QIB_7220_RcvPartitionKey_OFFS 0x168 + +#define QIB_7220_RcvQPMulticastPort_OFFS 0x170 +#define QIB_7220_RcvQPMulticastPort_Reserved_LSB 0x5 +#define QIB_7220_RcvQPMulticastPort_Reserved_RMASK 0x7FFFFFFFFFFFFFF +#define QIB_7220_RcvQPMulticastPort_RcvQpMcPort_LSB 0x0 +#define QIB_7220_RcvQPMulticastPort_RcvQpMcPort_RMASK 0x1F + +#define QIB_7220_RcvPktLEDCnt_OFFS 0x178 +#define QIB_7220_RcvPktLEDCnt_ONperiod_LSB 0x20 +#define QIB_7220_RcvPktLEDCnt_ONperiod_RMASK 0xFFFFFFFF +#define QIB_7220_RcvPktLEDCnt_OFFperiod_LSB 0x0 +#define QIB_7220_RcvPktLEDCnt_OFFperiod_RMASK 0xFFFFFFFF + +#define QIB_7220_IBCDDRCtrl_OFFS 0x180 +#define QIB_7220_IBCDDRCtrl_IB_DLID_MASK_LSB 0x30 +#define QIB_7220_IBCDDRCtrl_IB_DLID_MASK_RMASK 0xFFFF +#define QIB_7220_IBCDDRCtrl_IB_DLID_LSB 0x20 +#define QIB_7220_IBCDDRCtrl_IB_DLID_RMASK 0xFFFF +#define QIB_7220_IBCDDRCtrl_Reserved_LSB 0x1B +#define QIB_7220_IBCDDRCtrl_Reserved_RMASK 0x1F +#define QIB_7220_IBCDDRCtrl_HRTBT_REQ_LSB 0x1A +#define QIB_7220_IBCDDRCtrl_HRTBT_REQ_RMASK 0x1 +#define QIB_7220_IBCDDRCtrl_HRTBT_PORT_LSB 0x12 +#define QIB_7220_IBCDDRCtrl_HRTBT_PORT_RMASK 0xFF +#define QIB_7220_IBCDDRCtrl_HRTBT_AUTO_LSB 0x11 +#define QIB_7220_IBCDDRCtrl_HRTBT_AUTO_RMASK 0x1 +#define QIB_7220_IBCDDRCtrl_HRTBT_ENB_LSB 0x10 +#define QIB_7220_IBCDDRCtrl_HRTBT_ENB_RMASK 0x1 +#define QIB_7220_IBCDDRCtrl_SD_DDS_LSB 0xC +#define QIB_7220_IBCDDRCtrl_SD_DDS_RMASK 0xF +#define QIB_7220_IBCDDRCtrl_SD_DDSV_LSB 0xB +#define QIB_7220_IBCDDRCtrl_SD_DDSV_RMASK 0x1 +#define QIB_7220_IBCDDRCtrl_SD_ADD_ENB_LSB 0xA +#define QIB_7220_IBCDDRCtrl_SD_ADD_ENB_RMASK 0x1 +#define QIB_7220_IBCDDRCtrl_SD_RX_EQUAL_ENABLE_LSB 0x9 +#define QIB_7220_IBCDDRCtrl_SD_RX_EQUAL_ENABLE_RMASK 0x1 +#define QIB_7220_IBCDDRCtrl_IB_LANE_REV_SUPPORTED_LSB 0x8 +#define QIB_7220_IBCDDRCtrl_IB_LANE_REV_SUPPORTED_RMASK 0x1 +#define QIB_7220_IBCDDRCtrl_IB_POLARITY_REV_SUPP_LSB 0x7 +#define QIB_7220_IBCDDRCtrl_IB_POLARITY_REV_SUPP_RMASK 0x1 +#define QIB_7220_IBCDDRCtrl_IB_NUM_CHANNELS_LSB 0x5 +#define QIB_7220_IBCDDRCtrl_IB_NUM_CHANNELS_RMASK 0x3 +#define QIB_7220_IBCDDRCtrl_SD_SPEED_QDR_LSB 0x4 +#define QIB_7220_IBCDDRCtrl_SD_SPEED_QDR_RMASK 0x1 +#define QIB_7220_IBCDDRCtrl_SD_SPEED_DDR_LSB 0x3 +#define QIB_7220_IBCDDRCtrl_SD_SPEED_DDR_RMASK 0x1 +#define QIB_7220_IBCDDRCtrl_SD_SPEED_SDR_LSB 0x2 +#define QIB_7220_IBCDDRCtrl_SD_SPEED_SDR_RMASK 0x1 +#define QIB_7220_IBCDDRCtrl_SD_SPEED_LSB 0x1 +#define QIB_7220_IBCDDRCtrl_SD_SPEED_RMASK 0x1 +#define QIB_7220_IBCDDRCtrl_IB_ENHANCED_MODE_LSB 0x0 +#define QIB_7220_IBCDDRCtrl_IB_ENHANCED_MODE_RMASK 0x1 + +#define QIB_7220_HRTBT_GUID_OFFS 0x188 + +#define QIB_7220_IBCDDRCtrl2_OFFS 0x1A0 +#define QIB_7220_IBCDDRCtrl2_IB_BACK_PORCH_LSB 0x5 +#define QIB_7220_IBCDDRCtrl2_IB_BACK_PORCH_RMASK 0x1F 
+#define QIB_7220_IBCDDRCtrl2_IB_FRONT_PORCH_LSB 0x0 +#define QIB_7220_IBCDDRCtrl2_IB_FRONT_PORCH_RMASK 0x1F + +#define QIB_7220_IBCDDRStatus_OFFS 0x1A8 +#define QIB_7220_IBCDDRStatus_heartbeat_timed_out_LSB 0x24 +#define QIB_7220_IBCDDRStatus_heartbeat_timed_out_RMASK 0x1 +#define QIB_7220_IBCDDRStatus_heartbeat_crosstalk_LSB 0x20 +#define QIB_7220_IBCDDRStatus_heartbeat_crosstalk_RMASK 0xF +#define QIB_7220_IBCDDRStatus_RxEqLocalDevice_LSB 0x1E +#define QIB_7220_IBCDDRStatus_RxEqLocalDevice_RMASK 0x3 +#define QIB_7220_IBCDDRStatus_ReqDDSLocalFromRmt_LSB 0x1A +#define QIB_7220_IBCDDRStatus_ReqDDSLocalFromRmt_RMASK 0xF +#define QIB_7220_IBCDDRStatus_LinkRoundTripLatency_LSB 0x0 +#define QIB_7220_IBCDDRStatus_LinkRoundTripLatency_RMASK 0x3FFFFFF + +#define QIB_7220_JIntReload_OFFS 0x1B0 +#define QIB_7220_JIntReload_J_limit_reload_LSB 0x10 +#define QIB_7220_JIntReload_J_limit_reload_RMASK 0xFFFF +#define QIB_7220_JIntReload_J_reload_LSB 0x0 +#define QIB_7220_JIntReload_J_reload_RMASK 0xFFFF + +#define QIB_7220_IBNCModeCtrl_OFFS 0x1B8 +#define QIB_7220_IBNCModeCtrl_Reserved_LSB 0x1A +#define QIB_7220_IBNCModeCtrl_Reserved_RMASK 0x3FFFFFFFFF +#define QIB_7220_IBNCModeCtrl_TSMCode_TS2_LSB 0x11 +#define QIB_7220_IBNCModeCtrl_TSMCode_TS2_RMASK 0x1FF +#define QIB_7220_IBNCModeCtrl_TSMCode_TS1_LSB 0x8 +#define QIB_7220_IBNCModeCtrl_TSMCode_TS1_RMASK 0x1FF +#define QIB_7220_IBNCModeCtrl_Reserved1_LSB 0x3 +#define QIB_7220_IBNCModeCtrl_Reserved1_RMASK 0x1F +#define QIB_7220_IBNCModeCtrl_TSMEnable_ignore_TSM_on_rx_LSB 0x2 +#define QIB_7220_IBNCModeCtrl_TSMEnable_ignore_TSM_on_rx_RMASK 0x1 +#define QIB_7220_IBNCModeCtrl_TSMEnable_send_TS2_LSB 0x1 +#define QIB_7220_IBNCModeCtrl_TSMEnable_send_TS2_RMASK 0x1 +#define QIB_7220_IBNCModeCtrl_TSMEnable_send_TS1_LSB 0x0 +#define QIB_7220_IBNCModeCtrl_TSMEnable_send_TS1_RMASK 0x1 + +#define QIB_7220_SendCtrl_OFFS 0x1C0 +#define QIB_7220_SendCtrl_Disarm_LSB 0x1F +#define QIB_7220_SendCtrl_Disarm_RMASK 0x1 +#define QIB_7220_SendCtrl_Reserved_LSB 0x1D +#define QIB_7220_SendCtrl_Reserved_RMASK 0x3 +#define QIB_7220_SendCtrl_AvailUpdThld_LSB 0x18 +#define QIB_7220_SendCtrl_AvailUpdThld_RMASK 0x1F +#define QIB_7220_SendCtrl_DisarmPIOBuf_LSB 0x10 +#define QIB_7220_SendCtrl_DisarmPIOBuf_RMASK 0xFF +#define QIB_7220_SendCtrl_Reserved1_LSB 0xD +#define QIB_7220_SendCtrl_Reserved1_RMASK 0x7 +#define QIB_7220_SendCtrl_SDmaHalt_LSB 0xC +#define QIB_7220_SendCtrl_SDmaHalt_RMASK 0x1 +#define QIB_7220_SendCtrl_SDmaEnable_LSB 0xB +#define QIB_7220_SendCtrl_SDmaEnable_RMASK 0x1 +#define QIB_7220_SendCtrl_SDmaSingleDescriptor_LSB 0xA +#define QIB_7220_SendCtrl_SDmaSingleDescriptor_RMASK 0x1 +#define QIB_7220_SendCtrl_SDmaIntEnable_LSB 0x9 +#define QIB_7220_SendCtrl_SDmaIntEnable_RMASK 0x1 +#define QIB_7220_SendCtrl_Reserved2_LSB 0x5 +#define QIB_7220_SendCtrl_Reserved2_RMASK 0xF +#define QIB_7220_SendCtrl_SSpecialTriggerEn_LSB 0x4 +#define QIB_7220_SendCtrl_SSpecialTriggerEn_RMASK 0x1 +#define QIB_7220_SendCtrl_SPioEnable_LSB 0x3 +#define QIB_7220_SendCtrl_SPioEnable_RMASK 0x1 +#define QIB_7220_SendCtrl_SendBufAvailUpd_LSB 0x2 +#define QIB_7220_SendCtrl_SendBufAvailUpd_RMASK 0x1 +#define QIB_7220_SendCtrl_SendIntBufAvail_LSB 0x1 +#define QIB_7220_SendCtrl_SendIntBufAvail_RMASK 0x1 +#define QIB_7220_SendCtrl_Abort_LSB 0x0 +#define QIB_7220_SendCtrl_Abort_RMASK 0x1 + +#define QIB_7220_SendBufBase_OFFS 0x1C8 +#define QIB_7220_SendBufBase_Reserved_LSB 0x35 +#define QIB_7220_SendBufBase_Reserved_RMASK 0x7FF +#define QIB_7220_SendBufBase_BaseAddr_LargePIO_LSB 0x20 +#define 
QIB_7220_SendBufBase_BaseAddr_LargePIO_RMASK 0x1FFFFF +#define QIB_7220_SendBufBase_Reserved1_LSB 0x15 +#define QIB_7220_SendBufBase_Reserved1_RMASK 0x7FF +#define QIB_7220_SendBufBase_BaseAddr_SmallPIO_LSB 0x0 +#define QIB_7220_SendBufBase_BaseAddr_SmallPIO_RMASK 0x1FFFFF + +#define QIB_7220_SendBufSize_OFFS 0x1D0 +#define QIB_7220_SendBufSize_Reserved_LSB 0x2D +#define QIB_7220_SendBufSize_Reserved_RMASK 0xFFFFF +#define QIB_7220_SendBufSize_Size_LargePIO_LSB 0x20 +#define QIB_7220_SendBufSize_Size_LargePIO_RMASK 0x1FFF +#define QIB_7220_SendBufSize_Reserved1_LSB 0xC +#define QIB_7220_SendBufSize_Reserved1_RMASK 0xFFFFF +#define QIB_7220_SendBufSize_Size_SmallPIO_LSB 0x0 +#define QIB_7220_SendBufSize_Size_SmallPIO_RMASK 0xFFF + +#define QIB_7220_SendBufCnt_OFFS 0x1D8 +#define QIB_7220_SendBufCnt_Reserved_LSB 0x24 +#define QIB_7220_SendBufCnt_Reserved_RMASK 0xFFFFFFF +#define QIB_7220_SendBufCnt_Num_LargeBuffers_LSB 0x20 +#define QIB_7220_SendBufCnt_Num_LargeBuffers_RMASK 0xF +#define QIB_7220_SendBufCnt_Reserved1_LSB 0x9 +#define QIB_7220_SendBufCnt_Reserved1_RMASK 0x7FFFFF +#define QIB_7220_SendBufCnt_Num_SmallBuffers_LSB 0x0 +#define QIB_7220_SendBufCnt_Num_SmallBuffers_RMASK 0x1FF + +#define QIB_7220_SendBufAvailAddr_OFFS 0x1E0 +#define QIB_7220_SendBufAvailAddr_SendBufAvailAddr_LSB 0x6 +#define QIB_7220_SendBufAvailAddr_SendBufAvailAddr_RMASK 0x3FFFFFFFF +#define QIB_7220_SendBufAvailAddr_Reserved_LSB 0x0 +#define QIB_7220_SendBufAvailAddr_Reserved_RMASK 0x3F + +#define QIB_7220_TxIntMemBase_OFFS 0x1E8 + +#define QIB_7220_TxIntMemSize_OFFS 0x1F0 + +#define QIB_7220_SendDmaBase_OFFS 0x1F8 +#define QIB_7220_SendDmaBase_Reserved_LSB 0x30 +#define QIB_7220_SendDmaBase_Reserved_RMASK 0xFFFF +#define QIB_7220_SendDmaBase_SendDmaBase_LSB 0x0 +#define QIB_7220_SendDmaBase_SendDmaBase_RMASK 0xFFFFFFFFFFFF + +#define QIB_7220_SendDmaLenGen_OFFS 0x200 +#define QIB_7220_SendDmaLenGen_Reserved_LSB 0x13 +#define QIB_7220_SendDmaLenGen_Reserved_RMASK 0x1FFFFFFFFFFF +#define QIB_7220_SendDmaLenGen_Generation_LSB 0x10 +#define QIB_7220_SendDmaLenGen_Generation_MSB 0x12 +#define QIB_7220_SendDmaLenGen_Generation_RMASK 0x7 +#define QIB_7220_SendDmaLenGen_Length_LSB 0x0 +#define QIB_7220_SendDmaLenGen_Length_RMASK 0xFFFF + +#define QIB_7220_SendDmaTail_OFFS 0x208 +#define QIB_7220_SendDmaTail_Reserved_LSB 0x10 +#define QIB_7220_SendDmaTail_Reserved_RMASK 0xFFFFFFFFFFFF +#define QIB_7220_SendDmaTail_SendDmaTail_LSB 0x0 +#define QIB_7220_SendDmaTail_SendDmaTail_RMASK 0xFFFF + +#define QIB_7220_SendDmaHead_OFFS 0x210 +#define QIB_7220_SendDmaHead_Reserved_LSB 0x30 +#define QIB_7220_SendDmaHead_Reserved_RMASK 0xFFFF +#define QIB_7220_SendDmaHead_InternalSendDmaHead_LSB 0x20 +#define QIB_7220_SendDmaHead_InternalSendDmaHead_RMASK 0xFFFF +#define QIB_7220_SendDmaHead_Reserved1_LSB 0x10 +#define QIB_7220_SendDmaHead_Reserved1_RMASK 0xFFFF +#define QIB_7220_SendDmaHead_SendDmaHead_LSB 0x0 +#define QIB_7220_SendDmaHead_SendDmaHead_RMASK 0xFFFF + +#define QIB_7220_SendDmaHeadAddr_OFFS 0x218 +#define QIB_7220_SendDmaHeadAddr_Reserved_LSB 0x30 +#define QIB_7220_SendDmaHeadAddr_Reserved_RMASK 0xFFFF +#define QIB_7220_SendDmaHeadAddr_SendDmaHeadAddr_LSB 0x0 +#define QIB_7220_SendDmaHeadAddr_SendDmaHeadAddr_RMASK 0xFFFFFFFFFFFF + +#define QIB_7220_SendDmaBufMask0_OFFS 0x220 +#define QIB_7220_SendDmaBufMask0_BufMask_63_0_LSB 0x0 +#define QIB_7220_SendDmaBufMask0_BufMask_63_0_RMASK 0x0 + +#define QIB_7220_SendDmaStatus_OFFS 0x238 +#define QIB_7220_SendDmaStatus_ScoreBoardDrainInProg_LSB 0x3F +#define 
QIB_7220_SendDmaStatus_ScoreBoardDrainInProg_RMASK 0x1 +#define QIB_7220_SendDmaStatus_AbortInProg_LSB 0x3E +#define QIB_7220_SendDmaStatus_AbortInProg_RMASK 0x1 +#define QIB_7220_SendDmaStatus_InternalSDmaEnable_LSB 0x3D +#define QIB_7220_SendDmaStatus_InternalSDmaEnable_RMASK 0x1 +#define QIB_7220_SendDmaStatus_ScbDescIndex_13_0_LSB 0x2F +#define QIB_7220_SendDmaStatus_ScbDescIndex_13_0_RMASK 0x3FFF +#define QIB_7220_SendDmaStatus_RpyLowAddr_6_0_LSB 0x28 +#define QIB_7220_SendDmaStatus_RpyLowAddr_6_0_RMASK 0x7F +#define QIB_7220_SendDmaStatus_RpyTag_7_0_LSB 0x20 +#define QIB_7220_SendDmaStatus_RpyTag_7_0_RMASK 0xFF +#define QIB_7220_SendDmaStatus_ScbFull_LSB 0x1F +#define QIB_7220_SendDmaStatus_ScbFull_RMASK 0x1 +#define QIB_7220_SendDmaStatus_ScbEmpty_LSB 0x1E +#define QIB_7220_SendDmaStatus_ScbEmpty_RMASK 0x1 +#define QIB_7220_SendDmaStatus_ScbEntryValid_LSB 0x1D +#define QIB_7220_SendDmaStatus_ScbEntryValid_RMASK 0x1 +#define QIB_7220_SendDmaStatus_ScbFetchDescFlag_LSB 0x1C +#define QIB_7220_SendDmaStatus_ScbFetchDescFlag_RMASK 0x1 +#define QIB_7220_SendDmaStatus_SplFifoReadyToGo_LSB 0x1B +#define QIB_7220_SendDmaStatus_SplFifoReadyToGo_RMASK 0x1 +#define QIB_7220_SendDmaStatus_SplFifoDisarmed_LSB 0x1A +#define QIB_7220_SendDmaStatus_SplFifoDisarmed_RMASK 0x1 +#define QIB_7220_SendDmaStatus_SplFifoEmpty_LSB 0x19 +#define QIB_7220_SendDmaStatus_SplFifoEmpty_RMASK 0x1 +#define QIB_7220_SendDmaStatus_SplFifoFull_LSB 0x18 +#define QIB_7220_SendDmaStatus_SplFifoFull_RMASK 0x1 +#define QIB_7220_SendDmaStatus_SplFifoBufNum_LSB 0x10 +#define QIB_7220_SendDmaStatus_SplFifoBufNum_RMASK 0xFF +#define QIB_7220_SendDmaStatus_SplFifoDescIndex_LSB 0x0 +#define QIB_7220_SendDmaStatus_SplFifoDescIndex_RMASK 0xFFFF + +#define QIB_7220_SendBufErr0_OFFS 0x240 +#define QIB_7220_SendBufErr0_SendBufErr_63_0_LSB 0x0 +#define QIB_7220_SendBufErr0_SendBufErr_63_0_RMASK 0x0 + +#define QIB_7220_RcvHdrAddr0_OFFS 0x270 +#define QIB_7220_RcvHdrAddr0_RcvHdrAddr0_LSB 0x2 +#define QIB_7220_RcvHdrAddr0_RcvHdrAddr0_RMASK 0x3FFFFFFFFF +#define QIB_7220_RcvHdrAddr0_Reserved_LSB 0x0 +#define QIB_7220_RcvHdrAddr0_Reserved_RMASK 0x3 + +#define QIB_7220_RcvHdrTailAddr0_OFFS 0x300 +#define QIB_7220_RcvHdrTailAddr0_RcvHdrTailAddr0_LSB 0x2 +#define QIB_7220_RcvHdrTailAddr0_RcvHdrTailAddr0_RMASK 0x3FFFFFFFFF +#define QIB_7220_RcvHdrTailAddr0_Reserved_LSB 0x0 +#define QIB_7220_RcvHdrTailAddr0_Reserved_RMASK 0x3 + +#define QIB_7220_ibsd_epb_access_ctrl_OFFS 0x3C0 +#define QIB_7220_ibsd_epb_access_ctrl_sw_ib_epb_req_granted_LSB 0x8 +#define QIB_7220_ibsd_epb_access_ctrl_sw_ib_epb_req_granted_RMASK 0x1 +#define QIB_7220_ibsd_epb_access_ctrl_Reserved_LSB 0x1 +#define QIB_7220_ibsd_epb_access_ctrl_Reserved_RMASK 0x7F +#define QIB_7220_ibsd_epb_access_ctrl_sw_ib_epb_req_LSB 0x0 +#define QIB_7220_ibsd_epb_access_ctrl_sw_ib_epb_req_RMASK 0x1 + +#define QIB_7220_ibsd_epb_transaction_reg_OFFS 0x3C8 +#define QIB_7220_ibsd_epb_transaction_reg_ib_epb_rdy_LSB 0x1F +#define QIB_7220_ibsd_epb_transaction_reg_ib_epb_rdy_RMASK 0x1 +#define QIB_7220_ibsd_epb_transaction_reg_ib_epb_req_error_LSB 0x1E +#define QIB_7220_ibsd_epb_transaction_reg_ib_epb_req_error_RMASK 0x1 +#define QIB_7220_ibsd_epb_transaction_reg_Reserved_LSB 0x1D +#define QIB_7220_ibsd_epb_transaction_reg_Reserved_RMASK 0x1 +#define QIB_7220_ibsd_epb_transaction_reg_mem_data_parity_LSB 0x1C +#define QIB_7220_ibsd_epb_transaction_reg_mem_data_parity_RMASK 0x1 +#define QIB_7220_ibsd_epb_transaction_reg_Reserved1_LSB 0x1B +#define QIB_7220_ibsd_epb_transaction_reg_Reserved1_RMASK 0x1 
+#define QIB_7220_ibsd_epb_transaction_reg_ib_epb_cs_LSB 0x19 +#define QIB_7220_ibsd_epb_transaction_reg_ib_epb_cs_RMASK 0x3 +#define QIB_7220_ibsd_epb_transaction_reg_ib_epb_read_write_LSB 0x18 +#define QIB_7220_ibsd_epb_transaction_reg_ib_epb_read_write_RMASK 0x1 +#define QIB_7220_ibsd_epb_transaction_reg_Reserved2_LSB 0x17 +#define QIB_7220_ibsd_epb_transaction_reg_Reserved2_RMASK 0x1 +#define QIB_7220_ibsd_epb_transaction_reg_ib_epb_address_LSB 0x8 +#define QIB_7220_ibsd_epb_transaction_reg_ib_epb_address_RMASK 0x7FFF +#define QIB_7220_ibsd_epb_transaction_reg_ib_epb_data_LSB 0x0 +#define QIB_7220_ibsd_epb_transaction_reg_ib_epb_data_RMASK 0xFF + +#define QIB_7220_XGXSCfg_OFFS 0x3D8 +#define QIB_7220_XGXSCfg_sel_link_down_for_fctrl_lane_sync_reset_LSB 0x3F +#define QIB_7220_XGXSCfg_sel_link_down_for_fctrl_lane_sync_reset_RMASK 0x1 +#define QIB_7220_XGXSCfg_Reserved_LSB 0x13 +#define QIB_7220_XGXSCfg_Reserved_RMASK 0xFFFFFFFFFFF +#define QIB_7220_XGXSCfg_link_sync_mask_LSB 0x9 +#define QIB_7220_XGXSCfg_link_sync_mask_RMASK 0x3FF +#define QIB_7220_XGXSCfg_Reserved1_LSB 0x3 +#define QIB_7220_XGXSCfg_Reserved1_RMASK 0x3F +#define QIB_7220_XGXSCfg_xcv_reset_LSB 0x2 +#define QIB_7220_XGXSCfg_xcv_reset_RMASK 0x1 +#define QIB_7220_XGXSCfg_Reserved2_LSB 0x1 +#define QIB_7220_XGXSCfg_Reserved2_RMASK 0x1 +#define QIB_7220_XGXSCfg_tx_rx_reset_LSB 0x0 +#define QIB_7220_XGXSCfg_tx_rx_reset_RMASK 0x1 + +#define QIB_7220_IBSerDesCtrl_OFFS 0x3E0 +#define QIB_7220_IBSerDesCtrl_Reserved_LSB 0x2D +#define QIB_7220_IBSerDesCtrl_Reserved_RMASK 0x7FFFF +#define QIB_7220_IBSerDesCtrl_INT_uC_LSB 0x2C +#define QIB_7220_IBSerDesCtrl_INT_uC_RMASK 0x1 +#define QIB_7220_IBSerDesCtrl_CKSEL_uC_LSB 0x2A +#define QIB_7220_IBSerDesCtrl_CKSEL_uC_RMASK 0x3 +#define QIB_7220_IBSerDesCtrl_PLLN_LSB 0x28 +#define QIB_7220_IBSerDesCtrl_PLLN_RMASK 0x3 +#define QIB_7220_IBSerDesCtrl_PLLM_LSB 0x25 +#define QIB_7220_IBSerDesCtrl_PLLM_RMASK 0x7 +#define QIB_7220_IBSerDesCtrl_TXOBPD_LSB 0x24 +#define QIB_7220_IBSerDesCtrl_TXOBPD_RMASK 0x1 +#define QIB_7220_IBSerDesCtrl_TWC_LSB 0x23 +#define QIB_7220_IBSerDesCtrl_TWC_RMASK 0x1 +#define QIB_7220_IBSerDesCtrl_RXIDLE_LSB 0x22 +#define QIB_7220_IBSerDesCtrl_RXIDLE_RMASK 0x1 +#define QIB_7220_IBSerDesCtrl_RXINV_LSB 0x21 +#define QIB_7220_IBSerDesCtrl_RXINV_RMASK 0x1 +#define QIB_7220_IBSerDesCtrl_TXINV_LSB 0x20 +#define QIB_7220_IBSerDesCtrl_TXINV_RMASK 0x1 +#define QIB_7220_IBSerDesCtrl_Reserved1_LSB 0x12 +#define QIB_7220_IBSerDesCtrl_Reserved1_RMASK 0x3FFF +#define QIB_7220_IBSerDesCtrl_NumSerDesRegsToWrForRXEQ_LSB 0xD +#define QIB_7220_IBSerDesCtrl_NumSerDesRegsToWrForRXEQ_RMASK 0x1F +#define QIB_7220_IBSerDesCtrl_NumSerDesRegsToWrForDDS_LSB 0x8 +#define QIB_7220_IBSerDesCtrl_NumSerDesRegsToWrForDDS_RMASK 0x1F +#define QIB_7220_IBSerDesCtrl_Reserved2_LSB 0x1 +#define QIB_7220_IBSerDesCtrl_Reserved2_RMASK 0x7F +#define QIB_7220_IBSerDesCtrl_ResetIB_uC_Core_LSB 0x0 +#define QIB_7220_IBSerDesCtrl_ResetIB_uC_Core_RMASK 0x1 + +#define QIB_7220_pciesd_epb_access_ctrl_OFFS 0x400 +#define QIB_7220_pciesd_epb_access_ctrl_sw_pcie_epb_req_granted_LSB 0x8 +#define QIB_7220_pciesd_epb_access_ctrl_sw_pcie_epb_req_granted_RMASK 0x1 +#define QIB_7220_pciesd_epb_access_ctrl_Reserved_LSB 0x3 +#define QIB_7220_pciesd_epb_access_ctrl_Reserved_RMASK 0x1F +#define QIB_7220_pciesd_epb_access_ctrl_sw_pcieepb_star_en_LSB 0x1 +#define QIB_7220_pciesd_epb_access_ctrl_sw_pcieepb_star_en_RMASK 0x3 +#define QIB_7220_pciesd_epb_access_ctrl_sw_pcie_epb_req_LSB 0x0 +#define 
QIB_7220_pciesd_epb_access_ctrl_sw_pcie_epb_req_RMASK 0x1 + +#define QIB_7220_pciesd_epb_transaction_reg_OFFS 0x408 +#define QIB_7220_pciesd_epb_transaction_reg_pcie_epb_rdy_LSB 0x1F +#define QIB_7220_pciesd_epb_transaction_reg_pcie_epb_rdy_RMASK 0x1 +#define QIB_7220_pciesd_epb_transaction_reg_pcie_epb_req_error_LSB 0x1E +#define QIB_7220_pciesd_epb_transaction_reg_pcie_epb_req_error_RMASK 0x1 +#define QIB_7220_pciesd_epb_transaction_reg_Reserved_LSB 0x1D +#define QIB_7220_pciesd_epb_transaction_reg_Reserved_RMASK 0x1 +#define QIB_7220_pciesd_epb_transaction_reg_mem_data_parity_LSB 0x1C +#define QIB_7220_pciesd_epb_transaction_reg_mem_data_parity_RMASK 0x1 +#define QIB_7220_pciesd_epb_transaction_reg_pcie_epb_cs_LSB 0x19 +#define QIB_7220_pciesd_epb_transaction_reg_pcie_epb_cs_RMASK 0x7 +#define QIB_7220_pciesd_epb_transaction_reg_pcie_epb_read_write_LSB 0x18 +#define QIB_7220_pciesd_epb_transaction_reg_pcie_epb_read_write_RMASK 0x1 +#define QIB_7220_pciesd_epb_transaction_reg_Reserved1_LSB 0x17 +#define QIB_7220_pciesd_epb_transaction_reg_Reserved1_RMASK 0x1 +#define QIB_7220_pciesd_epb_transaction_reg_pcie_epb_address_LSB 0x8 +#define QIB_7220_pciesd_epb_transaction_reg_pcie_epb_address_RMASK 0x7FFF +#define QIB_7220_pciesd_epb_transaction_reg_pcie_epb_data_LSB 0x0 +#define QIB_7220_pciesd_epb_transaction_reg_pcie_epb_data_RMASK 0xFF + +#define QIB_7220_SerDes_DDSRXEQ0_OFFS 0x500 +#define QIB_7220_SerDes_DDSRXEQ0_reg_addr_LSB 0x4 +#define QIB_7220_SerDes_DDSRXEQ0_reg_addr_RMASK 0x3F +#define QIB_7220_SerDes_DDSRXEQ0_element_num_LSB 0x0 +#define QIB_7220_SerDes_DDSRXEQ0_element_num_RMASK 0xF + +#define QIB_7220_LBIntCnt_OFFS 0x13000 + +#define QIB_7220_LBFlowStallCnt_OFFS 0x13008 + +#define QIB_7220_TxSDmaDescCnt_OFFS 0x13010 + +#define QIB_7220_TxUnsupVLErrCnt_OFFS 0x13018 + +#define QIB_7220_TxDataPktCnt_OFFS 0x13020 + +#define QIB_7220_TxFlowPktCnt_OFFS 0x13028 + +#define QIB_7220_TxDwordCnt_OFFS 0x13030 + +#define QIB_7220_TxLenErrCnt_OFFS 0x13038 + +#define QIB_7220_TxMaxMinLenErrCnt_OFFS 0x13040 + +#define QIB_7220_TxUnderrunCnt_OFFS 0x13048 + +#define QIB_7220_TxFlowStallCnt_OFFS 0x13050 + +#define QIB_7220_TxDroppedPktCnt_OFFS 0x13058 + +#define QIB_7220_RxDroppedPktCnt_OFFS 0x13060 + +#define QIB_7220_RxDataPktCnt_OFFS 0x13068 + +#define QIB_7220_RxFlowPktCnt_OFFS 0x13070 + +#define QIB_7220_RxDwordCnt_OFFS 0x13078 + +#define QIB_7220_RxLenErrCnt_OFFS 0x13080 + +#define QIB_7220_RxMaxMinLenErrCnt_OFFS 0x13088 + +#define QIB_7220_RxICRCErrCnt_OFFS 0x13090 + +#define QIB_7220_RxVCRCErrCnt_OFFS 0x13098 + +#define QIB_7220_RxFlowCtrlViolCnt_OFFS 0x130A0 + +#define QIB_7220_RxVersionErrCnt_OFFS 0x130A8 + +#define QIB_7220_RxLinkMalformCnt_OFFS 0x130B0 + +#define QIB_7220_RxEBPCnt_OFFS 0x130B8 + +#define QIB_7220_RxLPCRCErrCnt_OFFS 0x130C0 + +#define QIB_7220_RxBufOvflCnt_OFFS 0x130C8 + +#define QIB_7220_RxTIDFullErrCnt_OFFS 0x130D0 + +#define QIB_7220_RxTIDValidErrCnt_OFFS 0x130D8 + +#define QIB_7220_RxPKeyMismatchCnt_OFFS 0x130E0 + +#define QIB_7220_RxP0HdrEgrOvflCnt_OFFS 0x130E8 + +#define QIB_7220_IBStatusChangeCnt_OFFS 0x13170 + +#define QIB_7220_IBLinkErrRecoveryCnt_OFFS 0x13178 + +#define QIB_7220_IBLinkDownedCnt_OFFS 0x13180 + +#define QIB_7220_IBSymbolErrCnt_OFFS 0x13188 + +#define QIB_7220_RxVL15DroppedPktCnt_OFFS 0x13190 + +#define QIB_7220_RxOtherLocalPhyErrCnt_OFFS 0x13198 + +#define QIB_7220_PcieRetryBufDiagQwordCnt_OFFS 0x131A0 + +#define QIB_7220_ExcessBufferOvflCnt_OFFS 0x131A8 + +#define QIB_7220_LocalLinkIntegrityErrCnt_OFFS 0x131B0 + +#define 
QIB_7220_RxVlErrCnt_OFFS 0x131B8 + +#define QIB_7220_RxDlidFltrCnt_OFFS 0x131C0 + +#define QIB_7220_CNT_0131C8_OFFS 0x131C8 + +#define QIB_7220_PSStat_OFFS 0x13200 + +#define QIB_7220_PSStart_OFFS 0x13208 + +#define QIB_7220_PSInterval_OFFS 0x13210 + +#define QIB_7220_PSRcvDataCount_OFFS 0x13218 + +#define QIB_7220_PSRcvPktsCount_OFFS 0x13220 + +#define QIB_7220_PSXmitDataCount_OFFS 0x13228 + +#define QIB_7220_PSXmitPktsCount_OFFS 0x13230 + +#define QIB_7220_PSXmitWaitCount_OFFS 0x13238 + +#define QIB_7220_CNT_013240_OFFS 0x13240 + +#define QIB_7220_RcvEgrArray_OFFS 0x14000 + +#define QIB_7220_MEM_038000_OFFS 0x38000 + +#define QIB_7220_RcvTIDArray0_OFFS 0x53000 + +#define QIB_7220_PIOLaunchFIFO_OFFS 0x64000 + +#define QIB_7220_MEM_064480_OFFS 0x64480 + +#define QIB_7220_SendPIOpbcCache_OFFS 0x64800 + +#define QIB_7220_MEM_064C80_OFFS 0x64C80 + +#define QIB_7220_PreLaunchFIFO_OFFS 0x65000 + +#define QIB_7220_MEM_065080_OFFS 0x65080 + +#define QIB_7220_ScoreBoard_OFFS 0x65400 + +#define QIB_7220_MEM_065440_OFFS 0x65440 + +#define QIB_7220_DescriptorFIFO_OFFS 0x65800 + +#define QIB_7220_MEM_065880_OFFS 0x65880 + +#define QIB_7220_RcvBuf1_OFFS 0x72000 + +#define QIB_7220_MEM_074800_OFFS 0x74800 + +#define QIB_7220_RcvBuf2_OFFS 0x75000 + +#define QIB_7220_MEM_076400_OFFS 0x76400 + +#define QIB_7220_RcvFlags_OFFS 0x77000 + +#define QIB_7220_MEM_078400_OFFS 0x78400 + +#define QIB_7220_RcvLookupBuf1_OFFS 0x79000 + +#define QIB_7220_MEM_07A400_OFFS 0x7A400 + +#define QIB_7220_RcvDMADatBuf_OFFS 0x7B000 + +#define QIB_7220_RcvDMAHdrBuf_OFFS 0x7B800 + +#define QIB_7220_MiscRXEIntMem_OFFS 0x7C000 + +#define QIB_7220_MEM_07D400_OFFS 0x7D400 + +#define QIB_7220_PCIERcvBuf_OFFS 0x80000 + +#define QIB_7220_PCIERetryBuf_OFFS 0x84000 + +#define QIB_7220_PCIERcvBufRdToWrAddr_OFFS 0x88000 + +#define QIB_7220_PCIECplBuf_OFFS 0x90000 + +#define QIB_7220_IBSerDesMappTable_OFFS 0x94000 + +#define QIB_7220_MEM_095000_OFFS 0x95000 + +#define QIB_7220_SendBuf0_MA_OFFS 0x100000 + +#define QIB_7220_MEM_1A0000_OFFS 0x1A0000 diff --git a/kernel/drivers/infiniband/hw/qib/qib_7322_regs.h b/kernel/drivers/infiniband/hw/qib/qib_7322_regs.h new file mode 100644 index 000000000..32dc81ff8 --- /dev/null +++ b/kernel/drivers/infiniband/hw/qib/qib_7322_regs.h @@ -0,0 +1,3163 @@ +/* + * Copyright (c) 2008, 2009, 2010 QLogic Corporation. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +/* This file is mechanically generated from RTL. Any hand-edits will be lost! */ + +#define QIB_7322_Revision_OFFS 0x0 +#define QIB_7322_Revision_DEF 0x0000000002010601 +#define QIB_7322_Revision_R_Simulator_LSB 0x3F +#define QIB_7322_Revision_R_Simulator_MSB 0x3F +#define QIB_7322_Revision_R_Simulator_RMASK 0x1 +#define QIB_7322_Revision_R_Emulation_LSB 0x3E +#define QIB_7322_Revision_R_Emulation_MSB 0x3E +#define QIB_7322_Revision_R_Emulation_RMASK 0x1 +#define QIB_7322_Revision_R_Emulation_Revcode_LSB 0x28 +#define QIB_7322_Revision_R_Emulation_Revcode_MSB 0x3D +#define QIB_7322_Revision_R_Emulation_Revcode_RMASK 0x3FFFFF +#define QIB_7322_Revision_BoardID_LSB 0x20 +#define QIB_7322_Revision_BoardID_MSB 0x27 +#define QIB_7322_Revision_BoardID_RMASK 0xFF +#define QIB_7322_Revision_R_SW_LSB 0x18 +#define QIB_7322_Revision_R_SW_MSB 0x1F +#define QIB_7322_Revision_R_SW_RMASK 0xFF +#define QIB_7322_Revision_R_Arch_LSB 0x10 +#define QIB_7322_Revision_R_Arch_MSB 0x17 +#define QIB_7322_Revision_R_Arch_RMASK 0xFF +#define QIB_7322_Revision_R_ChipRevMajor_LSB 0x8 +#define QIB_7322_Revision_R_ChipRevMajor_MSB 0xF +#define QIB_7322_Revision_R_ChipRevMajor_RMASK 0xFF +#define QIB_7322_Revision_R_ChipRevMinor_LSB 0x0 +#define QIB_7322_Revision_R_ChipRevMinor_MSB 0x7 +#define QIB_7322_Revision_R_ChipRevMinor_RMASK 0xFF + +#define QIB_7322_Control_OFFS 0x8 +#define QIB_7322_Control_DEF 0x0000000000000000 +#define QIB_7322_Control_PCIECplQDiagEn_LSB 0x6 +#define QIB_7322_Control_PCIECplQDiagEn_MSB 0x6 +#define QIB_7322_Control_PCIECplQDiagEn_RMASK 0x1 +#define QIB_7322_Control_PCIEPostQDiagEn_LSB 0x5 +#define QIB_7322_Control_PCIEPostQDiagEn_MSB 0x5 +#define QIB_7322_Control_PCIEPostQDiagEn_RMASK 0x1 +#define QIB_7322_Control_SDmaDescFetchPriorityEn_LSB 0x4 +#define QIB_7322_Control_SDmaDescFetchPriorityEn_MSB 0x4 +#define QIB_7322_Control_SDmaDescFetchPriorityEn_RMASK 0x1 +#define QIB_7322_Control_PCIERetryBufDiagEn_LSB 0x3 +#define QIB_7322_Control_PCIERetryBufDiagEn_MSB 0x3 +#define QIB_7322_Control_PCIERetryBufDiagEn_RMASK 0x1 +#define QIB_7322_Control_FreezeMode_LSB 0x1 +#define QIB_7322_Control_FreezeMode_MSB 0x1 +#define QIB_7322_Control_FreezeMode_RMASK 0x1 +#define QIB_7322_Control_SyncReset_LSB 0x0 +#define QIB_7322_Control_SyncReset_MSB 0x0 +#define QIB_7322_Control_SyncReset_RMASK 0x1 + +#define QIB_7322_PageAlign_OFFS 0x10 +#define QIB_7322_PageAlign_DEF 0x0000000000001000 + +#define QIB_7322_ContextCnt_OFFS 0x18 +#define QIB_7322_ContextCnt_DEF 0x0000000000000012 + +#define QIB_7322_Scratch_OFFS 0x20 +#define QIB_7322_Scratch_DEF 0x0000000000000000 + +#define QIB_7322_CntrRegBase_OFFS 0x28 +#define QIB_7322_CntrRegBase_DEF 0x0000000000011000 + +#define QIB_7322_SendRegBase_OFFS 0x30 +#define QIB_7322_SendRegBase_DEF 0x0000000000003000 + +#define QIB_7322_UserRegBase_OFFS 0x38 +#define QIB_7322_UserRegBase_DEF 0x0000000000200000 + +#define QIB_7322_IntMask_OFFS 0x68 +#define QIB_7322_IntMask_DEF 0x0000000000000000 +#define QIB_7322_IntMask_SDmaIntMask_1_LSB 0x3F +#define QIB_7322_IntMask_SDmaIntMask_1_MSB 0x3F +#define QIB_7322_IntMask_SDmaIntMask_1_RMASK 0x1 +#define QIB_7322_IntMask_SDmaIntMask_0_LSB 0x3E +#define QIB_7322_IntMask_SDmaIntMask_0_MSB 0x3E +#define QIB_7322_IntMask_SDmaIntMask_0_RMASK 0x1 +#define 
QIB_7322_IntMask_SDmaProgressIntMask_1_LSB 0x3D +#define QIB_7322_IntMask_SDmaProgressIntMask_1_MSB 0x3D +#define QIB_7322_IntMask_SDmaProgressIntMask_1_RMASK 0x1 +#define QIB_7322_IntMask_SDmaProgressIntMask_0_LSB 0x3C +#define QIB_7322_IntMask_SDmaProgressIntMask_0_MSB 0x3C +#define QIB_7322_IntMask_SDmaProgressIntMask_0_RMASK 0x1 +#define QIB_7322_IntMask_SDmaIdleIntMask_1_LSB 0x3B +#define QIB_7322_IntMask_SDmaIdleIntMask_1_MSB 0x3B +#define QIB_7322_IntMask_SDmaIdleIntMask_1_RMASK 0x1 +#define QIB_7322_IntMask_SDmaIdleIntMask_0_LSB 0x3A +#define QIB_7322_IntMask_SDmaIdleIntMask_0_MSB 0x3A +#define QIB_7322_IntMask_SDmaIdleIntMask_0_RMASK 0x1 +#define QIB_7322_IntMask_SDmaCleanupDoneMask_1_LSB 0x39 +#define QIB_7322_IntMask_SDmaCleanupDoneMask_1_MSB 0x39 +#define QIB_7322_IntMask_SDmaCleanupDoneMask_1_RMASK 0x1 +#define QIB_7322_IntMask_SDmaCleanupDoneMask_0_LSB 0x38 +#define QIB_7322_IntMask_SDmaCleanupDoneMask_0_MSB 0x38 +#define QIB_7322_IntMask_SDmaCleanupDoneMask_0_RMASK 0x1 +#define QIB_7322_IntMask_RcvUrg17IntMask_LSB 0x31 +#define QIB_7322_IntMask_RcvUrg17IntMask_MSB 0x31 +#define QIB_7322_IntMask_RcvUrg17IntMask_RMASK 0x1 +#define QIB_7322_IntMask_RcvUrg16IntMask_LSB 0x30 +#define QIB_7322_IntMask_RcvUrg16IntMask_MSB 0x30 +#define QIB_7322_IntMask_RcvUrg16IntMask_RMASK 0x1 +#define QIB_7322_IntMask_RcvUrg15IntMask_LSB 0x2F +#define QIB_7322_IntMask_RcvUrg15IntMask_MSB 0x2F +#define QIB_7322_IntMask_RcvUrg15IntMask_RMASK 0x1 +#define QIB_7322_IntMask_RcvUrg14IntMask_LSB 0x2E +#define QIB_7322_IntMask_RcvUrg14IntMask_MSB 0x2E +#define QIB_7322_IntMask_RcvUrg14IntMask_RMASK 0x1 +#define QIB_7322_IntMask_RcvUrg13IntMask_LSB 0x2D +#define QIB_7322_IntMask_RcvUrg13IntMask_MSB 0x2D +#define QIB_7322_IntMask_RcvUrg13IntMask_RMASK 0x1 +#define QIB_7322_IntMask_RcvUrg12IntMask_LSB 0x2C +#define QIB_7322_IntMask_RcvUrg12IntMask_MSB 0x2C +#define QIB_7322_IntMask_RcvUrg12IntMask_RMASK 0x1 +#define QIB_7322_IntMask_RcvUrg11IntMask_LSB 0x2B +#define QIB_7322_IntMask_RcvUrg11IntMask_MSB 0x2B +#define QIB_7322_IntMask_RcvUrg11IntMask_RMASK 0x1 +#define QIB_7322_IntMask_RcvUrg10IntMask_LSB 0x2A +#define QIB_7322_IntMask_RcvUrg10IntMask_MSB 0x2A +#define QIB_7322_IntMask_RcvUrg10IntMask_RMASK 0x1 +#define QIB_7322_IntMask_RcvUrg9IntMask_LSB 0x29 +#define QIB_7322_IntMask_RcvUrg9IntMask_MSB 0x29 +#define QIB_7322_IntMask_RcvUrg9IntMask_RMASK 0x1 +#define QIB_7322_IntMask_RcvUrg8IntMask_LSB 0x28 +#define QIB_7322_IntMask_RcvUrg8IntMask_MSB 0x28 +#define QIB_7322_IntMask_RcvUrg8IntMask_RMASK 0x1 +#define QIB_7322_IntMask_RcvUrg7IntMask_LSB 0x27 +#define QIB_7322_IntMask_RcvUrg7IntMask_MSB 0x27 +#define QIB_7322_IntMask_RcvUrg7IntMask_RMASK 0x1 +#define QIB_7322_IntMask_RcvUrg6IntMask_LSB 0x26 +#define QIB_7322_IntMask_RcvUrg6IntMask_MSB 0x26 +#define QIB_7322_IntMask_RcvUrg6IntMask_RMASK 0x1 +#define QIB_7322_IntMask_RcvUrg5IntMask_LSB 0x25 +#define QIB_7322_IntMask_RcvUrg5IntMask_MSB 0x25 +#define QIB_7322_IntMask_RcvUrg5IntMask_RMASK 0x1 +#define QIB_7322_IntMask_RcvUrg4IntMask_LSB 0x24 +#define QIB_7322_IntMask_RcvUrg4IntMask_MSB 0x24 +#define QIB_7322_IntMask_RcvUrg4IntMask_RMASK 0x1 +#define QIB_7322_IntMask_RcvUrg3IntMask_LSB 0x23 +#define QIB_7322_IntMask_RcvUrg3IntMask_MSB 0x23 +#define QIB_7322_IntMask_RcvUrg3IntMask_RMASK 0x1 +#define QIB_7322_IntMask_RcvUrg2IntMask_LSB 0x22 +#define QIB_7322_IntMask_RcvUrg2IntMask_MSB 0x22 +#define QIB_7322_IntMask_RcvUrg2IntMask_RMASK 0x1 +#define QIB_7322_IntMask_RcvUrg1IntMask_LSB 0x21 +#define QIB_7322_IntMask_RcvUrg1IntMask_MSB 0x21 +#define 
QIB_7322_IntMask_RcvUrg1IntMask_RMASK 0x1 +#define QIB_7322_IntMask_RcvUrg0IntMask_LSB 0x20 +#define QIB_7322_IntMask_RcvUrg0IntMask_MSB 0x20 +#define QIB_7322_IntMask_RcvUrg0IntMask_RMASK 0x1 +#define QIB_7322_IntMask_ErrIntMask_1_LSB 0x1F +#define QIB_7322_IntMask_ErrIntMask_1_MSB 0x1F +#define QIB_7322_IntMask_ErrIntMask_1_RMASK 0x1 +#define QIB_7322_IntMask_ErrIntMask_0_LSB 0x1E +#define QIB_7322_IntMask_ErrIntMask_0_MSB 0x1E +#define QIB_7322_IntMask_ErrIntMask_0_RMASK 0x1 +#define QIB_7322_IntMask_ErrIntMask_LSB 0x1D +#define QIB_7322_IntMask_ErrIntMask_MSB 0x1D +#define QIB_7322_IntMask_ErrIntMask_RMASK 0x1 +#define QIB_7322_IntMask_AssertGPIOIntMask_LSB 0x1C +#define QIB_7322_IntMask_AssertGPIOIntMask_MSB 0x1C +#define QIB_7322_IntMask_AssertGPIOIntMask_RMASK 0x1 +#define QIB_7322_IntMask_SendDoneIntMask_1_LSB 0x19 +#define QIB_7322_IntMask_SendDoneIntMask_1_MSB 0x19 +#define QIB_7322_IntMask_SendDoneIntMask_1_RMASK 0x1 +#define QIB_7322_IntMask_SendDoneIntMask_0_LSB 0x18 +#define QIB_7322_IntMask_SendDoneIntMask_0_MSB 0x18 +#define QIB_7322_IntMask_SendDoneIntMask_0_RMASK 0x1 +#define QIB_7322_IntMask_SendBufAvailIntMask_LSB 0x17 +#define QIB_7322_IntMask_SendBufAvailIntMask_MSB 0x17 +#define QIB_7322_IntMask_SendBufAvailIntMask_RMASK 0x1 +#define QIB_7322_IntMask_RcvAvail17IntMask_LSB 0x11 +#define QIB_7322_IntMask_RcvAvail17IntMask_MSB 0x11 +#define QIB_7322_IntMask_RcvAvail17IntMask_RMASK 0x1 +#define QIB_7322_IntMask_RcvAvail16IntMask_LSB 0x10 +#define QIB_7322_IntMask_RcvAvail16IntMask_MSB 0x10 +#define QIB_7322_IntMask_RcvAvail16IntMask_RMASK 0x1 +#define QIB_7322_IntMask_RcvAvail15IntMask_LSB 0xF +#define QIB_7322_IntMask_RcvAvail15IntMask_MSB 0xF +#define QIB_7322_IntMask_RcvAvail15IntMask_RMASK 0x1 +#define QIB_7322_IntMask_RcvAvail14IntMask_LSB 0xE +#define QIB_7322_IntMask_RcvAvail14IntMask_MSB 0xE +#define QIB_7322_IntMask_RcvAvail14IntMask_RMASK 0x1 +#define QIB_7322_IntMask_RcvAvail13IntMask_LSB 0xD +#define QIB_7322_IntMask_RcvAvail13IntMask_MSB 0xD +#define QIB_7322_IntMask_RcvAvail13IntMask_RMASK 0x1 +#define QIB_7322_IntMask_RcvAvail12IntMask_LSB 0xC +#define QIB_7322_IntMask_RcvAvail12IntMask_MSB 0xC +#define QIB_7322_IntMask_RcvAvail12IntMask_RMASK 0x1 +#define QIB_7322_IntMask_RcvAvail11IntMask_LSB 0xB +#define QIB_7322_IntMask_RcvAvail11IntMask_MSB 0xB +#define QIB_7322_IntMask_RcvAvail11IntMask_RMASK 0x1 +#define QIB_7322_IntMask_RcvAvail10IntMask_LSB 0xA +#define QIB_7322_IntMask_RcvAvail10IntMask_MSB 0xA +#define QIB_7322_IntMask_RcvAvail10IntMask_RMASK 0x1 +#define QIB_7322_IntMask_RcvAvail9IntMask_LSB 0x9 +#define QIB_7322_IntMask_RcvAvail9IntMask_MSB 0x9 +#define QIB_7322_IntMask_RcvAvail9IntMask_RMASK 0x1 +#define QIB_7322_IntMask_RcvAvail8IntMask_LSB 0x8 +#define QIB_7322_IntMask_RcvAvail8IntMask_MSB 0x8 +#define QIB_7322_IntMask_RcvAvail8IntMask_RMASK 0x1 +#define QIB_7322_IntMask_RcvAvail7IntMask_LSB 0x7 +#define QIB_7322_IntMask_RcvAvail7IntMask_MSB 0x7 +#define QIB_7322_IntMask_RcvAvail7IntMask_RMASK 0x1 +#define QIB_7322_IntMask_RcvAvail6IntMask_LSB 0x6 +#define QIB_7322_IntMask_RcvAvail6IntMask_MSB 0x6 +#define QIB_7322_IntMask_RcvAvail6IntMask_RMASK 0x1 +#define QIB_7322_IntMask_RcvAvail5IntMask_LSB 0x5 +#define QIB_7322_IntMask_RcvAvail5IntMask_MSB 0x5 +#define QIB_7322_IntMask_RcvAvail5IntMask_RMASK 0x1 +#define QIB_7322_IntMask_RcvAvail4IntMask_LSB 0x4 +#define QIB_7322_IntMask_RcvAvail4IntMask_MSB 0x4 +#define QIB_7322_IntMask_RcvAvail4IntMask_RMASK 0x1 +#define QIB_7322_IntMask_RcvAvail3IntMask_LSB 0x3 +#define 
QIB_7322_IntMask_RcvAvail3IntMask_MSB 0x3 +#define QIB_7322_IntMask_RcvAvail3IntMask_RMASK 0x1 +#define QIB_7322_IntMask_RcvAvail2IntMask_LSB 0x2 +#define QIB_7322_IntMask_RcvAvail2IntMask_MSB 0x2 +#define QIB_7322_IntMask_RcvAvail2IntMask_RMASK 0x1 +#define QIB_7322_IntMask_RcvAvail1IntMask_LSB 0x1 +#define QIB_7322_IntMask_RcvAvail1IntMask_MSB 0x1 +#define QIB_7322_IntMask_RcvAvail1IntMask_RMASK 0x1 +#define QIB_7322_IntMask_RcvAvail0IntMask_LSB 0x0 +#define QIB_7322_IntMask_RcvAvail0IntMask_MSB 0x0 +#define QIB_7322_IntMask_RcvAvail0IntMask_RMASK 0x1 + +#define QIB_7322_IntStatus_OFFS 0x70 +#define QIB_7322_IntStatus_DEF 0x0000000000000000 +#define QIB_7322_IntStatus_SDmaInt_1_LSB 0x3F +#define QIB_7322_IntStatus_SDmaInt_1_MSB 0x3F +#define QIB_7322_IntStatus_SDmaInt_1_RMASK 0x1 +#define QIB_7322_IntStatus_SDmaInt_0_LSB 0x3E +#define QIB_7322_IntStatus_SDmaInt_0_MSB 0x3E +#define QIB_7322_IntStatus_SDmaInt_0_RMASK 0x1 +#define QIB_7322_IntStatus_SDmaProgressInt_1_LSB 0x3D +#define QIB_7322_IntStatus_SDmaProgressInt_1_MSB 0x3D +#define QIB_7322_IntStatus_SDmaProgressInt_1_RMASK 0x1 +#define QIB_7322_IntStatus_SDmaProgressInt_0_LSB 0x3C +#define QIB_7322_IntStatus_SDmaProgressInt_0_MSB 0x3C +#define QIB_7322_IntStatus_SDmaProgressInt_0_RMASK 0x1 +#define QIB_7322_IntStatus_SDmaIdleInt_1_LSB 0x3B +#define QIB_7322_IntStatus_SDmaIdleInt_1_MSB 0x3B +#define QIB_7322_IntStatus_SDmaIdleInt_1_RMASK 0x1 +#define QIB_7322_IntStatus_SDmaIdleInt_0_LSB 0x3A +#define QIB_7322_IntStatus_SDmaIdleInt_0_MSB 0x3A +#define QIB_7322_IntStatus_SDmaIdleInt_0_RMASK 0x1 +#define QIB_7322_IntStatus_SDmaCleanupDone_1_LSB 0x39 +#define QIB_7322_IntStatus_SDmaCleanupDone_1_MSB 0x39 +#define QIB_7322_IntStatus_SDmaCleanupDone_1_RMASK 0x1 +#define QIB_7322_IntStatus_SDmaCleanupDone_0_LSB 0x38 +#define QIB_7322_IntStatus_SDmaCleanupDone_0_MSB 0x38 +#define QIB_7322_IntStatus_SDmaCleanupDone_0_RMASK 0x1 +#define QIB_7322_IntStatus_RcvUrg17_LSB 0x31 +#define QIB_7322_IntStatus_RcvUrg17_MSB 0x31 +#define QIB_7322_IntStatus_RcvUrg17_RMASK 0x1 +#define QIB_7322_IntStatus_RcvUrg16_LSB 0x30 +#define QIB_7322_IntStatus_RcvUrg16_MSB 0x30 +#define QIB_7322_IntStatus_RcvUrg16_RMASK 0x1 +#define QIB_7322_IntStatus_RcvUrg15_LSB 0x2F +#define QIB_7322_IntStatus_RcvUrg15_MSB 0x2F +#define QIB_7322_IntStatus_RcvUrg15_RMASK 0x1 +#define QIB_7322_IntStatus_RcvUrg14_LSB 0x2E +#define QIB_7322_IntStatus_RcvUrg14_MSB 0x2E +#define QIB_7322_IntStatus_RcvUrg14_RMASK 0x1 +#define QIB_7322_IntStatus_RcvUrg13_LSB 0x2D +#define QIB_7322_IntStatus_RcvUrg13_MSB 0x2D +#define QIB_7322_IntStatus_RcvUrg13_RMASK 0x1 +#define QIB_7322_IntStatus_RcvUrg12_LSB 0x2C +#define QIB_7322_IntStatus_RcvUrg12_MSB 0x2C +#define QIB_7322_IntStatus_RcvUrg12_RMASK 0x1 +#define QIB_7322_IntStatus_RcvUrg11_LSB 0x2B +#define QIB_7322_IntStatus_RcvUrg11_MSB 0x2B +#define QIB_7322_IntStatus_RcvUrg11_RMASK 0x1 +#define QIB_7322_IntStatus_RcvUrg10_LSB 0x2A +#define QIB_7322_IntStatus_RcvUrg10_MSB 0x2A +#define QIB_7322_IntStatus_RcvUrg10_RMASK 0x1 +#define QIB_7322_IntStatus_RcvUrg9_LSB 0x29 +#define QIB_7322_IntStatus_RcvUrg9_MSB 0x29 +#define QIB_7322_IntStatus_RcvUrg9_RMASK 0x1 +#define QIB_7322_IntStatus_RcvUrg8_LSB 0x28 +#define QIB_7322_IntStatus_RcvUrg8_MSB 0x28 +#define QIB_7322_IntStatus_RcvUrg8_RMASK 0x1 +#define QIB_7322_IntStatus_RcvUrg7_LSB 0x27 +#define QIB_7322_IntStatus_RcvUrg7_MSB 0x27 +#define QIB_7322_IntStatus_RcvUrg7_RMASK 0x1 +#define QIB_7322_IntStatus_RcvUrg6_LSB 0x26 +#define QIB_7322_IntStatus_RcvUrg6_MSB 0x26 +#define 
QIB_7322_IntStatus_RcvUrg6_RMASK 0x1 +#define QIB_7322_IntStatus_RcvUrg5_LSB 0x25 +#define QIB_7322_IntStatus_RcvUrg5_MSB 0x25 +#define QIB_7322_IntStatus_RcvUrg5_RMASK 0x1 +#define QIB_7322_IntStatus_RcvUrg4_LSB 0x24 +#define QIB_7322_IntStatus_RcvUrg4_MSB 0x24 +#define QIB_7322_IntStatus_RcvUrg4_RMASK 0x1 +#define QIB_7322_IntStatus_RcvUrg3_LSB 0x23 +#define QIB_7322_IntStatus_RcvUrg3_MSB 0x23 +#define QIB_7322_IntStatus_RcvUrg3_RMASK 0x1 +#define QIB_7322_IntStatus_RcvUrg2_LSB 0x22 +#define QIB_7322_IntStatus_RcvUrg2_MSB 0x22 +#define QIB_7322_IntStatus_RcvUrg2_RMASK 0x1 +#define QIB_7322_IntStatus_RcvUrg1_LSB 0x21 +#define QIB_7322_IntStatus_RcvUrg1_MSB 0x21 +#define QIB_7322_IntStatus_RcvUrg1_RMASK 0x1 +#define QIB_7322_IntStatus_RcvUrg0_LSB 0x20 +#define QIB_7322_IntStatus_RcvUrg0_MSB 0x20 +#define QIB_7322_IntStatus_RcvUrg0_RMASK 0x1 +#define QIB_7322_IntStatus_Err_1_LSB 0x1F +#define QIB_7322_IntStatus_Err_1_MSB 0x1F +#define QIB_7322_IntStatus_Err_1_RMASK 0x1 +#define QIB_7322_IntStatus_Err_0_LSB 0x1E +#define QIB_7322_IntStatus_Err_0_MSB 0x1E +#define QIB_7322_IntStatus_Err_0_RMASK 0x1 +#define QIB_7322_IntStatus_Err_LSB 0x1D +#define QIB_7322_IntStatus_Err_MSB 0x1D +#define QIB_7322_IntStatus_Err_RMASK 0x1 +#define QIB_7322_IntStatus_AssertGPIO_LSB 0x1C +#define QIB_7322_IntStatus_AssertGPIO_MSB 0x1C +#define QIB_7322_IntStatus_AssertGPIO_RMASK 0x1 +#define QIB_7322_IntStatus_SendDone_1_LSB 0x19 +#define QIB_7322_IntStatus_SendDone_1_MSB 0x19 +#define QIB_7322_IntStatus_SendDone_1_RMASK 0x1 +#define QIB_7322_IntStatus_SendDone_0_LSB 0x18 +#define QIB_7322_IntStatus_SendDone_0_MSB 0x18 +#define QIB_7322_IntStatus_SendDone_0_RMASK 0x1 +#define QIB_7322_IntStatus_SendBufAvail_LSB 0x17 +#define QIB_7322_IntStatus_SendBufAvail_MSB 0x17 +#define QIB_7322_IntStatus_SendBufAvail_RMASK 0x1 +#define QIB_7322_IntStatus_RcvAvail17_LSB 0x11 +#define QIB_7322_IntStatus_RcvAvail17_MSB 0x11 +#define QIB_7322_IntStatus_RcvAvail17_RMASK 0x1 +#define QIB_7322_IntStatus_RcvAvail16_LSB 0x10 +#define QIB_7322_IntStatus_RcvAvail16_MSB 0x10 +#define QIB_7322_IntStatus_RcvAvail16_RMASK 0x1 +#define QIB_7322_IntStatus_RcvAvail15_LSB 0xF +#define QIB_7322_IntStatus_RcvAvail15_MSB 0xF +#define QIB_7322_IntStatus_RcvAvail15_RMASK 0x1 +#define QIB_7322_IntStatus_RcvAvail14_LSB 0xE +#define QIB_7322_IntStatus_RcvAvail14_MSB 0xE +#define QIB_7322_IntStatus_RcvAvail14_RMASK 0x1 +#define QIB_7322_IntStatus_RcvAvail13_LSB 0xD +#define QIB_7322_IntStatus_RcvAvail13_MSB 0xD +#define QIB_7322_IntStatus_RcvAvail13_RMASK 0x1 +#define QIB_7322_IntStatus_RcvAvail12_LSB 0xC +#define QIB_7322_IntStatus_RcvAvail12_MSB 0xC +#define QIB_7322_IntStatus_RcvAvail12_RMASK 0x1 +#define QIB_7322_IntStatus_RcvAvail11_LSB 0xB +#define QIB_7322_IntStatus_RcvAvail11_MSB 0xB +#define QIB_7322_IntStatus_RcvAvail11_RMASK 0x1 +#define QIB_7322_IntStatus_RcvAvail10_LSB 0xA +#define QIB_7322_IntStatus_RcvAvail10_MSB 0xA +#define QIB_7322_IntStatus_RcvAvail10_RMASK 0x1 +#define QIB_7322_IntStatus_RcvAvail9_LSB 0x9 +#define QIB_7322_IntStatus_RcvAvail9_MSB 0x9 +#define QIB_7322_IntStatus_RcvAvail9_RMASK 0x1 +#define QIB_7322_IntStatus_RcvAvail8_LSB 0x8 +#define QIB_7322_IntStatus_RcvAvail8_MSB 0x8 +#define QIB_7322_IntStatus_RcvAvail8_RMASK 0x1 +#define QIB_7322_IntStatus_RcvAvail7_LSB 0x7 +#define QIB_7322_IntStatus_RcvAvail7_MSB 0x7 +#define QIB_7322_IntStatus_RcvAvail7_RMASK 0x1 +#define QIB_7322_IntStatus_RcvAvail6_LSB 0x6 +#define QIB_7322_IntStatus_RcvAvail6_MSB 0x6 +#define QIB_7322_IntStatus_RcvAvail6_RMASK 0x1 +#define 
QIB_7322_IntStatus_RcvAvail5_LSB 0x5 +#define QIB_7322_IntStatus_RcvAvail5_MSB 0x5 +#define QIB_7322_IntStatus_RcvAvail5_RMASK 0x1 +#define QIB_7322_IntStatus_RcvAvail4_LSB 0x4 +#define QIB_7322_IntStatus_RcvAvail4_MSB 0x4 +#define QIB_7322_IntStatus_RcvAvail4_RMASK 0x1 +#define QIB_7322_IntStatus_RcvAvail3_LSB 0x3 +#define QIB_7322_IntStatus_RcvAvail3_MSB 0x3 +#define QIB_7322_IntStatus_RcvAvail3_RMASK 0x1 +#define QIB_7322_IntStatus_RcvAvail2_LSB 0x2 +#define QIB_7322_IntStatus_RcvAvail2_MSB 0x2 +#define QIB_7322_IntStatus_RcvAvail2_RMASK 0x1 +#define QIB_7322_IntStatus_RcvAvail1_LSB 0x1 +#define QIB_7322_IntStatus_RcvAvail1_MSB 0x1 +#define QIB_7322_IntStatus_RcvAvail1_RMASK 0x1 +#define QIB_7322_IntStatus_RcvAvail0_LSB 0x0 +#define QIB_7322_IntStatus_RcvAvail0_MSB 0x0 +#define QIB_7322_IntStatus_RcvAvail0_RMASK 0x1 + +#define QIB_7322_IntClear_OFFS 0x78 +#define QIB_7322_IntClear_DEF 0x0000000000000000 +#define QIB_7322_IntClear_SDmaIntClear_1_LSB 0x3F +#define QIB_7322_IntClear_SDmaIntClear_1_MSB 0x3F +#define QIB_7322_IntClear_SDmaIntClear_1_RMASK 0x1 +#define QIB_7322_IntClear_SDmaIntClear_0_LSB 0x3E +#define QIB_7322_IntClear_SDmaIntClear_0_MSB 0x3E +#define QIB_7322_IntClear_SDmaIntClear_0_RMASK 0x1 +#define QIB_7322_IntClear_SDmaProgressIntClear_1_LSB 0x3D +#define QIB_7322_IntClear_SDmaProgressIntClear_1_MSB 0x3D +#define QIB_7322_IntClear_SDmaProgressIntClear_1_RMASK 0x1 +#define QIB_7322_IntClear_SDmaProgressIntClear_0_LSB 0x3C +#define QIB_7322_IntClear_SDmaProgressIntClear_0_MSB 0x3C +#define QIB_7322_IntClear_SDmaProgressIntClear_0_RMASK 0x1 +#define QIB_7322_IntClear_SDmaIdleIntClear_1_LSB 0x3B +#define QIB_7322_IntClear_SDmaIdleIntClear_1_MSB 0x3B +#define QIB_7322_IntClear_SDmaIdleIntClear_1_RMASK 0x1 +#define QIB_7322_IntClear_SDmaIdleIntClear_0_LSB 0x3A +#define QIB_7322_IntClear_SDmaIdleIntClear_0_MSB 0x3A +#define QIB_7322_IntClear_SDmaIdleIntClear_0_RMASK 0x1 +#define QIB_7322_IntClear_SDmaCleanupDoneClear_1_LSB 0x39 +#define QIB_7322_IntClear_SDmaCleanupDoneClear_1_MSB 0x39 +#define QIB_7322_IntClear_SDmaCleanupDoneClear_1_RMASK 0x1 +#define QIB_7322_IntClear_SDmaCleanupDoneClear_0_LSB 0x38 +#define QIB_7322_IntClear_SDmaCleanupDoneClear_0_MSB 0x38 +#define QIB_7322_IntClear_SDmaCleanupDoneClear_0_RMASK 0x1 +#define QIB_7322_IntClear_RcvUrg17IntClear_LSB 0x31 +#define QIB_7322_IntClear_RcvUrg17IntClear_MSB 0x31 +#define QIB_7322_IntClear_RcvUrg17IntClear_RMASK 0x1 +#define QIB_7322_IntClear_RcvUrg16IntClear_LSB 0x30 +#define QIB_7322_IntClear_RcvUrg16IntClear_MSB 0x30 +#define QIB_7322_IntClear_RcvUrg16IntClear_RMASK 0x1 +#define QIB_7322_IntClear_RcvUrg15IntClear_LSB 0x2F +#define QIB_7322_IntClear_RcvUrg15IntClear_MSB 0x2F +#define QIB_7322_IntClear_RcvUrg15IntClear_RMASK 0x1 +#define QIB_7322_IntClear_RcvUrg14IntClear_LSB 0x2E +#define QIB_7322_IntClear_RcvUrg14IntClear_MSB 0x2E +#define QIB_7322_IntClear_RcvUrg14IntClear_RMASK 0x1 +#define QIB_7322_IntClear_RcvUrg13IntClear_LSB 0x2D +#define QIB_7322_IntClear_RcvUrg13IntClear_MSB 0x2D +#define QIB_7322_IntClear_RcvUrg13IntClear_RMASK 0x1 +#define QIB_7322_IntClear_RcvUrg12IntClear_LSB 0x2C +#define QIB_7322_IntClear_RcvUrg12IntClear_MSB 0x2C +#define QIB_7322_IntClear_RcvUrg12IntClear_RMASK 0x1 +#define QIB_7322_IntClear_RcvUrg11IntClear_LSB 0x2B +#define QIB_7322_IntClear_RcvUrg11IntClear_MSB 0x2B +#define QIB_7322_IntClear_RcvUrg11IntClear_RMASK 0x1 +#define QIB_7322_IntClear_RcvUrg10IntClear_LSB 0x2A +#define QIB_7322_IntClear_RcvUrg10IntClear_MSB 0x2A +#define QIB_7322_IntClear_RcvUrg10IntClear_RMASK 0x1 
+#define QIB_7322_IntClear_RcvUrg9IntClear_LSB 0x29 +#define QIB_7322_IntClear_RcvUrg9IntClear_MSB 0x29 +#define QIB_7322_IntClear_RcvUrg9IntClear_RMASK 0x1 +#define QIB_7322_IntClear_RcvUrg8IntClear_LSB 0x28 +#define QIB_7322_IntClear_RcvUrg8IntClear_MSB 0x28 +#define QIB_7322_IntClear_RcvUrg8IntClear_RMASK 0x1 +#define QIB_7322_IntClear_RcvUrg7IntClear_LSB 0x27 +#define QIB_7322_IntClear_RcvUrg7IntClear_MSB 0x27 +#define QIB_7322_IntClear_RcvUrg7IntClear_RMASK 0x1 +#define QIB_7322_IntClear_RcvUrg6IntClear_LSB 0x26 +#define QIB_7322_IntClear_RcvUrg6IntClear_MSB 0x26 +#define QIB_7322_IntClear_RcvUrg6IntClear_RMASK 0x1 +#define QIB_7322_IntClear_RcvUrg5IntClear_LSB 0x25 +#define QIB_7322_IntClear_RcvUrg5IntClear_MSB 0x25 +#define QIB_7322_IntClear_RcvUrg5IntClear_RMASK 0x1 +#define QIB_7322_IntClear_RcvUrg4IntClear_LSB 0x24 +#define QIB_7322_IntClear_RcvUrg4IntClear_MSB 0x24 +#define QIB_7322_IntClear_RcvUrg4IntClear_RMASK 0x1 +#define QIB_7322_IntClear_RcvUrg3IntClear_LSB 0x23 +#define QIB_7322_IntClear_RcvUrg3IntClear_MSB 0x23 +#define QIB_7322_IntClear_RcvUrg3IntClear_RMASK 0x1 +#define QIB_7322_IntClear_RcvUrg2IntClear_LSB 0x22 +#define QIB_7322_IntClear_RcvUrg2IntClear_MSB 0x22 +#define QIB_7322_IntClear_RcvUrg2IntClear_RMASK 0x1 +#define QIB_7322_IntClear_RcvUrg1IntClear_LSB 0x21 +#define QIB_7322_IntClear_RcvUrg1IntClear_MSB 0x21 +#define QIB_7322_IntClear_RcvUrg1IntClear_RMASK 0x1 +#define QIB_7322_IntClear_RcvUrg0IntClear_LSB 0x20 +#define QIB_7322_IntClear_RcvUrg0IntClear_MSB 0x20 +#define QIB_7322_IntClear_RcvUrg0IntClear_RMASK 0x1 +#define QIB_7322_IntClear_ErrIntClear_1_LSB 0x1F +#define QIB_7322_IntClear_ErrIntClear_1_MSB 0x1F +#define QIB_7322_IntClear_ErrIntClear_1_RMASK 0x1 +#define QIB_7322_IntClear_ErrIntClear_0_LSB 0x1E +#define QIB_7322_IntClear_ErrIntClear_0_MSB 0x1E +#define QIB_7322_IntClear_ErrIntClear_0_RMASK 0x1 +#define QIB_7322_IntClear_ErrIntClear_LSB 0x1D +#define QIB_7322_IntClear_ErrIntClear_MSB 0x1D +#define QIB_7322_IntClear_ErrIntClear_RMASK 0x1 +#define QIB_7322_IntClear_AssertGPIOIntClear_LSB 0x1C +#define QIB_7322_IntClear_AssertGPIOIntClear_MSB 0x1C +#define QIB_7322_IntClear_AssertGPIOIntClear_RMASK 0x1 +#define QIB_7322_IntClear_SendDoneIntClear_1_LSB 0x19 +#define QIB_7322_IntClear_SendDoneIntClear_1_MSB 0x19 +#define QIB_7322_IntClear_SendDoneIntClear_1_RMASK 0x1 +#define QIB_7322_IntClear_SendDoneIntClear_0_LSB 0x18 +#define QIB_7322_IntClear_SendDoneIntClear_0_MSB 0x18 +#define QIB_7322_IntClear_SendDoneIntClear_0_RMASK 0x1 +#define QIB_7322_IntClear_SendBufAvailIntClear_LSB 0x17 +#define QIB_7322_IntClear_SendBufAvailIntClear_MSB 0x17 +#define QIB_7322_IntClear_SendBufAvailIntClear_RMASK 0x1 +#define QIB_7322_IntClear_RcvAvail17IntClear_LSB 0x11 +#define QIB_7322_IntClear_RcvAvail17IntClear_MSB 0x11 +#define QIB_7322_IntClear_RcvAvail17IntClear_RMASK 0x1 +#define QIB_7322_IntClear_RcvAvail16IntClear_LSB 0x10 +#define QIB_7322_IntClear_RcvAvail16IntClear_MSB 0x10 +#define QIB_7322_IntClear_RcvAvail16IntClear_RMASK 0x1 +#define QIB_7322_IntClear_RcvAvail15IntClear_LSB 0xF +#define QIB_7322_IntClear_RcvAvail15IntClear_MSB 0xF +#define QIB_7322_IntClear_RcvAvail15IntClear_RMASK 0x1 +#define QIB_7322_IntClear_RcvAvail14IntClear_LSB 0xE +#define QIB_7322_IntClear_RcvAvail14IntClear_MSB 0xE +#define QIB_7322_IntClear_RcvAvail14IntClear_RMASK 0x1 +#define QIB_7322_IntClear_RcvAvail13IntClear_LSB 0xD +#define QIB_7322_IntClear_RcvAvail13IntClear_MSB 0xD +#define QIB_7322_IntClear_RcvAvail13IntClear_RMASK 0x1 +#define 
QIB_7322_IntClear_RcvAvail12IntClear_LSB 0xC +#define QIB_7322_IntClear_RcvAvail12IntClear_MSB 0xC +#define QIB_7322_IntClear_RcvAvail12IntClear_RMASK 0x1 +#define QIB_7322_IntClear_RcvAvail11IntClear_LSB 0xB +#define QIB_7322_IntClear_RcvAvail11IntClear_MSB 0xB +#define QIB_7322_IntClear_RcvAvail11IntClear_RMASK 0x1 +#define QIB_7322_IntClear_RcvAvail10IntClear_LSB 0xA +#define QIB_7322_IntClear_RcvAvail10IntClear_MSB 0xA +#define QIB_7322_IntClear_RcvAvail10IntClear_RMASK 0x1 +#define QIB_7322_IntClear_RcvAvail9IntClear_LSB 0x9 +#define QIB_7322_IntClear_RcvAvail9IntClear_MSB 0x9 +#define QIB_7322_IntClear_RcvAvail9IntClear_RMASK 0x1 +#define QIB_7322_IntClear_RcvAvail8IntClear_LSB 0x8 +#define QIB_7322_IntClear_RcvAvail8IntClear_MSB 0x8 +#define QIB_7322_IntClear_RcvAvail8IntClear_RMASK 0x1 +#define QIB_7322_IntClear_RcvAvail7IntClear_LSB 0x7 +#define QIB_7322_IntClear_RcvAvail7IntClear_MSB 0x7 +#define QIB_7322_IntClear_RcvAvail7IntClear_RMASK 0x1 +#define QIB_7322_IntClear_RcvAvail6IntClear_LSB 0x6 +#define QIB_7322_IntClear_RcvAvail6IntClear_MSB 0x6 +#define QIB_7322_IntClear_RcvAvail6IntClear_RMASK 0x1 +#define QIB_7322_IntClear_RcvAvail5IntClear_LSB 0x5 +#define QIB_7322_IntClear_RcvAvail5IntClear_MSB 0x5 +#define QIB_7322_IntClear_RcvAvail5IntClear_RMASK 0x1 +#define QIB_7322_IntClear_RcvAvail4IntClear_LSB 0x4 +#define QIB_7322_IntClear_RcvAvail4IntClear_MSB 0x4 +#define QIB_7322_IntClear_RcvAvail4IntClear_RMASK 0x1 +#define QIB_7322_IntClear_RcvAvail3IntClear_LSB 0x3 +#define QIB_7322_IntClear_RcvAvail3IntClear_MSB 0x3 +#define QIB_7322_IntClear_RcvAvail3IntClear_RMASK 0x1 +#define QIB_7322_IntClear_RcvAvail2IntClear_LSB 0x2 +#define QIB_7322_IntClear_RcvAvail2IntClear_MSB 0x2 +#define QIB_7322_IntClear_RcvAvail2IntClear_RMASK 0x1 +#define QIB_7322_IntClear_RcvAvail1IntClear_LSB 0x1 +#define QIB_7322_IntClear_RcvAvail1IntClear_MSB 0x1 +#define QIB_7322_IntClear_RcvAvail1IntClear_RMASK 0x1 +#define QIB_7322_IntClear_RcvAvail0IntClear_LSB 0x0 +#define QIB_7322_IntClear_RcvAvail0IntClear_MSB 0x0 +#define QIB_7322_IntClear_RcvAvail0IntClear_RMASK 0x1 + +#define QIB_7322_ErrMask_OFFS 0x80 +#define QIB_7322_ErrMask_DEF 0x0000000000000000 +#define QIB_7322_ErrMask_ResetNegatedMask_LSB 0x3F +#define QIB_7322_ErrMask_ResetNegatedMask_MSB 0x3F +#define QIB_7322_ErrMask_ResetNegatedMask_RMASK 0x1 +#define QIB_7322_ErrMask_HardwareErrMask_LSB 0x3E +#define QIB_7322_ErrMask_HardwareErrMask_MSB 0x3E +#define QIB_7322_ErrMask_HardwareErrMask_RMASK 0x1 +#define QIB_7322_ErrMask_InvalidAddrErrMask_LSB 0x3D +#define QIB_7322_ErrMask_InvalidAddrErrMask_MSB 0x3D +#define QIB_7322_ErrMask_InvalidAddrErrMask_RMASK 0x1 +#define QIB_7322_ErrMask_SDmaVL15ErrMask_LSB 0x38 +#define QIB_7322_ErrMask_SDmaVL15ErrMask_MSB 0x38 +#define QIB_7322_ErrMask_SDmaVL15ErrMask_RMASK 0x1 +#define QIB_7322_ErrMask_SBufVL15MisUseErrMask_LSB 0x37 +#define QIB_7322_ErrMask_SBufVL15MisUseErrMask_MSB 0x37 +#define QIB_7322_ErrMask_SBufVL15MisUseErrMask_RMASK 0x1 +#define QIB_7322_ErrMask_InvalidEEPCmdMask_LSB 0x35 +#define QIB_7322_ErrMask_InvalidEEPCmdMask_MSB 0x35 +#define QIB_7322_ErrMask_InvalidEEPCmdMask_RMASK 0x1 +#define QIB_7322_ErrMask_RcvContextShareErrMask_LSB 0x34 +#define QIB_7322_ErrMask_RcvContextShareErrMask_MSB 0x34 +#define QIB_7322_ErrMask_RcvContextShareErrMask_RMASK 0x1 +#define QIB_7322_ErrMask_SendVLMismatchErrMask_LSB 0x24 +#define QIB_7322_ErrMask_SendVLMismatchErrMask_MSB 0x24 +#define QIB_7322_ErrMask_SendVLMismatchErrMask_RMASK 0x1 +#define QIB_7322_ErrMask_SendArmLaunchErrMask_LSB 0x23 +#define 
QIB_7322_ErrMask_SendArmLaunchErrMask_MSB 0x23 +#define QIB_7322_ErrMask_SendArmLaunchErrMask_RMASK 0x1 +#define QIB_7322_ErrMask_SendSpecialTriggerErrMask_LSB 0x1B +#define QIB_7322_ErrMask_SendSpecialTriggerErrMask_MSB 0x1B +#define QIB_7322_ErrMask_SendSpecialTriggerErrMask_RMASK 0x1 +#define QIB_7322_ErrMask_SDmaWrongPortErrMask_LSB 0x1A +#define QIB_7322_ErrMask_SDmaWrongPortErrMask_MSB 0x1A +#define QIB_7322_ErrMask_SDmaWrongPortErrMask_RMASK 0x1 +#define QIB_7322_ErrMask_SDmaBufMaskDuplicateErrMask_LSB 0x19 +#define QIB_7322_ErrMask_SDmaBufMaskDuplicateErrMask_MSB 0x19 +#define QIB_7322_ErrMask_SDmaBufMaskDuplicateErrMask_RMASK 0x1 +#define QIB_7322_ErrMask_RcvHdrFullErrMask_LSB 0xD +#define QIB_7322_ErrMask_RcvHdrFullErrMask_MSB 0xD +#define QIB_7322_ErrMask_RcvHdrFullErrMask_RMASK 0x1 +#define QIB_7322_ErrMask_RcvEgrFullErrMask_LSB 0xC +#define QIB_7322_ErrMask_RcvEgrFullErrMask_MSB 0xC +#define QIB_7322_ErrMask_RcvEgrFullErrMask_RMASK 0x1 + +#define QIB_7322_ErrStatus_OFFS 0x88 +#define QIB_7322_ErrStatus_DEF 0x0000000000000000 +#define QIB_7322_ErrStatus_ResetNegated_LSB 0x3F +#define QIB_7322_ErrStatus_ResetNegated_MSB 0x3F +#define QIB_7322_ErrStatus_ResetNegated_RMASK 0x1 +#define QIB_7322_ErrStatus_HardwareErr_LSB 0x3E +#define QIB_7322_ErrStatus_HardwareErr_MSB 0x3E +#define QIB_7322_ErrStatus_HardwareErr_RMASK 0x1 +#define QIB_7322_ErrStatus_InvalidAddrErr_LSB 0x3D +#define QIB_7322_ErrStatus_InvalidAddrErr_MSB 0x3D +#define QIB_7322_ErrStatus_InvalidAddrErr_RMASK 0x1 +#define QIB_7322_ErrStatus_SDmaVL15Err_LSB 0x38 +#define QIB_7322_ErrStatus_SDmaVL15Err_MSB 0x38 +#define QIB_7322_ErrStatus_SDmaVL15Err_RMASK 0x1 +#define QIB_7322_ErrStatus_SBufVL15MisUseErr_LSB 0x37 +#define QIB_7322_ErrStatus_SBufVL15MisUseErr_MSB 0x37 +#define QIB_7322_ErrStatus_SBufVL15MisUseErr_RMASK 0x1 +#define QIB_7322_ErrStatus_InvalidEEPCmdErr_LSB 0x35 +#define QIB_7322_ErrStatus_InvalidEEPCmdErr_MSB 0x35 +#define QIB_7322_ErrStatus_InvalidEEPCmdErr_RMASK 0x1 +#define QIB_7322_ErrStatus_RcvContextShareErr_LSB 0x34 +#define QIB_7322_ErrStatus_RcvContextShareErr_MSB 0x34 +#define QIB_7322_ErrStatus_RcvContextShareErr_RMASK 0x1 +#define QIB_7322_ErrStatus_SendVLMismatchErr_LSB 0x24 +#define QIB_7322_ErrStatus_SendVLMismatchErr_MSB 0x24 +#define QIB_7322_ErrStatus_SendVLMismatchErr_RMASK 0x1 +#define QIB_7322_ErrStatus_SendArmLaunchErr_LSB 0x23 +#define QIB_7322_ErrStatus_SendArmLaunchErr_MSB 0x23 +#define QIB_7322_ErrStatus_SendArmLaunchErr_RMASK 0x1 +#define QIB_7322_ErrStatus_SendSpecialTriggerErr_LSB 0x1B +#define QIB_7322_ErrStatus_SendSpecialTriggerErr_MSB 0x1B +#define QIB_7322_ErrStatus_SendSpecialTriggerErr_RMASK 0x1 +#define QIB_7322_ErrStatus_SDmaWrongPortErr_LSB 0x1A +#define QIB_7322_ErrStatus_SDmaWrongPortErr_MSB 0x1A +#define QIB_7322_ErrStatus_SDmaWrongPortErr_RMASK 0x1 +#define QIB_7322_ErrStatus_SDmaBufMaskDuplicateErr_LSB 0x19 +#define QIB_7322_ErrStatus_SDmaBufMaskDuplicateErr_MSB 0x19 +#define QIB_7322_ErrStatus_SDmaBufMaskDuplicateErr_RMASK 0x1 +#define QIB_7322_ErrStatus_RcvHdrFullErr_LSB 0xD +#define QIB_7322_ErrStatus_RcvHdrFullErr_MSB 0xD +#define QIB_7322_ErrStatus_RcvHdrFullErr_RMASK 0x1 +#define QIB_7322_ErrStatus_RcvEgrFullErr_LSB 0xC +#define QIB_7322_ErrStatus_RcvEgrFullErr_MSB 0xC +#define QIB_7322_ErrStatus_RcvEgrFullErr_RMASK 0x1 + +#define QIB_7322_ErrClear_OFFS 0x90 +#define QIB_7322_ErrClear_DEF 0x0000000000000000 +#define QIB_7322_ErrClear_ResetNegatedClear_LSB 0x3F +#define QIB_7322_ErrClear_ResetNegatedClear_MSB 0x3F +#define 
QIB_7322_ErrClear_ResetNegatedClear_RMASK 0x1 +#define QIB_7322_ErrClear_HardwareErrClear_LSB 0x3E +#define QIB_7322_ErrClear_HardwareErrClear_MSB 0x3E +#define QIB_7322_ErrClear_HardwareErrClear_RMASK 0x1 +#define QIB_7322_ErrClear_InvalidAddrErrClear_LSB 0x3D +#define QIB_7322_ErrClear_InvalidAddrErrClear_MSB 0x3D +#define QIB_7322_ErrClear_InvalidAddrErrClear_RMASK 0x1 +#define QIB_7322_ErrClear_SDmaVL15ErrClear_LSB 0x38 +#define QIB_7322_ErrClear_SDmaVL15ErrClear_MSB 0x38 +#define QIB_7322_ErrClear_SDmaVL15ErrClear_RMASK 0x1 +#define QIB_7322_ErrClear_SBufVL15MisUseErrClear_LSB 0x37 +#define QIB_7322_ErrClear_SBufVL15MisUseErrClear_MSB 0x37 +#define QIB_7322_ErrClear_SBufVL15MisUseErrClear_RMASK 0x1 +#define QIB_7322_ErrClear_InvalidEEPCmdErrClear_LSB 0x35 +#define QIB_7322_ErrClear_InvalidEEPCmdErrClear_MSB 0x35 +#define QIB_7322_ErrClear_InvalidEEPCmdErrClear_RMASK 0x1 +#define QIB_7322_ErrClear_RcvContextShareErrClear_LSB 0x34 +#define QIB_7322_ErrClear_RcvContextShareErrClear_MSB 0x34 +#define QIB_7322_ErrClear_RcvContextShareErrClear_RMASK 0x1 +#define QIB_7322_ErrClear_SendVLMismatchErrMask_LSB 0x24 +#define QIB_7322_ErrClear_SendVLMismatchErrMask_MSB 0x24 +#define QIB_7322_ErrClear_SendVLMismatchErrMask_RMASK 0x1 +#define QIB_7322_ErrClear_SendArmLaunchErrClear_LSB 0x23 +#define QIB_7322_ErrClear_SendArmLaunchErrClear_MSB 0x23 +#define QIB_7322_ErrClear_SendArmLaunchErrClear_RMASK 0x1 +#define QIB_7322_ErrClear_SendSpecialTriggerErrClear_LSB 0x1B +#define QIB_7322_ErrClear_SendSpecialTriggerErrClear_MSB 0x1B +#define QIB_7322_ErrClear_SendSpecialTriggerErrClear_RMASK 0x1 +#define QIB_7322_ErrClear_SDmaWrongPortErrClear_LSB 0x1A +#define QIB_7322_ErrClear_SDmaWrongPortErrClear_MSB 0x1A +#define QIB_7322_ErrClear_SDmaWrongPortErrClear_RMASK 0x1 +#define QIB_7322_ErrClear_SDmaBufMaskDuplicateErrClear_LSB 0x19 +#define QIB_7322_ErrClear_SDmaBufMaskDuplicateErrClear_MSB 0x19 +#define QIB_7322_ErrClear_SDmaBufMaskDuplicateErrClear_RMASK 0x1 +#define QIB_7322_ErrClear_RcvHdrFullErrClear_LSB 0xD +#define QIB_7322_ErrClear_RcvHdrFullErrClear_MSB 0xD +#define QIB_7322_ErrClear_RcvHdrFullErrClear_RMASK 0x1 +#define QIB_7322_ErrClear_RcvEgrFullErrClear_LSB 0xC +#define QIB_7322_ErrClear_RcvEgrFullErrClear_MSB 0xC +#define QIB_7322_ErrClear_RcvEgrFullErrClear_RMASK 0x1 + +#define QIB_7322_HwErrMask_OFFS 0x98 +#define QIB_7322_HwErrMask_DEF 0x0000000000000000 +#define QIB_7322_HwErrMask_IBSerdesPClkNotDetectMask_1_LSB 0x3F +#define QIB_7322_HwErrMask_IBSerdesPClkNotDetectMask_1_MSB 0x3F +#define QIB_7322_HwErrMask_IBSerdesPClkNotDetectMask_1_RMASK 0x1 +#define QIB_7322_HwErrMask_IBSerdesPClkNotDetectMask_0_LSB 0x3E +#define QIB_7322_HwErrMask_IBSerdesPClkNotDetectMask_0_MSB 0x3E +#define QIB_7322_HwErrMask_IBSerdesPClkNotDetectMask_0_RMASK 0x1 +#define QIB_7322_HwErrMask_PCIESerdesPClkNotDetectMask_LSB 0x37 +#define QIB_7322_HwErrMask_PCIESerdesPClkNotDetectMask_MSB 0x37 +#define QIB_7322_HwErrMask_PCIESerdesPClkNotDetectMask_RMASK 0x1 +#define QIB_7322_HwErrMask_PowerOnBISTFailedMask_LSB 0x36 +#define QIB_7322_HwErrMask_PowerOnBISTFailedMask_MSB 0x36 +#define QIB_7322_HwErrMask_PowerOnBISTFailedMask_RMASK 0x1 +#define QIB_7322_HwErrMask_TempsenseTholdReachedMask_LSB 0x35 +#define QIB_7322_HwErrMask_TempsenseTholdReachedMask_MSB 0x35 +#define QIB_7322_HwErrMask_TempsenseTholdReachedMask_RMASK 0x1 +#define QIB_7322_HwErrMask_MemoryErrMask_LSB 0x30 +#define QIB_7322_HwErrMask_MemoryErrMask_MSB 0x30 +#define QIB_7322_HwErrMask_MemoryErrMask_RMASK 0x1 +#define 
QIB_7322_HwErrMask_pcie_phy_txParityErr_LSB 0x22 +#define QIB_7322_HwErrMask_pcie_phy_txParityErr_MSB 0x22 +#define QIB_7322_HwErrMask_pcie_phy_txParityErr_RMASK 0x1 +#define QIB_7322_HwErrMask_PCIeBusParityErrMask_LSB 0x1F +#define QIB_7322_HwErrMask_PCIeBusParityErrMask_MSB 0x21 +#define QIB_7322_HwErrMask_PCIeBusParityErrMask_RMASK 0x7 +#define QIB_7322_HwErrMask_PcieCplTimeoutMask_LSB 0x1E +#define QIB_7322_HwErrMask_PcieCplTimeoutMask_MSB 0x1E +#define QIB_7322_HwErrMask_PcieCplTimeoutMask_RMASK 0x1 +#define QIB_7322_HwErrMask_PciePoisonedTLPMask_LSB 0x1D +#define QIB_7322_HwErrMask_PciePoisonedTLPMask_MSB 0x1D +#define QIB_7322_HwErrMask_PciePoisonedTLPMask_RMASK 0x1 +#define QIB_7322_HwErrMask_SDmaMemReadErrMask_1_LSB 0x1C +#define QIB_7322_HwErrMask_SDmaMemReadErrMask_1_MSB 0x1C +#define QIB_7322_HwErrMask_SDmaMemReadErrMask_1_RMASK 0x1 +#define QIB_7322_HwErrMask_SDmaMemReadErrMask_0_LSB 0x1B +#define QIB_7322_HwErrMask_SDmaMemReadErrMask_0_MSB 0x1B +#define QIB_7322_HwErrMask_SDmaMemReadErrMask_0_RMASK 0x1 +#define QIB_7322_HwErrMask_IBCBusFromSPCParityErrMask_1_LSB 0xF +#define QIB_7322_HwErrMask_IBCBusFromSPCParityErrMask_1_MSB 0xF +#define QIB_7322_HwErrMask_IBCBusFromSPCParityErrMask_1_RMASK 0x1 +#define QIB_7322_HwErrMask_IBCBusToSPCParityErrMask_1_LSB 0xE +#define QIB_7322_HwErrMask_IBCBusToSPCParityErrMask_1_MSB 0xE +#define QIB_7322_HwErrMask_IBCBusToSPCParityErrMask_1_RMASK 0x1 +#define QIB_7322_HwErrMask_IBCBusFromSPCParityErrMask_0_LSB 0xD +#define QIB_7322_HwErrMask_IBCBusFromSPCParityErrMask_0_MSB 0xD +#define QIB_7322_HwErrMask_IBCBusFromSPCParityErrMask_0_RMASK 0x1 +#define QIB_7322_HwErrMask_statusValidNoEopMask_LSB 0xC +#define QIB_7322_HwErrMask_statusValidNoEopMask_MSB 0xC +#define QIB_7322_HwErrMask_statusValidNoEopMask_RMASK 0x1 +#define QIB_7322_HwErrMask_LATriggeredMask_LSB 0xB +#define QIB_7322_HwErrMask_LATriggeredMask_MSB 0xB +#define QIB_7322_HwErrMask_LATriggeredMask_RMASK 0x1 + +#define QIB_7322_HwErrStatus_OFFS 0xA0 +#define QIB_7322_HwErrStatus_DEF 0x0000000000000000 +#define QIB_7322_HwErrStatus_IBSerdesPClkNotDetect_1_LSB 0x3F +#define QIB_7322_HwErrStatus_IBSerdesPClkNotDetect_1_MSB 0x3F +#define QIB_7322_HwErrStatus_IBSerdesPClkNotDetect_1_RMASK 0x1 +#define QIB_7322_HwErrStatus_IBSerdesPClkNotDetect_0_LSB 0x3E +#define QIB_7322_HwErrStatus_IBSerdesPClkNotDetect_0_MSB 0x3E +#define QIB_7322_HwErrStatus_IBSerdesPClkNotDetect_0_RMASK 0x1 +#define QIB_7322_HwErrStatus_PCIESerdesPClkNotDetect_LSB 0x37 +#define QIB_7322_HwErrStatus_PCIESerdesPClkNotDetect_MSB 0x37 +#define QIB_7322_HwErrStatus_PCIESerdesPClkNotDetect_RMASK 0x1 +#define QIB_7322_HwErrStatus_PowerOnBISTFailed_LSB 0x36 +#define QIB_7322_HwErrStatus_PowerOnBISTFailed_MSB 0x36 +#define QIB_7322_HwErrStatus_PowerOnBISTFailed_RMASK 0x1 +#define QIB_7322_HwErrStatus_TempsenseTholdReached_LSB 0x35 +#define QIB_7322_HwErrStatus_TempsenseTholdReached_MSB 0x35 +#define QIB_7322_HwErrStatus_TempsenseTholdReached_RMASK 0x1 +#define QIB_7322_HwErrStatus_MemoryErr_LSB 0x30 +#define QIB_7322_HwErrStatus_MemoryErr_MSB 0x30 +#define QIB_7322_HwErrStatus_MemoryErr_RMASK 0x1 +#define QIB_7322_HwErrStatus_pcie_phy_txParityErr_LSB 0x22 +#define QIB_7322_HwErrStatus_pcie_phy_txParityErr_MSB 0x22 +#define QIB_7322_HwErrStatus_pcie_phy_txParityErr_RMASK 0x1 +#define QIB_7322_HwErrStatus_PCIeBusParity_LSB 0x1F +#define QIB_7322_HwErrStatus_PCIeBusParity_MSB 0x21 +#define QIB_7322_HwErrStatus_PCIeBusParity_RMASK 0x7 +#define QIB_7322_HwErrStatus_PcieCplTimeout_LSB 0x1E +#define 
QIB_7322_HwErrStatus_PcieCplTimeout_MSB 0x1E +#define QIB_7322_HwErrStatus_PcieCplTimeout_RMASK 0x1 +#define QIB_7322_HwErrStatus_PciePoisonedTLP_LSB 0x1D +#define QIB_7322_HwErrStatus_PciePoisonedTLP_MSB 0x1D +#define QIB_7322_HwErrStatus_PciePoisonedTLP_RMASK 0x1 +#define QIB_7322_HwErrStatus_SDmaMemReadErr_1_LSB 0x1C +#define QIB_7322_HwErrStatus_SDmaMemReadErr_1_MSB 0x1C +#define QIB_7322_HwErrStatus_SDmaMemReadErr_1_RMASK 0x1 +#define QIB_7322_HwErrStatus_SDmaMemReadErr_0_LSB 0x1B +#define QIB_7322_HwErrStatus_SDmaMemReadErr_0_MSB 0x1B +#define QIB_7322_HwErrStatus_SDmaMemReadErr_0_RMASK 0x1 +#define QIB_7322_HwErrStatus_IBCBusFromSPCParityErr_1_LSB 0xF +#define QIB_7322_HwErrStatus_IBCBusFromSPCParityErr_1_MSB 0xF +#define QIB_7322_HwErrStatus_IBCBusFromSPCParityErr_1_RMASK 0x1 +#define QIB_7322_HwErrStatus_IBCBusToSPCParityErr_1_LSB 0xE +#define QIB_7322_HwErrStatus_IBCBusToSPCParityErr_1_MSB 0xE +#define QIB_7322_HwErrStatus_IBCBusToSPCParityErr_1_RMASK 0x1 +#define QIB_7322_HwErrStatus_IBCBusFromSPCParityErr_0_LSB 0xD +#define QIB_7322_HwErrStatus_IBCBusFromSPCParityErr_0_MSB 0xD +#define QIB_7322_HwErrStatus_IBCBusFromSPCParityErr_0_RMASK 0x1 +#define QIB_7322_HwErrStatus_statusValidNoEop_LSB 0xC +#define QIB_7322_HwErrStatus_statusValidNoEop_MSB 0xC +#define QIB_7322_HwErrStatus_statusValidNoEop_RMASK 0x1 +#define QIB_7322_HwErrStatus_LATriggered_LSB 0xB +#define QIB_7322_HwErrStatus_LATriggered_MSB 0xB +#define QIB_7322_HwErrStatus_LATriggered_RMASK 0x1 + +#define QIB_7322_HwErrClear_OFFS 0xA8 +#define QIB_7322_HwErrClear_DEF 0x0000000000000000 +#define QIB_7322_HwErrClear_IBSerdesPClkNotDetectClear_1_LSB 0x3F +#define QIB_7322_HwErrClear_IBSerdesPClkNotDetectClear_1_MSB 0x3F +#define QIB_7322_HwErrClear_IBSerdesPClkNotDetectClear_1_RMASK 0x1 +#define QIB_7322_HwErrClear_IBSerdesPClkNotDetectClear_0_LSB 0x3E +#define QIB_7322_HwErrClear_IBSerdesPClkNotDetectClear_0_MSB 0x3E +#define QIB_7322_HwErrClear_IBSerdesPClkNotDetectClear_0_RMASK 0x1 +#define QIB_7322_HwErrClear_PCIESerdesPClkNotDetectClear_LSB 0x37 +#define QIB_7322_HwErrClear_PCIESerdesPClkNotDetectClear_MSB 0x37 +#define QIB_7322_HwErrClear_PCIESerdesPClkNotDetectClear_RMASK 0x1 +#define QIB_7322_HwErrClear_PowerOnBISTFailedClear_LSB 0x36 +#define QIB_7322_HwErrClear_PowerOnBISTFailedClear_MSB 0x36 +#define QIB_7322_HwErrClear_PowerOnBISTFailedClear_RMASK 0x1 +#define QIB_7322_HwErrClear_TempsenseTholdReachedClear_LSB 0x35 +#define QIB_7322_HwErrClear_TempsenseTholdReachedClear_MSB 0x35 +#define QIB_7322_HwErrClear_TempsenseTholdReachedClear_RMASK 0x1 +#define QIB_7322_HwErrClear_MemoryErrClear_LSB 0x30 +#define QIB_7322_HwErrClear_MemoryErrClear_MSB 0x30 +#define QIB_7322_HwErrClear_MemoryErrClear_RMASK 0x1 +#define QIB_7322_HwErrClear_pcie_phy_txParityErr_LSB 0x22 +#define QIB_7322_HwErrClear_pcie_phy_txParityErr_MSB 0x22 +#define QIB_7322_HwErrClear_pcie_phy_txParityErr_RMASK 0x1 +#define QIB_7322_HwErrClear_PCIeBusParityClear_LSB 0x1F +#define QIB_7322_HwErrClear_PCIeBusParityClear_MSB 0x21 +#define QIB_7322_HwErrClear_PCIeBusParityClear_RMASK 0x7 +#define QIB_7322_HwErrClear_PcieCplTimeoutClear_LSB 0x1E +#define QIB_7322_HwErrClear_PcieCplTimeoutClear_MSB 0x1E +#define QIB_7322_HwErrClear_PcieCplTimeoutClear_RMASK 0x1 +#define QIB_7322_HwErrClear_PciePoisonedTLPClear_LSB 0x1D +#define QIB_7322_HwErrClear_PciePoisonedTLPClear_MSB 0x1D +#define QIB_7322_HwErrClear_PciePoisonedTLPClear_RMASK 0x1 +#define QIB_7322_HwErrClear_SDmaMemReadErrClear_1_LSB 0x1C +#define QIB_7322_HwErrClear_SDmaMemReadErrClear_1_MSB 0x1C 
+#define QIB_7322_HwErrClear_SDmaMemReadErrClear_1_RMASK 0x1 +#define QIB_7322_HwErrClear_SDmaMemReadErrClear_0_LSB 0x1B +#define QIB_7322_HwErrClear_SDmaMemReadErrClear_0_MSB 0x1B +#define QIB_7322_HwErrClear_SDmaMemReadErrClear_0_RMASK 0x1 +#define QIB_7322_HwErrClear_IBCBusFromSPCParityErrClear_1_LSB 0xF +#define QIB_7322_HwErrClear_IBCBusFromSPCParityErrClear_1_MSB 0xF +#define QIB_7322_HwErrClear_IBCBusFromSPCParityErrClear_1_RMASK 0x1 +#define QIB_7322_HwErrClear_IBCBusToSPCParityErrClear_1_LSB 0xE +#define QIB_7322_HwErrClear_IBCBusToSPCParityErrClear_1_MSB 0xE +#define QIB_7322_HwErrClear_IBCBusToSPCParityErrClear_1_RMASK 0x1 +#define QIB_7322_HwErrClear_IBCBusFromSPCParityErrClear_0_LSB 0xD +#define QIB_7322_HwErrClear_IBCBusFromSPCParityErrClear_0_MSB 0xD +#define QIB_7322_HwErrClear_IBCBusFromSPCParityErrClear_0_RMASK 0x1 +#define QIB_7322_HwErrClear_statusValidNoEopClear_LSB 0xC +#define QIB_7322_HwErrClear_statusValidNoEopClear_MSB 0xC +#define QIB_7322_HwErrClear_statusValidNoEopClear_RMASK 0x1 +#define QIB_7322_HwErrClear_LATriggeredClear_LSB 0xB +#define QIB_7322_HwErrClear_LATriggeredClear_MSB 0xB +#define QIB_7322_HwErrClear_LATriggeredClear_RMASK 0x1 + +#define QIB_7322_HwDiagCtrl_OFFS 0xB0 +#define QIB_7322_HwDiagCtrl_DEF 0x0000000000000000 +#define QIB_7322_HwDiagCtrl_Diagnostic_LSB 0x3F +#define QIB_7322_HwDiagCtrl_Diagnostic_MSB 0x3F +#define QIB_7322_HwDiagCtrl_Diagnostic_RMASK 0x1 +#define QIB_7322_HwDiagCtrl_CounterWrEnable_LSB 0x3D +#define QIB_7322_HwDiagCtrl_CounterWrEnable_MSB 0x3D +#define QIB_7322_HwDiagCtrl_CounterWrEnable_RMASK 0x1 +#define QIB_7322_HwDiagCtrl_CounterDisable_LSB 0x3C +#define QIB_7322_HwDiagCtrl_CounterDisable_MSB 0x3C +#define QIB_7322_HwDiagCtrl_CounterDisable_RMASK 0x1 +#define QIB_7322_HwDiagCtrl_forcePCIeBusParity_LSB 0x1F +#define QIB_7322_HwDiagCtrl_forcePCIeBusParity_MSB 0x22 +#define QIB_7322_HwDiagCtrl_forcePCIeBusParity_RMASK 0xF +#define QIB_7322_HwDiagCtrl_ForceIBCBusFromSPCParityErr_1_LSB 0xF +#define QIB_7322_HwDiagCtrl_ForceIBCBusFromSPCParityErr_1_MSB 0xF +#define QIB_7322_HwDiagCtrl_ForceIBCBusFromSPCParityErr_1_RMASK 0x1 +#define QIB_7322_HwDiagCtrl_ForceIBCBusToSPCParityErr_1_LSB 0xE +#define QIB_7322_HwDiagCtrl_ForceIBCBusToSPCParityErr_1_MSB 0xE +#define QIB_7322_HwDiagCtrl_ForceIBCBusToSPCParityErr_1_RMASK 0x1 +#define QIB_7322_HwDiagCtrl_ForceIBCBusFromSPCParityErr_0_LSB 0xD +#define QIB_7322_HwDiagCtrl_ForceIBCBusFromSPCParityErr_0_MSB 0xD +#define QIB_7322_HwDiagCtrl_ForceIBCBusFromSPCParityErr_0_RMASK 0x1 +#define QIB_7322_HwDiagCtrl_ForceIBCBusToSPCParityErr_0_LSB 0xC +#define QIB_7322_HwDiagCtrl_ForceIBCBusToSPCParityErr_0_MSB 0xC +#define QIB_7322_HwDiagCtrl_ForceIBCBusToSPCParityErr_0_RMASK 0x1 + +#define QIB_7322_EXTStatus_OFFS 0xC0 +#define QIB_7322_EXTStatus_DEF 0x000000000000X000 +#define QIB_7322_EXTStatus_GPIOIn_LSB 0x30 +#define QIB_7322_EXTStatus_GPIOIn_MSB 0x3F +#define QIB_7322_EXTStatus_GPIOIn_RMASK 0xFFFF +#define QIB_7322_EXTStatus_MemBISTDisabled_LSB 0xF +#define QIB_7322_EXTStatus_MemBISTDisabled_MSB 0xF +#define QIB_7322_EXTStatus_MemBISTDisabled_RMASK 0x1 +#define QIB_7322_EXTStatus_MemBISTEndTest_LSB 0xE +#define QIB_7322_EXTStatus_MemBISTEndTest_MSB 0xE +#define QIB_7322_EXTStatus_MemBISTEndTest_RMASK 0x1 + +#define QIB_7322_EXTCtrl_OFFS 0xC8 +#define QIB_7322_EXTCtrl_DEF 0x0000000000000000 +#define QIB_7322_EXTCtrl_GPIOOe_LSB 0x30 +#define QIB_7322_EXTCtrl_GPIOOe_MSB 0x3F +#define QIB_7322_EXTCtrl_GPIOOe_RMASK 0xFFFF +#define QIB_7322_EXTCtrl_GPIOInvert_LSB 0x20 +#define 
QIB_7322_EXTCtrl_GPIOInvert_MSB 0x2F +#define QIB_7322_EXTCtrl_GPIOInvert_RMASK 0xFFFF +#define QIB_7322_EXTCtrl_LEDPort1GreenOn_LSB 0x3 +#define QIB_7322_EXTCtrl_LEDPort1GreenOn_MSB 0x3 +#define QIB_7322_EXTCtrl_LEDPort1GreenOn_RMASK 0x1 +#define QIB_7322_EXTCtrl_LEDPort1YellowOn_LSB 0x2 +#define QIB_7322_EXTCtrl_LEDPort1YellowOn_MSB 0x2 +#define QIB_7322_EXTCtrl_LEDPort1YellowOn_RMASK 0x1 +#define QIB_7322_EXTCtrl_LEDPort0GreenOn_LSB 0x1 +#define QIB_7322_EXTCtrl_LEDPort0GreenOn_MSB 0x1 +#define QIB_7322_EXTCtrl_LEDPort0GreenOn_RMASK 0x1 +#define QIB_7322_EXTCtrl_LEDPort0YellowOn_LSB 0x0 +#define QIB_7322_EXTCtrl_LEDPort0YellowOn_MSB 0x0 +#define QIB_7322_EXTCtrl_LEDPort0YellowOn_RMASK 0x1 + +#define QIB_7322_GPIOOut_OFFS 0xE0 +#define QIB_7322_GPIOOut_DEF 0x0000000000000000 + +#define QIB_7322_GPIOMask_OFFS 0xE8 +#define QIB_7322_GPIOMask_DEF 0x0000000000000000 + +#define QIB_7322_GPIOStatus_OFFS 0xF0 +#define QIB_7322_GPIOStatus_DEF 0x0000000000000000 + +#define QIB_7322_GPIOClear_OFFS 0xF8 +#define QIB_7322_GPIOClear_DEF 0x0000000000000000 + +#define QIB_7322_RcvCtrl_OFFS 0x100 +#define QIB_7322_RcvCtrl_DEF 0x0000000000000000 +#define QIB_7322_RcvCtrl_TidReDirect_LSB 0x30 +#define QIB_7322_RcvCtrl_TidReDirect_MSB 0x3F +#define QIB_7322_RcvCtrl_TidReDirect_RMASK 0xFFFF +#define QIB_7322_RcvCtrl_TailUpd_LSB 0x2F +#define QIB_7322_RcvCtrl_TailUpd_MSB 0x2F +#define QIB_7322_RcvCtrl_TailUpd_RMASK 0x1 +#define QIB_7322_RcvCtrl_XrcTypeCode_LSB 0x2C +#define QIB_7322_RcvCtrl_XrcTypeCode_MSB 0x2E +#define QIB_7322_RcvCtrl_XrcTypeCode_RMASK 0x7 +#define QIB_7322_RcvCtrl_TidFlowEnable_LSB 0x2B +#define QIB_7322_RcvCtrl_TidFlowEnable_MSB 0x2B +#define QIB_7322_RcvCtrl_TidFlowEnable_RMASK 0x1 +#define QIB_7322_RcvCtrl_ContextCfg_LSB 0x29 +#define QIB_7322_RcvCtrl_ContextCfg_MSB 0x2A +#define QIB_7322_RcvCtrl_ContextCfg_RMASK 0x3 +#define QIB_7322_RcvCtrl_IntrAvail_LSB 0x14 +#define QIB_7322_RcvCtrl_IntrAvail_MSB 0x25 +#define QIB_7322_RcvCtrl_IntrAvail_RMASK 0x3FFFF +#define QIB_7322_RcvCtrl_dontDropRHQFull_LSB 0x0 +#define QIB_7322_RcvCtrl_dontDropRHQFull_MSB 0x11 +#define QIB_7322_RcvCtrl_dontDropRHQFull_RMASK 0x3FFFF + +#define QIB_7322_RcvHdrSize_OFFS 0x110 +#define QIB_7322_RcvHdrSize_DEF 0x0000000000000000 + +#define QIB_7322_RcvHdrCnt_OFFS 0x118 +#define QIB_7322_RcvHdrCnt_DEF 0x0000000000000000 + +#define QIB_7322_RcvHdrEntSize_OFFS 0x120 +#define QIB_7322_RcvHdrEntSize_DEF 0x0000000000000000 + +#define QIB_7322_RcvTIDBase_OFFS 0x128 +#define QIB_7322_RcvTIDBase_DEF 0x0000000000050000 + +#define QIB_7322_RcvTIDCnt_OFFS 0x130 +#define QIB_7322_RcvTIDCnt_DEF 0x0000000000000200 + +#define QIB_7322_RcvEgrBase_OFFS 0x138 +#define QIB_7322_RcvEgrBase_DEF 0x0000000000014000 + +#define QIB_7322_RcvEgrCnt_OFFS 0x140 +#define QIB_7322_RcvEgrCnt_DEF 0x0000000000001000 + +#define QIB_7322_RcvBufBase_OFFS 0x148 +#define QIB_7322_RcvBufBase_DEF 0x0000000000080000 + +#define QIB_7322_RcvBufSize_OFFS 0x150 +#define QIB_7322_RcvBufSize_DEF 0x0000000000005000 + +#define QIB_7322_RxIntMemBase_OFFS 0x158 +#define QIB_7322_RxIntMemBase_DEF 0x0000000000077000 + +#define QIB_7322_RxIntMemSize_OFFS 0x160 +#define QIB_7322_RxIntMemSize_DEF 0x0000000000007000 + +#define QIB_7322_feature_mask_OFFS 0x190 +#define QIB_7322_feature_mask_DEF 0x00000000000000XX + +#define QIB_7322_active_feature_mask_OFFS 0x198 +#define QIB_7322_active_feature_mask_DEF 0x00000000000000XX +#define QIB_7322_active_feature_mask_Port1_QDR_Enabled_LSB 0x5 +#define QIB_7322_active_feature_mask_Port1_QDR_Enabled_MSB 0x5 +#define 
QIB_7322_active_feature_mask_Port1_QDR_Enabled_RMASK 0x1 +#define QIB_7322_active_feature_mask_Port1_DDR_Enabled_LSB 0x4 +#define QIB_7322_active_feature_mask_Port1_DDR_Enabled_MSB 0x4 +#define QIB_7322_active_feature_mask_Port1_DDR_Enabled_RMASK 0x1 +#define QIB_7322_active_feature_mask_Port1_SDR_Enabled_LSB 0x3 +#define QIB_7322_active_feature_mask_Port1_SDR_Enabled_MSB 0x3 +#define QIB_7322_active_feature_mask_Port1_SDR_Enabled_RMASK 0x1 +#define QIB_7322_active_feature_mask_Port0_QDR_Enabled_LSB 0x2 +#define QIB_7322_active_feature_mask_Port0_QDR_Enabled_MSB 0x2 +#define QIB_7322_active_feature_mask_Port0_QDR_Enabled_RMASK 0x1 +#define QIB_7322_active_feature_mask_Port0_DDR_Enabled_LSB 0x1 +#define QIB_7322_active_feature_mask_Port0_DDR_Enabled_MSB 0x1 +#define QIB_7322_active_feature_mask_Port0_DDR_Enabled_RMASK 0x1 +#define QIB_7322_active_feature_mask_Port0_SDR_Enabled_LSB 0x0 +#define QIB_7322_active_feature_mask_Port0_SDR_Enabled_MSB 0x0 +#define QIB_7322_active_feature_mask_Port0_SDR_Enabled_RMASK 0x1 + +#define QIB_7322_SendCtrl_OFFS 0x1C0 +#define QIB_7322_SendCtrl_DEF 0x0000000000000000 +#define QIB_7322_SendCtrl_Disarm_LSB 0x1F +#define QIB_7322_SendCtrl_Disarm_MSB 0x1F +#define QIB_7322_SendCtrl_Disarm_RMASK 0x1 +#define QIB_7322_SendCtrl_SendBufAvailPad64Byte_LSB 0x1D +#define QIB_7322_SendCtrl_SendBufAvailPad64Byte_MSB 0x1D +#define QIB_7322_SendCtrl_SendBufAvailPad64Byte_RMASK 0x1 +#define QIB_7322_SendCtrl_AvailUpdThld_LSB 0x18 +#define QIB_7322_SendCtrl_AvailUpdThld_MSB 0x1C +#define QIB_7322_SendCtrl_AvailUpdThld_RMASK 0x1F +#define QIB_7322_SendCtrl_DisarmSendBuf_LSB 0x10 +#define QIB_7322_SendCtrl_DisarmSendBuf_MSB 0x17 +#define QIB_7322_SendCtrl_DisarmSendBuf_RMASK 0xFF +#define QIB_7322_SendCtrl_SpecialTriggerEn_LSB 0x4 +#define QIB_7322_SendCtrl_SpecialTriggerEn_MSB 0x4 +#define QIB_7322_SendCtrl_SpecialTriggerEn_RMASK 0x1 +#define QIB_7322_SendCtrl_SendBufAvailUpd_LSB 0x2 +#define QIB_7322_SendCtrl_SendBufAvailUpd_MSB 0x2 +#define QIB_7322_SendCtrl_SendBufAvailUpd_RMASK 0x1 +#define QIB_7322_SendCtrl_SendIntBufAvail_LSB 0x1 +#define QIB_7322_SendCtrl_SendIntBufAvail_MSB 0x1 +#define QIB_7322_SendCtrl_SendIntBufAvail_RMASK 0x1 + +#define QIB_7322_SendBufBase_OFFS 0x1C8 +#define QIB_7322_SendBufBase_DEF 0x0018000000100000 +#define QIB_7322_SendBufBase_BaseAddr_LargePIO_LSB 0x20 +#define QIB_7322_SendBufBase_BaseAddr_LargePIO_MSB 0x34 +#define QIB_7322_SendBufBase_BaseAddr_LargePIO_RMASK 0x1FFFFF +#define QIB_7322_SendBufBase_BaseAddr_SmallPIO_LSB 0x0 +#define QIB_7322_SendBufBase_BaseAddr_SmallPIO_MSB 0x14 +#define QIB_7322_SendBufBase_BaseAddr_SmallPIO_RMASK 0x1FFFFF + +#define QIB_7322_SendBufSize_OFFS 0x1D0 +#define QIB_7322_SendBufSize_DEF 0x0000108000000880 +#define QIB_7322_SendBufSize_Size_LargePIO_LSB 0x20 +#define QIB_7322_SendBufSize_Size_LargePIO_MSB 0x2C +#define QIB_7322_SendBufSize_Size_LargePIO_RMASK 0x1FFF +#define QIB_7322_SendBufSize_Size_SmallPIO_LSB 0x0 +#define QIB_7322_SendBufSize_Size_SmallPIO_MSB 0xB +#define QIB_7322_SendBufSize_Size_SmallPIO_RMASK 0xFFF + +#define QIB_7322_SendBufCnt_OFFS 0x1D8 +#define QIB_7322_SendBufCnt_DEF 0x0000002000000080 +#define QIB_7322_SendBufCnt_Num_LargeBuffers_LSB 0x20 +#define QIB_7322_SendBufCnt_Num_LargeBuffers_MSB 0x25 +#define QIB_7322_SendBufCnt_Num_LargeBuffers_RMASK 0x3F +#define QIB_7322_SendBufCnt_Num_SmallBuffers_LSB 0x0 +#define QIB_7322_SendBufCnt_Num_SmallBuffers_MSB 0x8 +#define QIB_7322_SendBufCnt_Num_SmallBuffers_RMASK 0x1FF + +#define QIB_7322_SendBufAvailAddr_OFFS 0x1E0 +#define 
QIB_7322_SendBufAvailAddr_DEF 0x0000000000000000 +#define QIB_7322_SendBufAvailAddr_SendBufAvailAddr_LSB 0x6 +#define QIB_7322_SendBufAvailAddr_SendBufAvailAddr_MSB 0x27 +#define QIB_7322_SendBufAvailAddr_SendBufAvailAddr_RMASK 0x3FFFFFFFF + +#define QIB_7322_SendBufErr0_OFFS 0x240 +#define QIB_7322_SendBufErr0_DEF 0x0000000000000000 +#define QIB_7322_SendBufErr0_SendBufErr_63_0_LSB 0x0 +#define QIB_7322_SendBufErr0_SendBufErr_63_0_MSB 0x3F +#define QIB_7322_SendBufErr0_SendBufErr_63_0_RMASK 0x0 + +#define QIB_7322_AvailUpdCount_OFFS 0x268 +#define QIB_7322_AvailUpdCount_DEF 0x0000000000000000 +#define QIB_7322_AvailUpdCount_AvailUpdCount_LSB 0x0 +#define QIB_7322_AvailUpdCount_AvailUpdCount_MSB 0x4 +#define QIB_7322_AvailUpdCount_AvailUpdCount_RMASK 0x1F + +#define QIB_7322_RcvHdrAddr0_OFFS 0x280 +#define QIB_7322_RcvHdrAddr0_DEF 0x0000000000000000 +#define QIB_7322_RcvHdrAddr0_RcvHdrAddr_LSB 0x2 +#define QIB_7322_RcvHdrAddr0_RcvHdrAddr_MSB 0x27 +#define QIB_7322_RcvHdrAddr0_RcvHdrAddr_RMASK 0x3FFFFFFFFF + +#define QIB_7322_RcvHdrTailAddr0_OFFS 0x340 +#define QIB_7322_RcvHdrTailAddr0_DEF 0x0000000000000000 +#define QIB_7322_RcvHdrTailAddr0_RcvHdrTailAddr_LSB 0x2 +#define QIB_7322_RcvHdrTailAddr0_RcvHdrTailAddr_MSB 0x27 +#define QIB_7322_RcvHdrTailAddr0_RcvHdrTailAddr_RMASK 0x3FFFFFFFFF + +#define QIB_7322_ahb_access_ctrl_OFFS 0x460 +#define QIB_7322_ahb_access_ctrl_DEF 0x0000000000000000 +#define QIB_7322_ahb_access_ctrl_sw_sel_ahb_trgt_LSB 0x1 +#define QIB_7322_ahb_access_ctrl_sw_sel_ahb_trgt_MSB 0x2 +#define QIB_7322_ahb_access_ctrl_sw_sel_ahb_trgt_RMASK 0x3 +#define QIB_7322_ahb_access_ctrl_sw_ahb_sel_LSB 0x0 +#define QIB_7322_ahb_access_ctrl_sw_ahb_sel_MSB 0x0 +#define QIB_7322_ahb_access_ctrl_sw_ahb_sel_RMASK 0x1 + +#define QIB_7322_ahb_transaction_reg_OFFS 0x468 +#define QIB_7322_ahb_transaction_reg_DEF 0x0000000080000000 +#define QIB_7322_ahb_transaction_reg_ahb_data_LSB 0x20 +#define QIB_7322_ahb_transaction_reg_ahb_data_MSB 0x3F +#define QIB_7322_ahb_transaction_reg_ahb_data_RMASK 0xFFFFFFFF +#define QIB_7322_ahb_transaction_reg_ahb_rdy_LSB 0x1F +#define QIB_7322_ahb_transaction_reg_ahb_rdy_MSB 0x1F +#define QIB_7322_ahb_transaction_reg_ahb_rdy_RMASK 0x1 +#define QIB_7322_ahb_transaction_reg_ahb_req_err_LSB 0x1E +#define QIB_7322_ahb_transaction_reg_ahb_req_err_MSB 0x1E +#define QIB_7322_ahb_transaction_reg_ahb_req_err_RMASK 0x1 +#define QIB_7322_ahb_transaction_reg_write_not_read_LSB 0x1B +#define QIB_7322_ahb_transaction_reg_write_not_read_MSB 0x1B +#define QIB_7322_ahb_transaction_reg_write_not_read_RMASK 0x1 +#define QIB_7322_ahb_transaction_reg_ahb_address_LSB 0x10 +#define QIB_7322_ahb_transaction_reg_ahb_address_MSB 0x1A +#define QIB_7322_ahb_transaction_reg_ahb_address_RMASK 0x7FF + +#define QIB_7322_SPC_JTAG_ACCESS_REG_OFFS 0x470 +#define QIB_7322_SPC_JTAG_ACCESS_REG_DEF 0x0000000000000001 +#define QIB_7322_SPC_JTAG_ACCESS_REG_SPC_JTAG_ACCESS_EN_LSB 0xA +#define QIB_7322_SPC_JTAG_ACCESS_REG_SPC_JTAG_ACCESS_EN_MSB 0xA +#define QIB_7322_SPC_JTAG_ACCESS_REG_SPC_JTAG_ACCESS_EN_RMASK 0x1 +#define QIB_7322_SPC_JTAG_ACCESS_REG_bist_en_LSB 0x5 +#define QIB_7322_SPC_JTAG_ACCESS_REG_bist_en_MSB 0x9 +#define QIB_7322_SPC_JTAG_ACCESS_REG_bist_en_RMASK 0x1F +#define QIB_7322_SPC_JTAG_ACCESS_REG_opcode_LSB 0x3 +#define QIB_7322_SPC_JTAG_ACCESS_REG_opcode_MSB 0x4 +#define QIB_7322_SPC_JTAG_ACCESS_REG_opcode_RMASK 0x3 +#define QIB_7322_SPC_JTAG_ACCESS_REG_tdi_LSB 0x2 +#define QIB_7322_SPC_JTAG_ACCESS_REG_tdi_MSB 0x2 +#define QIB_7322_SPC_JTAG_ACCESS_REG_tdi_RMASK 0x1 +#define 
QIB_7322_SPC_JTAG_ACCESS_REG_tdo_LSB 0x1 +#define QIB_7322_SPC_JTAG_ACCESS_REG_tdo_MSB 0x1 +#define QIB_7322_SPC_JTAG_ACCESS_REG_tdo_RMASK 0x1 +#define QIB_7322_SPC_JTAG_ACCESS_REG_rdy_LSB 0x0 +#define QIB_7322_SPC_JTAG_ACCESS_REG_rdy_MSB 0x0 +#define QIB_7322_SPC_JTAG_ACCESS_REG_rdy_RMASK 0x1 + +#define QIB_7322_SendCheckMask0_OFFS 0x4C0 +#define QIB_7322_SendCheckMask0_DEF 0x0000000000000000 +#define QIB_7322_SendCheckMask0_SendCheckMask_63_32_LSB 0x0 +#define QIB_7322_SendCheckMask0_SendCheckMask_63_32_MSB 0x3F +#define QIB_7322_SendCheckMask0_SendCheckMask_63_32_RMASK 0x0 + +#define QIB_7322_SendGRHCheckMask0_OFFS 0x4E0 +#define QIB_7322_SendGRHCheckMask0_DEF 0x0000000000000000 +#define QIB_7322_SendGRHCheckMask0_SendGRHCheckMask_63_32_LSB 0x0 +#define QIB_7322_SendGRHCheckMask0_SendGRHCheckMask_63_32_MSB 0x3F +#define QIB_7322_SendGRHCheckMask0_SendGRHCheckMask_63_32_RMASK 0x0 + +#define QIB_7322_SendIBPacketMask0_OFFS 0x500 +#define QIB_7322_SendIBPacketMask0_DEF 0x0000000000000000 +#define QIB_7322_SendIBPacketMask0_SendIBPacketMask_63_32_LSB 0x0 +#define QIB_7322_SendIBPacketMask0_SendIBPacketMask_63_32_MSB 0x3F +#define QIB_7322_SendIBPacketMask0_SendIBPacketMask_63_32_RMASK 0x0 + +#define QIB_7322_IntRedirect0_OFFS 0x540 +#define QIB_7322_IntRedirect0_DEF 0x0000000000000000 +#define QIB_7322_IntRedirect0_vec11_LSB 0x37 +#define QIB_7322_IntRedirect0_vec11_MSB 0x3B +#define QIB_7322_IntRedirect0_vec11_RMASK 0x1F +#define QIB_7322_IntRedirect0_vec10_LSB 0x32 +#define QIB_7322_IntRedirect0_vec10_MSB 0x36 +#define QIB_7322_IntRedirect0_vec10_RMASK 0x1F +#define QIB_7322_IntRedirect0_vec9_LSB 0x2D +#define QIB_7322_IntRedirect0_vec9_MSB 0x31 +#define QIB_7322_IntRedirect0_vec9_RMASK 0x1F +#define QIB_7322_IntRedirect0_vec8_LSB 0x28 +#define QIB_7322_IntRedirect0_vec8_MSB 0x2C +#define QIB_7322_IntRedirect0_vec8_RMASK 0x1F +#define QIB_7322_IntRedirect0_vec7_LSB 0x23 +#define QIB_7322_IntRedirect0_vec7_MSB 0x27 +#define QIB_7322_IntRedirect0_vec7_RMASK 0x1F +#define QIB_7322_IntRedirect0_vec6_LSB 0x1E +#define QIB_7322_IntRedirect0_vec6_MSB 0x22 +#define QIB_7322_IntRedirect0_vec6_RMASK 0x1F +#define QIB_7322_IntRedirect0_vec5_LSB 0x19 +#define QIB_7322_IntRedirect0_vec5_MSB 0x1D +#define QIB_7322_IntRedirect0_vec5_RMASK 0x1F +#define QIB_7322_IntRedirect0_vec4_LSB 0x14 +#define QIB_7322_IntRedirect0_vec4_MSB 0x18 +#define QIB_7322_IntRedirect0_vec4_RMASK 0x1F +#define QIB_7322_IntRedirect0_vec3_LSB 0xF +#define QIB_7322_IntRedirect0_vec3_MSB 0x13 +#define QIB_7322_IntRedirect0_vec3_RMASK 0x1F +#define QIB_7322_IntRedirect0_vec2_LSB 0xA +#define QIB_7322_IntRedirect0_vec2_MSB 0xE +#define QIB_7322_IntRedirect0_vec2_RMASK 0x1F +#define QIB_7322_IntRedirect0_vec1_LSB 0x5 +#define QIB_7322_IntRedirect0_vec1_MSB 0x9 +#define QIB_7322_IntRedirect0_vec1_RMASK 0x1F +#define QIB_7322_IntRedirect0_vec0_LSB 0x0 +#define QIB_7322_IntRedirect0_vec0_MSB 0x4 +#define QIB_7322_IntRedirect0_vec0_RMASK 0x1F + +#define QIB_7322_Int_Granted_OFFS 0x570 +#define QIB_7322_Int_Granted_DEF 0x0000000000000000 + +#define QIB_7322_vec_clr_without_int_OFFS 0x578 +#define QIB_7322_vec_clr_without_int_DEF 0x0000000000000000 + +#define QIB_7322_DCACtrlA_OFFS 0x580 +#define QIB_7322_DCACtrlA_DEF 0x0000000000000000 +#define QIB_7322_DCACtrlA_SendDMAHead1DCAEnable_LSB 0x4 +#define QIB_7322_DCACtrlA_SendDMAHead1DCAEnable_MSB 0x4 +#define QIB_7322_DCACtrlA_SendDMAHead1DCAEnable_RMASK 0x1 +#define QIB_7322_DCACtrlA_SendDMAHead0DCAEnable_LSB 0x3 +#define QIB_7322_DCACtrlA_SendDMAHead0DCAEnable_MSB 0x3 +#define 
QIB_7322_DCACtrlA_SendDMAHead0DCAEnable_RMASK 0x1 +#define QIB_7322_DCACtrlA_RcvTailUpdDCAEnable_LSB 0x2 +#define QIB_7322_DCACtrlA_RcvTailUpdDCAEnable_MSB 0x2 +#define QIB_7322_DCACtrlA_RcvTailUpdDCAEnable_RMASK 0x1 +#define QIB_7322_DCACtrlA_EagerDCAEnable_LSB 0x1 +#define QIB_7322_DCACtrlA_EagerDCAEnable_MSB 0x1 +#define QIB_7322_DCACtrlA_EagerDCAEnable_RMASK 0x1 +#define QIB_7322_DCACtrlA_RcvHdrqDCAEnable_LSB 0x0 +#define QIB_7322_DCACtrlA_RcvHdrqDCAEnable_MSB 0x0 +#define QIB_7322_DCACtrlA_RcvHdrqDCAEnable_RMASK 0x1 + +#define QIB_7322_DCACtrlB_OFFS 0x588 +#define QIB_7322_DCACtrlB_DEF 0x0000000000000000 +#define QIB_7322_DCACtrlB_RcvHdrq3DCAXfrCnt_LSB 0x36 +#define QIB_7322_DCACtrlB_RcvHdrq3DCAXfrCnt_MSB 0x3B +#define QIB_7322_DCACtrlB_RcvHdrq3DCAXfrCnt_RMASK 0x3F +#define QIB_7322_DCACtrlB_RcvHdrq3DCAOPH_LSB 0x2E +#define QIB_7322_DCACtrlB_RcvHdrq3DCAOPH_MSB 0x35 +#define QIB_7322_DCACtrlB_RcvHdrq3DCAOPH_RMASK 0xFF +#define QIB_7322_DCACtrlB_RcvHdrq2DCAXfrCnt_LSB 0x28 +#define QIB_7322_DCACtrlB_RcvHdrq2DCAXfrCnt_MSB 0x2D +#define QIB_7322_DCACtrlB_RcvHdrq2DCAXfrCnt_RMASK 0x3F +#define QIB_7322_DCACtrlB_RcvHdrq2DCAOPH_LSB 0x20 +#define QIB_7322_DCACtrlB_RcvHdrq2DCAOPH_MSB 0x27 +#define QIB_7322_DCACtrlB_RcvHdrq2DCAOPH_RMASK 0xFF +#define QIB_7322_DCACtrlB_RcvHdrq1DCAXfrCnt_LSB 0x16 +#define QIB_7322_DCACtrlB_RcvHdrq1DCAXfrCnt_MSB 0x1B +#define QIB_7322_DCACtrlB_RcvHdrq1DCAXfrCnt_RMASK 0x3F +#define QIB_7322_DCACtrlB_RcvHdrq1DCAOPH_LSB 0xE +#define QIB_7322_DCACtrlB_RcvHdrq1DCAOPH_MSB 0x15 +#define QIB_7322_DCACtrlB_RcvHdrq1DCAOPH_RMASK 0xFF +#define QIB_7322_DCACtrlB_RcvHdrq0DCAXfrCnt_LSB 0x8 +#define QIB_7322_DCACtrlB_RcvHdrq0DCAXfrCnt_MSB 0xD +#define QIB_7322_DCACtrlB_RcvHdrq0DCAXfrCnt_RMASK 0x3F +#define QIB_7322_DCACtrlB_RcvHdrq0DCAOPH_LSB 0x0 +#define QIB_7322_DCACtrlB_RcvHdrq0DCAOPH_MSB 0x7 +#define QIB_7322_DCACtrlB_RcvHdrq0DCAOPH_RMASK 0xFF + +#define QIB_7322_DCACtrlC_OFFS 0x590 +#define QIB_7322_DCACtrlC_DEF 0x0000000000000000 +#define QIB_7322_DCACtrlC_RcvHdrq7DCAXfrCnt_LSB 0x36 +#define QIB_7322_DCACtrlC_RcvHdrq7DCAXfrCnt_MSB 0x3B +#define QIB_7322_DCACtrlC_RcvHdrq7DCAXfrCnt_RMASK 0x3F +#define QIB_7322_DCACtrlC_RcvHdrq7DCAOPH_LSB 0x2E +#define QIB_7322_DCACtrlC_RcvHdrq7DCAOPH_MSB 0x35 +#define QIB_7322_DCACtrlC_RcvHdrq7DCAOPH_RMASK 0xFF +#define QIB_7322_DCACtrlC_RcvHdrq6DCAXfrCnt_LSB 0x28 +#define QIB_7322_DCACtrlC_RcvHdrq6DCAXfrCnt_MSB 0x2D +#define QIB_7322_DCACtrlC_RcvHdrq6DCAXfrCnt_RMASK 0x3F +#define QIB_7322_DCACtrlC_RcvHdrq6DCAOPH_LSB 0x20 +#define QIB_7322_DCACtrlC_RcvHdrq6DCAOPH_MSB 0x27 +#define QIB_7322_DCACtrlC_RcvHdrq6DCAOPH_RMASK 0xFF +#define QIB_7322_DCACtrlC_RcvHdrq5DCAXfrCnt_LSB 0x16 +#define QIB_7322_DCACtrlC_RcvHdrq5DCAXfrCnt_MSB 0x1B +#define QIB_7322_DCACtrlC_RcvHdrq5DCAXfrCnt_RMASK 0x3F +#define QIB_7322_DCACtrlC_RcvHdrq5DCAOPH_LSB 0xE +#define QIB_7322_DCACtrlC_RcvHdrq5DCAOPH_MSB 0x15 +#define QIB_7322_DCACtrlC_RcvHdrq5DCAOPH_RMASK 0xFF +#define QIB_7322_DCACtrlC_RcvHdrq4DCAXfrCnt_LSB 0x8 +#define QIB_7322_DCACtrlC_RcvHdrq4DCAXfrCnt_MSB 0xD +#define QIB_7322_DCACtrlC_RcvHdrq4DCAXfrCnt_RMASK 0x3F +#define QIB_7322_DCACtrlC_RcvHdrq4DCAOPH_LSB 0x0 +#define QIB_7322_DCACtrlC_RcvHdrq4DCAOPH_MSB 0x7 +#define QIB_7322_DCACtrlC_RcvHdrq4DCAOPH_RMASK 0xFF + +#define QIB_7322_DCACtrlD_OFFS 0x598 +#define QIB_7322_DCACtrlD_DEF 0x0000000000000000 +#define QIB_7322_DCACtrlD_RcvHdrq11DCAXfrCnt_LSB 0x36 +#define QIB_7322_DCACtrlD_RcvHdrq11DCAXfrCnt_MSB 0x3B +#define QIB_7322_DCACtrlD_RcvHdrq11DCAXfrCnt_RMASK 0x3F +#define QIB_7322_DCACtrlD_RcvHdrq11DCAOPH_LSB 
0x2E +#define QIB_7322_DCACtrlD_RcvHdrq11DCAOPH_MSB 0x35 +#define QIB_7322_DCACtrlD_RcvHdrq11DCAOPH_RMASK 0xFF +#define QIB_7322_DCACtrlD_RcvHdrq10DCAXfrCnt_LSB 0x28 +#define QIB_7322_DCACtrlD_RcvHdrq10DCAXfrCnt_MSB 0x2D +#define QIB_7322_DCACtrlD_RcvHdrq10DCAXfrCnt_RMASK 0x3F +#define QIB_7322_DCACtrlD_RcvHdrq10DCAOPH_LSB 0x20 +#define QIB_7322_DCACtrlD_RcvHdrq10DCAOPH_MSB 0x27 +#define QIB_7322_DCACtrlD_RcvHdrq10DCAOPH_RMASK 0xFF +#define QIB_7322_DCACtrlD_RcvHdrq9DCAXfrCnt_LSB 0x16 +#define QIB_7322_DCACtrlD_RcvHdrq9DCAXfrCnt_MSB 0x1B +#define QIB_7322_DCACtrlD_RcvHdrq9DCAXfrCnt_RMASK 0x3F +#define QIB_7322_DCACtrlD_RcvHdrq9DCAOPH_LSB 0xE +#define QIB_7322_DCACtrlD_RcvHdrq9DCAOPH_MSB 0x15 +#define QIB_7322_DCACtrlD_RcvHdrq9DCAOPH_RMASK 0xFF +#define QIB_7322_DCACtrlD_RcvHdrq8DCAXfrCnt_LSB 0x8 +#define QIB_7322_DCACtrlD_RcvHdrq8DCAXfrCnt_MSB 0xD +#define QIB_7322_DCACtrlD_RcvHdrq8DCAXfrCnt_RMASK 0x3F +#define QIB_7322_DCACtrlD_RcvHdrq8DCAOPH_LSB 0x0 +#define QIB_7322_DCACtrlD_RcvHdrq8DCAOPH_MSB 0x7 +#define QIB_7322_DCACtrlD_RcvHdrq8DCAOPH_RMASK 0xFF + +#define QIB_7322_DCACtrlE_OFFS 0x5A0 +#define QIB_7322_DCACtrlE_DEF 0x0000000000000000 +#define QIB_7322_DCACtrlE_RcvHdrq15DCAXfrCnt_LSB 0x36 +#define QIB_7322_DCACtrlE_RcvHdrq15DCAXfrCnt_MSB 0x3B +#define QIB_7322_DCACtrlE_RcvHdrq15DCAXfrCnt_RMASK 0x3F +#define QIB_7322_DCACtrlE_RcvHdrq15DCAOPH_LSB 0x2E +#define QIB_7322_DCACtrlE_RcvHdrq15DCAOPH_MSB 0x35 +#define QIB_7322_DCACtrlE_RcvHdrq15DCAOPH_RMASK 0xFF +#define QIB_7322_DCACtrlE_RcvHdrq14DCAXfrCnt_LSB 0x28 +#define QIB_7322_DCACtrlE_RcvHdrq14DCAXfrCnt_MSB 0x2D +#define QIB_7322_DCACtrlE_RcvHdrq14DCAXfrCnt_RMASK 0x3F +#define QIB_7322_DCACtrlE_RcvHdrq14DCAOPH_LSB 0x20 +#define QIB_7322_DCACtrlE_RcvHdrq14DCAOPH_MSB 0x27 +#define QIB_7322_DCACtrlE_RcvHdrq14DCAOPH_RMASK 0xFF +#define QIB_7322_DCACtrlE_RcvHdrq13DCAXfrCnt_LSB 0x16 +#define QIB_7322_DCACtrlE_RcvHdrq13DCAXfrCnt_MSB 0x1B +#define QIB_7322_DCACtrlE_RcvHdrq13DCAXfrCnt_RMASK 0x3F +#define QIB_7322_DCACtrlE_RcvHdrq13DCAOPH_LSB 0xE +#define QIB_7322_DCACtrlE_RcvHdrq13DCAOPH_MSB 0x15 +#define QIB_7322_DCACtrlE_RcvHdrq13DCAOPH_RMASK 0xFF +#define QIB_7322_DCACtrlE_RcvHdrq12DCAXfrCnt_LSB 0x8 +#define QIB_7322_DCACtrlE_RcvHdrq12DCAXfrCnt_MSB 0xD +#define QIB_7322_DCACtrlE_RcvHdrq12DCAXfrCnt_RMASK 0x3F +#define QIB_7322_DCACtrlE_RcvHdrq12DCAOPH_LSB 0x0 +#define QIB_7322_DCACtrlE_RcvHdrq12DCAOPH_MSB 0x7 +#define QIB_7322_DCACtrlE_RcvHdrq12DCAOPH_RMASK 0xFF + +#define QIB_7322_DCACtrlF_OFFS 0x5A8 +#define QIB_7322_DCACtrlF_DEF 0x0000000000000000 +#define QIB_7322_DCACtrlF_SendDma1DCAOPH_LSB 0x28 +#define QIB_7322_DCACtrlF_SendDma1DCAOPH_MSB 0x2F +#define QIB_7322_DCACtrlF_SendDma1DCAOPH_RMASK 0xFF +#define QIB_7322_DCACtrlF_SendDma0DCAOPH_LSB 0x20 +#define QIB_7322_DCACtrlF_SendDma0DCAOPH_MSB 0x27 +#define QIB_7322_DCACtrlF_SendDma0DCAOPH_RMASK 0xFF +#define QIB_7322_DCACtrlF_RcvHdrq17DCAXfrCnt_LSB 0x16 +#define QIB_7322_DCACtrlF_RcvHdrq17DCAXfrCnt_MSB 0x1B +#define QIB_7322_DCACtrlF_RcvHdrq17DCAXfrCnt_RMASK 0x3F +#define QIB_7322_DCACtrlF_RcvHdrq17DCAOPH_LSB 0xE +#define QIB_7322_DCACtrlF_RcvHdrq17DCAOPH_MSB 0x15 +#define QIB_7322_DCACtrlF_RcvHdrq17DCAOPH_RMASK 0xFF +#define QIB_7322_DCACtrlF_RcvHdrq16DCAXfrCnt_LSB 0x8 +#define QIB_7322_DCACtrlF_RcvHdrq16DCAXfrCnt_MSB 0xD +#define QIB_7322_DCACtrlF_RcvHdrq16DCAXfrCnt_RMASK 0x3F +#define QIB_7322_DCACtrlF_RcvHdrq16DCAOPH_LSB 0x0 +#define QIB_7322_DCACtrlF_RcvHdrq16DCAOPH_MSB 0x7 +#define QIB_7322_DCACtrlF_RcvHdrq16DCAOPH_RMASK 0xFF + +#define QIB_7322_RcvAvailTimeOut0_OFFS 0xC00 
+#define QIB_7322_RcvAvailTimeOut0_DEF 0x0000000000000000 +#define QIB_7322_RcvAvailTimeOut0_RcvAvailTOCount_LSB 0x10 +#define QIB_7322_RcvAvailTimeOut0_RcvAvailTOCount_MSB 0x1F +#define QIB_7322_RcvAvailTimeOut0_RcvAvailTOCount_RMASK 0xFFFF +#define QIB_7322_RcvAvailTimeOut0_RcvAvailTOReload_LSB 0x0 +#define QIB_7322_RcvAvailTimeOut0_RcvAvailTOReload_MSB 0xF +#define QIB_7322_RcvAvailTimeOut0_RcvAvailTOReload_RMASK 0xFFFF + +#define QIB_7322_CntrRegBase_0_OFFS 0x1028 +#define QIB_7322_CntrRegBase_0_DEF 0x0000000000012000 + +#define QIB_7322_ErrMask_0_OFFS 0x1080 +#define QIB_7322_ErrMask_0_DEF 0x0000000000000000 +#define QIB_7322_ErrMask_0_IBStatusChangedMask_LSB 0x3A +#define QIB_7322_ErrMask_0_IBStatusChangedMask_MSB 0x3A +#define QIB_7322_ErrMask_0_IBStatusChangedMask_RMASK 0x1 +#define QIB_7322_ErrMask_0_SHeadersErrMask_LSB 0x39 +#define QIB_7322_ErrMask_0_SHeadersErrMask_MSB 0x39 +#define QIB_7322_ErrMask_0_SHeadersErrMask_RMASK 0x1 +#define QIB_7322_ErrMask_0_VL15BufMisuseErrMask_LSB 0x36 +#define QIB_7322_ErrMask_0_VL15BufMisuseErrMask_MSB 0x36 +#define QIB_7322_ErrMask_0_VL15BufMisuseErrMask_RMASK 0x1 +#define QIB_7322_ErrMask_0_SDmaHaltErrMask_LSB 0x31 +#define QIB_7322_ErrMask_0_SDmaHaltErrMask_MSB 0x31 +#define QIB_7322_ErrMask_0_SDmaHaltErrMask_RMASK 0x1 +#define QIB_7322_ErrMask_0_SDmaDescAddrMisalignErrMask_LSB 0x30 +#define QIB_7322_ErrMask_0_SDmaDescAddrMisalignErrMask_MSB 0x30 +#define QIB_7322_ErrMask_0_SDmaDescAddrMisalignErrMask_RMASK 0x1 +#define QIB_7322_ErrMask_0_SDmaUnexpDataErrMask_LSB 0x2F +#define QIB_7322_ErrMask_0_SDmaUnexpDataErrMask_MSB 0x2F +#define QIB_7322_ErrMask_0_SDmaUnexpDataErrMask_RMASK 0x1 +#define QIB_7322_ErrMask_0_SDmaMissingDwErrMask_LSB 0x2E +#define QIB_7322_ErrMask_0_SDmaMissingDwErrMask_MSB 0x2E +#define QIB_7322_ErrMask_0_SDmaMissingDwErrMask_RMASK 0x1 +#define QIB_7322_ErrMask_0_SDmaDwEnErrMask_LSB 0x2D +#define QIB_7322_ErrMask_0_SDmaDwEnErrMask_MSB 0x2D +#define QIB_7322_ErrMask_0_SDmaDwEnErrMask_RMASK 0x1 +#define QIB_7322_ErrMask_0_SDmaRpyTagErrMask_LSB 0x2C +#define QIB_7322_ErrMask_0_SDmaRpyTagErrMask_MSB 0x2C +#define QIB_7322_ErrMask_0_SDmaRpyTagErrMask_RMASK 0x1 +#define QIB_7322_ErrMask_0_SDma1stDescErrMask_LSB 0x2B +#define QIB_7322_ErrMask_0_SDma1stDescErrMask_MSB 0x2B +#define QIB_7322_ErrMask_0_SDma1stDescErrMask_RMASK 0x1 +#define QIB_7322_ErrMask_0_SDmaBaseErrMask_LSB 0x2A +#define QIB_7322_ErrMask_0_SDmaBaseErrMask_MSB 0x2A +#define QIB_7322_ErrMask_0_SDmaBaseErrMask_RMASK 0x1 +#define QIB_7322_ErrMask_0_SDmaTailOutOfBoundErrMask_LSB 0x29 +#define QIB_7322_ErrMask_0_SDmaTailOutOfBoundErrMask_MSB 0x29 +#define QIB_7322_ErrMask_0_SDmaTailOutOfBoundErrMask_RMASK 0x1 +#define QIB_7322_ErrMask_0_SDmaOutOfBoundErrMask_LSB 0x28 +#define QIB_7322_ErrMask_0_SDmaOutOfBoundErrMask_MSB 0x28 +#define QIB_7322_ErrMask_0_SDmaOutOfBoundErrMask_RMASK 0x1 +#define QIB_7322_ErrMask_0_SDmaGenMismatchErrMask_LSB 0x27 +#define QIB_7322_ErrMask_0_SDmaGenMismatchErrMask_MSB 0x27 +#define QIB_7322_ErrMask_0_SDmaGenMismatchErrMask_RMASK 0x1 +#define QIB_7322_ErrMask_0_SendBufMisuseErrMask_LSB 0x26 +#define QIB_7322_ErrMask_0_SendBufMisuseErrMask_MSB 0x26 +#define QIB_7322_ErrMask_0_SendBufMisuseErrMask_RMASK 0x1 +#define QIB_7322_ErrMask_0_SendUnsupportedVLErrMask_LSB 0x25 +#define QIB_7322_ErrMask_0_SendUnsupportedVLErrMask_MSB 0x25 +#define QIB_7322_ErrMask_0_SendUnsupportedVLErrMask_RMASK 0x1 +#define QIB_7322_ErrMask_0_SendUnexpectedPktNumErrMask_LSB 0x24 +#define QIB_7322_ErrMask_0_SendUnexpectedPktNumErrMask_MSB 0x24 +#define 
QIB_7322_ErrMask_0_SendUnexpectedPktNumErrMask_RMASK 0x1 +#define QIB_7322_ErrMask_0_SendDroppedDataPktErrMask_LSB 0x22 +#define QIB_7322_ErrMask_0_SendDroppedDataPktErrMask_MSB 0x22 +#define QIB_7322_ErrMask_0_SendDroppedDataPktErrMask_RMASK 0x1 +#define QIB_7322_ErrMask_0_SendDroppedSmpPktErrMask_LSB 0x21 +#define QIB_7322_ErrMask_0_SendDroppedSmpPktErrMask_MSB 0x21 +#define QIB_7322_ErrMask_0_SendDroppedSmpPktErrMask_RMASK 0x1 +#define QIB_7322_ErrMask_0_SendPktLenErrMask_LSB 0x20 +#define QIB_7322_ErrMask_0_SendPktLenErrMask_MSB 0x20 +#define QIB_7322_ErrMask_0_SendPktLenErrMask_RMASK 0x1 +#define QIB_7322_ErrMask_0_SendUnderRunErrMask_LSB 0x1F +#define QIB_7322_ErrMask_0_SendUnderRunErrMask_MSB 0x1F +#define QIB_7322_ErrMask_0_SendUnderRunErrMask_RMASK 0x1 +#define QIB_7322_ErrMask_0_SendMaxPktLenErrMask_LSB 0x1E +#define QIB_7322_ErrMask_0_SendMaxPktLenErrMask_MSB 0x1E +#define QIB_7322_ErrMask_0_SendMaxPktLenErrMask_RMASK 0x1 +#define QIB_7322_ErrMask_0_SendMinPktLenErrMask_LSB 0x1D +#define QIB_7322_ErrMask_0_SendMinPktLenErrMask_MSB 0x1D +#define QIB_7322_ErrMask_0_SendMinPktLenErrMask_RMASK 0x1 +#define QIB_7322_ErrMask_0_RcvIBLostLinkErrMask_LSB 0x11 +#define QIB_7322_ErrMask_0_RcvIBLostLinkErrMask_MSB 0x11 +#define QIB_7322_ErrMask_0_RcvIBLostLinkErrMask_RMASK 0x1 +#define QIB_7322_ErrMask_0_RcvHdrErrMask_LSB 0x10 +#define QIB_7322_ErrMask_0_RcvHdrErrMask_MSB 0x10 +#define QIB_7322_ErrMask_0_RcvHdrErrMask_RMASK 0x1 +#define QIB_7322_ErrMask_0_RcvHdrLenErrMask_LSB 0xF +#define QIB_7322_ErrMask_0_RcvHdrLenErrMask_MSB 0xF +#define QIB_7322_ErrMask_0_RcvHdrLenErrMask_RMASK 0x1 +#define QIB_7322_ErrMask_0_RcvBadTidErrMask_LSB 0xE +#define QIB_7322_ErrMask_0_RcvBadTidErrMask_MSB 0xE +#define QIB_7322_ErrMask_0_RcvBadTidErrMask_RMASK 0x1 +#define QIB_7322_ErrMask_0_RcvBadVersionErrMask_LSB 0xB +#define QIB_7322_ErrMask_0_RcvBadVersionErrMask_MSB 0xB +#define QIB_7322_ErrMask_0_RcvBadVersionErrMask_RMASK 0x1 +#define QIB_7322_ErrMask_0_RcvIBFlowErrMask_LSB 0xA +#define QIB_7322_ErrMask_0_RcvIBFlowErrMask_MSB 0xA +#define QIB_7322_ErrMask_0_RcvIBFlowErrMask_RMASK 0x1 +#define QIB_7322_ErrMask_0_RcvEBPErrMask_LSB 0x9 +#define QIB_7322_ErrMask_0_RcvEBPErrMask_MSB 0x9 +#define QIB_7322_ErrMask_0_RcvEBPErrMask_RMASK 0x1 +#define QIB_7322_ErrMask_0_RcvUnsupportedVLErrMask_LSB 0x8 +#define QIB_7322_ErrMask_0_RcvUnsupportedVLErrMask_MSB 0x8 +#define QIB_7322_ErrMask_0_RcvUnsupportedVLErrMask_RMASK 0x1 +#define QIB_7322_ErrMask_0_RcvUnexpectedCharErrMask_LSB 0x7 +#define QIB_7322_ErrMask_0_RcvUnexpectedCharErrMask_MSB 0x7 +#define QIB_7322_ErrMask_0_RcvUnexpectedCharErrMask_RMASK 0x1 +#define QIB_7322_ErrMask_0_RcvShortPktLenErrMask_LSB 0x6 +#define QIB_7322_ErrMask_0_RcvShortPktLenErrMask_MSB 0x6 +#define QIB_7322_ErrMask_0_RcvShortPktLenErrMask_RMASK 0x1 +#define QIB_7322_ErrMask_0_RcvLongPktLenErrMask_LSB 0x5 +#define QIB_7322_ErrMask_0_RcvLongPktLenErrMask_MSB 0x5 +#define QIB_7322_ErrMask_0_RcvLongPktLenErrMask_RMASK 0x1 +#define QIB_7322_ErrMask_0_RcvMaxPktLenErrMask_LSB 0x4 +#define QIB_7322_ErrMask_0_RcvMaxPktLenErrMask_MSB 0x4 +#define QIB_7322_ErrMask_0_RcvMaxPktLenErrMask_RMASK 0x1 +#define QIB_7322_ErrMask_0_RcvMinPktLenErrMask_LSB 0x3 +#define QIB_7322_ErrMask_0_RcvMinPktLenErrMask_MSB 0x3 +#define QIB_7322_ErrMask_0_RcvMinPktLenErrMask_RMASK 0x1 +#define QIB_7322_ErrMask_0_RcvICRCErrMask_LSB 0x2 +#define QIB_7322_ErrMask_0_RcvICRCErrMask_MSB 0x2 +#define QIB_7322_ErrMask_0_RcvICRCErrMask_RMASK 0x1 +#define QIB_7322_ErrMask_0_RcvVCRCErrMask_LSB 0x1 +#define 
QIB_7322_ErrMask_0_RcvVCRCErrMask_MSB 0x1 +#define QIB_7322_ErrMask_0_RcvVCRCErrMask_RMASK 0x1 +#define QIB_7322_ErrMask_0_RcvFormatErrMask_LSB 0x0 +#define QIB_7322_ErrMask_0_RcvFormatErrMask_MSB 0x0 +#define QIB_7322_ErrMask_0_RcvFormatErrMask_RMASK 0x1 + +#define QIB_7322_ErrStatus_0_OFFS 0x1088 +#define QIB_7322_ErrStatus_0_DEF 0x0000000000000000 +#define QIB_7322_ErrStatus_0_IBStatusChanged_LSB 0x3A +#define QIB_7322_ErrStatus_0_IBStatusChanged_MSB 0x3A +#define QIB_7322_ErrStatus_0_IBStatusChanged_RMASK 0x1 +#define QIB_7322_ErrStatus_0_SHeadersErr_LSB 0x39 +#define QIB_7322_ErrStatus_0_SHeadersErr_MSB 0x39 +#define QIB_7322_ErrStatus_0_SHeadersErr_RMASK 0x1 +#define QIB_7322_ErrStatus_0_VL15BufMisuseErr_LSB 0x36 +#define QIB_7322_ErrStatus_0_VL15BufMisuseErr_MSB 0x36 +#define QIB_7322_ErrStatus_0_VL15BufMisuseErr_RMASK 0x1 +#define QIB_7322_ErrStatus_0_SDmaHaltErr_LSB 0x31 +#define QIB_7322_ErrStatus_0_SDmaHaltErr_MSB 0x31 +#define QIB_7322_ErrStatus_0_SDmaHaltErr_RMASK 0x1 +#define QIB_7322_ErrStatus_0_SDmaDescAddrMisalignErr_LSB 0x30 +#define QIB_7322_ErrStatus_0_SDmaDescAddrMisalignErr_MSB 0x30 +#define QIB_7322_ErrStatus_0_SDmaDescAddrMisalignErr_RMASK 0x1 +#define QIB_7322_ErrStatus_0_SDmaUnexpDataErr_LSB 0x2F +#define QIB_7322_ErrStatus_0_SDmaUnexpDataErr_MSB 0x2F +#define QIB_7322_ErrStatus_0_SDmaUnexpDataErr_RMASK 0x1 +#define QIB_7322_ErrStatus_0_SDmaMissingDwErr_LSB 0x2E +#define QIB_7322_ErrStatus_0_SDmaMissingDwErr_MSB 0x2E +#define QIB_7322_ErrStatus_0_SDmaMissingDwErr_RMASK 0x1 +#define QIB_7322_ErrStatus_0_SDmaDwEnErr_LSB 0x2D +#define QIB_7322_ErrStatus_0_SDmaDwEnErr_MSB 0x2D +#define QIB_7322_ErrStatus_0_SDmaDwEnErr_RMASK 0x1 +#define QIB_7322_ErrStatus_0_SDmaRpyTagErr_LSB 0x2C +#define QIB_7322_ErrStatus_0_SDmaRpyTagErr_MSB 0x2C +#define QIB_7322_ErrStatus_0_SDmaRpyTagErr_RMASK 0x1 +#define QIB_7322_ErrStatus_0_SDma1stDescErr_LSB 0x2B +#define QIB_7322_ErrStatus_0_SDma1stDescErr_MSB 0x2B +#define QIB_7322_ErrStatus_0_SDma1stDescErr_RMASK 0x1 +#define QIB_7322_ErrStatus_0_SDmaBaseErr_LSB 0x2A +#define QIB_7322_ErrStatus_0_SDmaBaseErr_MSB 0x2A +#define QIB_7322_ErrStatus_0_SDmaBaseErr_RMASK 0x1 +#define QIB_7322_ErrStatus_0_SDmaTailOutOfBoundErr_LSB 0x29 +#define QIB_7322_ErrStatus_0_SDmaTailOutOfBoundErr_MSB 0x29 +#define QIB_7322_ErrStatus_0_SDmaTailOutOfBoundErr_RMASK 0x1 +#define QIB_7322_ErrStatus_0_SDmaOutOfBoundErr_LSB 0x28 +#define QIB_7322_ErrStatus_0_SDmaOutOfBoundErr_MSB 0x28 +#define QIB_7322_ErrStatus_0_SDmaOutOfBoundErr_RMASK 0x1 +#define QIB_7322_ErrStatus_0_SDmaGenMismatchErr_LSB 0x27 +#define QIB_7322_ErrStatus_0_SDmaGenMismatchErr_MSB 0x27 +#define QIB_7322_ErrStatus_0_SDmaGenMismatchErr_RMASK 0x1 +#define QIB_7322_ErrStatus_0_SendBufMisuseErr_LSB 0x26 +#define QIB_7322_ErrStatus_0_SendBufMisuseErr_MSB 0x26 +#define QIB_7322_ErrStatus_0_SendBufMisuseErr_RMASK 0x1 +#define QIB_7322_ErrStatus_0_SendUnsupportedVLErr_LSB 0x25 +#define QIB_7322_ErrStatus_0_SendUnsupportedVLErr_MSB 0x25 +#define QIB_7322_ErrStatus_0_SendUnsupportedVLErr_RMASK 0x1 +#define QIB_7322_ErrStatus_0_SendUnexpectedPktNumErr_LSB 0x24 +#define QIB_7322_ErrStatus_0_SendUnexpectedPktNumErr_MSB 0x24 +#define QIB_7322_ErrStatus_0_SendUnexpectedPktNumErr_RMASK 0x1 +#define QIB_7322_ErrStatus_0_SendDroppedDataPktErr_LSB 0x22 +#define QIB_7322_ErrStatus_0_SendDroppedDataPktErr_MSB 0x22 +#define QIB_7322_ErrStatus_0_SendDroppedDataPktErr_RMASK 0x1 +#define QIB_7322_ErrStatus_0_SendDroppedSmpPktErr_LSB 0x21 +#define QIB_7322_ErrStatus_0_SendDroppedSmpPktErr_MSB 0x21 +#define 
QIB_7322_ErrStatus_0_SendDroppedSmpPktErr_RMASK 0x1 +#define QIB_7322_ErrStatus_0_SendPktLenErr_LSB 0x20 +#define QIB_7322_ErrStatus_0_SendPktLenErr_MSB 0x20 +#define QIB_7322_ErrStatus_0_SendPktLenErr_RMASK 0x1 +#define QIB_7322_ErrStatus_0_SendUnderRunErr_LSB 0x1F +#define QIB_7322_ErrStatus_0_SendUnderRunErr_MSB 0x1F +#define QIB_7322_ErrStatus_0_SendUnderRunErr_RMASK 0x1 +#define QIB_7322_ErrStatus_0_SendMaxPktLenErr_LSB 0x1E +#define QIB_7322_ErrStatus_0_SendMaxPktLenErr_MSB 0x1E +#define QIB_7322_ErrStatus_0_SendMaxPktLenErr_RMASK 0x1 +#define QIB_7322_ErrStatus_0_SendMinPktLenErr_LSB 0x1D +#define QIB_7322_ErrStatus_0_SendMinPktLenErr_MSB 0x1D +#define QIB_7322_ErrStatus_0_SendMinPktLenErr_RMASK 0x1 +#define QIB_7322_ErrStatus_0_RcvIBLostLinkErr_LSB 0x11 +#define QIB_7322_ErrStatus_0_RcvIBLostLinkErr_MSB 0x11 +#define QIB_7322_ErrStatus_0_RcvIBLostLinkErr_RMASK 0x1 +#define QIB_7322_ErrStatus_0_RcvHdrErr_LSB 0x10 +#define QIB_7322_ErrStatus_0_RcvHdrErr_MSB 0x10 +#define QIB_7322_ErrStatus_0_RcvHdrErr_RMASK 0x1 +#define QIB_7322_ErrStatus_0_RcvHdrLenErr_LSB 0xF +#define QIB_7322_ErrStatus_0_RcvHdrLenErr_MSB 0xF +#define QIB_7322_ErrStatus_0_RcvHdrLenErr_RMASK 0x1 +#define QIB_7322_ErrStatus_0_RcvBadTidErr_LSB 0xE +#define QIB_7322_ErrStatus_0_RcvBadTidErr_MSB 0xE +#define QIB_7322_ErrStatus_0_RcvBadTidErr_RMASK 0x1 +#define QIB_7322_ErrStatus_0_RcvBadVersionErr_LSB 0xB +#define QIB_7322_ErrStatus_0_RcvBadVersionErr_MSB 0xB +#define QIB_7322_ErrStatus_0_RcvBadVersionErr_RMASK 0x1 +#define QIB_7322_ErrStatus_0_RcvIBFlowErr_LSB 0xA +#define QIB_7322_ErrStatus_0_RcvIBFlowErr_MSB 0xA +#define QIB_7322_ErrStatus_0_RcvIBFlowErr_RMASK 0x1 +#define QIB_7322_ErrStatus_0_RcvEBPErr_LSB 0x9 +#define QIB_7322_ErrStatus_0_RcvEBPErr_MSB 0x9 +#define QIB_7322_ErrStatus_0_RcvEBPErr_RMASK 0x1 +#define QIB_7322_ErrStatus_0_RcvUnsupportedVLErr_LSB 0x8 +#define QIB_7322_ErrStatus_0_RcvUnsupportedVLErr_MSB 0x8 +#define QIB_7322_ErrStatus_0_RcvUnsupportedVLErr_RMASK 0x1 +#define QIB_7322_ErrStatus_0_RcvUnexpectedCharErr_LSB 0x7 +#define QIB_7322_ErrStatus_0_RcvUnexpectedCharErr_MSB 0x7 +#define QIB_7322_ErrStatus_0_RcvUnexpectedCharErr_RMASK 0x1 +#define QIB_7322_ErrStatus_0_RcvShortPktLenErr_LSB 0x6 +#define QIB_7322_ErrStatus_0_RcvShortPktLenErr_MSB 0x6 +#define QIB_7322_ErrStatus_0_RcvShortPktLenErr_RMASK 0x1 +#define QIB_7322_ErrStatus_0_RcvLongPktLenErr_LSB 0x5 +#define QIB_7322_ErrStatus_0_RcvLongPktLenErr_MSB 0x5 +#define QIB_7322_ErrStatus_0_RcvLongPktLenErr_RMASK 0x1 +#define QIB_7322_ErrStatus_0_RcvMaxPktLenErr_LSB 0x4 +#define QIB_7322_ErrStatus_0_RcvMaxPktLenErr_MSB 0x4 +#define QIB_7322_ErrStatus_0_RcvMaxPktLenErr_RMASK 0x1 +#define QIB_7322_ErrStatus_0_RcvMinPktLenErr_LSB 0x3 +#define QIB_7322_ErrStatus_0_RcvMinPktLenErr_MSB 0x3 +#define QIB_7322_ErrStatus_0_RcvMinPktLenErr_RMASK 0x1 +#define QIB_7322_ErrStatus_0_RcvICRCErr_LSB 0x2 +#define QIB_7322_ErrStatus_0_RcvICRCErr_MSB 0x2 +#define QIB_7322_ErrStatus_0_RcvICRCErr_RMASK 0x1 +#define QIB_7322_ErrStatus_0_RcvVCRCErr_LSB 0x1 +#define QIB_7322_ErrStatus_0_RcvVCRCErr_MSB 0x1 +#define QIB_7322_ErrStatus_0_RcvVCRCErr_RMASK 0x1 +#define QIB_7322_ErrStatus_0_RcvFormatErr_LSB 0x0 +#define QIB_7322_ErrStatus_0_RcvFormatErr_MSB 0x0 +#define QIB_7322_ErrStatus_0_RcvFormatErr_RMASK 0x1 + +#define QIB_7322_ErrClear_0_OFFS 0x1090 +#define QIB_7322_ErrClear_0_DEF 0x0000000000000000 +#define QIB_7322_ErrClear_0_IBStatusChangedClear_LSB 0x3A +#define QIB_7322_ErrClear_0_IBStatusChangedClear_MSB 0x3A +#define QIB_7322_ErrClear_0_IBStatusChangedClear_RMASK 
0x1 +#define QIB_7322_ErrClear_0_SHeadersErrClear_LSB 0x39 +#define QIB_7322_ErrClear_0_SHeadersErrClear_MSB 0x39 +#define QIB_7322_ErrClear_0_SHeadersErrClear_RMASK 0x1 +#define QIB_7322_ErrClear_0_VL15BufMisuseErrClear_LSB 0x36 +#define QIB_7322_ErrClear_0_VL15BufMisuseErrClear_MSB 0x36 +#define QIB_7322_ErrClear_0_VL15BufMisuseErrClear_RMASK 0x1 +#define QIB_7322_ErrClear_0_SDmaHaltErrClear_LSB 0x31 +#define QIB_7322_ErrClear_0_SDmaHaltErrClear_MSB 0x31 +#define QIB_7322_ErrClear_0_SDmaHaltErrClear_RMASK 0x1 +#define QIB_7322_ErrClear_0_SDmaDescAddrMisalignErrClear_LSB 0x30 +#define QIB_7322_ErrClear_0_SDmaDescAddrMisalignErrClear_MSB 0x30 +#define QIB_7322_ErrClear_0_SDmaDescAddrMisalignErrClear_RMASK 0x1 +#define QIB_7322_ErrClear_0_SDmaUnexpDataErrClear_LSB 0x2F +#define QIB_7322_ErrClear_0_SDmaUnexpDataErrClear_MSB 0x2F +#define QIB_7322_ErrClear_0_SDmaUnexpDataErrClear_RMASK 0x1 +#define QIB_7322_ErrClear_0_SDmaMissingDwErrClear_LSB 0x2E +#define QIB_7322_ErrClear_0_SDmaMissingDwErrClear_MSB 0x2E +#define QIB_7322_ErrClear_0_SDmaMissingDwErrClear_RMASK 0x1 +#define QIB_7322_ErrClear_0_SDmaDwEnErrClear_LSB 0x2D +#define QIB_7322_ErrClear_0_SDmaDwEnErrClear_MSB 0x2D +#define QIB_7322_ErrClear_0_SDmaDwEnErrClear_RMASK 0x1 +#define QIB_7322_ErrClear_0_SDmaRpyTagErrClear_LSB 0x2C +#define QIB_7322_ErrClear_0_SDmaRpyTagErrClear_MSB 0x2C +#define QIB_7322_ErrClear_0_SDmaRpyTagErrClear_RMASK 0x1 +#define QIB_7322_ErrClear_0_SDma1stDescErrClear_LSB 0x2B +#define QIB_7322_ErrClear_0_SDma1stDescErrClear_MSB 0x2B +#define QIB_7322_ErrClear_0_SDma1stDescErrClear_RMASK 0x1 +#define QIB_7322_ErrClear_0_SDmaBaseErrClear_LSB 0x2A +#define QIB_7322_ErrClear_0_SDmaBaseErrClear_MSB 0x2A +#define QIB_7322_ErrClear_0_SDmaBaseErrClear_RMASK 0x1 +#define QIB_7322_ErrClear_0_SDmaTailOutOfBoundErrClear_LSB 0x29 +#define QIB_7322_ErrClear_0_SDmaTailOutOfBoundErrClear_MSB 0x29 +#define QIB_7322_ErrClear_0_SDmaTailOutOfBoundErrClear_RMASK 0x1 +#define QIB_7322_ErrClear_0_SDmaOutOfBoundErrClear_LSB 0x28 +#define QIB_7322_ErrClear_0_SDmaOutOfBoundErrClear_MSB 0x28 +#define QIB_7322_ErrClear_0_SDmaOutOfBoundErrClear_RMASK 0x1 +#define QIB_7322_ErrClear_0_SDmaGenMismatchErrClear_LSB 0x27 +#define QIB_7322_ErrClear_0_SDmaGenMismatchErrClear_MSB 0x27 +#define QIB_7322_ErrClear_0_SDmaGenMismatchErrClear_RMASK 0x1 +#define QIB_7322_ErrClear_0_SendBufMisuseErrClear_LSB 0x26 +#define QIB_7322_ErrClear_0_SendBufMisuseErrClear_MSB 0x26 +#define QIB_7322_ErrClear_0_SendBufMisuseErrClear_RMASK 0x1 +#define QIB_7322_ErrClear_0_SendUnsupportedVLErrClear_LSB 0x25 +#define QIB_7322_ErrClear_0_SendUnsupportedVLErrClear_MSB 0x25 +#define QIB_7322_ErrClear_0_SendUnsupportedVLErrClear_RMASK 0x1 +#define QIB_7322_ErrClear_0_SendUnexpectedPktNumErrClear_LSB 0x24 +#define QIB_7322_ErrClear_0_SendUnexpectedPktNumErrClear_MSB 0x24 +#define QIB_7322_ErrClear_0_SendUnexpectedPktNumErrClear_RMASK 0x1 +#define QIB_7322_ErrClear_0_SendDroppedDataPktErrClear_LSB 0x22 +#define QIB_7322_ErrClear_0_SendDroppedDataPktErrClear_MSB 0x22 +#define QIB_7322_ErrClear_0_SendDroppedDataPktErrClear_RMASK 0x1 +#define QIB_7322_ErrClear_0_SendDroppedSmpPktErrClear_LSB 0x21 +#define QIB_7322_ErrClear_0_SendDroppedSmpPktErrClear_MSB 0x21 +#define QIB_7322_ErrClear_0_SendDroppedSmpPktErrClear_RMASK 0x1 +#define QIB_7322_ErrClear_0_SendPktLenErrClear_LSB 0x20 +#define QIB_7322_ErrClear_0_SendPktLenErrClear_MSB 0x20 +#define QIB_7322_ErrClear_0_SendPktLenErrClear_RMASK 0x1 +#define QIB_7322_ErrClear_0_SendUnderRunErrClear_LSB 0x1F +#define 
QIB_7322_ErrClear_0_SendUnderRunErrClear_MSB 0x1F +#define QIB_7322_ErrClear_0_SendUnderRunErrClear_RMASK 0x1 +#define QIB_7322_ErrClear_0_SendMaxPktLenErrClear_LSB 0x1E +#define QIB_7322_ErrClear_0_SendMaxPktLenErrClear_MSB 0x1E +#define QIB_7322_ErrClear_0_SendMaxPktLenErrClear_RMASK 0x1 +#define QIB_7322_ErrClear_0_SendMinPktLenErrClear_LSB 0x1D +#define QIB_7322_ErrClear_0_SendMinPktLenErrClear_MSB 0x1D +#define QIB_7322_ErrClear_0_SendMinPktLenErrClear_RMASK 0x1 +#define QIB_7322_ErrClear_0_RcvIBLostLinkErrClear_LSB 0x11 +#define QIB_7322_ErrClear_0_RcvIBLostLinkErrClear_MSB 0x11 +#define QIB_7322_ErrClear_0_RcvIBLostLinkErrClear_RMASK 0x1 +#define QIB_7322_ErrClear_0_RcvHdrErrClear_LSB 0x10 +#define QIB_7322_ErrClear_0_RcvHdrErrClear_MSB 0x10 +#define QIB_7322_ErrClear_0_RcvHdrErrClear_RMASK 0x1 +#define QIB_7322_ErrClear_0_RcvHdrLenErrClear_LSB 0xF +#define QIB_7322_ErrClear_0_RcvHdrLenErrClear_MSB 0xF +#define QIB_7322_ErrClear_0_RcvHdrLenErrClear_RMASK 0x1 +#define QIB_7322_ErrClear_0_RcvBadTidErrClear_LSB 0xE +#define QIB_7322_ErrClear_0_RcvBadTidErrClear_MSB 0xE +#define QIB_7322_ErrClear_0_RcvBadTidErrClear_RMASK 0x1 +#define QIB_7322_ErrClear_0_RcvBadVersionErrClear_LSB 0xB +#define QIB_7322_ErrClear_0_RcvBadVersionErrClear_MSB 0xB +#define QIB_7322_ErrClear_0_RcvBadVersionErrClear_RMASK 0x1 +#define QIB_7322_ErrClear_0_RcvIBFlowErrClear_LSB 0xA +#define QIB_7322_ErrClear_0_RcvIBFlowErrClear_MSB 0xA +#define QIB_7322_ErrClear_0_RcvIBFlowErrClear_RMASK 0x1 +#define QIB_7322_ErrClear_0_RcvEBPErrClear_LSB 0x9 +#define QIB_7322_ErrClear_0_RcvEBPErrClear_MSB 0x9 +#define QIB_7322_ErrClear_0_RcvEBPErrClear_RMASK 0x1 +#define QIB_7322_ErrClear_0_RcvUnsupportedVLErrClear_LSB 0x8 +#define QIB_7322_ErrClear_0_RcvUnsupportedVLErrClear_MSB 0x8 +#define QIB_7322_ErrClear_0_RcvUnsupportedVLErrClear_RMASK 0x1 +#define QIB_7322_ErrClear_0_RcvUnexpectedCharErrClear_LSB 0x7 +#define QIB_7322_ErrClear_0_RcvUnexpectedCharErrClear_MSB 0x7 +#define QIB_7322_ErrClear_0_RcvUnexpectedCharErrClear_RMASK 0x1 +#define QIB_7322_ErrClear_0_RcvShortPktLenErrClear_LSB 0x6 +#define QIB_7322_ErrClear_0_RcvShortPktLenErrClear_MSB 0x6 +#define QIB_7322_ErrClear_0_RcvShortPktLenErrClear_RMASK 0x1 +#define QIB_7322_ErrClear_0_RcvLongPktLenErrClear_LSB 0x5 +#define QIB_7322_ErrClear_0_RcvLongPktLenErrClear_MSB 0x5 +#define QIB_7322_ErrClear_0_RcvLongPktLenErrClear_RMASK 0x1 +#define QIB_7322_ErrClear_0_RcvMaxPktLenErrClear_LSB 0x4 +#define QIB_7322_ErrClear_0_RcvMaxPktLenErrClear_MSB 0x4 +#define QIB_7322_ErrClear_0_RcvMaxPktLenErrClear_RMASK 0x1 +#define QIB_7322_ErrClear_0_RcvMinPktLenErrClear_LSB 0x3 +#define QIB_7322_ErrClear_0_RcvMinPktLenErrClear_MSB 0x3 +#define QIB_7322_ErrClear_0_RcvMinPktLenErrClear_RMASK 0x1 +#define QIB_7322_ErrClear_0_RcvICRCErrClear_LSB 0x2 +#define QIB_7322_ErrClear_0_RcvICRCErrClear_MSB 0x2 +#define QIB_7322_ErrClear_0_RcvICRCErrClear_RMASK 0x1 +#define QIB_7322_ErrClear_0_RcvVCRCErrClear_LSB 0x1 +#define QIB_7322_ErrClear_0_RcvVCRCErrClear_MSB 0x1 +#define QIB_7322_ErrClear_0_RcvVCRCErrClear_RMASK 0x1 +#define QIB_7322_ErrClear_0_RcvFormatErrClear_LSB 0x0 +#define QIB_7322_ErrClear_0_RcvFormatErrClear_MSB 0x0 +#define QIB_7322_ErrClear_0_RcvFormatErrClear_RMASK 0x1 + +#define QIB_7322_TXEStatus_0_OFFS 0x10B8 +#define QIB_7322_TXEStatus_0_DEF 0x0000000XC00080FF +#define QIB_7322_TXEStatus_0_TXE_IBC_Idle_LSB 0x1F +#define QIB_7322_TXEStatus_0_TXE_IBC_Idle_MSB 0x1F +#define QIB_7322_TXEStatus_0_TXE_IBC_Idle_RMASK 0x1 +#define QIB_7322_TXEStatus_0_RmFifoEmpty_LSB 0x1E +#define 
QIB_7322_TXEStatus_0_RmFifoEmpty_MSB 0x1E +#define QIB_7322_TXEStatus_0_RmFifoEmpty_RMASK 0x1 +#define QIB_7322_TXEStatus_0_LaFifoEmpty_VL15_LSB 0xF +#define QIB_7322_TXEStatus_0_LaFifoEmpty_VL15_MSB 0xF +#define QIB_7322_TXEStatus_0_LaFifoEmpty_VL15_RMASK 0x1 +#define QIB_7322_TXEStatus_0_LaFifoEmpty_VL7_LSB 0x7 +#define QIB_7322_TXEStatus_0_LaFifoEmpty_VL7_MSB 0x7 +#define QIB_7322_TXEStatus_0_LaFifoEmpty_VL7_RMASK 0x1 +#define QIB_7322_TXEStatus_0_LaFifoEmpty_VL6_LSB 0x6 +#define QIB_7322_TXEStatus_0_LaFifoEmpty_VL6_MSB 0x6 +#define QIB_7322_TXEStatus_0_LaFifoEmpty_VL6_RMASK 0x1 +#define QIB_7322_TXEStatus_0_LaFifoEmpty_VL5_LSB 0x5 +#define QIB_7322_TXEStatus_0_LaFifoEmpty_VL5_MSB 0x5 +#define QIB_7322_TXEStatus_0_LaFifoEmpty_VL5_RMASK 0x1 +#define QIB_7322_TXEStatus_0_LaFifoEmpty_VL4_LSB 0x4 +#define QIB_7322_TXEStatus_0_LaFifoEmpty_VL4_MSB 0x4 +#define QIB_7322_TXEStatus_0_LaFifoEmpty_VL4_RMASK 0x1 +#define QIB_7322_TXEStatus_0_LaFifoEmpty_VL3_LSB 0x3 +#define QIB_7322_TXEStatus_0_LaFifoEmpty_VL3_MSB 0x3 +#define QIB_7322_TXEStatus_0_LaFifoEmpty_VL3_RMASK 0x1 +#define QIB_7322_TXEStatus_0_LaFifoEmpty_VL2_LSB 0x2 +#define QIB_7322_TXEStatus_0_LaFifoEmpty_VL2_MSB 0x2 +#define QIB_7322_TXEStatus_0_LaFifoEmpty_VL2_RMASK 0x1 +#define QIB_7322_TXEStatus_0_LaFifoEmpty_VL1_LSB 0x1 +#define QIB_7322_TXEStatus_0_LaFifoEmpty_VL1_MSB 0x1 +#define QIB_7322_TXEStatus_0_LaFifoEmpty_VL1_RMASK 0x1 +#define QIB_7322_TXEStatus_0_LaFifoEmpty_VL0_LSB 0x0 +#define QIB_7322_TXEStatus_0_LaFifoEmpty_VL0_MSB 0x0 +#define QIB_7322_TXEStatus_0_LaFifoEmpty_VL0_RMASK 0x1 + +#define QIB_7322_RcvCtrl_0_OFFS 0x1100 +#define QIB_7322_RcvCtrl_0_DEF 0x0000000000000000 +#define QIB_7322_RcvCtrl_0_RcvResetCredit_LSB 0x2A +#define QIB_7322_RcvCtrl_0_RcvResetCredit_MSB 0x2A +#define QIB_7322_RcvCtrl_0_RcvResetCredit_RMASK 0x1 +#define QIB_7322_RcvCtrl_0_RcvPartitionKeyDisable_LSB 0x29 +#define QIB_7322_RcvCtrl_0_RcvPartitionKeyDisable_MSB 0x29 +#define QIB_7322_RcvCtrl_0_RcvPartitionKeyDisable_RMASK 0x1 +#define QIB_7322_RcvCtrl_0_RcvQPMapEnable_LSB 0x28 +#define QIB_7322_RcvCtrl_0_RcvQPMapEnable_MSB 0x28 +#define QIB_7322_RcvCtrl_0_RcvQPMapEnable_RMASK 0x1 +#define QIB_7322_RcvCtrl_0_RcvIBPortEnable_LSB 0x27 +#define QIB_7322_RcvCtrl_0_RcvIBPortEnable_MSB 0x27 +#define QIB_7322_RcvCtrl_0_RcvIBPortEnable_RMASK 0x1 +#define QIB_7322_RcvCtrl_0_ContextEnableUser_LSB 0x2 +#define QIB_7322_RcvCtrl_0_ContextEnableUser_MSB 0x11 +#define QIB_7322_RcvCtrl_0_ContextEnableUser_RMASK 0xFFFF +#define QIB_7322_RcvCtrl_0_ContextEnableKernel_LSB 0x0 +#define QIB_7322_RcvCtrl_0_ContextEnableKernel_MSB 0x0 +#define QIB_7322_RcvCtrl_0_ContextEnableKernel_RMASK 0x1 + +#define QIB_7322_RcvBTHQP_0_OFFS 0x1108 +#define QIB_7322_RcvBTHQP_0_DEF 0x0000000000000000 +#define QIB_7322_RcvBTHQP_0_RcvBTHQP_LSB 0x0 +#define QIB_7322_RcvBTHQP_0_RcvBTHQP_MSB 0x17 +#define QIB_7322_RcvBTHQP_0_RcvBTHQP_RMASK 0xFFFFFF + +#define QIB_7322_RcvQPMapTableA_0_OFFS 0x1110 +#define QIB_7322_RcvQPMapTableA_0_DEF 0x0000000000000000 +#define QIB_7322_RcvQPMapTableA_0_RcvQPMapContext5_LSB 0x19 +#define QIB_7322_RcvQPMapTableA_0_RcvQPMapContext5_MSB 0x1D +#define QIB_7322_RcvQPMapTableA_0_RcvQPMapContext5_RMASK 0x1F +#define QIB_7322_RcvQPMapTableA_0_RcvQPMapContext4_LSB 0x14 +#define QIB_7322_RcvQPMapTableA_0_RcvQPMapContext4_MSB 0x18 +#define QIB_7322_RcvQPMapTableA_0_RcvQPMapContext4_RMASK 0x1F +#define QIB_7322_RcvQPMapTableA_0_RcvQPMapContext3_LSB 0xF +#define QIB_7322_RcvQPMapTableA_0_RcvQPMapContext3_MSB 0x13 +#define 
QIB_7322_RcvQPMapTableA_0_RcvQPMapContext3_RMASK 0x1F +#define QIB_7322_RcvQPMapTableA_0_RcvQPMapContext2_LSB 0xA +#define QIB_7322_RcvQPMapTableA_0_RcvQPMapContext2_MSB 0xE +#define QIB_7322_RcvQPMapTableA_0_RcvQPMapContext2_RMASK 0x1F +#define QIB_7322_RcvQPMapTableA_0_RcvQPMapContext1_LSB 0x5 +#define QIB_7322_RcvQPMapTableA_0_RcvQPMapContext1_MSB 0x9 +#define QIB_7322_RcvQPMapTableA_0_RcvQPMapContext1_RMASK 0x1F +#define QIB_7322_RcvQPMapTableA_0_RcvQPMapContext0_LSB 0x0 +#define QIB_7322_RcvQPMapTableA_0_RcvQPMapContext0_MSB 0x4 +#define QIB_7322_RcvQPMapTableA_0_RcvQPMapContext0_RMASK 0x1F + +#define QIB_7322_RcvQPMapTableB_0_OFFS 0x1118 +#define QIB_7322_RcvQPMapTableB_0_DEF 0x0000000000000000 +#define QIB_7322_RcvQPMapTableB_0_RcvQPMapContext11_LSB 0x19 +#define QIB_7322_RcvQPMapTableB_0_RcvQPMapContext11_MSB 0x1D +#define QIB_7322_RcvQPMapTableB_0_RcvQPMapContext11_RMASK 0x1F +#define QIB_7322_RcvQPMapTableB_0_RcvQPMapContext10_LSB 0x14 +#define QIB_7322_RcvQPMapTableB_0_RcvQPMapContext10_MSB 0x18 +#define QIB_7322_RcvQPMapTableB_0_RcvQPMapContext10_RMASK 0x1F +#define QIB_7322_RcvQPMapTableB_0_RcvQPMapContext9_LSB 0xF +#define QIB_7322_RcvQPMapTableB_0_RcvQPMapContext9_MSB 0x13 +#define QIB_7322_RcvQPMapTableB_0_RcvQPMapContext9_RMASK 0x1F +#define QIB_7322_RcvQPMapTableB_0_RcvQPMapContext8_LSB 0xA +#define QIB_7322_RcvQPMapTableB_0_RcvQPMapContext8_MSB 0xE +#define QIB_7322_RcvQPMapTableB_0_RcvQPMapContext8_RMASK 0x1F +#define QIB_7322_RcvQPMapTableB_0_RcvQPMapContext7_LSB 0x5 +#define QIB_7322_RcvQPMapTableB_0_RcvQPMapContext7_MSB 0x9 +#define QIB_7322_RcvQPMapTableB_0_RcvQPMapContext7_RMASK 0x1F +#define QIB_7322_RcvQPMapTableB_0_RcvQPMapContext6_LSB 0x0 +#define QIB_7322_RcvQPMapTableB_0_RcvQPMapContext6_MSB 0x4 +#define QIB_7322_RcvQPMapTableB_0_RcvQPMapContext6_RMASK 0x1F + +#define QIB_7322_RcvQPMapTableC_0_OFFS 0x1120 +#define QIB_7322_RcvQPMapTableC_0_DEF 0x0000000000000000 +#define QIB_7322_RcvQPMapTableC_0_RcvQPMapContext17_LSB 0x19 +#define QIB_7322_RcvQPMapTableC_0_RcvQPMapContext17_MSB 0x1D +#define QIB_7322_RcvQPMapTableC_0_RcvQPMapContext17_RMASK 0x1F +#define QIB_7322_RcvQPMapTableC_0_RcvQPMapContext16_LSB 0x14 +#define QIB_7322_RcvQPMapTableC_0_RcvQPMapContext16_MSB 0x18 +#define QIB_7322_RcvQPMapTableC_0_RcvQPMapContext16_RMASK 0x1F +#define QIB_7322_RcvQPMapTableC_0_RcvQPMapContext15_LSB 0xF +#define QIB_7322_RcvQPMapTableC_0_RcvQPMapContext15_MSB 0x13 +#define QIB_7322_RcvQPMapTableC_0_RcvQPMapContext15_RMASK 0x1F +#define QIB_7322_RcvQPMapTableC_0_RcvQPMapContext14_LSB 0xA +#define QIB_7322_RcvQPMapTableC_0_RcvQPMapContext14_MSB 0xE +#define QIB_7322_RcvQPMapTableC_0_RcvQPMapContext14_RMASK 0x1F +#define QIB_7322_RcvQPMapTableC_0_RcvQPMapContext13_LSB 0x5 +#define QIB_7322_RcvQPMapTableC_0_RcvQPMapContext13_MSB 0x9 +#define QIB_7322_RcvQPMapTableC_0_RcvQPMapContext13_RMASK 0x1F +#define QIB_7322_RcvQPMapTableC_0_RcvQPMapContext12_LSB 0x0 +#define QIB_7322_RcvQPMapTableC_0_RcvQPMapContext12_MSB 0x4 +#define QIB_7322_RcvQPMapTableC_0_RcvQPMapContext12_RMASK 0x1F + +#define QIB_7322_RcvQPMapTableD_0_OFFS 0x1128 +#define QIB_7322_RcvQPMapTableD_0_DEF 0x0000000000000000 +#define QIB_7322_RcvQPMapTableD_0_RcvQPMapContext23_LSB 0x19 +#define QIB_7322_RcvQPMapTableD_0_RcvQPMapContext23_MSB 0x1D +#define QIB_7322_RcvQPMapTableD_0_RcvQPMapContext23_RMASK 0x1F +#define QIB_7322_RcvQPMapTableD_0_RcvQPMapContext22_LSB 0x14 +#define QIB_7322_RcvQPMapTableD_0_RcvQPMapContext22_MSB 0x18 +#define QIB_7322_RcvQPMapTableD_0_RcvQPMapContext22_RMASK 0x1F +#define 
QIB_7322_RcvQPMapTableD_0_RcvQPMapContext21_LSB 0xF +#define QIB_7322_RcvQPMapTableD_0_RcvQPMapContext21_MSB 0x13 +#define QIB_7322_RcvQPMapTableD_0_RcvQPMapContext21_RMASK 0x1F +#define QIB_7322_RcvQPMapTableD_0_RcvQPMapContext20_LSB 0xA +#define QIB_7322_RcvQPMapTableD_0_RcvQPMapContext20_MSB 0xE +#define QIB_7322_RcvQPMapTableD_0_RcvQPMapContext20_RMASK 0x1F +#define QIB_7322_RcvQPMapTableD_0_RcvQPMapContext19_LSB 0x5 +#define QIB_7322_RcvQPMapTableD_0_RcvQPMapContext19_MSB 0x9 +#define QIB_7322_RcvQPMapTableD_0_RcvQPMapContext19_RMASK 0x1F +#define QIB_7322_RcvQPMapTableD_0_RcvQPMapContext18_LSB 0x0 +#define QIB_7322_RcvQPMapTableD_0_RcvQPMapContext18_MSB 0x4 +#define QIB_7322_RcvQPMapTableD_0_RcvQPMapContext18_RMASK 0x1F + +#define QIB_7322_RcvQPMapTableE_0_OFFS 0x1130 +#define QIB_7322_RcvQPMapTableE_0_DEF 0x0000000000000000 +#define QIB_7322_RcvQPMapTableE_0_RcvQPMapContext29_LSB 0x19 +#define QIB_7322_RcvQPMapTableE_0_RcvQPMapContext29_MSB 0x1D +#define QIB_7322_RcvQPMapTableE_0_RcvQPMapContext29_RMASK 0x1F +#define QIB_7322_RcvQPMapTableE_0_RcvQPMapContext28_LSB 0x14 +#define QIB_7322_RcvQPMapTableE_0_RcvQPMapContext28_MSB 0x18 +#define QIB_7322_RcvQPMapTableE_0_RcvQPMapContext28_RMASK 0x1F +#define QIB_7322_RcvQPMapTableE_0_RcvQPMapContext27_LSB 0xF +#define QIB_7322_RcvQPMapTableE_0_RcvQPMapContext27_MSB 0x13 +#define QIB_7322_RcvQPMapTableE_0_RcvQPMapContext27_RMASK 0x1F +#define QIB_7322_RcvQPMapTableE_0_RcvQPMapContext26_LSB 0xA +#define QIB_7322_RcvQPMapTableE_0_RcvQPMapContext26_MSB 0xE +#define QIB_7322_RcvQPMapTableE_0_RcvQPMapContext26_RMASK 0x1F +#define QIB_7322_RcvQPMapTableE_0_RcvQPMapContext25_LSB 0x5 +#define QIB_7322_RcvQPMapTableE_0_RcvQPMapContext25_MSB 0x9 +#define QIB_7322_RcvQPMapTableE_0_RcvQPMapContext25_RMASK 0x1F +#define QIB_7322_RcvQPMapTableE_0_RcvQPMapContext24_LSB 0x0 +#define QIB_7322_RcvQPMapTableE_0_RcvQPMapContext24_MSB 0x4 +#define QIB_7322_RcvQPMapTableE_0_RcvQPMapContext24_RMASK 0x1F + +#define QIB_7322_RcvQPMapTableF_0_OFFS 0x1138 +#define QIB_7322_RcvQPMapTableF_0_DEF 0x0000000000000000 +#define QIB_7322_RcvQPMapTableF_0_RcvQPMapContext31_LSB 0x5 +#define QIB_7322_RcvQPMapTableF_0_RcvQPMapContext31_MSB 0x9 +#define QIB_7322_RcvQPMapTableF_0_RcvQPMapContext31_RMASK 0x1F +#define QIB_7322_RcvQPMapTableF_0_RcvQPMapContext30_LSB 0x0 +#define QIB_7322_RcvQPMapTableF_0_RcvQPMapContext30_MSB 0x4 +#define QIB_7322_RcvQPMapTableF_0_RcvQPMapContext30_RMASK 0x1F + +#define QIB_7322_PSStat_0_OFFS 0x1140 +#define QIB_7322_PSStat_0_DEF 0x0000000000000000 + +#define QIB_7322_PSStart_0_OFFS 0x1148 +#define QIB_7322_PSStart_0_DEF 0x0000000000000000 + +#define QIB_7322_PSInterval_0_OFFS 0x1150 +#define QIB_7322_PSInterval_0_DEF 0x0000000000000000 + +#define QIB_7322_RcvStatus_0_OFFS 0x1160 +#define QIB_7322_RcvStatus_0_DEF 0x0000000000000000 +#define QIB_7322_RcvStatus_0_DmaeqBlockingContext_LSB 0x1 +#define QIB_7322_RcvStatus_0_DmaeqBlockingContext_MSB 0x5 +#define QIB_7322_RcvStatus_0_DmaeqBlockingContext_RMASK 0x1F +#define QIB_7322_RcvStatus_0_RxPktInProgress_LSB 0x0 +#define QIB_7322_RcvStatus_0_RxPktInProgress_MSB 0x0 +#define QIB_7322_RcvStatus_0_RxPktInProgress_RMASK 0x1 + +#define QIB_7322_RcvPartitionKey_0_OFFS 0x1168 +#define QIB_7322_RcvPartitionKey_0_DEF 0x0000000000000000 + +#define QIB_7322_RcvQPMulticastContext_0_OFFS 0x1170 +#define QIB_7322_RcvQPMulticastContext_0_DEF 0x0000000000000000 +#define QIB_7322_RcvQPMulticastContext_0_RcvQpMcContext_LSB 0x0 +#define QIB_7322_RcvQPMulticastContext_0_RcvQpMcContext_MSB 0x4 +#define 
QIB_7322_RcvQPMulticastContext_0_RcvQpMcContext_RMASK 0x1F + +#define QIB_7322_RcvPktLEDCnt_0_OFFS 0x1178 +#define QIB_7322_RcvPktLEDCnt_0_DEF 0x0000000000000000 +#define QIB_7322_RcvPktLEDCnt_0_ONperiod_LSB 0x20 +#define QIB_7322_RcvPktLEDCnt_0_ONperiod_MSB 0x3F +#define QIB_7322_RcvPktLEDCnt_0_ONperiod_RMASK 0xFFFFFFFF +#define QIB_7322_RcvPktLEDCnt_0_OFFperiod_LSB 0x0 +#define QIB_7322_RcvPktLEDCnt_0_OFFperiod_MSB 0x1F +#define QIB_7322_RcvPktLEDCnt_0_OFFperiod_RMASK 0xFFFFFFFF + +#define QIB_7322_SendDmaIdleCnt_0_OFFS 0x1180 +#define QIB_7322_SendDmaIdleCnt_0_DEF 0x0000000000000000 +#define QIB_7322_SendDmaIdleCnt_0_SendDmaIdleCnt_LSB 0x0 +#define QIB_7322_SendDmaIdleCnt_0_SendDmaIdleCnt_MSB 0xF +#define QIB_7322_SendDmaIdleCnt_0_SendDmaIdleCnt_RMASK 0xFFFF + +#define QIB_7322_SendDmaReloadCnt_0_OFFS 0x1188 +#define QIB_7322_SendDmaReloadCnt_0_DEF 0x0000000000000000 +#define QIB_7322_SendDmaReloadCnt_0_SendDmaReloadCnt_LSB 0x0 +#define QIB_7322_SendDmaReloadCnt_0_SendDmaReloadCnt_MSB 0xF +#define QIB_7322_SendDmaReloadCnt_0_SendDmaReloadCnt_RMASK 0xFFFF + +#define QIB_7322_SendDmaDescCnt_0_OFFS 0x1190 +#define QIB_7322_SendDmaDescCnt_0_DEF 0x0000000000000000 +#define QIB_7322_SendDmaDescCnt_0_SendDmaDescCnt_LSB 0x0 +#define QIB_7322_SendDmaDescCnt_0_SendDmaDescCnt_MSB 0xF +#define QIB_7322_SendDmaDescCnt_0_SendDmaDescCnt_RMASK 0xFFFF + +#define QIB_7322_SendCtrl_0_OFFS 0x11C0 +#define QIB_7322_SendCtrl_0_DEF 0x0000000000000000 +#define QIB_7322_SendCtrl_0_IBVLArbiterEn_LSB 0xF +#define QIB_7322_SendCtrl_0_IBVLArbiterEn_MSB 0xF +#define QIB_7322_SendCtrl_0_IBVLArbiterEn_RMASK 0x1 +#define QIB_7322_SendCtrl_0_TxeDrainRmFifo_LSB 0xE +#define QIB_7322_SendCtrl_0_TxeDrainRmFifo_MSB 0xE +#define QIB_7322_SendCtrl_0_TxeDrainRmFifo_RMASK 0x1 +#define QIB_7322_SendCtrl_0_TxeDrainLaFifo_LSB 0xD +#define QIB_7322_SendCtrl_0_TxeDrainLaFifo_MSB 0xD +#define QIB_7322_SendCtrl_0_TxeDrainLaFifo_RMASK 0x1 +#define QIB_7322_SendCtrl_0_SDmaHalt_LSB 0xC +#define QIB_7322_SendCtrl_0_SDmaHalt_MSB 0xC +#define QIB_7322_SendCtrl_0_SDmaHalt_RMASK 0x1 +#define QIB_7322_SendCtrl_0_SDmaEnable_LSB 0xB +#define QIB_7322_SendCtrl_0_SDmaEnable_MSB 0xB +#define QIB_7322_SendCtrl_0_SDmaEnable_RMASK 0x1 +#define QIB_7322_SendCtrl_0_SDmaSingleDescriptor_LSB 0xA +#define QIB_7322_SendCtrl_0_SDmaSingleDescriptor_MSB 0xA +#define QIB_7322_SendCtrl_0_SDmaSingleDescriptor_RMASK 0x1 +#define QIB_7322_SendCtrl_0_SDmaIntEnable_LSB 0x9 +#define QIB_7322_SendCtrl_0_SDmaIntEnable_MSB 0x9 +#define QIB_7322_SendCtrl_0_SDmaIntEnable_RMASK 0x1 +#define QIB_7322_SendCtrl_0_SDmaCleanup_LSB 0x8 +#define QIB_7322_SendCtrl_0_SDmaCleanup_MSB 0x8 +#define QIB_7322_SendCtrl_0_SDmaCleanup_RMASK 0x1 +#define QIB_7322_SendCtrl_0_ForceCreditUpToDate_LSB 0x7 +#define QIB_7322_SendCtrl_0_ForceCreditUpToDate_MSB 0x7 +#define QIB_7322_SendCtrl_0_ForceCreditUpToDate_RMASK 0x1 +#define QIB_7322_SendCtrl_0_SendEnable_LSB 0x3 +#define QIB_7322_SendCtrl_0_SendEnable_MSB 0x3 +#define QIB_7322_SendCtrl_0_SendEnable_RMASK 0x1 +#define QIB_7322_SendCtrl_0_TxeBypassIbc_LSB 0x1 +#define QIB_7322_SendCtrl_0_TxeBypassIbc_MSB 0x1 +#define QIB_7322_SendCtrl_0_TxeBypassIbc_RMASK 0x1 +#define QIB_7322_SendCtrl_0_TxeAbortIbc_LSB 0x0 +#define QIB_7322_SendCtrl_0_TxeAbortIbc_MSB 0x0 +#define QIB_7322_SendCtrl_0_TxeAbortIbc_RMASK 0x1 + +#define QIB_7322_SendDmaBase_0_OFFS 0x11F8 +#define QIB_7322_SendDmaBase_0_DEF 0x0000000000000000 +#define QIB_7322_SendDmaBase_0_SendDmaBase_LSB 0x0 +#define QIB_7322_SendDmaBase_0_SendDmaBase_MSB 0x2F +#define 
QIB_7322_SendDmaBase_0_SendDmaBase_RMASK 0xFFFFFFFFFFFF + +#define QIB_7322_SendDmaLenGen_0_OFFS 0x1200 +#define QIB_7322_SendDmaLenGen_0_DEF 0x0000000000000000 +#define QIB_7322_SendDmaLenGen_0_Generation_LSB 0x10 +#define QIB_7322_SendDmaLenGen_0_Generation_MSB 0x12 +#define QIB_7322_SendDmaLenGen_0_Generation_RMASK 0x7 +#define QIB_7322_SendDmaLenGen_0_Length_LSB 0x0 +#define QIB_7322_SendDmaLenGen_0_Length_MSB 0xF +#define QIB_7322_SendDmaLenGen_0_Length_RMASK 0xFFFF + +#define QIB_7322_SendDmaTail_0_OFFS 0x1208 +#define QIB_7322_SendDmaTail_0_DEF 0x0000000000000000 +#define QIB_7322_SendDmaTail_0_SendDmaTail_LSB 0x0 +#define QIB_7322_SendDmaTail_0_SendDmaTail_MSB 0xF +#define QIB_7322_SendDmaTail_0_SendDmaTail_RMASK 0xFFFF + +#define QIB_7322_SendDmaHead_0_OFFS 0x1210 +#define QIB_7322_SendDmaHead_0_DEF 0x0000000000000000 +#define QIB_7322_SendDmaHead_0_InternalSendDmaHead_LSB 0x20 +#define QIB_7322_SendDmaHead_0_InternalSendDmaHead_MSB 0x2F +#define QIB_7322_SendDmaHead_0_InternalSendDmaHead_RMASK 0xFFFF +#define QIB_7322_SendDmaHead_0_SendDmaHead_LSB 0x0 +#define QIB_7322_SendDmaHead_0_SendDmaHead_MSB 0xF +#define QIB_7322_SendDmaHead_0_SendDmaHead_RMASK 0xFFFF + +#define QIB_7322_SendDmaHeadAddr_0_OFFS 0x1218 +#define QIB_7322_SendDmaHeadAddr_0_DEF 0x0000000000000000 +#define QIB_7322_SendDmaHeadAddr_0_SendDmaHeadAddr_LSB 0x0 +#define QIB_7322_SendDmaHeadAddr_0_SendDmaHeadAddr_MSB 0x2F +#define QIB_7322_SendDmaHeadAddr_0_SendDmaHeadAddr_RMASK 0xFFFFFFFFFFFF + +#define QIB_7322_SendDmaBufMask0_0_OFFS 0x1220 +#define QIB_7322_SendDmaBufMask0_0_DEF 0x0000000000000000 +#define QIB_7322_SendDmaBufMask0_0_BufMask_63_0_LSB 0x0 +#define QIB_7322_SendDmaBufMask0_0_BufMask_63_0_MSB 0x3F +#define QIB_7322_SendDmaBufMask0_0_BufMask_63_0_RMASK 0x0 + +#define QIB_7322_SendDmaStatus_0_OFFS 0x1238 +#define QIB_7322_SendDmaStatus_0_DEF 0x0000000042000000 +#define QIB_7322_SendDmaStatus_0_ScoreBoardDrainInProg_LSB 0x3F +#define QIB_7322_SendDmaStatus_0_ScoreBoardDrainInProg_MSB 0x3F +#define QIB_7322_SendDmaStatus_0_ScoreBoardDrainInProg_RMASK 0x1 +#define QIB_7322_SendDmaStatus_0_HaltInProg_LSB 0x3E +#define QIB_7322_SendDmaStatus_0_HaltInProg_MSB 0x3E +#define QIB_7322_SendDmaStatus_0_HaltInProg_RMASK 0x1 +#define QIB_7322_SendDmaStatus_0_InternalSDmaHalt_LSB 0x3D +#define QIB_7322_SendDmaStatus_0_InternalSDmaHalt_MSB 0x3D +#define QIB_7322_SendDmaStatus_0_InternalSDmaHalt_RMASK 0x1 +#define QIB_7322_SendDmaStatus_0_ScbDescIndex_13_0_LSB 0x2F +#define QIB_7322_SendDmaStatus_0_ScbDescIndex_13_0_MSB 0x3C +#define QIB_7322_SendDmaStatus_0_ScbDescIndex_13_0_RMASK 0x3FFF +#define QIB_7322_SendDmaStatus_0_RpyLowAddr_6_0_LSB 0x28 +#define QIB_7322_SendDmaStatus_0_RpyLowAddr_6_0_MSB 0x2E +#define QIB_7322_SendDmaStatus_0_RpyLowAddr_6_0_RMASK 0x7F +#define QIB_7322_SendDmaStatus_0_RpyTag_7_0_LSB 0x20 +#define QIB_7322_SendDmaStatus_0_RpyTag_7_0_MSB 0x27 +#define QIB_7322_SendDmaStatus_0_RpyTag_7_0_RMASK 0xFF +#define QIB_7322_SendDmaStatus_0_ScbFull_LSB 0x1F +#define QIB_7322_SendDmaStatus_0_ScbFull_MSB 0x1F +#define QIB_7322_SendDmaStatus_0_ScbFull_RMASK 0x1 +#define QIB_7322_SendDmaStatus_0_ScbEmpty_LSB 0x1E +#define QIB_7322_SendDmaStatus_0_ScbEmpty_MSB 0x1E +#define QIB_7322_SendDmaStatus_0_ScbEmpty_RMASK 0x1 +#define QIB_7322_SendDmaStatus_0_ScbEntryValid_LSB 0x1D +#define QIB_7322_SendDmaStatus_0_ScbEntryValid_MSB 0x1D +#define QIB_7322_SendDmaStatus_0_ScbEntryValid_RMASK 0x1 +#define QIB_7322_SendDmaStatus_0_ScbFetchDescFlag_LSB 0x1C +#define QIB_7322_SendDmaStatus_0_ScbFetchDescFlag_MSB 0x1C 
+#define QIB_7322_SendDmaStatus_0_ScbFetchDescFlag_RMASK 0x1 +#define QIB_7322_SendDmaStatus_0_SplFifoReadyToGo_LSB 0x1B +#define QIB_7322_SendDmaStatus_0_SplFifoReadyToGo_MSB 0x1B +#define QIB_7322_SendDmaStatus_0_SplFifoReadyToGo_RMASK 0x1 +#define QIB_7322_SendDmaStatus_0_SplFifoDisarmed_LSB 0x1A +#define QIB_7322_SendDmaStatus_0_SplFifoDisarmed_MSB 0x1A +#define QIB_7322_SendDmaStatus_0_SplFifoDisarmed_RMASK 0x1 +#define QIB_7322_SendDmaStatus_0_SplFifoEmpty_LSB 0x19 +#define QIB_7322_SendDmaStatus_0_SplFifoEmpty_MSB 0x19 +#define QIB_7322_SendDmaStatus_0_SplFifoEmpty_RMASK 0x1 +#define QIB_7322_SendDmaStatus_0_SplFifoFull_LSB 0x18 +#define QIB_7322_SendDmaStatus_0_SplFifoFull_MSB 0x18 +#define QIB_7322_SendDmaStatus_0_SplFifoFull_RMASK 0x1 +#define QIB_7322_SendDmaStatus_0_SplFifoBufNum_LSB 0x10 +#define QIB_7322_SendDmaStatus_0_SplFifoBufNum_MSB 0x17 +#define QIB_7322_SendDmaStatus_0_SplFifoBufNum_RMASK 0xFF +#define QIB_7322_SendDmaStatus_0_SplFifoDescIndex_LSB 0x0 +#define QIB_7322_SendDmaStatus_0_SplFifoDescIndex_MSB 0xF +#define QIB_7322_SendDmaStatus_0_SplFifoDescIndex_RMASK 0xFFFF + +#define QIB_7322_SendDmaPriorityThld_0_OFFS 0x1258 +#define QIB_7322_SendDmaPriorityThld_0_DEF 0x0000000000000000 +#define QIB_7322_SendDmaPriorityThld_0_PriorityThreshold_LSB 0x0 +#define QIB_7322_SendDmaPriorityThld_0_PriorityThreshold_MSB 0x3 +#define QIB_7322_SendDmaPriorityThld_0_PriorityThreshold_RMASK 0xF + +#define QIB_7322_SendHdrErrSymptom_0_OFFS 0x1260 +#define QIB_7322_SendHdrErrSymptom_0_DEF 0x0000000000000000 +#define QIB_7322_SendHdrErrSymptom_0_NonKeyPacket_LSB 0x6 +#define QIB_7322_SendHdrErrSymptom_0_NonKeyPacket_MSB 0x6 +#define QIB_7322_SendHdrErrSymptom_0_NonKeyPacket_RMASK 0x1 +#define QIB_7322_SendHdrErrSymptom_0_GRHFail_LSB 0x5 +#define QIB_7322_SendHdrErrSymptom_0_GRHFail_MSB 0x5 +#define QIB_7322_SendHdrErrSymptom_0_GRHFail_RMASK 0x1 +#define QIB_7322_SendHdrErrSymptom_0_PkeyFail_LSB 0x4 +#define QIB_7322_SendHdrErrSymptom_0_PkeyFail_MSB 0x4 +#define QIB_7322_SendHdrErrSymptom_0_PkeyFail_RMASK 0x1 +#define QIB_7322_SendHdrErrSymptom_0_QPFail_LSB 0x3 +#define QIB_7322_SendHdrErrSymptom_0_QPFail_MSB 0x3 +#define QIB_7322_SendHdrErrSymptom_0_QPFail_RMASK 0x1 +#define QIB_7322_SendHdrErrSymptom_0_SLIDFail_LSB 0x2 +#define QIB_7322_SendHdrErrSymptom_0_SLIDFail_MSB 0x2 +#define QIB_7322_SendHdrErrSymptom_0_SLIDFail_RMASK 0x1 +#define QIB_7322_SendHdrErrSymptom_0_RawIPV6_LSB 0x1 +#define QIB_7322_SendHdrErrSymptom_0_RawIPV6_MSB 0x1 +#define QIB_7322_SendHdrErrSymptom_0_RawIPV6_RMASK 0x1 +#define QIB_7322_SendHdrErrSymptom_0_PacketTooSmall_LSB 0x0 +#define QIB_7322_SendHdrErrSymptom_0_PacketTooSmall_MSB 0x0 +#define QIB_7322_SendHdrErrSymptom_0_PacketTooSmall_RMASK 0x1 + +#define QIB_7322_RxCreditVL0_0_OFFS 0x1280 +#define QIB_7322_RxCreditVL0_0_DEF 0x0000000000000000 +#define QIB_7322_RxCreditVL0_0_RxBufrConsumedVL_LSB 0x10 +#define QIB_7322_RxCreditVL0_0_RxBufrConsumedVL_MSB 0x1B +#define QIB_7322_RxCreditVL0_0_RxBufrConsumedVL_RMASK 0xFFF +#define QIB_7322_RxCreditVL0_0_RxMaxCreditVL_LSB 0x0 +#define QIB_7322_RxCreditVL0_0_RxMaxCreditVL_MSB 0xB +#define QIB_7322_RxCreditVL0_0_RxMaxCreditVL_RMASK 0xFFF + +#define QIB_7322_SendDmaBufUsed0_0_OFFS 0x1480 +#define QIB_7322_SendDmaBufUsed0_0_DEF 0x0000000000000000 +#define QIB_7322_SendDmaBufUsed0_0_BufUsed_63_0_LSB 0x0 +#define QIB_7322_SendDmaBufUsed0_0_BufUsed_63_0_MSB 0x3F +#define QIB_7322_SendDmaBufUsed0_0_BufUsed_63_0_RMASK 0x0 + +#define QIB_7322_SendCheckControl_0_OFFS 0x14A8 +#define QIB_7322_SendCheckControl_0_DEF 
0x0000000000000000 +#define QIB_7322_SendCheckControl_0_PKey_En_LSB 0x4 +#define QIB_7322_SendCheckControl_0_PKey_En_MSB 0x4 +#define QIB_7322_SendCheckControl_0_PKey_En_RMASK 0x1 +#define QIB_7322_SendCheckControl_0_BTHQP_En_LSB 0x3 +#define QIB_7322_SendCheckControl_0_BTHQP_En_MSB 0x3 +#define QIB_7322_SendCheckControl_0_BTHQP_En_RMASK 0x1 +#define QIB_7322_SendCheckControl_0_SLID_En_LSB 0x2 +#define QIB_7322_SendCheckControl_0_SLID_En_MSB 0x2 +#define QIB_7322_SendCheckControl_0_SLID_En_RMASK 0x1 +#define QIB_7322_SendCheckControl_0_RawIPV6_En_LSB 0x1 +#define QIB_7322_SendCheckControl_0_RawIPV6_En_MSB 0x1 +#define QIB_7322_SendCheckControl_0_RawIPV6_En_RMASK 0x1 +#define QIB_7322_SendCheckControl_0_PacketTooSmall_En_LSB 0x0 +#define QIB_7322_SendCheckControl_0_PacketTooSmall_En_MSB 0x0 +#define QIB_7322_SendCheckControl_0_PacketTooSmall_En_RMASK 0x1 + +#define QIB_7322_SendIBSLIDMask_0_OFFS 0x14B0 +#define QIB_7322_SendIBSLIDMask_0_DEF 0x0000000000000000 +#define QIB_7322_SendIBSLIDMask_0_SendIBSLIDMask_15_0_LSB 0x0 +#define QIB_7322_SendIBSLIDMask_0_SendIBSLIDMask_15_0_MSB 0xF +#define QIB_7322_SendIBSLIDMask_0_SendIBSLIDMask_15_0_RMASK 0xFFFF + +#define QIB_7322_SendIBSLIDAssign_0_OFFS 0x14B8 +#define QIB_7322_SendIBSLIDAssign_0_DEF 0x0000000000000000 +#define QIB_7322_SendIBSLIDAssign_0_SendIBSLIDAssign_15_0_LSB 0x0 +#define QIB_7322_SendIBSLIDAssign_0_SendIBSLIDAssign_15_0_MSB 0xF +#define QIB_7322_SendIBSLIDAssign_0_SendIBSLIDAssign_15_0_RMASK 0xFFFF + +#define QIB_7322_IBCStatusA_0_OFFS 0x1540 +#define QIB_7322_IBCStatusA_0_DEF 0x0000000000000X02 +#define QIB_7322_IBCStatusA_0_TxCreditOk_VL7_LSB 0x27 +#define QIB_7322_IBCStatusA_0_TxCreditOk_VL7_MSB 0x27 +#define QIB_7322_IBCStatusA_0_TxCreditOk_VL7_RMASK 0x1 +#define QIB_7322_IBCStatusA_0_TxCreditOk_VL6_LSB 0x26 +#define QIB_7322_IBCStatusA_0_TxCreditOk_VL6_MSB 0x26 +#define QIB_7322_IBCStatusA_0_TxCreditOk_VL6_RMASK 0x1 +#define QIB_7322_IBCStatusA_0_TxCreditOk_VL5_LSB 0x25 +#define QIB_7322_IBCStatusA_0_TxCreditOk_VL5_MSB 0x25 +#define QIB_7322_IBCStatusA_0_TxCreditOk_VL5_RMASK 0x1 +#define QIB_7322_IBCStatusA_0_TxCreditOk_VL4_LSB 0x24 +#define QIB_7322_IBCStatusA_0_TxCreditOk_VL4_MSB 0x24 +#define QIB_7322_IBCStatusA_0_TxCreditOk_VL4_RMASK 0x1 +#define QIB_7322_IBCStatusA_0_TxCreditOk_VL3_LSB 0x23 +#define QIB_7322_IBCStatusA_0_TxCreditOk_VL3_MSB 0x23 +#define QIB_7322_IBCStatusA_0_TxCreditOk_VL3_RMASK 0x1 +#define QIB_7322_IBCStatusA_0_TxCreditOk_VL2_LSB 0x22 +#define QIB_7322_IBCStatusA_0_TxCreditOk_VL2_MSB 0x22 +#define QIB_7322_IBCStatusA_0_TxCreditOk_VL2_RMASK 0x1 +#define QIB_7322_IBCStatusA_0_TxCreditOk_VL1_LSB 0x21 +#define QIB_7322_IBCStatusA_0_TxCreditOk_VL1_MSB 0x21 +#define QIB_7322_IBCStatusA_0_TxCreditOk_VL1_RMASK 0x1 +#define QIB_7322_IBCStatusA_0_TxCreditOk_VL0_LSB 0x20 +#define QIB_7322_IBCStatusA_0_TxCreditOk_VL0_MSB 0x20 +#define QIB_7322_IBCStatusA_0_TxCreditOk_VL0_RMASK 0x1 +#define QIB_7322_IBCStatusA_0_TxReady_LSB 0x1E +#define QIB_7322_IBCStatusA_0_TxReady_MSB 0x1E +#define QIB_7322_IBCStatusA_0_TxReady_RMASK 0x1 +#define QIB_7322_IBCStatusA_0_LinkSpeedQDR_LSB 0x1D +#define QIB_7322_IBCStatusA_0_LinkSpeedQDR_MSB 0x1D +#define QIB_7322_IBCStatusA_0_LinkSpeedQDR_RMASK 0x1 +#define QIB_7322_IBCStatusA_0_ScrambleCapRemote_LSB 0xF +#define QIB_7322_IBCStatusA_0_ScrambleCapRemote_MSB 0xF +#define QIB_7322_IBCStatusA_0_ScrambleCapRemote_RMASK 0x1 +#define QIB_7322_IBCStatusA_0_ScrambleEn_LSB 0xE +#define QIB_7322_IBCStatusA_0_ScrambleEn_MSB 0xE +#define QIB_7322_IBCStatusA_0_ScrambleEn_RMASK 0x1 +#define 
QIB_7322_IBCStatusA_0_IBTxLaneReversed_LSB 0xD +#define QIB_7322_IBCStatusA_0_IBTxLaneReversed_MSB 0xD +#define QIB_7322_IBCStatusA_0_IBTxLaneReversed_RMASK 0x1 +#define QIB_7322_IBCStatusA_0_IBRxLaneReversed_LSB 0xC +#define QIB_7322_IBCStatusA_0_IBRxLaneReversed_MSB 0xC +#define QIB_7322_IBCStatusA_0_IBRxLaneReversed_RMASK 0x1 +#define QIB_7322_IBCStatusA_0_DDS_RXEQ_FAIL_LSB 0xA +#define QIB_7322_IBCStatusA_0_DDS_RXEQ_FAIL_MSB 0xA +#define QIB_7322_IBCStatusA_0_DDS_RXEQ_FAIL_RMASK 0x1 +#define QIB_7322_IBCStatusA_0_LinkWidthActive_LSB 0x9 +#define QIB_7322_IBCStatusA_0_LinkWidthActive_MSB 0x9 +#define QIB_7322_IBCStatusA_0_LinkWidthActive_RMASK 0x1 +#define QIB_7322_IBCStatusA_0_LinkSpeedActive_LSB 0x8 +#define QIB_7322_IBCStatusA_0_LinkSpeedActive_MSB 0x8 +#define QIB_7322_IBCStatusA_0_LinkSpeedActive_RMASK 0x1 +#define QIB_7322_IBCStatusA_0_LinkState_LSB 0x5 +#define QIB_7322_IBCStatusA_0_LinkState_MSB 0x7 +#define QIB_7322_IBCStatusA_0_LinkState_RMASK 0x7 +#define QIB_7322_IBCStatusA_0_LinkTrainingState_LSB 0x0 +#define QIB_7322_IBCStatusA_0_LinkTrainingState_MSB 0x4 +#define QIB_7322_IBCStatusA_0_LinkTrainingState_RMASK 0x1F + +#define QIB_7322_IBCStatusB_0_OFFS 0x1548 +#define QIB_7322_IBCStatusB_0_DEF 0x00000000XXXXXXXX +#define QIB_7322_IBCStatusB_0_ibsd_adaptation_timer_debug_LSB 0x27 +#define QIB_7322_IBCStatusB_0_ibsd_adaptation_timer_debug_MSB 0x27 +#define QIB_7322_IBCStatusB_0_ibsd_adaptation_timer_debug_RMASK 0x1 +#define QIB_7322_IBCStatusB_0_ibsd_adaptation_timer_reached_threshold_LSB 0x26 +#define QIB_7322_IBCStatusB_0_ibsd_adaptation_timer_reached_threshold_MSB 0x26 +#define QIB_7322_IBCStatusB_0_ibsd_adaptation_timer_reached_threshold_RMASK 0x1 +#define QIB_7322_IBCStatusB_0_ibsd_adaptation_timer_started_LSB 0x25 +#define QIB_7322_IBCStatusB_0_ibsd_adaptation_timer_started_MSB 0x25 +#define QIB_7322_IBCStatusB_0_ibsd_adaptation_timer_started_RMASK 0x1 +#define QIB_7322_IBCStatusB_0_heartbeat_timed_out_LSB 0x24 +#define QIB_7322_IBCStatusB_0_heartbeat_timed_out_MSB 0x24 +#define QIB_7322_IBCStatusB_0_heartbeat_timed_out_RMASK 0x1 +#define QIB_7322_IBCStatusB_0_heartbeat_crosstalk_LSB 0x20 +#define QIB_7322_IBCStatusB_0_heartbeat_crosstalk_MSB 0x23 +#define QIB_7322_IBCStatusB_0_heartbeat_crosstalk_RMASK 0xF +#define QIB_7322_IBCStatusB_0_RxEqLocalDevice_LSB 0x1E +#define QIB_7322_IBCStatusB_0_RxEqLocalDevice_MSB 0x1F +#define QIB_7322_IBCStatusB_0_RxEqLocalDevice_RMASK 0x3 +#define QIB_7322_IBCStatusB_0_ReqDDSLocalFromRmt_LSB 0x1A +#define QIB_7322_IBCStatusB_0_ReqDDSLocalFromRmt_MSB 0x1D +#define QIB_7322_IBCStatusB_0_ReqDDSLocalFromRmt_RMASK 0xF +#define QIB_7322_IBCStatusB_0_LinkRoundTripLatency_LSB 0x0 +#define QIB_7322_IBCStatusB_0_LinkRoundTripLatency_MSB 0x19 +#define QIB_7322_IBCStatusB_0_LinkRoundTripLatency_RMASK 0x3FFFFFF + +#define QIB_7322_IBCCtrlA_0_OFFS 0x1560 +#define QIB_7322_IBCCtrlA_0_DEF 0x0000000000000000 +#define QIB_7322_IBCCtrlA_0_Loopback_LSB 0x3F +#define QIB_7322_IBCCtrlA_0_Loopback_MSB 0x3F +#define QIB_7322_IBCCtrlA_0_Loopback_RMASK 0x1 +#define QIB_7322_IBCCtrlA_0_LinkDownDefaultState_LSB 0x3E +#define QIB_7322_IBCCtrlA_0_LinkDownDefaultState_MSB 0x3E +#define QIB_7322_IBCCtrlA_0_LinkDownDefaultState_RMASK 0x1 +#define QIB_7322_IBCCtrlA_0_IBLinkEn_LSB 0x3D +#define QIB_7322_IBCCtrlA_0_IBLinkEn_MSB 0x3D +#define QIB_7322_IBCCtrlA_0_IBLinkEn_RMASK 0x1 +#define QIB_7322_IBCCtrlA_0_IBStatIntReductionEn_LSB 0x3C +#define QIB_7322_IBCCtrlA_0_IBStatIntReductionEn_MSB 0x3C +#define QIB_7322_IBCCtrlA_0_IBStatIntReductionEn_RMASK 0x1 +#define 
QIB_7322_IBCCtrlA_0_NumVLane_LSB 0x30 +#define QIB_7322_IBCCtrlA_0_NumVLane_MSB 0x32 +#define QIB_7322_IBCCtrlA_0_NumVLane_RMASK 0x7 +#define QIB_7322_IBCCtrlA_0_OverrunThreshold_LSB 0x24 +#define QIB_7322_IBCCtrlA_0_OverrunThreshold_MSB 0x27 +#define QIB_7322_IBCCtrlA_0_OverrunThreshold_RMASK 0xF +#define QIB_7322_IBCCtrlA_0_PhyerrThreshold_LSB 0x20 +#define QIB_7322_IBCCtrlA_0_PhyerrThreshold_MSB 0x23 +#define QIB_7322_IBCCtrlA_0_PhyerrThreshold_RMASK 0xF +#define QIB_7322_IBCCtrlA_0_MaxPktLen_LSB 0x15 +#define QIB_7322_IBCCtrlA_0_MaxPktLen_MSB 0x1F +#define QIB_7322_IBCCtrlA_0_MaxPktLen_RMASK 0x7FF +#define QIB_7322_IBCCtrlA_0_LinkCmd_LSB 0x13 +#define QIB_7322_IBCCtrlA_0_LinkCmd_MSB 0x14 +#define QIB_7322_IBCCtrlA_0_LinkCmd_RMASK 0x3 +#define QIB_7322_IBCCtrlA_0_LinkInitCmd_LSB 0x10 +#define QIB_7322_IBCCtrlA_0_LinkInitCmd_MSB 0x12 +#define QIB_7322_IBCCtrlA_0_LinkInitCmd_RMASK 0x7 +#define QIB_7322_IBCCtrlA_0_FlowCtrlWaterMark_LSB 0x8 +#define QIB_7322_IBCCtrlA_0_FlowCtrlWaterMark_MSB 0xF +#define QIB_7322_IBCCtrlA_0_FlowCtrlWaterMark_RMASK 0xFF +#define QIB_7322_IBCCtrlA_0_FlowCtrlPeriod_LSB 0x0 +#define QIB_7322_IBCCtrlA_0_FlowCtrlPeriod_MSB 0x7 +#define QIB_7322_IBCCtrlA_0_FlowCtrlPeriod_RMASK 0xFF + +#define QIB_7322_IBCCtrlB_0_OFFS 0x1568 +#define QIB_7322_IBCCtrlB_0_DEF 0x00000000000305FF +#define QIB_7322_IBCCtrlB_0_IB_DLID_MASK_LSB 0x30 +#define QIB_7322_IBCCtrlB_0_IB_DLID_MASK_MSB 0x3F +#define QIB_7322_IBCCtrlB_0_IB_DLID_MASK_RMASK 0xFFFF +#define QIB_7322_IBCCtrlB_0_IB_DLID_LSB 0x20 +#define QIB_7322_IBCCtrlB_0_IB_DLID_MSB 0x2F +#define QIB_7322_IBCCtrlB_0_IB_DLID_RMASK 0xFFFF +#define QIB_7322_IBCCtrlB_0_IB_ENABLE_FILT_DPKT_LSB 0x1B +#define QIB_7322_IBCCtrlB_0_IB_ENABLE_FILT_DPKT_MSB 0x1B +#define QIB_7322_IBCCtrlB_0_IB_ENABLE_FILT_DPKT_RMASK 0x1 +#define QIB_7322_IBCCtrlB_0_HRTBT_REQ_LSB 0x1A +#define QIB_7322_IBCCtrlB_0_HRTBT_REQ_MSB 0x1A +#define QIB_7322_IBCCtrlB_0_HRTBT_REQ_RMASK 0x1 +#define QIB_7322_IBCCtrlB_0_HRTBT_PORT_LSB 0x12 +#define QIB_7322_IBCCtrlB_0_HRTBT_PORT_MSB 0x19 +#define QIB_7322_IBCCtrlB_0_HRTBT_PORT_RMASK 0xFF +#define QIB_7322_IBCCtrlB_0_HRTBT_AUTO_LSB 0x11 +#define QIB_7322_IBCCtrlB_0_HRTBT_AUTO_MSB 0x11 +#define QIB_7322_IBCCtrlB_0_HRTBT_AUTO_RMASK 0x1 +#define QIB_7322_IBCCtrlB_0_HRTBT_ENB_LSB 0x10 +#define QIB_7322_IBCCtrlB_0_HRTBT_ENB_MSB 0x10 +#define QIB_7322_IBCCtrlB_0_HRTBT_ENB_RMASK 0x1 +#define QIB_7322_IBCCtrlB_0_SD_DDS_LSB 0xC +#define QIB_7322_IBCCtrlB_0_SD_DDS_MSB 0xF +#define QIB_7322_IBCCtrlB_0_SD_DDS_RMASK 0xF +#define QIB_7322_IBCCtrlB_0_SD_DDSV_LSB 0xB +#define QIB_7322_IBCCtrlB_0_SD_DDSV_MSB 0xB +#define QIB_7322_IBCCtrlB_0_SD_DDSV_RMASK 0x1 +#define QIB_7322_IBCCtrlB_0_SD_ADD_ENB_LSB 0xA +#define QIB_7322_IBCCtrlB_0_SD_ADD_ENB_MSB 0xA +#define QIB_7322_IBCCtrlB_0_SD_ADD_ENB_RMASK 0x1 +#define QIB_7322_IBCCtrlB_0_SD_RX_EQUAL_ENABLE_LSB 0x9 +#define QIB_7322_IBCCtrlB_0_SD_RX_EQUAL_ENABLE_MSB 0x9 +#define QIB_7322_IBCCtrlB_0_SD_RX_EQUAL_ENABLE_RMASK 0x1 +#define QIB_7322_IBCCtrlB_0_IB_LANE_REV_SUPPORTED_LSB 0x8 +#define QIB_7322_IBCCtrlB_0_IB_LANE_REV_SUPPORTED_MSB 0x8 +#define QIB_7322_IBCCtrlB_0_IB_LANE_REV_SUPPORTED_RMASK 0x1 +#define QIB_7322_IBCCtrlB_0_IB_POLARITY_REV_SUPP_LSB 0x7 +#define QIB_7322_IBCCtrlB_0_IB_POLARITY_REV_SUPP_MSB 0x7 +#define QIB_7322_IBCCtrlB_0_IB_POLARITY_REV_SUPP_RMASK 0x1 +#define QIB_7322_IBCCtrlB_0_IB_NUM_CHANNELS_LSB 0x5 +#define QIB_7322_IBCCtrlB_0_IB_NUM_CHANNELS_MSB 0x6 +#define QIB_7322_IBCCtrlB_0_IB_NUM_CHANNELS_RMASK 0x3 +#define QIB_7322_IBCCtrlB_0_SD_SPEED_QDR_LSB 0x4 +#define 
QIB_7322_IBCCtrlB_0_SD_SPEED_QDR_MSB 0x4 +#define QIB_7322_IBCCtrlB_0_SD_SPEED_QDR_RMASK 0x1 +#define QIB_7322_IBCCtrlB_0_SD_SPEED_DDR_LSB 0x3 +#define QIB_7322_IBCCtrlB_0_SD_SPEED_DDR_MSB 0x3 +#define QIB_7322_IBCCtrlB_0_SD_SPEED_DDR_RMASK 0x1 +#define QIB_7322_IBCCtrlB_0_SD_SPEED_SDR_LSB 0x2 +#define QIB_7322_IBCCtrlB_0_SD_SPEED_SDR_MSB 0x2 +#define QIB_7322_IBCCtrlB_0_SD_SPEED_SDR_RMASK 0x1 +#define QIB_7322_IBCCtrlB_0_SD_SPEED_LSB 0x1 +#define QIB_7322_IBCCtrlB_0_SD_SPEED_MSB 0x1 +#define QIB_7322_IBCCtrlB_0_SD_SPEED_RMASK 0x1 +#define QIB_7322_IBCCtrlB_0_IB_ENHANCED_MODE_LSB 0x0 +#define QIB_7322_IBCCtrlB_0_IB_ENHANCED_MODE_MSB 0x0 +#define QIB_7322_IBCCtrlB_0_IB_ENHANCED_MODE_RMASK 0x1 + +#define QIB_7322_IBCCtrlC_0_OFFS 0x1570 +#define QIB_7322_IBCCtrlC_0_DEF 0x0000000000000301 +#define QIB_7322_IBCCtrlC_0_IB_BACK_PORCH_LSB 0x5 +#define QIB_7322_IBCCtrlC_0_IB_BACK_PORCH_MSB 0x9 +#define QIB_7322_IBCCtrlC_0_IB_BACK_PORCH_RMASK 0x1F +#define QIB_7322_IBCCtrlC_0_IB_FRONT_PORCH_LSB 0x0 +#define QIB_7322_IBCCtrlC_0_IB_FRONT_PORCH_MSB 0x4 +#define QIB_7322_IBCCtrlC_0_IB_FRONT_PORCH_RMASK 0x1F + +#define QIB_7322_HRTBT_GUID_0_OFFS 0x1588 +#define QIB_7322_HRTBT_GUID_0_DEF 0x0000000000000000 + +#define QIB_7322_IB_SDTEST_IF_TX_0_OFFS 0x1590 +#define QIB_7322_IB_SDTEST_IF_TX_0_DEF 0x0000000000000000 +#define QIB_7322_IB_SDTEST_IF_TX_0_TS_TX_RX_CFG_LSB 0x30 +#define QIB_7322_IB_SDTEST_IF_TX_0_TS_TX_RX_CFG_MSB 0x3F +#define QIB_7322_IB_SDTEST_IF_TX_0_TS_TX_RX_CFG_RMASK 0xFFFF +#define QIB_7322_IB_SDTEST_IF_TX_0_TS_TX_TX_CFG_LSB 0x20 +#define QIB_7322_IB_SDTEST_IF_TX_0_TS_TX_TX_CFG_MSB 0x2F +#define QIB_7322_IB_SDTEST_IF_TX_0_TS_TX_TX_CFG_RMASK 0xFFFF +#define QIB_7322_IB_SDTEST_IF_TX_0_TS_TX_SPEED_LSB 0xD +#define QIB_7322_IB_SDTEST_IF_TX_0_TS_TX_SPEED_MSB 0xF +#define QIB_7322_IB_SDTEST_IF_TX_0_TS_TX_SPEED_RMASK 0x7 +#define QIB_7322_IB_SDTEST_IF_TX_0_TS_TX_OPCODE_LSB 0xB +#define QIB_7322_IB_SDTEST_IF_TX_0_TS_TX_OPCODE_MSB 0xC +#define QIB_7322_IB_SDTEST_IF_TX_0_TS_TX_OPCODE_RMASK 0x3 +#define QIB_7322_IB_SDTEST_IF_TX_0_CREDIT_CHANGE_LSB 0x4 +#define QIB_7322_IB_SDTEST_IF_TX_0_CREDIT_CHANGE_MSB 0x4 +#define QIB_7322_IB_SDTEST_IF_TX_0_CREDIT_CHANGE_RMASK 0x1 +#define QIB_7322_IB_SDTEST_IF_TX_0_VL_CAP_LSB 0x2 +#define QIB_7322_IB_SDTEST_IF_TX_0_VL_CAP_MSB 0x3 +#define QIB_7322_IB_SDTEST_IF_TX_0_VL_CAP_RMASK 0x3 +#define QIB_7322_IB_SDTEST_IF_TX_0_TS_3_TX_VALID_LSB 0x1 +#define QIB_7322_IB_SDTEST_IF_TX_0_TS_3_TX_VALID_MSB 0x1 +#define QIB_7322_IB_SDTEST_IF_TX_0_TS_3_TX_VALID_RMASK 0x1 +#define QIB_7322_IB_SDTEST_IF_TX_0_TS_T_TX_VALID_LSB 0x0 +#define QIB_7322_IB_SDTEST_IF_TX_0_TS_T_TX_VALID_MSB 0x0 +#define QIB_7322_IB_SDTEST_IF_TX_0_TS_T_TX_VALID_RMASK 0x1 + +#define QIB_7322_IB_SDTEST_IF_RX_0_OFFS 0x1598 +#define QIB_7322_IB_SDTEST_IF_RX_0_DEF 0x0000000000000000 +#define QIB_7322_IB_SDTEST_IF_RX_0_TS_RX_RX_CFG_LSB 0x30 +#define QIB_7322_IB_SDTEST_IF_RX_0_TS_RX_RX_CFG_MSB 0x3F +#define QIB_7322_IB_SDTEST_IF_RX_0_TS_RX_RX_CFG_RMASK 0xFFFF +#define QIB_7322_IB_SDTEST_IF_RX_0_TS_RX_TX_CFG_LSB 0x20 +#define QIB_7322_IB_SDTEST_IF_RX_0_TS_RX_TX_CFG_MSB 0x2F +#define QIB_7322_IB_SDTEST_IF_RX_0_TS_RX_TX_CFG_RMASK 0xFFFF +#define QIB_7322_IB_SDTEST_IF_RX_0_TS_RX_B_LSB 0x18 +#define QIB_7322_IB_SDTEST_IF_RX_0_TS_RX_B_MSB 0x1F +#define QIB_7322_IB_SDTEST_IF_RX_0_TS_RX_B_RMASK 0xFF +#define QIB_7322_IB_SDTEST_IF_RX_0_TS_RX_A_LSB 0x10 +#define QIB_7322_IB_SDTEST_IF_RX_0_TS_RX_A_MSB 0x17 +#define QIB_7322_IB_SDTEST_IF_RX_0_TS_RX_A_RMASK 0xFF +#define QIB_7322_IB_SDTEST_IF_RX_0_TS_3_RX_VALID_LSB 0x1 +#define 
QIB_7322_IB_SDTEST_IF_RX_0_TS_3_RX_VALID_MSB 0x1 +#define QIB_7322_IB_SDTEST_IF_RX_0_TS_3_RX_VALID_RMASK 0x1 +#define QIB_7322_IB_SDTEST_IF_RX_0_TS_T_RX_VALID_LSB 0x0 +#define QIB_7322_IB_SDTEST_IF_RX_0_TS_T_RX_VALID_MSB 0x0 +#define QIB_7322_IB_SDTEST_IF_RX_0_TS_T_RX_VALID_RMASK 0x1 + +#define QIB_7322_IBNCModeCtrl_0_OFFS 0x15B8 +#define QIB_7322_IBNCModeCtrl_0_DEF 0x0000000000000000 +#define QIB_7322_IBNCModeCtrl_0_ScrambleCapRemoteForce_LSB 0x22 +#define QIB_7322_IBNCModeCtrl_0_ScrambleCapRemoteForce_MSB 0x22 +#define QIB_7322_IBNCModeCtrl_0_ScrambleCapRemoteForce_RMASK 0x1 +#define QIB_7322_IBNCModeCtrl_0_ScrambleCapRemoteMask_LSB 0x21 +#define QIB_7322_IBNCModeCtrl_0_ScrambleCapRemoteMask_MSB 0x21 +#define QIB_7322_IBNCModeCtrl_0_ScrambleCapRemoteMask_RMASK 0x1 +#define QIB_7322_IBNCModeCtrl_0_ScrambleCapLocal_LSB 0x20 +#define QIB_7322_IBNCModeCtrl_0_ScrambleCapLocal_MSB 0x20 +#define QIB_7322_IBNCModeCtrl_0_ScrambleCapLocal_RMASK 0x1 +#define QIB_7322_IBNCModeCtrl_0_TSMCode_TS2_LSB 0x11 +#define QIB_7322_IBNCModeCtrl_0_TSMCode_TS2_MSB 0x19 +#define QIB_7322_IBNCModeCtrl_0_TSMCode_TS2_RMASK 0x1FF +#define QIB_7322_IBNCModeCtrl_0_TSMCode_TS1_LSB 0x8 +#define QIB_7322_IBNCModeCtrl_0_TSMCode_TS1_MSB 0x10 +#define QIB_7322_IBNCModeCtrl_0_TSMCode_TS1_RMASK 0x1FF +#define QIB_7322_IBNCModeCtrl_0_TSMEnable_ignore_TSM_on_rx_LSB 0x2 +#define QIB_7322_IBNCModeCtrl_0_TSMEnable_ignore_TSM_on_rx_MSB 0x2 +#define QIB_7322_IBNCModeCtrl_0_TSMEnable_ignore_TSM_on_rx_RMASK 0x1 +#define QIB_7322_IBNCModeCtrl_0_TSMEnable_send_TS2_LSB 0x1 +#define QIB_7322_IBNCModeCtrl_0_TSMEnable_send_TS2_MSB 0x1 +#define QIB_7322_IBNCModeCtrl_0_TSMEnable_send_TS2_RMASK 0x1 +#define QIB_7322_IBNCModeCtrl_0_TSMEnable_send_TS1_LSB 0x0 +#define QIB_7322_IBNCModeCtrl_0_TSMEnable_send_TS1_MSB 0x0 +#define QIB_7322_IBNCModeCtrl_0_TSMEnable_send_TS1_RMASK 0x1 + +#define QIB_7322_IBSerdesStatus_0_OFFS 0x15D0 +#define QIB_7322_IBSerdesStatus_0_DEF 0x0000000000000000 + +#define QIB_7322_IBPCSConfig_0_OFFS 0x15D8 +#define QIB_7322_IBPCSConfig_0_DEF 0x0000000000000007 +#define QIB_7322_IBPCSConfig_0_link_sync_mask_LSB 0x9 +#define QIB_7322_IBPCSConfig_0_link_sync_mask_MSB 0x12 +#define QIB_7322_IBPCSConfig_0_link_sync_mask_RMASK 0x3FF +#define QIB_7322_IBPCSConfig_0_xcv_rreset_LSB 0x2 +#define QIB_7322_IBPCSConfig_0_xcv_rreset_MSB 0x2 +#define QIB_7322_IBPCSConfig_0_xcv_rreset_RMASK 0x1 +#define QIB_7322_IBPCSConfig_0_xcv_treset_LSB 0x1 +#define QIB_7322_IBPCSConfig_0_xcv_treset_MSB 0x1 +#define QIB_7322_IBPCSConfig_0_xcv_treset_RMASK 0x1 +#define QIB_7322_IBPCSConfig_0_tx_rx_reset_LSB 0x0 +#define QIB_7322_IBPCSConfig_0_tx_rx_reset_MSB 0x0 +#define QIB_7322_IBPCSConfig_0_tx_rx_reset_RMASK 0x1 + +#define QIB_7322_IBSerdesCtrl_0_OFFS 0x15E0 +#define QIB_7322_IBSerdesCtrl_0_DEF 0x0000000000FFA00F +#define QIB_7322_IBSerdesCtrl_0_DISABLE_RXLATOFF_QDR_LSB 0x1A +#define QIB_7322_IBSerdesCtrl_0_DISABLE_RXLATOFF_QDR_MSB 0x1A +#define QIB_7322_IBSerdesCtrl_0_DISABLE_RXLATOFF_QDR_RMASK 0x1 +#define QIB_7322_IBSerdesCtrl_0_DISABLE_RXLATOFF_DDR_LSB 0x19 +#define QIB_7322_IBSerdesCtrl_0_DISABLE_RXLATOFF_DDR_MSB 0x19 +#define QIB_7322_IBSerdesCtrl_0_DISABLE_RXLATOFF_DDR_RMASK 0x1 +#define QIB_7322_IBSerdesCtrl_0_DISABLE_RXLATOFF_SDR_LSB 0x18 +#define QIB_7322_IBSerdesCtrl_0_DISABLE_RXLATOFF_SDR_MSB 0x18 +#define QIB_7322_IBSerdesCtrl_0_DISABLE_RXLATOFF_SDR_RMASK 0x1 +#define QIB_7322_IBSerdesCtrl_0_CHANNEL_RESET_N_LSB 0x14 +#define QIB_7322_IBSerdesCtrl_0_CHANNEL_RESET_N_MSB 0x17 +#define QIB_7322_IBSerdesCtrl_0_CHANNEL_RESET_N_RMASK 0xF 
+#define QIB_7322_IBSerdesCtrl_0_CGMODE_LSB 0x10 +#define QIB_7322_IBSerdesCtrl_0_CGMODE_MSB 0x13 +#define QIB_7322_IBSerdesCtrl_0_CGMODE_RMASK 0xF +#define QIB_7322_IBSerdesCtrl_0_IB_LAT_MODE_LSB 0xF +#define QIB_7322_IBSerdesCtrl_0_IB_LAT_MODE_MSB 0xF +#define QIB_7322_IBSerdesCtrl_0_IB_LAT_MODE_RMASK 0x1 +#define QIB_7322_IBSerdesCtrl_0_RXLOSEN_LSB 0xD +#define QIB_7322_IBSerdesCtrl_0_RXLOSEN_MSB 0xD +#define QIB_7322_IBSerdesCtrl_0_RXLOSEN_RMASK 0x1 +#define QIB_7322_IBSerdesCtrl_0_LPEN_LSB 0xC +#define QIB_7322_IBSerdesCtrl_0_LPEN_MSB 0xC +#define QIB_7322_IBSerdesCtrl_0_LPEN_RMASK 0x1 +#define QIB_7322_IBSerdesCtrl_0_PLLPD_LSB 0xB +#define QIB_7322_IBSerdesCtrl_0_PLLPD_MSB 0xB +#define QIB_7322_IBSerdesCtrl_0_PLLPD_RMASK 0x1 +#define QIB_7322_IBSerdesCtrl_0_TXPD_LSB 0xA +#define QIB_7322_IBSerdesCtrl_0_TXPD_MSB 0xA +#define QIB_7322_IBSerdesCtrl_0_TXPD_RMASK 0x1 +#define QIB_7322_IBSerdesCtrl_0_RXPD_LSB 0x9 +#define QIB_7322_IBSerdesCtrl_0_RXPD_MSB 0x9 +#define QIB_7322_IBSerdesCtrl_0_RXPD_RMASK 0x1 +#define QIB_7322_IBSerdesCtrl_0_TXIDLE_LSB 0x8 +#define QIB_7322_IBSerdesCtrl_0_TXIDLE_MSB 0x8 +#define QIB_7322_IBSerdesCtrl_0_TXIDLE_RMASK 0x1 +#define QIB_7322_IBSerdesCtrl_0_CMODE_LSB 0x0 +#define QIB_7322_IBSerdesCtrl_0_CMODE_MSB 0x6 +#define QIB_7322_IBSerdesCtrl_0_CMODE_RMASK 0x7F + +#define QIB_7322_IBSD_TX_DEEMPHASIS_OVERRIDE_0_OFFS 0x1600 +#define QIB_7322_IBSD_TX_DEEMPHASIS_OVERRIDE_0_DEF 0x0000000000000000 +#define QIB_7322_IBSD_TX_DEEMPHASIS_OVERRIDE_0_tx_override_deemphasis_select_LSB 0x1F +#define QIB_7322_IBSD_TX_DEEMPHASIS_OVERRIDE_0_tx_override_deemphasis_select_MSB 0x1F +#define QIB_7322_IBSD_TX_DEEMPHASIS_OVERRIDE_0_tx_override_deemphasis_select_RMASK 0x1 +#define QIB_7322_IBSD_TX_DEEMPHASIS_OVERRIDE_0_reset_tx_deemphasis_override_LSB 0x1E +#define QIB_7322_IBSD_TX_DEEMPHASIS_OVERRIDE_0_reset_tx_deemphasis_override_MSB 0x1E +#define QIB_7322_IBSD_TX_DEEMPHASIS_OVERRIDE_0_reset_tx_deemphasis_override_RMASK 0x1 +#define QIB_7322_IBSD_TX_DEEMPHASIS_OVERRIDE_0_txampcntl_d2a_LSB 0xE +#define QIB_7322_IBSD_TX_DEEMPHASIS_OVERRIDE_0_txampcntl_d2a_MSB 0x11 +#define QIB_7322_IBSD_TX_DEEMPHASIS_OVERRIDE_0_txampcntl_d2a_RMASK 0xF +#define QIB_7322_IBSD_TX_DEEMPHASIS_OVERRIDE_0_txc0_ena_LSB 0x9 +#define QIB_7322_IBSD_TX_DEEMPHASIS_OVERRIDE_0_txc0_ena_MSB 0xD +#define QIB_7322_IBSD_TX_DEEMPHASIS_OVERRIDE_0_txc0_ena_RMASK 0x1F +#define QIB_7322_IBSD_TX_DEEMPHASIS_OVERRIDE_0_txcp1_ena_LSB 0x5 +#define QIB_7322_IBSD_TX_DEEMPHASIS_OVERRIDE_0_txcp1_ena_MSB 0x8 +#define QIB_7322_IBSD_TX_DEEMPHASIS_OVERRIDE_0_txcp1_ena_RMASK 0xF +#define QIB_7322_IBSD_TX_DEEMPHASIS_OVERRIDE_0_txcn1_xtra_emph0_LSB 0x3 +#define QIB_7322_IBSD_TX_DEEMPHASIS_OVERRIDE_0_txcn1_xtra_emph0_MSB 0x4 +#define QIB_7322_IBSD_TX_DEEMPHASIS_OVERRIDE_0_txcn1_xtra_emph0_RMASK 0x3 +#define QIB_7322_IBSD_TX_DEEMPHASIS_OVERRIDE_0_txcn1_ena_LSB 0x0 +#define QIB_7322_IBSD_TX_DEEMPHASIS_OVERRIDE_0_txcn1_ena_MSB 0x2 +#define QIB_7322_IBSD_TX_DEEMPHASIS_OVERRIDE_0_txcn1_ena_RMASK 0x7 + +#define QIB_7322_ADAPT_DISABLE_STATIC_SDR_0_OFFS 0x1640 +#define QIB_7322_ADAPT_DISABLE_STATIC_SDR_0_DEF 0x0000000000000000 +#define QIB_7322_ADAPT_DISABLE_STATIC_SDR_0_static_disable_rxenagain_sdr_ch3_LSB 0x27 +#define QIB_7322_ADAPT_DISABLE_STATIC_SDR_0_static_disable_rxenagain_sdr_ch3_MSB 0x27 +#define QIB_7322_ADAPT_DISABLE_STATIC_SDR_0_static_disable_rxenagain_sdr_ch3_RMASK 0x1 +#define QIB_7322_ADAPT_DISABLE_STATIC_SDR_0_static_disable_rxenagain_sdr_ch2_LSB 0x26 +#define QIB_7322_ADAPT_DISABLE_STATIC_SDR_0_static_disable_rxenagain_sdr_ch2_MSB 
0x26 +#define QIB_7322_ADAPT_DISABLE_STATIC_SDR_0_static_disable_rxenagain_sdr_ch2_RMASK 0x1 +#define QIB_7322_ADAPT_DISABLE_STATIC_SDR_0_static_disable_rxenagain_sdr_ch1_LSB 0x25 +#define QIB_7322_ADAPT_DISABLE_STATIC_SDR_0_static_disable_rxenagain_sdr_ch1_MSB 0x25 +#define QIB_7322_ADAPT_DISABLE_STATIC_SDR_0_static_disable_rxenagain_sdr_ch1_RMASK 0x1 +#define QIB_7322_ADAPT_DISABLE_STATIC_SDR_0_static_disable_rxenagain_sdr_ch0_LSB 0x24 +#define QIB_7322_ADAPT_DISABLE_STATIC_SDR_0_static_disable_rxenagain_sdr_ch0_MSB 0x24 +#define QIB_7322_ADAPT_DISABLE_STATIC_SDR_0_static_disable_rxenagain_sdr_ch0_RMASK 0x1 +#define QIB_7322_ADAPT_DISABLE_STATIC_SDR_0_static_disable_rxenale_sdr_ch3_LSB 0x23 +#define QIB_7322_ADAPT_DISABLE_STATIC_SDR_0_static_disable_rxenale_sdr_ch3_MSB 0x23 +#define QIB_7322_ADAPT_DISABLE_STATIC_SDR_0_static_disable_rxenale_sdr_ch3_RMASK 0x1 +#define QIB_7322_ADAPT_DISABLE_STATIC_SDR_0_static_disable_rxenale_sdr_ch2_LSB 0x22 +#define QIB_7322_ADAPT_DISABLE_STATIC_SDR_0_static_disable_rxenale_sdr_ch2_MSB 0x22 +#define QIB_7322_ADAPT_DISABLE_STATIC_SDR_0_static_disable_rxenale_sdr_ch2_RMASK 0x1 +#define QIB_7322_ADAPT_DISABLE_STATIC_SDR_0_static_disable_rxenale_sdr_ch1_LSB 0x21 +#define QIB_7322_ADAPT_DISABLE_STATIC_SDR_0_static_disable_rxenale_sdr_ch1_MSB 0x21 +#define QIB_7322_ADAPT_DISABLE_STATIC_SDR_0_static_disable_rxenale_sdr_ch1_RMASK 0x1 +#define QIB_7322_ADAPT_DISABLE_STATIC_SDR_0_static_disable_rxenale_sdr_ch0_LSB 0x20 +#define QIB_7322_ADAPT_DISABLE_STATIC_SDR_0_static_disable_rxenale_sdr_ch0_MSB 0x20 +#define QIB_7322_ADAPT_DISABLE_STATIC_SDR_0_static_disable_rxenale_sdr_ch0_RMASK 0x1 +#define QIB_7322_ADAPT_DISABLE_STATIC_SDR_0_static_disable_rxenadfe_sdr_ch3_LSB 0x18 +#define QIB_7322_ADAPT_DISABLE_STATIC_SDR_0_static_disable_rxenadfe_sdr_ch3_MSB 0x1F +#define QIB_7322_ADAPT_DISABLE_STATIC_SDR_0_static_disable_rxenadfe_sdr_ch3_RMASK 0xFF +#define QIB_7322_ADAPT_DISABLE_STATIC_SDR_0_static_disable_rxenadfe_sdr_ch2_LSB 0x10 +#define QIB_7322_ADAPT_DISABLE_STATIC_SDR_0_static_disable_rxenadfe_sdr_ch2_MSB 0x17 +#define QIB_7322_ADAPT_DISABLE_STATIC_SDR_0_static_disable_rxenadfe_sdr_ch2_RMASK 0xFF +#define QIB_7322_ADAPT_DISABLE_STATIC_SDR_0_static_disable_rxenadfe_sdr_ch1_LSB 0x8 +#define QIB_7322_ADAPT_DISABLE_STATIC_SDR_0_static_disable_rxenadfe_sdr_ch1_MSB 0xF +#define QIB_7322_ADAPT_DISABLE_STATIC_SDR_0_static_disable_rxenadfe_sdr_ch1_RMASK 0xFF +#define QIB_7322_ADAPT_DISABLE_STATIC_SDR_0_static_disable_rxenadfe_sdr_ch0_LSB 0x0 +#define QIB_7322_ADAPT_DISABLE_STATIC_SDR_0_static_disable_rxenadfe_sdr_ch0_MSB 0x7 +#define QIB_7322_ADAPT_DISABLE_STATIC_SDR_0_static_disable_rxenadfe_sdr_ch0_RMASK 0xFF + +#define QIB_7322_ADAPT_DISABLE_DYNAMIC_SDR_0_OFFS 0x1648 +#define QIB_7322_ADAPT_DISABLE_DYNAMIC_SDR_0_DEF 0x0000000000000000 +#define QIB_7322_ADAPT_DISABLE_DYNAMIC_SDR_0_dyn_disable_rxenagain_sdr_ch3_LSB 0x27 +#define QIB_7322_ADAPT_DISABLE_DYNAMIC_SDR_0_dyn_disable_rxenagain_sdr_ch3_MSB 0x27 +#define QIB_7322_ADAPT_DISABLE_DYNAMIC_SDR_0_dyn_disable_rxenagain_sdr_ch3_RMASK 0x1 +#define QIB_7322_ADAPT_DISABLE_DYNAMIC_SDR_0_dyn_disable_rxenagain_sdr_ch2_LSB 0x26 +#define QIB_7322_ADAPT_DISABLE_DYNAMIC_SDR_0_dyn_disable_rxenagain_sdr_ch2_MSB 0x26 +#define QIB_7322_ADAPT_DISABLE_DYNAMIC_SDR_0_dyn_disable_rxenagain_sdr_ch2_RMASK 0x1 +#define QIB_7322_ADAPT_DISABLE_DYNAMIC_SDR_0_dyn_disable_rxenagain_sdr_ch1_LSB 0x25 +#define QIB_7322_ADAPT_DISABLE_DYNAMIC_SDR_0_dyn_disable_rxenagain_sdr_ch1_MSB 0x25 +#define 
QIB_7322_ADAPT_DISABLE_DYNAMIC_SDR_0_dyn_disable_rxenagain_sdr_ch1_RMASK 0x1 +#define QIB_7322_ADAPT_DISABLE_DYNAMIC_SDR_0_dyn_disable_rxenagain_sdr_ch0_LSB 0x24 +#define QIB_7322_ADAPT_DISABLE_DYNAMIC_SDR_0_dyn_disable_rxenagain_sdr_ch0_MSB 0x24 +#define QIB_7322_ADAPT_DISABLE_DYNAMIC_SDR_0_dyn_disable_rxenagain_sdr_ch0_RMASK 0x1 +#define QIB_7322_ADAPT_DISABLE_DYNAMIC_SDR_0_dyn_disable_rxenale_sdr_ch3_LSB 0x23 +#define QIB_7322_ADAPT_DISABLE_DYNAMIC_SDR_0_dyn_disable_rxenale_sdr_ch3_MSB 0x23 +#define QIB_7322_ADAPT_DISABLE_DYNAMIC_SDR_0_dyn_disable_rxenale_sdr_ch3_RMASK 0x1 +#define QIB_7322_ADAPT_DISABLE_DYNAMIC_SDR_0_dyn_disable_rxenale_sdr_ch2_LSB 0x22 +#define QIB_7322_ADAPT_DISABLE_DYNAMIC_SDR_0_dyn_disable_rxenale_sdr_ch2_MSB 0x22 +#define QIB_7322_ADAPT_DISABLE_DYNAMIC_SDR_0_dyn_disable_rxenale_sdr_ch2_RMASK 0x1 +#define QIB_7322_ADAPT_DISABLE_DYNAMIC_SDR_0_dyn_disable_rxenale_sdr_ch1_LSB 0x21 +#define QIB_7322_ADAPT_DISABLE_DYNAMIC_SDR_0_dyn_disable_rxenale_sdr_ch1_MSB 0x21 +#define QIB_7322_ADAPT_DISABLE_DYNAMIC_SDR_0_dyn_disable_rxenale_sdr_ch1_RMASK 0x1 +#define QIB_7322_ADAPT_DISABLE_DYNAMIC_SDR_0_dyn_disable_rxenale_sdr_ch0_LSB 0x20 +#define QIB_7322_ADAPT_DISABLE_DYNAMIC_SDR_0_dyn_disable_rxenale_sdr_ch0_MSB 0x20 +#define QIB_7322_ADAPT_DISABLE_DYNAMIC_SDR_0_dyn_disable_rxenale_sdr_ch0_RMASK 0x1 +#define QIB_7322_ADAPT_DISABLE_DYNAMIC_SDR_0_dyn_disable_rxenadfe_sdr_ch3_LSB 0x18 +#define QIB_7322_ADAPT_DISABLE_DYNAMIC_SDR_0_dyn_disable_rxenadfe_sdr_ch3_MSB 0x1F +#define QIB_7322_ADAPT_DISABLE_DYNAMIC_SDR_0_dyn_disable_rxenadfe_sdr_ch3_RMASK 0xFF +#define QIB_7322_ADAPT_DISABLE_DYNAMIC_SDR_0_dyn_disable_rxenadfe_sdr_ch2_LSB 0x10 +#define QIB_7322_ADAPT_DISABLE_DYNAMIC_SDR_0_dyn_disable_rxenadfe_sdr_ch2_MSB 0x17 +#define QIB_7322_ADAPT_DISABLE_DYNAMIC_SDR_0_dyn_disable_rxenadfe_sdr_ch2_RMASK 0xFF +#define QIB_7322_ADAPT_DISABLE_DYNAMIC_SDR_0_dyn_disable_rxenadfe_sdr_ch1_LSB 0x8 +#define QIB_7322_ADAPT_DISABLE_DYNAMIC_SDR_0_dyn_disable_rxenadfe_sdr_ch1_MSB 0xF +#define QIB_7322_ADAPT_DISABLE_DYNAMIC_SDR_0_dyn_disable_rxenadfe_sdr_ch1_RMASK 0xFF +#define QIB_7322_ADAPT_DISABLE_DYNAMIC_SDR_0_dyn_disable_rxenadfe_sdr_ch0_LSB 0x0 +#define QIB_7322_ADAPT_DISABLE_DYNAMIC_SDR_0_dyn_disable_rxenadfe_sdr_ch0_MSB 0x7 +#define QIB_7322_ADAPT_DISABLE_DYNAMIC_SDR_0_dyn_disable_rxenadfe_sdr_ch0_RMASK 0xFF + +#define QIB_7322_ADAPT_DISABLE_STATIC_DDR_0_OFFS 0x1650 +#define QIB_7322_ADAPT_DISABLE_STATIC_DDR_0_DEF 0x0000000000000000 +#define QIB_7322_ADAPT_DISABLE_STATIC_DDR_0_static_disable_rxenagain_ddr_ch3_LSB 0x27 +#define QIB_7322_ADAPT_DISABLE_STATIC_DDR_0_static_disable_rxenagain_ddr_ch3_MSB 0x27 +#define QIB_7322_ADAPT_DISABLE_STATIC_DDR_0_static_disable_rxenagain_ddr_ch3_RMASK 0x1 +#define QIB_7322_ADAPT_DISABLE_STATIC_DDR_0_static_disable_rxenagain_ddr_ch2_LSB 0x26 +#define QIB_7322_ADAPT_DISABLE_STATIC_DDR_0_static_disable_rxenagain_ddr_ch2_MSB 0x26 +#define QIB_7322_ADAPT_DISABLE_STATIC_DDR_0_static_disable_rxenagain_ddr_ch2_RMASK 0x1 +#define QIB_7322_ADAPT_DISABLE_STATIC_DDR_0_static_disable_rxenagain_ddr_ch1_LSB 0x25 +#define QIB_7322_ADAPT_DISABLE_STATIC_DDR_0_static_disable_rxenagain_ddr_ch1_MSB 0x25 +#define QIB_7322_ADAPT_DISABLE_STATIC_DDR_0_static_disable_rxenagain_ddr_ch1_RMASK 0x1 +#define QIB_7322_ADAPT_DISABLE_STATIC_DDR_0_static_disable_rxenagain_ddr_ch0_LSB 0x24 +#define QIB_7322_ADAPT_DISABLE_STATIC_DDR_0_static_disable_rxenagain_ddr_ch0_MSB 0x24 +#define QIB_7322_ADAPT_DISABLE_STATIC_DDR_0_static_disable_rxenagain_ddr_ch0_RMASK 0x1 +#define 
QIB_7322_ADAPT_DISABLE_STATIC_DDR_0_static_disable_rxenale_ddr_ch3_LSB 0x23 +#define QIB_7322_ADAPT_DISABLE_STATIC_DDR_0_static_disable_rxenale_ddr_ch3_MSB 0x23 +#define QIB_7322_ADAPT_DISABLE_STATIC_DDR_0_static_disable_rxenale_ddr_ch3_RMASK 0x1 +#define QIB_7322_ADAPT_DISABLE_STATIC_DDR_0_static_disable_rxenale_ddr_ch2_LSB 0x22 +#define QIB_7322_ADAPT_DISABLE_STATIC_DDR_0_static_disable_rxenale_ddr_ch2_MSB 0x22 +#define QIB_7322_ADAPT_DISABLE_STATIC_DDR_0_static_disable_rxenale_ddr_ch2_RMASK 0x1 +#define QIB_7322_ADAPT_DISABLE_STATIC_DDR_0_static_disable_rxenale_ddr_ch1_LSB 0x21 +#define QIB_7322_ADAPT_DISABLE_STATIC_DDR_0_static_disable_rxenale_ddr_ch1_MSB 0x21 +#define QIB_7322_ADAPT_DISABLE_STATIC_DDR_0_static_disable_rxenale_ddr_ch1_RMASK 0x1 +#define QIB_7322_ADAPT_DISABLE_STATIC_DDR_0_static_disable_rxenale_ddr_ch0_LSB 0x20 +#define QIB_7322_ADAPT_DISABLE_STATIC_DDR_0_static_disable_rxenale_ddr_ch0_MSB 0x20 +#define QIB_7322_ADAPT_DISABLE_STATIC_DDR_0_static_disable_rxenale_ddr_ch0_RMASK 0x1 +#define QIB_7322_ADAPT_DISABLE_STATIC_DDR_0_static_disable_rxenadfe_ddr_ch3_LSB 0x18 +#define QIB_7322_ADAPT_DISABLE_STATIC_DDR_0_static_disable_rxenadfe_ddr_ch3_MSB 0x1F +#define QIB_7322_ADAPT_DISABLE_STATIC_DDR_0_static_disable_rxenadfe_ddr_ch3_RMASK 0xFF +#define QIB_7322_ADAPT_DISABLE_STATIC_DDR_0_static_disable_rxenadfe_ddr_ch2_LSB 0x10 +#define QIB_7322_ADAPT_DISABLE_STATIC_DDR_0_static_disable_rxenadfe_ddr_ch2_MSB 0x17 +#define QIB_7322_ADAPT_DISABLE_STATIC_DDR_0_static_disable_rxenadfe_ddr_ch2_RMASK 0xFF +#define QIB_7322_ADAPT_DISABLE_STATIC_DDR_0_static_disable_rxenadfe_ddr_ch1_LSB 0x8 +#define QIB_7322_ADAPT_DISABLE_STATIC_DDR_0_static_disable_rxenadfe_ddr_ch1_MSB 0xF +#define QIB_7322_ADAPT_DISABLE_STATIC_DDR_0_static_disable_rxenadfe_ddr_ch1_RMASK 0xFF +#define QIB_7322_ADAPT_DISABLE_STATIC_DDR_0_static_disable_rxenadfe_ddr_ch0_LSB 0x0 +#define QIB_7322_ADAPT_DISABLE_STATIC_DDR_0_static_disable_rxenadfe_ddr_ch0_MSB 0x7 +#define QIB_7322_ADAPT_DISABLE_STATIC_DDR_0_static_disable_rxenadfe_ddr_ch0_RMASK 0xFF + +#define QIB_7322_ADAPT_DISABLE_DYNAMIC_DDR_0_OFFS 0x1658 +#define QIB_7322_ADAPT_DISABLE_DYNAMIC_DDR_0_DEF 0x0000000000000000 +#define QIB_7322_ADAPT_DISABLE_DYNAMIC_DDR_0_dyn_disable_rxenagain_ddr_ch3_LSB 0x27 +#define QIB_7322_ADAPT_DISABLE_DYNAMIC_DDR_0_dyn_disable_rxenagain_ddr_ch3_MSB 0x27 +#define QIB_7322_ADAPT_DISABLE_DYNAMIC_DDR_0_dyn_disable_rxenagain_ddr_ch3_RMASK 0x1 +#define QIB_7322_ADAPT_DISABLE_DYNAMIC_DDR_0_dyn_disable_rxenagain_ddr_ch2_LSB 0x26 +#define QIB_7322_ADAPT_DISABLE_DYNAMIC_DDR_0_dyn_disable_rxenagain_ddr_ch2_MSB 0x26 +#define QIB_7322_ADAPT_DISABLE_DYNAMIC_DDR_0_dyn_disable_rxenagain_ddr_ch2_RMASK 0x1 +#define QIB_7322_ADAPT_DISABLE_DYNAMIC_DDR_0_dyn_disable_rxenagain_ddr_ch1_LSB 0x25 +#define QIB_7322_ADAPT_DISABLE_DYNAMIC_DDR_0_dyn_disable_rxenagain_ddr_ch1_MSB 0x25 +#define QIB_7322_ADAPT_DISABLE_DYNAMIC_DDR_0_dyn_disable_rxenagain_ddr_ch1_RMASK 0x1 +#define QIB_7322_ADAPT_DISABLE_DYNAMIC_DDR_0_dyn_disable_rxenagain_ddr_ch0_LSB 0x24 +#define QIB_7322_ADAPT_DISABLE_DYNAMIC_DDR_0_dyn_disable_rxenagain_ddr_ch0_MSB 0x24 +#define QIB_7322_ADAPT_DISABLE_DYNAMIC_DDR_0_dyn_disable_rxenagain_ddr_ch0_RMASK 0x1 +#define QIB_7322_ADAPT_DISABLE_DYNAMIC_DDR_0_dyn_disable_rxenale_ddr_ch3_LSB 0x23 +#define QIB_7322_ADAPT_DISABLE_DYNAMIC_DDR_0_dyn_disable_rxenale_ddr_ch3_MSB 0x23 +#define QIB_7322_ADAPT_DISABLE_DYNAMIC_DDR_0_dyn_disable_rxenale_ddr_ch3_RMASK 0x1 +#define QIB_7322_ADAPT_DISABLE_DYNAMIC_DDR_0_dyn_disable_rxenale_ddr_ch2_LSB 0x22 +#define 
QIB_7322_ADAPT_DISABLE_DYNAMIC_DDR_0_dyn_disable_rxenale_ddr_ch2_MSB 0x22 +#define QIB_7322_ADAPT_DISABLE_DYNAMIC_DDR_0_dyn_disable_rxenale_ddr_ch2_RMASK 0x1 +#define QIB_7322_ADAPT_DISABLE_DYNAMIC_DDR_0_dyn_disable_rxenale_ddr_ch1_LSB 0x21 +#define QIB_7322_ADAPT_DISABLE_DYNAMIC_DDR_0_dyn_disable_rxenale_ddr_ch1_MSB 0x21 +#define QIB_7322_ADAPT_DISABLE_DYNAMIC_DDR_0_dyn_disable_rxenale_ddr_ch1_RMASK 0x1 +#define QIB_7322_ADAPT_DISABLE_DYNAMIC_DDR_0_dyn_disable_rxenale_ddr_ch0_LSB 0x20 +#define QIB_7322_ADAPT_DISABLE_DYNAMIC_DDR_0_dyn_disable_rxenale_ddr_ch0_MSB 0x20 +#define QIB_7322_ADAPT_DISABLE_DYNAMIC_DDR_0_dyn_disable_rxenale_ddr_ch0_RMASK 0x1 +#define QIB_7322_ADAPT_DISABLE_DYNAMIC_DDR_0_dyn_disable_rxenadfe_ddr_ch3_LSB 0x18 +#define QIB_7322_ADAPT_DISABLE_DYNAMIC_DDR_0_dyn_disable_rxenadfe_ddr_ch3_MSB 0x1F +#define QIB_7322_ADAPT_DISABLE_DYNAMIC_DDR_0_dyn_disable_rxenadfe_ddr_ch3_RMASK 0xFF +#define QIB_7322_ADAPT_DISABLE_DYNAMIC_DDR_0_dyn_disable_rxenadfe_ddr_ch2_LSB 0x10 +#define QIB_7322_ADAPT_DISABLE_DYNAMIC_DDR_0_dyn_disable_rxenadfe_ddr_ch2_MSB 0x17 +#define QIB_7322_ADAPT_DISABLE_DYNAMIC_DDR_0_dyn_disable_rxenadfe_ddr_ch2_RMASK 0xFF +#define QIB_7322_ADAPT_DISABLE_DYNAMIC_DDR_0_dyn_disable_rxenadfe_ddr_ch1_LSB 0x8 +#define QIB_7322_ADAPT_DISABLE_DYNAMIC_DDR_0_dyn_disable_rxenadfe_ddr_ch1_MSB 0xF +#define QIB_7322_ADAPT_DISABLE_DYNAMIC_DDR_0_dyn_disable_rxenadfe_ddr_ch1_RMASK 0xFF +#define QIB_7322_ADAPT_DISABLE_DYNAMIC_DDR_0_dyn_disable_rxenadfe_ddr_ch0_LSB 0x0 +#define QIB_7322_ADAPT_DISABLE_DYNAMIC_DDR_0_dyn_disable_rxenadfe_ddr_ch0_MSB 0x7 +#define QIB_7322_ADAPT_DISABLE_DYNAMIC_DDR_0_dyn_disable_rxenadfe_ddr_ch0_RMASK 0xFF + +#define QIB_7322_ADAPT_DISABLE_STATIC_QDR_0_OFFS 0x1660 +#define QIB_7322_ADAPT_DISABLE_STATIC_QDR_0_DEF 0x0000000000000000 +#define QIB_7322_ADAPT_DISABLE_STATIC_QDR_0_static_disable_rxenagain_qdr_ch3_LSB 0x27 +#define QIB_7322_ADAPT_DISABLE_STATIC_QDR_0_static_disable_rxenagain_qdr_ch3_MSB 0x27 +#define QIB_7322_ADAPT_DISABLE_STATIC_QDR_0_static_disable_rxenagain_qdr_ch3_RMASK 0x1 +#define QIB_7322_ADAPT_DISABLE_STATIC_QDR_0_static_disable_rxenagain_qdr_ch2_LSB 0x26 +#define QIB_7322_ADAPT_DISABLE_STATIC_QDR_0_static_disable_rxenagain_qdr_ch2_MSB 0x26 +#define QIB_7322_ADAPT_DISABLE_STATIC_QDR_0_static_disable_rxenagain_qdr_ch2_RMASK 0x1 +#define QIB_7322_ADAPT_DISABLE_STATIC_QDR_0_static_disable_rxenagain_qdr_ch1_LSB 0x25 +#define QIB_7322_ADAPT_DISABLE_STATIC_QDR_0_static_disable_rxenagain_qdr_ch1_MSB 0x25 +#define QIB_7322_ADAPT_DISABLE_STATIC_QDR_0_static_disable_rxenagain_qdr_ch1_RMASK 0x1 +#define QIB_7322_ADAPT_DISABLE_STATIC_QDR_0_static_disable_rxenagain_qdr_ch0_LSB 0x24 +#define QIB_7322_ADAPT_DISABLE_STATIC_QDR_0_static_disable_rxenagain_qdr_ch0_MSB 0x24 +#define QIB_7322_ADAPT_DISABLE_STATIC_QDR_0_static_disable_rxenagain_qdr_ch0_RMASK 0x1 +#define QIB_7322_ADAPT_DISABLE_STATIC_QDR_0_static_disable_rxenale_qdr_ch3_LSB 0x23 +#define QIB_7322_ADAPT_DISABLE_STATIC_QDR_0_static_disable_rxenale_qdr_ch3_MSB 0x23 +#define QIB_7322_ADAPT_DISABLE_STATIC_QDR_0_static_disable_rxenale_qdr_ch3_RMASK 0x1 +#define QIB_7322_ADAPT_DISABLE_STATIC_QDR_0_static_disable_rxenale_qdr_ch2_LSB 0x22 +#define QIB_7322_ADAPT_DISABLE_STATIC_QDR_0_static_disable_rxenale_qdr_ch2_MSB 0x22 +#define QIB_7322_ADAPT_DISABLE_STATIC_QDR_0_static_disable_rxenale_qdr_ch2_RMASK 0x1 +#define QIB_7322_ADAPT_DISABLE_STATIC_QDR_0_static_disable_rxenale_qdr_ch1_LSB 0x21 +#define QIB_7322_ADAPT_DISABLE_STATIC_QDR_0_static_disable_rxenale_qdr_ch1_MSB 0x21 +#define 
QIB_7322_ADAPT_DISABLE_STATIC_QDR_0_static_disable_rxenale_qdr_ch1_RMASK 0x1 +#define QIB_7322_ADAPT_DISABLE_STATIC_QDR_0_static_disable_rxenale_qdr_ch0_LSB 0x20 +#define QIB_7322_ADAPT_DISABLE_STATIC_QDR_0_static_disable_rxenale_qdr_ch0_MSB 0x20 +#define QIB_7322_ADAPT_DISABLE_STATIC_QDR_0_static_disable_rxenale_qdr_ch0_RMASK 0x1 +#define QIB_7322_ADAPT_DISABLE_STATIC_QDR_0_static_disable_rxenadfe_qdr_ch3_LSB 0x18 +#define QIB_7322_ADAPT_DISABLE_STATIC_QDR_0_static_disable_rxenadfe_qdr_ch3_MSB 0x1F +#define QIB_7322_ADAPT_DISABLE_STATIC_QDR_0_static_disable_rxenadfe_qdr_ch3_RMASK 0xFF +#define QIB_7322_ADAPT_DISABLE_STATIC_QDR_0_static_disable_rxenadfe_qdr_ch2_LSB 0x10 +#define QIB_7322_ADAPT_DISABLE_STATIC_QDR_0_static_disable_rxenadfe_qdr_ch2_MSB 0x17 +#define QIB_7322_ADAPT_DISABLE_STATIC_QDR_0_static_disable_rxenadfe_qdr_ch2_RMASK 0xFF +#define QIB_7322_ADAPT_DISABLE_STATIC_QDR_0_static_disable_rxenadfe_qdr_ch1_LSB 0x8 +#define QIB_7322_ADAPT_DISABLE_STATIC_QDR_0_static_disable_rxenadfe_qdr_ch1_MSB 0xF +#define QIB_7322_ADAPT_DISABLE_STATIC_QDR_0_static_disable_rxenadfe_qdr_ch1_RMASK 0xFF +#define QIB_7322_ADAPT_DISABLE_STATIC_QDR_0_static_disable_rxenadfe_qdr_ch0_LSB 0x0 +#define QIB_7322_ADAPT_DISABLE_STATIC_QDR_0_static_disable_rxenadfe_qdr_ch0_MSB 0x7 +#define QIB_7322_ADAPT_DISABLE_STATIC_QDR_0_static_disable_rxenadfe_qdr_ch0_RMASK 0xFF + +#define QIB_7322_ADAPT_DISABLE_DYNAMIC_QDR_0_OFFS 0x1668 +#define QIB_7322_ADAPT_DISABLE_DYNAMIC_QDR_0_DEF 0x0000000000000000 +#define QIB_7322_ADAPT_DISABLE_DYNAMIC_QDR_0_dyn_disable_rxenagain_qdr_ch3_LSB 0x27 +#define QIB_7322_ADAPT_DISABLE_DYNAMIC_QDR_0_dyn_disable_rxenagain_qdr_ch3_MSB 0x27 +#define QIB_7322_ADAPT_DISABLE_DYNAMIC_QDR_0_dyn_disable_rxenagain_qdr_ch3_RMASK 0x1 +#define QIB_7322_ADAPT_DISABLE_DYNAMIC_QDR_0_dyn_disable_rxenagain_qdr_ch2_LSB 0x26 +#define QIB_7322_ADAPT_DISABLE_DYNAMIC_QDR_0_dyn_disable_rxenagain_qdr_ch2_MSB 0x26 +#define QIB_7322_ADAPT_DISABLE_DYNAMIC_QDR_0_dyn_disable_rxenagain_qdr_ch2_RMASK 0x1 +#define QIB_7322_ADAPT_DISABLE_DYNAMIC_QDR_0_dyn_disable_rxenagain_qdr_ch1_LSB 0x25 +#define QIB_7322_ADAPT_DISABLE_DYNAMIC_QDR_0_dyn_disable_rxenagain_qdr_ch1_MSB 0x25 +#define QIB_7322_ADAPT_DISABLE_DYNAMIC_QDR_0_dyn_disable_rxenagain_qdr_ch1_RMASK 0x1 +#define QIB_7322_ADAPT_DISABLE_DYNAMIC_QDR_0_dyn_disable_rxenagain_qdr_ch0_LSB 0x24 +#define QIB_7322_ADAPT_DISABLE_DYNAMIC_QDR_0_dyn_disable_rxenagain_qdr_ch0_MSB 0x24 +#define QIB_7322_ADAPT_DISABLE_DYNAMIC_QDR_0_dyn_disable_rxenagain_qdr_ch0_RMASK 0x1 +#define QIB_7322_ADAPT_DISABLE_DYNAMIC_QDR_0_dyn_disable_rxenale_qdr_ch3_LSB 0x23 +#define QIB_7322_ADAPT_DISABLE_DYNAMIC_QDR_0_dyn_disable_rxenale_qdr_ch3_MSB 0x23 +#define QIB_7322_ADAPT_DISABLE_DYNAMIC_QDR_0_dyn_disable_rxenale_qdr_ch3_RMASK 0x1 +#define QIB_7322_ADAPT_DISABLE_DYNAMIC_QDR_0_dyn_disable_rxenale_qdr_ch2_LSB 0x22 +#define QIB_7322_ADAPT_DISABLE_DYNAMIC_QDR_0_dyn_disable_rxenale_qdr_ch2_MSB 0x22 +#define QIB_7322_ADAPT_DISABLE_DYNAMIC_QDR_0_dyn_disable_rxenale_qdr_ch2_RMASK 0x1 +#define QIB_7322_ADAPT_DISABLE_DYNAMIC_QDR_0_dyn_disable_rxenale_qdr_ch1_LSB 0x21 +#define QIB_7322_ADAPT_DISABLE_DYNAMIC_QDR_0_dyn_disable_rxenale_qdr_ch1_MSB 0x21 +#define QIB_7322_ADAPT_DISABLE_DYNAMIC_QDR_0_dyn_disable_rxenale_qdr_ch1_RMASK 0x1 +#define QIB_7322_ADAPT_DISABLE_DYNAMIC_QDR_0_dyn_disable_rxenale_qdr_ch0_LSB 0x20 +#define QIB_7322_ADAPT_DISABLE_DYNAMIC_QDR_0_dyn_disable_rxenale_qdr_ch0_MSB 0x20 +#define QIB_7322_ADAPT_DISABLE_DYNAMIC_QDR_0_dyn_disable_rxenale_qdr_ch0_RMASK 0x1 +#define 
QIB_7322_ADAPT_DISABLE_DYNAMIC_QDR_0_dyn_disable_rxenadfe_qdr_ch3_LSB 0x18 +#define QIB_7322_ADAPT_DISABLE_DYNAMIC_QDR_0_dyn_disable_rxenadfe_qdr_ch3_MSB 0x1F +#define QIB_7322_ADAPT_DISABLE_DYNAMIC_QDR_0_dyn_disable_rxenadfe_qdr_ch3_RMASK 0xFF +#define QIB_7322_ADAPT_DISABLE_DYNAMIC_QDR_0_dyn_disable_rxenadfe_qdr_ch2_LSB 0x10 +#define QIB_7322_ADAPT_DISABLE_DYNAMIC_QDR_0_dyn_disable_rxenadfe_qdr_ch2_MSB 0x17 +#define QIB_7322_ADAPT_DISABLE_DYNAMIC_QDR_0_dyn_disable_rxenadfe_qdr_ch2_RMASK 0xFF +#define QIB_7322_ADAPT_DISABLE_DYNAMIC_QDR_0_dyn_disable_rxenadfe_qdr_ch1_LSB 0x8 +#define QIB_7322_ADAPT_DISABLE_DYNAMIC_QDR_0_dyn_disable_rxenadfe_qdr_ch1_MSB 0xF +#define QIB_7322_ADAPT_DISABLE_DYNAMIC_QDR_0_dyn_disable_rxenadfe_qdr_ch1_RMASK 0xFF +#define QIB_7322_ADAPT_DISABLE_DYNAMIC_QDR_0_dyn_disable_rxenadfe_qdr_ch0_LSB 0x0 +#define QIB_7322_ADAPT_DISABLE_DYNAMIC_QDR_0_dyn_disable_rxenadfe_qdr_ch0_MSB 0x7 +#define QIB_7322_ADAPT_DISABLE_DYNAMIC_QDR_0_dyn_disable_rxenadfe_qdr_ch0_RMASK 0xFF + +#define QIB_7322_ADAPT_DISABLE_TIMER_THRESHOLD_0_OFFS 0x1670 +#define QIB_7322_ADAPT_DISABLE_TIMER_THRESHOLD_0_DEF 0x0000000000000000 + +#define QIB_7322_HighPriorityLimit_0_OFFS 0x1BC0 +#define QIB_7322_HighPriorityLimit_0_DEF 0x0000000000000000 +#define QIB_7322_HighPriorityLimit_0_Limit_LSB 0x0 +#define QIB_7322_HighPriorityLimit_0_Limit_MSB 0x7 +#define QIB_7322_HighPriorityLimit_0_Limit_RMASK 0xFF + +#define QIB_7322_LowPriority0_0_OFFS 0x1C00 +#define QIB_7322_LowPriority0_0_DEF 0x0000000000000000 +#define QIB_7322_LowPriority0_0_VirtualLane_LSB 0x10 +#define QIB_7322_LowPriority0_0_VirtualLane_MSB 0x12 +#define QIB_7322_LowPriority0_0_VirtualLane_RMASK 0x7 +#define QIB_7322_LowPriority0_0_Weight_LSB 0x0 +#define QIB_7322_LowPriority0_0_Weight_MSB 0x7 +#define QIB_7322_LowPriority0_0_Weight_RMASK 0xFF + +#define QIB_7322_HighPriority0_0_OFFS 0x1E00 +#define QIB_7322_HighPriority0_0_DEF 0x0000000000000000 +#define QIB_7322_HighPriority0_0_VirtualLane_LSB 0x10 +#define QIB_7322_HighPriority0_0_VirtualLane_MSB 0x12 +#define QIB_7322_HighPriority0_0_VirtualLane_RMASK 0x7 +#define QIB_7322_HighPriority0_0_Weight_LSB 0x0 +#define QIB_7322_HighPriority0_0_Weight_MSB 0x7 +#define QIB_7322_HighPriority0_0_Weight_RMASK 0xFF + +#define QIB_7322_CntrRegBase_1_OFFS 0x2028 +#define QIB_7322_CntrRegBase_1_DEF 0x0000000000013000 + +#define QIB_7322_RcvQPMulticastContext_1_OFFS 0x2170 + +#define QIB_7322_SendCtrl_1_OFFS 0x21C0 + +#define QIB_7322_SendBufAvail0_OFFS 0x3000 +#define QIB_7322_SendBufAvail0_DEF 0x0000000000000000 +#define QIB_7322_SendBufAvail0_SendBuf_31_0_LSB 0x0 +#define QIB_7322_SendBufAvail0_SendBuf_31_0_MSB 0x3F +#define QIB_7322_SendBufAvail0_SendBuf_31_0_RMASK 0x0 + +#define QIB_7322_MsixTable_OFFS 0x8000 +#define QIB_7322_MsixTable_DEF 0x0000000000000000 + +#define QIB_7322_MsixPba_OFFS 0x9000 +#define QIB_7322_MsixPba_DEF 0x0000000000000000 + +#define QIB_7322_LAMemory_OFFS 0xA000 +#define QIB_7322_LAMemory_DEF 0x0000000000000000 + +#define QIB_7322_LBIntCnt_OFFS 0x11000 +#define QIB_7322_LBIntCnt_DEF 0x0000000000000000 + +#define QIB_7322_LBFlowStallCnt_OFFS 0x11008 +#define QIB_7322_LBFlowStallCnt_DEF 0x0000000000000000 + +#define QIB_7322_RxTIDFullErrCnt_OFFS 0x110D0 +#define QIB_7322_RxTIDFullErrCnt_DEF 0x0000000000000000 + +#define QIB_7322_RxTIDValidErrCnt_OFFS 0x110D8 +#define QIB_7322_RxTIDValidErrCnt_DEF 0x0000000000000000 + +#define QIB_7322_RxP0HdrEgrOvflCnt_OFFS 0x110E8 +#define QIB_7322_RxP0HdrEgrOvflCnt_DEF 0x0000000000000000 + +#define QIB_7322_PcieRetryBufDiagQwordCnt_OFFS 
0x111A0 +#define QIB_7322_PcieRetryBufDiagQwordCnt_DEF 0x0000000000000000 + +#define QIB_7322_RxTidFlowDropCnt_OFFS 0x111E0 +#define QIB_7322_RxTidFlowDropCnt_DEF 0x0000000000000000 + +#define QIB_7322_LBIntCnt_0_OFFS 0x12000 +#define QIB_7322_LBIntCnt_0_DEF 0x0000000000000000 + +#define QIB_7322_TxCreditUpToDateTimeOut_0_OFFS 0x12008 +#define QIB_7322_TxCreditUpToDateTimeOut_0_DEF 0x0000000000000000 + +#define QIB_7322_TxSDmaDescCnt_0_OFFS 0x12010 +#define QIB_7322_TxSDmaDescCnt_0_DEF 0x0000000000000000 + +#define QIB_7322_TxUnsupVLErrCnt_0_OFFS 0x12018 +#define QIB_7322_TxUnsupVLErrCnt_0_DEF 0x0000000000000000 + +#define QIB_7322_TxDataPktCnt_0_OFFS 0x12020 +#define QIB_7322_TxDataPktCnt_0_DEF 0x0000000000000000 + +#define QIB_7322_TxFlowPktCnt_0_OFFS 0x12028 +#define QIB_7322_TxFlowPktCnt_0_DEF 0x0000000000000000 + +#define QIB_7322_TxDwordCnt_0_OFFS 0x12030 +#define QIB_7322_TxDwordCnt_0_DEF 0x0000000000000000 + +#define QIB_7322_TxLenErrCnt_0_OFFS 0x12038 +#define QIB_7322_TxLenErrCnt_0_DEF 0x0000000000000000 + +#define QIB_7322_TxMaxMinLenErrCnt_0_OFFS 0x12040 +#define QIB_7322_TxMaxMinLenErrCnt_0_DEF 0x0000000000000000 + +#define QIB_7322_TxUnderrunCnt_0_OFFS 0x12048 +#define QIB_7322_TxUnderrunCnt_0_DEF 0x0000000000000000 + +#define QIB_7322_TxFlowStallCnt_0_OFFS 0x12050 +#define QIB_7322_TxFlowStallCnt_0_DEF 0x0000000000000000 + +#define QIB_7322_TxDroppedPktCnt_0_OFFS 0x12058 +#define QIB_7322_TxDroppedPktCnt_0_DEF 0x0000000000000000 + +#define QIB_7322_RxDroppedPktCnt_0_OFFS 0x12060 +#define QIB_7322_RxDroppedPktCnt_0_DEF 0x0000000000000000 + +#define QIB_7322_RxDataPktCnt_0_OFFS 0x12068 +#define QIB_7322_RxDataPktCnt_0_DEF 0x0000000000000000 + +#define QIB_7322_RxFlowPktCnt_0_OFFS 0x12070 +#define QIB_7322_RxFlowPktCnt_0_DEF 0x0000000000000000 + +#define QIB_7322_RxDwordCnt_0_OFFS 0x12078 +#define QIB_7322_RxDwordCnt_0_DEF 0x0000000000000000 + +#define QIB_7322_RxLenErrCnt_0_OFFS 0x12080 +#define QIB_7322_RxLenErrCnt_0_DEF 0x0000000000000000 + +#define QIB_7322_RxMaxMinLenErrCnt_0_OFFS 0x12088 +#define QIB_7322_RxMaxMinLenErrCnt_0_DEF 0x0000000000000000 + +#define QIB_7322_RxICRCErrCnt_0_OFFS 0x12090 +#define QIB_7322_RxICRCErrCnt_0_DEF 0x0000000000000000 + +#define QIB_7322_RxVCRCErrCnt_0_OFFS 0x12098 +#define QIB_7322_RxVCRCErrCnt_0_DEF 0x0000000000000000 + +#define QIB_7322_RxFlowCtrlViolCnt_0_OFFS 0x120A0 +#define QIB_7322_RxFlowCtrlViolCnt_0_DEF 0x0000000000000000 + +#define QIB_7322_RxVersionErrCnt_0_OFFS 0x120A8 +#define QIB_7322_RxVersionErrCnt_0_DEF 0x0000000000000000 + +#define QIB_7322_RxLinkMalformCnt_0_OFFS 0x120B0 +#define QIB_7322_RxLinkMalformCnt_0_DEF 0x0000000000000000 + +#define QIB_7322_RxEBPCnt_0_OFFS 0x120B8 +#define QIB_7322_RxEBPCnt_0_DEF 0x0000000000000000 + +#define QIB_7322_RxLPCRCErrCnt_0_OFFS 0x120C0 +#define QIB_7322_RxLPCRCErrCnt_0_DEF 0x0000000000000000 + +#define QIB_7322_RxBufOvflCnt_0_OFFS 0x120C8 +#define QIB_7322_RxBufOvflCnt_0_DEF 0x0000000000000000 + +#define QIB_7322_RxLenTruncateCnt_0_OFFS 0x120D0 +#define QIB_7322_RxLenTruncateCnt_0_DEF 0x0000000000000000 + +#define QIB_7322_RxPKeyMismatchCnt_0_OFFS 0x120E0 +#define QIB_7322_RxPKeyMismatchCnt_0_DEF 0x0000000000000000 + +#define QIB_7322_IBLinkDownedCnt_0_OFFS 0x12180 +#define QIB_7322_IBLinkDownedCnt_0_DEF 0x0000000000000000 + +#define QIB_7322_IBSymbolErrCnt_0_OFFS 0x12188 +#define QIB_7322_IBSymbolErrCnt_0_DEF 0x0000000000000000 + +#define QIB_7322_IBStatusChangeCnt_0_OFFS 0x12190 +#define QIB_7322_IBStatusChangeCnt_0_DEF 0x0000000000000000 + +#define 
QIB_7322_IBLinkErrRecoveryCnt_0_OFFS 0x12198 +#define QIB_7322_IBLinkErrRecoveryCnt_0_DEF 0x0000000000000000 + +#define QIB_7322_ExcessBufferOvflCnt_0_OFFS 0x121A8 +#define QIB_7322_ExcessBufferOvflCnt_0_DEF 0x0000000000000000 + +#define QIB_7322_LocalLinkIntegrityErrCnt_0_OFFS 0x121B0 +#define QIB_7322_LocalLinkIntegrityErrCnt_0_DEF 0x0000000000000000 + +#define QIB_7322_RxVlErrCnt_0_OFFS 0x121B8 +#define QIB_7322_RxVlErrCnt_0_DEF 0x0000000000000000 + +#define QIB_7322_RxDlidFltrCnt_0_OFFS 0x121C0 +#define QIB_7322_RxDlidFltrCnt_0_DEF 0x0000000000000000 + +#define QIB_7322_RxVL15DroppedPktCnt_0_OFFS 0x121C8 +#define QIB_7322_RxVL15DroppedPktCnt_0_DEF 0x0000000000000000 + +#define QIB_7322_RxOtherLocalPhyErrCnt_0_OFFS 0x121D0 +#define QIB_7322_RxOtherLocalPhyErrCnt_0_DEF 0x0000000000000000 + +#define QIB_7322_RxQPInvalidContextCnt_0_OFFS 0x121D8 +#define QIB_7322_RxQPInvalidContextCnt_0_DEF 0x0000000000000000 + +#define QIB_7322_TxHeadersErrCnt_0_OFFS 0x121F8 +#define QIB_7322_TxHeadersErrCnt_0_DEF 0x0000000000000000 + +#define QIB_7322_PSRcvDataCount_0_OFFS 0x12218 +#define QIB_7322_PSRcvDataCount_0_DEF 0x0000000000000000 + +#define QIB_7322_PSRcvPktsCount_0_OFFS 0x12220 +#define QIB_7322_PSRcvPktsCount_0_DEF 0x0000000000000000 + +#define QIB_7322_PSXmitDataCount_0_OFFS 0x12228 +#define QIB_7322_PSXmitDataCount_0_DEF 0x0000000000000000 + +#define QIB_7322_PSXmitPktsCount_0_OFFS 0x12230 +#define QIB_7322_PSXmitPktsCount_0_DEF 0x0000000000000000 + +#define QIB_7322_PSXmitWaitCount_0_OFFS 0x12238 +#define QIB_7322_PSXmitWaitCount_0_DEF 0x0000000000000000 + +#define QIB_7322_LBIntCnt_1_OFFS 0x13000 +#define QIB_7322_LBIntCnt_1_DEF 0x0000000000000000 + +#define QIB_7322_TxCreditUpToDateTimeOut_1_OFFS 0x13008 +#define QIB_7322_TxCreditUpToDateTimeOut_1_DEF 0x0000000000000000 + +#define QIB_7322_TxSDmaDescCnt_1_OFFS 0x13010 +#define QIB_7322_TxSDmaDescCnt_1_DEF 0x0000000000000000 + +#define QIB_7322_TxUnsupVLErrCnt_1_OFFS 0x13018 +#define QIB_7322_TxUnsupVLErrCnt_1_DEF 0x0000000000000000 + +#define QIB_7322_TxDataPktCnt_1_OFFS 0x13020 +#define QIB_7322_TxDataPktCnt_1_DEF 0x0000000000000000 + +#define QIB_7322_TxFlowPktCnt_1_OFFS 0x13028 +#define QIB_7322_TxFlowPktCnt_1_DEF 0x0000000000000000 + +#define QIB_7322_TxDwordCnt_1_OFFS 0x13030 +#define QIB_7322_TxDwordCnt_1_DEF 0x0000000000000000 + +#define QIB_7322_TxLenErrCnt_1_OFFS 0x13038 +#define QIB_7322_TxLenErrCnt_1_DEF 0x0000000000000000 + +#define QIB_7322_TxMaxMinLenErrCnt_1_OFFS 0x13040 +#define QIB_7322_TxMaxMinLenErrCnt_1_DEF 0x0000000000000000 + +#define QIB_7322_TxUnderrunCnt_1_OFFS 0x13048 +#define QIB_7322_TxUnderrunCnt_1_DEF 0x0000000000000000 + +#define QIB_7322_TxFlowStallCnt_1_OFFS 0x13050 +#define QIB_7322_TxFlowStallCnt_1_DEF 0x0000000000000000 + +#define QIB_7322_TxDroppedPktCnt_1_OFFS 0x13058 +#define QIB_7322_TxDroppedPktCnt_1_DEF 0x0000000000000000 + +#define QIB_7322_RxDroppedPktCnt_1_OFFS 0x13060 +#define QIB_7322_RxDroppedPktCnt_1_DEF 0x0000000000000000 + +#define QIB_7322_RxDataPktCnt_1_OFFS 0x13068 +#define QIB_7322_RxDataPktCnt_1_DEF 0x0000000000000000 + +#define QIB_7322_RxFlowPktCnt_1_OFFS 0x13070 +#define QIB_7322_RxFlowPktCnt_1_DEF 0x0000000000000000 + +#define QIB_7322_RxDwordCnt_1_OFFS 0x13078 +#define QIB_7322_RxDwordCnt_1_DEF 0x0000000000000000 + +#define QIB_7322_RxLenErrCnt_1_OFFS 0x13080 +#define QIB_7322_RxLenErrCnt_1_DEF 0x0000000000000000 + +#define QIB_7322_RxMaxMinLenErrCnt_1_OFFS 0x13088 +#define QIB_7322_RxMaxMinLenErrCnt_1_DEF 0x0000000000000000 + +#define QIB_7322_RxICRCErrCnt_1_OFFS 0x13090 
+#define QIB_7322_RxICRCErrCnt_1_DEF 0x0000000000000000 + +#define QIB_7322_RxVCRCErrCnt_1_OFFS 0x13098 +#define QIB_7322_RxVCRCErrCnt_1_DEF 0x0000000000000000 + +#define QIB_7322_RxFlowCtrlViolCnt_1_OFFS 0x130A0 +#define QIB_7322_RxFlowCtrlViolCnt_1_DEF 0x0000000000000000 + +#define QIB_7322_RxVersionErrCnt_1_OFFS 0x130A8 +#define QIB_7322_RxVersionErrCnt_1_DEF 0x0000000000000000 + +#define QIB_7322_RxLinkMalformCnt_1_OFFS 0x130B0 +#define QIB_7322_RxLinkMalformCnt_1_DEF 0x0000000000000000 + +#define QIB_7322_RxEBPCnt_1_OFFS 0x130B8 +#define QIB_7322_RxEBPCnt_1_DEF 0x0000000000000000 + +#define QIB_7322_RxLPCRCErrCnt_1_OFFS 0x130C0 +#define QIB_7322_RxLPCRCErrCnt_1_DEF 0x0000000000000000 + +#define QIB_7322_RxBufOvflCnt_1_OFFS 0x130C8 +#define QIB_7322_RxBufOvflCnt_1_DEF 0x0000000000000000 + +#define QIB_7322_RxLenTruncateCnt_1_OFFS 0x130D0 +#define QIB_7322_RxLenTruncateCnt_1_DEF 0x0000000000000000 + +#define QIB_7322_RxPKeyMismatchCnt_1_OFFS 0x130E0 +#define QIB_7322_RxPKeyMismatchCnt_1_DEF 0x0000000000000000 + +#define QIB_7322_IBLinkDownedCnt_1_OFFS 0x13180 +#define QIB_7322_IBLinkDownedCnt_1_DEF 0x0000000000000000 + +#define QIB_7322_IBSymbolErrCnt_1_OFFS 0x13188 +#define QIB_7322_IBSymbolErrCnt_1_DEF 0x0000000000000000 + +#define QIB_7322_IBStatusChangeCnt_1_OFFS 0x13190 +#define QIB_7322_IBStatusChangeCnt_1_DEF 0x0000000000000000 + +#define QIB_7322_IBLinkErrRecoveryCnt_1_OFFS 0x13198 +#define QIB_7322_IBLinkErrRecoveryCnt_1_DEF 0x0000000000000000 + +#define QIB_7322_ExcessBufferOvflCnt_1_OFFS 0x131A8 +#define QIB_7322_ExcessBufferOvflCnt_1_DEF 0x0000000000000000 + +#define QIB_7322_LocalLinkIntegrityErrCnt_1_OFFS 0x131B0 +#define QIB_7322_LocalLinkIntegrityErrCnt_1_DEF 0x0000000000000000 + +#define QIB_7322_RxVlErrCnt_1_OFFS 0x131B8 +#define QIB_7322_RxVlErrCnt_1_DEF 0x0000000000000000 + +#define QIB_7322_RxDlidFltrCnt_1_OFFS 0x131C0 +#define QIB_7322_RxDlidFltrCnt_1_DEF 0x0000000000000000 + +#define QIB_7322_RxVL15DroppedPktCnt_1_OFFS 0x131C8 +#define QIB_7322_RxVL15DroppedPktCnt_1_DEF 0x0000000000000000 + +#define QIB_7322_RxOtherLocalPhyErrCnt_1_OFFS 0x131D0 +#define QIB_7322_RxOtherLocalPhyErrCnt_1_DEF 0x0000000000000000 + +#define QIB_7322_RxQPInvalidContextCnt_1_OFFS 0x131D8 +#define QIB_7322_RxQPInvalidContextCnt_1_DEF 0x0000000000000000 + +#define QIB_7322_TxHeadersErrCnt_1_OFFS 0x131F8 +#define QIB_7322_TxHeadersErrCnt_1_DEF 0x0000000000000000 + +#define QIB_7322_PSRcvDataCount_1_OFFS 0x13218 +#define QIB_7322_PSRcvDataCount_1_DEF 0x0000000000000000 + +#define QIB_7322_PSRcvPktsCount_1_OFFS 0x13220 +#define QIB_7322_PSRcvPktsCount_1_DEF 0x0000000000000000 + +#define QIB_7322_PSXmitDataCount_1_OFFS 0x13228 +#define QIB_7322_PSXmitDataCount_1_DEF 0x0000000000000000 + +#define QIB_7322_PSXmitPktsCount_1_OFFS 0x13230 +#define QIB_7322_PSXmitPktsCount_1_DEF 0x0000000000000000 + +#define QIB_7322_PSXmitWaitCount_1_OFFS 0x13238 +#define QIB_7322_PSXmitWaitCount_1_DEF 0x0000000000000000 + +#define QIB_7322_RcvEgrArray_OFFS 0x14000 +#define QIB_7322_RcvEgrArray_DEF 0x0000000000000000 +#define QIB_7322_RcvEgrArray_RT_BufSize_LSB 0x25 +#define QIB_7322_RcvEgrArray_RT_BufSize_MSB 0x27 +#define QIB_7322_RcvEgrArray_RT_BufSize_RMASK 0x7 +#define QIB_7322_RcvEgrArray_RT_Addr_LSB 0x0 +#define QIB_7322_RcvEgrArray_RT_Addr_MSB 0x24 +#define QIB_7322_RcvEgrArray_RT_Addr_RMASK 0x1FFFFFFFFF + +#define QIB_7322_RcvTIDArray0_OFFS 0x50000 +#define QIB_7322_RcvTIDArray0_DEF 0x0000000000000000 +#define QIB_7322_RcvTIDArray0_RT_BufSize_LSB 0x25 +#define QIB_7322_RcvTIDArray0_RT_BufSize_MSB 0x27 
+#define QIB_7322_RcvTIDArray0_RT_BufSize_RMASK 0x7 +#define QIB_7322_RcvTIDArray0_RT_Addr_LSB 0x0 +#define QIB_7322_RcvTIDArray0_RT_Addr_MSB 0x24 +#define QIB_7322_RcvTIDArray0_RT_Addr_RMASK 0x1FFFFFFFFF + +#define QIB_7322_IBSD_DDS_MAP_TABLE_0_OFFS 0xD0000 +#define QIB_7322_IBSD_DDS_MAP_TABLE_0_DEF 0x0000000000000000 + +#define QIB_7322_RcvHdrTail0_OFFS 0x200000 +#define QIB_7322_RcvHdrTail0_DEF 0x0000000000000000 + +#define QIB_7322_RcvHdrHead0_OFFS 0x200008 +#define QIB_7322_RcvHdrHead0_DEF 0x0000000000000000 +#define QIB_7322_RcvHdrHead0_counter_LSB 0x20 +#define QIB_7322_RcvHdrHead0_counter_MSB 0x2F +#define QIB_7322_RcvHdrHead0_counter_RMASK 0xFFFF +#define QIB_7322_RcvHdrHead0_RcvHeadPointer_LSB 0x0 +#define QIB_7322_RcvHdrHead0_RcvHeadPointer_MSB 0x1F +#define QIB_7322_RcvHdrHead0_RcvHeadPointer_RMASK 0xFFFFFFFF + +#define QIB_7322_RcvEgrIndexTail0_OFFS 0x200010 +#define QIB_7322_RcvEgrIndexTail0_DEF 0x0000000000000000 + +#define QIB_7322_RcvEgrIndexHead0_OFFS 0x200018 +#define QIB_7322_RcvEgrIndexHead0_DEF 0x0000000000000000 + +#define QIB_7322_RcvTIDFlowTable0_OFFS 0x201000 +#define QIB_7322_RcvTIDFlowTable0_DEF 0x0000000000000000 +#define QIB_7322_RcvTIDFlowTable0_GenMismatch_LSB 0x1C +#define QIB_7322_RcvTIDFlowTable0_GenMismatch_MSB 0x1C +#define QIB_7322_RcvTIDFlowTable0_GenMismatch_RMASK 0x1 +#define QIB_7322_RcvTIDFlowTable0_SeqMismatch_LSB 0x1B +#define QIB_7322_RcvTIDFlowTable0_SeqMismatch_MSB 0x1B +#define QIB_7322_RcvTIDFlowTable0_SeqMismatch_RMASK 0x1 +#define QIB_7322_RcvTIDFlowTable0_KeepOnGenErr_LSB 0x16 +#define QIB_7322_RcvTIDFlowTable0_KeepOnGenErr_MSB 0x16 +#define QIB_7322_RcvTIDFlowTable0_KeepOnGenErr_RMASK 0x1 +#define QIB_7322_RcvTIDFlowTable0_KeepAfterSeqErr_LSB 0x15 +#define QIB_7322_RcvTIDFlowTable0_KeepAfterSeqErr_MSB 0x15 +#define QIB_7322_RcvTIDFlowTable0_KeepAfterSeqErr_RMASK 0x1 +#define QIB_7322_RcvTIDFlowTable0_HdrSuppEnabled_LSB 0x14 +#define QIB_7322_RcvTIDFlowTable0_HdrSuppEnabled_MSB 0x14 +#define QIB_7322_RcvTIDFlowTable0_HdrSuppEnabled_RMASK 0x1 +#define QIB_7322_RcvTIDFlowTable0_FlowValid_LSB 0x13 +#define QIB_7322_RcvTIDFlowTable0_FlowValid_MSB 0x13 +#define QIB_7322_RcvTIDFlowTable0_FlowValid_RMASK 0x1 +#define QIB_7322_RcvTIDFlowTable0_GenVal_LSB 0xB +#define QIB_7322_RcvTIDFlowTable0_GenVal_MSB 0x12 +#define QIB_7322_RcvTIDFlowTable0_GenVal_RMASK 0xFF +#define QIB_7322_RcvTIDFlowTable0_SeqNum_LSB 0x0 +#define QIB_7322_RcvTIDFlowTable0_SeqNum_MSB 0xA +#define QIB_7322_RcvTIDFlowTable0_SeqNum_RMASK 0x7FF diff --git a/kernel/drivers/infiniband/hw/qib/qib_common.h b/kernel/drivers/infiniband/hw/qib/qib_common.h new file mode 100644 index 000000000..4fb78abd8 --- /dev/null +++ b/kernel/drivers/infiniband/hw/qib/qib_common.h @@ -0,0 +1,812 @@ +/* + * Copyright (c) 2006, 2007, 2008, 2009, 2010 QLogic Corporation. + * All rights reserved. + * Copyright (c) 2003, 2004, 2005, 2006 PathScale, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. 
+ * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef _QIB_COMMON_H +#define _QIB_COMMON_H + +/* + * This file contains defines, structures, etc. that are used + * to communicate between kernel and user code. + */ + +/* This is the IEEE-assigned OUI for QLogic Inc. QLogic_IB */ +#define QIB_SRC_OUI_1 0x00 +#define QIB_SRC_OUI_2 0x11 +#define QIB_SRC_OUI_3 0x75 + +/* version of protocol header (known to chip also). In the long run, + * we should be able to generate and accept a range of version numbers; + * for now we only accept one, and it's compiled in. + */ +#define IPS_PROTO_VERSION 2 + +/* + * These are compile time constants that you may want to enable or disable + * if you are trying to debug problems with code or performance. + * QIB_VERBOSE_TRACING define as 1 if you want additional tracing in + * fastpath code + * QIB_TRACE_REGWRITES define as 1 if you want register writes to be + * traced in faspath code + * _QIB_TRACING define as 0 if you want to remove all tracing in a + * compilation unit + */ + +/* + * The value in the BTH QP field that QLogic_IB uses to differentiate + * an qlogic_ib protocol IB packet vs standard IB transport + * This it needs to be even (0x656b78), because the LSB is sometimes + * used for the MSB of context. The change may cause a problem + * interoperating with older software. + */ +#define QIB_KD_QP 0x656b78 + +/* + * These are the status bits readable (in ascii form, 64bit value) + * from the "status" sysfs file. For binary compatibility, values + * must remain as is; removed states can be reused for different + * purposes. + */ +#define QIB_STATUS_INITTED 0x1 /* basic initialization done */ +/* Chip has been found and initted */ +#define QIB_STATUS_CHIP_PRESENT 0x20 +/* IB link is at ACTIVE, usable for data traffic */ +#define QIB_STATUS_IB_READY 0x40 +/* link is configured, LID, MTU, etc. have been set */ +#define QIB_STATUS_IB_CONF 0x80 +/* A Fatal hardware error has occurred. */ +#define QIB_STATUS_HWERROR 0x200 + +/* + * The list of usermode accessible registers. Also see Reg_* later in file. + */ +enum qib_ureg { + /* (RO) DMA RcvHdr to be used next. */ + ur_rcvhdrtail = 0, + /* (RW) RcvHdr entry to be processed next by host. */ + ur_rcvhdrhead = 1, + /* (RO) Index of next Eager index to use. */ + ur_rcvegrindextail = 2, + /* (RW) Eager TID to be processed next */ + ur_rcvegrindexhead = 3, + /* For internal use only; max register number. 
*/ + _QIB_UregMax +}; + +/* bit values for spi_runtime_flags */ +#define QIB_RUNTIME_PCIE 0x0002 +#define QIB_RUNTIME_FORCE_WC_ORDER 0x0004 +#define QIB_RUNTIME_RCVHDR_COPY 0x0008 +#define QIB_RUNTIME_MASTER 0x0010 +#define QIB_RUNTIME_RCHK 0x0020 +#define QIB_RUNTIME_NODMA_RTAIL 0x0080 +#define QIB_RUNTIME_SPECIAL_TRIGGER 0x0100 +#define QIB_RUNTIME_SDMA 0x0200 +#define QIB_RUNTIME_FORCE_PIOAVAIL 0x0400 +#define QIB_RUNTIME_PIO_REGSWAPPED 0x0800 +#define QIB_RUNTIME_CTXT_MSB_IN_QP 0x1000 +#define QIB_RUNTIME_CTXT_REDIRECT 0x2000 +#define QIB_RUNTIME_HDRSUPP 0x4000 + +/* + * This structure is returned by qib_userinit() immediately after + * open to get implementation-specific info, and info specific to this + * instance. + * + * This struct must have explict pad fields where type sizes + * may result in different alignments between 32 and 64 bit + * programs, since the 64 bit * bit kernel requires the user code + * to have matching offsets + */ +struct qib_base_info { + /* version of hardware, for feature checking. */ + __u32 spi_hw_version; + /* version of software, for feature checking. */ + __u32 spi_sw_version; + /* QLogic_IB context assigned, goes into sent packets */ + __u16 spi_ctxt; + __u16 spi_subctxt; + /* + * IB MTU, packets IB data must be less than this. + * The MTU is in bytes, and will be a multiple of 4 bytes. + */ + __u32 spi_mtu; + /* + * Size of a PIO buffer. Any given packet's total size must be less + * than this (in words). Included is the starting control word, so + * if 513 is returned, then total pkt size is 512 words or less. + */ + __u32 spi_piosize; + /* size of the TID cache in qlogic_ib, in entries */ + __u32 spi_tidcnt; + /* size of the TID Eager list in qlogic_ib, in entries */ + __u32 spi_tidegrcnt; + /* size of a single receive header queue entry in words. */ + __u32 spi_rcvhdrent_size; + /* + * Count of receive header queue entries allocated. + * This may be less than the spu_rcvhdrcnt passed in!. + */ + __u32 spi_rcvhdr_cnt; + + /* per-chip and other runtime features bitmap (QIB_RUNTIME_*) */ + __u32 spi_runtime_flags; + + /* address where hardware receive header queue is mapped */ + __u64 spi_rcvhdr_base; + + /* user program. */ + + /* base address of eager TID receive buffers used by hardware. */ + __u64 spi_rcv_egrbufs; + + /* Allocated by initialization code, not by protocol. */ + + /* + * Size of each TID buffer in host memory, starting at + * spi_rcv_egrbufs. The buffers are virtually contiguous. + */ + __u32 spi_rcv_egrbufsize; + /* + * The special QP (queue pair) value that identifies an qlogic_ib + * protocol packet from standard IB packets. More, probably much + * more, to be added. + */ + __u32 spi_qpair; + + /* + * User register base for init code, not to be used directly by + * protocol or applications. Always points to chip registers, + * for normal or shared context. + */ + __u64 spi_uregbase; + /* + * Maximum buffer size in bytes that can be used in a single TID + * entry (assuming the buffer is aligned to this boundary). This is + * the minimum of what the hardware and software support Guaranteed + * to be a power of 2. + */ + __u32 spi_tid_maxsize; + /* + * alignment of each pio send buffer (byte count + * to add to spi_piobufbase to get to second buffer) + */ + __u32 spi_pioalign; + /* + * The index of the first pio buffer available to this process; + * needed to do lookup in spi_pioavailaddr; not added to + * spi_piobufbase. 
+ */ + __u32 spi_pioindex; + /* number of buffers mapped for this process */ + __u32 spi_piocnt; + + /* + * Base address of writeonly pio buffers for this process. + * Each buffer has spi_piosize words, and is aligned on spi_pioalign + * boundaries. spi_piocnt buffers are mapped from this address + */ + __u64 spi_piobufbase; + + /* + * Base address of readonly memory copy of the pioavail registers. + * There are 2 bits for each buffer. + */ + __u64 spi_pioavailaddr; + + /* + * Address where driver updates a copy of the interface and driver + * status (QIB_STATUS_*) as a 64 bit value. It's followed by a + * link status qword (formerly combined with driver status), then a + * string indicating hardware error, if there was one. + */ + __u64 spi_status; + + /* number of chip ctxts available to user processes */ + __u32 spi_nctxts; + __u16 spi_unit; /* unit number of chip we are using */ + __u16 spi_port; /* IB port number we are using */ + /* num bufs in each contiguous set */ + __u32 spi_rcv_egrperchunk; + /* size in bytes of each contiguous set */ + __u32 spi_rcv_egrchunksize; + /* total size of mmap to cover full rcvegrbuffers */ + __u32 spi_rcv_egrbuftotlen; + __u32 spi_rhf_offset; /* dword offset in hdrqent for rcvhdr flags */ + /* address of readonly memory copy of the rcvhdrq tail register. */ + __u64 spi_rcvhdr_tailaddr; + + /* + * shared memory pages for subctxts if ctxt is shared; these cover + * all the processes in the group sharing a single context. + * all have enough space for the num_subcontexts value on this job. + */ + __u64 spi_subctxt_uregbase; + __u64 spi_subctxt_rcvegrbuf; + __u64 spi_subctxt_rcvhdr_base; + + /* shared memory page for send buffer disarm status */ + __u64 spi_sendbuf_status; +} __aligned(8); + +/* + * This version number is given to the driver by the user code during + * initialization in the spu_userversion field of qib_user_info, so + * the driver can check for compatibility with user code. + * + * The major version changes when data structures + * change in an incompatible way. The driver must be the same or higher + * for initialization to succeed. In some cases, a higher version + * driver will not interoperate with older software, and initialization + * will return an error. + */ +#define QIB_USER_SWMAJOR 1 + +/* + * Minor version differences are always compatible + * a within a major version, however if user software is larger + * than driver software, some new features and/or structure fields + * may not be implemented; the user code must deal with this if it + * cares, or it must abort after initialization reports the difference. + */ +#define QIB_USER_SWMINOR 13 + +#define QIB_USER_SWVERSION ((QIB_USER_SWMAJOR << 16) | QIB_USER_SWMINOR) + +#ifndef QIB_KERN_TYPE +#define QIB_KERN_TYPE 0 +#endif + +/* + * Similarly, this is the kernel version going back to the user. It's + * slightly different, in that we want to tell if the driver was built as + * part of a QLogic release, or from the driver from openfabrics.org, + * kernel.org, or a standard distribution, for support reasons. + * The high bit is 0 for non-QLogic and 1 for QLogic-built/supplied. + * + * It's returned by the driver to the user code during initialization in the + * spi_sw_version field of qib_base_info, so the user code can in turn + * check for compatibility with the kernel. +*/ +#define QIB_KERN_SWVERSION ((QIB_KERN_TYPE << 31) | QIB_USER_SWVERSION) + +/* + * Define the driver version number. 
This is something that refers only + * to the driver itself, not the software interfaces it supports. + */ +#define QIB_DRIVER_VERSION_BASE "1.11" + +/* create the final driver version string */ +#ifdef QIB_IDSTR +#define QIB_DRIVER_VERSION QIB_DRIVER_VERSION_BASE " " QIB_IDSTR +#else +#define QIB_DRIVER_VERSION QIB_DRIVER_VERSION_BASE +#endif + +/* + * If the unit is specified via open, HCA choice is fixed. If port is + * specified, it's also fixed. Otherwise we try to spread contexts + * across ports and HCAs, using different algorithims. WITHIN is + * the old default, prior to this mechanism. + */ +#define QIB_PORT_ALG_ACROSS 0 /* round robin contexts across HCAs, then + * ports; this is the default */ +#define QIB_PORT_ALG_WITHIN 1 /* use all contexts on an HCA (round robin + * active ports within), then next HCA */ +#define QIB_PORT_ALG_COUNT 2 /* number of algorithm choices */ + +/* + * This structure is passed to qib_userinit() to tell the driver where + * user code buffers are, sizes, etc. The offsets and sizes of the + * fields must remain unchanged, for binary compatibility. It can + * be extended, if userversion is changed so user code can tell, if needed + */ +struct qib_user_info { + /* + * version of user software, to detect compatibility issues. + * Should be set to QIB_USER_SWVERSION. + */ + __u32 spu_userversion; + + __u32 _spu_unused2; + + /* size of struct base_info to write to */ + __u32 spu_base_info_size; + + __u32 spu_port_alg; /* which QIB_PORT_ALG_*; unused user minor < 11 */ + + /* + * If two or more processes wish to share a context, each process + * must set the spu_subctxt_cnt and spu_subctxt_id to the same + * values. The only restriction on the spu_subctxt_id is that + * it be unique for a given node. + */ + __u16 spu_subctxt_cnt; + __u16 spu_subctxt_id; + + __u32 spu_port; /* IB port requested by user if > 0 */ + + /* + * address of struct base_info to write to + */ + __u64 spu_base_info; + +} __aligned(8); + +/* User commands. */ + +/* 16 available, was: old set up userspace (for old user code) */ +#define QIB_CMD_CTXT_INFO 17 /* find out what resources we got */ +#define QIB_CMD_RECV_CTRL 18 /* control receipt of packets */ +#define QIB_CMD_TID_UPDATE 19 /* update expected TID entries */ +#define QIB_CMD_TID_FREE 20 /* free expected TID entries */ +#define QIB_CMD_SET_PART_KEY 21 /* add partition key */ +/* 22 available, was: return info on slave processes (for old user code) */ +#define QIB_CMD_ASSIGN_CTXT 23 /* allocate HCA and ctxt */ +#define QIB_CMD_USER_INIT 24 /* set up userspace */ +#define QIB_CMD_UNUSED_1 25 +#define QIB_CMD_UNUSED_2 26 +#define QIB_CMD_PIOAVAILUPD 27 /* force an update of PIOAvail reg */ +#define QIB_CMD_POLL_TYPE 28 /* set the kind of polling we want */ +#define QIB_CMD_ARMLAUNCH_CTRL 29 /* armlaunch detection control */ +/* 30 is unused */ +#define QIB_CMD_SDMA_INFLIGHT 31 /* sdma inflight counter request */ +#define QIB_CMD_SDMA_COMPLETE 32 /* sdma completion counter request */ +/* 33 available, was a testing feature */ +#define QIB_CMD_DISARM_BUFS 34 /* disarm send buffers w/ errors */ +#define QIB_CMD_ACK_EVENT 35 /* ack & clear bits */ +#define QIB_CMD_CPUS_LIST 36 /* list of cpus allocated, for pinned + * processes: qib_cpus_list */ + +/* + * QIB_CMD_ACK_EVENT obsoletes QIB_CMD_DISARM_BUFS, but we keep it for + * compatibility with libraries from previous release. The ACK_EVENT + * will take appropriate driver action (if any, just DISARM for now), + * then clear the bits passed in as part of the mask. 
These bits are + * in the first 64bit word at spi_sendbuf_status, and are passed to + * the driver in the event_mask union as well. + */ +#define _QIB_EVENT_DISARM_BUFS_BIT 0 +#define _QIB_EVENT_LINKDOWN_BIT 1 +#define _QIB_EVENT_LID_CHANGE_BIT 2 +#define _QIB_EVENT_LMC_CHANGE_BIT 3 +#define _QIB_EVENT_SL2VL_CHANGE_BIT 4 +#define _QIB_MAX_EVENT_BIT _QIB_EVENT_SL2VL_CHANGE_BIT + +#define QIB_EVENT_DISARM_BUFS_BIT (1UL << _QIB_EVENT_DISARM_BUFS_BIT) +#define QIB_EVENT_LINKDOWN_BIT (1UL << _QIB_EVENT_LINKDOWN_BIT) +#define QIB_EVENT_LID_CHANGE_BIT (1UL << _QIB_EVENT_LID_CHANGE_BIT) +#define QIB_EVENT_LMC_CHANGE_BIT (1UL << _QIB_EVENT_LMC_CHANGE_BIT) +#define QIB_EVENT_SL2VL_CHANGE_BIT (1UL << _QIB_EVENT_SL2VL_CHANGE_BIT) + + +/* + * Poll types + */ +#define QIB_POLL_TYPE_ANYRCV 0x0 +#define QIB_POLL_TYPE_URGENT 0x1 + +struct qib_ctxt_info { + __u16 num_active; /* number of active units */ + __u16 unit; /* unit (chip) assigned to caller */ + __u16 port; /* IB port assigned to caller (1-based) */ + __u16 ctxt; /* ctxt on unit assigned to caller */ + __u16 subctxt; /* subctxt on unit assigned to caller */ + __u16 num_ctxts; /* number of ctxts available on unit */ + __u16 num_subctxts; /* number of subctxts opened on ctxt */ + __u16 rec_cpu; /* cpu # for affinity (ffff if none) */ +}; + +struct qib_tid_info { + __u32 tidcnt; + /* make structure same size in 32 and 64 bit */ + __u32 tid__unused; + /* virtual address of first page in transfer */ + __u64 tidvaddr; + /* pointer (same size 32/64 bit) to __u16 tid array */ + __u64 tidlist; + + /* + * pointer (same size 32/64 bit) to bitmap of TIDs used + * for this call; checked for being large enough at open + */ + __u64 tidmap; +}; + +struct qib_cmd { + __u32 type; /* command type */ + union { + struct qib_tid_info tid_info; + struct qib_user_info user_info; + + /* + * address in userspace where we should put the sdma + * inflight counter + */ + __u64 sdma_inflight; + /* + * address in userspace where we should put the sdma + * completion counter + */ + __u64 sdma_complete; + /* address in userspace of struct qib_ctxt_info to + write result to */ + __u64 ctxt_info; + /* enable/disable receipt of packets */ + __u32 recv_ctrl; + /* enable/disable armlaunch errors (non-zero to enable) */ + __u32 armlaunch_ctrl; + /* partition key to set */ + __u16 part_key; + /* user address of __u32 bitmask of active slaves */ + __u64 slave_mask_addr; + /* type of polling we want */ + __u16 poll_type; + /* back pressure enable bit for one particular context */ + __u8 ctxt_bp; + /* qib_user_event_ack(), IPATH_EVENT_* bits */ + __u64 event_mask; + } cmd; +}; + +struct qib_iovec { + /* Pointer to data, but same size 32 and 64 bit */ + __u64 iov_base; + + /* + * Length of data; don't need 64 bits, but want + * qib_sendpkt to remain same size as before 32 bit changes, so... + */ + __u64 iov_len; +}; + +/* + * Describes a single packet for send. Each packet can have one or more + * buffers, but the total length (exclusive of IB headers) must be less + * than the MTU, and if using the PIO method, entire packet length, + * including IB headers, must be less than the qib_piosize value (words). + * Use of this necessitates including sys/uio.h + */ +struct __qib_sendpkt { + __u32 sps_flags; /* flags for packet (TBD) */ + __u32 sps_cnt; /* number of entries to use in sps_iov */ + /* array of iov's describing packet. TEMPORARY */ + struct qib_iovec sps_iov[4]; +}; + +/* + * Diagnostics can send a packet by "writing" the following + * structs to the diag data special file. 
+ * This allows a custom + * pbc (+ static rate) qword, so that special modes and deliberate + * changes to CRCs can be used. The elements were also re-ordered + * for better alignment and to avoid padding issues. + */ +#define _DIAG_XPKT_VERS 3 +struct qib_diag_xpkt { + __u16 version; + __u16 unit; + __u16 port; + __u16 len; + __u64 data; + __u64 pbc_wd; +}; + +/* + * Data layout in I2C flash (for GUID, etc.) + * All fields are little-endian binary unless otherwise stated + */ +#define QIB_FLASH_VERSION 2 +struct qib_flash { + /* flash layout version (QIB_FLASH_VERSION) */ + __u8 if_fversion; + /* checksum protecting if_length bytes */ + __u8 if_csum; + /* + * valid length (in use, protected by if_csum), including + * if_fversion and if_csum themselves) + */ + __u8 if_length; + /* the GUID, in network order */ + __u8 if_guid[8]; + /* number of GUIDs to use, starting from if_guid */ + __u8 if_numguid; + /* the (last 10 characters of) board serial number, in ASCII */ + char if_serial[12]; + /* board mfg date (YYYYMMDD ASCII) */ + char if_mfgdate[8]; + /* last board rework/test date (YYYYMMDD ASCII) */ + char if_testdate[8]; + /* logging of error counts, TBD */ + __u8 if_errcntp[4]; + /* powered on hours, updated at driver unload */ + __u8 if_powerhour[2]; + /* ASCII free-form comment field */ + char if_comment[32]; + /* Backwards compatible prefix for longer QLogic Serial Numbers */ + char if_sprefix[4]; + /* 82 bytes used, min flash size is 128 bytes */ + __u8 if_future[46]; +}; + +/* + * These are the counters implemented in the chip, and are listed in order. + * The InterCaps naming is taken straight from the chip spec. + */ +struct qlogic_ib_counters { + __u64 LBIntCnt; + __u64 LBFlowStallCnt; + __u64 TxSDmaDescCnt; /* was Reserved1 */ + __u64 TxUnsupVLErrCnt; + __u64 TxDataPktCnt; + __u64 TxFlowPktCnt; + __u64 TxDwordCnt; + __u64 TxLenErrCnt; + __u64 TxMaxMinLenErrCnt; + __u64 TxUnderrunCnt; + __u64 TxFlowStallCnt; + __u64 TxDroppedPktCnt; + __u64 RxDroppedPktCnt; + __u64 RxDataPktCnt; + __u64 RxFlowPktCnt; + __u64 RxDwordCnt; + __u64 RxLenErrCnt; + __u64 RxMaxMinLenErrCnt; + __u64 RxICRCErrCnt; + __u64 RxVCRCErrCnt; + __u64 RxFlowCtrlErrCnt; + __u64 RxBadFormatCnt; + __u64 RxLinkProblemCnt; + __u64 RxEBPCnt; + __u64 RxLPCRCErrCnt; + __u64 RxBufOvflCnt; + __u64 RxTIDFullErrCnt; + __u64 RxTIDValidErrCnt; + __u64 RxPKeyMismatchCnt; + __u64 RxP0HdrEgrOvflCnt; + __u64 RxP1HdrEgrOvflCnt; + __u64 RxP2HdrEgrOvflCnt; + __u64 RxP3HdrEgrOvflCnt; + __u64 RxP4HdrEgrOvflCnt; + __u64 RxP5HdrEgrOvflCnt; + __u64 RxP6HdrEgrOvflCnt; + __u64 RxP7HdrEgrOvflCnt; + __u64 RxP8HdrEgrOvflCnt; + __u64 RxP9HdrEgrOvflCnt; + __u64 RxP10HdrEgrOvflCnt; + __u64 RxP11HdrEgrOvflCnt; + __u64 RxP12HdrEgrOvflCnt; + __u64 RxP13HdrEgrOvflCnt; + __u64 RxP14HdrEgrOvflCnt; + __u64 RxP15HdrEgrOvflCnt; + __u64 RxP16HdrEgrOvflCnt; + __u64 IBStatusChangeCnt; + __u64 IBLinkErrRecoveryCnt; + __u64 IBLinkDownedCnt; + __u64 IBSymbolErrCnt; + __u64 RxVL15DroppedPktCnt; + __u64 RxOtherLocalPhyErrCnt; + __u64 PcieRetryBufDiagQwordCnt; + __u64 ExcessBufferOvflCnt; + __u64 LocalLinkIntegrityErrCnt; + __u64 RxVlErrCnt; + __u64 RxDlidFltrCnt; +}; + +/* + * The next set of defines are for packet headers, and chip register + * and memory bits that are visible to and/or used by user-mode software. 
+ */ + +/* RcvHdrFlags bits */ +#define QLOGIC_IB_RHF_LENGTH_MASK 0x7FF +#define QLOGIC_IB_RHF_LENGTH_SHIFT 0 +#define QLOGIC_IB_RHF_RCVTYPE_MASK 0x7 +#define QLOGIC_IB_RHF_RCVTYPE_SHIFT 11 +#define QLOGIC_IB_RHF_EGRINDEX_MASK 0xFFF +#define QLOGIC_IB_RHF_EGRINDEX_SHIFT 16 +#define QLOGIC_IB_RHF_SEQ_MASK 0xF +#define QLOGIC_IB_RHF_SEQ_SHIFT 0 +#define QLOGIC_IB_RHF_HDRQ_OFFSET_MASK 0x7FF +#define QLOGIC_IB_RHF_HDRQ_OFFSET_SHIFT 4 +#define QLOGIC_IB_RHF_H_ICRCERR 0x80000000 +#define QLOGIC_IB_RHF_H_VCRCERR 0x40000000 +#define QLOGIC_IB_RHF_H_PARITYERR 0x20000000 +#define QLOGIC_IB_RHF_H_LENERR 0x10000000 +#define QLOGIC_IB_RHF_H_MTUERR 0x08000000 +#define QLOGIC_IB_RHF_H_IHDRERR 0x04000000 +#define QLOGIC_IB_RHF_H_TIDERR 0x02000000 +#define QLOGIC_IB_RHF_H_MKERR 0x01000000 +#define QLOGIC_IB_RHF_H_IBERR 0x00800000 +#define QLOGIC_IB_RHF_H_ERR_MASK 0xFF800000 +#define QLOGIC_IB_RHF_L_USE_EGR 0x80000000 +#define QLOGIC_IB_RHF_L_SWA 0x00008000 +#define QLOGIC_IB_RHF_L_SWB 0x00004000 + +/* qlogic_ib header fields */ +#define QLOGIC_IB_I_VERS_MASK 0xF +#define QLOGIC_IB_I_VERS_SHIFT 28 +#define QLOGIC_IB_I_CTXT_MASK 0xF +#define QLOGIC_IB_I_CTXT_SHIFT 24 +#define QLOGIC_IB_I_TID_MASK 0x7FF +#define QLOGIC_IB_I_TID_SHIFT 13 +#define QLOGIC_IB_I_OFFSET_MASK 0x1FFF +#define QLOGIC_IB_I_OFFSET_SHIFT 0 + +/* K_PktFlags bits */ +#define QLOGIC_IB_KPF_INTR 0x1 +#define QLOGIC_IB_KPF_SUBCTXT_MASK 0x3 +#define QLOGIC_IB_KPF_SUBCTXT_SHIFT 1 + +#define QLOGIC_IB_MAX_SUBCTXT 4 + +/* SendPIO per-buffer control */ +#define QLOGIC_IB_SP_TEST 0x40 +#define QLOGIC_IB_SP_TESTEBP 0x20 +#define QLOGIC_IB_SP_TRIGGER_SHIFT 15 + +/* SendPIOAvail bits */ +#define QLOGIC_IB_SENDPIOAVAIL_BUSY_SHIFT 1 +#define QLOGIC_IB_SENDPIOAVAIL_CHECK_SHIFT 0 + +/* qlogic_ib header format */ +struct qib_header { + /* + * Version - 4 bits, Context - 4 bits, TID - 10 bits and Offset - + * 14 bits before ECO change ~28 Dec 03. After that, Vers 4, + * Context 4, TID 11, offset 13. + */ + __le32 ver_ctxt_tid_offset; + __le16 chksum; + __le16 pkt_flags; +}; + +/* + * qlogic_ib user message header format. + * This structure contains the first 4 fields common to all protocols + * that employ qlogic_ib. + */ +struct qib_message_header { + __be16 lrh[4]; + __be32 bth[3]; + /* fields below this point are in host byte order */ + struct qib_header iph; + /* fields below are simplified, but should match PSM */ + /* some are accessed by driver when packet spliting is needed */ + __u8 sub_opcode; + __u8 flags; + __u16 commidx; + __u32 ack_seq_num; + __u8 flowid; + __u8 hdr_dlen; + __u16 mqhdr; + __u32 uwords[4]; +}; + +/* sequence number bits for message */ +union qib_seqnum { + struct { + __u32 seq:11; + __u32 gen:8; + __u32 flow:5; + }; + struct { + __u32 pkt:16; + __u32 msg:8; + }; + __u32 val; +}; + +/* qib receiving-dma tid-session-member */ +struct qib_tid_session_member { + __u16 tid; + __u16 offset; + __u16 length; +}; + +/* IB - LRH header consts */ +#define QIB_LRH_GRH 0x0003 /* 1. word of IB LRH - next header: GRH */ +#define QIB_LRH_BTH 0x0002 /* 1. word of IB LRH - next header: BTH */ + +/* misc. 
*/ +#define SIZE_OF_CRC 1 + +#define QIB_DEFAULT_P_KEY 0xFFFF +#define QIB_PERMISSIVE_LID 0xFFFF +#define QIB_AETH_CREDIT_SHIFT 24 +#define QIB_AETH_CREDIT_MASK 0x1F +#define QIB_AETH_CREDIT_INVAL 0x1F +#define QIB_PSN_MASK 0xFFFFFF +#define QIB_MSN_MASK 0xFFFFFF +#define QIB_QPN_MASK 0xFFFFFF +#define QIB_MULTICAST_LID_BASE 0xC000 +#define QIB_EAGER_TID_ID QLOGIC_IB_I_TID_MASK +#define QIB_MULTICAST_QPN 0xFFFFFF + +/* Receive Header Queue: receive type (from qlogic_ib) */ +#define RCVHQ_RCV_TYPE_EXPECTED 0 +#define RCVHQ_RCV_TYPE_EAGER 1 +#define RCVHQ_RCV_TYPE_NON_KD 2 +#define RCVHQ_RCV_TYPE_ERROR 3 + +#define QIB_HEADER_QUEUE_WORDS 9 + +/* functions for extracting fields from rcvhdrq entries for the driver. + */ +static inline __u32 qib_hdrget_err_flags(const __le32 *rbuf) +{ + return __le32_to_cpu(rbuf[1]) & QLOGIC_IB_RHF_H_ERR_MASK; +} + +static inline __u32 qib_hdrget_rcv_type(const __le32 *rbuf) +{ + return (__le32_to_cpu(rbuf[0]) >> QLOGIC_IB_RHF_RCVTYPE_SHIFT) & + QLOGIC_IB_RHF_RCVTYPE_MASK; +} + +static inline __u32 qib_hdrget_length_in_bytes(const __le32 *rbuf) +{ + return ((__le32_to_cpu(rbuf[0]) >> QLOGIC_IB_RHF_LENGTH_SHIFT) & + QLOGIC_IB_RHF_LENGTH_MASK) << 2; +} + +static inline __u32 qib_hdrget_index(const __le32 *rbuf) +{ + return (__le32_to_cpu(rbuf[0]) >> QLOGIC_IB_RHF_EGRINDEX_SHIFT) & + QLOGIC_IB_RHF_EGRINDEX_MASK; +} + +static inline __u32 qib_hdrget_seq(const __le32 *rbuf) +{ + return (__le32_to_cpu(rbuf[1]) >> QLOGIC_IB_RHF_SEQ_SHIFT) & + QLOGIC_IB_RHF_SEQ_MASK; +} + +static inline __u32 qib_hdrget_offset(const __le32 *rbuf) +{ + return (__le32_to_cpu(rbuf[1]) >> QLOGIC_IB_RHF_HDRQ_OFFSET_SHIFT) & + QLOGIC_IB_RHF_HDRQ_OFFSET_MASK; +} + +static inline __u32 qib_hdrget_use_egr_buf(const __le32 *rbuf) +{ + return __le32_to_cpu(rbuf[0]) & QLOGIC_IB_RHF_L_USE_EGR; +} + +static inline __u32 qib_hdrget_qib_ver(__le32 hdrword) +{ + return (__le32_to_cpu(hdrword) >> QLOGIC_IB_I_VERS_SHIFT) & + QLOGIC_IB_I_VERS_MASK; +} + +#endif /* _QIB_COMMON_H */ diff --git a/kernel/drivers/infiniband/hw/qib/qib_cq.c b/kernel/drivers/infiniband/hw/qib/qib_cq.c new file mode 100644 index 000000000..ab4e11cfa --- /dev/null +++ b/kernel/drivers/infiniband/hw/qib/qib_cq.c @@ -0,0 +1,540 @@ +/* + * Copyright (c) 2013 Intel Corporation. All rights reserved. + * Copyright (c) 2006, 2007, 2008, 2010 QLogic Corporation. All rights reserved. + * Copyright (c) 2005, 2006 PathScale, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include +#include +#include +#include + +#include "qib_verbs.h" +#include "qib.h" + +/** + * qib_cq_enter - add a new entry to the completion queue + * @cq: completion queue + * @entry: work completion entry to add + * @sig: true if @entry is a solicitated entry + * + * This may be called with qp->s_lock held. + */ +void qib_cq_enter(struct qib_cq *cq, struct ib_wc *entry, int solicited) +{ + struct qib_cq_wc *wc; + unsigned long flags; + u32 head; + u32 next; + + spin_lock_irqsave(&cq->lock, flags); + + /* + * Note that the head pointer might be writable by user processes. + * Take care to verify it is a sane value. + */ + wc = cq->queue; + head = wc->head; + if (head >= (unsigned) cq->ibcq.cqe) { + head = cq->ibcq.cqe; + next = 0; + } else + next = head + 1; + if (unlikely(next == wc->tail)) { + spin_unlock_irqrestore(&cq->lock, flags); + if (cq->ibcq.event_handler) { + struct ib_event ev; + + ev.device = cq->ibcq.device; + ev.element.cq = &cq->ibcq; + ev.event = IB_EVENT_CQ_ERR; + cq->ibcq.event_handler(&ev, cq->ibcq.cq_context); + } + return; + } + if (cq->ip) { + wc->uqueue[head].wr_id = entry->wr_id; + wc->uqueue[head].status = entry->status; + wc->uqueue[head].opcode = entry->opcode; + wc->uqueue[head].vendor_err = entry->vendor_err; + wc->uqueue[head].byte_len = entry->byte_len; + wc->uqueue[head].ex.imm_data = + (__u32 __force)entry->ex.imm_data; + wc->uqueue[head].qp_num = entry->qp->qp_num; + wc->uqueue[head].src_qp = entry->src_qp; + wc->uqueue[head].wc_flags = entry->wc_flags; + wc->uqueue[head].pkey_index = entry->pkey_index; + wc->uqueue[head].slid = entry->slid; + wc->uqueue[head].sl = entry->sl; + wc->uqueue[head].dlid_path_bits = entry->dlid_path_bits; + wc->uqueue[head].port_num = entry->port_num; + /* Make sure entry is written before the head index. */ + smp_wmb(); + } else + wc->kqueue[head] = *entry; + wc->head = next; + + if (cq->notify == IB_CQ_NEXT_COMP || + (cq->notify == IB_CQ_SOLICITED && + (solicited || entry->status != IB_WC_SUCCESS))) { + struct kthread_worker *worker; + /* + * This will cause send_complete() to be called in + * another thread. + */ + smp_rmb(); + worker = cq->dd->worker; + if (likely(worker)) { + cq->notify = IB_CQ_NONE; + cq->triggered++; + queue_kthread_work(worker, &cq->comptask); + } + } + + spin_unlock_irqrestore(&cq->lock, flags); +} + +/** + * qib_poll_cq - poll for work completion entries + * @ibcq: the completion queue to poll + * @num_entries: the maximum number of entries to return + * @entry: pointer to array where work completions are placed + * + * Returns the number of completion entries polled. + * + * This may be called from interrupt context. Also called by ib_poll_cq() + * in the generic verbs code. 
+ */ +int qib_poll_cq(struct ib_cq *ibcq, int num_entries, struct ib_wc *entry) +{ + struct qib_cq *cq = to_icq(ibcq); + struct qib_cq_wc *wc; + unsigned long flags; + int npolled; + u32 tail; + + /* The kernel can only poll a kernel completion queue */ + if (cq->ip) { + npolled = -EINVAL; + goto bail; + } + + spin_lock_irqsave(&cq->lock, flags); + + wc = cq->queue; + tail = wc->tail; + if (tail > (u32) cq->ibcq.cqe) + tail = (u32) cq->ibcq.cqe; + for (npolled = 0; npolled < num_entries; ++npolled, ++entry) { + if (tail == wc->head) + break; + /* The kernel doesn't need a RMB since it has the lock. */ + *entry = wc->kqueue[tail]; + if (tail >= cq->ibcq.cqe) + tail = 0; + else + tail++; + } + wc->tail = tail; + + spin_unlock_irqrestore(&cq->lock, flags); + +bail: + return npolled; +} + +static void send_complete(struct kthread_work *work) +{ + struct qib_cq *cq = container_of(work, struct qib_cq, comptask); + + /* + * The completion handler will most likely rearm the notification + * and poll for all pending entries. If a new completion entry + * is added while we are in this routine, queue_work() + * won't call us again until we return so we check triggered to + * see if we need to call the handler again. + */ + for (;;) { + u8 triggered = cq->triggered; + + /* + * IPoIB connected mode assumes the callback is from a + * soft IRQ. We simulate this by blocking "bottom halves". + * See the implementation for ipoib_cm_handle_tx_wc(), + * netif_tx_lock_bh() and netif_tx_lock(). + */ + local_bh_disable(); + cq->ibcq.comp_handler(&cq->ibcq, cq->ibcq.cq_context); + local_bh_enable(); + + if (cq->triggered == triggered) + return; + } +} + +/** + * qib_create_cq - create a completion queue + * @ibdev: the device this completion queue is attached to + * @entries: the minimum size of the completion queue + * @context: unused by the QLogic_IB driver + * @udata: user data for libibverbs.so + * + * Returns a pointer to the completion queue or negative errno values + * for failure. + * + * Called by ib_create_cq() in the generic verbs code. + */ +struct ib_cq *qib_create_cq(struct ib_device *ibdev, int entries, + int comp_vector, struct ib_ucontext *context, + struct ib_udata *udata) +{ + struct qib_ibdev *dev = to_idev(ibdev); + struct qib_cq *cq; + struct qib_cq_wc *wc; + struct ib_cq *ret; + u32 sz; + + if (entries < 1 || entries > ib_qib_max_cqes) { + ret = ERR_PTR(-EINVAL); + goto done; + } + + /* Allocate the completion queue structure. */ + cq = kmalloc(sizeof(*cq), GFP_KERNEL); + if (!cq) { + ret = ERR_PTR(-ENOMEM); + goto done; + } + + /* + * Allocate the completion queue entries and head/tail pointers. + * This is allocated separately so that it can be resized and + * also mapped into user space. + * We need to use vmalloc() in order to support mmap and large + * numbers of entries. + */ + sz = sizeof(*wc); + if (udata && udata->outlen >= sizeof(__u64)) + sz += sizeof(struct ib_uverbs_wc) * (entries + 1); + else + sz += sizeof(struct ib_wc) * (entries + 1); + wc = vmalloc_user(sz); + if (!wc) { + ret = ERR_PTR(-ENOMEM); + goto bail_cq; + } + + /* + * Return the address of the WC as the offset to mmap. + * See qib_mmap() for details. 
+ */ + if (udata && udata->outlen >= sizeof(__u64)) { + int err; + + cq->ip = qib_create_mmap_info(dev, sz, context, wc); + if (!cq->ip) { + ret = ERR_PTR(-ENOMEM); + goto bail_wc; + } + + err = ib_copy_to_udata(udata, &cq->ip->offset, + sizeof(cq->ip->offset)); + if (err) { + ret = ERR_PTR(err); + goto bail_ip; + } + } else + cq->ip = NULL; + + spin_lock(&dev->n_cqs_lock); + if (dev->n_cqs_allocated == ib_qib_max_cqs) { + spin_unlock(&dev->n_cqs_lock); + ret = ERR_PTR(-ENOMEM); + goto bail_ip; + } + + dev->n_cqs_allocated++; + spin_unlock(&dev->n_cqs_lock); + + if (cq->ip) { + spin_lock_irq(&dev->pending_lock); + list_add(&cq->ip->pending_mmaps, &dev->pending_mmaps); + spin_unlock_irq(&dev->pending_lock); + } + + /* + * ib_create_cq() will initialize cq->ibcq except for cq->ibcq.cqe. + * The number of entries should be >= the number requested or return + * an error. + */ + cq->dd = dd_from_dev(dev); + cq->ibcq.cqe = entries; + cq->notify = IB_CQ_NONE; + cq->triggered = 0; + spin_lock_init(&cq->lock); + init_kthread_work(&cq->comptask, send_complete); + wc->head = 0; + wc->tail = 0; + cq->queue = wc; + + ret = &cq->ibcq; + + goto done; + +bail_ip: + kfree(cq->ip); +bail_wc: + vfree(wc); +bail_cq: + kfree(cq); +done: + return ret; +} + +/** + * qib_destroy_cq - destroy a completion queue + * @ibcq: the completion queue to destroy. + * + * Returns 0 for success. + * + * Called by ib_destroy_cq() in the generic verbs code. + */ +int qib_destroy_cq(struct ib_cq *ibcq) +{ + struct qib_ibdev *dev = to_idev(ibcq->device); + struct qib_cq *cq = to_icq(ibcq); + + flush_kthread_work(&cq->comptask); + spin_lock(&dev->n_cqs_lock); + dev->n_cqs_allocated--; + spin_unlock(&dev->n_cqs_lock); + if (cq->ip) + kref_put(&cq->ip->ref, qib_release_mmap_info); + else + vfree(cq->queue); + kfree(cq); + + return 0; +} + +/** + * qib_req_notify_cq - change the notification type for a completion queue + * @ibcq: the completion queue + * @notify_flags: the type of notification to request + * + * Returns 0 for success. + * + * This may be called from interrupt context. Also called by + * ib_req_notify_cq() in the generic verbs code. + */ +int qib_req_notify_cq(struct ib_cq *ibcq, enum ib_cq_notify_flags notify_flags) +{ + struct qib_cq *cq = to_icq(ibcq); + unsigned long flags; + int ret = 0; + + spin_lock_irqsave(&cq->lock, flags); + /* + * Don't change IB_CQ_NEXT_COMP to IB_CQ_SOLICITED but allow + * any other transitions (see C11-31 and C11-32 in ch. 11.4.2.2). + */ + if (cq->notify != IB_CQ_NEXT_COMP) + cq->notify = notify_flags & IB_CQ_SOLICITED_MASK; + + if ((notify_flags & IB_CQ_REPORT_MISSED_EVENTS) && + cq->queue->head != cq->queue->tail) + ret = 1; + + spin_unlock_irqrestore(&cq->lock, flags); + + return ret; +} + +/** + * qib_resize_cq - change the size of the CQ + * @ibcq: the completion queue + * + * Returns 0 for success. + */ +int qib_resize_cq(struct ib_cq *ibcq, int cqe, struct ib_udata *udata) +{ + struct qib_cq *cq = to_icq(ibcq); + struct qib_cq_wc *old_wc; + struct qib_cq_wc *wc; + u32 head, tail, n; + int ret; + u32 sz; + + if (cqe < 1 || cqe > ib_qib_max_cqes) { + ret = -EINVAL; + goto bail; + } + + /* + * Need to use vmalloc() if we want to support large #s of entries. + */ + sz = sizeof(*wc); + if (udata && udata->outlen >= sizeof(__u64)) + sz += sizeof(struct ib_uverbs_wc) * (cqe + 1); + else + sz += sizeof(struct ib_wc) * (cqe + 1); + wc = vmalloc_user(sz); + if (!wc) { + ret = -ENOMEM; + goto bail; + } + + /* Check that we can write the offset to mmap. 
*/ + if (udata && udata->outlen >= sizeof(__u64)) { + __u64 offset = 0; + + ret = ib_copy_to_udata(udata, &offset, sizeof(offset)); + if (ret) + goto bail_free; + } + + spin_lock_irq(&cq->lock); + /* + * Make sure head and tail are sane since they + * might be user writable. + */ + old_wc = cq->queue; + head = old_wc->head; + if (head > (u32) cq->ibcq.cqe) + head = (u32) cq->ibcq.cqe; + tail = old_wc->tail; + if (tail > (u32) cq->ibcq.cqe) + tail = (u32) cq->ibcq.cqe; + if (head < tail) + n = cq->ibcq.cqe + 1 + head - tail; + else + n = head - tail; + if (unlikely((u32)cqe < n)) { + ret = -EINVAL; + goto bail_unlock; + } + for (n = 0; tail != head; n++) { + if (cq->ip) + wc->uqueue[n] = old_wc->uqueue[tail]; + else + wc->kqueue[n] = old_wc->kqueue[tail]; + if (tail == (u32) cq->ibcq.cqe) + tail = 0; + else + tail++; + } + cq->ibcq.cqe = cqe; + wc->head = n; + wc->tail = 0; + cq->queue = wc; + spin_unlock_irq(&cq->lock); + + vfree(old_wc); + + if (cq->ip) { + struct qib_ibdev *dev = to_idev(ibcq->device); + struct qib_mmap_info *ip = cq->ip; + + qib_update_mmap_info(dev, ip, sz, wc); + + /* + * Return the offset to mmap. + * See qib_mmap() for details. + */ + if (udata && udata->outlen >= sizeof(__u64)) { + ret = ib_copy_to_udata(udata, &ip->offset, + sizeof(ip->offset)); + if (ret) + goto bail; + } + + spin_lock_irq(&dev->pending_lock); + if (list_empty(&ip->pending_mmaps)) + list_add(&ip->pending_mmaps, &dev->pending_mmaps); + spin_unlock_irq(&dev->pending_lock); + } + + ret = 0; + goto bail; + +bail_unlock: + spin_unlock_irq(&cq->lock); +bail_free: + vfree(wc); +bail: + return ret; +} + +int qib_cq_init(struct qib_devdata *dd) +{ + int ret = 0; + int cpu; + struct task_struct *task; + + if (dd->worker) + return 0; + dd->worker = kzalloc(sizeof(*dd->worker), GFP_KERNEL); + if (!dd->worker) + return -ENOMEM; + init_kthread_worker(dd->worker); + task = kthread_create_on_node( + kthread_worker_fn, + dd->worker, + dd->assigned_node_id, + "qib_cq%d", dd->unit); + if (IS_ERR(task)) + goto task_fail; + cpu = cpumask_first(cpumask_of_node(dd->assigned_node_id)); + kthread_bind(task, cpu); + wake_up_process(task); +out: + return ret; +task_fail: + ret = PTR_ERR(task); + kfree(dd->worker); + dd->worker = NULL; + goto out; +} + +void qib_cq_exit(struct qib_devdata *dd) +{ + struct kthread_worker *worker; + + worker = dd->worker; + if (!worker) + return; + /* blocks future queuing from send_complete() */ + dd->worker = NULL; + smp_wmb(); + flush_kthread_worker(worker); + kthread_stop(worker->task); + kfree(worker); +} diff --git a/kernel/drivers/infiniband/hw/qib/qib_debugfs.c b/kernel/drivers/infiniband/hw/qib/qib_debugfs.c new file mode 100644 index 000000000..5e75b43c5 --- /dev/null +++ b/kernel/drivers/infiniband/hw/qib/qib_debugfs.c @@ -0,0 +1,283 @@ +#ifdef CONFIG_DEBUG_FS +/* + * Copyright (c) 2013 Intel Corporation. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. 
+ * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include +#include +#include +#include + +#include "qib.h" +#include "qib_verbs.h" +#include "qib_debugfs.h" + +static struct dentry *qib_dbg_root; + +#define DEBUGFS_FILE(name) \ +static const struct seq_operations _##name##_seq_ops = { \ + .start = _##name##_seq_start, \ + .next = _##name##_seq_next, \ + .stop = _##name##_seq_stop, \ + .show = _##name##_seq_show \ +}; \ +static int _##name##_open(struct inode *inode, struct file *s) \ +{ \ + struct seq_file *seq; \ + int ret; \ + ret = seq_open(s, &_##name##_seq_ops); \ + if (ret) \ + return ret; \ + seq = s->private_data; \ + seq->private = inode->i_private; \ + return 0; \ +} \ +static const struct file_operations _##name##_file_ops = { \ + .owner = THIS_MODULE, \ + .open = _##name##_open, \ + .read = seq_read, \ + .llseek = seq_lseek, \ + .release = seq_release \ +}; + +#define DEBUGFS_FILE_CREATE(name) \ +do { \ + struct dentry *ent; \ + ent = debugfs_create_file(#name , 0400, ibd->qib_ibdev_dbg, \ + ibd, &_##name##_file_ops); \ + if (!ent) \ + pr_warn("create of " #name " failed\n"); \ +} while (0) + +static void *_opcode_stats_seq_start(struct seq_file *s, loff_t *pos) +{ + struct qib_opcode_stats_perctx *opstats; + + if (*pos >= ARRAY_SIZE(opstats->stats)) + return NULL; + return pos; +} + +static void *_opcode_stats_seq_next(struct seq_file *s, void *v, loff_t *pos) +{ + struct qib_opcode_stats_perctx *opstats; + + ++*pos; + if (*pos >= ARRAY_SIZE(opstats->stats)) + return NULL; + return pos; +} + + +static void _opcode_stats_seq_stop(struct seq_file *s, void *v) +{ + /* nothing allocated */ +} + +static int _opcode_stats_seq_show(struct seq_file *s, void *v) +{ + loff_t *spos = v; + loff_t i = *spos, j; + u64 n_packets = 0, n_bytes = 0; + struct qib_ibdev *ibd = (struct qib_ibdev *)s->private; + struct qib_devdata *dd = dd_from_dev(ibd); + + for (j = 0; j < dd->first_user_ctxt; j++) { + if (!dd->rcd[j]) + continue; + n_packets += dd->rcd[j]->opstats->stats[i].n_packets; + n_bytes += dd->rcd[j]->opstats->stats[i].n_bytes; + } + if (!n_packets && !n_bytes) + return SEQ_SKIP; + seq_printf(s, "%02llx %llu/%llu\n", i, + (unsigned long long) n_packets, + (unsigned long long) n_bytes); + + return 0; +} + +DEBUGFS_FILE(opcode_stats) + +static void *_ctx_stats_seq_start(struct seq_file *s, loff_t *pos) +{ + struct qib_ibdev *ibd = (struct qib_ibdev *)s->private; + struct qib_devdata *dd = dd_from_dev(ibd); + + if (!*pos) + return SEQ_START_TOKEN; + if (*pos >= dd->first_user_ctxt) + return NULL; + return pos; +} + +static void *_ctx_stats_seq_next(struct seq_file *s, void *v, loff_t *pos) +{ + struct qib_ibdev *ibd = (struct qib_ibdev *)s->private; + struct qib_devdata *dd = dd_from_dev(ibd); + + if (v == SEQ_START_TOKEN) + return pos; + + ++*pos; + if (*pos >= dd->first_user_ctxt) + return NULL; + return pos; +} + +static 
void _ctx_stats_seq_stop(struct seq_file *s, void *v) +{ + /* nothing allocated */ +} + +static int _ctx_stats_seq_show(struct seq_file *s, void *v) +{ + loff_t *spos; + loff_t i, j; + u64 n_packets = 0; + struct qib_ibdev *ibd = (struct qib_ibdev *)s->private; + struct qib_devdata *dd = dd_from_dev(ibd); + + if (v == SEQ_START_TOKEN) { + seq_puts(s, "Ctx:npkts\n"); + return 0; + } + + spos = v; + i = *spos; + + if (!dd->rcd[i]) + return SEQ_SKIP; + + for (j = 0; j < ARRAY_SIZE(dd->rcd[i]->opstats->stats); j++) + n_packets += dd->rcd[i]->opstats->stats[j].n_packets; + + if (!n_packets) + return SEQ_SKIP; + + seq_printf(s, " %llu:%llu\n", i, n_packets); + return 0; +} + +DEBUGFS_FILE(ctx_stats) + +static void *_qp_stats_seq_start(struct seq_file *s, loff_t *pos) +{ + struct qib_qp_iter *iter; + loff_t n = *pos; + + rcu_read_lock(); + iter = qib_qp_iter_init(s->private); + if (!iter) + return NULL; + + while (n--) { + if (qib_qp_iter_next(iter)) { + kfree(iter); + return NULL; + } + } + + return iter; +} + +static void *_qp_stats_seq_next(struct seq_file *s, void *iter_ptr, + loff_t *pos) +{ + struct qib_qp_iter *iter = iter_ptr; + + (*pos)++; + + if (qib_qp_iter_next(iter)) { + kfree(iter); + return NULL; + } + + return iter; +} + +static void _qp_stats_seq_stop(struct seq_file *s, void *iter_ptr) +{ + rcu_read_unlock(); +} + +static int _qp_stats_seq_show(struct seq_file *s, void *iter_ptr) +{ + struct qib_qp_iter *iter = iter_ptr; + + if (!iter) + return 0; + + qib_qp_iter_print(s, iter); + + return 0; +} + +DEBUGFS_FILE(qp_stats) + +void qib_dbg_ibdev_init(struct qib_ibdev *ibd) +{ + char name[10]; + + snprintf(name, sizeof(name), "qib%d", dd_from_dev(ibd)->unit); + ibd->qib_ibdev_dbg = debugfs_create_dir(name, qib_dbg_root); + if (!ibd->qib_ibdev_dbg) { + pr_warn("create of %s failed\n", name); + return; + } + DEBUGFS_FILE_CREATE(opcode_stats); + DEBUGFS_FILE_CREATE(ctx_stats); + DEBUGFS_FILE_CREATE(qp_stats); +} + +void qib_dbg_ibdev_exit(struct qib_ibdev *ibd) +{ + if (!qib_dbg_root) + goto out; + debugfs_remove_recursive(ibd->qib_ibdev_dbg); +out: + ibd->qib_ibdev_dbg = NULL; +} + +void qib_dbg_init(void) +{ + qib_dbg_root = debugfs_create_dir(QIB_DRV_NAME, NULL); + if (!qib_dbg_root) + pr_warn("init of debugfs failed\n"); +} + +void qib_dbg_exit(void) +{ + debugfs_remove_recursive(qib_dbg_root); + qib_dbg_root = NULL; +} + +#endif + diff --git a/kernel/drivers/infiniband/hw/qib/qib_debugfs.h b/kernel/drivers/infiniband/hw/qib/qib_debugfs.h new file mode 100644 index 000000000..7ae983a91 --- /dev/null +++ b/kernel/drivers/infiniband/hw/qib/qib_debugfs.h @@ -0,0 +1,45 @@ +#ifndef _QIB_DEBUGFS_H +#define _QIB_DEBUGFS_H + +#ifdef CONFIG_DEBUG_FS +/* + * Copyright (c) 2013 Intel Corporation. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. 
+ * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +struct qib_ibdev; +void qib_dbg_ibdev_init(struct qib_ibdev *ibd); +void qib_dbg_ibdev_exit(struct qib_ibdev *ibd); +void qib_dbg_init(void); +void qib_dbg_exit(void); + +#endif + +#endif /* _QIB_DEBUGFS_H */ diff --git a/kernel/drivers/infiniband/hw/qib/qib_diag.c b/kernel/drivers/infiniband/hw/qib/qib_diag.c new file mode 100644 index 000000000..8c34b23e5 --- /dev/null +++ b/kernel/drivers/infiniband/hw/qib/qib_diag.c @@ -0,0 +1,916 @@ +/* + * Copyright (c) 2012 Intel Corporation. All rights reserved. + * Copyright (c) 2006 - 2012 QLogic Corporation. All rights reserved. + * Copyright (c) 2003, 2004, 2005, 2006 PathScale, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +/* + * This file contains support for diagnostic functions. It is accessed by + * opening the qib_diag device, normally minor number 129. Diagnostic use + * of the QLogic_IB chip may render the chip or board unusable until the + * driver is unloaded, or in some cases, until the system is rebooted. + * + * Accesses to the chip through this interface are not similar to going + * through the /sys/bus/pci resource mmap interface. + */ + +#include +#include +#include +#include +#include +#include +#include + +#include "qib.h" +#include "qib_common.h" + +#undef pr_fmt +#define pr_fmt(fmt) QIB_DRV_NAME ": " fmt + +/* + * Each client that opens the diag device must read then write + * offset 0, to prevent lossage from random cat or od. diag_state + * sequences this "handshake". 
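+ * The sequence is OPENED -> INIT (first 8-byte read at offset 0) -> READY + * (matching 8-byte write); qib_diag_read() and qib_diag_write() below + * refuse any other access until READY is reached.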
+ */ +enum diag_state { UNUSED = 0, OPENED, INIT, READY }; + +/* State for an individual client. PID so children cannot abuse handshake */ +static struct qib_diag_client { + struct qib_diag_client *next; + struct qib_devdata *dd; + pid_t pid; + enum diag_state state; +} *client_pool; + +/* + * Get a client struct. Recycled if possible, else kmalloc. + * Must be called with qib_mutex held + */ +static struct qib_diag_client *get_client(struct qib_devdata *dd) +{ + struct qib_diag_client *dc; + + dc = client_pool; + if (dc) + /* got from pool remove it and use */ + client_pool = dc->next; + else + /* None in pool, alloc and init */ + dc = kmalloc(sizeof(*dc), GFP_KERNEL); + + if (dc) { + dc->next = NULL; + dc->dd = dd; + dc->pid = current->pid; + dc->state = OPENED; + } + return dc; +} + +/* + * Return to pool. Must be called with qib_mutex held + */ +static void return_client(struct qib_diag_client *dc) +{ + struct qib_devdata *dd = dc->dd; + struct qib_diag_client *tdc, *rdc; + + rdc = NULL; + if (dc == dd->diag_client) { + dd->diag_client = dc->next; + rdc = dc; + } else { + tdc = dc->dd->diag_client; + while (tdc) { + if (dc == tdc->next) { + tdc->next = dc->next; + rdc = dc; + break; + } + tdc = tdc->next; + } + } + if (rdc) { + rdc->state = UNUSED; + rdc->dd = NULL; + rdc->pid = 0; + rdc->next = client_pool; + client_pool = rdc; + } +} + +static int qib_diag_open(struct inode *in, struct file *fp); +static int qib_diag_release(struct inode *in, struct file *fp); +static ssize_t qib_diag_read(struct file *fp, char __user *data, + size_t count, loff_t *off); +static ssize_t qib_diag_write(struct file *fp, const char __user *data, + size_t count, loff_t *off); + +static const struct file_operations diag_file_ops = { + .owner = THIS_MODULE, + .write = qib_diag_write, + .read = qib_diag_read, + .open = qib_diag_open, + .release = qib_diag_release, + .llseek = default_llseek, +}; + +static atomic_t diagpkt_count = ATOMIC_INIT(0); +static struct cdev *diagpkt_cdev; +static struct device *diagpkt_device; + +static ssize_t qib_diagpkt_write(struct file *fp, const char __user *data, + size_t count, loff_t *off); + +static const struct file_operations diagpkt_file_ops = { + .owner = THIS_MODULE, + .write = qib_diagpkt_write, + .llseek = noop_llseek, +}; + +int qib_diag_add(struct qib_devdata *dd) +{ + char name[16]; + int ret = 0; + + if (atomic_inc_return(&diagpkt_count) == 1) { + ret = qib_cdev_init(QIB_DIAGPKT_MINOR, "ipath_diagpkt", + &diagpkt_file_ops, &diagpkt_cdev, + &diagpkt_device); + if (ret) + goto done; + } + + snprintf(name, sizeof(name), "ipath_diag%d", dd->unit); + ret = qib_cdev_init(QIB_DIAG_MINOR_BASE + dd->unit, name, + &diag_file_ops, &dd->diag_cdev, + &dd->diag_device); +done: + return ret; +} + +static void qib_unregister_observers(struct qib_devdata *dd); + +void qib_diag_remove(struct qib_devdata *dd) +{ + struct qib_diag_client *dc; + + if (atomic_dec_and_test(&diagpkt_count)) + qib_cdev_cleanup(&diagpkt_cdev, &diagpkt_device); + + qib_cdev_cleanup(&dd->diag_cdev, &dd->diag_device); + + /* + * Return all diag_clients of this device. 
There should be none, + * as we are "guaranteed" that no clients are still open + */ + while (dd->diag_client) + return_client(dd->diag_client); + + /* Now clean up all unused client structs */ + while (client_pool) { + dc = client_pool; + client_pool = dc->next; + kfree(dc); + } + /* Clean up observer list */ + qib_unregister_observers(dd); +} + +/* qib_remap_ioaddr32 - remap an offset into chip address space to __iomem * + * + * @dd: the qlogic_ib device + * @offs: the offset in chip-space + * @cntp: Pointer to max (byte) count for transfer starting at offset + * This returns a u32 __iomem * so it can be used for both 64 and 32-bit + * mapping. It is needed because with the use of PAT for control of + * write-combining, the logically contiguous address-space of the chip + * may be split into virtually non-contiguous spaces, with different + * attributes, which are them mapped to contiguous physical space + * based from the first BAR. + * + * The code below makes the same assumptions as were made in + * init_chip_wc_pat() (qib_init.c), copied here: + * Assumes chip address space looks like: + * - kregs + sregs + cregs + uregs (in any order) + * - piobufs (2K and 4K bufs in either order) + * or: + * - kregs + sregs + cregs (in any order) + * - piobufs (2K and 4K bufs in either order) + * - uregs + * + * If cntp is non-NULL, returns how many bytes from offset can be accessed + * Returns 0 if the offset is not mapped. + */ +static u32 __iomem *qib_remap_ioaddr32(struct qib_devdata *dd, u32 offset, + u32 *cntp) +{ + u32 kreglen; + u32 snd_bottom, snd_lim = 0; + u32 __iomem *krb32 = (u32 __iomem *)dd->kregbase; + u32 __iomem *map = NULL; + u32 cnt = 0; + u32 tot4k, offs4k; + + /* First, simplest case, offset is within the first map. */ + kreglen = (dd->kregend - dd->kregbase) * sizeof(u64); + if (offset < kreglen) { + map = krb32 + (offset / sizeof(u32)); + cnt = kreglen - offset; + goto mapped; + } + + /* + * Next check for user regs, the next most common case, + * and a cheap check because if they are not in the first map + * they are last in chip. + */ + if (dd->userbase) { + /* If user regs mapped, they are after send, so set limit. */ + u32 ulim = (dd->cfgctxts * dd->ureg_align) + dd->uregbase; + + if (!dd->piovl15base) + snd_lim = dd->uregbase; + krb32 = (u32 __iomem *)dd->userbase; + if (offset >= dd->uregbase && offset < ulim) { + map = krb32 + (offset - dd->uregbase) / sizeof(u32); + cnt = ulim - offset; + goto mapped; + } + } + + /* + * Lastly, check for offset within Send Buffers. + * This is gnarly because struct devdata is deliberately vague + * about things like 7322 VL15 buffers, and we are not in + * chip-specific code here, so should not make many assumptions. + * The one we _do_ make is that the only chip that has more sndbufs + * than we admit is the 7322, and it has userregs above that, so + * we know the snd_lim. + */ + /* Assume 2K buffers are first. */ + snd_bottom = dd->pio2k_bufbase; + if (snd_lim == 0) { + u32 tot2k = dd->piobcnt2k * ALIGN(dd->piosize2k, dd->palign); + + snd_lim = snd_bottom + tot2k; + } + /* If 4k buffers exist, account for them by bumping + * appropriate limit. + */ + tot4k = dd->piobcnt4k * dd->align4k; + offs4k = dd->piobufbase >> 32; + if (dd->piobcnt4k) { + if (snd_bottom > offs4k) + snd_bottom = offs4k; + else { + /* 4k above 2k. 
Bump snd_lim, if needed*/ + if (!dd->userbase || dd->piovl15base) + snd_lim = offs4k + tot4k; + } + } + /* + * Judgement call: can we ignore the space between SendBuffs and + * UserRegs, where we would like to see vl15 buffs, but not more? + */ + if (offset >= snd_bottom && offset < snd_lim) { + offset -= snd_bottom; + map = (u32 __iomem *)dd->piobase + (offset / sizeof(u32)); + cnt = snd_lim - offset; + } + + if (!map && offs4k && dd->piovl15base) { + snd_lim = offs4k + tot4k + 2 * dd->align4k; + if (offset >= (offs4k + tot4k) && offset < snd_lim) { + map = (u32 __iomem *)dd->piovl15base + + ((offset - (offs4k + tot4k)) / sizeof(u32)); + cnt = snd_lim - offset; + } + } + +mapped: + if (cntp) + *cntp = cnt; + return map; +} + +/* + * qib_read_umem64 - read a 64-bit quantity from the chip into user space + * @dd: the qlogic_ib device + * @uaddr: the location to store the data in user memory + * @regoffs: the offset from BAR0 (_NOT_ full pointer, anymore) + * @count: number of bytes to copy (multiple of 32 bits) + * + * This function also localizes all chip memory accesses. + * The copy should be written such that we read full cacheline packets + * from the chip. This is usually used for a single qword + * + * NOTE: This assumes the chip address is 64-bit aligned. + */ +static int qib_read_umem64(struct qib_devdata *dd, void __user *uaddr, + u32 regoffs, size_t count) +{ + const u64 __iomem *reg_addr; + const u64 __iomem *reg_end; + u32 limit; + int ret; + + reg_addr = (const u64 __iomem *)qib_remap_ioaddr32(dd, regoffs, &limit); + if (reg_addr == NULL || limit == 0 || !(dd->flags & QIB_PRESENT)) { + ret = -EINVAL; + goto bail; + } + if (count >= limit) + count = limit; + reg_end = reg_addr + (count / sizeof(u64)); + + /* not very efficient, but it works for now */ + while (reg_addr < reg_end) { + u64 data = readq(reg_addr); + + if (copy_to_user(uaddr, &data, sizeof(u64))) { + ret = -EFAULT; + goto bail; + } + reg_addr++; + uaddr += sizeof(u64); + } + ret = 0; +bail: + return ret; +} + +/* + * qib_write_umem64 - write a 64-bit quantity to the chip from user space + * @dd: the qlogic_ib device + * @regoffs: the offset from BAR0 (_NOT_ full pointer, anymore) + * @uaddr: the source of the data in user memory + * @count: the number of bytes to copy (multiple of 32 bits) + * + * This is usually used for a single qword + * NOTE: This assumes the chip address is 64-bit aligned. + */ + +static int qib_write_umem64(struct qib_devdata *dd, u32 regoffs, + const void __user *uaddr, size_t count) +{ + u64 __iomem *reg_addr; + const u64 __iomem *reg_end; + u32 limit; + int ret; + + reg_addr = (u64 __iomem *)qib_remap_ioaddr32(dd, regoffs, &limit); + if (reg_addr == NULL || limit == 0 || !(dd->flags & QIB_PRESENT)) { + ret = -EINVAL; + goto bail; + } + if (count >= limit) + count = limit; + reg_end = reg_addr + (count / sizeof(u64)); + + /* not very efficient, but it works for now */ + while (reg_addr < reg_end) { + u64 data; + + if (copy_from_user(&data, uaddr, sizeof(data))) { + ret = -EFAULT; + goto bail; + } + writeq(data, reg_addr); + + reg_addr++; + uaddr += sizeof(u64); + } + ret = 0; +bail: + return ret; +} + +/* + * qib_read_umem32 - read a 32-bit quantity from the chip into user space + * @dd: the qlogic_ib device + * @uaddr: the location to store the data in user memory + * @regoffs: the offset from BAR0 (_NOT_ full pointer, anymore) + * @count: number of bytes to copy + * + * read 32 bit values, not 64 bit; for memories that only + * support 32 bit reads; usually a single dword. 
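+ * As with the 64-bit variant, the offset is translated through + * qib_remap_ioaddr32() and the transfer is clamped to the number of bytes + * actually mapped at that offset.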
+ */ +static int qib_read_umem32(struct qib_devdata *dd, void __user *uaddr, + u32 regoffs, size_t count) +{ + const u32 __iomem *reg_addr; + const u32 __iomem *reg_end; + u32 limit; + int ret; + + reg_addr = qib_remap_ioaddr32(dd, regoffs, &limit); + if (reg_addr == NULL || limit == 0 || !(dd->flags & QIB_PRESENT)) { + ret = -EINVAL; + goto bail; + } + if (count >= limit) + count = limit; + reg_end = reg_addr + (count / sizeof(u32)); + + /* not very efficient, but it works for now */ + while (reg_addr < reg_end) { + u32 data = readl(reg_addr); + + if (copy_to_user(uaddr, &data, sizeof(data))) { + ret = -EFAULT; + goto bail; + } + + reg_addr++; + uaddr += sizeof(u32); + + } + ret = 0; +bail: + return ret; +} + +/* + * qib_write_umem32 - write a 32-bit quantity to the chip from user space + * @dd: the qlogic_ib device + * @regoffs: the offset from BAR0 (_NOT_ full pointer, anymore) + * @uaddr: the source of the data in user memory + * @count: number of bytes to copy + * + * write 32 bit values, not 64 bit; for memories that only + * support 32 bit write; usually a single dword. + */ + +static int qib_write_umem32(struct qib_devdata *dd, u32 regoffs, + const void __user *uaddr, size_t count) +{ + u32 __iomem *reg_addr; + const u32 __iomem *reg_end; + u32 limit; + int ret; + + reg_addr = qib_remap_ioaddr32(dd, regoffs, &limit); + if (reg_addr == NULL || limit == 0 || !(dd->flags & QIB_PRESENT)) { + ret = -EINVAL; + goto bail; + } + if (count >= limit) + count = limit; + reg_end = reg_addr + (count / sizeof(u32)); + + while (reg_addr < reg_end) { + u32 data; + + if (copy_from_user(&data, uaddr, sizeof(data))) { + ret = -EFAULT; + goto bail; + } + writel(data, reg_addr); + + reg_addr++; + uaddr += sizeof(u32); + } + ret = 0; +bail: + return ret; +} + +static int qib_diag_open(struct inode *in, struct file *fp) +{ + int unit = iminor(in) - QIB_DIAG_MINOR_BASE; + struct qib_devdata *dd; + struct qib_diag_client *dc; + int ret; + + mutex_lock(&qib_mutex); + + dd = qib_lookup(unit); + + if (dd == NULL || !(dd->flags & QIB_PRESENT) || + !dd->kregbase) { + ret = -ENODEV; + goto bail; + } + + dc = get_client(dd); + if (!dc) { + ret = -ENOMEM; + goto bail; + } + dc->next = dd->diag_client; + dd->diag_client = dc; + fp->private_data = dc; + ret = 0; +bail: + mutex_unlock(&qib_mutex); + + return ret; +} + +/** + * qib_diagpkt_write - write an IB packet + * @fp: the diag data device file pointer + * @data: qib_diag_pkt structure saying where to get the packet + * @count: size of data to write + * @off: unused by this code + */ +static ssize_t qib_diagpkt_write(struct file *fp, + const char __user *data, + size_t count, loff_t *off) +{ + u32 __iomem *piobuf; + u32 plen, pbufn, maxlen_reserve; + struct qib_diag_xpkt dp; + u32 *tmpbuf = NULL; + struct qib_devdata *dd; + struct qib_pportdata *ppd; + ssize_t ret = 0; + + if (count != sizeof(dp)) { + ret = -EINVAL; + goto bail; + } + if (copy_from_user(&dp, data, sizeof(dp))) { + ret = -EFAULT; + goto bail; + } + + dd = qib_lookup(dp.unit); + if (!dd || !(dd->flags & QIB_PRESENT) || !dd->kregbase) { + ret = -ENODEV; + goto bail; + } + if (!(dd->flags & QIB_INITTED)) { + /* no hardware, freeze, etc. 
*/ + ret = -ENODEV; + goto bail; + } + + if (dp.version != _DIAG_XPKT_VERS) { + qib_dev_err(dd, "Invalid version %u for diagpkt_write\n", + dp.version); + ret = -EINVAL; + goto bail; + } + /* send count must be an exact number of dwords */ + if (dp.len & 3) { + ret = -EINVAL; + goto bail; + } + if (!dp.port || dp.port > dd->num_pports) { + ret = -EINVAL; + goto bail; + } + ppd = &dd->pport[dp.port - 1]; + + /* + * need total length before first word written, plus 2 Dwords. One Dword + * is for padding so we get the full user data when not aligned on + * a word boundary. The other Dword is to make sure we have room for the + * ICRC which gets tacked on later. + */ + maxlen_reserve = 2 * sizeof(u32); + if (dp.len > ppd->ibmaxlen - maxlen_reserve) { + ret = -EINVAL; + goto bail; + } + + plen = sizeof(u32) + dp.len; + + tmpbuf = vmalloc(plen); + if (!tmpbuf) { + qib_devinfo(dd->pcidev, + "Unable to allocate tmp buffer, failing\n"); + ret = -ENOMEM; + goto bail; + } + + if (copy_from_user(tmpbuf, + (const void __user *) (unsigned long) dp.data, + dp.len)) { + ret = -EFAULT; + goto bail; + } + + plen >>= 2; /* in dwords */ + + if (dp.pbc_wd == 0) + dp.pbc_wd = plen; + + piobuf = dd->f_getsendbuf(ppd, dp.pbc_wd, &pbufn); + if (!piobuf) { + ret = -EBUSY; + goto bail; + } + /* disarm it just to be extra sure */ + dd->f_sendctrl(dd->pport, QIB_SENDCTRL_DISARM_BUF(pbufn)); + + /* disable header check on pbufn for this packet */ + dd->f_txchk_change(dd, pbufn, 1, TXCHK_CHG_TYPE_DIS1, NULL); + + writeq(dp.pbc_wd, piobuf); + /* + * Copy all but the trigger word, then flush, so it's written + * to chip before trigger word, then write trigger word, then + * flush again, so packet is sent. + */ + if (dd->flags & QIB_PIO_FLUSH_WC) { + qib_flush_wc(); + qib_pio_copy(piobuf + 2, tmpbuf, plen - 1); + qib_flush_wc(); + __raw_writel(tmpbuf[plen - 1], piobuf + plen + 1); + } else + qib_pio_copy(piobuf + 2, tmpbuf, plen); + + if (dd->flags & QIB_USE_SPCL_TRIG) { + u32 spcl_off = (pbufn >= dd->piobcnt2k) ? 2047 : 1023; + + qib_flush_wc(); + __raw_writel(0xaebecede, piobuf + spcl_off); + } + + /* + * Ensure buffer is written to the chip, then re-enable + * header checks (if supported by chip). The txchk + * code will ensure seen by chip before returning. + */ + qib_flush_wc(); + qib_sendbuf_done(dd, pbufn); + dd->f_txchk_change(dd, pbufn, 1, TXCHK_CHG_TYPE_ENAB1, NULL); + + ret = sizeof(dp); + +bail: + vfree(tmpbuf); + return ret; +} + +static int qib_diag_release(struct inode *in, struct file *fp) +{ + mutex_lock(&qib_mutex); + return_client(fp->private_data); + fp->private_data = NULL; + mutex_unlock(&qib_mutex); + return 0; +} + +/* + * Chip-specific code calls to register its interest in + * a specific range. 
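+ * Diag reads and writes that fall inside an observer's [bottom, top] range + * are routed through its hook() callback instead of touching the chip + * registers directly; see qib_diag_read() and qib_diag_write() below.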
+ */ +struct diag_observer_list_elt { + struct diag_observer_list_elt *next; + const struct diag_observer *op; +}; + +int qib_register_observer(struct qib_devdata *dd, + const struct diag_observer *op) +{ + struct diag_observer_list_elt *olp; + unsigned long flags; + + if (!dd || !op) + return -EINVAL; + olp = vmalloc(sizeof(*olp)); + if (!olp) { + pr_err("vmalloc for observer failed\n"); + return -ENOMEM; + } + + spin_lock_irqsave(&dd->qib_diag_trans_lock, flags); + olp->op = op; + olp->next = dd->diag_observer_list; + dd->diag_observer_list = olp; + spin_unlock_irqrestore(&dd->qib_diag_trans_lock, flags); + + return 0; +} + +/* Remove all registered observers when device is closed */ +static void qib_unregister_observers(struct qib_devdata *dd) +{ + struct diag_observer_list_elt *olp; + unsigned long flags; + + spin_lock_irqsave(&dd->qib_diag_trans_lock, flags); + olp = dd->diag_observer_list; + while (olp) { + /* Pop one observer, let go of lock */ + dd->diag_observer_list = olp->next; + spin_unlock_irqrestore(&dd->qib_diag_trans_lock, flags); + vfree(olp); + /* try again. */ + spin_lock_irqsave(&dd->qib_diag_trans_lock, flags); + olp = dd->diag_observer_list; + } + spin_unlock_irqrestore(&dd->qib_diag_trans_lock, flags); +} + +/* + * Find the observer, if any, for the specified address. Initial implementation + * is simple stack of observers. This must be called with diag transaction + * lock held. + */ +static const struct diag_observer *diag_get_observer(struct qib_devdata *dd, + u32 addr) +{ + struct diag_observer_list_elt *olp; + const struct diag_observer *op = NULL; + + olp = dd->diag_observer_list; + while (olp) { + op = olp->op; + if (addr >= op->bottom && addr <= op->top) + break; + olp = olp->next; + } + if (!olp) + op = NULL; + + return op; +} + +static ssize_t qib_diag_read(struct file *fp, char __user *data, + size_t count, loff_t *off) +{ + struct qib_diag_client *dc = fp->private_data; + struct qib_devdata *dd = dc->dd; + void __iomem *kreg_base; + ssize_t ret; + + if (dc->pid != current->pid) { + ret = -EPERM; + goto bail; + } + + kreg_base = dd->kregbase; + + if (count == 0) + ret = 0; + else if ((count % 4) || (*off % 4)) + /* address or length is not 32-bit aligned, hence invalid */ + ret = -EINVAL; + else if (dc->state < READY && (*off || count != 8)) + ret = -EINVAL; /* prevent cat /dev/qib_diag* */ + else { + unsigned long flags; + u64 data64 = 0; + int use_32; + const struct diag_observer *op; + + use_32 = (count % 8) || (*off % 8); + ret = -1; + spin_lock_irqsave(&dd->qib_diag_trans_lock, flags); + /* + * Check for observer on this address range. + * we only support a single 32 or 64-bit read + * via observer, currently. + */ + op = diag_get_observer(dd, *off); + if (op) { + u32 offset = *off; + + ret = op->hook(dd, op, offset, &data64, 0, use_32); + } + /* + * We need to release lock before any copy_to_user(), + * whether implicit in qib_read_umem* or explicit below. + */ + spin_unlock_irqrestore(&dd->qib_diag_trans_lock, flags); + if (!op) { + if (use_32) + /* + * Address or length is not 64-bit aligned; + * do 32-bit rd + */ + ret = qib_read_umem32(dd, data, (u32) *off, + count); + else + ret = qib_read_umem64(dd, data, (u32) *off, + count); + } else if (ret == count) { + /* Below finishes case where observer existed */ + ret = copy_to_user(data, &data64, use_32 ? 
+ sizeof(u32) : sizeof(u64)); + if (ret) + ret = -EFAULT; + } + } + + if (ret >= 0) { + *off += count; + ret = count; + if (dc->state == OPENED) + dc->state = INIT; + } +bail: + return ret; +} + +static ssize_t qib_diag_write(struct file *fp, const char __user *data, + size_t count, loff_t *off) +{ + struct qib_diag_client *dc = fp->private_data; + struct qib_devdata *dd = dc->dd; + void __iomem *kreg_base; + ssize_t ret; + + if (dc->pid != current->pid) { + ret = -EPERM; + goto bail; + } + + kreg_base = dd->kregbase; + + if (count == 0) + ret = 0; + else if ((count % 4) || (*off % 4)) + /* address or length is not 32-bit aligned, hence invalid */ + ret = -EINVAL; + else if (dc->state < READY && + ((*off || count != 8) || dc->state != INIT)) + /* No writes except second-step of init seq */ + ret = -EINVAL; /* before any other write allowed */ + else { + unsigned long flags; + const struct diag_observer *op = NULL; + int use_32 = (count % 8) || (*off % 8); + + /* + * Check for observer on this address range. + * We only support a single 32 or 64-bit write + * via observer, currently. This helps, because + * we would otherwise have to jump through hoops + * to make "diag transaction" meaningful when we + * cannot do a copy_from_user while holding the lock. + */ + if (count == 4 || count == 8) { + u64 data64; + u32 offset = *off; + + ret = copy_from_user(&data64, data, count); + if (ret) { + ret = -EFAULT; + goto bail; + } + spin_lock_irqsave(&dd->qib_diag_trans_lock, flags); + op = diag_get_observer(dd, *off); + if (op) + ret = op->hook(dd, op, offset, &data64, ~0Ull, + use_32); + spin_unlock_irqrestore(&dd->qib_diag_trans_lock, flags); + } + + if (!op) { + if (use_32) + /* + * Address or length is not 64-bit aligned; + * do 32-bit write + */ + ret = qib_write_umem32(dd, (u32) *off, data, + count); + else + ret = qib_write_umem64(dd, (u32) *off, data, + count); + } + } + + if (ret >= 0) { + *off += count; + ret = count; + if (dc->state == INIT) + dc->state = READY; /* all read/write OK now */ + } +bail: + return ret; +} diff --git a/kernel/drivers/infiniband/hw/qib/qib_dma.c b/kernel/drivers/infiniband/hw/qib/qib_dma.c new file mode 100644 index 000000000..59fe092b4 --- /dev/null +++ b/kernel/drivers/infiniband/hw/qib/qib_dma.c @@ -0,0 +1,169 @@ +/* + * Copyright (c) 2006, 2009, 2010 QLogic, Corporation. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include +#include + +#include "qib_verbs.h" + +#define BAD_DMA_ADDRESS ((u64) 0) + +/* + * The following functions implement driver specific replacements + * for the ib_dma_*() functions. + * + * These functions return kernel virtual addresses instead of + * device bus addresses since the driver uses the CPU to copy + * data instead of using hardware DMA. + */ + +static int qib_mapping_error(struct ib_device *dev, u64 dma_addr) +{ + return dma_addr == BAD_DMA_ADDRESS; +} + +static u64 qib_dma_map_single(struct ib_device *dev, void *cpu_addr, + size_t size, enum dma_data_direction direction) +{ + BUG_ON(!valid_dma_direction(direction)); + return (u64) cpu_addr; +} + +static void qib_dma_unmap_single(struct ib_device *dev, u64 addr, size_t size, + enum dma_data_direction direction) +{ + BUG_ON(!valid_dma_direction(direction)); +} + +static u64 qib_dma_map_page(struct ib_device *dev, struct page *page, + unsigned long offset, size_t size, + enum dma_data_direction direction) +{ + u64 addr; + + BUG_ON(!valid_dma_direction(direction)); + + if (offset + size > PAGE_SIZE) { + addr = BAD_DMA_ADDRESS; + goto done; + } + + addr = (u64) page_address(page); + if (addr) + addr += offset; + /* TODO: handle highmem pages */ + +done: + return addr; +} + +static void qib_dma_unmap_page(struct ib_device *dev, u64 addr, size_t size, + enum dma_data_direction direction) +{ + BUG_ON(!valid_dma_direction(direction)); +} + +static int qib_map_sg(struct ib_device *dev, struct scatterlist *sgl, + int nents, enum dma_data_direction direction) +{ + struct scatterlist *sg; + u64 addr; + int i; + int ret = nents; + + BUG_ON(!valid_dma_direction(direction)); + + for_each_sg(sgl, sg, nents, i) { + addr = (u64) page_address(sg_page(sg)); + /* TODO: handle highmem pages */ + if (!addr) { + ret = 0; + break; + } + sg->dma_address = addr + sg->offset; +#ifdef CONFIG_NEED_SG_DMA_LENGTH + sg->dma_length = sg->length; +#endif + } + return ret; +} + +static void qib_unmap_sg(struct ib_device *dev, + struct scatterlist *sg, int nents, + enum dma_data_direction direction) +{ + BUG_ON(!valid_dma_direction(direction)); +} + +static void qib_sync_single_for_cpu(struct ib_device *dev, u64 addr, + size_t size, enum dma_data_direction dir) +{ +} + +static void qib_sync_single_for_device(struct ib_device *dev, u64 addr, + size_t size, + enum dma_data_direction dir) +{ +} + +static void *qib_dma_alloc_coherent(struct ib_device *dev, size_t size, + u64 *dma_handle, gfp_t flag) +{ + struct page *p; + void *addr = NULL; + + p = alloc_pages(flag, get_order(size)); + if (p) + addr = page_address(p); + if (dma_handle) + *dma_handle = (u64) addr; + return addr; +} + +static void qib_dma_free_coherent(struct ib_device *dev, size_t size, + void *cpu_addr, u64 dma_handle) +{ + free_pages((unsigned long) cpu_addr, get_order(size)); +} + +struct ib_dma_mapping_ops qib_dma_mapping_ops = { + .mapping_error = qib_mapping_error, + .map_single = qib_dma_map_single, + .unmap_single = qib_dma_unmap_single, + .map_page = qib_dma_map_page, + .unmap_page = qib_dma_unmap_page, + .map_sg = qib_map_sg, + .unmap_sg = qib_unmap_sg, + .sync_single_for_cpu = qib_sync_single_for_cpu, + .sync_single_for_device = qib_sync_single_for_device, + .alloc_coherent = qib_dma_alloc_coherent, + 
.free_coherent = qib_dma_free_coherent +}; diff --git a/kernel/drivers/infiniband/hw/qib/qib_driver.c b/kernel/drivers/infiniband/hw/qib/qib_driver.c new file mode 100644 index 000000000..f58fdc3d2 --- /dev/null +++ b/kernel/drivers/infiniband/hw/qib/qib_driver.c @@ -0,0 +1,820 @@ +/* + * Copyright (c) 2013 Intel Corporation. All rights reserved. + * Copyright (c) 2006, 2007, 2008, 2009 QLogic Corporation. All rights reserved. + * Copyright (c) 2003, 2004, 2005, 2006 PathScale, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "qib.h" + +/* + * The size has to be longer than this string, so we can append + * board/chip information to it in the init code. + */ +const char ib_qib_version[] = QIB_DRIVER_VERSION "\n"; + +DEFINE_SPINLOCK(qib_devs_lock); +LIST_HEAD(qib_dev_list); +DEFINE_MUTEX(qib_mutex); /* general driver use */ + +unsigned qib_ibmtu; +module_param_named(ibmtu, qib_ibmtu, uint, S_IRUGO); +MODULE_PARM_DESC(ibmtu, "Set max IB MTU (0=2KB, 1=256, 2=512, ... 5=4096"); + +unsigned qib_compat_ddr_negotiate = 1; +module_param_named(compat_ddr_negotiate, qib_compat_ddr_negotiate, uint, + S_IWUSR | S_IRUGO); +MODULE_PARM_DESC(compat_ddr_negotiate, + "Attempt pre-IBTA 1.2 DDR speed negotiation"); + +MODULE_LICENSE("Dual BSD/GPL"); +MODULE_AUTHOR("Intel "); +MODULE_DESCRIPTION("Intel IB driver"); +MODULE_VERSION(QIB_DRIVER_VERSION); + +/* + * QIB_PIO_MAXIBHDR is the max IB header size allowed for in our + * PIO send buffers. This is well beyond anything currently + * defined in the InfiniBand spec. + */ +#define QIB_PIO_MAXIBHDR 128 + +/* + * QIB_MAX_PKT_RCV is the max # if packets processed per receive interrupt. + */ +#define QIB_MAX_PKT_RECV 64 + +struct qlogic_ib_stats qib_stats; + +const char *qib_get_unit_name(int unit) +{ + static char iname[16]; + + snprintf(iname, sizeof(iname), "infinipath%u", unit); + return iname; +} + +/* + * Return count of units with at least one port ACTIVE. 
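+ * A port is counted as soon as it has a LID and its link is in the INIT, + * ARMED or ACTIVE state.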
+ */ +int qib_count_active_units(void) +{ + struct qib_devdata *dd; + struct qib_pportdata *ppd; + unsigned long flags; + int pidx, nunits_active = 0; + + spin_lock_irqsave(&qib_devs_lock, flags); + list_for_each_entry(dd, &qib_dev_list, list) { + if (!(dd->flags & QIB_PRESENT) || !dd->kregbase) + continue; + for (pidx = 0; pidx < dd->num_pports; ++pidx) { + ppd = dd->pport + pidx; + if (ppd->lid && (ppd->lflags & (QIBL_LINKINIT | + QIBL_LINKARMED | QIBL_LINKACTIVE))) { + nunits_active++; + break; + } + } + } + spin_unlock_irqrestore(&qib_devs_lock, flags); + return nunits_active; +} + +/* + * Return count of all units, optionally return in arguments + * the number of usable (present) units, and the number of + * ports that are up. + */ +int qib_count_units(int *npresentp, int *nupp) +{ + int nunits = 0, npresent = 0, nup = 0; + struct qib_devdata *dd; + unsigned long flags; + int pidx; + struct qib_pportdata *ppd; + + spin_lock_irqsave(&qib_devs_lock, flags); + + list_for_each_entry(dd, &qib_dev_list, list) { + nunits++; + if ((dd->flags & QIB_PRESENT) && dd->kregbase) + npresent++; + for (pidx = 0; pidx < dd->num_pports; ++pidx) { + ppd = dd->pport + pidx; + if (ppd->lid && (ppd->lflags & (QIBL_LINKINIT | + QIBL_LINKARMED | QIBL_LINKACTIVE))) + nup++; + } + } + + spin_unlock_irqrestore(&qib_devs_lock, flags); + + if (npresentp) + *npresentp = npresent; + if (nupp) + *nupp = nup; + + return nunits; +} + +/** + * qib_wait_linkstate - wait for an IB link state change to occur + * @dd: the qlogic_ib device + * @state: the state to wait for + * @msecs: the number of milliseconds to wait + * + * wait up to msecs milliseconds for IB link state change to occur for + * now, take the easy polling route. Currently used only by + * qib_set_linkstate. Returns 0 if state reached, otherwise + * -ETIMEDOUT state can have multiple states set, for any of several + * transitions. 
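+ * The wait is interruptible and bounded by @msecs; ppd->lflags is checked + * again afterwards, so a pending signal is reported as -ETIMEDOUT.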
+ */ +int qib_wait_linkstate(struct qib_pportdata *ppd, u32 state, int msecs) +{ + int ret; + unsigned long flags; + + spin_lock_irqsave(&ppd->lflags_lock, flags); + if (ppd->state_wanted) { + spin_unlock_irqrestore(&ppd->lflags_lock, flags); + ret = -EBUSY; + goto bail; + } + ppd->state_wanted = state; + spin_unlock_irqrestore(&ppd->lflags_lock, flags); + wait_event_interruptible_timeout(ppd->state_wait, + (ppd->lflags & state), + msecs_to_jiffies(msecs)); + spin_lock_irqsave(&ppd->lflags_lock, flags); + ppd->state_wanted = 0; + spin_unlock_irqrestore(&ppd->lflags_lock, flags); + + if (!(ppd->lflags & state)) + ret = -ETIMEDOUT; + else + ret = 0; +bail: + return ret; +} + +int qib_set_linkstate(struct qib_pportdata *ppd, u8 newstate) +{ + u32 lstate; + int ret; + struct qib_devdata *dd = ppd->dd; + unsigned long flags; + + switch (newstate) { + case QIB_IB_LINKDOWN_ONLY: + dd->f_set_ib_cfg(ppd, QIB_IB_CFG_LSTATE, + IB_LINKCMD_DOWN | IB_LINKINITCMD_NOP); + /* don't wait */ + ret = 0; + goto bail; + + case QIB_IB_LINKDOWN: + dd->f_set_ib_cfg(ppd, QIB_IB_CFG_LSTATE, + IB_LINKCMD_DOWN | IB_LINKINITCMD_POLL); + /* don't wait */ + ret = 0; + goto bail; + + case QIB_IB_LINKDOWN_SLEEP: + dd->f_set_ib_cfg(ppd, QIB_IB_CFG_LSTATE, + IB_LINKCMD_DOWN | IB_LINKINITCMD_SLEEP); + /* don't wait */ + ret = 0; + goto bail; + + case QIB_IB_LINKDOWN_DISABLE: + dd->f_set_ib_cfg(ppd, QIB_IB_CFG_LSTATE, + IB_LINKCMD_DOWN | IB_LINKINITCMD_DISABLE); + /* don't wait */ + ret = 0; + goto bail; + + case QIB_IB_LINKARM: + if (ppd->lflags & QIBL_LINKARMED) { + ret = 0; + goto bail; + } + if (!(ppd->lflags & (QIBL_LINKINIT | QIBL_LINKACTIVE))) { + ret = -EINVAL; + goto bail; + } + /* + * Since the port can be ACTIVE when we ask for ARMED, + * clear QIBL_LINKV so we can wait for a transition. + * If the link isn't ARMED, then something else happened + * and there is no point waiting for ARMED. + */ + spin_lock_irqsave(&ppd->lflags_lock, flags); + ppd->lflags &= ~QIBL_LINKV; + spin_unlock_irqrestore(&ppd->lflags_lock, flags); + dd->f_set_ib_cfg(ppd, QIB_IB_CFG_LSTATE, + IB_LINKCMD_ARMED | IB_LINKINITCMD_NOP); + lstate = QIBL_LINKV; + break; + + case QIB_IB_LINKACTIVE: + if (ppd->lflags & QIBL_LINKACTIVE) { + ret = 0; + goto bail; + } + if (!(ppd->lflags & QIBL_LINKARMED)) { + ret = -EINVAL; + goto bail; + } + dd->f_set_ib_cfg(ppd, QIB_IB_CFG_LSTATE, + IB_LINKCMD_ACTIVE | IB_LINKINITCMD_NOP); + lstate = QIBL_LINKACTIVE; + break; + + default: + ret = -EINVAL; + goto bail; + } + ret = qib_wait_linkstate(ppd, lstate, 10); + +bail: + return ret; +} + +/* + * Get address of eager buffer from it's index (allocated in chunks, not + * contiguous). + */ +static inline void *qib_get_egrbuf(const struct qib_ctxtdata *rcd, u32 etail) +{ + const u32 chunk = etail >> rcd->rcvegrbufs_perchunk_shift; + const u32 idx = etail & ((u32)rcd->rcvegrbufs_perchunk - 1); + + return rcd->rcvegrbuf[chunk] + (idx << rcd->dd->rcvegrbufsize_shift); +} + +/* + * Returns 1 if error was a CRC, else 0. + * Needed for some chip's synthesized error counters. 
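+ * For TID errors on RC QPs this also schedules a sequence NAK (using the + * expected PSN) so the sender retransmits instead of the packet being + * silently dropped.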
+ */ +static u32 qib_rcv_hdrerr(struct qib_ctxtdata *rcd, struct qib_pportdata *ppd, + u32 ctxt, u32 eflags, u32 l, u32 etail, + __le32 *rhf_addr, struct qib_message_header *rhdr) +{ + u32 ret = 0; + + if (eflags & (QLOGIC_IB_RHF_H_ICRCERR | QLOGIC_IB_RHF_H_VCRCERR)) + ret = 1; + else if (eflags == QLOGIC_IB_RHF_H_TIDERR) { + /* For TIDERR and RC QPs premptively schedule a NAK */ + struct qib_ib_header *hdr = (struct qib_ib_header *) rhdr; + struct qib_other_headers *ohdr = NULL; + struct qib_ibport *ibp = &ppd->ibport_data; + struct qib_qp *qp = NULL; + u32 tlen = qib_hdrget_length_in_bytes(rhf_addr); + u16 lid = be16_to_cpu(hdr->lrh[1]); + int lnh = be16_to_cpu(hdr->lrh[0]) & 3; + u32 qp_num; + u32 opcode; + u32 psn; + int diff; + + /* Sanity check packet */ + if (tlen < 24) + goto drop; + + if (lid < QIB_MULTICAST_LID_BASE) { + lid &= ~((1 << ppd->lmc) - 1); + if (unlikely(lid != ppd->lid)) + goto drop; + } + + /* Check for GRH */ + if (lnh == QIB_LRH_BTH) + ohdr = &hdr->u.oth; + else if (lnh == QIB_LRH_GRH) { + u32 vtf; + + ohdr = &hdr->u.l.oth; + if (hdr->u.l.grh.next_hdr != IB_GRH_NEXT_HDR) + goto drop; + vtf = be32_to_cpu(hdr->u.l.grh.version_tclass_flow); + if ((vtf >> IB_GRH_VERSION_SHIFT) != IB_GRH_VERSION) + goto drop; + } else + goto drop; + + /* Get opcode and PSN from packet */ + opcode = be32_to_cpu(ohdr->bth[0]); + opcode >>= 24; + psn = be32_to_cpu(ohdr->bth[2]); + + /* Get the destination QP number. */ + qp_num = be32_to_cpu(ohdr->bth[1]) & QIB_QPN_MASK; + if (qp_num != QIB_MULTICAST_QPN) { + int ruc_res; + + qp = qib_lookup_qpn(ibp, qp_num); + if (!qp) + goto drop; + + /* + * Handle only RC QPs - for other QP types drop error + * packet. + */ + spin_lock(&qp->r_lock); + + /* Check for valid receive state. */ + if (!(ib_qib_state_ops[qp->state] & + QIB_PROCESS_RECV_OK)) { + ibp->n_pkt_drops++; + goto unlock; + } + + switch (qp->ibqp.qp_type) { + case IB_QPT_RC: + ruc_res = + qib_ruc_check_hdr( + ibp, hdr, + lnh == QIB_LRH_GRH, + qp, + be32_to_cpu(ohdr->bth[0])); + if (ruc_res) + goto unlock; + + /* Only deal with RDMA Writes for now */ + if (opcode < + IB_OPCODE_RC_RDMA_READ_RESPONSE_FIRST) { + diff = qib_cmp24(psn, qp->r_psn); + if (!qp->r_nak_state && diff >= 0) { + ibp->n_rc_seqnak++; + qp->r_nak_state = + IB_NAK_PSN_ERROR; + /* Use the expected PSN. */ + qp->r_ack_psn = qp->r_psn; + /* + * Wait to send the sequence + * NAK until all packets + * in the receive queue have + * been processed. + * Otherwise, we end up + * propagating congestion. + */ + if (list_empty(&qp->rspwait)) { + qp->r_flags |= + QIB_R_RSP_NAK; + atomic_inc( + &qp->refcount); + list_add_tail( + &qp->rspwait, + &rcd->qp_wait_list); + } + } /* Out of sequence NAK */ + } /* QP Request NAKs */ + break; + case IB_QPT_SMI: + case IB_QPT_GSI: + case IB_QPT_UD: + case IB_QPT_UC: + default: + /* For now don't handle any other QP types */ + break; + } + +unlock: + spin_unlock(&qp->r_lock); + /* + * Notify qib_destroy_qp() if it is waiting + * for us to finish. + */ + if (atomic_dec_and_test(&qp->refcount)) + wake_up(&qp->wait); + } /* Unicast QP */ + } /* Valid packet with TIDErr */ + +drop: + return ret; +} + +/* + * qib_kreceive - receive a packet + * @rcd: the qlogic_ib context + * @llic: gets count of good packets needed to clear lli, + * (used with chips that need need to track crcs for lli) + * + * called from interrupt handler for errors or receive interrupt + * Returns number of CRC error packets, needed by some chips for + * local link integrity tracking. 
crcs are adjusted down by following + * good packets, if any, and count of good packets is also tracked. + */ +u32 qib_kreceive(struct qib_ctxtdata *rcd, u32 *llic, u32 *npkts) +{ + struct qib_devdata *dd = rcd->dd; + struct qib_pportdata *ppd = rcd->ppd; + __le32 *rhf_addr; + void *ebuf; + const u32 rsize = dd->rcvhdrentsize; /* words */ + const u32 maxcnt = dd->rcvhdrcnt * rsize; /* words */ + u32 etail = -1, l, hdrqtail; + struct qib_message_header *hdr; + u32 eflags, etype, tlen, i = 0, updegr = 0, crcs = 0; + int last; + u64 lval; + struct qib_qp *qp, *nqp; + + l = rcd->head; + rhf_addr = (__le32 *) rcd->rcvhdrq + l + dd->rhf_offset; + if (dd->flags & QIB_NODMA_RTAIL) { + u32 seq = qib_hdrget_seq(rhf_addr); + + if (seq != rcd->seq_cnt) + goto bail; + hdrqtail = 0; + } else { + hdrqtail = qib_get_rcvhdrtail(rcd); + if (l == hdrqtail) + goto bail; + smp_rmb(); /* prevent speculative reads of dma'ed hdrq */ + } + + for (last = 0, i = 1; !last; i += !last) { + hdr = dd->f_get_msgheader(dd, rhf_addr); + eflags = qib_hdrget_err_flags(rhf_addr); + etype = qib_hdrget_rcv_type(rhf_addr); + /* total length */ + tlen = qib_hdrget_length_in_bytes(rhf_addr); + ebuf = NULL; + if ((dd->flags & QIB_NODMA_RTAIL) ? + qib_hdrget_use_egr_buf(rhf_addr) : + (etype != RCVHQ_RCV_TYPE_EXPECTED)) { + etail = qib_hdrget_index(rhf_addr); + updegr = 1; + if (tlen > sizeof(*hdr) || + etype >= RCVHQ_RCV_TYPE_NON_KD) { + ebuf = qib_get_egrbuf(rcd, etail); + prefetch_range(ebuf, tlen - sizeof(*hdr)); + } + } + if (!eflags) { + u16 lrh_len = be16_to_cpu(hdr->lrh[2]) << 2; + + if (lrh_len != tlen) { + qib_stats.sps_lenerrs++; + goto move_along; + } + } + if (etype == RCVHQ_RCV_TYPE_NON_KD && !eflags && + ebuf == NULL && + tlen > (dd->rcvhdrentsize - 2 + 1 - + qib_hdrget_offset(rhf_addr)) << 2) { + goto move_along; + } + + /* + * Both tiderr and qibhdrerr are set for all plain IB + * packets; only qibhdrerr should be set. + */ + if (unlikely(eflags)) + crcs += qib_rcv_hdrerr(rcd, ppd, rcd->ctxt, eflags, l, + etail, rhf_addr, hdr); + else if (etype == RCVHQ_RCV_TYPE_NON_KD) { + qib_ib_rcv(rcd, hdr, ebuf, tlen); + if (crcs) + crcs--; + else if (llic && *llic) + --*llic; + } +move_along: + l += rsize; + if (l >= maxcnt) + l = 0; + if (i == QIB_MAX_PKT_RECV) + last = 1; + + rhf_addr = (__le32 *) rcd->rcvhdrq + l + dd->rhf_offset; + if (dd->flags & QIB_NODMA_RTAIL) { + u32 seq = qib_hdrget_seq(rhf_addr); + + if (++rcd->seq_cnt > 13) + rcd->seq_cnt = 1; + if (seq != rcd->seq_cnt) + last = 1; + } else if (l == hdrqtail) + last = 1; + /* + * Update head regs etc., every 16 packets, if not last pkt, + * to help prevent rcvhdrq overflows, when many packets + * are processed and queue is nearly full. + * Don't request an interrupt for intermediate updates. + */ + lval = l; + if (!last && !(i & 0xf)) { + dd->f_update_usrhead(rcd, lval, updegr, etail, i); + updegr = 0; + } + } + /* + * Notify qib_destroy_qp() if it is waiting + * for lookaside_qp to finish. + */ + if (rcd->lookaside_qp) { + if (atomic_dec_and_test(&rcd->lookaside_qp->refcount)) + wake_up(&rcd->lookaside_qp->wait); + rcd->lookaside_qp = NULL; + } + + rcd->head = l; + + /* + * Iterate over all QPs waiting to respond. + * The list won't change since the IRQ is only run on one CPU. 
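+ * Each queued QP has a deferred response flagged on it: QIB_R_RSP_NAK sends + * the delayed RC ACK/NAK, QIB_R_RSP_SEND reschedules the send engine.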
+ */ + list_for_each_entry_safe(qp, nqp, &rcd->qp_wait_list, rspwait) { + list_del_init(&qp->rspwait); + if (qp->r_flags & QIB_R_RSP_NAK) { + qp->r_flags &= ~QIB_R_RSP_NAK; + qib_send_rc_ack(qp); + } + if (qp->r_flags & QIB_R_RSP_SEND) { + unsigned long flags; + + qp->r_flags &= ~QIB_R_RSP_SEND; + spin_lock_irqsave(&qp->s_lock, flags); + if (ib_qib_state_ops[qp->state] & + QIB_PROCESS_OR_FLUSH_SEND) + qib_schedule_send(qp); + spin_unlock_irqrestore(&qp->s_lock, flags); + } + if (atomic_dec_and_test(&qp->refcount)) + wake_up(&qp->wait); + } + +bail: + /* Report number of packets consumed */ + if (npkts) + *npkts = i; + + /* + * Always write head at end, and setup rcv interrupt, even + * if no packets were processed. + */ + lval = (u64)rcd->head | dd->rhdrhead_intr_off; + dd->f_update_usrhead(rcd, lval, updegr, etail, i); + return crcs; +} + +/** + * qib_set_mtu - set the MTU + * @ppd: the perport data + * @arg: the new MTU + * + * We can handle "any" incoming size, the issue here is whether we + * need to restrict our outgoing size. For now, we don't do any + * sanity checking on this, and we don't deal with what happens to + * programs that are already running when the size changes. + * NOTE: changing the MTU will usually cause the IBC to go back to + * link INIT state... + */ +int qib_set_mtu(struct qib_pportdata *ppd, u16 arg) +{ + u32 piosize; + int ret, chk; + + if (arg != 256 && arg != 512 && arg != 1024 && arg != 2048 && + arg != 4096) { + ret = -EINVAL; + goto bail; + } + chk = ib_mtu_enum_to_int(qib_ibmtu); + if (chk > 0 && arg > chk) { + ret = -EINVAL; + goto bail; + } + + piosize = ppd->ibmaxlen; + ppd->ibmtu = arg; + + if (arg >= (piosize - QIB_PIO_MAXIBHDR)) { + /* Only if it's not the initial value (or reset to it) */ + if (piosize != ppd->init_ibmaxlen) { + if (arg > piosize && arg <= ppd->init_ibmaxlen) + piosize = ppd->init_ibmaxlen - 2 * sizeof(u32); + ppd->ibmaxlen = piosize; + } + } else if ((arg + QIB_PIO_MAXIBHDR) != ppd->ibmaxlen) { + piosize = arg + QIB_PIO_MAXIBHDR - 2 * sizeof(u32); + ppd->ibmaxlen = piosize; + } + + ppd->dd->f_set_ib_cfg(ppd, QIB_IB_CFG_MTU, 0); + + ret = 0; + +bail: + return ret; +} + +int qib_set_lid(struct qib_pportdata *ppd, u32 lid, u8 lmc) +{ + struct qib_devdata *dd = ppd->dd; + + ppd->lid = lid; + ppd->lmc = lmc; + + dd->f_set_ib_cfg(ppd, QIB_IB_CFG_LIDLMC, + lid | (~((1U << lmc) - 1)) << 16); + + qib_devinfo(dd->pcidev, "IB%u:%u got a lid: 0x%x\n", + dd->unit, ppd->port, lid); + + return 0; +} + +/* + * Following deal with the "obviously simple" task of overriding the state + * of the LEDS, which normally indicate link physical and logical status. + * The complications arise in dealing with different hardware mappings + * and the board-dependent routine being called from interrupts. + * and then there's the requirement to _flash_ them. 
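+ * The override value packs a blink frequency in its upper byte and one LED + * pattern per blink phase in its two low nybbles; see qib_set_led_override().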
+ */ +#define LED_OVER_FREQ_SHIFT 8 +#define LED_OVER_FREQ_MASK (0xFF<<LED_OVER_FREQ_SHIFT) +/* Below is "non-zero" to force override, but both actual LEDs are off */ +#define LED_OVER_BOTH_OFF (8) + +static void qib_run_led_override(unsigned long opaque) +{ + struct qib_pportdata *ppd = (struct qib_pportdata *)opaque; + struct qib_devdata *dd = ppd->dd; + int timeoff; + int ph_idx; + + if (!(dd->flags & QIB_INITTED)) + return; + + ph_idx = ppd->led_override_phase++ & 1; + ppd->led_override = ppd->led_override_vals[ph_idx]; + timeoff = ppd->led_override_timeoff; + + dd->f_setextled(ppd, 1); + /* + * don't re-fire the timer if user asked for it to be off; we let + * it fire one more time after they turn it off to simplify + */ + if (ppd->led_override_vals[0] || ppd->led_override_vals[1]) + mod_timer(&ppd->led_override_timer, jiffies + timeoff); +} + +void qib_set_led_override(struct qib_pportdata *ppd, unsigned int val) +{ + struct qib_devdata *dd = ppd->dd; + int timeoff, freq; + + if (!(dd->flags & QIB_INITTED)) + return; + + /* First check if we are blinking. If not, use 1HZ polling */ + timeoff = HZ; + freq = (val & LED_OVER_FREQ_MASK) >> LED_OVER_FREQ_SHIFT; + + if (freq) { + /* For blink, set each phase from one nybble of val */ + ppd->led_override_vals[0] = val & 0xF; + ppd->led_override_vals[1] = (val >> 4) & 0xF; + timeoff = (HZ << 4)/freq; + } else { + /* Non-blink set both phases the same. */ + ppd->led_override_vals[0] = val & 0xF; + ppd->led_override_vals[1] = val & 0xF; + } + ppd->led_override_timeoff = timeoff; + + /* + * If the timer has not already been started, do so. Use a "quick" + * timeout so the function will be called soon, to look at our request. + */ + if (atomic_inc_return(&ppd->led_override_timer_active) == 1) { + /* Need to start timer */ + init_timer(&ppd->led_override_timer); + ppd->led_override_timer.function = qib_run_led_override; + ppd->led_override_timer.data = (unsigned long) ppd; + ppd->led_override_timer.expires = jiffies + 1; + add_timer(&ppd->led_override_timer); + } else { + if (ppd->led_override_vals[0] || ppd->led_override_vals[1]) + mod_timer(&ppd->led_override_timer, jiffies + 1); + atomic_dec(&ppd->led_override_timer_active); + } +} + +/** + * qib_reset_device - reset the chip if possible + * @unit: the device to reset + * + * Whether or not reset is successful, we attempt to re-initialize the chip + * (that is, much like a driver unload/reload). We clear the INITTED flag + * so that the various entry points will fail until we reinitialize.
For + * now, we only allow this if no user contexts are open that use chip resources + */ +int qib_reset_device(int unit) +{ + int ret, i; + struct qib_devdata *dd = qib_lookup(unit); + struct qib_pportdata *ppd; + unsigned long flags; + int pidx; + + if (!dd) { + ret = -ENODEV; + goto bail; + } + + qib_devinfo(dd->pcidev, "Reset on unit %u requested\n", unit); + + if (!dd->kregbase || !(dd->flags & QIB_PRESENT)) { + qib_devinfo(dd->pcidev, + "Invalid unit number %u or not initialized or not present\n", + unit); + ret = -ENXIO; + goto bail; + } + + spin_lock_irqsave(&dd->uctxt_lock, flags); + if (dd->rcd) + for (i = dd->first_user_ctxt; i < dd->cfgctxts; i++) { + if (!dd->rcd[i] || !dd->rcd[i]->cnt) + continue; + spin_unlock_irqrestore(&dd->uctxt_lock, flags); + ret = -EBUSY; + goto bail; + } + spin_unlock_irqrestore(&dd->uctxt_lock, flags); + + for (pidx = 0; pidx < dd->num_pports; ++pidx) { + ppd = dd->pport + pidx; + if (atomic_read(&ppd->led_override_timer_active)) { + /* Need to stop LED timer, _then_ shut off LEDs */ + del_timer_sync(&ppd->led_override_timer); + atomic_set(&ppd->led_override_timer_active, 0); + } + + /* Shut off LEDs after we are sure timer is not running */ + ppd->led_override = LED_OVER_BOTH_OFF; + dd->f_setextled(ppd, 0); + if (dd->flags & QIB_HAS_SEND_DMA) + qib_teardown_sdma(ppd); + } + + ret = dd->f_reset(dd); + if (ret == 1) + ret = qib_init(dd, 1); + else + ret = -EAGAIN; + if (ret) + qib_dev_err(dd, + "Reinitialize unit %u after reset failed with %d\n", + unit, ret); + else + qib_devinfo(dd->pcidev, + "Reinitialized unit %u after resetting\n", + unit); + +bail: + return ret; +} diff --git a/kernel/drivers/infiniband/hw/qib/qib_eeprom.c b/kernel/drivers/infiniband/hw/qib/qib_eeprom.c new file mode 100644 index 000000000..311ee6c3d --- /dev/null +++ b/kernel/drivers/infiniband/hw/qib/qib_eeprom.c @@ -0,0 +1,276 @@ +/* + * Copyright (c) 2012 Intel Corporation. All rights reserved. + * Copyright (c) 2006 - 2012 QLogic Corporation. All rights reserved. + * Copyright (c) 2003, 2004, 2005, 2006 PathScale, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include +#include +#include + +#include "qib.h" + +/* + * Functions specific to the serial EEPROM on cards handled by ib_qib. + * The actual serail interface code is in qib_twsi.c. This file is a client + */ + +/** + * qib_eeprom_read - receives bytes from the eeprom via I2C + * @dd: the qlogic_ib device + * @eeprom_offset: address to read from + * @buffer: where to store result + * @len: number of bytes to receive + */ +int qib_eeprom_read(struct qib_devdata *dd, u8 eeprom_offset, + void *buff, int len) +{ + int ret; + + ret = mutex_lock_interruptible(&dd->eep_lock); + if (!ret) { + ret = qib_twsi_reset(dd); + if (ret) + qib_dev_err(dd, "EEPROM Reset for read failed\n"); + else + ret = qib_twsi_blk_rd(dd, dd->twsi_eeprom_dev, + eeprom_offset, buff, len); + mutex_unlock(&dd->eep_lock); + } + + return ret; +} + +/* + * Actually update the eeprom, first doing write enable if + * needed, then restoring write enable state. + * Must be called with eep_lock held + */ +static int eeprom_write_with_enable(struct qib_devdata *dd, u8 offset, + const void *buf, int len) +{ + int ret, pwen; + + pwen = dd->f_eeprom_wen(dd, 1); + ret = qib_twsi_reset(dd); + if (ret) + qib_dev_err(dd, "EEPROM Reset for write failed\n"); + else + ret = qib_twsi_blk_wr(dd, dd->twsi_eeprom_dev, + offset, buf, len); + dd->f_eeprom_wen(dd, pwen); + return ret; +} + +/** + * qib_eeprom_write - writes data to the eeprom via I2C + * @dd: the qlogic_ib device + * @eeprom_offset: where to place data + * @buffer: data to write + * @len: number of bytes to write + */ +int qib_eeprom_write(struct qib_devdata *dd, u8 eeprom_offset, + const void *buff, int len) +{ + int ret; + + ret = mutex_lock_interruptible(&dd->eep_lock); + if (!ret) { + ret = eeprom_write_with_enable(dd, eeprom_offset, buff, len); + mutex_unlock(&dd->eep_lock); + } + + return ret; +} + +static u8 flash_csum(struct qib_flash *ifp, int adjust) +{ + u8 *ip = (u8 *) ifp; + u8 csum = 0, len; + + /* + * Limit length checksummed to max length of actual data. + * Checksum of erased eeprom will still be bad, but we avoid + * reading past the end of the buffer we were passed. + */ + len = ifp->if_length; + if (len > sizeof(struct qib_flash)) + len = sizeof(struct qib_flash); + while (len--) + csum += *ip++; + csum -= ifp->if_csum; + csum = ~csum; + if (adjust) + ifp->if_csum = csum; + + return csum; +} + +/** + * qib_get_eeprom_info- get the GUID et al. from the TSWI EEPROM device + * @dd: the qlogic_ib device + * + * We have the capability to use the nguid field, and get + * the guid from the first chip's flash, to use for all of them. + */ +void qib_get_eeprom_info(struct qib_devdata *dd) +{ + void *buf; + struct qib_flash *ifp; + __be64 guid; + int len, eep_stat; + u8 csum, *bguid; + int t = dd->unit; + struct qib_devdata *dd0 = qib_lookup(0); + + if (t && dd0->nguid > 1 && t <= dd0->nguid) { + u8 oguid; + + dd->base_guid = dd0->base_guid; + bguid = (u8 *) &dd->base_guid; + + oguid = bguid[7]; + bguid[7] += t; + if (oguid > bguid[7]) { + if (bguid[6] == 0xff) { + if (bguid[5] == 0xff) { + qib_dev_err(dd, + "Can't set %s GUID from base, wraps to OUI!\n", + qib_get_unit_name(t)); + dd->base_guid = 0; + goto bail; + } + bguid[5]++; + } + bguid[6]++; + } + dd->nguid = 1; + goto bail; + } + + /* + * Read full flash, not just currently used part, since it may have + * been written with a newer definition. 
+ * */ + len = sizeof(struct qib_flash); + buf = vmalloc(len); + if (!buf) { + qib_dev_err(dd, + "Couldn't allocate memory to read %u bytes from eeprom for GUID\n", + len); + goto bail; + } + + /* + * Use "public" eeprom read function, which does locking and + * figures out device. This will migrate to chip-specific. + */ + eep_stat = qib_eeprom_read(dd, 0, buf, len); + + if (eep_stat) { + qib_dev_err(dd, "Failed reading GUID from eeprom\n"); + goto done; + } + ifp = (struct qib_flash *)buf; + + csum = flash_csum(ifp, 0); + if (csum != ifp->if_csum) { + qib_devinfo(dd->pcidev, + "Bad I2C flash checksum: 0x%x, not 0x%x\n", + csum, ifp->if_csum); + goto done; + } + if (*(__be64 *) ifp->if_guid == cpu_to_be64(0) || + *(__be64 *) ifp->if_guid == ~cpu_to_be64(0)) { + qib_dev_err(dd, + "Invalid GUID %llx from flash; ignoring\n", + *(unsigned long long *) ifp->if_guid); + /* don't allow GUID if all 0 or all 1's */ + goto done; + } + + /* complain, but allow it */ + if (*(u64 *) ifp->if_guid == 0x100007511000000ULL) + qib_devinfo(dd->pcidev, + "Warning, GUID %llx is default, probably not correct!\n", + *(unsigned long long *) ifp->if_guid); + + bguid = ifp->if_guid; + if (!bguid[0] && !bguid[1] && !bguid[2]) { + /* + * Original incorrect GUID format in flash; fix in + * core copy, by shifting up 2 octets; don't need to + * change top octet, since both it and shifted are 0. + */ + bguid[1] = bguid[3]; + bguid[2] = bguid[4]; + bguid[3] = 0; + bguid[4] = 0; + guid = *(__be64 *) ifp->if_guid; + } else + guid = *(__be64 *) ifp->if_guid; + dd->base_guid = guid; + dd->nguid = ifp->if_numguid; + /* + * Things are slightly complicated by the desire to transparently + * support both the Pathscale 10-digit serial number and the QLogic + * 13-character version. + */ + if ((ifp->if_fversion > 1) && ifp->if_sprefix[0] && + ((u8 *) ifp->if_sprefix)[0] != 0xFF) { + char *snp = dd->serial; + + /* + * This board has a Serial-prefix, which is stored + * elsewhere for backward-compatibility. + */ + memcpy(snp, ifp->if_sprefix, sizeof(ifp->if_sprefix)); + snp[sizeof(ifp->if_sprefix)] = '\0'; + len = strlen(snp); + snp += len; + len = sizeof(dd->serial) - len; + if (len > sizeof(ifp->if_serial)) + len = sizeof(ifp->if_serial); + memcpy(snp, ifp->if_serial, len); + } else { + memcpy(dd->serial, ifp->if_serial, sizeof(ifp->if_serial)); + } + if (!strstr(ifp->if_comment, "Tested successfully")) + qib_dev_err(dd, + "Board SN %s did not pass functional test: %s\n", + dd->serial, ifp->if_comment); + +done: + vfree(buf); + +bail:; +} + diff --git a/kernel/drivers/infiniband/hw/qib/qib_file_ops.c b/kernel/drivers/infiniband/hw/qib/qib_file_ops.c new file mode 100644 index 000000000..725881890 --- /dev/null +++ b/kernel/drivers/infiniband/hw/qib/qib_file_ops.c @@ -0,0 +1,2418 @@ +/* + * Copyright (c) 2012, 2013 Intel Corporation. All rights reserved. + * Copyright (c) 2006 - 2012 QLogic Corporation. All rights reserved. + * Copyright (c) 2003, 2004, 2005, 2006 PathScale, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. 
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "qib.h" +#include "qib_common.h" +#include "qib_user_sdma.h" + +#undef pr_fmt +#define pr_fmt(fmt) QIB_DRV_NAME ": " fmt + +static int qib_open(struct inode *, struct file *); +static int qib_close(struct inode *, struct file *); +static ssize_t qib_write(struct file *, const char __user *, size_t, loff_t *); +static ssize_t qib_write_iter(struct kiocb *, struct iov_iter *); +static unsigned int qib_poll(struct file *, struct poll_table_struct *); +static int qib_mmapf(struct file *, struct vm_area_struct *); + +/* + * This is really, really weird shit - write() and writev() here + * have completely unrelated semantics. Sucky userland ABI, + * film at 11. + */ +static const struct file_operations qib_file_ops = { + .owner = THIS_MODULE, + .write = qib_write, + .write_iter = qib_write_iter, + .open = qib_open, + .release = qib_close, + .poll = qib_poll, + .mmap = qib_mmapf, + .llseek = noop_llseek, +}; + +/* + * Convert kernel virtual addresses to physical addresses so they don't + * potentially conflict with the chip addresses used as mmap offsets. + * It doesn't really matter what mmap offset we use as long as we can + * interpret it correctly. 
+ */ +static u64 cvt_kvaddr(void *p) +{ + struct page *page; + u64 paddr = 0; + + page = vmalloc_to_page(p); + if (page) + paddr = page_to_pfn(page) << PAGE_SHIFT; + + return paddr; +} + +static int qib_get_base_info(struct file *fp, void __user *ubase, + size_t ubase_size) +{ + struct qib_ctxtdata *rcd = ctxt_fp(fp); + int ret = 0; + struct qib_base_info *kinfo = NULL; + struct qib_devdata *dd = rcd->dd; + struct qib_pportdata *ppd = rcd->ppd; + unsigned subctxt_cnt; + int shared, master; + size_t sz; + + subctxt_cnt = rcd->subctxt_cnt; + if (!subctxt_cnt) { + shared = 0; + master = 0; + subctxt_cnt = 1; + } else { + shared = 1; + master = !subctxt_fp(fp); + } + + sz = sizeof(*kinfo); + /* If context sharing is not requested, allow the old size structure */ + if (!shared) + sz -= 7 * sizeof(u64); + if (ubase_size < sz) { + ret = -EINVAL; + goto bail; + } + + kinfo = kzalloc(sizeof(*kinfo), GFP_KERNEL); + if (kinfo == NULL) { + ret = -ENOMEM; + goto bail; + } + + ret = dd->f_get_base_info(rcd, kinfo); + if (ret < 0) + goto bail; + + kinfo->spi_rcvhdr_cnt = dd->rcvhdrcnt; + kinfo->spi_rcvhdrent_size = dd->rcvhdrentsize; + kinfo->spi_tidegrcnt = rcd->rcvegrcnt; + kinfo->spi_rcv_egrbufsize = dd->rcvegrbufsize; + /* + * have to mmap whole thing + */ + kinfo->spi_rcv_egrbuftotlen = + rcd->rcvegrbuf_chunks * rcd->rcvegrbuf_size; + kinfo->spi_rcv_egrperchunk = rcd->rcvegrbufs_perchunk; + kinfo->spi_rcv_egrchunksize = kinfo->spi_rcv_egrbuftotlen / + rcd->rcvegrbuf_chunks; + kinfo->spi_tidcnt = dd->rcvtidcnt / subctxt_cnt; + if (master) + kinfo->spi_tidcnt += dd->rcvtidcnt % subctxt_cnt; + /* + * for this use, may be cfgctxts summed over all chips that + * are are configured and present + */ + kinfo->spi_nctxts = dd->cfgctxts; + /* unit (chip/board) our context is on */ + kinfo->spi_unit = dd->unit; + kinfo->spi_port = ppd->port; + /* for now, only a single page */ + kinfo->spi_tid_maxsize = PAGE_SIZE; + + /* + * Doing this per context, and based on the skip value, etc. This has + * to be the actual buffer size, since the protocol code treats it + * as an array. + * + * These have to be set to user addresses in the user code via mmap. + * These values are used on return to user code for the mmap target + * addresses only. For 32 bit, same 44 bit address problem, so use + * the physical address, not virtual. Before 2.6.11, using the + * page_address() macro worked, but in 2.6.11, even that returns the + * full 64 bit address (upper bits all 1's). So far, using the + * physical addresses (or chip offsets, for chip mapping) works, but + * no doubt some future kernel release will change that, and we'll be + * on to yet another method of dealing with this. + * Normally only one of rcvhdr_tailaddr or rhf_offset is useful + * since the chips with non-zero rhf_offset don't normally + * enable tail register updates to host memory, but for testing, + * both can be enabled and used. 
+ */ + kinfo->spi_rcvhdr_base = (u64) rcd->rcvhdrq_phys; + kinfo->spi_rcvhdr_tailaddr = (u64) rcd->rcvhdrqtailaddr_phys; + kinfo->spi_rhf_offset = dd->rhf_offset; + kinfo->spi_rcv_egrbufs = (u64) rcd->rcvegr_phys; + kinfo->spi_pioavailaddr = (u64) dd->pioavailregs_phys; + /* setup per-unit (not port) status area for user programs */ + kinfo->spi_status = (u64) kinfo->spi_pioavailaddr + + (char *) ppd->statusp - + (char *) dd->pioavailregs_dma; + kinfo->spi_uregbase = (u64) dd->uregbase + dd->ureg_align * rcd->ctxt; + if (!shared) { + kinfo->spi_piocnt = rcd->piocnt; + kinfo->spi_piobufbase = (u64) rcd->piobufs; + kinfo->spi_sendbuf_status = cvt_kvaddr(rcd->user_event_mask); + } else if (master) { + kinfo->spi_piocnt = (rcd->piocnt / subctxt_cnt) + + (rcd->piocnt % subctxt_cnt); + /* Master's PIO buffers are after all the slave's */ + kinfo->spi_piobufbase = (u64) rcd->piobufs + + dd->palign * + (rcd->piocnt - kinfo->spi_piocnt); + } else { + unsigned slave = subctxt_fp(fp) - 1; + + kinfo->spi_piocnt = rcd->piocnt / subctxt_cnt; + kinfo->spi_piobufbase = (u64) rcd->piobufs + + dd->palign * kinfo->spi_piocnt * slave; + } + + if (shared) { + kinfo->spi_sendbuf_status = + cvt_kvaddr(&rcd->user_event_mask[subctxt_fp(fp)]); + /* only spi_subctxt_* fields should be set in this block! */ + kinfo->spi_subctxt_uregbase = cvt_kvaddr(rcd->subctxt_uregbase); + + kinfo->spi_subctxt_rcvegrbuf = + cvt_kvaddr(rcd->subctxt_rcvegrbuf); + kinfo->spi_subctxt_rcvhdr_base = + cvt_kvaddr(rcd->subctxt_rcvhdr_base); + } + + /* + * All user buffers are 2KB buffers. If we ever support + * giving 4KB buffers to user processes, this will need some + * work. Can't use piobufbase directly, because it has + * both 2K and 4K buffer base values. + */ + kinfo->spi_pioindex = (kinfo->spi_piobufbase - dd->pio2k_bufbase) / + dd->palign; + kinfo->spi_pioalign = dd->palign; + kinfo->spi_qpair = QIB_KD_QP; + /* + * user mode PIO buffers are always 2KB, even when 4KB can + * be received, and sent via the kernel; this is ibmaxlen + * for 2K MTU. + */ + kinfo->spi_piosize = dd->piosize2k - 2 * sizeof(u32); + kinfo->spi_mtu = ppd->ibmaxlen; /* maxlen, not ibmtu */ + kinfo->spi_ctxt = rcd->ctxt; + kinfo->spi_subctxt = subctxt_fp(fp); + kinfo->spi_sw_version = QIB_KERN_SWVERSION; + kinfo->spi_sw_version |= 1U << 31; /* QLogic-built, not kernel.org */ + kinfo->spi_hw_version = dd->revision; + + if (master) + kinfo->spi_runtime_flags |= QIB_RUNTIME_MASTER; + + sz = (ubase_size < sizeof(*kinfo)) ? ubase_size : sizeof(*kinfo); + if (copy_to_user(ubase, kinfo, sz)) + ret = -EFAULT; +bail: + kfree(kinfo); + return ret; +} + +/** + * qib_tid_update - update a context TID + * @rcd: the context + * @fp: the qib device file + * @ti: the TID information + * + * The new implementation as of Oct 2004 is that the driver assigns + * the tid and returns it to the caller. To reduce search time, we + * keep a cursor for each context, walking the shadow tid array to find + * one that's not in use. + * + * For now, if we can't allocate the full list, we fail, although + * in the long run, we'll allocate as many as we can, and the + * caller will deal with that by trying the remaining pages later. + * That means that when we fail, we have to mark the tids as not in + * use again, in our shadow copy. + * + * It's up to the caller to free the tids when they are done. + * We'll unlock the pages as they free them. 
+ * + * Also, right now we are locking one page at a time, but since + * the intended use of this routine is for a single group of + * virtually contiguous pages, that should change to improve + * performance. + */ +static int qib_tid_update(struct qib_ctxtdata *rcd, struct file *fp, + const struct qib_tid_info *ti) +{ + int ret = 0, ntids; + u32 tid, ctxttid, cnt, i, tidcnt, tidoff; + u16 *tidlist; + struct qib_devdata *dd = rcd->dd; + u64 physaddr; + unsigned long vaddr; + u64 __iomem *tidbase; + unsigned long tidmap[8]; + struct page **pagep = NULL; + unsigned subctxt = subctxt_fp(fp); + + if (!dd->pageshadow) { + ret = -ENOMEM; + goto done; + } + + cnt = ti->tidcnt; + if (!cnt) { + ret = -EFAULT; + goto done; + } + ctxttid = rcd->ctxt * dd->rcvtidcnt; + if (!rcd->subctxt_cnt) { + tidcnt = dd->rcvtidcnt; + tid = rcd->tidcursor; + tidoff = 0; + } else if (!subctxt) { + tidcnt = (dd->rcvtidcnt / rcd->subctxt_cnt) + + (dd->rcvtidcnt % rcd->subctxt_cnt); + tidoff = dd->rcvtidcnt - tidcnt; + ctxttid += tidoff; + tid = tidcursor_fp(fp); + } else { + tidcnt = dd->rcvtidcnt / rcd->subctxt_cnt; + tidoff = tidcnt * (subctxt - 1); + ctxttid += tidoff; + tid = tidcursor_fp(fp); + } + if (cnt > tidcnt) { + /* make sure it all fits in tid_pg_list */ + qib_devinfo(dd->pcidev, + "Process tried to allocate %u TIDs, only trying max (%u)\n", + cnt, tidcnt); + cnt = tidcnt; + } + pagep = (struct page **) rcd->tid_pg_list; + tidlist = (u16 *) &pagep[dd->rcvtidcnt]; + pagep += tidoff; + tidlist += tidoff; + + memset(tidmap, 0, sizeof(tidmap)); + /* before decrement; chip actual # */ + ntids = tidcnt; + tidbase = (u64 __iomem *) (((char __iomem *) dd->kregbase) + + dd->rcvtidbase + + ctxttid * sizeof(*tidbase)); + + /* virtual address of first page in transfer */ + vaddr = ti->tidvaddr; + if (!access_ok(VERIFY_WRITE, (void __user *) vaddr, + cnt * PAGE_SIZE)) { + ret = -EFAULT; + goto done; + } + ret = qib_get_user_pages(vaddr, cnt, pagep); + if (ret) { + /* + * if (ret == -EBUSY) + * We can't continue because the pagep array won't be + * initialized. This should never happen, + * unless perhaps the user has mpin'ed the pages + * themselves. + */ + qib_devinfo( + dd->pcidev, + "Failed to lock addr %p, %u pages: errno %d\n", + (void *) vaddr, cnt, -ret); + goto done; + } + for (i = 0; i < cnt; i++, vaddr += PAGE_SIZE) { + for (; ntids--; tid++) { + if (tid == tidcnt) + tid = 0; + if (!dd->pageshadow[ctxttid + tid]) + break; + } + if (ntids < 0) { + /* + * Oops, wrapped all the way through their TIDs, + * and didn't have enough free; see comments at + * start of routine + */ + i--; /* last tidlist[i] not filled in */ + ret = -ENOMEM; + break; + } + tidlist[i] = tid + tidoff; + /* we "know" system pages and TID pages are same size */ + dd->pageshadow[ctxttid + tid] = pagep[i]; + dd->physshadow[ctxttid + tid] = + qib_map_page(dd->pcidev, pagep[i], 0, PAGE_SIZE, + PCI_DMA_FROMDEVICE); + /* + * don't need atomic or it's overhead + */ + __set_bit(tid, tidmap); + physaddr = dd->physshadow[ctxttid + tid]; + /* PERFORMANCE: below should almost certainly be cached */ + dd->f_put_tid(dd, &tidbase[tid], + RCVHQ_RCV_TYPE_EXPECTED, physaddr); + /* + * don't check this tid in qib_ctxtshadow, since we + * just filled it in; start with the next one. + */ + tid++; + } + + if (ret) { + u32 limit; +cleanup: + /* jump here if copy out of updated info failed... 
*/ + /* same code that's in qib_free_tid() */ + limit = sizeof(tidmap) * BITS_PER_BYTE; + if (limit > tidcnt) + /* just in case size changes in future */ + limit = tidcnt; + tid = find_first_bit((const unsigned long *)tidmap, limit); + for (; tid < limit; tid++) { + if (!test_bit(tid, tidmap)) + continue; + if (dd->pageshadow[ctxttid + tid]) { + dma_addr_t phys; + + phys = dd->physshadow[ctxttid + tid]; + dd->physshadow[ctxttid + tid] = dd->tidinvalid; + /* PERFORMANCE: below should almost certainly + * be cached + */ + dd->f_put_tid(dd, &tidbase[tid], + RCVHQ_RCV_TYPE_EXPECTED, + dd->tidinvalid); + pci_unmap_page(dd->pcidev, phys, PAGE_SIZE, + PCI_DMA_FROMDEVICE); + dd->pageshadow[ctxttid + tid] = NULL; + } + } + qib_release_user_pages(pagep, cnt); + } else { + /* + * Copy the updated array, with qib_tid's filled in, back + * to user. Since we did the copy in already, this "should + * never fail" If it does, we have to clean up... + */ + if (copy_to_user((void __user *) + (unsigned long) ti->tidlist, + tidlist, cnt * sizeof(*tidlist))) { + ret = -EFAULT; + goto cleanup; + } + if (copy_to_user((void __user *) (unsigned long) ti->tidmap, + tidmap, sizeof(tidmap))) { + ret = -EFAULT; + goto cleanup; + } + if (tid == tidcnt) + tid = 0; + if (!rcd->subctxt_cnt) + rcd->tidcursor = tid; + else + tidcursor_fp(fp) = tid; + } + +done: + return ret; +} + +/** + * qib_tid_free - free a context TID + * @rcd: the context + * @subctxt: the subcontext + * @ti: the TID info + * + * right now we are unlocking one page at a time, but since + * the intended use of this routine is for a single group of + * virtually contiguous pages, that should change to improve + * performance. We check that the TID is in range for this context + * but otherwise don't check validity; if user has an error and + * frees the wrong tid, it's only their own data that can thereby + * be corrupted. We do check that the TID was in use, for sanity + * We always use our idea of the saved address, not the address that + * they pass in to us. + */ +static int qib_tid_free(struct qib_ctxtdata *rcd, unsigned subctxt, + const struct qib_tid_info *ti) +{ + int ret = 0; + u32 tid, ctxttid, cnt, limit, tidcnt; + struct qib_devdata *dd = rcd->dd; + u64 __iomem *tidbase; + unsigned long tidmap[8]; + + if (!dd->pageshadow) { + ret = -ENOMEM; + goto done; + } + + if (copy_from_user(tidmap, (void __user *)(unsigned long)ti->tidmap, + sizeof(tidmap))) { + ret = -EFAULT; + goto done; + } + + ctxttid = rcd->ctxt * dd->rcvtidcnt; + if (!rcd->subctxt_cnt) + tidcnt = dd->rcvtidcnt; + else if (!subctxt) { + tidcnt = (dd->rcvtidcnt / rcd->subctxt_cnt) + + (dd->rcvtidcnt % rcd->subctxt_cnt); + ctxttid += dd->rcvtidcnt - tidcnt; + } else { + tidcnt = dd->rcvtidcnt / rcd->subctxt_cnt; + ctxttid += tidcnt * (subctxt - 1); + } + tidbase = (u64 __iomem *) ((char __iomem *)(dd->kregbase) + + dd->rcvtidbase + + ctxttid * sizeof(*tidbase)); + + limit = sizeof(tidmap) * BITS_PER_BYTE; + if (limit > tidcnt) + /* just in case size changes in future */ + limit = tidcnt; + tid = find_first_bit(tidmap, limit); + for (cnt = 0; tid < limit; tid++) { + /* + * small optimization; if we detect a run of 3 or so without + * any set, use find_first_bit again. That's mainly to + * accelerate the case where we wrapped, so we have some at + * the beginning, and some at the end, and a big gap + * in the middle. 
+ */ + if (!test_bit(tid, tidmap)) + continue; + cnt++; + if (dd->pageshadow[ctxttid + tid]) { + struct page *p; + dma_addr_t phys; + + p = dd->pageshadow[ctxttid + tid]; + dd->pageshadow[ctxttid + tid] = NULL; + phys = dd->physshadow[ctxttid + tid]; + dd->physshadow[ctxttid + tid] = dd->tidinvalid; + /* PERFORMANCE: below should almost certainly be + * cached + */ + dd->f_put_tid(dd, &tidbase[tid], + RCVHQ_RCV_TYPE_EXPECTED, dd->tidinvalid); + pci_unmap_page(dd->pcidev, phys, PAGE_SIZE, + PCI_DMA_FROMDEVICE); + qib_release_user_pages(&p, 1); + } + } +done: + return ret; +} + +/** + * qib_set_part_key - set a partition key + * @rcd: the context + * @key: the key + * + * We can have up to 4 active at a time (other than the default, which is + * always allowed). This is somewhat tricky, since multiple contexts may set + * the same key, so we reference count them, and clean up at exit. All 4 + * partition keys are packed into a single qlogic_ib register. It's an + * error for a process to set the same pkey multiple times. We provide no + * mechanism to de-allocate a pkey at this time, we may eventually need to + * do that. I've used the atomic operations, and no locking, and only make + * a single pass through what's available. This should be more than + * adequate for some time. I'll think about spinlocks or the like if and as + * it's necessary. + */ +static int qib_set_part_key(struct qib_ctxtdata *rcd, u16 key) +{ + struct qib_pportdata *ppd = rcd->ppd; + int i, any = 0, pidx = -1; + u16 lkey = key & 0x7FFF; + int ret; + + if (lkey == (QIB_DEFAULT_P_KEY & 0x7FFF)) { + /* nothing to do; this key always valid */ + ret = 0; + goto bail; + } + + if (!lkey) { + ret = -EINVAL; + goto bail; + } + + /* + * Set the full membership bit, because it has to be + * set in the register or the packet, and it seems + * cleaner to set in the register than to force all + * callers to set it. + */ + key |= 0x8000; + + for (i = 0; i < ARRAY_SIZE(rcd->pkeys); i++) { + if (!rcd->pkeys[i] && pidx == -1) + pidx = i; + if (rcd->pkeys[i] == key) { + ret = -EEXIST; + goto bail; + } + } + if (pidx == -1) { + ret = -EBUSY; + goto bail; + } + for (any = i = 0; i < ARRAY_SIZE(ppd->pkeys); i++) { + if (!ppd->pkeys[i]) { + any++; + continue; + } + if (ppd->pkeys[i] == key) { + atomic_t *pkrefs = &ppd->pkeyrefs[i]; + + if (atomic_inc_return(pkrefs) > 1) { + rcd->pkeys[pidx] = key; + ret = 0; + goto bail; + } else { + /* + * lost race, decrement count, catch below + */ + atomic_dec(pkrefs); + any++; + } + } + if ((ppd->pkeys[i] & 0x7FFF) == lkey) { + /* + * It makes no sense to have both the limited and + * full membership PKEY set at the same time since + * the unlimited one will disable the limited one. + */ + ret = -EEXIST; + goto bail; + } + } + if (!any) { + ret = -EBUSY; + goto bail; + } + for (any = i = 0; i < ARRAY_SIZE(ppd->pkeys); i++) { + if (!ppd->pkeys[i] && + atomic_inc_return(&ppd->pkeyrefs[i]) == 1) { + rcd->pkeys[pidx] = key; + ppd->pkeys[i] = key; + (void) ppd->dd->f_set_ib_cfg(ppd, QIB_IB_CFG_PKEYS, 0); + ret = 0; + goto bail; + } + } + ret = -EBUSY; + +bail: + return ret; +} + +/** + * qib_manage_rcvq - manage a context's receive queue + * @rcd: the context + * @subctxt: the subcontext + * @start_stop: action to carry out + * + * start_stop == 0 disables receive on the context, for use in queue + * overflow conditions. 
start_stop==1 re-enables, to be used to + * re-init the software copy of the head register + */ +static int qib_manage_rcvq(struct qib_ctxtdata *rcd, unsigned subctxt, + int start_stop) +{ + struct qib_devdata *dd = rcd->dd; + unsigned int rcvctrl_op; + + if (subctxt) + goto bail; + /* atomically clear receive enable ctxt. */ + if (start_stop) { + /* + * On enable, force in-memory copy of the tail register to + * 0, so that protocol code doesn't have to worry about + * whether or not the chip has yet updated the in-memory + * copy or not on return from the system call. The chip + * always resets it's tail register back to 0 on a + * transition from disabled to enabled. + */ + if (rcd->rcvhdrtail_kvaddr) + qib_clear_rcvhdrtail(rcd); + rcvctrl_op = QIB_RCVCTRL_CTXT_ENB; + } else + rcvctrl_op = QIB_RCVCTRL_CTXT_DIS; + dd->f_rcvctrl(rcd->ppd, rcvctrl_op, rcd->ctxt); + /* always; new head should be equal to new tail; see above */ +bail: + return 0; +} + +static void qib_clean_part_key(struct qib_ctxtdata *rcd, + struct qib_devdata *dd) +{ + int i, j, pchanged = 0; + u64 oldpkey; + struct qib_pportdata *ppd = rcd->ppd; + + /* for debugging only */ + oldpkey = (u64) ppd->pkeys[0] | + ((u64) ppd->pkeys[1] << 16) | + ((u64) ppd->pkeys[2] << 32) | + ((u64) ppd->pkeys[3] << 48); + + for (i = 0; i < ARRAY_SIZE(rcd->pkeys); i++) { + if (!rcd->pkeys[i]) + continue; + for (j = 0; j < ARRAY_SIZE(ppd->pkeys); j++) { + /* check for match independent of the global bit */ + if ((ppd->pkeys[j] & 0x7fff) != + (rcd->pkeys[i] & 0x7fff)) + continue; + if (atomic_dec_and_test(&ppd->pkeyrefs[j])) { + ppd->pkeys[j] = 0; + pchanged++; + } + break; + } + rcd->pkeys[i] = 0; + } + if (pchanged) + (void) ppd->dd->f_set_ib_cfg(ppd, QIB_IB_CFG_PKEYS, 0); +} + +/* common code for the mappings on dma_alloc_coherent mem */ +static int qib_mmap_mem(struct vm_area_struct *vma, struct qib_ctxtdata *rcd, + unsigned len, void *kvaddr, u32 write_ok, char *what) +{ + struct qib_devdata *dd = rcd->dd; + unsigned long pfn; + int ret; + + if ((vma->vm_end - vma->vm_start) > len) { + qib_devinfo(dd->pcidev, + "FAIL on %s: len %lx > %x\n", what, + vma->vm_end - vma->vm_start, len); + ret = -EFAULT; + goto bail; + } + + /* + * shared context user code requires rcvhdrq mapped r/w, others + * only allowed readonly mapping. + */ + if (!write_ok) { + if (vma->vm_flags & VM_WRITE) { + qib_devinfo(dd->pcidev, + "%s must be mapped readonly\n", what); + ret = -EPERM; + goto bail; + } + + /* don't allow them to later change with mprotect */ + vma->vm_flags &= ~VM_MAYWRITE; + } + + pfn = virt_to_phys(kvaddr) >> PAGE_SHIFT; + ret = remap_pfn_range(vma, vma->vm_start, pfn, + len, vma->vm_page_prot); + if (ret) + qib_devinfo(dd->pcidev, + "%s ctxt%u mmap of %lx, %x bytes failed: %d\n", + what, rcd->ctxt, pfn, len, ret); +bail: + return ret; +} + +static int mmap_ureg(struct vm_area_struct *vma, struct qib_devdata *dd, + u64 ureg) +{ + unsigned long phys; + unsigned long sz; + int ret; + + /* + * This is real hardware, so use io_remap. This is the mechanism + * for the user process to update the head registers for their ctxt + * in the chip. + */ + sz = dd->flags & QIB_HAS_HDRSUPP ? 
2 * PAGE_SIZE : PAGE_SIZE; + if ((vma->vm_end - vma->vm_start) > sz) { + qib_devinfo(dd->pcidev, + "FAIL mmap userreg: reqlen %lx > PAGE\n", + vma->vm_end - vma->vm_start); + ret = -EFAULT; + } else { + phys = dd->physaddr + ureg; + vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot); + + vma->vm_flags |= VM_DONTCOPY | VM_DONTEXPAND; + ret = io_remap_pfn_range(vma, vma->vm_start, + phys >> PAGE_SHIFT, + vma->vm_end - vma->vm_start, + vma->vm_page_prot); + } + return ret; +} + +static int mmap_piobufs(struct vm_area_struct *vma, + struct qib_devdata *dd, + struct qib_ctxtdata *rcd, + unsigned piobufs, unsigned piocnt) +{ + unsigned long phys; + int ret; + + /* + * When we map the PIO buffers in the chip, we want to map them as + * writeonly, no read possible; unfortunately, x86 doesn't allow + * for this in hardware, but we still prevent users from asking + * for it. + */ + if ((vma->vm_end - vma->vm_start) > (piocnt * dd->palign)) { + qib_devinfo(dd->pcidev, + "FAIL mmap piobufs: reqlen %lx > PAGE\n", + vma->vm_end - vma->vm_start); + ret = -EINVAL; + goto bail; + } + + phys = dd->physaddr + piobufs; + +#if defined(__powerpc__) + /* There isn't a generic way to specify writethrough mappings */ + pgprot_val(vma->vm_page_prot) |= _PAGE_NO_CACHE; + pgprot_val(vma->vm_page_prot) |= _PAGE_WRITETHRU; + pgprot_val(vma->vm_page_prot) &= ~_PAGE_GUARDED; +#endif + + /* + * don't allow them to later change to readable with mprotect (for when + * not initially mapped readable, as is normally the case) + */ + vma->vm_flags &= ~VM_MAYREAD; + vma->vm_flags |= VM_DONTCOPY | VM_DONTEXPAND; + + /* We used PAT if wc_cookie == 0 */ + if (!dd->wc_cookie) + vma->vm_page_prot = pgprot_writecombine(vma->vm_page_prot); + + ret = io_remap_pfn_range(vma, vma->vm_start, phys >> PAGE_SHIFT, + vma->vm_end - vma->vm_start, + vma->vm_page_prot); +bail: + return ret; +} + +static int mmap_rcvegrbufs(struct vm_area_struct *vma, + struct qib_ctxtdata *rcd) +{ + struct qib_devdata *dd = rcd->dd; + unsigned long start, size; + size_t total_size, i; + unsigned long pfn; + int ret; + + size = rcd->rcvegrbuf_size; + total_size = rcd->rcvegrbuf_chunks * size; + if ((vma->vm_end - vma->vm_start) > total_size) { + qib_devinfo(dd->pcidev, + "FAIL on egr bufs: reqlen %lx > actual %lx\n", + vma->vm_end - vma->vm_start, + (unsigned long) total_size); + ret = -EINVAL; + goto bail; + } + + if (vma->vm_flags & VM_WRITE) { + qib_devinfo(dd->pcidev, + "Can't map eager buffers as writable (flags=%lx)\n", + vma->vm_flags); + ret = -EPERM; + goto bail; + } + /* don't allow them to later change to writeable with mprotect */ + vma->vm_flags &= ~VM_MAYWRITE; + + start = vma->vm_start; + + for (i = 0; i < rcd->rcvegrbuf_chunks; i++, start += size) { + pfn = virt_to_phys(rcd->rcvegrbuf[i]) >> PAGE_SHIFT; + ret = remap_pfn_range(vma, start, pfn, size, + vma->vm_page_prot); + if (ret < 0) + goto bail; + } + ret = 0; + +bail: + return ret; +} + +/* + * qib_file_vma_fault - handle a VMA page fault. 
+ */ +static int qib_file_vma_fault(struct vm_area_struct *vma, struct vm_fault *vmf) +{ + struct page *page; + + page = vmalloc_to_page((void *)(vmf->pgoff << PAGE_SHIFT)); + if (!page) + return VM_FAULT_SIGBUS; + + get_page(page); + vmf->page = page; + + return 0; +} + +static struct vm_operations_struct qib_file_vm_ops = { + .fault = qib_file_vma_fault, +}; + +static int mmap_kvaddr(struct vm_area_struct *vma, u64 pgaddr, + struct qib_ctxtdata *rcd, unsigned subctxt) +{ + struct qib_devdata *dd = rcd->dd; + unsigned subctxt_cnt; + unsigned long len; + void *addr; + size_t size; + int ret = 0; + + subctxt_cnt = rcd->subctxt_cnt; + size = rcd->rcvegrbuf_chunks * rcd->rcvegrbuf_size; + + /* + * Each process has all the subctxt uregbase, rcvhdrq, and + * rcvegrbufs mmapped - as an array for all the processes, + * and also separately for this process. + */ + if (pgaddr == cvt_kvaddr(rcd->subctxt_uregbase)) { + addr = rcd->subctxt_uregbase; + size = PAGE_SIZE * subctxt_cnt; + } else if (pgaddr == cvt_kvaddr(rcd->subctxt_rcvhdr_base)) { + addr = rcd->subctxt_rcvhdr_base; + size = rcd->rcvhdrq_size * subctxt_cnt; + } else if (pgaddr == cvt_kvaddr(rcd->subctxt_rcvegrbuf)) { + addr = rcd->subctxt_rcvegrbuf; + size *= subctxt_cnt; + } else if (pgaddr == cvt_kvaddr(rcd->subctxt_uregbase + + PAGE_SIZE * subctxt)) { + addr = rcd->subctxt_uregbase + PAGE_SIZE * subctxt; + size = PAGE_SIZE; + } else if (pgaddr == cvt_kvaddr(rcd->subctxt_rcvhdr_base + + rcd->rcvhdrq_size * subctxt)) { + addr = rcd->subctxt_rcvhdr_base + + rcd->rcvhdrq_size * subctxt; + size = rcd->rcvhdrq_size; + } else if (pgaddr == cvt_kvaddr(&rcd->user_event_mask[subctxt])) { + addr = rcd->user_event_mask; + size = PAGE_SIZE; + } else if (pgaddr == cvt_kvaddr(rcd->subctxt_rcvegrbuf + + size * subctxt)) { + addr = rcd->subctxt_rcvegrbuf + size * subctxt; + /* rcvegrbufs are read-only on the slave */ + if (vma->vm_flags & VM_WRITE) { + qib_devinfo(dd->pcidev, + "Can't map eager buffers as writable (flags=%lx)\n", + vma->vm_flags); + ret = -EPERM; + goto bail; + } + /* + * Don't allow permission to later change to writeable + * with mprotect. + */ + vma->vm_flags &= ~VM_MAYWRITE; + } else + goto bail; + len = vma->vm_end - vma->vm_start; + if (len > size) { + ret = -EINVAL; + goto bail; + } + + vma->vm_pgoff = (unsigned long) addr >> PAGE_SHIFT; + vma->vm_ops = &qib_file_vm_ops; + vma->vm_flags |= VM_DONTEXPAND | VM_DONTDUMP; + ret = 1; + +bail: + return ret; +} + +/** + * qib_mmapf - mmap various structures into user space + * @fp: the file pointer + * @vma: the VM area + * + * We use this to have a shared buffer between the kernel and the user code + * for the rcvhdr queue, egr buffers, and the per-context user regs and pio + * buffers in the chip. We have the open and close entries so we can bump + * the ref count and keep the driver from being unloaded while still mapped. + */ +static int qib_mmapf(struct file *fp, struct vm_area_struct *vma) +{ + struct qib_ctxtdata *rcd; + struct qib_devdata *dd; + u64 pgaddr, ureg; + unsigned piobufs, piocnt; + int ret, match = 1; + + rcd = ctxt_fp(fp); + if (!rcd || !(vma->vm_flags & VM_SHARED)) { + ret = -EINVAL; + goto bail; + } + dd = rcd->dd; + + /* + * This is the qib_do_user_init() code, mapping the shared buffers + * and per-context user registers into the user process. The address + * referred to by vm_pgoff is the file offset passed via mmap(). + * For shared contexts, this is the kernel vmalloc() address of the + * pages to share with the master. 
+ * For non-shared or master ctxts, this is a physical address. + * We only do one mmap for each space mapped. + */ + pgaddr = vma->vm_pgoff << PAGE_SHIFT; + + /* + * Check for 0 in case one of the allocations failed, but user + * called mmap anyway. + */ + if (!pgaddr) { + ret = -EINVAL; + goto bail; + } + + /* + * Physical addresses must fit in 40 bits for our hardware. + * Check for kernel virtual addresses first, anything else must + * match a HW or memory address. + */ + ret = mmap_kvaddr(vma, pgaddr, rcd, subctxt_fp(fp)); + if (ret) { + if (ret > 0) + ret = 0; + goto bail; + } + + ureg = dd->uregbase + dd->ureg_align * rcd->ctxt; + if (!rcd->subctxt_cnt) { + /* ctxt is not shared */ + piocnt = rcd->piocnt; + piobufs = rcd->piobufs; + } else if (!subctxt_fp(fp)) { + /* caller is the master */ + piocnt = (rcd->piocnt / rcd->subctxt_cnt) + + (rcd->piocnt % rcd->subctxt_cnt); + piobufs = rcd->piobufs + + dd->palign * (rcd->piocnt - piocnt); + } else { + unsigned slave = subctxt_fp(fp) - 1; + + /* caller is a slave */ + piocnt = rcd->piocnt / rcd->subctxt_cnt; + piobufs = rcd->piobufs + dd->palign * piocnt * slave; + } + + if (pgaddr == ureg) + ret = mmap_ureg(vma, dd, ureg); + else if (pgaddr == piobufs) + ret = mmap_piobufs(vma, dd, rcd, piobufs, piocnt); + else if (pgaddr == dd->pioavailregs_phys) + /* in-memory copy of pioavail registers */ + ret = qib_mmap_mem(vma, rcd, PAGE_SIZE, + (void *) dd->pioavailregs_dma, 0, + "pioavail registers"); + else if (pgaddr == rcd->rcvegr_phys) + ret = mmap_rcvegrbufs(vma, rcd); + else if (pgaddr == (u64) rcd->rcvhdrq_phys) + /* + * The rcvhdrq itself; multiple pages, contiguous + * from an i/o perspective. Shared contexts need + * to map r/w, so we allow writing. + */ + ret = qib_mmap_mem(vma, rcd, rcd->rcvhdrq_size, + rcd->rcvhdrq, 1, "rcvhdrq"); + else if (pgaddr == (u64) rcd->rcvhdrqtailaddr_phys) + /* in-memory copy of rcvhdrq tail register */ + ret = qib_mmap_mem(vma, rcd, PAGE_SIZE, + rcd->rcvhdrtail_kvaddr, 0, + "rcvhdrq tail"); + else + match = 0; + if (!match) + ret = -EINVAL; + + vma->vm_private_data = NULL; + + if (ret < 0) + qib_devinfo(dd->pcidev, + "mmap Failure %d: off %llx len %lx\n", + -ret, (unsigned long long)pgaddr, + vma->vm_end - vma->vm_start); +bail: + return ret; +} + +static unsigned int qib_poll_urgent(struct qib_ctxtdata *rcd, + struct file *fp, + struct poll_table_struct *pt) +{ + struct qib_devdata *dd = rcd->dd; + unsigned pollflag; + + poll_wait(fp, &rcd->wait, pt); + + spin_lock_irq(&dd->uctxt_lock); + if (rcd->urgent != rcd->urgent_poll) { + pollflag = POLLIN | POLLRDNORM; + rcd->urgent_poll = rcd->urgent; + } else { + pollflag = 0; + set_bit(QIB_CTXT_WAITING_URG, &rcd->flag); + } + spin_unlock_irq(&dd->uctxt_lock); + + return pollflag; +} + +static unsigned int qib_poll_next(struct qib_ctxtdata *rcd, + struct file *fp, + struct poll_table_struct *pt) +{ + struct qib_devdata *dd = rcd->dd; + unsigned pollflag; + + poll_wait(fp, &rcd->wait, pt); + + spin_lock_irq(&dd->uctxt_lock); + if (dd->f_hdrqempty(rcd)) { + set_bit(QIB_CTXT_WAITING_RCV, &rcd->flag); + dd->f_rcvctrl(rcd->ppd, QIB_RCVCTRL_INTRAVAIL_ENB, rcd->ctxt); + pollflag = 0; + } else + pollflag = POLLIN | POLLRDNORM; + spin_unlock_irq(&dd->uctxt_lock); + + return pollflag; +} + +static unsigned int qib_poll(struct file *fp, struct poll_table_struct *pt) +{ + struct qib_ctxtdata *rcd; + unsigned pollflag; + + rcd = ctxt_fp(fp); + if (!rcd) + pollflag = POLLERR; + else if (rcd->poll_type == QIB_POLL_TYPE_URGENT) + pollflag = qib_poll_urgent(rcd, fp, pt); + 
else if (rcd->poll_type == QIB_POLL_TYPE_ANYRCV) + pollflag = qib_poll_next(rcd, fp, pt); + else /* invalid */ + pollflag = POLLERR; + + return pollflag; +} + +static void assign_ctxt_affinity(struct file *fp, struct qib_devdata *dd) +{ + struct qib_filedata *fd = fp->private_data; + const unsigned int weight = cpumask_weight(&current->cpus_allowed); + const struct cpumask *local_mask = cpumask_of_pcibus(dd->pcidev->bus); + int local_cpu; + + /* + * If process has NOT already set it's affinity, select and + * reserve a processor for it on the local NUMA node. + */ + if ((weight >= qib_cpulist_count) && + (cpumask_weight(local_mask) <= qib_cpulist_count)) { + for_each_cpu(local_cpu, local_mask) + if (!test_and_set_bit(local_cpu, qib_cpulist)) { + fd->rec_cpu_num = local_cpu; + return; + } + } + + /* + * If process has NOT already set it's affinity, select and + * reserve a processor for it, as a rendevous for all + * users of the driver. If they don't actually later + * set affinity to this cpu, or set it to some other cpu, + * it just means that sooner or later we don't recommend + * a cpu, and let the scheduler do it's best. + */ + if (weight >= qib_cpulist_count) { + int cpu; + + cpu = find_first_zero_bit(qib_cpulist, + qib_cpulist_count); + if (cpu == qib_cpulist_count) + qib_dev_err(dd, + "no cpus avail for affinity PID %u\n", + current->pid); + else { + __set_bit(cpu, qib_cpulist); + fd->rec_cpu_num = cpu; + } + } +} + +/* + * Check that userland and driver are compatible for subcontexts. + */ +static int qib_compatible_subctxts(int user_swmajor, int user_swminor) +{ + /* this code is written long-hand for clarity */ + if (QIB_USER_SWMAJOR != user_swmajor) { + /* no promise of compatibility if major mismatch */ + return 0; + } + if (QIB_USER_SWMAJOR == 1) { + switch (QIB_USER_SWMINOR) { + case 0: + case 1: + case 2: + /* no subctxt implementation so cannot be compatible */ + return 0; + case 3: + /* 3 is only compatible with itself */ + return user_swminor == 3; + default: + /* >= 4 are compatible (or are expected to be) */ + return user_swminor <= QIB_USER_SWMINOR; + } + } + /* make no promises yet for future major versions */ + return 0; +} + +static int init_subctxts(struct qib_devdata *dd, + struct qib_ctxtdata *rcd, + const struct qib_user_info *uinfo) +{ + int ret = 0; + unsigned num_subctxts; + size_t size; + + /* + * If the user is requesting zero subctxts, + * skip the subctxt allocation. + */ + if (uinfo->spu_subctxt_cnt <= 0) + goto bail; + num_subctxts = uinfo->spu_subctxt_cnt; + + /* Check for subctxt compatibility */ + if (!qib_compatible_subctxts(uinfo->spu_userversion >> 16, + uinfo->spu_userversion & 0xffff)) { + qib_devinfo(dd->pcidev, + "Mismatched user version (%d.%d) and driver version (%d.%d) while context sharing. Ensure that driver and library are from the same release.\n", + (int) (uinfo->spu_userversion >> 16), + (int) (uinfo->spu_userversion & 0xffff), + QIB_USER_SWMAJOR, QIB_USER_SWMINOR); + goto bail; + } + if (num_subctxts > QLOGIC_IB_MAX_SUBCTXT) { + ret = -EINVAL; + goto bail; + } + + rcd->subctxt_uregbase = vmalloc_user(PAGE_SIZE * num_subctxts); + if (!rcd->subctxt_uregbase) { + ret = -ENOMEM; + goto bail; + } + /* Note: rcd->rcvhdrq_size isn't initialized yet.
*/ + size = ALIGN(dd->rcvhdrcnt * dd->rcvhdrentsize * + sizeof(u32), PAGE_SIZE) * num_subctxts; + rcd->subctxt_rcvhdr_base = vmalloc_user(size); + if (!rcd->subctxt_rcvhdr_base) { + ret = -ENOMEM; + goto bail_ureg; + } + + rcd->subctxt_rcvegrbuf = vmalloc_user(rcd->rcvegrbuf_chunks * + rcd->rcvegrbuf_size * + num_subctxts); + if (!rcd->subctxt_rcvegrbuf) { + ret = -ENOMEM; + goto bail_rhdr; + } + + rcd->subctxt_cnt = uinfo->spu_subctxt_cnt; + rcd->subctxt_id = uinfo->spu_subctxt_id; + rcd->active_slaves = 1; + rcd->redirect_seq_cnt = 1; + set_bit(QIB_CTXT_MASTER_UNINIT, &rcd->flag); + goto bail; + +bail_rhdr: + vfree(rcd->subctxt_rcvhdr_base); +bail_ureg: + vfree(rcd->subctxt_uregbase); + rcd->subctxt_uregbase = NULL; +bail: + return ret; +} + +static int setup_ctxt(struct qib_pportdata *ppd, int ctxt, + struct file *fp, const struct qib_user_info *uinfo) +{ + struct qib_filedata *fd = fp->private_data; + struct qib_devdata *dd = ppd->dd; + struct qib_ctxtdata *rcd; + void *ptmp = NULL; + int ret; + int numa_id; + + assign_ctxt_affinity(fp, dd); + + numa_id = qib_numa_aware ? ((fd->rec_cpu_num != -1) ? + cpu_to_node(fd->rec_cpu_num) : + numa_node_id()) : dd->assigned_node_id; + + rcd = qib_create_ctxtdata(ppd, ctxt, numa_id); + + /* + * Allocate memory for use in qib_tid_update() at open to + * reduce cost of expected send setup per message segment + */ + if (rcd) + ptmp = kmalloc(dd->rcvtidcnt * sizeof(u16) + + dd->rcvtidcnt * sizeof(struct page **), + GFP_KERNEL); + + if (!rcd || !ptmp) { + qib_dev_err(dd, + "Unable to allocate ctxtdata memory, failing open\n"); + ret = -ENOMEM; + goto bailerr; + } + rcd->userversion = uinfo->spu_userversion; + ret = init_subctxts(dd, rcd, uinfo); + if (ret) + goto bailerr; + rcd->tid_pg_list = ptmp; + rcd->pid = current->pid; + init_waitqueue_head(&dd->rcd[ctxt]->wait); + strlcpy(rcd->comm, current->comm, sizeof(rcd->comm)); + ctxt_fp(fp) = rcd; + qib_stats.sps_ctxts++; + dd->freectxts--; + ret = 0; + goto bail; + +bailerr: + if (fd->rec_cpu_num != -1) + __clear_bit(fd->rec_cpu_num, qib_cpulist); + + dd->rcd[ctxt] = NULL; + kfree(rcd); + kfree(ptmp); +bail: + return ret; +} + +static inline int usable(struct qib_pportdata *ppd) +{ + struct qib_devdata *dd = ppd->dd; + + return dd && (dd->flags & QIB_PRESENT) && dd->kregbase && ppd->lid && + (ppd->lflags & QIBL_LINKACTIVE); +} + +/* + * Select a context on the given device, either using a requested port + * or the port based on the context number. + */ +static int choose_port_ctxt(struct file *fp, struct qib_devdata *dd, u32 port, + const struct qib_user_info *uinfo) +{ + struct qib_pportdata *ppd = NULL; + int ret, ctxt; + + if (port) { + if (!usable(dd->pport + port - 1)) { + ret = -ENETDOWN; + goto done; + } else + ppd = dd->pport + port - 1; + } + for (ctxt = dd->first_user_ctxt; ctxt < dd->cfgctxts && dd->rcd[ctxt]; + ctxt++) + ; + if (ctxt == dd->cfgctxts) { + ret = -EBUSY; + goto done; + } + if (!ppd) { + u32 pidx = ctxt % dd->num_pports; + + if (usable(dd->pport + pidx)) + ppd = dd->pport + pidx; + else { + for (pidx = 0; pidx < dd->num_pports && !ppd; + pidx++) + if (usable(dd->pport + pidx)) + ppd = dd->pport + pidx; + } + } + ret = ppd ? 
setup_ctxt(ppd, ctxt, fp, uinfo) : -ENETDOWN; +done: + return ret; +} + +static int find_free_ctxt(int unit, struct file *fp, + const struct qib_user_info *uinfo) +{ + struct qib_devdata *dd = qib_lookup(unit); + int ret; + + if (!dd || (uinfo->spu_port && uinfo->spu_port > dd->num_pports)) + ret = -ENODEV; + else + ret = choose_port_ctxt(fp, dd, uinfo->spu_port, uinfo); + + return ret; +} + +static int get_a_ctxt(struct file *fp, const struct qib_user_info *uinfo, + unsigned alg) +{ + struct qib_devdata *udd = NULL; + int ret = 0, devmax, npresent, nup, ndev, dusable = 0, i; + u32 port = uinfo->spu_port, ctxt; + + devmax = qib_count_units(&npresent, &nup); + if (!npresent) { + ret = -ENXIO; + goto done; + } + if (nup == 0) { + ret = -ENETDOWN; + goto done; + } + + if (alg == QIB_PORT_ALG_ACROSS) { + unsigned inuse = ~0U; + + /* find device (with ACTIVE ports) with fewest ctxts in use */ + for (ndev = 0; ndev < devmax; ndev++) { + struct qib_devdata *dd = qib_lookup(ndev); + unsigned cused = 0, cfree = 0, pusable = 0; + + if (!dd) + continue; + if (port && port <= dd->num_pports && + usable(dd->pport + port - 1)) + pusable = 1; + else + for (i = 0; i < dd->num_pports; i++) + if (usable(dd->pport + i)) + pusable++; + if (!pusable) + continue; + for (ctxt = dd->first_user_ctxt; ctxt < dd->cfgctxts; + ctxt++) + if (dd->rcd[ctxt]) + cused++; + else + cfree++; + if (cfree && cused < inuse) { + udd = dd; + inuse = cused; + } + } + if (udd) { + ret = choose_port_ctxt(fp, udd, port, uinfo); + goto done; + } + } else { + for (ndev = 0; ndev < devmax; ndev++) { + struct qib_devdata *dd = qib_lookup(ndev); + + if (dd) { + ret = choose_port_ctxt(fp, dd, port, uinfo); + if (!ret) + goto done; + if (ret == -EBUSY) + dusable++; + } + } + } + ret = dusable ? -EBUSY : -ENETDOWN; + +done: + return ret; +} + +static int find_shared_ctxt(struct file *fp, + const struct qib_user_info *uinfo) +{ + int devmax, ndev, i; + int ret = 0; + + devmax = qib_count_units(NULL, NULL); + + for (ndev = 0; ndev < devmax; ndev++) { + struct qib_devdata *dd = qib_lookup(ndev); + + /* device portion of usable() */ + if (!(dd && (dd->flags & QIB_PRESENT) && dd->kregbase)) + continue; + for (i = dd->first_user_ctxt; i < dd->cfgctxts; i++) { + struct qib_ctxtdata *rcd = dd->rcd[i]; + + /* Skip ctxts which are not yet open */ + if (!rcd || !rcd->cnt) + continue; + /* Skip ctxt if it doesn't match the requested one */ + if (rcd->subctxt_id != uinfo->spu_subctxt_id) + continue; + /* Verify the sharing process matches the master */ + if (rcd->subctxt_cnt != uinfo->spu_subctxt_cnt || + rcd->userversion != uinfo->spu_userversion || + rcd->cnt >= rcd->subctxt_cnt) { + ret = -EINVAL; + goto done; + } + ctxt_fp(fp) = rcd; + subctxt_fp(fp) = rcd->cnt++; + rcd->subpid[subctxt_fp(fp)] = current->pid; + tidcursor_fp(fp) = 0; + rcd->active_slaves |= 1 << subctxt_fp(fp); + ret = 1; + goto done; + } + } + +done: + return ret; +} + +static int qib_open(struct inode *in, struct file *fp) +{ + /* The real work is performed later in qib_assign_ctxt() */ + fp->private_data = kzalloc(sizeof(struct qib_filedata), GFP_KERNEL); + if (fp->private_data) /* no cpu affinity by default */ + ((struct qib_filedata *)fp->private_data)->rec_cpu_num = -1; + return fp->private_data ? 
0 : -ENOMEM; +} + +static int find_hca(unsigned int cpu, int *unit) +{ + int ret = 0, devmax, npresent, nup, ndev; + + *unit = -1; + + devmax = qib_count_units(&npresent, &nup); + if (!npresent) { + ret = -ENXIO; + goto done; + } + if (!nup) { + ret = -ENETDOWN; + goto done; + } + for (ndev = 0; ndev < devmax; ndev++) { + struct qib_devdata *dd = qib_lookup(ndev); + + if (dd) { + if (pcibus_to_node(dd->pcidev->bus) < 0) { + ret = -EINVAL; + goto done; + } + if (cpu_to_node(cpu) == + pcibus_to_node(dd->pcidev->bus)) { + *unit = ndev; + goto done; + } + } + } +done: + return ret; +} + +static int do_qib_user_sdma_queue_create(struct file *fp) +{ + struct qib_filedata *fd = fp->private_data; + struct qib_ctxtdata *rcd = fd->rcd; + struct qib_devdata *dd = rcd->dd; + + if (dd->flags & QIB_HAS_SEND_DMA) { + + fd->pq = qib_user_sdma_queue_create(&dd->pcidev->dev, + dd->unit, + rcd->ctxt, + fd->subctxt); + if (!fd->pq) + return -ENOMEM; + } + + return 0; +} + +/* + * Get ctxt early, so can set affinity prior to memory allocation. + */ +static int qib_assign_ctxt(struct file *fp, const struct qib_user_info *uinfo) +{ + int ret; + int i_minor; + unsigned swmajor, swminor, alg = QIB_PORT_ALG_ACROSS; + + /* Check to be sure we haven't already initialized this file */ + if (ctxt_fp(fp)) { + ret = -EINVAL; + goto done; + } + + /* for now, if major version is different, bail */ + swmajor = uinfo->spu_userversion >> 16; + if (swmajor != QIB_USER_SWMAJOR) { + ret = -ENODEV; + goto done; + } + + swminor = uinfo->spu_userversion & 0xffff; + + if (swminor >= 11 && uinfo->spu_port_alg < QIB_PORT_ALG_COUNT) + alg = uinfo->spu_port_alg; + + mutex_lock(&qib_mutex); + + if (qib_compatible_subctxts(swmajor, swminor) && + uinfo->spu_subctxt_cnt) { + ret = find_shared_ctxt(fp, uinfo); + if (ret > 0) { + ret = do_qib_user_sdma_queue_create(fp); + if (!ret) + assign_ctxt_affinity(fp, (ctxt_fp(fp))->dd); + goto done_ok; + } + } + + i_minor = iminor(file_inode(fp)) - QIB_USER_MINOR_BASE; + if (i_minor) + ret = find_free_ctxt(i_minor - 1, fp, uinfo); + else { + int unit; + const unsigned int cpu = cpumask_first(&current->cpus_allowed); + const unsigned int weight = + cpumask_weight(&current->cpus_allowed); + + if (weight == 1 && !test_bit(cpu, qib_cpulist)) + if (!find_hca(cpu, &unit) && unit >= 0) + if (!find_free_ctxt(unit, fp, uinfo)) { + ret = 0; + goto done_chk_sdma; + } + ret = get_a_ctxt(fp, uinfo, alg); + } + +done_chk_sdma: + if (!ret) + ret = do_qib_user_sdma_queue_create(fp); +done_ok: + mutex_unlock(&qib_mutex); + +done: + return ret; +} + + +static int qib_do_user_init(struct file *fp, + const struct qib_user_info *uinfo) +{ + int ret; + struct qib_ctxtdata *rcd = ctxt_fp(fp); + struct qib_devdata *dd; + unsigned uctxt; + + /* Subctxts don't need to initialize anything since master did it. */ + if (subctxt_fp(fp)) { + ret = wait_event_interruptible(rcd->wait, + !test_bit(QIB_CTXT_MASTER_UNINIT, &rcd->flag)); + goto bail; + } + + dd = rcd->dd; + + /* some ctxts may get extra buffers, calculate that here */ + uctxt = rcd->ctxt - dd->first_user_ctxt; + if (uctxt < dd->ctxts_extrabuf) { + rcd->piocnt = dd->pbufsctxt + 1; + rcd->pio_base = rcd->piocnt * uctxt; + } else { + rcd->piocnt = dd->pbufsctxt; + rcd->pio_base = rcd->piocnt * uctxt + + dd->ctxts_extrabuf; + } + + /* + * All user buffers are 2KB buffers. If we ever support + * giving 4KB buffers to user processes, this will need some + * work. Can't use piobufbase directly, because it has + * both 2K and 4K buffer base values. So check and handle.
+ */ + if ((rcd->pio_base + rcd->piocnt) > dd->piobcnt2k) { + if (rcd->pio_base >= dd->piobcnt2k) { + qib_dev_err(dd, + "%u:ctxt%u: no 2KB buffers available\n", + dd->unit, rcd->ctxt); + ret = -ENOBUFS; + goto bail; + } + rcd->piocnt = dd->piobcnt2k - rcd->pio_base; + qib_dev_err(dd, "Ctxt%u: would use 4KB bufs, using %u\n", + rcd->ctxt, rcd->piocnt); + } + + rcd->piobufs = dd->pio2k_bufbase + rcd->pio_base * dd->palign; + qib_chg_pioavailkernel(dd, rcd->pio_base, rcd->piocnt, + TXCHK_CHG_TYPE_USER, rcd); + /* + * try to ensure that processes start up with consistent avail update + * for their own range, at least. If system very quiet, it might + * have the in-memory copy out of date at startup for this range of + * buffers, when a context gets re-used. Do after the chg_pioavail + * and before the rest of setup, so it's "almost certain" the dma + * will have occurred (can't 100% guarantee, but should be many + * decimals of 9s, with this ordering), given how much else happens + * after this. + */ + dd->f_sendctrl(dd->pport, QIB_SENDCTRL_AVAIL_BLIP); + + /* + * Now allocate the rcvhdr Q and eager TIDs; skip the TID + * array for time being. If rcd->ctxt > chip-supported, + * we need to do extra stuff here to handle by handling overflow + * through ctxt 0, someday + */ + ret = qib_create_rcvhdrq(dd, rcd); + if (!ret) + ret = qib_setup_eagerbufs(rcd); + if (ret) + goto bail_pio; + + rcd->tidcursor = 0; /* start at beginning after open */ + + /* initialize poll variables... */ + rcd->urgent = 0; + rcd->urgent_poll = 0; + + /* + * Now enable the ctxt for receive. + * For chips that are set to DMA the tail register to memory + * when they change (and when the update bit transitions from + * 0 to 1. So for those chips, we turn it off and then back on. + * This will (very briefly) affect any other open ctxts, but the + * duration is very short, and therefore isn't an issue. We + * explicitly set the in-memory tail copy to 0 beforehand, so we + * don't have to wait to be sure the DMA update has happened + * (chip resets head/tail to 0 on transition to enable). + */ + if (rcd->rcvhdrtail_kvaddr) + qib_clear_rcvhdrtail(rcd); + + dd->f_rcvctrl(rcd->ppd, QIB_RCVCTRL_CTXT_ENB | QIB_RCVCTRL_TIDFLOW_ENB, + rcd->ctxt); + + /* Notify any waiting slaves */ + if (rcd->subctxt_cnt) { + clear_bit(QIB_CTXT_MASTER_UNINIT, &rcd->flag); + wake_up(&rcd->wait); + } + return 0; + +bail_pio: + qib_chg_pioavailkernel(dd, rcd->pio_base, rcd->piocnt, + TXCHK_CHG_TYPE_KERN, rcd); +bail: + return ret; +} + +/** + * unlock_exptid - unlock any expected TID entries context still had in use + * @rcd: ctxt + * + * We don't actually update the chip here, because we do a bulk update + * below, using f_clear_tids. 
+ */ +static void unlock_expected_tids(struct qib_ctxtdata *rcd) +{ + struct qib_devdata *dd = rcd->dd; + int ctxt_tidbase = rcd->ctxt * dd->rcvtidcnt; + int i, cnt = 0, maxtid = ctxt_tidbase + dd->rcvtidcnt; + + for (i = ctxt_tidbase; i < maxtid; i++) { + struct page *p = dd->pageshadow[i]; + dma_addr_t phys; + + if (!p) + continue; + + phys = dd->physshadow[i]; + dd->physshadow[i] = dd->tidinvalid; + dd->pageshadow[i] = NULL; + pci_unmap_page(dd->pcidev, phys, PAGE_SIZE, + PCI_DMA_FROMDEVICE); + qib_release_user_pages(&p, 1); + cnt++; + } +} + +static int qib_close(struct inode *in, struct file *fp) +{ + int ret = 0; + struct qib_filedata *fd; + struct qib_ctxtdata *rcd; + struct qib_devdata *dd; + unsigned long flags; + unsigned ctxt; + pid_t pid; + + mutex_lock(&qib_mutex); + + fd = fp->private_data; + fp->private_data = NULL; + rcd = fd->rcd; + if (!rcd) { + mutex_unlock(&qib_mutex); + goto bail; + } + + dd = rcd->dd; + + /* ensure all pio buffer writes in progress are flushed */ + qib_flush_wc(); + + /* drain user sdma queue */ + if (fd->pq) { + qib_user_sdma_queue_drain(rcd->ppd, fd->pq); + qib_user_sdma_queue_destroy(fd->pq); + } + + if (fd->rec_cpu_num != -1) + __clear_bit(fd->rec_cpu_num, qib_cpulist); + + if (--rcd->cnt) { + /* + * XXX If the master closes the context before the slave(s), + * revoke the mmap for the eager receive queue so + * the slave(s) don't wait for receive data forever. + */ + rcd->active_slaves &= ~(1 << fd->subctxt); + rcd->subpid[fd->subctxt] = 0; + mutex_unlock(&qib_mutex); + goto bail; + } + + /* early; no interrupt users after this */ + spin_lock_irqsave(&dd->uctxt_lock, flags); + ctxt = rcd->ctxt; + dd->rcd[ctxt] = NULL; + pid = rcd->pid; + rcd->pid = 0; + spin_unlock_irqrestore(&dd->uctxt_lock, flags); + + if (rcd->rcvwait_to || rcd->piowait_to || + rcd->rcvnowait || rcd->pionowait) { + rcd->rcvwait_to = 0; + rcd->piowait_to = 0; + rcd->rcvnowait = 0; + rcd->pionowait = 0; + } + if (rcd->flag) + rcd->flag = 0; + + if (dd->kregbase) { + /* atomically clear receive enable ctxt and intr avail. */ + dd->f_rcvctrl(rcd->ppd, QIB_RCVCTRL_CTXT_DIS | + QIB_RCVCTRL_INTRAVAIL_DIS, ctxt); + + /* clean up the pkeys for this ctxt user */ + qib_clean_part_key(rcd, dd); + qib_disarm_piobufs(dd, rcd->pio_base, rcd->piocnt); + qib_chg_pioavailkernel(dd, rcd->pio_base, + rcd->piocnt, TXCHK_CHG_TYPE_KERN, NULL); + + dd->f_clear_tids(dd, rcd); + + if (dd->pageshadow) + unlock_expected_tids(rcd); + qib_stats.sps_ctxts--; + dd->freectxts++; + } + + mutex_unlock(&qib_mutex); + qib_free_ctxtdata(dd, rcd); /* after releasing the mutex */ + +bail: + kfree(fd); + return ret; +} + +static int qib_ctxt_info(struct file *fp, struct qib_ctxt_info __user *uinfo) +{ + struct qib_ctxt_info info; + int ret; + size_t sz; + struct qib_ctxtdata *rcd = ctxt_fp(fp); + struct qib_filedata *fd; + + fd = fp->private_data; + + info.num_active = qib_count_active_units(); + info.unit = rcd->dd->unit; + info.port = rcd->ppd->port; + info.ctxt = rcd->ctxt; + info.subctxt = subctxt_fp(fp); + /* Number of user ctxts available for this device. 
*/ + info.num_ctxts = rcd->dd->cfgctxts - rcd->dd->first_user_ctxt; + info.num_subctxts = rcd->subctxt_cnt; + info.rec_cpu = fd->rec_cpu_num; + sz = sizeof(info); + + if (copy_to_user(uinfo, &info, sz)) { + ret = -EFAULT; + goto bail; + } + ret = 0; + +bail: + return ret; +} + +static int qib_sdma_get_inflight(struct qib_user_sdma_queue *pq, + u32 __user *inflightp) +{ + const u32 val = qib_user_sdma_inflight_counter(pq); + + if (put_user(val, inflightp)) + return -EFAULT; + + return 0; +} + +static int qib_sdma_get_complete(struct qib_pportdata *ppd, + struct qib_user_sdma_queue *pq, + u32 __user *completep) +{ + u32 val; + int err; + + if (!pq) + return -EINVAL; + + err = qib_user_sdma_make_progress(ppd, pq); + if (err < 0) + return err; + + val = qib_user_sdma_complete_counter(pq); + if (put_user(val, completep)) + return -EFAULT; + + return 0; +} + +static int disarm_req_delay(struct qib_ctxtdata *rcd) +{ + int ret = 0; + + if (!usable(rcd->ppd)) { + int i; + /* + * if link is down, or otherwise not usable, delay + * the caller up to 30 seconds, so we don't thrash + * in trying to get the chip back to ACTIVE, and + * set flag so they make the call again. + */ + if (rcd->user_event_mask) { + /* + * subctxt_cnt is 0 if not shared, so do base + * separately, first, then remaining subctxt, if any + */ + set_bit(_QIB_EVENT_DISARM_BUFS_BIT, + &rcd->user_event_mask[0]); + for (i = 1; i < rcd->subctxt_cnt; i++) + set_bit(_QIB_EVENT_DISARM_BUFS_BIT, + &rcd->user_event_mask[i]); + } + for (i = 0; !usable(rcd->ppd) && i < 300; i++) + msleep(100); + ret = -ENETDOWN; + } + return ret; +} + +/* + * Find all user contexts in use, and set the specified bit in their + * event mask. + * See also find_ctxt() for a similar use, that is specific to send buffers. + */ +int qib_set_uevent_bits(struct qib_pportdata *ppd, const int evtbit) +{ + struct qib_ctxtdata *rcd; + unsigned ctxt; + int ret = 0; + unsigned long flags; + + spin_lock_irqsave(&ppd->dd->uctxt_lock, flags); + for (ctxt = ppd->dd->first_user_ctxt; ctxt < ppd->dd->cfgctxts; + ctxt++) { + rcd = ppd->dd->rcd[ctxt]; + if (!rcd) + continue; + if (rcd->user_event_mask) { + int i; + /* + * subctxt_cnt is 0 if not shared, so do base + * separately, first, then remaining subctxt, if any + */ + set_bit(evtbit, &rcd->user_event_mask[0]); + for (i = 1; i < rcd->subctxt_cnt; i++) + set_bit(evtbit, &rcd->user_event_mask[i]); + } + ret = 1; + break; + } + spin_unlock_irqrestore(&ppd->dd->uctxt_lock, flags); + + return ret; +} + +/* + * clear the event notifier events for this context. + * For the DISARM_BUFS case, we also take action (this obsoletes + * the older QIB_CMD_DISARM_BUFS, but we keep it for backwards + * compatibility. + * Other bits don't currently require actions, just atomically clear. + * User process then performs actions appropriate to bit having been + * set, if desired, and checks again in future. 
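+ * Note the DISARM_BUFS bit is not cleared here; disarm_req_delay()
+ * may set it again (and sleep up to 30 seconds) if the link is not
+ * yet usable, so the process knows to retry the disarm later.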
+ */ +static int qib_user_event_ack(struct qib_ctxtdata *rcd, int subctxt, + unsigned long events) +{ + int ret = 0, i; + + for (i = 0; i <= _QIB_MAX_EVENT_BIT; i++) { + if (!test_bit(i, &events)) + continue; + if (i == _QIB_EVENT_DISARM_BUFS_BIT) { + (void)qib_disarm_piobufs_ifneeded(rcd); + ret = disarm_req_delay(rcd); + } else + clear_bit(i, &rcd->user_event_mask[subctxt]); + } + return ret; +} + +static ssize_t qib_write(struct file *fp, const char __user *data, + size_t count, loff_t *off) +{ + const struct qib_cmd __user *ucmd; + struct qib_ctxtdata *rcd; + const void __user *src; + size_t consumed, copy = 0; + struct qib_cmd cmd; + ssize_t ret = 0; + void *dest; + + if (count < sizeof(cmd.type)) { + ret = -EINVAL; + goto bail; + } + + ucmd = (const struct qib_cmd __user *) data; + + if (copy_from_user(&cmd.type, &ucmd->type, sizeof(cmd.type))) { + ret = -EFAULT; + goto bail; + } + + consumed = sizeof(cmd.type); + + switch (cmd.type) { + case QIB_CMD_ASSIGN_CTXT: + case QIB_CMD_USER_INIT: + copy = sizeof(cmd.cmd.user_info); + dest = &cmd.cmd.user_info; + src = &ucmd->cmd.user_info; + break; + + case QIB_CMD_RECV_CTRL: + copy = sizeof(cmd.cmd.recv_ctrl); + dest = &cmd.cmd.recv_ctrl; + src = &ucmd->cmd.recv_ctrl; + break; + + case QIB_CMD_CTXT_INFO: + copy = sizeof(cmd.cmd.ctxt_info); + dest = &cmd.cmd.ctxt_info; + src = &ucmd->cmd.ctxt_info; + break; + + case QIB_CMD_TID_UPDATE: + case QIB_CMD_TID_FREE: + copy = sizeof(cmd.cmd.tid_info); + dest = &cmd.cmd.tid_info; + src = &ucmd->cmd.tid_info; + break; + + case QIB_CMD_SET_PART_KEY: + copy = sizeof(cmd.cmd.part_key); + dest = &cmd.cmd.part_key; + src = &ucmd->cmd.part_key; + break; + + case QIB_CMD_DISARM_BUFS: + case QIB_CMD_PIOAVAILUPD: /* force an update of PIOAvail reg */ + copy = 0; + src = NULL; + dest = NULL; + break; + + case QIB_CMD_POLL_TYPE: + copy = sizeof(cmd.cmd.poll_type); + dest = &cmd.cmd.poll_type; + src = &ucmd->cmd.poll_type; + break; + + case QIB_CMD_ARMLAUNCH_CTRL: + copy = sizeof(cmd.cmd.armlaunch_ctrl); + dest = &cmd.cmd.armlaunch_ctrl; + src = &ucmd->cmd.armlaunch_ctrl; + break; + + case QIB_CMD_SDMA_INFLIGHT: + copy = sizeof(cmd.cmd.sdma_inflight); + dest = &cmd.cmd.sdma_inflight; + src = &ucmd->cmd.sdma_inflight; + break; + + case QIB_CMD_SDMA_COMPLETE: + copy = sizeof(cmd.cmd.sdma_complete); + dest = &cmd.cmd.sdma_complete; + src = &ucmd->cmd.sdma_complete; + break; + + case QIB_CMD_ACK_EVENT: + copy = sizeof(cmd.cmd.event_mask); + dest = &cmd.cmd.event_mask; + src = &ucmd->cmd.event_mask; + break; + + default: + ret = -EINVAL; + goto bail; + } + + if (copy) { + if ((count - consumed) < copy) { + ret = -EINVAL; + goto bail; + } + if (copy_from_user(dest, src, copy)) { + ret = -EFAULT; + goto bail; + } + consumed += copy; + } + + rcd = ctxt_fp(fp); + if (!rcd && cmd.type != QIB_CMD_ASSIGN_CTXT) { + ret = -EINVAL; + goto bail; + } + + switch (cmd.type) { + case QIB_CMD_ASSIGN_CTXT: + ret = qib_assign_ctxt(fp, &cmd.cmd.user_info); + if (ret) + goto bail; + break; + + case QIB_CMD_USER_INIT: + ret = qib_do_user_init(fp, &cmd.cmd.user_info); + if (ret) + goto bail; + ret = qib_get_base_info(fp, (void __user *) (unsigned long) + cmd.cmd.user_info.spu_base_info, + cmd.cmd.user_info.spu_base_info_size); + break; + + case QIB_CMD_RECV_CTRL: + ret = qib_manage_rcvq(rcd, subctxt_fp(fp), cmd.cmd.recv_ctrl); + break; + + case QIB_CMD_CTXT_INFO: + ret = qib_ctxt_info(fp, (struct qib_ctxt_info __user *) + (unsigned long) cmd.cmd.ctxt_info); + break; + + case QIB_CMD_TID_UPDATE: + ret = qib_tid_update(rcd, fp, 
&cmd.cmd.tid_info); + break; + + case QIB_CMD_TID_FREE: + ret = qib_tid_free(rcd, subctxt_fp(fp), &cmd.cmd.tid_info); + break; + + case QIB_CMD_SET_PART_KEY: + ret = qib_set_part_key(rcd, cmd.cmd.part_key); + break; + + case QIB_CMD_DISARM_BUFS: + (void)qib_disarm_piobufs_ifneeded(rcd); + ret = disarm_req_delay(rcd); + break; + + case QIB_CMD_PIOAVAILUPD: + qib_force_pio_avail_update(rcd->dd); + break; + + case QIB_CMD_POLL_TYPE: + rcd->poll_type = cmd.cmd.poll_type; + break; + + case QIB_CMD_ARMLAUNCH_CTRL: + rcd->dd->f_set_armlaunch(rcd->dd, cmd.cmd.armlaunch_ctrl); + break; + + case QIB_CMD_SDMA_INFLIGHT: + ret = qib_sdma_get_inflight(user_sdma_queue_fp(fp), + (u32 __user *) (unsigned long) + cmd.cmd.sdma_inflight); + break; + + case QIB_CMD_SDMA_COMPLETE: + ret = qib_sdma_get_complete(rcd->ppd, + user_sdma_queue_fp(fp), + (u32 __user *) (unsigned long) + cmd.cmd.sdma_complete); + break; + + case QIB_CMD_ACK_EVENT: + ret = qib_user_event_ack(rcd, subctxt_fp(fp), + cmd.cmd.event_mask); + break; + } + + if (ret >= 0) + ret = consumed; + +bail: + return ret; +} + +static ssize_t qib_write_iter(struct kiocb *iocb, struct iov_iter *from) +{ + struct qib_filedata *fp = iocb->ki_filp->private_data; + struct qib_ctxtdata *rcd = ctxt_fp(iocb->ki_filp); + struct qib_user_sdma_queue *pq = fp->pq; + + if (!iter_is_iovec(from) || !from->nr_segs || !pq) + return -EINVAL; + + return qib_user_sdma_writev(rcd, pq, from->iov, from->nr_segs); +} + +static struct class *qib_class; +static dev_t qib_dev; + +int qib_cdev_init(int minor, const char *name, + const struct file_operations *fops, + struct cdev **cdevp, struct device **devp) +{ + const dev_t dev = MKDEV(MAJOR(qib_dev), minor); + struct cdev *cdev; + struct device *device = NULL; + int ret; + + cdev = cdev_alloc(); + if (!cdev) { + pr_err("Could not allocate cdev for minor %d, %s\n", + minor, name); + ret = -ENOMEM; + goto done; + } + + cdev->owner = THIS_MODULE; + cdev->ops = fops; + kobject_set_name(&cdev->kobj, name); + + ret = cdev_add(cdev, dev, 1); + if (ret < 0) { + pr_err("Could not add cdev for minor %d, %s (err %d)\n", + minor, name, -ret); + goto err_cdev; + } + + device = device_create(qib_class, NULL, dev, NULL, "%s", name); + if (!IS_ERR(device)) + goto done; + ret = PTR_ERR(device); + device = NULL; + pr_err("Could not create device for minor %d, %s (err %d)\n", + minor, name, -ret); +err_cdev: + cdev_del(cdev); + cdev = NULL; +done: + *cdevp = cdev; + *devp = device; + return ret; +} + +void qib_cdev_cleanup(struct cdev **cdevp, struct device **devp) +{ + struct device *device = *devp; + + if (device) { + device_unregister(device); + *devp = NULL; + } + + if (*cdevp) { + cdev_del(*cdevp); + *cdevp = NULL; + } +} + +static struct cdev *wildcard_cdev; +static struct device *wildcard_device; + +int __init qib_dev_init(void) +{ + int ret; + + ret = alloc_chrdev_region(&qib_dev, 0, QIB_NMINORS, QIB_DRV_NAME); + if (ret < 0) { + pr_err("Could not allocate chrdev region (err %d)\n", -ret); + goto done; + } + + qib_class = class_create(THIS_MODULE, "ipath"); + if (IS_ERR(qib_class)) { + ret = PTR_ERR(qib_class); + pr_err("Could not create device class (err %d)\n", -ret); + unregister_chrdev_region(qib_dev, QIB_NMINORS); + } + +done: + return ret; +} + +void qib_dev_cleanup(void) +{ + if (qib_class) { + class_destroy(qib_class); + qib_class = NULL; + } + + unregister_chrdev_region(qib_dev, QIB_NMINORS); +} + +static atomic_t user_count = ATOMIC_INIT(0); + +static void qib_user_remove(struct qib_devdata *dd) +{ + if 
(atomic_dec_return(&user_count) == 0) + qib_cdev_cleanup(&wildcard_cdev, &wildcard_device); + + qib_cdev_cleanup(&dd->user_cdev, &dd->user_device); +} + +static int qib_user_add(struct qib_devdata *dd) +{ + char name[10]; + int ret; + + if (atomic_inc_return(&user_count) == 1) { + ret = qib_cdev_init(0, "ipath", &qib_file_ops, + &wildcard_cdev, &wildcard_device); + if (ret) + goto done; + } + + snprintf(name, sizeof(name), "ipath%d", dd->unit); + ret = qib_cdev_init(dd->unit + 1, name, &qib_file_ops, + &dd->user_cdev, &dd->user_device); + if (ret) + qib_user_remove(dd); +done: + return ret; +} + +/* + * Create per-unit files in /dev + */ +int qib_device_create(struct qib_devdata *dd) +{ + int r, ret; + + r = qib_user_add(dd); + ret = qib_diag_add(dd); + if (r && !ret) + ret = r; + return ret; +} + +/* + * Remove per-unit files in /dev + * void, core kernel returns no errors for this stuff + */ +void qib_device_remove(struct qib_devdata *dd) +{ + qib_user_remove(dd); + qib_diag_remove(dd); +} diff --git a/kernel/drivers/infiniband/hw/qib/qib_fs.c b/kernel/drivers/infiniband/hw/qib/qib_fs.c new file mode 100644 index 000000000..bdd5d3857 --- /dev/null +++ b/kernel/drivers/infiniband/hw/qib/qib_fs.c @@ -0,0 +1,621 @@ +/* + * Copyright (c) 2012 Intel Corporation. All rights reserved. + * Copyright (c) 2006 - 2012 QLogic Corporation. All rights reserved. + * Copyright (c) 2006 PathScale, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include +#include +#include +#include +#include +#include + +#include "qib.h" + +#define QIBFS_MAGIC 0x726a77 + +static struct super_block *qib_super; + +#define private2dd(file) (file_inode(file)->i_private) + +static int qibfs_mknod(struct inode *dir, struct dentry *dentry, + umode_t mode, const struct file_operations *fops, + void *data) +{ + int error; + struct inode *inode = new_inode(dir->i_sb); + + if (!inode) { + error = -EPERM; + goto bail; + } + + inode->i_ino = get_next_ino(); + inode->i_mode = mode; + inode->i_uid = GLOBAL_ROOT_UID; + inode->i_gid = GLOBAL_ROOT_GID; + inode->i_blocks = 0; + inode->i_atime = CURRENT_TIME; + inode->i_mtime = inode->i_atime; + inode->i_ctime = inode->i_atime; + inode->i_private = data; + if (S_ISDIR(mode)) { + inode->i_op = &simple_dir_inode_operations; + inc_nlink(inode); + inc_nlink(dir); + } + + inode->i_fop = fops; + + d_instantiate(dentry, inode); + error = 0; + +bail: + return error; +} + +static int create_file(const char *name, umode_t mode, + struct dentry *parent, struct dentry **dentry, + const struct file_operations *fops, void *data) +{ + int error; + + mutex_lock(&d_inode(parent)->i_mutex); + *dentry = lookup_one_len(name, parent, strlen(name)); + if (!IS_ERR(*dentry)) + error = qibfs_mknod(d_inode(parent), *dentry, + mode, fops, data); + else + error = PTR_ERR(*dentry); + mutex_unlock(&d_inode(parent)->i_mutex); + + return error; +} + +static ssize_t driver_stats_read(struct file *file, char __user *buf, + size_t count, loff_t *ppos) +{ + qib_stats.sps_ints = qib_sps_ints(); + return simple_read_from_buffer(buf, count, ppos, &qib_stats, + sizeof(qib_stats)); +} + +/* + * driver stats field names, one line per stat, single string. Used by + * programs like ipathstats to print the stats in a way which works for + * different versions of drivers, without changing program source. + * if qlogic_ib_stats changes, this needs to change. Names need to be + * 12 chars or less (w/o newline), for proper display by ipathstats utility. 
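+ * Each name lines up positionally with a u64 counter in the qib_stats
+ * struct returned by driver_stats_read() above, so the order here has
+ * to track the field order of that struct.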
+ */ +static const char qib_statnames[] = + "KernIntr\n" + "ErrorIntr\n" + "Tx_Errs\n" + "Rcv_Errs\n" + "H/W_Errs\n" + "NoPIOBufs\n" + "CtxtsOpen\n" + "RcvLen_Errs\n" + "EgrBufFull\n" + "EgrHdrFull\n" + ; + +static ssize_t driver_names_read(struct file *file, char __user *buf, + size_t count, loff_t *ppos) +{ + return simple_read_from_buffer(buf, count, ppos, qib_statnames, + sizeof(qib_statnames) - 1); /* no null */ +} + +static const struct file_operations driver_ops[] = { + { .read = driver_stats_read, .llseek = generic_file_llseek, }, + { .read = driver_names_read, .llseek = generic_file_llseek, }, +}; + +/* read the per-device counters */ +static ssize_t dev_counters_read(struct file *file, char __user *buf, + size_t count, loff_t *ppos) +{ + u64 *counters; + size_t avail; + struct qib_devdata *dd = private2dd(file); + + avail = dd->f_read_cntrs(dd, *ppos, NULL, &counters); + return simple_read_from_buffer(buf, count, ppos, counters, avail); +} + +/* read the per-device counters */ +static ssize_t dev_names_read(struct file *file, char __user *buf, + size_t count, loff_t *ppos) +{ + char *names; + size_t avail; + struct qib_devdata *dd = private2dd(file); + + avail = dd->f_read_cntrs(dd, *ppos, &names, NULL); + return simple_read_from_buffer(buf, count, ppos, names, avail); +} + +static const struct file_operations cntr_ops[] = { + { .read = dev_counters_read, .llseek = generic_file_llseek, }, + { .read = dev_names_read, .llseek = generic_file_llseek, }, +}; + +/* + * Could use file_inode(file)->i_ino to figure out which file, + * instead of separate routine for each, but for now, this works... + */ + +/* read the per-port names (same for each port) */ +static ssize_t portnames_read(struct file *file, char __user *buf, + size_t count, loff_t *ppos) +{ + char *names; + size_t avail; + struct qib_devdata *dd = private2dd(file); + + avail = dd->f_read_portcntrs(dd, *ppos, 0, &names, NULL); + return simple_read_from_buffer(buf, count, ppos, names, avail); +} + +/* read the per-port counters for port 1 (pidx 0) */ +static ssize_t portcntrs_1_read(struct file *file, char __user *buf, + size_t count, loff_t *ppos) +{ + u64 *counters; + size_t avail; + struct qib_devdata *dd = private2dd(file); + + avail = dd->f_read_portcntrs(dd, *ppos, 0, NULL, &counters); + return simple_read_from_buffer(buf, count, ppos, counters, avail); +} + +/* read the per-port counters for port 2 (pidx 1) */ +static ssize_t portcntrs_2_read(struct file *file, char __user *buf, + size_t count, loff_t *ppos) +{ + u64 *counters; + size_t avail; + struct qib_devdata *dd = private2dd(file); + + avail = dd->f_read_portcntrs(dd, *ppos, 1, NULL, &counters); + return simple_read_from_buffer(buf, count, ppos, counters, avail); +} + +static const struct file_operations portcntr_ops[] = { + { .read = portnames_read, .llseek = generic_file_llseek, }, + { .read = portcntrs_1_read, .llseek = generic_file_llseek, }, + { .read = portcntrs_2_read, .llseek = generic_file_llseek, }, +}; + +/* + * read the per-port QSFP data for port 1 (pidx 0) + */ +static ssize_t qsfp_1_read(struct file *file, char __user *buf, + size_t count, loff_t *ppos) +{ + struct qib_devdata *dd = private2dd(file); + char *tmp; + int ret; + + tmp = kmalloc(PAGE_SIZE, GFP_KERNEL); + if (!tmp) + return -ENOMEM; + + ret = qib_qsfp_dump(dd->pport, tmp, PAGE_SIZE); + if (ret > 0) + ret = simple_read_from_buffer(buf, count, ppos, tmp, ret); + kfree(tmp); + return ret; +} + +/* + * read the per-port QSFP data for port 2 (pidx 1) + */ +static ssize_t qsfp_2_read(struct 
file *file, char __user *buf, + size_t count, loff_t *ppos) +{ + struct qib_devdata *dd = private2dd(file); + char *tmp; + int ret; + + if (dd->num_pports < 2) + return -ENODEV; + + tmp = kmalloc(PAGE_SIZE, GFP_KERNEL); + if (!tmp) + return -ENOMEM; + + ret = qib_qsfp_dump(dd->pport + 1, tmp, PAGE_SIZE); + if (ret > 0) + ret = simple_read_from_buffer(buf, count, ppos, tmp, ret); + kfree(tmp); + return ret; +} + +static const struct file_operations qsfp_ops[] = { + { .read = qsfp_1_read, .llseek = generic_file_llseek, }, + { .read = qsfp_2_read, .llseek = generic_file_llseek, }, +}; + +static ssize_t flash_read(struct file *file, char __user *buf, + size_t count, loff_t *ppos) +{ + struct qib_devdata *dd; + ssize_t ret; + loff_t pos; + char *tmp; + + pos = *ppos; + + if (pos < 0) { + ret = -EINVAL; + goto bail; + } + + if (pos >= sizeof(struct qib_flash)) { + ret = 0; + goto bail; + } + + if (count > sizeof(struct qib_flash) - pos) + count = sizeof(struct qib_flash) - pos; + + tmp = kmalloc(count, GFP_KERNEL); + if (!tmp) { + ret = -ENOMEM; + goto bail; + } + + dd = private2dd(file); + if (qib_eeprom_read(dd, pos, tmp, count)) { + qib_dev_err(dd, "failed to read from flash\n"); + ret = -ENXIO; + goto bail_tmp; + } + + if (copy_to_user(buf, tmp, count)) { + ret = -EFAULT; + goto bail_tmp; + } + + *ppos = pos + count; + ret = count; + +bail_tmp: + kfree(tmp); + +bail: + return ret; +} + +static ssize_t flash_write(struct file *file, const char __user *buf, + size_t count, loff_t *ppos) +{ + struct qib_devdata *dd; + ssize_t ret; + loff_t pos; + char *tmp; + + pos = *ppos; + + if (pos != 0) { + ret = -EINVAL; + goto bail; + } + + if (count != sizeof(struct qib_flash)) { + ret = -EINVAL; + goto bail; + } + + tmp = kmalloc(count, GFP_KERNEL); + if (!tmp) { + ret = -ENOMEM; + goto bail; + } + + if (copy_from_user(tmp, buf, count)) { + ret = -EFAULT; + goto bail_tmp; + } + + dd = private2dd(file); + if (qib_eeprom_write(dd, pos, tmp, count)) { + ret = -ENXIO; + qib_dev_err(dd, "failed to write to flash\n"); + goto bail_tmp; + } + + *ppos = pos + count; + ret = count; + +bail_tmp: + kfree(tmp); + +bail: + return ret; +} + +static const struct file_operations flash_ops = { + .read = flash_read, + .write = flash_write, + .llseek = default_llseek, +}; + +static int add_cntr_files(struct super_block *sb, struct qib_devdata *dd) +{ + struct dentry *dir, *tmp; + char unit[10]; + int ret, i; + + /* create the per-unit directory */ + snprintf(unit, sizeof(unit), "%u", dd->unit); + ret = create_file(unit, S_IFDIR|S_IRUGO|S_IXUGO, sb->s_root, &dir, + &simple_dir_operations, dd); + if (ret) { + pr_err("create_file(%s) failed: %d\n", unit, ret); + goto bail; + } + + /* create the files in the new directory */ + ret = create_file("counters", S_IFREG|S_IRUGO, dir, &tmp, + &cntr_ops[0], dd); + if (ret) { + pr_err("create_file(%s/counters) failed: %d\n", + unit, ret); + goto bail; + } + ret = create_file("counter_names", S_IFREG|S_IRUGO, dir, &tmp, + &cntr_ops[1], dd); + if (ret) { + pr_err("create_file(%s/counter_names) failed: %d\n", + unit, ret); + goto bail; + } + ret = create_file("portcounter_names", S_IFREG|S_IRUGO, dir, &tmp, + &portcntr_ops[0], dd); + if (ret) { + pr_err("create_file(%s/%s) failed: %d\n", + unit, "portcounter_names", ret); + goto bail; + } + for (i = 1; i <= dd->num_pports; i++) { + char fname[24]; + + sprintf(fname, "port%dcounters", i); + /* create the files in the new directory */ + ret = create_file(fname, S_IFREG|S_IRUGO, dir, &tmp, + &portcntr_ops[i], dd); + if (ret) { + 
pr_err("create_file(%s/%s) failed: %d\n", + unit, fname, ret); + goto bail; + } + if (!(dd->flags & QIB_HAS_QSFP)) + continue; + sprintf(fname, "qsfp%d", i); + ret = create_file(fname, S_IFREG|S_IRUGO, dir, &tmp, + &qsfp_ops[i - 1], dd); + if (ret) { + pr_err("create_file(%s/%s) failed: %d\n", + unit, fname, ret); + goto bail; + } + } + + ret = create_file("flash", S_IFREG|S_IWUSR|S_IRUGO, dir, &tmp, + &flash_ops, dd); + if (ret) + pr_err("create_file(%s/flash) failed: %d\n", + unit, ret); +bail: + return ret; +} + +static int remove_file(struct dentry *parent, char *name) +{ + struct dentry *tmp; + int ret; + + tmp = lookup_one_len(name, parent, strlen(name)); + + if (IS_ERR(tmp)) { + ret = PTR_ERR(tmp); + goto bail; + } + + spin_lock(&tmp->d_lock); + if (!d_unhashed(tmp) && d_really_is_positive(tmp)) { + __d_drop(tmp); + spin_unlock(&tmp->d_lock); + simple_unlink(d_inode(parent), tmp); + } else { + spin_unlock(&tmp->d_lock); + } + dput(tmp); + + ret = 0; +bail: + /* + * We don't expect clients to care about the return value, but + * it's there if they need it. + */ + return ret; +} + +static int remove_device_files(struct super_block *sb, + struct qib_devdata *dd) +{ + struct dentry *dir, *root; + char unit[10]; + int ret, i; + + root = dget(sb->s_root); + mutex_lock(&d_inode(root)->i_mutex); + snprintf(unit, sizeof(unit), "%u", dd->unit); + dir = lookup_one_len(unit, root, strlen(unit)); + + if (IS_ERR(dir)) { + ret = PTR_ERR(dir); + pr_err("Lookup of %s failed\n", unit); + goto bail; + } + + mutex_lock(&d_inode(dir)->i_mutex); + remove_file(dir, "counters"); + remove_file(dir, "counter_names"); + remove_file(dir, "portcounter_names"); + for (i = 0; i < dd->num_pports; i++) { + char fname[24]; + + sprintf(fname, "port%dcounters", i + 1); + remove_file(dir, fname); + if (dd->flags & QIB_HAS_QSFP) { + sprintf(fname, "qsfp%d", i + 1); + remove_file(dir, fname); + } + } + remove_file(dir, "flash"); + mutex_unlock(&d_inode(dir)->i_mutex); + ret = simple_rmdir(d_inode(root), dir); + d_delete(dir); + dput(dir); + +bail: + mutex_unlock(&d_inode(root)->i_mutex); + dput(root); + return ret; +} + +/* + * This fills everything in when the fs is mounted, to handle umount/mount + * after device init. The direct add_cntr_files() call handles adding + * them from the init code, when the fs is already mounted. 
+ */ +static int qibfs_fill_super(struct super_block *sb, void *data, int silent) +{ + struct qib_devdata *dd, *tmp; + unsigned long flags; + int ret; + + static struct tree_descr files[] = { + [2] = {"driver_stats", &driver_ops[0], S_IRUGO}, + [3] = {"driver_stats_names", &driver_ops[1], S_IRUGO}, + {""}, + }; + + ret = simple_fill_super(sb, QIBFS_MAGIC, files); + if (ret) { + pr_err("simple_fill_super failed: %d\n", ret); + goto bail; + } + + spin_lock_irqsave(&qib_devs_lock, flags); + + list_for_each_entry_safe(dd, tmp, &qib_dev_list, list) { + spin_unlock_irqrestore(&qib_devs_lock, flags); + ret = add_cntr_files(sb, dd); + if (ret) + goto bail; + spin_lock_irqsave(&qib_devs_lock, flags); + } + + spin_unlock_irqrestore(&qib_devs_lock, flags); + +bail: + return ret; +} + +static struct dentry *qibfs_mount(struct file_system_type *fs_type, int flags, + const char *dev_name, void *data) +{ + struct dentry *ret; + + ret = mount_single(fs_type, flags, data, qibfs_fill_super); + if (!IS_ERR(ret)) + qib_super = ret->d_sb; + return ret; +} + +static void qibfs_kill_super(struct super_block *s) +{ + kill_litter_super(s); + qib_super = NULL; +} + +int qibfs_add(struct qib_devdata *dd) +{ + int ret; + + /* + * On first unit initialized, qib_super will not yet exist + * because nobody has yet tried to mount the filesystem, so + * we can't consider that to be an error; if an error occurs + * during the mount, that will get a complaint, so this is OK. + * add_cntr_files() for all units is done at mount from + * qibfs_fill_super(), so one way or another, everything works. + */ + if (qib_super == NULL) + ret = 0; + else + ret = add_cntr_files(qib_super, dd); + return ret; +} + +int qibfs_remove(struct qib_devdata *dd) +{ + int ret = 0; + + if (qib_super) + ret = remove_device_files(qib_super, dd); + + return ret; +} + +static struct file_system_type qibfs_fs_type = { + .owner = THIS_MODULE, + .name = "ipathfs", + .mount = qibfs_mount, + .kill_sb = qibfs_kill_super, +}; +MODULE_ALIAS_FS("ipathfs"); + +int __init qib_init_qibfs(void) +{ + return register_filesystem(&qibfs_fs_type); +} + +int __exit qib_exit_qibfs(void) +{ + return unregister_filesystem(&qibfs_fs_type); +} diff --git a/kernel/drivers/infiniband/hw/qib/qib_iba6120.c b/kernel/drivers/infiniband/hw/qib/qib_iba6120.c new file mode 100644 index 000000000..4b927809d --- /dev/null +++ b/kernel/drivers/infiniband/hw/qib/qib_iba6120.c @@ -0,0 +1,3600 @@ +/* + * Copyright (c) 2013 Intel Corporation. All rights reserved. + * Copyright (c) 2006, 2007, 2008, 2009, 2010 QLogic Corporation. + * All rights reserved. + * Copyright (c) 2003, 2004, 2005, 2006 PathScale, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +/* + * This file contains all of the code that is specific to the + * QLogic_IB 6120 PCIe chip. + */ + +#include +#include +#include +#include + +#include "qib.h" +#include "qib_6120_regs.h" + +static void qib_6120_setup_setextled(struct qib_pportdata *, u32); +static void sendctrl_6120_mod(struct qib_pportdata *ppd, u32 op); +static u8 qib_6120_phys_portstate(u64); +static u32 qib_6120_iblink_state(u64); + +/* + * This file contains all the chip-specific register information and + * access functions for the Intel Intel_IB PCI-Express chip. + * + */ + +/* KREG_IDX uses machine-generated #defines */ +#define KREG_IDX(regname) (QIB_6120_##regname##_OFFS / sizeof(u64)) + +/* Use defines to tie machine-generated names to lower-case names */ +#define kr_extctrl KREG_IDX(EXTCtrl) +#define kr_extstatus KREG_IDX(EXTStatus) +#define kr_gpio_clear KREG_IDX(GPIOClear) +#define kr_gpio_mask KREG_IDX(GPIOMask) +#define kr_gpio_out KREG_IDX(GPIOOut) +#define kr_gpio_status KREG_IDX(GPIOStatus) +#define kr_rcvctrl KREG_IDX(RcvCtrl) +#define kr_sendctrl KREG_IDX(SendCtrl) +#define kr_partitionkey KREG_IDX(RcvPartitionKey) +#define kr_hwdiagctrl KREG_IDX(HwDiagCtrl) +#define kr_ibcstatus KREG_IDX(IBCStatus) +#define kr_ibcctrl KREG_IDX(IBCCtrl) +#define kr_sendbuffererror KREG_IDX(SendBufErr0) +#define kr_rcvbthqp KREG_IDX(RcvBTHQP) +#define kr_counterregbase KREG_IDX(CntrRegBase) +#define kr_palign KREG_IDX(PageAlign) +#define kr_rcvegrbase KREG_IDX(RcvEgrBase) +#define kr_rcvegrcnt KREG_IDX(RcvEgrCnt) +#define kr_rcvhdrcnt KREG_IDX(RcvHdrCnt) +#define kr_rcvhdrentsize KREG_IDX(RcvHdrEntSize) +#define kr_rcvhdrsize KREG_IDX(RcvHdrSize) +#define kr_rcvtidbase KREG_IDX(RcvTIDBase) +#define kr_rcvtidcnt KREG_IDX(RcvTIDCnt) +#define kr_scratch KREG_IDX(Scratch) +#define kr_sendctrl KREG_IDX(SendCtrl) +#define kr_sendpioavailaddr KREG_IDX(SendPIOAvailAddr) +#define kr_sendpiobufbase KREG_IDX(SendPIOBufBase) +#define kr_sendpiobufcnt KREG_IDX(SendPIOBufCnt) +#define kr_sendpiosize KREG_IDX(SendPIOSize) +#define kr_sendregbase KREG_IDX(SendRegBase) +#define kr_userregbase KREG_IDX(UserRegBase) +#define kr_control KREG_IDX(Control) +#define kr_intclear KREG_IDX(IntClear) +#define kr_intmask KREG_IDX(IntMask) +#define kr_intstatus KREG_IDX(IntStatus) +#define kr_errclear KREG_IDX(ErrClear) +#define kr_errmask KREG_IDX(ErrMask) +#define kr_errstatus KREG_IDX(ErrStatus) +#define kr_hwerrclear KREG_IDX(HwErrClear) +#define kr_hwerrmask KREG_IDX(HwErrMask) +#define kr_hwerrstatus KREG_IDX(HwErrStatus) +#define kr_revision KREG_IDX(Revision) +#define kr_portcnt KREG_IDX(PortCnt) +#define kr_serdes_cfg0 KREG_IDX(SerdesCfg0) +#define kr_serdes_cfg1 (kr_serdes_cfg0 + 1) +#define kr_serdes_stat KREG_IDX(SerdesStat) +#define kr_xgxs_cfg KREG_IDX(XGXSCfg) + +/* These must only be written via qib_write_kreg_ctxt() */ +#define kr_rcvhdraddr KREG_IDX(RcvHdrAddr0) +#define kr_rcvhdrtailaddr KREG_IDX(RcvHdrTailAddr0) + +#define CREG_IDX(regname) ((QIB_6120_##regname##_OFFS - \ + QIB_6120_LBIntCnt_OFFS) / sizeof(u64)) + +#define 
cr_badformat CREG_IDX(RxBadFormatCnt) +#define cr_erricrc CREG_IDX(RxICRCErrCnt) +#define cr_errlink CREG_IDX(RxLinkProblemCnt) +#define cr_errlpcrc CREG_IDX(RxLPCRCErrCnt) +#define cr_errpkey CREG_IDX(RxPKeyMismatchCnt) +#define cr_rcvflowctrl_err CREG_IDX(RxFlowCtrlErrCnt) +#define cr_err_rlen CREG_IDX(RxLenErrCnt) +#define cr_errslen CREG_IDX(TxLenErrCnt) +#define cr_errtidfull CREG_IDX(RxTIDFullErrCnt) +#define cr_errtidvalid CREG_IDX(RxTIDValidErrCnt) +#define cr_errvcrc CREG_IDX(RxVCRCErrCnt) +#define cr_ibstatuschange CREG_IDX(IBStatusChangeCnt) +#define cr_lbint CREG_IDX(LBIntCnt) +#define cr_invalidrlen CREG_IDX(RxMaxMinLenErrCnt) +#define cr_invalidslen CREG_IDX(TxMaxMinLenErrCnt) +#define cr_lbflowstall CREG_IDX(LBFlowStallCnt) +#define cr_pktrcv CREG_IDX(RxDataPktCnt) +#define cr_pktrcvflowctrl CREG_IDX(RxFlowPktCnt) +#define cr_pktsend CREG_IDX(TxDataPktCnt) +#define cr_pktsendflow CREG_IDX(TxFlowPktCnt) +#define cr_portovfl CREG_IDX(RxP0HdrEgrOvflCnt) +#define cr_rcvebp CREG_IDX(RxEBPCnt) +#define cr_rcvovfl CREG_IDX(RxBufOvflCnt) +#define cr_senddropped CREG_IDX(TxDroppedPktCnt) +#define cr_sendstall CREG_IDX(TxFlowStallCnt) +#define cr_sendunderrun CREG_IDX(TxUnderrunCnt) +#define cr_wordrcv CREG_IDX(RxDwordCnt) +#define cr_wordsend CREG_IDX(TxDwordCnt) +#define cr_txunsupvl CREG_IDX(TxUnsupVLErrCnt) +#define cr_rxdroppkt CREG_IDX(RxDroppedPktCnt) +#define cr_iblinkerrrecov CREG_IDX(IBLinkErrRecoveryCnt) +#define cr_iblinkdown CREG_IDX(IBLinkDownedCnt) +#define cr_ibsymbolerr CREG_IDX(IBSymbolErrCnt) + +#define SYM_RMASK(regname, fldname) ((u64) \ + QIB_6120_##regname##_##fldname##_RMASK) +#define SYM_MASK(regname, fldname) ((u64) \ + QIB_6120_##regname##_##fldname##_RMASK << \ + QIB_6120_##regname##_##fldname##_LSB) +#define SYM_LSB(regname, fldname) (QIB_6120_##regname##_##fldname##_LSB) + +#define SYM_FIELD(value, regname, fldname) ((u64) \ + (((value) >> SYM_LSB(regname, fldname)) & \ + SYM_RMASK(regname, fldname))) +#define ERR_MASK(fldname) SYM_MASK(ErrMask, fldname##Mask) +#define HWE_MASK(fldname) SYM_MASK(HwErrMask, fldname##Mask) + +/* link training states, from IBC */ +#define IB_6120_LT_STATE_DISABLED 0x00 +#define IB_6120_LT_STATE_LINKUP 0x01 +#define IB_6120_LT_STATE_POLLACTIVE 0x02 +#define IB_6120_LT_STATE_POLLQUIET 0x03 +#define IB_6120_LT_STATE_SLEEPDELAY 0x04 +#define IB_6120_LT_STATE_SLEEPQUIET 0x05 +#define IB_6120_LT_STATE_CFGDEBOUNCE 0x08 +#define IB_6120_LT_STATE_CFGRCVFCFG 0x09 +#define IB_6120_LT_STATE_CFGWAITRMT 0x0a +#define IB_6120_LT_STATE_CFGIDLE 0x0b +#define IB_6120_LT_STATE_RECOVERRETRAIN 0x0c +#define IB_6120_LT_STATE_RECOVERWAITRMT 0x0e +#define IB_6120_LT_STATE_RECOVERIDLE 0x0f + +/* link state machine states from IBC */ +#define IB_6120_L_STATE_DOWN 0x0 +#define IB_6120_L_STATE_INIT 0x1 +#define IB_6120_L_STATE_ARM 0x2 +#define IB_6120_L_STATE_ACTIVE 0x3 +#define IB_6120_L_STATE_ACT_DEFER 0x4 + +static const u8 qib_6120_physportstate[0x20] = { + [IB_6120_LT_STATE_DISABLED] = IB_PHYSPORTSTATE_DISABLED, + [IB_6120_LT_STATE_LINKUP] = IB_PHYSPORTSTATE_LINKUP, + [IB_6120_LT_STATE_POLLACTIVE] = IB_PHYSPORTSTATE_POLL, + [IB_6120_LT_STATE_POLLQUIET] = IB_PHYSPORTSTATE_POLL, + [IB_6120_LT_STATE_SLEEPDELAY] = IB_PHYSPORTSTATE_SLEEP, + [IB_6120_LT_STATE_SLEEPQUIET] = IB_PHYSPORTSTATE_SLEEP, + [IB_6120_LT_STATE_CFGDEBOUNCE] = + IB_PHYSPORTSTATE_CFG_TRAIN, + [IB_6120_LT_STATE_CFGRCVFCFG] = + IB_PHYSPORTSTATE_CFG_TRAIN, + [IB_6120_LT_STATE_CFGWAITRMT] = + IB_PHYSPORTSTATE_CFG_TRAIN, + [IB_6120_LT_STATE_CFGIDLE] = IB_PHYSPORTSTATE_CFG_TRAIN, + 
[IB_6120_LT_STATE_RECOVERRETRAIN] = + IB_PHYSPORTSTATE_LINK_ERR_RECOVER, + [IB_6120_LT_STATE_RECOVERWAITRMT] = + IB_PHYSPORTSTATE_LINK_ERR_RECOVER, + [IB_6120_LT_STATE_RECOVERIDLE] = + IB_PHYSPORTSTATE_LINK_ERR_RECOVER, + [0x10] = IB_PHYSPORTSTATE_CFG_TRAIN, + [0x11] = IB_PHYSPORTSTATE_CFG_TRAIN, + [0x12] = IB_PHYSPORTSTATE_CFG_TRAIN, + [0x13] = IB_PHYSPORTSTATE_CFG_TRAIN, + [0x14] = IB_PHYSPORTSTATE_CFG_TRAIN, + [0x15] = IB_PHYSPORTSTATE_CFG_TRAIN, + [0x16] = IB_PHYSPORTSTATE_CFG_TRAIN, + [0x17] = IB_PHYSPORTSTATE_CFG_TRAIN +}; + + +struct qib_chip_specific { + u64 __iomem *cregbase; + u64 *cntrs; + u64 *portcntrs; + void *dummy_hdrq; /* used after ctxt close */ + dma_addr_t dummy_hdrq_phys; + spinlock_t kernel_tid_lock; /* no back to back kernel TID writes */ + spinlock_t user_tid_lock; /* no back to back user TID writes */ + spinlock_t rcvmod_lock; /* protect rcvctrl shadow changes */ + spinlock_t gpio_lock; /* RMW of shadows/regs for ExtCtrl and GPIO */ + u64 hwerrmask; + u64 errormask; + u64 gpio_out; /* shadow of kr_gpio_out, for rmw ops */ + u64 gpio_mask; /* shadow the gpio mask register */ + u64 extctrl; /* shadow the gpio output enable, etc... */ + /* + * these 5 fields are used to establish deltas for IB symbol + * errors and linkrecovery errors. They can be reported on + * some chips during link negotiation prior to INIT, and with + * DDR when faking DDR negotiations with non-IBTA switches. + * The chip counters are adjusted at driver unload if there is + * a non-zero delta. + */ + u64 ibdeltainprog; + u64 ibsymdelta; + u64 ibsymsnap; + u64 iblnkerrdelta; + u64 iblnkerrsnap; + u64 ibcctrl; /* shadow for kr_ibcctrl */ + u32 lastlinkrecov; /* link recovery issue */ + int irq; + u32 cntrnamelen; + u32 portcntrnamelen; + u32 ncntrs; + u32 nportcntrs; + /* used with gpio interrupts to implement IB counters */ + u32 rxfc_unsupvl_errs; + u32 overrun_thresh_errs; + /* + * these count only cases where _successive_ LocalLinkIntegrity + * errors were seen in the receive headers of IB standard packets + */ + u32 lli_errs; + u32 lli_counter; + u64 lli_thresh; + u64 sword; /* total dwords sent (sample result) */ + u64 rword; /* total dwords received (sample result) */ + u64 spkts; /* total packets sent (sample result) */ + u64 rpkts; /* total packets received (sample result) */ + u64 xmit_wait; /* # of ticks no data sent (sample result) */ + struct timer_list pma_timer; + char emsgbuf[128]; + char bitsmsgbuf[64]; + u8 pma_sample_status; +}; + +/* ibcctrl bits */ +#define QLOGIC_IB_IBCC_LINKINITCMD_DISABLE 1 +/* cycle through TS1/TS2 till OK */ +#define QLOGIC_IB_IBCC_LINKINITCMD_POLL 2 +/* wait for TS1, then go on */ +#define QLOGIC_IB_IBCC_LINKINITCMD_SLEEP 3 +#define QLOGIC_IB_IBCC_LINKINITCMD_SHIFT 16 + +#define QLOGIC_IB_IBCC_LINKCMD_DOWN 1 /* move to 0x11 */ +#define QLOGIC_IB_IBCC_LINKCMD_ARMED 2 /* move to 0x21 */ +#define QLOGIC_IB_IBCC_LINKCMD_ACTIVE 3 /* move to 0x31 */ +#define QLOGIC_IB_IBCC_LINKCMD_SHIFT 18 + +/* + * We could have a single register get/put routine, that takes a group type, + * but this is somewhat clearer and cleaner. It also gives us some error + * checking. 64 bit register reads should always work, but are inefficient + * on opteron (the northbridge always generates 2 separate HT 32 bit reads), + * so we use kreg32 wherever possible. User register and counter register + * reads are always 32 bit reads, so only one form of those routines. 
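+ * All of the accessors below check kregbase/cregbase and QIB_PRESENT
+ * before touching the chip, so reads against an unmapped or absent
+ * device return 0 (or -1 for the kregs) and writes are silently
+ * dropped rather than faulting.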
+ */ + +/** + * qib_read_ureg32 - read 32-bit virtualized per-context register + * @dd: device + * @regno: register number + * @ctxt: context number + * + * Return the contents of a register that is virtualized to be per context. + * Returns -1 on errors (not distinguishable from valid contents at + * runtime; we may add a separate error variable at some point). + */ +static inline u32 qib_read_ureg32(const struct qib_devdata *dd, + enum qib_ureg regno, int ctxt) +{ + if (!dd->kregbase || !(dd->flags & QIB_PRESENT)) + return 0; + + if (dd->userbase) + return readl(regno + (u64 __iomem *) + ((char __iomem *)dd->userbase + + dd->ureg_align * ctxt)); + else + return readl(regno + (u64 __iomem *) + (dd->uregbase + + (char __iomem *)dd->kregbase + + dd->ureg_align * ctxt)); +} + +/** + * qib_write_ureg - write 32-bit virtualized per-context register + * @dd: device + * @regno: register number + * @value: value + * @ctxt: context + * + * Write the contents of a register that is virtualized to be per context. + */ +static inline void qib_write_ureg(const struct qib_devdata *dd, + enum qib_ureg regno, u64 value, int ctxt) +{ + u64 __iomem *ubase; + + if (dd->userbase) + ubase = (u64 __iomem *) + ((char __iomem *) dd->userbase + + dd->ureg_align * ctxt); + else + ubase = (u64 __iomem *) + (dd->uregbase + + (char __iomem *) dd->kregbase + + dd->ureg_align * ctxt); + + if (dd->kregbase && (dd->flags & QIB_PRESENT)) + writeq(value, &ubase[regno]); +} + +static inline u32 qib_read_kreg32(const struct qib_devdata *dd, + const u16 regno) +{ + if (!dd->kregbase || !(dd->flags & QIB_PRESENT)) + return -1; + return readl((u32 __iomem *)&dd->kregbase[regno]); +} + +static inline u64 qib_read_kreg64(const struct qib_devdata *dd, + const u16 regno) +{ + if (!dd->kregbase || !(dd->flags & QIB_PRESENT)) + return -1; + + return readq(&dd->kregbase[regno]); +} + +static inline void qib_write_kreg(const struct qib_devdata *dd, + const u16 regno, u64 value) +{ + if (dd->kregbase && (dd->flags & QIB_PRESENT)) + writeq(value, &dd->kregbase[regno]); +} + +/** + * qib_write_kreg_ctxt - write a device's per-ctxt 64-bit kernel register + * @dd: the qlogic_ib device + * @regno: the register number to write + * @ctxt: the context containing the register + * @value: the value to write + */ +static inline void qib_write_kreg_ctxt(const struct qib_devdata *dd, + const u16 regno, unsigned ctxt, + u64 value) +{ + qib_write_kreg(dd, regno + ctxt, value); +} + +static inline void write_6120_creg(const struct qib_devdata *dd, + u16 regno, u64 value) +{ + if (dd->cspec->cregbase && (dd->flags & QIB_PRESENT)) + writeq(value, &dd->cspec->cregbase[regno]); +} + +static inline u64 read_6120_creg(const struct qib_devdata *dd, u16 regno) +{ + if (!dd->cspec->cregbase || !(dd->flags & QIB_PRESENT)) + return 0; + return readq(&dd->cspec->cregbase[regno]); +} + +static inline u32 read_6120_creg32(const struct qib_devdata *dd, u16 regno) +{ + if (!dd->cspec->cregbase || !(dd->flags & QIB_PRESENT)) + return 0; + return readl(&dd->cspec->cregbase[regno]); +} + +/* kr_control bits */ +#define QLOGIC_IB_C_RESET 1U + +/* kr_intstatus, kr_intclear, kr_intmask bits */ +#define QLOGIC_IB_I_RCVURG_MASK ((1U << 5) - 1) +#define QLOGIC_IB_I_RCVURG_SHIFT 0 +#define QLOGIC_IB_I_RCVAVAIL_MASK ((1U << 5) - 1) +#define QLOGIC_IB_I_RCVAVAIL_SHIFT 12 + +#define QLOGIC_IB_C_FREEZEMODE 0x00000002 +#define QLOGIC_IB_C_LINKENABLE 0x00000004 +#define QLOGIC_IB_I_ERROR 0x0000000080000000ULL +#define QLOGIC_IB_I_SPIOSENT 0x0000000040000000ULL +#define 
QLOGIC_IB_I_SPIOBUFAVAIL 0x0000000020000000ULL +#define QLOGIC_IB_I_GPIO 0x0000000010000000ULL +#define QLOGIC_IB_I_BITSEXTANT \ + ((QLOGIC_IB_I_RCVURG_MASK << QLOGIC_IB_I_RCVURG_SHIFT) | \ + (QLOGIC_IB_I_RCVAVAIL_MASK << \ + QLOGIC_IB_I_RCVAVAIL_SHIFT) | \ + QLOGIC_IB_I_ERROR | QLOGIC_IB_I_SPIOSENT | \ + QLOGIC_IB_I_SPIOBUFAVAIL | QLOGIC_IB_I_GPIO) + +/* kr_hwerrclear, kr_hwerrmask, kr_hwerrstatus, bits */ +#define QLOGIC_IB_HWE_PCIEMEMPARITYERR_MASK 0x000000000000003fULL +#define QLOGIC_IB_HWE_PCIEMEMPARITYERR_SHIFT 0 +#define QLOGIC_IB_HWE_PCIEPOISONEDTLP 0x0000000010000000ULL +#define QLOGIC_IB_HWE_PCIECPLTIMEOUT 0x0000000020000000ULL +#define QLOGIC_IB_HWE_PCIEBUSPARITYXTLH 0x0000000040000000ULL +#define QLOGIC_IB_HWE_PCIEBUSPARITYXADM 0x0000000080000000ULL +#define QLOGIC_IB_HWE_PCIEBUSPARITYRADM 0x0000000100000000ULL +#define QLOGIC_IB_HWE_COREPLL_FBSLIP 0x0080000000000000ULL +#define QLOGIC_IB_HWE_COREPLL_RFSLIP 0x0100000000000000ULL +#define QLOGIC_IB_HWE_PCIE1PLLFAILED 0x0400000000000000ULL +#define QLOGIC_IB_HWE_PCIE0PLLFAILED 0x0800000000000000ULL +#define QLOGIC_IB_HWE_SERDESPLLFAILED 0x1000000000000000ULL + + +/* kr_extstatus bits */ +#define QLOGIC_IB_EXTS_FREQSEL 0x2 +#define QLOGIC_IB_EXTS_SERDESSEL 0x4 +#define QLOGIC_IB_EXTS_MEMBIST_ENDTEST 0x0000000000004000 +#define QLOGIC_IB_EXTS_MEMBIST_FOUND 0x0000000000008000 + +/* kr_xgxsconfig bits */ +#define QLOGIC_IB_XGXS_RESET 0x5ULL + +#define _QIB_GPIO_SDA_NUM 1 +#define _QIB_GPIO_SCL_NUM 0 + +/* Bits in GPIO for the added IB link interrupts */ +#define GPIO_RXUVL_BIT 3 +#define GPIO_OVRUN_BIT 4 +#define GPIO_LLI_BIT 5 +#define GPIO_ERRINTR_MASK 0x38 + + +#define QLOGIC_IB_RT_BUFSIZE_MASK 0xe0000000ULL +#define QLOGIC_IB_RT_BUFSIZE_SHIFTVAL(tid) \ + ((((tid) & QLOGIC_IB_RT_BUFSIZE_MASK) >> 29) + 11 - 1) +#define QLOGIC_IB_RT_BUFSIZE(tid) (1 << QLOGIC_IB_RT_BUFSIZE_SHIFTVAL(tid)) +#define QLOGIC_IB_RT_IS_VALID(tid) \ + (((tid) & QLOGIC_IB_RT_BUFSIZE_MASK) && \ + ((((tid) & QLOGIC_IB_RT_BUFSIZE_MASK) != QLOGIC_IB_RT_BUFSIZE_MASK))) +#define QLOGIC_IB_RT_ADDR_MASK 0x1FFFFFFFULL /* 29 bits valid */ +#define QLOGIC_IB_RT_ADDR_SHIFT 10 + +#define QLOGIC_IB_R_INTRAVAIL_SHIFT 16 +#define QLOGIC_IB_R_TAILUPD_SHIFT 31 +#define IBA6120_R_PKEY_DIS_SHIFT 30 + +#define PBC_6120_VL15_SEND_CTRL (1ULL << 31) /* pbc; VL15; link_buf only */ + +#define IBCBUSFRSPCPARITYERR HWE_MASK(IBCBusFromSPCParityErr) +#define IBCBUSTOSPCPARITYERR HWE_MASK(IBCBusToSPCParityErr) + +#define SYM_MASK_BIT(regname, fldname, bit) ((u64) \ + ((1ULL << (SYM_LSB(regname, fldname) + (bit))))) + +#define TXEMEMPARITYERR_PIOBUF \ + SYM_MASK_BIT(HwErrMask, TXEMemParityErrMask, 0) +#define TXEMEMPARITYERR_PIOPBC \ + SYM_MASK_BIT(HwErrMask, TXEMemParityErrMask, 1) +#define TXEMEMPARITYERR_PIOLAUNCHFIFO \ + SYM_MASK_BIT(HwErrMask, TXEMemParityErrMask, 2) + +#define RXEMEMPARITYERR_RCVBUF \ + SYM_MASK_BIT(HwErrMask, RXEMemParityErrMask, 0) +#define RXEMEMPARITYERR_LOOKUPQ \ + SYM_MASK_BIT(HwErrMask, RXEMemParityErrMask, 1) +#define RXEMEMPARITYERR_EXPTID \ + SYM_MASK_BIT(HwErrMask, RXEMemParityErrMask, 2) +#define RXEMEMPARITYERR_EAGERTID \ + SYM_MASK_BIT(HwErrMask, RXEMemParityErrMask, 3) +#define RXEMEMPARITYERR_FLAGBUF \ + SYM_MASK_BIT(HwErrMask, RXEMemParityErrMask, 4) +#define RXEMEMPARITYERR_DATAINFO \ + SYM_MASK_BIT(HwErrMask, RXEMemParityErrMask, 5) +#define RXEMEMPARITYERR_HDRINFO \ + SYM_MASK_BIT(HwErrMask, RXEMemParityErrMask, 6) + +/* 6120 specific hardware errors... 
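+ * Each entry pairs a hardware error mask bit with the short string
+ * that qib_format_hwerrors() uses when building the message printed
+ * from qib_handle_6120_hwerrors().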
*/ +static const struct qib_hwerror_msgs qib_6120_hwerror_msgs[] = { + /* generic hardware errors */ + QLOGIC_IB_HWE_MSG(IBCBUSFRSPCPARITYERR, "QIB2IB Parity"), + QLOGIC_IB_HWE_MSG(IBCBUSTOSPCPARITYERR, "IB2QIB Parity"), + + QLOGIC_IB_HWE_MSG(TXEMEMPARITYERR_PIOBUF, + "TXE PIOBUF Memory Parity"), + QLOGIC_IB_HWE_MSG(TXEMEMPARITYERR_PIOPBC, + "TXE PIOPBC Memory Parity"), + QLOGIC_IB_HWE_MSG(TXEMEMPARITYERR_PIOLAUNCHFIFO, + "TXE PIOLAUNCHFIFO Memory Parity"), + + QLOGIC_IB_HWE_MSG(RXEMEMPARITYERR_RCVBUF, + "RXE RCVBUF Memory Parity"), + QLOGIC_IB_HWE_MSG(RXEMEMPARITYERR_LOOKUPQ, + "RXE LOOKUPQ Memory Parity"), + QLOGIC_IB_HWE_MSG(RXEMEMPARITYERR_EAGERTID, + "RXE EAGERTID Memory Parity"), + QLOGIC_IB_HWE_MSG(RXEMEMPARITYERR_EXPTID, + "RXE EXPTID Memory Parity"), + QLOGIC_IB_HWE_MSG(RXEMEMPARITYERR_FLAGBUF, + "RXE FLAGBUF Memory Parity"), + QLOGIC_IB_HWE_MSG(RXEMEMPARITYERR_DATAINFO, + "RXE DATAINFO Memory Parity"), + QLOGIC_IB_HWE_MSG(RXEMEMPARITYERR_HDRINFO, + "RXE HDRINFO Memory Parity"), + + /* chip-specific hardware errors */ + QLOGIC_IB_HWE_MSG(QLOGIC_IB_HWE_PCIEPOISONEDTLP, + "PCIe Poisoned TLP"), + QLOGIC_IB_HWE_MSG(QLOGIC_IB_HWE_PCIECPLTIMEOUT, + "PCIe completion timeout"), + /* + * In practice, it's unlikely wthat we'll see PCIe PLL, or bus + * parity or memory parity error failures, because most likely we + * won't be able to talk to the core of the chip. Nonetheless, we + * might see them, if they are in parts of the PCIe core that aren't + * essential. + */ + QLOGIC_IB_HWE_MSG(QLOGIC_IB_HWE_PCIE1PLLFAILED, + "PCIePLL1"), + QLOGIC_IB_HWE_MSG(QLOGIC_IB_HWE_PCIE0PLLFAILED, + "PCIePLL0"), + QLOGIC_IB_HWE_MSG(QLOGIC_IB_HWE_PCIEBUSPARITYXTLH, + "PCIe XTLH core parity"), + QLOGIC_IB_HWE_MSG(QLOGIC_IB_HWE_PCIEBUSPARITYXADM, + "PCIe ADM TX core parity"), + QLOGIC_IB_HWE_MSG(QLOGIC_IB_HWE_PCIEBUSPARITYRADM, + "PCIe ADM RX core parity"), + QLOGIC_IB_HWE_MSG(QLOGIC_IB_HWE_SERDESPLLFAILED, + "SerDes PLL"), +}; + +#define TXE_PIO_PARITY (TXEMEMPARITYERR_PIOBUF | TXEMEMPARITYERR_PIOPBC) +#define _QIB_PLL_FAIL (QLOGIC_IB_HWE_COREPLL_FBSLIP | \ + QLOGIC_IB_HWE_COREPLL_RFSLIP) + + /* variables for sanity checking interrupt and errors */ +#define IB_HWE_BITSEXTANT \ + (HWE_MASK(RXEMemParityErr) | \ + HWE_MASK(TXEMemParityErr) | \ + (QLOGIC_IB_HWE_PCIEMEMPARITYERR_MASK << \ + QLOGIC_IB_HWE_PCIEMEMPARITYERR_SHIFT) | \ + QLOGIC_IB_HWE_PCIE1PLLFAILED | \ + QLOGIC_IB_HWE_PCIE0PLLFAILED | \ + QLOGIC_IB_HWE_PCIEPOISONEDTLP | \ + QLOGIC_IB_HWE_PCIECPLTIMEOUT | \ + QLOGIC_IB_HWE_PCIEBUSPARITYXTLH | \ + QLOGIC_IB_HWE_PCIEBUSPARITYXADM | \ + QLOGIC_IB_HWE_PCIEBUSPARITYRADM | \ + HWE_MASK(PowerOnBISTFailed) | \ + QLOGIC_IB_HWE_COREPLL_FBSLIP | \ + QLOGIC_IB_HWE_COREPLL_RFSLIP | \ + QLOGIC_IB_HWE_SERDESPLLFAILED | \ + HWE_MASK(IBCBusToSPCParityErr) | \ + HWE_MASK(IBCBusFromSPCParityErr)) + +#define IB_E_BITSEXTANT \ + (ERR_MASK(RcvFormatErr) | ERR_MASK(RcvVCRCErr) | \ + ERR_MASK(RcvICRCErr) | ERR_MASK(RcvMinPktLenErr) | \ + ERR_MASK(RcvMaxPktLenErr) | ERR_MASK(RcvLongPktLenErr) | \ + ERR_MASK(RcvShortPktLenErr) | ERR_MASK(RcvUnexpectedCharErr) | \ + ERR_MASK(RcvUnsupportedVLErr) | ERR_MASK(RcvEBPErr) | \ + ERR_MASK(RcvIBFlowErr) | ERR_MASK(RcvBadVersionErr) | \ + ERR_MASK(RcvEgrFullErr) | ERR_MASK(RcvHdrFullErr) | \ + ERR_MASK(RcvBadTidErr) | ERR_MASK(RcvHdrLenErr) | \ + ERR_MASK(RcvHdrErr) | ERR_MASK(RcvIBLostLinkErr) | \ + ERR_MASK(SendMinPktLenErr) | ERR_MASK(SendMaxPktLenErr) | \ + ERR_MASK(SendUnderRunErr) | ERR_MASK(SendPktLenErr) | \ + ERR_MASK(SendDroppedSmpPktErr) | \ + ERR_MASK(SendDroppedDataPktErr) | 
\ + ERR_MASK(SendPioArmLaunchErr) | \ + ERR_MASK(SendUnexpectedPktNumErr) | \ + ERR_MASK(SendUnsupportedVLErr) | ERR_MASK(IBStatusChanged) | \ + ERR_MASK(InvalidAddrErr) | ERR_MASK(ResetNegated) | \ + ERR_MASK(HardwareErr)) + +#define QLOGIC_IB_E_PKTERRS ( \ + ERR_MASK(SendPktLenErr) | \ + ERR_MASK(SendDroppedDataPktErr) | \ + ERR_MASK(RcvVCRCErr) | \ + ERR_MASK(RcvICRCErr) | \ + ERR_MASK(RcvShortPktLenErr) | \ + ERR_MASK(RcvEBPErr)) + +/* These are all rcv-related errors which we want to count for stats */ +#define E_SUM_PKTERRS \ + (ERR_MASK(RcvHdrLenErr) | ERR_MASK(RcvBadTidErr) | \ + ERR_MASK(RcvBadVersionErr) | ERR_MASK(RcvHdrErr) | \ + ERR_MASK(RcvLongPktLenErr) | ERR_MASK(RcvShortPktLenErr) | \ + ERR_MASK(RcvMaxPktLenErr) | ERR_MASK(RcvMinPktLenErr) | \ + ERR_MASK(RcvFormatErr) | ERR_MASK(RcvUnsupportedVLErr) | \ + ERR_MASK(RcvUnexpectedCharErr) | ERR_MASK(RcvEBPErr)) + +/* These are all send-related errors which we want to count for stats */ +#define E_SUM_ERRS \ + (ERR_MASK(SendPioArmLaunchErr) | \ + ERR_MASK(SendUnexpectedPktNumErr) | \ + ERR_MASK(SendDroppedDataPktErr) | \ + ERR_MASK(SendDroppedSmpPktErr) | \ + ERR_MASK(SendMaxPktLenErr) | ERR_MASK(SendUnsupportedVLErr) | \ + ERR_MASK(SendMinPktLenErr) | ERR_MASK(SendPktLenErr) | \ + ERR_MASK(InvalidAddrErr)) + +/* + * this is similar to E_SUM_ERRS, but can't ignore armlaunch, don't ignore + * errors not related to freeze and cancelling buffers. Can't ignore + * armlaunch because could get more while still cleaning up, and need + * to cancel those as they happen. + */ +#define E_SPKT_ERRS_IGNORE \ + (ERR_MASK(SendDroppedDataPktErr) | \ + ERR_MASK(SendDroppedSmpPktErr) | \ + ERR_MASK(SendMaxPktLenErr) | ERR_MASK(SendMinPktLenErr) | \ + ERR_MASK(SendPktLenErr)) + +/* + * these are errors that can occur when the link changes state while + * a packet is being sent or received. This doesn't cover things + * like EBP or VCRC that can be the result of a sending having the + * link change state, so we receive a "known bad" packet. + */ +#define E_SUM_LINK_PKTERRS \ + (ERR_MASK(SendDroppedDataPktErr) | \ + ERR_MASK(SendDroppedSmpPktErr) | \ + ERR_MASK(SendMinPktLenErr) | ERR_MASK(SendPktLenErr) | \ + ERR_MASK(RcvShortPktLenErr) | ERR_MASK(RcvMinPktLenErr) | \ + ERR_MASK(RcvUnexpectedCharErr)) + +static void qib_6120_put_tid_2(struct qib_devdata *, u64 __iomem *, + u32, unsigned long); + +/* + * On platforms using this chip, and not having ordered WC stores, we + * can get TXE parity errors due to speculative reads to the PIO buffers, + * and this, due to a chip issue can result in (many) false parity error + * reports. So it's a debug print on those, and an info print on systems + * where the speculative reads don't occur. + */ +static void qib_6120_txe_recover(struct qib_devdata *dd) +{ + if (!qib_unordered_wc()) + qib_devinfo(dd->pcidev, + "Recovering from TXE PIO parity error\n"); +} + +/* enable/disable chip from delivering interrupts */ +static void qib_6120_set_intr_state(struct qib_devdata *dd, u32 enable) +{ + if (enable) { + if (dd->flags & QIB_BADINTR) + return; + qib_write_kreg(dd, kr_intmask, ~0ULL); + /* force re-interrupt of any pending interrupts. 
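+ * Writing 0 to intclear clears nothing; a 0 written to a bit that is
+ * still set in intstatus causes the interrupt to be generated again
+ * (if allowed by intmask), so nothing that arrived while masked is
+ * lost.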
*/ + qib_write_kreg(dd, kr_intclear, 0ULL); + } else + qib_write_kreg(dd, kr_intmask, 0ULL); +} + +/* + * Try to cleanup as much as possible for anything that might have gone + * wrong while in freeze mode, such as pio buffers being written by user + * processes (causing armlaunch), send errors due to going into freeze mode, + * etc., and try to avoid causing extra interrupts while doing so. + * Forcibly update the in-memory pioavail register copies after cleanup + * because the chip won't do it while in freeze mode (the register values + * themselves are kept correct). + * Make sure that we don't lose any important interrupts by using the chip + * feature that says that writing 0 to a bit in *clear that is set in + * *status will cause an interrupt to be generated again (if allowed by + * the *mask value). + * This is in chip-specific code because of all of the register accesses, + * even though the details are similar on most chips + */ +static void qib_6120_clear_freeze(struct qib_devdata *dd) +{ + /* disable error interrupts, to avoid confusion */ + qib_write_kreg(dd, kr_errmask, 0ULL); + + /* also disable interrupts; errormask is sometimes overwriten */ + qib_6120_set_intr_state(dd, 0); + + qib_cancel_sends(dd->pport); + + /* clear the freeze, and be sure chip saw it */ + qib_write_kreg(dd, kr_control, dd->control); + qib_read_kreg32(dd, kr_scratch); + + /* force in-memory update now we are out of freeze */ + qib_force_pio_avail_update(dd); + + /* + * force new interrupt if any hwerr, error or interrupt bits are + * still set, and clear "safe" send packet errors related to freeze + * and cancelling sends. Re-enable error interrupts before possible + * force of re-interrupt on pending interrupts. + */ + qib_write_kreg(dd, kr_hwerrclear, 0ULL); + qib_write_kreg(dd, kr_errclear, E_SPKT_ERRS_IGNORE); + qib_write_kreg(dd, kr_errmask, dd->cspec->errormask); + qib_6120_set_intr_state(dd, 1); +} + +/** + * qib_handle_6120_hwerrors - display hardware errors. + * @dd: the qlogic_ib device + * @msg: the output buffer + * @msgl: the size of the output buffer + * + * Use same msg buffer as regular errors to avoid excessive stack + * use. Most hardware errors are catastrophic, but for right now, + * we'll print them and continue. Reuse the same message buffer as + * handle_6120_errors() to avoid excessive stack usage. + */ +static void qib_handle_6120_hwerrors(struct qib_devdata *dd, char *msg, + size_t msgl) +{ + u64 hwerrs; + u32 bits, ctrl; + int isfatal = 0; + char *bitsmsg; + int log_idx; + + hwerrs = qib_read_kreg64(dd, kr_hwerrstatus); + if (!hwerrs) + return; + if (hwerrs == ~0ULL) { + qib_dev_err(dd, + "Read of hardware error status failed (all bits set); ignoring\n"); + return; + } + qib_stats.sps_hwerrs++; + + /* Always clear the error status register, except MEMBISTFAIL, + * regardless of whether we continue or stop using the chip. + * We want that set so we know it failed, even across driver reload. + * We'll still ignore it in the hwerrmask. We do this partly for + * diagnostics, but also for support */ + qib_write_kreg(dd, kr_hwerrclear, + hwerrs & ~HWE_MASK(PowerOnBISTFailed)); + + hwerrs &= dd->cspec->hwerrmask; + + /* We log some errors to EEPROM, check if we have any of those. */ + for (log_idx = 0; log_idx < QIB_EEP_LOG_CNT; ++log_idx) + if (hwerrs & dd->eep_st_masks[log_idx].hwerrs_to_log) + qib_inc_eeprom_err(dd, log_idx, 1); + + /* + * Make sure we get this much out, unless told to be quiet, + * or it's occurred within the last 5 seconds. 
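+ * TXE PIO parity and eager-TID parity are the expected, recoverable
+ * cases, so those bits alone do not trigger this informational print.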
+ */ + if (hwerrs & ~(TXE_PIO_PARITY | RXEMEMPARITYERR_EAGERTID)) + qib_devinfo(dd->pcidev, + "Hardware error: hwerr=0x%llx (cleared)\n", + (unsigned long long) hwerrs); + + if (hwerrs & ~IB_HWE_BITSEXTANT) + qib_dev_err(dd, + "hwerror interrupt with unknown errors %llx set\n", + (unsigned long long)(hwerrs & ~IB_HWE_BITSEXTANT)); + + ctrl = qib_read_kreg32(dd, kr_control); + if ((ctrl & QLOGIC_IB_C_FREEZEMODE) && !dd->diag_client) { + /* + * Parity errors in send memory are recoverable, + * just cancel the send (if indicated in * sendbuffererror), + * count the occurrence, unfreeze (if no other handled + * hardware error bits are set), and continue. They can + * occur if a processor speculative read is done to the PIO + * buffer while we are sending a packet, for example. + */ + if (hwerrs & TXE_PIO_PARITY) { + qib_6120_txe_recover(dd); + hwerrs &= ~TXE_PIO_PARITY; + } + + if (!hwerrs) { + static u32 freeze_cnt; + + freeze_cnt++; + qib_6120_clear_freeze(dd); + } else + isfatal = 1; + } + + *msg = '\0'; + + if (hwerrs & HWE_MASK(PowerOnBISTFailed)) { + isfatal = 1; + strlcat(msg, + "[Memory BIST test failed, InfiniPath hardware unusable]", + msgl); + /* ignore from now on, so disable until driver reloaded */ + dd->cspec->hwerrmask &= ~HWE_MASK(PowerOnBISTFailed); + qib_write_kreg(dd, kr_hwerrmask, dd->cspec->hwerrmask); + } + + qib_format_hwerrors(hwerrs, qib_6120_hwerror_msgs, + ARRAY_SIZE(qib_6120_hwerror_msgs), msg, msgl); + + bitsmsg = dd->cspec->bitsmsgbuf; + if (hwerrs & (QLOGIC_IB_HWE_PCIEMEMPARITYERR_MASK << + QLOGIC_IB_HWE_PCIEMEMPARITYERR_SHIFT)) { + bits = (u32) ((hwerrs >> + QLOGIC_IB_HWE_PCIEMEMPARITYERR_SHIFT) & + QLOGIC_IB_HWE_PCIEMEMPARITYERR_MASK); + snprintf(bitsmsg, sizeof(dd->cspec->bitsmsgbuf), + "[PCIe Mem Parity Errs %x] ", bits); + strlcat(msg, bitsmsg, msgl); + } + + if (hwerrs & _QIB_PLL_FAIL) { + isfatal = 1; + snprintf(bitsmsg, sizeof(dd->cspec->bitsmsgbuf), + "[PLL failed (%llx), InfiniPath hardware unusable]", + (unsigned long long) hwerrs & _QIB_PLL_FAIL); + strlcat(msg, bitsmsg, msgl); + /* ignore from now on, so disable until driver reloaded */ + dd->cspec->hwerrmask &= ~(hwerrs & _QIB_PLL_FAIL); + qib_write_kreg(dd, kr_hwerrmask, dd->cspec->hwerrmask); + } + + if (hwerrs & QLOGIC_IB_HWE_SERDESPLLFAILED) { + /* + * If it occurs, it is left masked since the external + * interface is unused + */ + dd->cspec->hwerrmask &= ~QLOGIC_IB_HWE_SERDESPLLFAILED; + qib_write_kreg(dd, kr_hwerrmask, dd->cspec->hwerrmask); + } + + if (hwerrs) + /* + * if any set that we aren't ignoring; only + * make the complaint once, in case it's stuck + * or recurring, and we get here multiple + * times. + */ + qib_dev_err(dd, "%s hardware error\n", msg); + else + *msg = 0; /* recovered from all of them */ + + if (isfatal && !dd->diag_client) { + qib_dev_err(dd, + "Fatal Hardware Error, no longer usable, SN %.16s\n", + dd->serial); + /* + * for /sys status file and user programs to print; if no + * trailing brace is copied, we'll know it was truncated. + */ + if (dd->freezemsg) + snprintf(dd->freezemsg, dd->freezelen, + "{%s}", msg); + qib_disable_after_error(dd); + } +} + +/* + * Decode the error status into strings, deciding whether to always + * print * it or not depending on "normal packet errors" vs everything + * else. Return 1 if "real" errors, otherwise 0 if only packet + * errors, so caller can decide what to print with the string. 
+ */ +static int qib_decode_6120_err(struct qib_devdata *dd, char *buf, size_t blen, + u64 err) +{ + int iserr = 1; + + *buf = '\0'; + if (err & QLOGIC_IB_E_PKTERRS) { + if (!(err & ~QLOGIC_IB_E_PKTERRS)) + iserr = 0; + if ((err & ERR_MASK(RcvICRCErr)) && + !(err&(ERR_MASK(RcvVCRCErr)|ERR_MASK(RcvEBPErr)))) + strlcat(buf, "CRC ", blen); + if (!iserr) + goto done; + } + if (err & ERR_MASK(RcvHdrLenErr)) + strlcat(buf, "rhdrlen ", blen); + if (err & ERR_MASK(RcvBadTidErr)) + strlcat(buf, "rbadtid ", blen); + if (err & ERR_MASK(RcvBadVersionErr)) + strlcat(buf, "rbadversion ", blen); + if (err & ERR_MASK(RcvHdrErr)) + strlcat(buf, "rhdr ", blen); + if (err & ERR_MASK(RcvLongPktLenErr)) + strlcat(buf, "rlongpktlen ", blen); + if (err & ERR_MASK(RcvMaxPktLenErr)) + strlcat(buf, "rmaxpktlen ", blen); + if (err & ERR_MASK(RcvMinPktLenErr)) + strlcat(buf, "rminpktlen ", blen); + if (err & ERR_MASK(SendMinPktLenErr)) + strlcat(buf, "sminpktlen ", blen); + if (err & ERR_MASK(RcvFormatErr)) + strlcat(buf, "rformaterr ", blen); + if (err & ERR_MASK(RcvUnsupportedVLErr)) + strlcat(buf, "runsupvl ", blen); + if (err & ERR_MASK(RcvUnexpectedCharErr)) + strlcat(buf, "runexpchar ", blen); + if (err & ERR_MASK(RcvIBFlowErr)) + strlcat(buf, "ribflow ", blen); + if (err & ERR_MASK(SendUnderRunErr)) + strlcat(buf, "sunderrun ", blen); + if (err & ERR_MASK(SendPioArmLaunchErr)) + strlcat(buf, "spioarmlaunch ", blen); + if (err & ERR_MASK(SendUnexpectedPktNumErr)) + strlcat(buf, "sunexperrpktnum ", blen); + if (err & ERR_MASK(SendDroppedSmpPktErr)) + strlcat(buf, "sdroppedsmppkt ", blen); + if (err & ERR_MASK(SendMaxPktLenErr)) + strlcat(buf, "smaxpktlen ", blen); + if (err & ERR_MASK(SendUnsupportedVLErr)) + strlcat(buf, "sunsupVL ", blen); + if (err & ERR_MASK(InvalidAddrErr)) + strlcat(buf, "invalidaddr ", blen); + if (err & ERR_MASK(RcvEgrFullErr)) + strlcat(buf, "rcvegrfull ", blen); + if (err & ERR_MASK(RcvHdrFullErr)) + strlcat(buf, "rcvhdrfull ", blen); + if (err & ERR_MASK(IBStatusChanged)) + strlcat(buf, "ibcstatuschg ", blen); + if (err & ERR_MASK(RcvIBLostLinkErr)) + strlcat(buf, "riblostlink ", blen); + if (err & ERR_MASK(HardwareErr)) + strlcat(buf, "hardware ", blen); + if (err & ERR_MASK(ResetNegated)) + strlcat(buf, "reset ", blen); +done: + return iserr; +} + +/* + * Called when we might have an error that is specific to a particular + * PIO buffer, and may need to cancel that buffer, so it can be re-used. + */ +static void qib_disarm_6120_senderrbufs(struct qib_pportdata *ppd) +{ + unsigned long sbuf[2]; + struct qib_devdata *dd = ppd->dd; + + /* + * It's possible that sendbuffererror could have bits set; might + * have already done this as a result of hardware error handling. 
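+ * The two 64-bit sendbuffererror reads below form a single bitmap
+ * covering both the 2KB and 4KB PIO buffers, which is why
+ * qib_disarm_piobufs_set() is passed piobcnt2k + piobcnt4k.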
+ */ + sbuf[0] = qib_read_kreg64(dd, kr_sendbuffererror); + sbuf[1] = qib_read_kreg64(dd, kr_sendbuffererror + 1); + + if (sbuf[0] || sbuf[1]) + qib_disarm_piobufs_set(dd, sbuf, + dd->piobcnt2k + dd->piobcnt4k); +} + +static int chk_6120_linkrecovery(struct qib_devdata *dd, u64 ibcs) +{ + int ret = 1; + u32 ibstate = qib_6120_iblink_state(ibcs); + u32 linkrecov = read_6120_creg32(dd, cr_iblinkerrrecov); + + if (linkrecov != dd->cspec->lastlinkrecov) { + /* and no more until active again */ + dd->cspec->lastlinkrecov = 0; + qib_set_linkstate(dd->pport, QIB_IB_LINKDOWN); + ret = 0; + } + if (ibstate == IB_PORT_ACTIVE) + dd->cspec->lastlinkrecov = + read_6120_creg32(dd, cr_iblinkerrrecov); + return ret; +} + +static void handle_6120_errors(struct qib_devdata *dd, u64 errs) +{ + char *msg; + u64 ignore_this_time = 0; + u64 iserr = 0; + int log_idx; + struct qib_pportdata *ppd = dd->pport; + u64 mask; + + /* don't report errors that are masked */ + errs &= dd->cspec->errormask; + msg = dd->cspec->emsgbuf; + + /* do these first, they are most important */ + if (errs & ERR_MASK(HardwareErr)) + qib_handle_6120_hwerrors(dd, msg, sizeof(dd->cspec->emsgbuf)); + else + for (log_idx = 0; log_idx < QIB_EEP_LOG_CNT; ++log_idx) + if (errs & dd->eep_st_masks[log_idx].errs_to_log) + qib_inc_eeprom_err(dd, log_idx, 1); + + if (errs & ~IB_E_BITSEXTANT) + qib_dev_err(dd, + "error interrupt with unknown errors %llx set\n", + (unsigned long long) (errs & ~IB_E_BITSEXTANT)); + + if (errs & E_SUM_ERRS) { + qib_disarm_6120_senderrbufs(ppd); + if ((errs & E_SUM_LINK_PKTERRS) && + !(ppd->lflags & QIBL_LINKACTIVE)) { + /* + * This can happen when trying to bring the link + * up, but the IB link changes state at the "wrong" + * time. The IB logic then complains that the packet + * isn't valid. We don't want to confuse people, so + * we just don't print them, except at debug + */ + ignore_this_time = errs & E_SUM_LINK_PKTERRS; + } + } else if ((errs & E_SUM_LINK_PKTERRS) && + !(ppd->lflags & QIBL_LINKACTIVE)) { + /* + * This can happen when SMA is trying to bring the link + * up, but the IB link changes state at the "wrong" time. + * The IB logic then complains that the packet isn't + * valid. We don't want to confuse people, so we just + * don't print them, except at debug + */ + ignore_this_time = errs & E_SUM_LINK_PKTERRS; + } + + qib_write_kreg(dd, kr_errclear, errs); + + errs &= ~ignore_this_time; + if (!errs) + goto done; + + /* + * The ones we mask off are handled specially below + * or above. + */ + mask = ERR_MASK(IBStatusChanged) | ERR_MASK(RcvEgrFullErr) | + ERR_MASK(RcvHdrFullErr) | ERR_MASK(HardwareErr); + qib_decode_6120_err(dd, msg, sizeof(dd->cspec->emsgbuf), errs & ~mask); + + if (errs & E_SUM_PKTERRS) + qib_stats.sps_rcverrs++; + if (errs & E_SUM_ERRS) + qib_stats.sps_txerrs++; + + iserr = errs & ~(E_SUM_PKTERRS | QLOGIC_IB_E_PKTERRS); + + if (errs & ERR_MASK(IBStatusChanged)) { + u64 ibcs = qib_read_kreg64(dd, kr_ibcstatus); + u32 ibstate = qib_6120_iblink_state(ibcs); + int handle = 1; + + if (ibstate != IB_PORT_INIT && dd->cspec->lastlinkrecov) + handle = chk_6120_linkrecovery(dd, ibcs); + /* + * Since going into a recovery state causes the link state + * to go down and since recovery is transitory, it is better + * if we "miss" ever seeing the link training state go into + * recovery (i.e., ignore this transition for link state + * special handling purposes) without updating lastibcstat. 
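+ * (That is what the IB_PHYSPORTSTATE_LINK_ERR_RECOVER check just
+ * below implements: it skips the ibstatuschanged handling for that
+ * transient training state.)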
+ */ + if (handle && qib_6120_phys_portstate(ibcs) == + IB_PHYSPORTSTATE_LINK_ERR_RECOVER) + handle = 0; + if (handle) + qib_handle_e_ibstatuschanged(ppd, ibcs); + } + + if (errs & ERR_MASK(ResetNegated)) { + qib_dev_err(dd, + "Got reset, requires re-init (unload and reload driver)\n"); + dd->flags &= ~QIB_INITTED; /* needs re-init */ + /* mark as having had error */ + *dd->devstatusp |= QIB_STATUS_HWERROR; + *dd->pport->statusp &= ~QIB_STATUS_IB_CONF; + } + + if (*msg && iserr) + qib_dev_porterr(dd, ppd->port, "%s error\n", msg); + + if (ppd->state_wanted & ppd->lflags) + wake_up_interruptible(&ppd->state_wait); + + /* + * If there were hdrq or egrfull errors, wake up any processes + * waiting in poll. We used to try to check which contexts had + * the overflow, but given the cost of that and the chip reads + * to support it, it's better to just wake everybody up if we + * get an overflow; waiters can poll again if it's not them. + */ + if (errs & (ERR_MASK(RcvEgrFullErr) | ERR_MASK(RcvHdrFullErr))) { + qib_handle_urcv(dd, ~0U); + if (errs & ERR_MASK(RcvEgrFullErr)) + qib_stats.sps_buffull++; + else + qib_stats.sps_hdrfull++; + } +done: + return; +} + +/** + * qib_6120_init_hwerrors - enable hardware errors + * @dd: the qlogic_ib device + * + * now that we have finished initializing everything that might reasonably + * cause a hardware error, and cleared those errors bits as they occur, + * we can enable hardware errors in the mask (potentially enabling + * freeze mode), and enable hardware errors as errors (along with + * everything else) in errormask + */ +static void qib_6120_init_hwerrors(struct qib_devdata *dd) +{ + u64 val; + u64 extsval; + + extsval = qib_read_kreg64(dd, kr_extstatus); + + if (!(extsval & QLOGIC_IB_EXTS_MEMBIST_ENDTEST)) + qib_dev_err(dd, "MemBIST did not complete!\n"); + + /* init so all hwerrors interrupt, and enter freeze, ajdust below */ + val = ~0ULL; + if (dd->minrev < 2) { + /* + * Avoid problem with internal interface bus parity + * checking. Fixed in Rev2. + */ + val &= ~QLOGIC_IB_HWE_PCIEBUSPARITYRADM; + } + /* avoid some intel cpu's speculative read freeze mode issue */ + val &= ~TXEMEMPARITYERR_PIOBUF; + + dd->cspec->hwerrmask = val; + + qib_write_kreg(dd, kr_hwerrclear, ~HWE_MASK(PowerOnBISTFailed)); + qib_write_kreg(dd, kr_hwerrmask, dd->cspec->hwerrmask); + + /* clear all */ + qib_write_kreg(dd, kr_errclear, ~0ULL); + /* enable errors that are masked, at least this first time. */ + qib_write_kreg(dd, kr_errmask, ~0ULL); + dd->cspec->errormask = qib_read_kreg64(dd, kr_errmask); + /* clear any interrupts up to this point (ints still not enabled) */ + qib_write_kreg(dd, kr_intclear, ~0ULL); + + qib_write_kreg(dd, kr_rcvbthqp, + dd->qpn_mask << (QIB_6120_RcvBTHQP_BTHQP_Mask_LSB - 1) | + QIB_KD_QP); +} + +/* + * Disable and enable the armlaunch error. Used for PIO bandwidth testing + * on chips that are count-based, rather than trigger-based. There is no + * reference counting, but that's also fine, given the intended use. 
+ * Only chip-specific because it's all register accesses + */ +static void qib_set_6120_armlaunch(struct qib_devdata *dd, u32 enable) +{ + if (enable) { + qib_write_kreg(dd, kr_errclear, + ERR_MASK(SendPioArmLaunchErr)); + dd->cspec->errormask |= ERR_MASK(SendPioArmLaunchErr); + } else + dd->cspec->errormask &= ~ERR_MASK(SendPioArmLaunchErr); + qib_write_kreg(dd, kr_errmask, dd->cspec->errormask); +} + +/* + * Formerly took parameter in pre-shifted, + * pre-merged form with LinkCmd and LinkInitCmd + * together, and assuming the zero was NOP. + */ +static void qib_set_ib_6120_lstate(struct qib_pportdata *ppd, u16 linkcmd, + u16 linitcmd) +{ + u64 mod_wd; + struct qib_devdata *dd = ppd->dd; + unsigned long flags; + + if (linitcmd == QLOGIC_IB_IBCC_LINKINITCMD_DISABLE) { + /* + * If we are told to disable, note that so link-recovery + * code does not attempt to bring us back up. + */ + spin_lock_irqsave(&ppd->lflags_lock, flags); + ppd->lflags |= QIBL_IB_LINK_DISABLED; + spin_unlock_irqrestore(&ppd->lflags_lock, flags); + } else if (linitcmd || linkcmd == QLOGIC_IB_IBCC_LINKCMD_DOWN) { + /* + * Any other linkinitcmd will lead to LINKDOWN and then + * to INIT (if all is well), so clear flag to let + * link-recovery code attempt to bring us back up. + */ + spin_lock_irqsave(&ppd->lflags_lock, flags); + ppd->lflags &= ~QIBL_IB_LINK_DISABLED; + spin_unlock_irqrestore(&ppd->lflags_lock, flags); + } + + mod_wd = (linkcmd << QLOGIC_IB_IBCC_LINKCMD_SHIFT) | + (linitcmd << QLOGIC_IB_IBCC_LINKINITCMD_SHIFT); + + qib_write_kreg(dd, kr_ibcctrl, dd->cspec->ibcctrl | mod_wd); + /* write to chip to prevent back-to-back writes of control reg */ + qib_write_kreg(dd, kr_scratch, 0); +} + +/** + * qib_6120_bringup_serdes - bring up the serdes + * @dd: the qlogic_ib device + */ +static int qib_6120_bringup_serdes(struct qib_pportdata *ppd) +{ + struct qib_devdata *dd = ppd->dd; + u64 val, config1, prev_val, hwstat, ibc; + + /* Put IBC in reset, sends disabled */ + dd->control &= ~QLOGIC_IB_C_LINKENABLE; + qib_write_kreg(dd, kr_control, 0ULL); + + dd->cspec->ibdeltainprog = 1; + dd->cspec->ibsymsnap = read_6120_creg32(dd, cr_ibsymbolerr); + dd->cspec->iblnkerrsnap = read_6120_creg32(dd, cr_iblinkerrrecov); + + /* flowcontrolwatermark is in units of KBytes */ + ibc = 0x5ULL << SYM_LSB(IBCCtrl, FlowCtrlWaterMark); + /* + * How often flowctrl sent. More or less in usecs; balance against + * watermark value, so that in theory senders always get a flow + * control update in time to not let the IB link go idle. + */ + ibc |= 0x3ULL << SYM_LSB(IBCCtrl, FlowCtrlPeriod); + /* max error tolerance */ + dd->cspec->lli_thresh = 0xf; + ibc |= (u64) dd->cspec->lli_thresh << SYM_LSB(IBCCtrl, PhyerrThreshold); + /* use "real" buffer space for */ + ibc |= 4ULL << SYM_LSB(IBCCtrl, CreditScale); + /* IB credit flow control. */ + ibc |= 0xfULL << SYM_LSB(IBCCtrl, OverrunThreshold); + /* + * set initial max size pkt IBC will send, including ICRC; it's the + * PIO buffer size in dwords, less 1; also see qib_set_mtu() + */ + ibc |= ((u64)(ppd->ibmaxlen >> 2) + 1) << SYM_LSB(IBCCtrl, MaxPktLen); + dd->cspec->ibcctrl = ibc; /* without linkcmd or linkinitcmd! */ + + /* initially come up waiting for TS1, without sending anything. */ + val = dd->cspec->ibcctrl | (QLOGIC_IB_IBCC_LINKINITCMD_DISABLE << + QLOGIC_IB_IBCC_LINKINITCMD_SHIFT); + qib_write_kreg(dd, kr_ibcctrl, val); + + val = qib_read_kreg64(dd, kr_serdes_cfg0); + config1 = qib_read_kreg64(dd, kr_serdes_cfg1); + + /* + * Force reset on, also set rxdetect enable. 
Must do before reading + * serdesstatus at least for simulation, or some of the bits in + * serdes status will come back as undefined and cause simulation + * failures + */ + val |= SYM_MASK(SerdesCfg0, ResetPLL) | + SYM_MASK(SerdesCfg0, RxDetEnX) | + (SYM_MASK(SerdesCfg0, L1PwrDnA) | + SYM_MASK(SerdesCfg0, L1PwrDnB) | + SYM_MASK(SerdesCfg0, L1PwrDnC) | + SYM_MASK(SerdesCfg0, L1PwrDnD)); + qib_write_kreg(dd, kr_serdes_cfg0, val); + /* be sure chip saw it */ + qib_read_kreg64(dd, kr_scratch); + udelay(5); /* need pll reset set at least for a bit */ + /* + * after PLL is reset, set the per-lane Resets and TxIdle and + * clear the PLL reset and rxdetect (to get falling edge). + * Leave L1PWR bits set (permanently) + */ + val &= ~(SYM_MASK(SerdesCfg0, RxDetEnX) | + SYM_MASK(SerdesCfg0, ResetPLL) | + (SYM_MASK(SerdesCfg0, L1PwrDnA) | + SYM_MASK(SerdesCfg0, L1PwrDnB) | + SYM_MASK(SerdesCfg0, L1PwrDnC) | + SYM_MASK(SerdesCfg0, L1PwrDnD))); + val |= (SYM_MASK(SerdesCfg0, ResetA) | + SYM_MASK(SerdesCfg0, ResetB) | + SYM_MASK(SerdesCfg0, ResetC) | + SYM_MASK(SerdesCfg0, ResetD)) | + SYM_MASK(SerdesCfg0, TxIdeEnX); + qib_write_kreg(dd, kr_serdes_cfg0, val); + /* be sure chip saw it */ + (void) qib_read_kreg64(dd, kr_scratch); + /* need PLL reset clear for at least 11 usec before lane + * resets cleared; give it a few more to be sure */ + udelay(15); + val &= ~((SYM_MASK(SerdesCfg0, ResetA) | + SYM_MASK(SerdesCfg0, ResetB) | + SYM_MASK(SerdesCfg0, ResetC) | + SYM_MASK(SerdesCfg0, ResetD)) | + SYM_MASK(SerdesCfg0, TxIdeEnX)); + + qib_write_kreg(dd, kr_serdes_cfg0, val); + /* be sure chip saw it */ + (void) qib_read_kreg64(dd, kr_scratch); + + val = qib_read_kreg64(dd, kr_xgxs_cfg); + prev_val = val; + if (val & QLOGIC_IB_XGXS_RESET) + val &= ~QLOGIC_IB_XGXS_RESET; + if (SYM_FIELD(val, XGXSCfg, polarity_inv) != ppd->rx_pol_inv) { + /* need to compensate for Tx inversion in partner */ + val &= ~SYM_MASK(XGXSCfg, polarity_inv); + val |= (u64)ppd->rx_pol_inv << SYM_LSB(XGXSCfg, polarity_inv); + } + if (val != prev_val) + qib_write_kreg(dd, kr_xgxs_cfg, val); + + val = qib_read_kreg64(dd, kr_serdes_cfg0); + + /* clear current and de-emphasis bits */ + config1 &= ~0x0ffffffff00ULL; + /* set current to 20ma */ + config1 |= 0x00000000000ULL; + /* set de-emphasis to -5.68dB */ + config1 |= 0x0cccc000000ULL; + qib_write_kreg(dd, kr_serdes_cfg1, config1); + + /* base and port guid same for single port */ + ppd->guid = dd->base_guid; + + /* + * the process of setting and un-resetting the serdes normally + * causes a serdes PLL error, so check for that and clear it + * here. Also clearr hwerr bit in errstatus, but not others. 
+ */ + hwstat = qib_read_kreg64(dd, kr_hwerrstatus); + if (hwstat) { + /* should just have PLL, clear all set, in an case */ + qib_write_kreg(dd, kr_hwerrclear, hwstat); + qib_write_kreg(dd, kr_errclear, ERR_MASK(HardwareErr)); + } + + dd->control |= QLOGIC_IB_C_LINKENABLE; + dd->control &= ~QLOGIC_IB_C_FREEZEMODE; + qib_write_kreg(dd, kr_control, dd->control); + + return 0; +} + +/** + * qib_6120_quiet_serdes - set serdes to txidle + * @ppd: physical port of the qlogic_ib device + * Called when driver is being unloaded + */ +static void qib_6120_quiet_serdes(struct qib_pportdata *ppd) +{ + struct qib_devdata *dd = ppd->dd; + u64 val; + + qib_set_ib_6120_lstate(ppd, 0, QLOGIC_IB_IBCC_LINKINITCMD_DISABLE); + + /* disable IBC */ + dd->control &= ~QLOGIC_IB_C_LINKENABLE; + qib_write_kreg(dd, kr_control, + dd->control | QLOGIC_IB_C_FREEZEMODE); + + if (dd->cspec->ibsymdelta || dd->cspec->iblnkerrdelta || + dd->cspec->ibdeltainprog) { + u64 diagc; + + /* enable counter writes */ + diagc = qib_read_kreg64(dd, kr_hwdiagctrl); + qib_write_kreg(dd, kr_hwdiagctrl, + diagc | SYM_MASK(HwDiagCtrl, CounterWrEnable)); + + if (dd->cspec->ibsymdelta || dd->cspec->ibdeltainprog) { + val = read_6120_creg32(dd, cr_ibsymbolerr); + if (dd->cspec->ibdeltainprog) + val -= val - dd->cspec->ibsymsnap; + val -= dd->cspec->ibsymdelta; + write_6120_creg(dd, cr_ibsymbolerr, val); + } + if (dd->cspec->iblnkerrdelta || dd->cspec->ibdeltainprog) { + val = read_6120_creg32(dd, cr_iblinkerrrecov); + if (dd->cspec->ibdeltainprog) + val -= val - dd->cspec->iblnkerrsnap; + val -= dd->cspec->iblnkerrdelta; + write_6120_creg(dd, cr_iblinkerrrecov, val); + } + + /* and disable counter writes */ + qib_write_kreg(dd, kr_hwdiagctrl, diagc); + } + + val = qib_read_kreg64(dd, kr_serdes_cfg0); + val |= SYM_MASK(SerdesCfg0, TxIdeEnX); + qib_write_kreg(dd, kr_serdes_cfg0, val); +} + +/** + * qib_6120_setup_setextled - set the state of the two external LEDs + * @dd: the qlogic_ib device + * @on: whether the link is up or not + * + * The exact combo of LEDs if on is true is determined by looking + * at the ibcstatus. + + * These LEDs indicate the physical and logical state of IB link. + * For this chip (at least with recommended board pinouts), LED1 + * is Yellow (logical state) and LED2 is Green (physical state), + * + * Note: We try to match the Mellanox HCA LED behavior as best + * we can. Green indicates physical link state is OK (something is + * plugged in, and we can train). + * Amber indicates the link is logically up (ACTIVE). + * Mellanox further blinks the amber LED to indicate data packet + * activity, but we have no hardware support for that, so it would + * require waking up every 10-20 msecs and checking the counters + * on the chip, and then turning the LED off if appropriate. That's + * visible overhead, so not something we will do. + * + */ +static void qib_6120_setup_setextled(struct qib_pportdata *ppd, u32 on) +{ + u64 extctl, val, lst, ltst; + unsigned long flags; + struct qib_devdata *dd = ppd->dd; + + /* + * The diags use the LED to indicate diag info, so we leave + * the external LED alone when the diags are running. + */ + if (dd->diag_client) + return; + + /* Allow override of LED display for, e.g. Locating system in rack */ + if (ppd->led_override) { + ltst = (ppd->led_override & QIB_LED_PHYS) ? + IB_PHYSPORTSTATE_LINKUP : IB_PHYSPORTSTATE_DISABLED, + lst = (ppd->led_override & QIB_LED_LOG) ? 
+ IB_PORT_ACTIVE : IB_PORT_DOWN; + } else if (on) { + val = qib_read_kreg64(dd, kr_ibcstatus); + ltst = qib_6120_phys_portstate(val); + lst = qib_6120_iblink_state(val); + } else { + ltst = 0; + lst = 0; + } + + spin_lock_irqsave(&dd->cspec->gpio_lock, flags); + extctl = dd->cspec->extctrl & ~(SYM_MASK(EXTCtrl, LEDPriPortGreenOn) | + SYM_MASK(EXTCtrl, LEDPriPortYellowOn)); + + if (ltst == IB_PHYSPORTSTATE_LINKUP) + extctl |= SYM_MASK(EXTCtrl, LEDPriPortYellowOn); + if (lst == IB_PORT_ACTIVE) + extctl |= SYM_MASK(EXTCtrl, LEDPriPortGreenOn); + dd->cspec->extctrl = extctl; + qib_write_kreg(dd, kr_extctrl, extctl); + spin_unlock_irqrestore(&dd->cspec->gpio_lock, flags); +} + +static void qib_6120_free_irq(struct qib_devdata *dd) +{ + if (dd->cspec->irq) { + free_irq(dd->cspec->irq, dd); + dd->cspec->irq = 0; + } + qib_nomsi(dd); +} + +/** + * qib_6120_setup_cleanup - clean up any per-chip chip-specific stuff + * @dd: the qlogic_ib device + * + * This is called during driver unload. +*/ +static void qib_6120_setup_cleanup(struct qib_devdata *dd) +{ + qib_6120_free_irq(dd); + kfree(dd->cspec->cntrs); + kfree(dd->cspec->portcntrs); + if (dd->cspec->dummy_hdrq) { + dma_free_coherent(&dd->pcidev->dev, + ALIGN(dd->rcvhdrcnt * + dd->rcvhdrentsize * + sizeof(u32), PAGE_SIZE), + dd->cspec->dummy_hdrq, + dd->cspec->dummy_hdrq_phys); + dd->cspec->dummy_hdrq = NULL; + } +} + +static void qib_wantpiobuf_6120_intr(struct qib_devdata *dd, u32 needint) +{ + unsigned long flags; + + spin_lock_irqsave(&dd->sendctrl_lock, flags); + if (needint) + dd->sendctrl |= SYM_MASK(SendCtrl, PIOIntBufAvail); + else + dd->sendctrl &= ~SYM_MASK(SendCtrl, PIOIntBufAvail); + qib_write_kreg(dd, kr_sendctrl, dd->sendctrl); + qib_write_kreg(dd, kr_scratch, 0ULL); + spin_unlock_irqrestore(&dd->sendctrl_lock, flags); +} + +/* + * handle errors and unusual events first, separate function + * to improve cache hits for fast path interrupt handling + */ +static noinline void unlikely_6120_intr(struct qib_devdata *dd, u64 istat) +{ + if (unlikely(istat & ~QLOGIC_IB_I_BITSEXTANT)) + qib_dev_err(dd, "interrupt with unknown interrupts %Lx set\n", + istat & ~QLOGIC_IB_I_BITSEXTANT); + + if (istat & QLOGIC_IB_I_ERROR) { + u64 estat = 0; + + qib_stats.sps_errints++; + estat = qib_read_kreg64(dd, kr_errstatus); + if (!estat) + qib_devinfo(dd->pcidev, + "error interrupt (%Lx), but no error bits set!\n", + istat); + handle_6120_errors(dd, estat); + } + + if (istat & QLOGIC_IB_I_GPIO) { + u32 gpiostatus; + u32 to_clear = 0; + + /* + * GPIO_3..5 on IBA6120 Rev2 chips indicate + * errors that we need to count. + */ + gpiostatus = qib_read_kreg32(dd, kr_gpio_status); + /* First the error-counter case. */ + if (gpiostatus & GPIO_ERRINTR_MASK) { + /* want to clear the bits we see asserted. */ + to_clear |= (gpiostatus & GPIO_ERRINTR_MASK); + + /* + * Count appropriately, clear bits out of our copy, + * as they have been "handled". + */ + if (gpiostatus & (1 << GPIO_RXUVL_BIT)) + dd->cspec->rxfc_unsupvl_errs++; + if (gpiostatus & (1 << GPIO_OVRUN_BIT)) + dd->cspec->overrun_thresh_errs++; + if (gpiostatus & (1 << GPIO_LLI_BIT)) + dd->cspec->lli_errs++; + gpiostatus &= ~GPIO_ERRINTR_MASK; + } + if (gpiostatus) { + /* + * Some unexpected bits remain. If they could have + * caused the interrupt, complain and clear. + * To avoid repetition of this condition, also clear + * the mask. It is almost certainly due to error. 
+ */ + const u32 mask = qib_read_kreg32(dd, kr_gpio_mask); + + /* + * Also check that the chip reflects our shadow, + * and report issues, If they caused the interrupt. + * we will suppress by refreshing from the shadow. + */ + if (mask & gpiostatus) { + to_clear |= (gpiostatus & mask); + dd->cspec->gpio_mask &= ~(gpiostatus & mask); + qib_write_kreg(dd, kr_gpio_mask, + dd->cspec->gpio_mask); + } + } + if (to_clear) + qib_write_kreg(dd, kr_gpio_clear, (u64) to_clear); + } +} + +static irqreturn_t qib_6120intr(int irq, void *data) +{ + struct qib_devdata *dd = data; + irqreturn_t ret; + u32 istat, ctxtrbits, rmask, crcs = 0; + unsigned i; + + if ((dd->flags & (QIB_PRESENT | QIB_BADINTR)) != QIB_PRESENT) { + /* + * This return value is not great, but we do not want the + * interrupt core code to remove our interrupt handler + * because we don't appear to be handling an interrupt + * during a chip reset. + */ + ret = IRQ_HANDLED; + goto bail; + } + + istat = qib_read_kreg32(dd, kr_intstatus); + + if (unlikely(!istat)) { + ret = IRQ_NONE; /* not our interrupt, or already handled */ + goto bail; + } + if (unlikely(istat == -1)) { + qib_bad_intrstatus(dd); + /* don't know if it was our interrupt or not */ + ret = IRQ_NONE; + goto bail; + } + + this_cpu_inc(*dd->int_counter); + + if (unlikely(istat & (~QLOGIC_IB_I_BITSEXTANT | + QLOGIC_IB_I_GPIO | QLOGIC_IB_I_ERROR))) + unlikely_6120_intr(dd, istat); + + /* + * Clear the interrupt bits we found set, relatively early, so we + * "know" know the chip will have seen this by the time we process + * the queue, and will re-interrupt if necessary. The processor + * itself won't take the interrupt again until we return. + */ + qib_write_kreg(dd, kr_intclear, istat); + + /* + * Handle kernel receive queues before checking for pio buffers + * available since receives can overflow; piobuf waiters can afford + * a few extra cycles, since they were waiting anyway. + */ + ctxtrbits = istat & + ((QLOGIC_IB_I_RCVAVAIL_MASK << QLOGIC_IB_I_RCVAVAIL_SHIFT) | + (QLOGIC_IB_I_RCVURG_MASK << QLOGIC_IB_I_RCVURG_SHIFT)); + if (ctxtrbits) { + rmask = (1U << QLOGIC_IB_I_RCVAVAIL_SHIFT) | + (1U << QLOGIC_IB_I_RCVURG_SHIFT); + for (i = 0; i < dd->first_user_ctxt; i++) { + if (ctxtrbits & rmask) { + ctxtrbits &= ~rmask; + crcs += qib_kreceive(dd->rcd[i], + &dd->cspec->lli_counter, + NULL); + } + rmask <<= 1; + } + if (crcs) { + u32 cntr = dd->cspec->lli_counter; + + cntr += crcs; + if (cntr) { + if (cntr > dd->cspec->lli_thresh) { + dd->cspec->lli_counter = 0; + dd->cspec->lli_errs++; + } else + dd->cspec->lli_counter += cntr; + } + } + + + if (ctxtrbits) { + ctxtrbits = + (ctxtrbits >> QLOGIC_IB_I_RCVAVAIL_SHIFT) | + (ctxtrbits >> QLOGIC_IB_I_RCVURG_SHIFT); + qib_handle_urcv(dd, ctxtrbits); + } + } + + if ((istat & QLOGIC_IB_I_SPIOBUFAVAIL) && (dd->flags & QIB_INITTED)) + qib_ib_piobufavail(dd); + + ret = IRQ_HANDLED; +bail: + return ret; +} + +/* + * Set up our chip-specific interrupt handler + * The interrupt type has already been setup, so + * we just need to do the registration and error checking. + */ +static void qib_setup_6120_interrupt(struct qib_devdata *dd) +{ + /* + * If the chip supports added error indication via GPIO pins, + * enable interrupts on those bits so the interrupt routine + * can count the events. Also set flag so interrupt routine + * can know they are expected. 
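+ * These are the same GPIO bits (GPIO_RXUVL_BIT, GPIO_OVRUN_BIT and
+ * GPIO_LLI_BIT) that unlikely_6120_intr() above counts into
+ * rxfc_unsupvl_errs, overrun_thresh_errs and lli_errs.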
+ */ + if (SYM_FIELD(dd->revision, Revision_R, + ChipRevMinor) > 1) { + /* Rev2+ reports extra errors via internal GPIO pins */ + dd->cspec->gpio_mask |= GPIO_ERRINTR_MASK; + qib_write_kreg(dd, kr_gpio_mask, dd->cspec->gpio_mask); + } + + if (!dd->cspec->irq) + qib_dev_err(dd, + "irq is 0, BIOS error? Interrupts won't work\n"); + else { + int ret; + + ret = request_irq(dd->cspec->irq, qib_6120intr, 0, + QIB_DRV_NAME, dd); + if (ret) + qib_dev_err(dd, + "Couldn't setup interrupt (irq=%d): %d\n", + dd->cspec->irq, ret); + } +} + +/** + * pe_boardname - fill in the board name + * @dd: the qlogic_ib device + * + * info is based on the board revision register + */ +static void pe_boardname(struct qib_devdata *dd) +{ + char *n; + u32 boardid, namelen; + + boardid = SYM_FIELD(dd->revision, Revision, + BoardID); + + switch (boardid) { + case 2: + n = "InfiniPath_QLE7140"; + break; + default: + qib_dev_err(dd, "Unknown 6120 board with ID %u\n", boardid); + n = "Unknown_InfiniPath_6120"; + break; + } + namelen = strlen(n) + 1; + dd->boardname = kmalloc(namelen, GFP_KERNEL); + if (!dd->boardname) + qib_dev_err(dd, "Failed allocation for board name: %s\n", n); + else + snprintf(dd->boardname, namelen, "%s", n); + + if (dd->majrev != 4 || !dd->minrev || dd->minrev > 2) + qib_dev_err(dd, + "Unsupported InfiniPath hardware revision %u.%u!\n", + dd->majrev, dd->minrev); + + snprintf(dd->boardversion, sizeof(dd->boardversion), + "ChipABI %u.%u, %s, InfiniPath%u %u.%u, SW Compat %u\n", + QIB_CHIP_VERS_MAJ, QIB_CHIP_VERS_MIN, dd->boardname, + (unsigned)SYM_FIELD(dd->revision, Revision_R, Arch), + dd->majrev, dd->minrev, + (unsigned)SYM_FIELD(dd->revision, Revision_R, SW)); + +} + +/* + * This routine sleeps, so it can only be called from user context, not + * from interrupt context. If we need interrupt context, we can split + * it into two routines. + */ +static int qib_6120_setup_reset(struct qib_devdata *dd) +{ + u64 val; + int i; + int ret; + u16 cmdval; + u8 int_line, clinesz; + + qib_pcie_getcmd(dd, &cmdval, &int_line, &clinesz); + + /* Use ERROR so it shows up in logs, etc. */ + qib_dev_err(dd, "Resetting InfiniPath unit %u\n", dd->unit); + + /* no interrupts till re-initted */ + qib_6120_set_intr_state(dd, 0); + + dd->cspec->ibdeltainprog = 0; + dd->cspec->ibsymdelta = 0; + dd->cspec->iblnkerrdelta = 0; + + /* + * Keep chip from being accessed until we are ready. Use + * writeq() directly, to allow the write even though QIB_PRESENT + * isn't set. + */ + dd->flags &= ~(QIB_INITTED | QIB_PRESENT); + /* so we check interrupts work again */ + dd->z_int_counter = qib_int_counter(dd); + val = dd->control | QLOGIC_IB_C_RESET; + writeq(val, &dd->kregbase[kr_control]); + mb(); /* prevent compiler re-ordering around actual reset */ + + for (i = 1; i <= 5; i++) { + /* + * Allow MBIST, etc. to complete; longer on each retry. + * We sometimes get machine checks from bus timeout if no + * response, so for now, make it *really* long. + */ + msleep(1000 + (1 + i) * 2000); + + qib_pcie_reenable(dd, cmdval, int_line, clinesz); + + /* + * Use readq directly, so we don't need to mark it as PRESENT + * until we get a successful indication that all is well. 
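+ * The revision register is that indication: once it reads back the
+ * value saved at init time (dd->revision), MMIO is working again and
+ * QIB_PRESENT is restored below.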
+ */ + val = readq(&dd->kregbase[kr_revision]); + if (val == dd->revision) { + dd->flags |= QIB_PRESENT; /* it's back */ + ret = qib_reinit_intr(dd); + goto bail; + } + } + ret = 0; /* failed */ + +bail: + if (ret) { + if (qib_pcie_params(dd, dd->lbus_width, NULL, NULL)) + qib_dev_err(dd, + "Reset failed to setup PCIe or interrupts; continuing anyway\n"); + /* clear the reset error, init error/hwerror mask */ + qib_6120_init_hwerrors(dd); + /* for Rev2 error interrupts; nop for rev 1 */ + qib_write_kreg(dd, kr_gpio_mask, dd->cspec->gpio_mask); + /* clear the reset error, init error/hwerror mask */ + qib_6120_init_hwerrors(dd); + } + return ret; +} + +/** + * qib_6120_put_tid - write a TID in chip + * @dd: the qlogic_ib device + * @tidptr: pointer to the expected TID (in chip) to update + * @tidtype: RCVHQ_RCV_TYPE_EAGER (1) for eager, RCVHQ_RCV_TYPE_EXPECTED (0) + * for expected + * @pa: physical address of in memory buffer; tidinvalid if freeing + * + * This exists as a separate routine to allow for special locking etc. + * It's used for both the full cleanup on exit, as well as the normal + * setup and teardown. + */ +static void qib_6120_put_tid(struct qib_devdata *dd, u64 __iomem *tidptr, + u32 type, unsigned long pa) +{ + u32 __iomem *tidp32 = (u32 __iomem *)tidptr; + unsigned long flags; + int tidx; + spinlock_t *tidlockp; /* select appropriate spinlock */ + + if (!dd->kregbase) + return; + + if (pa != dd->tidinvalid) { + if (pa & ((1U << 11) - 1)) { + qib_dev_err(dd, "Physaddr %lx not 2KB aligned!\n", + pa); + return; + } + pa >>= 11; + if (pa & ~QLOGIC_IB_RT_ADDR_MASK) { + qib_dev_err(dd, + "Physical page address 0x%lx larger than supported\n", + pa); + return; + } + + if (type == RCVHQ_RCV_TYPE_EAGER) + pa |= dd->tidtemplate; + else /* for now, always full 4KB page */ + pa |= 2 << 29; + } + + /* + * Avoid chip issue by writing the scratch register + * before and after the TID, and with an io write barrier. + * We use a spinlock around the writes, so they can't intermix + * with other TID (eager or expected) writes (the chip problem + * is triggered by back to back TID writes). Unfortunately, this + * call can be done from interrupt level for the ctxt 0 eager TIDs, + * so we have to use irqsave locks. + */ + /* + * Assumes tidptr always > egrtidbase + * if type == RCVHQ_RCV_TYPE_EAGER. + */ + tidx = tidptr - dd->egrtidbase; + + tidlockp = (type == RCVHQ_RCV_TYPE_EAGER && tidx < dd->rcvhdrcnt) + ? &dd->cspec->kernel_tid_lock : &dd->cspec->user_tid_lock; + spin_lock_irqsave(tidlockp, flags); + qib_write_kreg(dd, kr_scratch, 0xfeeddeaf); + writel(pa, tidp32); + qib_write_kreg(dd, kr_scratch, 0xdeadbeef); + mmiowb(); + spin_unlock_irqrestore(tidlockp, flags); +} + +/** + * qib_6120_put_tid_2 - write a TID in chip, Revision 2 or higher + * @dd: the qlogic_ib device + * @tidptr: pointer to the expected TID (in chip) to update + * @tidtype: RCVHQ_RCV_TYPE_EAGER (1) for eager, RCVHQ_RCV_TYPE_EXPECTED (0) + * for expected + * @pa: physical address of in memory buffer; tidinvalid if freeing + * + * This exists as a separate routine to allow for selection of the + * appropriate "flavor". The static calls in cleanup just use the + * revision-agnostic form, as they are not performance critical. 
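+ * Unlike qib_6120_put_tid() above, no scratch-register writes or TID
+ * spinlocks are used here; the back-to-back TID write workaround is
+ * presumably unnecessary on Rev2 or later silicon.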
+ */ +static void qib_6120_put_tid_2(struct qib_devdata *dd, u64 __iomem *tidptr, + u32 type, unsigned long pa) +{ + u32 __iomem *tidp32 = (u32 __iomem *)tidptr; + u32 tidx; + + if (!dd->kregbase) + return; + + if (pa != dd->tidinvalid) { + if (pa & ((1U << 11) - 1)) { + qib_dev_err(dd, "Physaddr %lx not 2KB aligned!\n", + pa); + return; + } + pa >>= 11; + if (pa & ~QLOGIC_IB_RT_ADDR_MASK) { + qib_dev_err(dd, + "Physical page address 0x%lx larger than supported\n", + pa); + return; + } + + if (type == RCVHQ_RCV_TYPE_EAGER) + pa |= dd->tidtemplate; + else /* for now, always full 4KB page */ + pa |= 2 << 29; + } + tidx = tidptr - dd->egrtidbase; + writel(pa, tidp32); + mmiowb(); +} + + +/** + * qib_6120_clear_tids - clear all TID entries for a context, expected and eager + * @dd: the qlogic_ib device + * @ctxt: the context + * + * clear all TID entries for a context, expected and eager. + * Used from qib_close(). On this chip, TIDs are only 32 bits, + * not 64, but they are still on 64 bit boundaries, so tidbase + * is declared as u64 * for the pointer math, even though we write 32 bits + */ +static void qib_6120_clear_tids(struct qib_devdata *dd, + struct qib_ctxtdata *rcd) +{ + u64 __iomem *tidbase; + unsigned long tidinv; + u32 ctxt; + int i; + + if (!dd->kregbase || !rcd) + return; + + ctxt = rcd->ctxt; + + tidinv = dd->tidinvalid; + tidbase = (u64 __iomem *) + ((char __iomem *)(dd->kregbase) + + dd->rcvtidbase + + ctxt * dd->rcvtidcnt * sizeof(*tidbase)); + + for (i = 0; i < dd->rcvtidcnt; i++) + /* use func pointer because could be one of two funcs */ + dd->f_put_tid(dd, &tidbase[i], RCVHQ_RCV_TYPE_EXPECTED, + tidinv); + + tidbase = (u64 __iomem *) + ((char __iomem *)(dd->kregbase) + + dd->rcvegrbase + + rcd->rcvegr_tid_base * sizeof(*tidbase)); + + for (i = 0; i < rcd->rcvegrcnt; i++) + /* use func pointer because could be one of two funcs */ + dd->f_put_tid(dd, &tidbase[i], RCVHQ_RCV_TYPE_EAGER, + tidinv); +} + +/** + * qib_6120_tidtemplate - setup constants for TID updates + * @dd: the qlogic_ib device + * + * We setup stuff that we use a lot, to avoid calculating each time + */ +static void qib_6120_tidtemplate(struct qib_devdata *dd) +{ + u32 egrsize = dd->rcvegrbufsize; + + /* + * For now, we always allocate 4KB buffers (at init) so we can + * receive max size packets. We may want a module parameter to + * specify 2KB or 4KB and/or make be per ctxt instead of per device + * for those who want to reduce memory footprint. Note that the + * rcvhdrentsize size must be large enough to hold the largest + * IB header (currently 96 bytes) that we expect to handle (plus of + * course the 2 dwords of RHF). + */ + if (egrsize == 2048) + dd->tidtemplate = 1U << 29; + else if (egrsize == 4096) + dd->tidtemplate = 2U << 29; + dd->tidinvalid = 0; +} + +int __attribute__((weak)) qib_unordered_wc(void) +{ + return 0; +} + +/** + * qib_6120_get_base_info - set chip-specific flags for user code + * @rcd: the qlogic_ib ctxt + * @kbase: qib_base_info pointer + * + * We set the PCIE flag because the lower bandwidth on PCIe vs + * HyperTransport can affect some user packet algorithms. 
+ */ +static int qib_6120_get_base_info(struct qib_ctxtdata *rcd, + struct qib_base_info *kinfo) +{ + if (qib_unordered_wc()) + kinfo->spi_runtime_flags |= QIB_RUNTIME_FORCE_WC_ORDER; + + kinfo->spi_runtime_flags |= QIB_RUNTIME_PCIE | + QIB_RUNTIME_FORCE_PIOAVAIL | QIB_RUNTIME_PIO_REGSWAPPED; + return 0; +} + + +static struct qib_message_header * +qib_6120_get_msgheader(struct qib_devdata *dd, __le32 *rhf_addr) +{ + return (struct qib_message_header *) + &rhf_addr[sizeof(u64) / sizeof(u32)]; +} + +static void qib_6120_config_ctxts(struct qib_devdata *dd) +{ + dd->ctxtcnt = qib_read_kreg32(dd, kr_portcnt); + if (qib_n_krcv_queues > 1) { + dd->first_user_ctxt = qib_n_krcv_queues * dd->num_pports; + if (dd->first_user_ctxt > dd->ctxtcnt) + dd->first_user_ctxt = dd->ctxtcnt; + dd->qpn_mask = dd->first_user_ctxt <= 2 ? 2 : 6; + } else + dd->first_user_ctxt = dd->num_pports; + dd->n_krcv_queues = dd->first_user_ctxt; +} + +static void qib_update_6120_usrhead(struct qib_ctxtdata *rcd, u64 hd, + u32 updegr, u32 egrhd, u32 npkts) +{ + if (updegr) + qib_write_ureg(rcd->dd, ur_rcvegrindexhead, egrhd, rcd->ctxt); + mmiowb(); + qib_write_ureg(rcd->dd, ur_rcvhdrhead, hd, rcd->ctxt); + mmiowb(); +} + +static u32 qib_6120_hdrqempty(struct qib_ctxtdata *rcd) +{ + u32 head, tail; + + head = qib_read_ureg32(rcd->dd, ur_rcvhdrhead, rcd->ctxt); + if (rcd->rcvhdrtail_kvaddr) + tail = qib_get_rcvhdrtail(rcd); + else + tail = qib_read_ureg32(rcd->dd, ur_rcvhdrtail, rcd->ctxt); + return head == tail; +} + +/* + * Used when we close any ctxt, for DMA already in flight + * at close. Can't be done until we know hdrq size, so not + * early in chip init. + */ +static void alloc_dummy_hdrq(struct qib_devdata *dd) +{ + dd->cspec->dummy_hdrq = dma_alloc_coherent(&dd->pcidev->dev, + dd->rcd[0]->rcvhdrq_size, + &dd->cspec->dummy_hdrq_phys, + GFP_ATOMIC | __GFP_COMP); + if (!dd->cspec->dummy_hdrq) { + qib_devinfo(dd->pcidev, "Couldn't allocate dummy hdrq\n"); + /* fallback to just 0'ing */ + dd->cspec->dummy_hdrq_phys = 0UL; + } +} + +/* + * Modify the RCVCTRL register in chip-specific way. This + * is a function because bit positions and (future) register + * location is chip-specific, but the needed operations are + * generic. is a bit-mask because we often want to + * do multiple modifications. + */ +static void rcvctrl_6120_mod(struct qib_pportdata *ppd, unsigned int op, + int ctxt) +{ + struct qib_devdata *dd = ppd->dd; + u64 mask, val; + unsigned long flags; + + spin_lock_irqsave(&dd->cspec->rcvmod_lock, flags); + + if (op & QIB_RCVCTRL_TAILUPD_ENB) + dd->rcvctrl |= (1ULL << QLOGIC_IB_R_TAILUPD_SHIFT); + if (op & QIB_RCVCTRL_TAILUPD_DIS) + dd->rcvctrl &= ~(1ULL << QLOGIC_IB_R_TAILUPD_SHIFT); + if (op & QIB_RCVCTRL_PKEY_ENB) + dd->rcvctrl &= ~(1ULL << IBA6120_R_PKEY_DIS_SHIFT); + if (op & QIB_RCVCTRL_PKEY_DIS) + dd->rcvctrl |= (1ULL << IBA6120_R_PKEY_DIS_SHIFT); + if (ctxt < 0) + mask = (1ULL << dd->ctxtcnt) - 1; + else + mask = (1ULL << ctxt); + if (op & QIB_RCVCTRL_CTXT_ENB) { + /* always done for specific ctxt */ + dd->rcvctrl |= (mask << SYM_LSB(RcvCtrl, PortEnable)); + if (!(dd->flags & QIB_NODMA_RTAIL)) + dd->rcvctrl |= 1ULL << QLOGIC_IB_R_TAILUPD_SHIFT; + /* Write these registers before the context is enabled. 
*/ + qib_write_kreg_ctxt(dd, kr_rcvhdrtailaddr, ctxt, + dd->rcd[ctxt]->rcvhdrqtailaddr_phys); + qib_write_kreg_ctxt(dd, kr_rcvhdraddr, ctxt, + dd->rcd[ctxt]->rcvhdrq_phys); + + if (ctxt == 0 && !dd->cspec->dummy_hdrq) + alloc_dummy_hdrq(dd); + } + if (op & QIB_RCVCTRL_CTXT_DIS) + dd->rcvctrl &= ~(mask << SYM_LSB(RcvCtrl, PortEnable)); + if (op & QIB_RCVCTRL_INTRAVAIL_ENB) + dd->rcvctrl |= (mask << QLOGIC_IB_R_INTRAVAIL_SHIFT); + if (op & QIB_RCVCTRL_INTRAVAIL_DIS) + dd->rcvctrl &= ~(mask << QLOGIC_IB_R_INTRAVAIL_SHIFT); + qib_write_kreg(dd, kr_rcvctrl, dd->rcvctrl); + if ((op & QIB_RCVCTRL_INTRAVAIL_ENB) && dd->rhdrhead_intr_off) { + /* arm rcv interrupt */ + val = qib_read_ureg32(dd, ur_rcvhdrhead, ctxt) | + dd->rhdrhead_intr_off; + qib_write_ureg(dd, ur_rcvhdrhead, val, ctxt); + } + if (op & QIB_RCVCTRL_CTXT_ENB) { + /* + * Init the context registers also; if we were + * disabled, tail and head should both be zero + * already from the enable, but since we don't + * know, we have to do it explicitly. + */ + val = qib_read_ureg32(dd, ur_rcvegrindextail, ctxt); + qib_write_ureg(dd, ur_rcvegrindexhead, val, ctxt); + + val = qib_read_ureg32(dd, ur_rcvhdrtail, ctxt); + dd->rcd[ctxt]->head = val; + /* If kctxt, interrupt on next receive. */ + if (ctxt < dd->first_user_ctxt) + val |= dd->rhdrhead_intr_off; + qib_write_ureg(dd, ur_rcvhdrhead, val, ctxt); + } + if (op & QIB_RCVCTRL_CTXT_DIS) { + /* + * Be paranoid, and never write 0's to these, just use an + * unused page. Of course, + * rcvhdraddr points to a large chunk of memory, so this + * could still trash things, but at least it won't trash + * page 0, and by disabling the ctxt, it should stop "soon", + * even if a packet or two is in already in flight after we + * disabled the ctxt. Only 6120 has this issue. + */ + if (ctxt >= 0) { + qib_write_kreg_ctxt(dd, kr_rcvhdrtailaddr, ctxt, + dd->cspec->dummy_hdrq_phys); + qib_write_kreg_ctxt(dd, kr_rcvhdraddr, ctxt, + dd->cspec->dummy_hdrq_phys); + } else { + unsigned i; + + for (i = 0; i < dd->cfgctxts; i++) { + qib_write_kreg_ctxt(dd, kr_rcvhdrtailaddr, + i, dd->cspec->dummy_hdrq_phys); + qib_write_kreg_ctxt(dd, kr_rcvhdraddr, + i, dd->cspec->dummy_hdrq_phys); + } + } + } + spin_unlock_irqrestore(&dd->cspec->rcvmod_lock, flags); +} + +/* + * Modify the SENDCTRL register in chip-specific way. This + * is a function there may be multiple such registers with + * slightly different layouts. Only operations actually used + * are implemented yet. + * Chip requires no back-back sendctrl writes, so write + * scratch register after writing sendctrl + */ +static void sendctrl_6120_mod(struct qib_pportdata *ppd, u32 op) +{ + struct qib_devdata *dd = ppd->dd; + u64 tmp_dd_sendctrl; + unsigned long flags; + + spin_lock_irqsave(&dd->sendctrl_lock, flags); + + /* First the ones that are "sticky", saved in shadow */ + if (op & QIB_SENDCTRL_CLEAR) + dd->sendctrl = 0; + if (op & QIB_SENDCTRL_SEND_DIS) + dd->sendctrl &= ~SYM_MASK(SendCtrl, PIOEnable); + else if (op & QIB_SENDCTRL_SEND_ENB) + dd->sendctrl |= SYM_MASK(SendCtrl, PIOEnable); + if (op & QIB_SENDCTRL_AVAIL_DIS) + dd->sendctrl &= ~SYM_MASK(SendCtrl, PIOBufAvailUpd); + else if (op & QIB_SENDCTRL_AVAIL_ENB) + dd->sendctrl |= SYM_MASK(SendCtrl, PIOBufAvailUpd); + + if (op & QIB_SENDCTRL_DISARM_ALL) { + u32 i, last; + + tmp_dd_sendctrl = dd->sendctrl; + /* + * disarm any that are not yet launched, disabling sends + * and updates until done. 
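+ * Each sendctrl write disarms a single buffer (its index is ORed in
+ * with the Disarm bit), so the loop below walks every 2KB and 4KB
+ * buffer, with a scratch write after each sendctrl write as the chip
+ * requires.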
+ */ + last = dd->piobcnt2k + dd->piobcnt4k; + tmp_dd_sendctrl &= + ~(SYM_MASK(SendCtrl, PIOEnable) | + SYM_MASK(SendCtrl, PIOBufAvailUpd)); + for (i = 0; i < last; i++) { + qib_write_kreg(dd, kr_sendctrl, tmp_dd_sendctrl | + SYM_MASK(SendCtrl, Disarm) | i); + qib_write_kreg(dd, kr_scratch, 0); + } + } + + tmp_dd_sendctrl = dd->sendctrl; + + if (op & QIB_SENDCTRL_FLUSH) + tmp_dd_sendctrl |= SYM_MASK(SendCtrl, Abort); + if (op & QIB_SENDCTRL_DISARM) + tmp_dd_sendctrl |= SYM_MASK(SendCtrl, Disarm) | + ((op & QIB_6120_SendCtrl_DisarmPIOBuf_RMASK) << + SYM_LSB(SendCtrl, DisarmPIOBuf)); + if (op & QIB_SENDCTRL_AVAIL_BLIP) + tmp_dd_sendctrl &= ~SYM_MASK(SendCtrl, PIOBufAvailUpd); + + qib_write_kreg(dd, kr_sendctrl, tmp_dd_sendctrl); + qib_write_kreg(dd, kr_scratch, 0); + + if (op & QIB_SENDCTRL_AVAIL_BLIP) { + qib_write_kreg(dd, kr_sendctrl, dd->sendctrl); + qib_write_kreg(dd, kr_scratch, 0); + } + + spin_unlock_irqrestore(&dd->sendctrl_lock, flags); + + if (op & QIB_SENDCTRL_FLUSH) { + u32 v; + /* + * ensure writes have hit chip, then do a few + * more reads, to allow DMA of pioavail registers + * to occur, so in-memory copy is in sync with + * the chip. Not always safe to sleep. + */ + v = qib_read_kreg32(dd, kr_scratch); + qib_write_kreg(dd, kr_scratch, v); + v = qib_read_kreg32(dd, kr_scratch); + qib_write_kreg(dd, kr_scratch, v); + qib_read_kreg32(dd, kr_scratch); + } +} + +/** + * qib_portcntr_6120 - read a per-port counter + * @dd: the qlogic_ib device + * @creg: the counter to snapshot + */ +static u64 qib_portcntr_6120(struct qib_pportdata *ppd, u32 reg) +{ + u64 ret = 0ULL; + struct qib_devdata *dd = ppd->dd; + u16 creg; + /* 0xffff for unimplemented or synthesized counters */ + static const u16 xlator[] = { + [QIBPORTCNTR_PKTSEND] = cr_pktsend, + [QIBPORTCNTR_WORDSEND] = cr_wordsend, + [QIBPORTCNTR_PSXMITDATA] = 0xffff, + [QIBPORTCNTR_PSXMITPKTS] = 0xffff, + [QIBPORTCNTR_PSXMITWAIT] = 0xffff, + [QIBPORTCNTR_SENDSTALL] = cr_sendstall, + [QIBPORTCNTR_PKTRCV] = cr_pktrcv, + [QIBPORTCNTR_PSRCVDATA] = 0xffff, + [QIBPORTCNTR_PSRCVPKTS] = 0xffff, + [QIBPORTCNTR_RCVEBP] = cr_rcvebp, + [QIBPORTCNTR_RCVOVFL] = cr_rcvovfl, + [QIBPORTCNTR_WORDRCV] = cr_wordrcv, + [QIBPORTCNTR_RXDROPPKT] = cr_rxdroppkt, + [QIBPORTCNTR_RXLOCALPHYERR] = 0xffff, + [QIBPORTCNTR_RXVLERR] = 0xffff, + [QIBPORTCNTR_ERRICRC] = cr_erricrc, + [QIBPORTCNTR_ERRVCRC] = cr_errvcrc, + [QIBPORTCNTR_ERRLPCRC] = cr_errlpcrc, + [QIBPORTCNTR_BADFORMAT] = cr_badformat, + [QIBPORTCNTR_ERR_RLEN] = cr_err_rlen, + [QIBPORTCNTR_IBSYMBOLERR] = cr_ibsymbolerr, + [QIBPORTCNTR_INVALIDRLEN] = cr_invalidrlen, + [QIBPORTCNTR_UNSUPVL] = cr_txunsupvl, + [QIBPORTCNTR_EXCESSBUFOVFL] = 0xffff, + [QIBPORTCNTR_ERRLINK] = cr_errlink, + [QIBPORTCNTR_IBLINKDOWN] = cr_iblinkdown, + [QIBPORTCNTR_IBLINKERRRECOV] = cr_iblinkerrrecov, + [QIBPORTCNTR_LLI] = 0xffff, + [QIBPORTCNTR_PSINTERVAL] = 0xffff, + [QIBPORTCNTR_PSSTART] = 0xffff, + [QIBPORTCNTR_PSSTAT] = 0xffff, + [QIBPORTCNTR_VL15PKTDROP] = 0xffff, + [QIBPORTCNTR_ERRPKEY] = cr_errpkey, + [QIBPORTCNTR_KHDROVFL] = 0xffff, + }; + + if (reg >= ARRAY_SIZE(xlator)) { + qib_devinfo(ppd->dd->pcidev, + "Unimplemented portcounter %u\n", reg); + goto done; + } + creg = xlator[reg]; + + /* handle counters requests not implemented as chip counters */ + if (reg == QIBPORTCNTR_LLI) + ret = dd->cspec->lli_errs; + else if (reg == QIBPORTCNTR_EXCESSBUFOVFL) + ret = dd->cspec->overrun_thresh_errs; + else if (reg == QIBPORTCNTR_KHDROVFL) { + int i; + + /* sum over all kernel contexts */ + for (i = 0; i < dd->first_user_ctxt; 
i++) + ret += read_6120_creg32(dd, cr_portovfl + i); + } else if (reg == QIBPORTCNTR_PSSTAT) + ret = dd->cspec->pma_sample_status; + if (creg == 0xffff) + goto done; + + /* + * only fast incrementing counters are 64bit; use 32 bit reads to + * avoid two independent reads when on opteron + */ + if (creg == cr_wordsend || creg == cr_wordrcv || + creg == cr_pktsend || creg == cr_pktrcv) + ret = read_6120_creg(dd, creg); + else + ret = read_6120_creg32(dd, creg); + if (creg == cr_ibsymbolerr) { + if (dd->cspec->ibdeltainprog) + ret -= ret - dd->cspec->ibsymsnap; + ret -= dd->cspec->ibsymdelta; + } else if (creg == cr_iblinkerrrecov) { + if (dd->cspec->ibdeltainprog) + ret -= ret - dd->cspec->iblnkerrsnap; + ret -= dd->cspec->iblnkerrdelta; + } + if (reg == QIBPORTCNTR_RXDROPPKT) /* add special cased count */ + ret += dd->cspec->rxfc_unsupvl_errs; + +done: + return ret; +} + +/* + * Device counter names (not port-specific), one line per stat, + * single string. Used by utilities like ipathstats to print the stats + * in a way which works for different versions of drivers, without changing + * the utility. Names need to be 12 chars or less (w/o newline), for proper + * display by utility. + * Non-error counters are first. + * Start of "error" conters is indicated by a leading "E " on the first + * "error" counter, and doesn't count in label length. + * The EgrOvfl list needs to be last so we truncate them at the configured + * context count for the device. + * cntr6120indices contains the corresponding register indices. + */ +static const char cntr6120names[] = + "Interrupts\n" + "HostBusStall\n" + "E RxTIDFull\n" + "RxTIDInvalid\n" + "Ctxt0EgrOvfl\n" + "Ctxt1EgrOvfl\n" + "Ctxt2EgrOvfl\n" + "Ctxt3EgrOvfl\n" + "Ctxt4EgrOvfl\n"; + +static const size_t cntr6120indices[] = { + cr_lbint, + cr_lbflowstall, + cr_errtidfull, + cr_errtidvalid, + cr_portovfl + 0, + cr_portovfl + 1, + cr_portovfl + 2, + cr_portovfl + 3, + cr_portovfl + 4, +}; + +/* + * same as cntr6120names and cntr6120indices, but for port-specific counters. 
+ * portcntr6120indices is somewhat complicated by some registers needing + * adjustments of various kinds, and those are ORed with _PORT_VIRT_FLAG + */ +static const char portcntr6120names[] = + "TxPkt\n" + "TxFlowPkt\n" + "TxWords\n" + "RxPkt\n" + "RxFlowPkt\n" + "RxWords\n" + "TxFlowStall\n" + "E IBStatusChng\n" + "IBLinkDown\n" + "IBLnkRecov\n" + "IBRxLinkErr\n" + "IBSymbolErr\n" + "RxLLIErr\n" + "RxBadFormat\n" + "RxBadLen\n" + "RxBufOvrfl\n" + "RxEBP\n" + "RxFlowCtlErr\n" + "RxICRCerr\n" + "RxLPCRCerr\n" + "RxVCRCerr\n" + "RxInvalLen\n" + "RxInvalPKey\n" + "RxPktDropped\n" + "TxBadLength\n" + "TxDropped\n" + "TxInvalLen\n" + "TxUnderrun\n" + "TxUnsupVL\n" + ; + +#define _PORT_VIRT_FLAG 0x8000 /* "virtual", need adjustments */ +static const size_t portcntr6120indices[] = { + QIBPORTCNTR_PKTSEND | _PORT_VIRT_FLAG, + cr_pktsendflow, + QIBPORTCNTR_WORDSEND | _PORT_VIRT_FLAG, + QIBPORTCNTR_PKTRCV | _PORT_VIRT_FLAG, + cr_pktrcvflowctrl, + QIBPORTCNTR_WORDRCV | _PORT_VIRT_FLAG, + QIBPORTCNTR_SENDSTALL | _PORT_VIRT_FLAG, + cr_ibstatuschange, + QIBPORTCNTR_IBLINKDOWN | _PORT_VIRT_FLAG, + QIBPORTCNTR_IBLINKERRRECOV | _PORT_VIRT_FLAG, + QIBPORTCNTR_ERRLINK | _PORT_VIRT_FLAG, + QIBPORTCNTR_IBSYMBOLERR | _PORT_VIRT_FLAG, + QIBPORTCNTR_LLI | _PORT_VIRT_FLAG, + QIBPORTCNTR_BADFORMAT | _PORT_VIRT_FLAG, + QIBPORTCNTR_ERR_RLEN | _PORT_VIRT_FLAG, + QIBPORTCNTR_RCVOVFL | _PORT_VIRT_FLAG, + QIBPORTCNTR_RCVEBP | _PORT_VIRT_FLAG, + cr_rcvflowctrl_err, + QIBPORTCNTR_ERRICRC | _PORT_VIRT_FLAG, + QIBPORTCNTR_ERRLPCRC | _PORT_VIRT_FLAG, + QIBPORTCNTR_ERRVCRC | _PORT_VIRT_FLAG, + QIBPORTCNTR_INVALIDRLEN | _PORT_VIRT_FLAG, + QIBPORTCNTR_ERRPKEY | _PORT_VIRT_FLAG, + QIBPORTCNTR_RXDROPPKT | _PORT_VIRT_FLAG, + cr_invalidslen, + cr_senddropped, + cr_errslen, + cr_sendunderrun, + cr_txunsupvl, +}; + +/* do all the setup to make the counter reads efficient later */ +static void init_6120_cntrnames(struct qib_devdata *dd) +{ + int i, j = 0; + char *s; + + for (i = 0, s = (char *)cntr6120names; s && j <= dd->cfgctxts; + i++) { + /* we always have at least one counter before the egrovfl */ + if (!j && !strncmp("Ctxt0EgrOvfl", s + 1, 12)) + j = 1; + s = strchr(s + 1, '\n'); + if (s && j) + j++; + } + dd->cspec->ncntrs = i; + if (!s) + /* full list; size is without terminating null */ + dd->cspec->cntrnamelen = sizeof(cntr6120names) - 1; + else + dd->cspec->cntrnamelen = 1 + s - cntr6120names; + dd->cspec->cntrs = kmalloc(dd->cspec->ncntrs + * sizeof(u64), GFP_KERNEL); + if (!dd->cspec->cntrs) + qib_dev_err(dd, "Failed allocation for counters\n"); + + for (i = 0, s = (char *)portcntr6120names; s; i++) + s = strchr(s + 1, '\n'); + dd->cspec->nportcntrs = i - 1; + dd->cspec->portcntrnamelen = sizeof(portcntr6120names) - 1; + dd->cspec->portcntrs = kmalloc(dd->cspec->nportcntrs + * sizeof(u64), GFP_KERNEL); + if (!dd->cspec->portcntrs) + qib_dev_err(dd, "Failed allocation for portcounters\n"); +} + +static u32 qib_read_6120cntrs(struct qib_devdata *dd, loff_t pos, char **namep, + u64 **cntrp) +{ + u32 ret; + + if (namep) { + ret = dd->cspec->cntrnamelen; + if (pos >= ret) + ret = 0; /* final read after getting everything */ + else + *namep = (char *)cntr6120names; + } else { + u64 *cntr = dd->cspec->cntrs; + int i; + + ret = dd->cspec->ncntrs * sizeof(u64); + if (!cntr || pos >= ret) { + /* everything read, or couldn't get memory */ + ret = 0; + goto done; + } + if (pos >= ret) { + ret = 0; /* final read after getting everything */ + goto done; + } + *cntrp = cntr; + for (i = 0; i < dd->cspec->ncntrs; i++) + *cntr++ = 
read_6120_creg32(dd, cntr6120indices[i]); + } +done: + return ret; +} + +static u32 qib_read_6120portcntrs(struct qib_devdata *dd, loff_t pos, u32 port, + char **namep, u64 **cntrp) +{ + u32 ret; + + if (namep) { + ret = dd->cspec->portcntrnamelen; + if (pos >= ret) + ret = 0; /* final read after getting everything */ + else + *namep = (char *)portcntr6120names; + } else { + u64 *cntr = dd->cspec->portcntrs; + struct qib_pportdata *ppd = &dd->pport[port]; + int i; + + ret = dd->cspec->nportcntrs * sizeof(u64); + if (!cntr || pos >= ret) { + /* everything read, or couldn't get memory */ + ret = 0; + goto done; + } + *cntrp = cntr; + for (i = 0; i < dd->cspec->nportcntrs; i++) { + if (portcntr6120indices[i] & _PORT_VIRT_FLAG) + *cntr++ = qib_portcntr_6120(ppd, + portcntr6120indices[i] & + ~_PORT_VIRT_FLAG); + else + *cntr++ = read_6120_creg32(dd, + portcntr6120indices[i]); + } + } +done: + return ret; +} + +static void qib_chk_6120_errormask(struct qib_devdata *dd) +{ + static u32 fixed; + u32 ctrl; + unsigned long errormask; + unsigned long hwerrs; + + if (!dd->cspec->errormask || !(dd->flags & QIB_INITTED)) + return; + + errormask = qib_read_kreg64(dd, kr_errmask); + + if (errormask == dd->cspec->errormask) + return; + fixed++; + + hwerrs = qib_read_kreg64(dd, kr_hwerrstatus); + ctrl = qib_read_kreg32(dd, kr_control); + + qib_write_kreg(dd, kr_errmask, + dd->cspec->errormask); + + if ((hwerrs & dd->cspec->hwerrmask) || + (ctrl & QLOGIC_IB_C_FREEZEMODE)) { + qib_write_kreg(dd, kr_hwerrclear, 0ULL); + qib_write_kreg(dd, kr_errclear, 0ULL); + /* force re-interrupt of pending events, just in case */ + qib_write_kreg(dd, kr_intclear, 0ULL); + qib_devinfo(dd->pcidev, + "errormask fixed(%u) %lx->%lx, ctrl %x hwerr %lx\n", + fixed, errormask, (unsigned long)dd->cspec->errormask, + ctrl, hwerrs); + } +} + +/** + * qib_get_faststats - get word counters from chip before they overflow + * @opaque - contains a pointer to the qlogic_ib device qib_devdata + * + * This needs more work; in particular, decision on whether we really + * need traffic_wds done the way it is + * called from add_timer + */ +static void qib_get_6120_faststats(unsigned long opaque) +{ + struct qib_devdata *dd = (struct qib_devdata *) opaque; + struct qib_pportdata *ppd = dd->pport; + unsigned long flags; + u64 traffic_wds; + + /* + * don't access the chip while running diags, or memory diags can + * fail + */ + if (!(dd->flags & QIB_INITTED) || dd->diag_client) + /* but re-arm the timer, for diags case; won't hurt other */ + goto done; + + /* + * We now try to maintain an activity timer, based on traffic + * exceeding a threshold, so we need to check the word-counts + * even if they are 64-bit. + */ + traffic_wds = qib_portcntr_6120(ppd, cr_wordsend) + + qib_portcntr_6120(ppd, cr_wordrcv); + spin_lock_irqsave(&dd->eep_st_lock, flags); + traffic_wds -= dd->traffic_wds; + dd->traffic_wds += traffic_wds; + spin_unlock_irqrestore(&dd->eep_st_lock, flags); + + qib_chk_6120_errormask(dd); +done: + mod_timer(&dd->stats_timer, jiffies + HZ * ACTIVITY_TIMER); +} + +/* no interrupt fallback for these chips */ +static int qib_6120_nointr_fallback(struct qib_devdata *dd) +{ + return 0; +} + +/* + * reset the XGXS (between serdes and IBC). Slightly less intrusive + * than resetting the IBC or external link state, and useful in some + * cases to cause some retraining. To do this right, we reset IBC + * as well. 
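+ * The sequence below: disable the IBC via kr_control, assert
+ * QLOGIC_IB_XGXS_RESET in kr_xgxs_cfg, flush with a scratch read,
+ * clear the reset bit again, then restore the saved kr_control value.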
+ */ +static void qib_6120_xgxs_reset(struct qib_pportdata *ppd) +{ + u64 val, prev_val; + struct qib_devdata *dd = ppd->dd; + + prev_val = qib_read_kreg64(dd, kr_xgxs_cfg); + val = prev_val | QLOGIC_IB_XGXS_RESET; + prev_val &= ~QLOGIC_IB_XGXS_RESET; /* be sure */ + qib_write_kreg(dd, kr_control, + dd->control & ~QLOGIC_IB_C_LINKENABLE); + qib_write_kreg(dd, kr_xgxs_cfg, val); + qib_read_kreg32(dd, kr_scratch); + qib_write_kreg(dd, kr_xgxs_cfg, prev_val); + qib_write_kreg(dd, kr_control, dd->control); +} + +static int qib_6120_get_ib_cfg(struct qib_pportdata *ppd, int which) +{ + int ret; + + switch (which) { + case QIB_IB_CFG_LWID: + ret = ppd->link_width_active; + break; + + case QIB_IB_CFG_SPD: + ret = ppd->link_speed_active; + break; + + case QIB_IB_CFG_LWID_ENB: + ret = ppd->link_width_enabled; + break; + + case QIB_IB_CFG_SPD_ENB: + ret = ppd->link_speed_enabled; + break; + + case QIB_IB_CFG_OP_VLS: + ret = ppd->vls_operational; + break; + + case QIB_IB_CFG_VL_HIGH_CAP: + ret = 0; + break; + + case QIB_IB_CFG_VL_LOW_CAP: + ret = 0; + break; + + case QIB_IB_CFG_OVERRUN_THRESH: /* IB overrun threshold */ + ret = SYM_FIELD(ppd->dd->cspec->ibcctrl, IBCCtrl, + OverrunThreshold); + break; + + case QIB_IB_CFG_PHYERR_THRESH: /* IB PHY error threshold */ + ret = SYM_FIELD(ppd->dd->cspec->ibcctrl, IBCCtrl, + PhyerrThreshold); + break; + + case QIB_IB_CFG_LINKDEFAULT: /* IB link default (sleep/poll) */ + /* will only take effect when the link state changes */ + ret = (ppd->dd->cspec->ibcctrl & + SYM_MASK(IBCCtrl, LinkDownDefaultState)) ? + IB_LINKINITCMD_SLEEP : IB_LINKINITCMD_POLL; + break; + + case QIB_IB_CFG_HRTBT: /* Get Heartbeat off/enable/auto */ + ret = 0; /* no heartbeat on this chip */ + break; + + case QIB_IB_CFG_PMA_TICKS: + ret = 250; /* 1 usec. */ + break; + + default: + ret = -EINVAL; + break; + } + return ret; +} + +/* + * We assume range checking is already done, if needed. 
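+ *
+ * Illustrative use (hypothetical value): raising the IB overrun
+ * threshold to 10 would be
+ *
+ *     qib_6120_set_ib_cfg(ppd, QIB_IB_CFG_OVERRUN_THRESH, 10);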
+ */ +static int qib_6120_set_ib_cfg(struct qib_pportdata *ppd, int which, u32 val) +{ + struct qib_devdata *dd = ppd->dd; + int ret = 0; + u64 val64; + u16 lcmd, licmd; + + switch (which) { + case QIB_IB_CFG_LWID_ENB: + ppd->link_width_enabled = val; + break; + + case QIB_IB_CFG_SPD_ENB: + ppd->link_speed_enabled = val; + break; + + case QIB_IB_CFG_OVERRUN_THRESH: /* IB overrun threshold */ + val64 = SYM_FIELD(dd->cspec->ibcctrl, IBCCtrl, + OverrunThreshold); + if (val64 != val) { + dd->cspec->ibcctrl &= + ~SYM_MASK(IBCCtrl, OverrunThreshold); + dd->cspec->ibcctrl |= (u64) val << + SYM_LSB(IBCCtrl, OverrunThreshold); + qib_write_kreg(dd, kr_ibcctrl, dd->cspec->ibcctrl); + qib_write_kreg(dd, kr_scratch, 0); + } + break; + + case QIB_IB_CFG_PHYERR_THRESH: /* IB PHY error threshold */ + val64 = SYM_FIELD(dd->cspec->ibcctrl, IBCCtrl, + PhyerrThreshold); + if (val64 != val) { + dd->cspec->ibcctrl &= + ~SYM_MASK(IBCCtrl, PhyerrThreshold); + dd->cspec->ibcctrl |= (u64) val << + SYM_LSB(IBCCtrl, PhyerrThreshold); + qib_write_kreg(dd, kr_ibcctrl, dd->cspec->ibcctrl); + qib_write_kreg(dd, kr_scratch, 0); + } + break; + + case QIB_IB_CFG_PKEYS: /* update pkeys */ + val64 = (u64) ppd->pkeys[0] | ((u64) ppd->pkeys[1] << 16) | + ((u64) ppd->pkeys[2] << 32) | + ((u64) ppd->pkeys[3] << 48); + qib_write_kreg(dd, kr_partitionkey, val64); + break; + + case QIB_IB_CFG_LINKDEFAULT: /* IB link default (sleep/poll) */ + /* will only take effect when the link state changes */ + if (val == IB_LINKINITCMD_POLL) + dd->cspec->ibcctrl &= + ~SYM_MASK(IBCCtrl, LinkDownDefaultState); + else /* SLEEP */ + dd->cspec->ibcctrl |= + SYM_MASK(IBCCtrl, LinkDownDefaultState); + qib_write_kreg(dd, kr_ibcctrl, dd->cspec->ibcctrl); + qib_write_kreg(dd, kr_scratch, 0); + break; + + case QIB_IB_CFG_MTU: /* update the MTU in IBC */ + /* + * Update our housekeeping variables, and set IBC max + * size, same as init code; max IBC is max we allow in + * buffer, less the qword pbc, plus 1 for ICRC, in dwords + * Set even if it's unchanged, print debug message only + * on changes. 
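+ * This is the same MaxPktLen encoding written at init time in
+ * qib_6120_bringup_serdes(): (ppd->ibmaxlen >> 2) + 1.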
+ */ + val = (ppd->ibmaxlen >> 2) + 1; + dd->cspec->ibcctrl &= ~SYM_MASK(IBCCtrl, MaxPktLen); + dd->cspec->ibcctrl |= (u64)val << + SYM_LSB(IBCCtrl, MaxPktLen); + qib_write_kreg(dd, kr_ibcctrl, dd->cspec->ibcctrl); + qib_write_kreg(dd, kr_scratch, 0); + break; + + case QIB_IB_CFG_LSTATE: /* set the IB link state */ + switch (val & 0xffff0000) { + case IB_LINKCMD_DOWN: + lcmd = QLOGIC_IB_IBCC_LINKCMD_DOWN; + if (!dd->cspec->ibdeltainprog) { + dd->cspec->ibdeltainprog = 1; + dd->cspec->ibsymsnap = + read_6120_creg32(dd, cr_ibsymbolerr); + dd->cspec->iblnkerrsnap = + read_6120_creg32(dd, cr_iblinkerrrecov); + } + break; + + case IB_LINKCMD_ARMED: + lcmd = QLOGIC_IB_IBCC_LINKCMD_ARMED; + break; + + case IB_LINKCMD_ACTIVE: + lcmd = QLOGIC_IB_IBCC_LINKCMD_ACTIVE; + break; + + default: + ret = -EINVAL; + qib_dev_err(dd, "bad linkcmd req 0x%x\n", val >> 16); + goto bail; + } + switch (val & 0xffff) { + case IB_LINKINITCMD_NOP: + licmd = 0; + break; + + case IB_LINKINITCMD_POLL: + licmd = QLOGIC_IB_IBCC_LINKINITCMD_POLL; + break; + + case IB_LINKINITCMD_SLEEP: + licmd = QLOGIC_IB_IBCC_LINKINITCMD_SLEEP; + break; + + case IB_LINKINITCMD_DISABLE: + licmd = QLOGIC_IB_IBCC_LINKINITCMD_DISABLE; + break; + + default: + ret = -EINVAL; + qib_dev_err(dd, "bad linkinitcmd req 0x%x\n", + val & 0xffff); + goto bail; + } + qib_set_ib_6120_lstate(ppd, lcmd, licmd); + goto bail; + + case QIB_IB_CFG_HRTBT: + ret = -EINVAL; + break; + + default: + ret = -EINVAL; + } +bail: + return ret; +} + +static int qib_6120_set_loopback(struct qib_pportdata *ppd, const char *what) +{ + int ret = 0; + + if (!strncmp(what, "ibc", 3)) { + ppd->dd->cspec->ibcctrl |= SYM_MASK(IBCCtrl, Loopback); + qib_devinfo(ppd->dd->pcidev, "Enabling IB%u:%u IBC loopback\n", + ppd->dd->unit, ppd->port); + } else if (!strncmp(what, "off", 3)) { + ppd->dd->cspec->ibcctrl &= ~SYM_MASK(IBCCtrl, Loopback); + qib_devinfo(ppd->dd->pcidev, + "Disabling IB%u:%u IBC loopback (normal)\n", + ppd->dd->unit, ppd->port); + } else + ret = -EINVAL; + if (!ret) { + qib_write_kreg(ppd->dd, kr_ibcctrl, ppd->dd->cspec->ibcctrl); + qib_write_kreg(ppd->dd, kr_scratch, 0); + } + return ret; +} + +static void pma_6120_timer(unsigned long data) +{ + struct qib_pportdata *ppd = (struct qib_pportdata *)data; + struct qib_chip_specific *cs = ppd->dd->cspec; + struct qib_ibport *ibp = &ppd->ibport_data; + unsigned long flags; + + spin_lock_irqsave(&ibp->lock, flags); + if (cs->pma_sample_status == IB_PMA_SAMPLE_STATUS_STARTED) { + cs->pma_sample_status = IB_PMA_SAMPLE_STATUS_RUNNING; + qib_snapshot_counters(ppd, &cs->sword, &cs->rword, + &cs->spkts, &cs->rpkts, &cs->xmit_wait); + mod_timer(&cs->pma_timer, + jiffies + usecs_to_jiffies(ibp->pma_sample_interval)); + } else if (cs->pma_sample_status == IB_PMA_SAMPLE_STATUS_RUNNING) { + u64 ta, tb, tc, td, te; + + cs->pma_sample_status = IB_PMA_SAMPLE_STATUS_DONE; + qib_snapshot_counters(ppd, &ta, &tb, &tc, &td, &te); + + cs->sword = ta - cs->sword; + cs->rword = tb - cs->rword; + cs->spkts = tc - cs->spkts; + cs->rpkts = td - cs->rpkts; + cs->xmit_wait = te - cs->xmit_wait; + } + spin_unlock_irqrestore(&ibp->lock, flags); +} + +/* + * Note that the caller has the ibp->lock held. 
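The OverrunThreshold and PhyerrThreshold cases above follow the driver's shadowed read-modify-write idiom: clear the field in the cached register copy, OR in the new value at the field's LSB, write the whole shadow back to the chip, then write kr_scratch to flush the posted write. A stand-alone, user-space sketch of just the mask arithmetic, using an invented 8-bit field at bits 39:32:

#include <stdio.h>
#include <stdint.h>

#define FLD_LSB		32			/* invented field position */
#define FLD_RMASK	0xffULL			/* invented field width: 8 bits */
#define FLD_MASK	(FLD_RMASK << FLD_LSB)

int main(void)
{
	uint64_t shadow = 0x000000a500abcdefULL;	/* cached register contents */
	uint64_t newval = 0x3f;				/* value to install in the field */

	shadow &= ~FLD_MASK;				/* clear the old field */
	shadow |= (newval & FLD_RMASK) << FLD_LSB;	/* insert the new one */

	printf("updated shadow: 0x%016llx\n", (unsigned long long)shadow);
	/* prints 0x0000003f00abcdef; the driver would now write this back
	   and then touch the scratch register to flush the posted write */
	return 0;
}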
+ */ +static void qib_set_cntr_6120_sample(struct qib_pportdata *ppd, u32 intv, + u32 start) +{ + struct qib_chip_specific *cs = ppd->dd->cspec; + + if (start && intv) { + cs->pma_sample_status = IB_PMA_SAMPLE_STATUS_STARTED; + mod_timer(&cs->pma_timer, jiffies + usecs_to_jiffies(start)); + } else if (intv) { + cs->pma_sample_status = IB_PMA_SAMPLE_STATUS_RUNNING; + qib_snapshot_counters(ppd, &cs->sword, &cs->rword, + &cs->spkts, &cs->rpkts, &cs->xmit_wait); + mod_timer(&cs->pma_timer, jiffies + usecs_to_jiffies(intv)); + } else { + cs->pma_sample_status = IB_PMA_SAMPLE_STATUS_DONE; + cs->sword = 0; + cs->rword = 0; + cs->spkts = 0; + cs->rpkts = 0; + cs->xmit_wait = 0; + } +} + +static u32 qib_6120_iblink_state(u64 ibcs) +{ + u32 state = (u32)SYM_FIELD(ibcs, IBCStatus, LinkState); + + switch (state) { + case IB_6120_L_STATE_INIT: + state = IB_PORT_INIT; + break; + case IB_6120_L_STATE_ARM: + state = IB_PORT_ARMED; + break; + case IB_6120_L_STATE_ACTIVE: + /* fall through */ + case IB_6120_L_STATE_ACT_DEFER: + state = IB_PORT_ACTIVE; + break; + default: /* fall through */ + case IB_6120_L_STATE_DOWN: + state = IB_PORT_DOWN; + break; + } + return state; +} + +/* returns the IBTA port state, rather than the IBC link training state */ +static u8 qib_6120_phys_portstate(u64 ibcs) +{ + u8 state = (u8)SYM_FIELD(ibcs, IBCStatus, LinkTrainingState); + return qib_6120_physportstate[state]; +} + +static int qib_6120_ib_updown(struct qib_pportdata *ppd, int ibup, u64 ibcs) +{ + unsigned long flags; + + spin_lock_irqsave(&ppd->lflags_lock, flags); + ppd->lflags &= ~QIBL_IB_FORCE_NOTIFY; + spin_unlock_irqrestore(&ppd->lflags_lock, flags); + + if (ibup) { + if (ppd->dd->cspec->ibdeltainprog) { + ppd->dd->cspec->ibdeltainprog = 0; + ppd->dd->cspec->ibsymdelta += + read_6120_creg32(ppd->dd, cr_ibsymbolerr) - + ppd->dd->cspec->ibsymsnap; + ppd->dd->cspec->iblnkerrdelta += + read_6120_creg32(ppd->dd, cr_iblinkerrrecov) - + ppd->dd->cspec->iblnkerrsnap; + } + qib_hol_init(ppd); + } else { + ppd->dd->cspec->lli_counter = 0; + if (!ppd->dd->cspec->ibdeltainprog) { + ppd->dd->cspec->ibdeltainprog = 1; + ppd->dd->cspec->ibsymsnap = + read_6120_creg32(ppd->dd, cr_ibsymbolerr); + ppd->dd->cspec->iblnkerrsnap = + read_6120_creg32(ppd->dd, cr_iblinkerrrecov); + } + qib_hol_down(ppd); + } + + qib_6120_setup_setextled(ppd, ibup); + + return 0; +} + +/* Does read/modify/write to appropriate registers to + * set output and direction bits selected by mask. + * these are in their canonical postions (e.g. lsb of + * dir will end up in D48 of extctrl on existing chips). + * returns contents of GP Inputs. + */ +static int gpio_6120_mod(struct qib_devdata *dd, u32 out, u32 dir, u32 mask) +{ + u64 read_val, new_out; + unsigned long flags; + + if (mask) { + /* some bits being written, lock access to GPIO */ + dir &= mask; + out &= mask; + spin_lock_irqsave(&dd->cspec->gpio_lock, flags); + dd->cspec->extctrl &= ~((u64)mask << SYM_LSB(EXTCtrl, GPIOOe)); + dd->cspec->extctrl |= ((u64) dir << SYM_LSB(EXTCtrl, GPIOOe)); + new_out = (dd->cspec->gpio_out & ~mask) | out; + + qib_write_kreg(dd, kr_extctrl, dd->cspec->extctrl); + qib_write_kreg(dd, kr_gpio_out, new_out); + dd->cspec->gpio_out = new_out; + spin_unlock_irqrestore(&dd->cspec->gpio_lock, flags); + } + /* + * It is unlikely that a read at this time would get valid + * data on a pin whose direction line was set in the same + * call to this function. 
We include the read here because + * that allows us to potentially combine a change on one pin with + * a read on another, and because the old code did something like + * this. + */ + read_val = qib_read_kreg64(dd, kr_extstatus); + return SYM_FIELD(read_val, EXTStatus, GPIOIn); +} + +/* + * Read fundamental info we need to use the chip. These are + * the registers that describe chip capabilities, and are + * saved in shadow registers. + */ +static void get_6120_chip_params(struct qib_devdata *dd) +{ + u64 val; + u32 piobufs; + int mtu; + + dd->uregbase = qib_read_kreg32(dd, kr_userregbase); + + dd->rcvtidcnt = qib_read_kreg32(dd, kr_rcvtidcnt); + dd->rcvtidbase = qib_read_kreg32(dd, kr_rcvtidbase); + dd->rcvegrbase = qib_read_kreg32(dd, kr_rcvegrbase); + dd->palign = qib_read_kreg32(dd, kr_palign); + dd->piobufbase = qib_read_kreg64(dd, kr_sendpiobufbase); + dd->pio2k_bufbase = dd->piobufbase & 0xffffffff; + + dd->rcvhdrcnt = qib_read_kreg32(dd, kr_rcvegrcnt); + + val = qib_read_kreg64(dd, kr_sendpiosize); + dd->piosize2k = val & ~0U; + dd->piosize4k = val >> 32; + + mtu = ib_mtu_enum_to_int(qib_ibmtu); + if (mtu == -1) + mtu = QIB_DEFAULT_MTU; + dd->pport->ibmtu = (u32)mtu; + + val = qib_read_kreg64(dd, kr_sendpiobufcnt); + dd->piobcnt2k = val & ~0U; + dd->piobcnt4k = val >> 32; + dd->last_pio = dd->piobcnt4k + dd->piobcnt2k - 1; + /* these may be adjusted in init_chip_wc_pat() */ + dd->pio2kbase = (u32 __iomem *) + (((char __iomem *)dd->kregbase) + dd->pio2k_bufbase); + if (dd->piobcnt4k) { + dd->pio4kbase = (u32 __iomem *) + (((char __iomem *) dd->kregbase) + + (dd->piobufbase >> 32)); + /* + * 4K buffers take 2 pages; we use roundup just to be + * paranoid; we calculate it once here, rather than on + * ever buf allocate + */ + dd->align4k = ALIGN(dd->piosize4k, dd->palign); + } + + piobufs = dd->piobcnt4k + dd->piobcnt2k; + + dd->pioavregs = ALIGN(piobufs, sizeof(u64) * BITS_PER_BYTE / 2) / + (sizeof(u64) * BITS_PER_BYTE / 2); +} + +/* + * The chip base addresses in cspec and cpspec have to be set + * after possible init_chip_wc_pat(), rather than in + * get_6120_chip_params(), so split out as separate function + */ +static void set_6120_baseaddrs(struct qib_devdata *dd) +{ + u32 cregbase; + + cregbase = qib_read_kreg32(dd, kr_counterregbase); + dd->cspec->cregbase = (u64 __iomem *) + ((char __iomem *) dd->kregbase + cregbase); + + dd->egrtidbase = (u64 __iomem *) + ((char __iomem *) dd->kregbase + dd->rcvegrbase); +} + +/* + * Write the final few registers that depend on some of the + * init setup. Done late in init, just before bringing up + * the serdes. 
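A short usage sketch (not from the imported source) for gpio_6120_mod() above; the pin number is purely illustrative and the wrapper exists only for this example:

static void example_6120_gpio(struct qib_devdata *dd)
{
	int pins;

	/* drive GPIO bit 2 high and make it an output; only bit 2 is touched */
	pins = gpio_6120_mod(dd, 1 << 2, 1 << 2, 1 << 2);

	/* mask == 0 writes nothing; the call just samples the GPIO inputs */
	pins = gpio_6120_mod(dd, 0, 0, 0);
	(void)pins;
}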
+ */ +static int qib_late_6120_initreg(struct qib_devdata *dd) +{ + int ret = 0; + u64 val; + + qib_write_kreg(dd, kr_rcvhdrentsize, dd->rcvhdrentsize); + qib_write_kreg(dd, kr_rcvhdrsize, dd->rcvhdrsize); + qib_write_kreg(dd, kr_rcvhdrcnt, dd->rcvhdrcnt); + qib_write_kreg(dd, kr_sendpioavailaddr, dd->pioavailregs_phys); + val = qib_read_kreg64(dd, kr_sendpioavailaddr); + if (val != dd->pioavailregs_phys) { + qib_dev_err(dd, + "Catastrophic software error, SendPIOAvailAddr written as %lx, read back as %llx\n", + (unsigned long) dd->pioavailregs_phys, + (unsigned long long) val); + ret = -EINVAL; + } + return ret; +} + +static int init_6120_variables(struct qib_devdata *dd) +{ + int ret = 0; + struct qib_pportdata *ppd; + u32 sbufs; + + ppd = (struct qib_pportdata *)(dd + 1); + dd->pport = ppd; + dd->num_pports = 1; + + dd->cspec = (struct qib_chip_specific *)(ppd + dd->num_pports); + ppd->cpspec = NULL; /* not used in this chip */ + + spin_lock_init(&dd->cspec->kernel_tid_lock); + spin_lock_init(&dd->cspec->user_tid_lock); + spin_lock_init(&dd->cspec->rcvmod_lock); + spin_lock_init(&dd->cspec->gpio_lock); + + /* we haven't yet set QIB_PRESENT, so use read directly */ + dd->revision = readq(&dd->kregbase[kr_revision]); + + if ((dd->revision & 0xffffffffU) == 0xffffffffU) { + qib_dev_err(dd, + "Revision register read failure, giving up initialization\n"); + ret = -ENODEV; + goto bail; + } + dd->flags |= QIB_PRESENT; /* now register routines work */ + + dd->majrev = (u8) SYM_FIELD(dd->revision, Revision_R, + ChipRevMajor); + dd->minrev = (u8) SYM_FIELD(dd->revision, Revision_R, + ChipRevMinor); + + get_6120_chip_params(dd); + pe_boardname(dd); /* fill in boardname */ + + /* + * GPIO bits for TWSI data and clock, + * used for serial EEPROM. + */ + dd->gpio_sda_num = _QIB_GPIO_SDA_NUM; + dd->gpio_scl_num = _QIB_GPIO_SCL_NUM; + dd->twsi_eeprom_dev = QIB_TWSI_NO_DEV; + + if (qib_unordered_wc()) + dd->flags |= QIB_PIO_FLUSH_WC; + + /* + * EEPROM error log 0 is TXE Parity errors. 1 is RXE Parity. + * 2 is Some Misc, 3 is reserved for future. + */ + dd->eep_st_masks[0].hwerrs_to_log = HWE_MASK(TXEMemParityErr); + + /* Ignore errors in PIO/PBC on systems with unordered write-combining */ + if (qib_unordered_wc()) + dd->eep_st_masks[0].hwerrs_to_log &= ~TXE_PIO_PARITY; + + dd->eep_st_masks[1].hwerrs_to_log = HWE_MASK(RXEMemParityErr); + + dd->eep_st_masks[2].errs_to_log = ERR_MASK(ResetNegated); + + ret = qib_init_pportdata(ppd, dd, 0, 1); + if (ret) + goto bail; + ppd->link_width_supported = IB_WIDTH_1X | IB_WIDTH_4X; + ppd->link_speed_supported = QIB_IB_SDR; + ppd->link_width_enabled = IB_WIDTH_4X; + ppd->link_speed_enabled = ppd->link_speed_supported; + /* these can't change for this chip, so set once */ + ppd->link_width_active = ppd->link_width_enabled; + ppd->link_speed_active = ppd->link_speed_enabled; + ppd->vls_supported = IB_VL_VL0; + ppd->vls_operational = ppd->vls_supported; + + dd->rcvhdrentsize = QIB_RCVHDR_ENTSIZE; + dd->rcvhdrsize = QIB_DFLT_RCVHDRSIZE; + dd->rhf_offset = 0; + + /* we always allocate at least 2048 bytes for eager buffers */ + ret = ib_mtu_enum_to_int(qib_ibmtu); + dd->rcvegrbufsize = ret != -1 ? max(ret, 2048) : QIB_DEFAULT_MTU; + BUG_ON(!is_power_of_2(dd->rcvegrbufsize)); + dd->rcvegrbufsize_shift = ilog2(dd->rcvegrbufsize); + + qib_6120_tidtemplate(dd); + + /* + * We can request a receive interrupt for 1 or + * more packets from current offset. For now, we set this + * up for a single packet. 
+ */ + dd->rhdrhead_intr_off = 1ULL << 32; + + /* setup the stats timer; the add_timer is done at end of init */ + init_timer(&dd->stats_timer); + dd->stats_timer.function = qib_get_6120_faststats; + dd->stats_timer.data = (unsigned long) dd; + + init_timer(&dd->cspec->pma_timer); + dd->cspec->pma_timer.function = pma_6120_timer; + dd->cspec->pma_timer.data = (unsigned long) ppd; + + dd->ureg_align = qib_read_kreg32(dd, kr_palign); + + dd->piosize2kmax_dwords = dd->piosize2k >> 2; + qib_6120_config_ctxts(dd); + qib_set_ctxtcnt(dd); + + ret = init_chip_wc_pat(dd, 0); + if (ret) + goto bail; + set_6120_baseaddrs(dd); /* set chip access pointers now */ + + ret = 0; + if (qib_mini_init) + goto bail; + + qib_num_cfg_vls = 1; /* if any 6120's, only one VL */ + + ret = qib_create_ctxts(dd); + init_6120_cntrnames(dd); + + /* use all of 4KB buffers for the kernel, otherwise 16 */ + sbufs = dd->piobcnt4k ? dd->piobcnt4k : 16; + + dd->lastctxt_piobuf = dd->piobcnt2k + dd->piobcnt4k - sbufs; + dd->pbufsctxt = dd->lastctxt_piobuf / + (dd->cfgctxts - dd->first_user_ctxt); + + if (ret) + goto bail; +bail: + return ret; +} + +/* + * For this chip, we want to use the same buffer every time + * when we are trying to bring the link up (they are always VL15 + * packets). At that link state the packet should always go out immediately + * (or at least be discarded at the tx interface if the link is down). + * If it doesn't, and the buffer isn't available, that means some other + * sender has gotten ahead of us, and is preventing our packet from going + * out. In that case, we flush all packets, and try again. If that still + * fails, we fail the request, and hope things work the next time around. + * + * We don't need very complicated heuristics on whether the packet had + * time to go out or not, since even at SDR 1X, it goes out in very short + * time periods, covered by the chip reads done here and as part of the + * flush. + */ +static u32 __iomem *get_6120_link_buf(struct qib_pportdata *ppd, u32 *bnum) +{ + u32 __iomem *buf; + u32 lbuf = ppd->dd->piobcnt2k + ppd->dd->piobcnt4k - 1; + + /* + * always blip to get avail list updated, since it's almost + * always needed, and is fairly cheap. 
+ */ + sendctrl_6120_mod(ppd->dd->pport, QIB_SENDCTRL_AVAIL_BLIP); + qib_read_kreg64(ppd->dd, kr_scratch); /* extra chip flush */ + buf = qib_getsendbuf_range(ppd->dd, bnum, lbuf, lbuf); + if (buf) + goto done; + + sendctrl_6120_mod(ppd, QIB_SENDCTRL_DISARM_ALL | QIB_SENDCTRL_FLUSH | + QIB_SENDCTRL_AVAIL_BLIP); + ppd->dd->upd_pio_shadow = 1; /* update our idea of what's busy */ + qib_read_kreg64(ppd->dd, kr_scratch); /* extra chip flush */ + buf = qib_getsendbuf_range(ppd->dd, bnum, lbuf, lbuf); +done: + return buf; +} + +static u32 __iomem *qib_6120_getsendbuf(struct qib_pportdata *ppd, u64 pbc, + u32 *pbufnum) +{ + u32 first, last, plen = pbc & QIB_PBC_LENGTH_MASK; + struct qib_devdata *dd = ppd->dd; + u32 __iomem *buf; + + if (((pbc >> 32) & PBC_6120_VL15_SEND_CTRL) && + !(ppd->lflags & (QIBL_IB_AUTONEG_INPROG | QIBL_LINKACTIVE))) + buf = get_6120_link_buf(ppd, pbufnum); + else { + + if ((plen + 1) > dd->piosize2kmax_dwords) + first = dd->piobcnt2k; + else + first = 0; + /* try 4k if all 2k busy, so same last for both sizes */ + last = dd->piobcnt2k + dd->piobcnt4k - 1; + buf = qib_getsendbuf_range(dd, pbufnum, first, last); + } + return buf; +} + +static int init_sdma_6120_regs(struct qib_pportdata *ppd) +{ + return -ENODEV; +} + +static u16 qib_sdma_6120_gethead(struct qib_pportdata *ppd) +{ + return 0; +} + +static int qib_sdma_6120_busy(struct qib_pportdata *ppd) +{ + return 0; +} + +static void qib_sdma_update_6120_tail(struct qib_pportdata *ppd, u16 tail) +{ +} + +static void qib_6120_sdma_sendctrl(struct qib_pportdata *ppd, unsigned op) +{ +} + +static void qib_sdma_set_6120_desc_cnt(struct qib_pportdata *ppd, unsigned cnt) +{ +} + +/* + * the pbc doesn't need a VL15 indicator, but we need it for link_buf. + * The chip ignores the bit if set. + */ +static u32 qib_6120_setpbc_control(struct qib_pportdata *ppd, u32 plen, + u8 srate, u8 vl) +{ + return vl == 15 ? PBC_6120_VL15_SEND_CTRL : 0; +} + +static void qib_6120_initvl15_bufs(struct qib_devdata *dd) +{ +} + +static void qib_6120_init_ctxt(struct qib_ctxtdata *rcd) +{ + rcd->rcvegrcnt = rcd->dd->rcvhdrcnt; + rcd->rcvegr_tid_base = rcd->ctxt * rcd->rcvegrcnt; +} + +static void qib_6120_txchk_change(struct qib_devdata *dd, u32 start, + u32 len, u32 avail, struct qib_ctxtdata *rcd) +{ +} + +static void writescratch(struct qib_devdata *dd, u32 val) +{ + (void) qib_write_kreg(dd, kr_scratch, val); +} + +static int qib_6120_tempsense_rd(struct qib_devdata *dd, int regnum) +{ + return -ENXIO; +} + +#ifdef CONFIG_INFINIBAND_QIB_DCA +static int qib_6120_notify_dca(struct qib_devdata *dd, unsigned long event) +{ + return 0; +} +#endif + +/* Dummy function, as 6120 boards never disable EEPROM Write */ +static int qib_6120_eeprom_wen(struct qib_devdata *dd, int wen) +{ + return 1; +} + +/** + * qib_init_iba6120_funcs - set up the chip-specific function pointers + * @pdev: pci_dev of the qlogic_ib device + * @ent: pci_device_id matching this chip + * + * This is global, and is called directly at init to set up the + * chip-specific function pointers for later use. + * + * It also allocates/partially-inits the qib_devdata struct for + * this device. 
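qib_6120_getsendbuf() above derives its search range from the PBC length: a packet that will not fit in a 2 KB buffer must come from the 4 KB pool, which begins at index piobcnt2k. A stand-alone sketch of that arithmetic with invented buffer counts:

#include <stdio.h>

int main(void)
{
	unsigned piobcnt2k = 128, piobcnt4k = 32;	/* invented example counts */
	unsigned piosize2kmax_dwords = 2048 >> 2;	/* 2 KB expressed in dwords */
	unsigned plen = 700;				/* PBC length field, in dwords */
	unsigned first, last;

	first = (plen + 1 > piosize2kmax_dwords) ? piobcnt2k : 0;
	last = piobcnt2k + piobcnt4k - 1;		/* 4 KB pool is searched last either way */
	printf("plen %u dwords -> search PIO buffers %u..%u\n", plen, first, last);
	return 0;	/* prints: plen 700 dwords -> search PIO buffers 128..159 */
}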
+ */ +struct qib_devdata *qib_init_iba6120_funcs(struct pci_dev *pdev, + const struct pci_device_id *ent) +{ + struct qib_devdata *dd; + int ret; + + dd = qib_alloc_devdata(pdev, sizeof(struct qib_pportdata) + + sizeof(struct qib_chip_specific)); + if (IS_ERR(dd)) + goto bail; + + dd->f_bringup_serdes = qib_6120_bringup_serdes; + dd->f_cleanup = qib_6120_setup_cleanup; + dd->f_clear_tids = qib_6120_clear_tids; + dd->f_free_irq = qib_6120_free_irq; + dd->f_get_base_info = qib_6120_get_base_info; + dd->f_get_msgheader = qib_6120_get_msgheader; + dd->f_getsendbuf = qib_6120_getsendbuf; + dd->f_gpio_mod = gpio_6120_mod; + dd->f_eeprom_wen = qib_6120_eeprom_wen; + dd->f_hdrqempty = qib_6120_hdrqempty; + dd->f_ib_updown = qib_6120_ib_updown; + dd->f_init_ctxt = qib_6120_init_ctxt; + dd->f_initvl15_bufs = qib_6120_initvl15_bufs; + dd->f_intr_fallback = qib_6120_nointr_fallback; + dd->f_late_initreg = qib_late_6120_initreg; + dd->f_setpbc_control = qib_6120_setpbc_control; + dd->f_portcntr = qib_portcntr_6120; + dd->f_put_tid = (dd->minrev >= 2) ? + qib_6120_put_tid_2 : + qib_6120_put_tid; + dd->f_quiet_serdes = qib_6120_quiet_serdes; + dd->f_rcvctrl = rcvctrl_6120_mod; + dd->f_read_cntrs = qib_read_6120cntrs; + dd->f_read_portcntrs = qib_read_6120portcntrs; + dd->f_reset = qib_6120_setup_reset; + dd->f_init_sdma_regs = init_sdma_6120_regs; + dd->f_sdma_busy = qib_sdma_6120_busy; + dd->f_sdma_gethead = qib_sdma_6120_gethead; + dd->f_sdma_sendctrl = qib_6120_sdma_sendctrl; + dd->f_sdma_set_desc_cnt = qib_sdma_set_6120_desc_cnt; + dd->f_sdma_update_tail = qib_sdma_update_6120_tail; + dd->f_sendctrl = sendctrl_6120_mod; + dd->f_set_armlaunch = qib_set_6120_armlaunch; + dd->f_set_cntr_sample = qib_set_cntr_6120_sample; + dd->f_iblink_state = qib_6120_iblink_state; + dd->f_ibphys_portstate = qib_6120_phys_portstate; + dd->f_get_ib_cfg = qib_6120_get_ib_cfg; + dd->f_set_ib_cfg = qib_6120_set_ib_cfg; + dd->f_set_ib_loopback = qib_6120_set_loopback; + dd->f_set_intr_state = qib_6120_set_intr_state; + dd->f_setextled = qib_6120_setup_setextled; + dd->f_txchk_change = qib_6120_txchk_change; + dd->f_update_usrhead = qib_update_6120_usrhead; + dd->f_wantpiobuf_intr = qib_wantpiobuf_6120_intr; + dd->f_xgxs_reset = qib_6120_xgxs_reset; + dd->f_writescratch = writescratch; + dd->f_tempsense_rd = qib_6120_tempsense_rd; +#ifdef CONFIG_INFINIBAND_QIB_DCA + dd->f_notify_dca = qib_6120_notify_dca; +#endif + /* + * Do remaining pcie setup and save pcie values in dd. + * Any error printing is already done by the init code. + * On return, we have the chip mapped and accessible, + * but chip registers are not set up until start of + * init_6120_variables. 
+ */ + ret = qib_pcie_ddinit(dd, pdev, ent); + if (ret < 0) + goto bail_free; + + /* initialize chip-specific variables */ + ret = init_6120_variables(dd); + if (ret) + goto bail_cleanup; + + if (qib_mini_init) + goto bail; + + if (qib_pcie_params(dd, 8, NULL, NULL)) + qib_dev_err(dd, + "Failed to setup PCIe or interrupts; continuing anyway\n"); + dd->cspec->irq = pdev->irq; /* save IRQ */ + + /* clear diagctrl register, in case diags were running and crashed */ + qib_write_kreg(dd, kr_hwdiagctrl, 0); + + if (qib_read_kreg64(dd, kr_hwerrstatus) & + QLOGIC_IB_HWE_SERDESPLLFAILED) + qib_write_kreg(dd, kr_hwerrclear, + QLOGIC_IB_HWE_SERDESPLLFAILED); + + /* setup interrupt handler (interrupt type handled above) */ + qib_setup_6120_interrupt(dd); + /* Note that qpn_mask is set by qib_6120_config_ctxts() first */ + qib_6120_init_hwerrors(dd); + + goto bail; + +bail_cleanup: + qib_pcie_ddcleanup(dd); +bail_free: + qib_free_devdata(dd); + dd = ERR_PTR(ret); +bail: + return dd; +} diff --git a/kernel/drivers/infiniband/hw/qib/qib_iba7220.c b/kernel/drivers/infiniband/hw/qib/qib_iba7220.c new file mode 100644 index 000000000..00b2af211 --- /dev/null +++ b/kernel/drivers/infiniband/hw/qib/qib_iba7220.c @@ -0,0 +1,4657 @@ +/* + * Copyright (c) 2006, 2007, 2008, 2009, 2010 QLogic Corporation. + * All rights reserved. + * Copyright (c) 2003, 2004, 2005, 2006 PathScale, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +/* + * This file contains all of the code that is specific to the + * QLogic_IB 7220 chip (except that specific to the SerDes) + */ + +#include +#include +#include +#include +#include +#include + +#include "qib.h" +#include "qib_7220.h" + +static void qib_setup_7220_setextled(struct qib_pportdata *, u32); +static void qib_7220_handle_hwerrors(struct qib_devdata *, char *, size_t); +static void sendctrl_7220_mod(struct qib_pportdata *ppd, u32 op); +static u32 qib_7220_iblink_state(u64); +static u8 qib_7220_phys_portstate(u64); +static void qib_sdma_update_7220_tail(struct qib_pportdata *, u16); +static void qib_set_ib_7220_lstate(struct qib_pportdata *, u16, u16); + +/* + * This file contains almost all the chip-specific register information and + * access functions for the QLogic QLogic_IB 7220 PCI-Express chip, with the + * exception of SerDes support, which in in qib_sd7220.c. + */ + +/* Below uses machine-generated qib_chipnum_regs.h file */ +#define KREG_IDX(regname) (QIB_7220_##regname##_OFFS / sizeof(u64)) + +/* Use defines to tie machine-generated names to lower-case names */ +#define kr_control KREG_IDX(Control) +#define kr_counterregbase KREG_IDX(CntrRegBase) +#define kr_errclear KREG_IDX(ErrClear) +#define kr_errmask KREG_IDX(ErrMask) +#define kr_errstatus KREG_IDX(ErrStatus) +#define kr_extctrl KREG_IDX(EXTCtrl) +#define kr_extstatus KREG_IDX(EXTStatus) +#define kr_gpio_clear KREG_IDX(GPIOClear) +#define kr_gpio_mask KREG_IDX(GPIOMask) +#define kr_gpio_out KREG_IDX(GPIOOut) +#define kr_gpio_status KREG_IDX(GPIOStatus) +#define kr_hrtbt_guid KREG_IDX(HRTBT_GUID) +#define kr_hwdiagctrl KREG_IDX(HwDiagCtrl) +#define kr_hwerrclear KREG_IDX(HwErrClear) +#define kr_hwerrmask KREG_IDX(HwErrMask) +#define kr_hwerrstatus KREG_IDX(HwErrStatus) +#define kr_ibcctrl KREG_IDX(IBCCtrl) +#define kr_ibcddrctrl KREG_IDX(IBCDDRCtrl) +#define kr_ibcddrstatus KREG_IDX(IBCDDRStatus) +#define kr_ibcstatus KREG_IDX(IBCStatus) +#define kr_ibserdesctrl KREG_IDX(IBSerDesCtrl) +#define kr_intclear KREG_IDX(IntClear) +#define kr_intmask KREG_IDX(IntMask) +#define kr_intstatus KREG_IDX(IntStatus) +#define kr_ncmodectrl KREG_IDX(IBNCModeCtrl) +#define kr_palign KREG_IDX(PageAlign) +#define kr_partitionkey KREG_IDX(RcvPartitionKey) +#define kr_portcnt KREG_IDX(PortCnt) +#define kr_rcvbthqp KREG_IDX(RcvBTHQP) +#define kr_rcvctrl KREG_IDX(RcvCtrl) +#define kr_rcvegrbase KREG_IDX(RcvEgrBase) +#define kr_rcvegrcnt KREG_IDX(RcvEgrCnt) +#define kr_rcvhdrcnt KREG_IDX(RcvHdrCnt) +#define kr_rcvhdrentsize KREG_IDX(RcvHdrEntSize) +#define kr_rcvhdrsize KREG_IDX(RcvHdrSize) +#define kr_rcvpktledcnt KREG_IDX(RcvPktLEDCnt) +#define kr_rcvtidbase KREG_IDX(RcvTIDBase) +#define kr_rcvtidcnt KREG_IDX(RcvTIDCnt) +#define kr_revision KREG_IDX(Revision) +#define kr_scratch KREG_IDX(Scratch) +#define kr_sendbuffererror KREG_IDX(SendBufErr0) +#define kr_sendctrl KREG_IDX(SendCtrl) +#define kr_senddmabase KREG_IDX(SendDmaBase) +#define kr_senddmabufmask0 KREG_IDX(SendDmaBufMask0) +#define kr_senddmabufmask1 (KREG_IDX(SendDmaBufMask0) + 1) +#define kr_senddmabufmask2 (KREG_IDX(SendDmaBufMask0) + 2) +#define kr_senddmahead KREG_IDX(SendDmaHead) +#define kr_senddmaheadaddr KREG_IDX(SendDmaHeadAddr) +#define kr_senddmalengen KREG_IDX(SendDmaLenGen) +#define kr_senddmastatus KREG_IDX(SendDmaStatus) +#define kr_senddmatail KREG_IDX(SendDmaTail) +#define kr_sendpioavailaddr KREG_IDX(SendBufAvailAddr) +#define kr_sendpiobufbase KREG_IDX(SendBufBase) +#define kr_sendpiobufcnt KREG_IDX(SendBufCnt) +#define kr_sendpiosize 
KREG_IDX(SendBufSize) +#define kr_sendregbase KREG_IDX(SendRegBase) +#define kr_userregbase KREG_IDX(UserRegBase) +#define kr_xgxs_cfg KREG_IDX(XGXSCfg) + +/* These must only be written via qib_write_kreg_ctxt() */ +#define kr_rcvhdraddr KREG_IDX(RcvHdrAddr0) +#define kr_rcvhdrtailaddr KREG_IDX(RcvHdrTailAddr0) + + +#define CREG_IDX(regname) ((QIB_7220_##regname##_OFFS - \ + QIB_7220_LBIntCnt_OFFS) / sizeof(u64)) + +#define cr_badformat CREG_IDX(RxVersionErrCnt) +#define cr_erricrc CREG_IDX(RxICRCErrCnt) +#define cr_errlink CREG_IDX(RxLinkMalformCnt) +#define cr_errlpcrc CREG_IDX(RxLPCRCErrCnt) +#define cr_errpkey CREG_IDX(RxPKeyMismatchCnt) +#define cr_rcvflowctrl_err CREG_IDX(RxFlowCtrlViolCnt) +#define cr_err_rlen CREG_IDX(RxLenErrCnt) +#define cr_errslen CREG_IDX(TxLenErrCnt) +#define cr_errtidfull CREG_IDX(RxTIDFullErrCnt) +#define cr_errtidvalid CREG_IDX(RxTIDValidErrCnt) +#define cr_errvcrc CREG_IDX(RxVCRCErrCnt) +#define cr_ibstatuschange CREG_IDX(IBStatusChangeCnt) +#define cr_lbint CREG_IDX(LBIntCnt) +#define cr_invalidrlen CREG_IDX(RxMaxMinLenErrCnt) +#define cr_invalidslen CREG_IDX(TxMaxMinLenErrCnt) +#define cr_lbflowstall CREG_IDX(LBFlowStallCnt) +#define cr_pktrcv CREG_IDX(RxDataPktCnt) +#define cr_pktrcvflowctrl CREG_IDX(RxFlowPktCnt) +#define cr_pktsend CREG_IDX(TxDataPktCnt) +#define cr_pktsendflow CREG_IDX(TxFlowPktCnt) +#define cr_portovfl CREG_IDX(RxP0HdrEgrOvflCnt) +#define cr_rcvebp CREG_IDX(RxEBPCnt) +#define cr_rcvovfl CREG_IDX(RxBufOvflCnt) +#define cr_senddropped CREG_IDX(TxDroppedPktCnt) +#define cr_sendstall CREG_IDX(TxFlowStallCnt) +#define cr_sendunderrun CREG_IDX(TxUnderrunCnt) +#define cr_wordrcv CREG_IDX(RxDwordCnt) +#define cr_wordsend CREG_IDX(TxDwordCnt) +#define cr_txunsupvl CREG_IDX(TxUnsupVLErrCnt) +#define cr_rxdroppkt CREG_IDX(RxDroppedPktCnt) +#define cr_iblinkerrrecov CREG_IDX(IBLinkErrRecoveryCnt) +#define cr_iblinkdown CREG_IDX(IBLinkDownedCnt) +#define cr_ibsymbolerr CREG_IDX(IBSymbolErrCnt) +#define cr_vl15droppedpkt CREG_IDX(RxVL15DroppedPktCnt) +#define cr_rxotherlocalphyerr CREG_IDX(RxOtherLocalPhyErrCnt) +#define cr_excessbufferovfl CREG_IDX(ExcessBufferOvflCnt) +#define cr_locallinkintegrityerr CREG_IDX(LocalLinkIntegrityErrCnt) +#define cr_rxvlerr CREG_IDX(RxVlErrCnt) +#define cr_rxdlidfltr CREG_IDX(RxDlidFltrCnt) +#define cr_psstat CREG_IDX(PSStat) +#define cr_psstart CREG_IDX(PSStart) +#define cr_psinterval CREG_IDX(PSInterval) +#define cr_psrcvdatacount CREG_IDX(PSRcvDataCount) +#define cr_psrcvpktscount CREG_IDX(PSRcvPktsCount) +#define cr_psxmitdatacount CREG_IDX(PSXmitDataCount) +#define cr_psxmitpktscount CREG_IDX(PSXmitPktsCount) +#define cr_psxmitwaitcount CREG_IDX(PSXmitWaitCount) +#define cr_txsdmadesc CREG_IDX(TxSDmaDescCnt) +#define cr_pcieretrydiag CREG_IDX(PcieRetryBufDiagQwordCnt) + +#define SYM_RMASK(regname, fldname) ((u64) \ + QIB_7220_##regname##_##fldname##_RMASK) +#define SYM_MASK(regname, fldname) ((u64) \ + QIB_7220_##regname##_##fldname##_RMASK << \ + QIB_7220_##regname##_##fldname##_LSB) +#define SYM_LSB(regname, fldname) (QIB_7220_##regname##_##fldname##_LSB) +#define SYM_FIELD(value, regname, fldname) ((u64) \ + (((value) >> SYM_LSB(regname, fldname)) & \ + SYM_RMASK(regname, fldname))) +#define ERR_MASK(fldname) SYM_MASK(ErrMask, fldname##Mask) +#define HWE_MASK(fldname) SYM_MASK(HwErrMask, fldname##Mask) + +/* ibcctrl bits */ +#define QLOGIC_IB_IBCC_LINKINITCMD_DISABLE 1 +/* cycle through TS1/TS2 till OK */ +#define QLOGIC_IB_IBCC_LINKINITCMD_POLL 2 +/* wait for TS1, then go on */ +#define 
QLOGIC_IB_IBCC_LINKINITCMD_SLEEP 3 +#define QLOGIC_IB_IBCC_LINKINITCMD_SHIFT 16 + +#define QLOGIC_IB_IBCC_LINKCMD_DOWN 1 /* move to 0x11 */ +#define QLOGIC_IB_IBCC_LINKCMD_ARMED 2 /* move to 0x21 */ +#define QLOGIC_IB_IBCC_LINKCMD_ACTIVE 3 /* move to 0x31 */ + +#define BLOB_7220_IBCHG 0x81 + +/* + * We could have a single register get/put routine, that takes a group type, + * but this is somewhat clearer and cleaner. It also gives us some error + * checking. 64 bit register reads should always work, but are inefficient + * on opteron (the northbridge always generates 2 separate HT 32 bit reads), + * so we use kreg32 wherever possible. User register and counter register + * reads are always 32 bit reads, so only one form of those routines. + */ + +/** + * qib_read_ureg32 - read 32-bit virtualized per-context register + * @dd: device + * @regno: register number + * @ctxt: context number + * + * Return the contents of a register that is virtualized to be per context. + * Returns -1 on errors (not distinguishable from valid contents at + * runtime; we may add a separate error variable at some point). + */ +static inline u32 qib_read_ureg32(const struct qib_devdata *dd, + enum qib_ureg regno, int ctxt) +{ + if (!dd->kregbase || !(dd->flags & QIB_PRESENT)) + return 0; + + if (dd->userbase) + return readl(regno + (u64 __iomem *) + ((char __iomem *)dd->userbase + + dd->ureg_align * ctxt)); + else + return readl(regno + (u64 __iomem *) + (dd->uregbase + + (char __iomem *)dd->kregbase + + dd->ureg_align * ctxt)); +} + +/** + * qib_write_ureg - write 32-bit virtualized per-context register + * @dd: device + * @regno: register number + * @value: value + * @ctxt: context + * + * Write the contents of a register that is virtualized to be per context. + */ +static inline void qib_write_ureg(const struct qib_devdata *dd, + enum qib_ureg regno, u64 value, int ctxt) +{ + u64 __iomem *ubase; + + if (dd->userbase) + ubase = (u64 __iomem *) + ((char __iomem *) dd->userbase + + dd->ureg_align * ctxt); + else + ubase = (u64 __iomem *) + (dd->uregbase + + (char __iomem *) dd->kregbase + + dd->ureg_align * ctxt); + + if (dd->kregbase && (dd->flags & QIB_PRESENT)) + writeq(value, &ubase[regno]); +} + +/** + * qib_write_kreg_ctxt - write a device's per-ctxt 64-bit kernel register + * @dd: the qlogic_ib device + * @regno: the register number to write + * @ctxt: the context containing the register + * @value: the value to write + */ +static inline void qib_write_kreg_ctxt(const struct qib_devdata *dd, + const u16 regno, unsigned ctxt, + u64 value) +{ + qib_write_kreg(dd, regno + ctxt, value); +} + +static inline void write_7220_creg(const struct qib_devdata *dd, + u16 regno, u64 value) +{ + if (dd->cspec->cregbase && (dd->flags & QIB_PRESENT)) + writeq(value, &dd->cspec->cregbase[regno]); +} + +static inline u64 read_7220_creg(const struct qib_devdata *dd, u16 regno) +{ + if (!dd->cspec->cregbase || !(dd->flags & QIB_PRESENT)) + return 0; + return readq(&dd->cspec->cregbase[regno]); +} + +static inline u32 read_7220_creg32(const struct qib_devdata *dd, u16 regno) +{ + if (!dd->cspec->cregbase || !(dd->flags & QIB_PRESENT)) + return 0; + return readl(&dd->cspec->cregbase[regno]); +} + +/* kr_revision bits */ +#define QLOGIC_IB_R_EMULATORREV_MASK ((1ULL << 22) - 1) +#define QLOGIC_IB_R_EMULATORREV_SHIFT 40 + +/* kr_control bits */ +#define QLOGIC_IB_C_RESET (1U << 7) + +/* kr_intstatus, kr_intclear, kr_intmask bits */ +#define QLOGIC_IB_I_RCVURG_MASK ((1ULL << 17) - 1) +#define QLOGIC_IB_I_RCVURG_SHIFT 32 +#define 
QLOGIC_IB_I_RCVAVAIL_MASK ((1ULL << 17) - 1) +#define QLOGIC_IB_I_RCVAVAIL_SHIFT 0 +#define QLOGIC_IB_I_SERDESTRIMDONE (1ULL << 27) + +#define QLOGIC_IB_C_FREEZEMODE 0x00000002 +#define QLOGIC_IB_C_LINKENABLE 0x00000004 + +#define QLOGIC_IB_I_SDMAINT 0x8000000000000000ULL +#define QLOGIC_IB_I_SDMADISABLED 0x4000000000000000ULL +#define QLOGIC_IB_I_ERROR 0x0000000080000000ULL +#define QLOGIC_IB_I_SPIOSENT 0x0000000040000000ULL +#define QLOGIC_IB_I_SPIOBUFAVAIL 0x0000000020000000ULL +#define QLOGIC_IB_I_GPIO 0x0000000010000000ULL + +/* variables for sanity checking interrupt and errors */ +#define QLOGIC_IB_I_BITSEXTANT \ + (QLOGIC_IB_I_SDMAINT | QLOGIC_IB_I_SDMADISABLED | \ + (QLOGIC_IB_I_RCVURG_MASK << QLOGIC_IB_I_RCVURG_SHIFT) | \ + (QLOGIC_IB_I_RCVAVAIL_MASK << \ + QLOGIC_IB_I_RCVAVAIL_SHIFT) | \ + QLOGIC_IB_I_ERROR | QLOGIC_IB_I_SPIOSENT | \ + QLOGIC_IB_I_SPIOBUFAVAIL | QLOGIC_IB_I_GPIO | \ + QLOGIC_IB_I_SERDESTRIMDONE) + +#define IB_HWE_BITSEXTANT \ + (HWE_MASK(RXEMemParityErr) | \ + HWE_MASK(TXEMemParityErr) | \ + (QLOGIC_IB_HWE_PCIEMEMPARITYERR_MASK << \ + QLOGIC_IB_HWE_PCIEMEMPARITYERR_SHIFT) | \ + QLOGIC_IB_HWE_PCIE1PLLFAILED | \ + QLOGIC_IB_HWE_PCIE0PLLFAILED | \ + QLOGIC_IB_HWE_PCIEPOISONEDTLP | \ + QLOGIC_IB_HWE_PCIECPLTIMEOUT | \ + QLOGIC_IB_HWE_PCIEBUSPARITYXTLH | \ + QLOGIC_IB_HWE_PCIEBUSPARITYXADM | \ + QLOGIC_IB_HWE_PCIEBUSPARITYRADM | \ + HWE_MASK(PowerOnBISTFailed) | \ + QLOGIC_IB_HWE_COREPLL_FBSLIP | \ + QLOGIC_IB_HWE_COREPLL_RFSLIP | \ + QLOGIC_IB_HWE_SERDESPLLFAILED | \ + HWE_MASK(IBCBusToSPCParityErr) | \ + HWE_MASK(IBCBusFromSPCParityErr) | \ + QLOGIC_IB_HWE_PCIECPLDATAQUEUEERR | \ + QLOGIC_IB_HWE_PCIECPLHDRQUEUEERR | \ + QLOGIC_IB_HWE_SDMAMEMREADERR | \ + QLOGIC_IB_HWE_CLK_UC_PLLNOTLOCKED | \ + QLOGIC_IB_HWE_PCIESERDESQ0PCLKNOTDETECT | \ + QLOGIC_IB_HWE_PCIESERDESQ1PCLKNOTDETECT | \ + QLOGIC_IB_HWE_PCIESERDESQ2PCLKNOTDETECT | \ + QLOGIC_IB_HWE_PCIESERDESQ3PCLKNOTDETECT | \ + QLOGIC_IB_HWE_DDSRXEQMEMORYPARITYERR | \ + QLOGIC_IB_HWE_IB_UC_MEMORYPARITYERR | \ + QLOGIC_IB_HWE_PCIE_UC_OCT0MEMORYPARITYERR | \ + QLOGIC_IB_HWE_PCIE_UC_OCT1MEMORYPARITYERR) + +#define IB_E_BITSEXTANT \ + (ERR_MASK(RcvFormatErr) | ERR_MASK(RcvVCRCErr) | \ + ERR_MASK(RcvICRCErr) | ERR_MASK(RcvMinPktLenErr) | \ + ERR_MASK(RcvMaxPktLenErr) | ERR_MASK(RcvLongPktLenErr) | \ + ERR_MASK(RcvShortPktLenErr) | ERR_MASK(RcvUnexpectedCharErr) | \ + ERR_MASK(RcvUnsupportedVLErr) | ERR_MASK(RcvEBPErr) | \ + ERR_MASK(RcvIBFlowErr) | ERR_MASK(RcvBadVersionErr) | \ + ERR_MASK(RcvEgrFullErr) | ERR_MASK(RcvHdrFullErr) | \ + ERR_MASK(RcvBadTidErr) | ERR_MASK(RcvHdrLenErr) | \ + ERR_MASK(RcvHdrErr) | ERR_MASK(RcvIBLostLinkErr) | \ + ERR_MASK(SendSpecialTriggerErr) | \ + ERR_MASK(SDmaDisabledErr) | ERR_MASK(SendMinPktLenErr) | \ + ERR_MASK(SendMaxPktLenErr) | ERR_MASK(SendUnderRunErr) | \ + ERR_MASK(SendPktLenErr) | ERR_MASK(SendDroppedSmpPktErr) | \ + ERR_MASK(SendDroppedDataPktErr) | \ + ERR_MASK(SendPioArmLaunchErr) | \ + ERR_MASK(SendUnexpectedPktNumErr) | \ + ERR_MASK(SendUnsupportedVLErr) | ERR_MASK(SendBufMisuseErr) | \ + ERR_MASK(SDmaGenMismatchErr) | ERR_MASK(SDmaOutOfBoundErr) | \ + ERR_MASK(SDmaTailOutOfBoundErr) | ERR_MASK(SDmaBaseErr) | \ + ERR_MASK(SDma1stDescErr) | ERR_MASK(SDmaRpyTagErr) | \ + ERR_MASK(SDmaDwEnErr) | ERR_MASK(SDmaMissingDwErr) | \ + ERR_MASK(SDmaUnexpDataErr) | \ + ERR_MASK(IBStatusChanged) | ERR_MASK(InvalidAddrErr) | \ + ERR_MASK(ResetNegated) | ERR_MASK(HardwareErr) | \ + ERR_MASK(SDmaDescAddrMisalignErr) | \ + ERR_MASK(InvalidEEPCmd)) + +/* kr_hwerrclear, kr_hwerrmask, 
kr_hwerrstatus, bits */ +#define QLOGIC_IB_HWE_PCIEMEMPARITYERR_MASK 0x00000000000000ffULL +#define QLOGIC_IB_HWE_PCIEMEMPARITYERR_SHIFT 0 +#define QLOGIC_IB_HWE_PCIEPOISONEDTLP 0x0000000010000000ULL +#define QLOGIC_IB_HWE_PCIECPLTIMEOUT 0x0000000020000000ULL +#define QLOGIC_IB_HWE_PCIEBUSPARITYXTLH 0x0000000040000000ULL +#define QLOGIC_IB_HWE_PCIEBUSPARITYXADM 0x0000000080000000ULL +#define QLOGIC_IB_HWE_PCIEBUSPARITYRADM 0x0000000100000000ULL +#define QLOGIC_IB_HWE_COREPLL_FBSLIP 0x0080000000000000ULL +#define QLOGIC_IB_HWE_COREPLL_RFSLIP 0x0100000000000000ULL +#define QLOGIC_IB_HWE_PCIE1PLLFAILED 0x0400000000000000ULL +#define QLOGIC_IB_HWE_PCIE0PLLFAILED 0x0800000000000000ULL +#define QLOGIC_IB_HWE_SERDESPLLFAILED 0x1000000000000000ULL +/* specific to this chip */ +#define QLOGIC_IB_HWE_PCIECPLDATAQUEUEERR 0x0000000000000040ULL +#define QLOGIC_IB_HWE_PCIECPLHDRQUEUEERR 0x0000000000000080ULL +#define QLOGIC_IB_HWE_SDMAMEMREADERR 0x0000000010000000ULL +#define QLOGIC_IB_HWE_CLK_UC_PLLNOTLOCKED 0x2000000000000000ULL +#define QLOGIC_IB_HWE_PCIESERDESQ0PCLKNOTDETECT 0x0100000000000000ULL +#define QLOGIC_IB_HWE_PCIESERDESQ1PCLKNOTDETECT 0x0200000000000000ULL +#define QLOGIC_IB_HWE_PCIESERDESQ2PCLKNOTDETECT 0x0400000000000000ULL +#define QLOGIC_IB_HWE_PCIESERDESQ3PCLKNOTDETECT 0x0800000000000000ULL +#define QLOGIC_IB_HWE_DDSRXEQMEMORYPARITYERR 0x0000008000000000ULL +#define QLOGIC_IB_HWE_IB_UC_MEMORYPARITYERR 0x0000004000000000ULL +#define QLOGIC_IB_HWE_PCIE_UC_OCT0MEMORYPARITYERR 0x0000001000000000ULL +#define QLOGIC_IB_HWE_PCIE_UC_OCT1MEMORYPARITYERR 0x0000002000000000ULL + +#define IBA7220_IBCC_LINKCMD_SHIFT 19 + +/* kr_ibcddrctrl bits */ +#define IBA7220_IBC_DLIDLMC_MASK 0xFFFFFFFFUL +#define IBA7220_IBC_DLIDLMC_SHIFT 32 + +#define IBA7220_IBC_HRTBT_MASK (SYM_RMASK(IBCDDRCtrl, HRTBT_AUTO) | \ + SYM_RMASK(IBCDDRCtrl, HRTBT_ENB)) +#define IBA7220_IBC_HRTBT_SHIFT SYM_LSB(IBCDDRCtrl, HRTBT_ENB) + +#define IBA7220_IBC_LANE_REV_SUPPORTED (1<<8) +#define IBA7220_IBC_LREV_MASK 1 +#define IBA7220_IBC_LREV_SHIFT 8 +#define IBA7220_IBC_RXPOL_MASK 1 +#define IBA7220_IBC_RXPOL_SHIFT 7 +#define IBA7220_IBC_WIDTH_SHIFT 5 +#define IBA7220_IBC_WIDTH_MASK 0x3 +#define IBA7220_IBC_WIDTH_1X_ONLY (0 << IBA7220_IBC_WIDTH_SHIFT) +#define IBA7220_IBC_WIDTH_4X_ONLY (1 << IBA7220_IBC_WIDTH_SHIFT) +#define IBA7220_IBC_WIDTH_AUTONEG (2 << IBA7220_IBC_WIDTH_SHIFT) +#define IBA7220_IBC_SPEED_AUTONEG (1 << 1) +#define IBA7220_IBC_SPEED_SDR (1 << 2) +#define IBA7220_IBC_SPEED_DDR (1 << 3) +#define IBA7220_IBC_SPEED_AUTONEG_MASK (0x7 << 1) +#define IBA7220_IBC_IBTA_1_2_MASK (1) + +/* kr_ibcddrstatus */ +/* link latency shift is 0, don't bother defining */ +#define IBA7220_DDRSTAT_LINKLAT_MASK 0x3ffffff + +/* kr_extstatus bits */ +#define QLOGIC_IB_EXTS_FREQSEL 0x2 +#define QLOGIC_IB_EXTS_SERDESSEL 0x4 +#define QLOGIC_IB_EXTS_MEMBIST_ENDTEST 0x0000000000004000 +#define QLOGIC_IB_EXTS_MEMBIST_DISABLED 0x0000000000008000 + +/* kr_xgxsconfig bits */ +#define QLOGIC_IB_XGXS_RESET 0x5ULL +#define QLOGIC_IB_XGXS_FC_SAFE (1ULL << 63) + +/* kr_rcvpktledcnt */ +#define IBA7220_LEDBLINK_ON_SHIFT 32 /* 4ns period on after packet */ +#define IBA7220_LEDBLINK_OFF_SHIFT 0 /* 4ns period off before next on */ + +#define _QIB_GPIO_SDA_NUM 1 +#define _QIB_GPIO_SCL_NUM 0 +#define QIB_TWSI_EEPROM_DEV 0xA2 /* All Production 7220 cards. 
*/ +#define QIB_TWSI_TEMP_DEV 0x98 + +/* HW counter clock is at 4nsec */ +#define QIB_7220_PSXMITWAIT_CHECK_RATE 4000 + +#define IBA7220_R_INTRAVAIL_SHIFT 17 +#define IBA7220_R_PKEY_DIS_SHIFT 34 +#define IBA7220_R_TAILUPD_SHIFT 35 +#define IBA7220_R_CTXTCFG_SHIFT 36 + +#define IBA7220_HDRHEAD_PKTINT_SHIFT 32 /* interrupt cnt in upper 32 bits */ + +/* + * the size bits give us 2^N, in KB units. 0 marks as invalid, + * and 7 is reserved. We currently use only 2KB and 4KB + */ +#define IBA7220_TID_SZ_SHIFT 37 /* shift to 3bit size selector */ +#define IBA7220_TID_SZ_2K (1UL << IBA7220_TID_SZ_SHIFT) /* 2KB */ +#define IBA7220_TID_SZ_4K (2UL << IBA7220_TID_SZ_SHIFT) /* 4KB */ +#define IBA7220_TID_PA_SHIFT 11U /* TID addr in chip stored w/o low bits */ +#define PBC_7220_VL15_SEND (1ULL << 63) /* pbc; VL15, no credit check */ +#define PBC_7220_VL15_SEND_CTRL (1ULL << 31) /* control version of same */ + +#define AUTONEG_TRIES 5 /* sequential retries to negotiate DDR */ + +/* packet rate matching delay multiplier */ +static u8 rate_to_delay[2][2] = { + /* 1x, 4x */ + { 8, 2 }, /* SDR */ + { 4, 1 } /* DDR */ +}; + +static u8 ib_rate_to_delay[IB_RATE_120_GBPS + 1] = { + [IB_RATE_2_5_GBPS] = 8, + [IB_RATE_5_GBPS] = 4, + [IB_RATE_10_GBPS] = 2, + [IB_RATE_20_GBPS] = 1 +}; + +#define IBA7220_LINKSPEED_SHIFT SYM_LSB(IBCStatus, LinkSpeedActive) +#define IBA7220_LINKWIDTH_SHIFT SYM_LSB(IBCStatus, LinkWidthActive) + +/* link training states, from IBC */ +#define IB_7220_LT_STATE_DISABLED 0x00 +#define IB_7220_LT_STATE_LINKUP 0x01 +#define IB_7220_LT_STATE_POLLACTIVE 0x02 +#define IB_7220_LT_STATE_POLLQUIET 0x03 +#define IB_7220_LT_STATE_SLEEPDELAY 0x04 +#define IB_7220_LT_STATE_SLEEPQUIET 0x05 +#define IB_7220_LT_STATE_CFGDEBOUNCE 0x08 +#define IB_7220_LT_STATE_CFGRCVFCFG 0x09 +#define IB_7220_LT_STATE_CFGWAITRMT 0x0a +#define IB_7220_LT_STATE_CFGIDLE 0x0b +#define IB_7220_LT_STATE_RECOVERRETRAIN 0x0c +#define IB_7220_LT_STATE_RECOVERWAITRMT 0x0e +#define IB_7220_LT_STATE_RECOVERIDLE 0x0f + +/* link state machine states from IBC */ +#define IB_7220_L_STATE_DOWN 0x0 +#define IB_7220_L_STATE_INIT 0x1 +#define IB_7220_L_STATE_ARM 0x2 +#define IB_7220_L_STATE_ACTIVE 0x3 +#define IB_7220_L_STATE_ACT_DEFER 0x4 + +static const u8 qib_7220_physportstate[0x20] = { + [IB_7220_LT_STATE_DISABLED] = IB_PHYSPORTSTATE_DISABLED, + [IB_7220_LT_STATE_LINKUP] = IB_PHYSPORTSTATE_LINKUP, + [IB_7220_LT_STATE_POLLACTIVE] = IB_PHYSPORTSTATE_POLL, + [IB_7220_LT_STATE_POLLQUIET] = IB_PHYSPORTSTATE_POLL, + [IB_7220_LT_STATE_SLEEPDELAY] = IB_PHYSPORTSTATE_SLEEP, + [IB_7220_LT_STATE_SLEEPQUIET] = IB_PHYSPORTSTATE_SLEEP, + [IB_7220_LT_STATE_CFGDEBOUNCE] = + IB_PHYSPORTSTATE_CFG_TRAIN, + [IB_7220_LT_STATE_CFGRCVFCFG] = + IB_PHYSPORTSTATE_CFG_TRAIN, + [IB_7220_LT_STATE_CFGWAITRMT] = + IB_PHYSPORTSTATE_CFG_TRAIN, + [IB_7220_LT_STATE_CFGIDLE] = IB_PHYSPORTSTATE_CFG_TRAIN, + [IB_7220_LT_STATE_RECOVERRETRAIN] = + IB_PHYSPORTSTATE_LINK_ERR_RECOVER, + [IB_7220_LT_STATE_RECOVERWAITRMT] = + IB_PHYSPORTSTATE_LINK_ERR_RECOVER, + [IB_7220_LT_STATE_RECOVERIDLE] = + IB_PHYSPORTSTATE_LINK_ERR_RECOVER, + [0x10] = IB_PHYSPORTSTATE_CFG_TRAIN, + [0x11] = IB_PHYSPORTSTATE_CFG_TRAIN, + [0x12] = IB_PHYSPORTSTATE_CFG_TRAIN, + [0x13] = IB_PHYSPORTSTATE_CFG_TRAIN, + [0x14] = IB_PHYSPORTSTATE_CFG_TRAIN, + [0x15] = IB_PHYSPORTSTATE_CFG_TRAIN, + [0x16] = IB_PHYSPORTSTATE_CFG_TRAIN, + [0x17] = IB_PHYSPORTSTATE_CFG_TRAIN +}; + +int qib_special_trigger; +module_param_named(special_trigger, qib_special_trigger, int, S_IRUGO); +MODULE_PARM_DESC(special_trigger, "Enable 
SpecialTrigger arm/launch"); + +#define IBCBUSFRSPCPARITYERR HWE_MASK(IBCBusFromSPCParityErr) +#define IBCBUSTOSPCPARITYERR HWE_MASK(IBCBusToSPCParityErr) + +#define SYM_MASK_BIT(regname, fldname, bit) ((u64) \ + (1ULL << (SYM_LSB(regname, fldname) + (bit)))) + +#define TXEMEMPARITYERR_PIOBUF \ + SYM_MASK_BIT(HwErrMask, TXEMemParityErrMask, 0) +#define TXEMEMPARITYERR_PIOPBC \ + SYM_MASK_BIT(HwErrMask, TXEMemParityErrMask, 1) +#define TXEMEMPARITYERR_PIOLAUNCHFIFO \ + SYM_MASK_BIT(HwErrMask, TXEMemParityErrMask, 2) + +#define RXEMEMPARITYERR_RCVBUF \ + SYM_MASK_BIT(HwErrMask, RXEMemParityErrMask, 0) +#define RXEMEMPARITYERR_LOOKUPQ \ + SYM_MASK_BIT(HwErrMask, RXEMemParityErrMask, 1) +#define RXEMEMPARITYERR_EXPTID \ + SYM_MASK_BIT(HwErrMask, RXEMemParityErrMask, 2) +#define RXEMEMPARITYERR_EAGERTID \ + SYM_MASK_BIT(HwErrMask, RXEMemParityErrMask, 3) +#define RXEMEMPARITYERR_FLAGBUF \ + SYM_MASK_BIT(HwErrMask, RXEMemParityErrMask, 4) +#define RXEMEMPARITYERR_DATAINFO \ + SYM_MASK_BIT(HwErrMask, RXEMemParityErrMask, 5) +#define RXEMEMPARITYERR_HDRINFO \ + SYM_MASK_BIT(HwErrMask, RXEMemParityErrMask, 6) + +/* 7220 specific hardware errors... */ +static const struct qib_hwerror_msgs qib_7220_hwerror_msgs[] = { + /* generic hardware errors */ + QLOGIC_IB_HWE_MSG(IBCBUSFRSPCPARITYERR, "QIB2IB Parity"), + QLOGIC_IB_HWE_MSG(IBCBUSTOSPCPARITYERR, "IB2QIB Parity"), + + QLOGIC_IB_HWE_MSG(TXEMEMPARITYERR_PIOBUF, + "TXE PIOBUF Memory Parity"), + QLOGIC_IB_HWE_MSG(TXEMEMPARITYERR_PIOPBC, + "TXE PIOPBC Memory Parity"), + QLOGIC_IB_HWE_MSG(TXEMEMPARITYERR_PIOLAUNCHFIFO, + "TXE PIOLAUNCHFIFO Memory Parity"), + + QLOGIC_IB_HWE_MSG(RXEMEMPARITYERR_RCVBUF, + "RXE RCVBUF Memory Parity"), + QLOGIC_IB_HWE_MSG(RXEMEMPARITYERR_LOOKUPQ, + "RXE LOOKUPQ Memory Parity"), + QLOGIC_IB_HWE_MSG(RXEMEMPARITYERR_EAGERTID, + "RXE EAGERTID Memory Parity"), + QLOGIC_IB_HWE_MSG(RXEMEMPARITYERR_EXPTID, + "RXE EXPTID Memory Parity"), + QLOGIC_IB_HWE_MSG(RXEMEMPARITYERR_FLAGBUF, + "RXE FLAGBUF Memory Parity"), + QLOGIC_IB_HWE_MSG(RXEMEMPARITYERR_DATAINFO, + "RXE DATAINFO Memory Parity"), + QLOGIC_IB_HWE_MSG(RXEMEMPARITYERR_HDRINFO, + "RXE HDRINFO Memory Parity"), + + /* chip-specific hardware errors */ + QLOGIC_IB_HWE_MSG(QLOGIC_IB_HWE_PCIEPOISONEDTLP, + "PCIe Poisoned TLP"), + QLOGIC_IB_HWE_MSG(QLOGIC_IB_HWE_PCIECPLTIMEOUT, + "PCIe completion timeout"), + /* + * In practice, it's unlikely wthat we'll see PCIe PLL, or bus + * parity or memory parity error failures, because most likely we + * won't be able to talk to the core of the chip. Nonetheless, we + * might see them, if they are in parts of the PCIe core that aren't + * essential. 
+ */ + QLOGIC_IB_HWE_MSG(QLOGIC_IB_HWE_PCIE1PLLFAILED, + "PCIePLL1"), + QLOGIC_IB_HWE_MSG(QLOGIC_IB_HWE_PCIE0PLLFAILED, + "PCIePLL0"), + QLOGIC_IB_HWE_MSG(QLOGIC_IB_HWE_PCIEBUSPARITYXTLH, + "PCIe XTLH core parity"), + QLOGIC_IB_HWE_MSG(QLOGIC_IB_HWE_PCIEBUSPARITYXADM, + "PCIe ADM TX core parity"), + QLOGIC_IB_HWE_MSG(QLOGIC_IB_HWE_PCIEBUSPARITYRADM, + "PCIe ADM RX core parity"), + QLOGIC_IB_HWE_MSG(QLOGIC_IB_HWE_SERDESPLLFAILED, + "SerDes PLL"), + QLOGIC_IB_HWE_MSG(QLOGIC_IB_HWE_PCIECPLDATAQUEUEERR, + "PCIe cpl header queue"), + QLOGIC_IB_HWE_MSG(QLOGIC_IB_HWE_PCIECPLHDRQUEUEERR, + "PCIe cpl data queue"), + QLOGIC_IB_HWE_MSG(QLOGIC_IB_HWE_SDMAMEMREADERR, + "Send DMA memory read"), + QLOGIC_IB_HWE_MSG(QLOGIC_IB_HWE_CLK_UC_PLLNOTLOCKED, + "uC PLL clock not locked"), + QLOGIC_IB_HWE_MSG(QLOGIC_IB_HWE_PCIESERDESQ0PCLKNOTDETECT, + "PCIe serdes Q0 no clock"), + QLOGIC_IB_HWE_MSG(QLOGIC_IB_HWE_PCIESERDESQ1PCLKNOTDETECT, + "PCIe serdes Q1 no clock"), + QLOGIC_IB_HWE_MSG(QLOGIC_IB_HWE_PCIESERDESQ2PCLKNOTDETECT, + "PCIe serdes Q2 no clock"), + QLOGIC_IB_HWE_MSG(QLOGIC_IB_HWE_PCIESERDESQ3PCLKNOTDETECT, + "PCIe serdes Q3 no clock"), + QLOGIC_IB_HWE_MSG(QLOGIC_IB_HWE_DDSRXEQMEMORYPARITYERR, + "DDS RXEQ memory parity"), + QLOGIC_IB_HWE_MSG(QLOGIC_IB_HWE_IB_UC_MEMORYPARITYERR, + "IB uC memory parity"), + QLOGIC_IB_HWE_MSG(QLOGIC_IB_HWE_PCIE_UC_OCT0MEMORYPARITYERR, + "PCIe uC oct0 memory parity"), + QLOGIC_IB_HWE_MSG(QLOGIC_IB_HWE_PCIE_UC_OCT1MEMORYPARITYERR, + "PCIe uC oct1 memory parity"), +}; + +#define RXE_PARITY (RXEMEMPARITYERR_EAGERTID|RXEMEMPARITYERR_EXPTID) + +#define QLOGIC_IB_E_PKTERRS (\ + ERR_MASK(SendPktLenErr) | \ + ERR_MASK(SendDroppedDataPktErr) | \ + ERR_MASK(RcvVCRCErr) | \ + ERR_MASK(RcvICRCErr) | \ + ERR_MASK(RcvShortPktLenErr) | \ + ERR_MASK(RcvEBPErr)) + +/* Convenience for decoding Send DMA errors */ +#define QLOGIC_IB_E_SDMAERRS ( \ + ERR_MASK(SDmaGenMismatchErr) | \ + ERR_MASK(SDmaOutOfBoundErr) | \ + ERR_MASK(SDmaTailOutOfBoundErr) | ERR_MASK(SDmaBaseErr) | \ + ERR_MASK(SDma1stDescErr) | ERR_MASK(SDmaRpyTagErr) | \ + ERR_MASK(SDmaDwEnErr) | ERR_MASK(SDmaMissingDwErr) | \ + ERR_MASK(SDmaUnexpDataErr) | \ + ERR_MASK(SDmaDescAddrMisalignErr) | \ + ERR_MASK(SDmaDisabledErr) | \ + ERR_MASK(SendBufMisuseErr)) + +/* These are all rcv-related errors which we want to count for stats */ +#define E_SUM_PKTERRS \ + (ERR_MASK(RcvHdrLenErr) | ERR_MASK(RcvBadTidErr) | \ + ERR_MASK(RcvBadVersionErr) | ERR_MASK(RcvHdrErr) | \ + ERR_MASK(RcvLongPktLenErr) | ERR_MASK(RcvShortPktLenErr) | \ + ERR_MASK(RcvMaxPktLenErr) | ERR_MASK(RcvMinPktLenErr) | \ + ERR_MASK(RcvFormatErr) | ERR_MASK(RcvUnsupportedVLErr) | \ + ERR_MASK(RcvUnexpectedCharErr) | ERR_MASK(RcvEBPErr)) + +/* These are all send-related errors which we want to count for stats */ +#define E_SUM_ERRS \ + (ERR_MASK(SendPioArmLaunchErr) | ERR_MASK(SendUnexpectedPktNumErr) | \ + ERR_MASK(SendDroppedDataPktErr) | ERR_MASK(SendDroppedSmpPktErr) | \ + ERR_MASK(SendMaxPktLenErr) | ERR_MASK(SendUnsupportedVLErr) | \ + ERR_MASK(SendMinPktLenErr) | ERR_MASK(SendPktLenErr) | \ + ERR_MASK(InvalidAddrErr)) + +/* + * this is similar to E_SUM_ERRS, but can't ignore armlaunch, don't ignore + * errors not related to freeze and cancelling buffers. Can't ignore + * armlaunch because could get more while still cleaning up, and need + * to cancel those as they happen. 
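The qib_7220_hwerror_msgs[] table above is meant to be scanned by the hardware-error handling later in this file: every entry whose mask intersects the status word contributes its message. A sketch of that table-driven decode; it assumes the mask/msg field names of struct qib_hwerror_msgs from qib.h, and the helper name is invented:

static void example_format_hwerrs(u64 hwerrs, char *msg, size_t msgl)
{
	int i;

	*msg = '\0';
	for (i = 0; i < ARRAY_SIZE(qib_7220_hwerror_msgs); i++) {
		if (hwerrs & qib_7220_hwerror_msgs[i].mask) {
			strlcat(msg, qib_7220_hwerror_msgs[i].msg, msgl);
			strlcat(msg, " ", msgl);
		}
	}
}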
+ */ +#define E_SPKT_ERRS_IGNORE \ + (ERR_MASK(SendDroppedDataPktErr) | ERR_MASK(SendDroppedSmpPktErr) | \ + ERR_MASK(SendMaxPktLenErr) | ERR_MASK(SendMinPktLenErr) | \ + ERR_MASK(SendPktLenErr)) + +/* + * these are errors that can occur when the link changes state while + * a packet is being sent or received. This doesn't cover things + * like EBP or VCRC that can be the result of a sending having the + * link change state, so we receive a "known bad" packet. + */ +#define E_SUM_LINK_PKTERRS \ + (ERR_MASK(SendDroppedDataPktErr) | ERR_MASK(SendDroppedSmpPktErr) | \ + ERR_MASK(SendMinPktLenErr) | ERR_MASK(SendPktLenErr) | \ + ERR_MASK(RcvShortPktLenErr) | ERR_MASK(RcvMinPktLenErr) | \ + ERR_MASK(RcvUnexpectedCharErr)) + +static void autoneg_7220_work(struct work_struct *); +static u32 __iomem *qib_7220_getsendbuf(struct qib_pportdata *, u64, u32 *); + +/* + * Called when we might have an error that is specific to a particular + * PIO buffer, and may need to cancel that buffer, so it can be re-used. + * because we don't need to force the update of pioavail. + */ +static void qib_disarm_7220_senderrbufs(struct qib_pportdata *ppd) +{ + unsigned long sbuf[3]; + struct qib_devdata *dd = ppd->dd; + + /* + * It's possible that sendbuffererror could have bits set; might + * have already done this as a result of hardware error handling. + */ + /* read these before writing errorclear */ + sbuf[0] = qib_read_kreg64(dd, kr_sendbuffererror); + sbuf[1] = qib_read_kreg64(dd, kr_sendbuffererror + 1); + sbuf[2] = qib_read_kreg64(dd, kr_sendbuffererror + 2); + + if (sbuf[0] || sbuf[1] || sbuf[2]) + qib_disarm_piobufs_set(dd, sbuf, + dd->piobcnt2k + dd->piobcnt4k); +} + +static void qib_7220_txe_recover(struct qib_devdata *dd) +{ + qib_devinfo(dd->pcidev, "Recovering from TXE PIO parity error\n"); + qib_disarm_7220_senderrbufs(dd->pport); +} + +/* + * This is called with interrupts disabled and sdma_lock held. 
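qib_disarm_7220_senderrbufs() above reads the three 64-bit SendBufErr registers into a bitmap with one bit per 2K/4K PIO buffer. A sketch (helper name invented, assuming the <linux/bitops.h> iterator) of how such a bitmap can be walked to report the offending buffers:

static void example_dump_senderrs(struct qib_devdata *dd,
				  const unsigned long *sbuf)
{
	unsigned cnt = dd->piobcnt2k + dd->piobcnt4k;
	unsigned i;

	/* sbuf holds one bit per PIO buffer, exactly as read above */
	for_each_set_bit(i, sbuf, cnt)
		qib_devinfo(dd->pcidev, "PIO buffer %u flagged a send error\n", i);
}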
+ */ +static void qib_7220_sdma_sendctrl(struct qib_pportdata *ppd, unsigned op) +{ + struct qib_devdata *dd = ppd->dd; + u64 set_sendctrl = 0; + u64 clr_sendctrl = 0; + + if (op & QIB_SDMA_SENDCTRL_OP_ENABLE) + set_sendctrl |= SYM_MASK(SendCtrl, SDmaEnable); + else + clr_sendctrl |= SYM_MASK(SendCtrl, SDmaEnable); + + if (op & QIB_SDMA_SENDCTRL_OP_INTENABLE) + set_sendctrl |= SYM_MASK(SendCtrl, SDmaIntEnable); + else + clr_sendctrl |= SYM_MASK(SendCtrl, SDmaIntEnable); + + if (op & QIB_SDMA_SENDCTRL_OP_HALT) + set_sendctrl |= SYM_MASK(SendCtrl, SDmaHalt); + else + clr_sendctrl |= SYM_MASK(SendCtrl, SDmaHalt); + + spin_lock(&dd->sendctrl_lock); + + dd->sendctrl |= set_sendctrl; + dd->sendctrl &= ~clr_sendctrl; + + qib_write_kreg(dd, kr_sendctrl, dd->sendctrl); + qib_write_kreg(dd, kr_scratch, 0); + + spin_unlock(&dd->sendctrl_lock); +} + +static void qib_decode_7220_sdma_errs(struct qib_pportdata *ppd, + u64 err, char *buf, size_t blen) +{ + static const struct { + u64 err; + const char *msg; + } errs[] = { + { ERR_MASK(SDmaGenMismatchErr), + "SDmaGenMismatch" }, + { ERR_MASK(SDmaOutOfBoundErr), + "SDmaOutOfBound" }, + { ERR_MASK(SDmaTailOutOfBoundErr), + "SDmaTailOutOfBound" }, + { ERR_MASK(SDmaBaseErr), + "SDmaBase" }, + { ERR_MASK(SDma1stDescErr), + "SDma1stDesc" }, + { ERR_MASK(SDmaRpyTagErr), + "SDmaRpyTag" }, + { ERR_MASK(SDmaDwEnErr), + "SDmaDwEn" }, + { ERR_MASK(SDmaMissingDwErr), + "SDmaMissingDw" }, + { ERR_MASK(SDmaUnexpDataErr), + "SDmaUnexpData" }, + { ERR_MASK(SDmaDescAddrMisalignErr), + "SDmaDescAddrMisalign" }, + { ERR_MASK(SendBufMisuseErr), + "SendBufMisuse" }, + { ERR_MASK(SDmaDisabledErr), + "SDmaDisabled" }, + }; + int i; + size_t bidx = 0; + + for (i = 0; i < ARRAY_SIZE(errs); i++) { + if (err & errs[i].err) + bidx += scnprintf(buf + bidx, blen - bidx, + "%s ", errs[i].msg); + } +} + +/* + * This is called as part of link down clean up so disarm and flush + * all send buffers so that SMP packets can be sent. + */ +static void qib_7220_sdma_hw_clean_up(struct qib_pportdata *ppd) +{ + /* This will trigger the Abort interrupt */ + sendctrl_7220_mod(ppd, QIB_SENDCTRL_DISARM_ALL | QIB_SENDCTRL_FLUSH | + QIB_SENDCTRL_AVAIL_BLIP); + ppd->dd->upd_pio_shadow = 1; /* update our idea of what's busy */ +} + +static void qib_sdma_7220_setlengen(struct qib_pportdata *ppd) +{ + /* + * Set SendDmaLenGen and clear and set + * the MSB of the generation count to enable generation checking + * and load the internal generation counter. 
+ */ + qib_write_kreg(ppd->dd, kr_senddmalengen, ppd->sdma_descq_cnt); + qib_write_kreg(ppd->dd, kr_senddmalengen, + ppd->sdma_descq_cnt | + (1ULL << QIB_7220_SendDmaLenGen_Generation_MSB)); +} + +static void qib_7220_sdma_hw_start_up(struct qib_pportdata *ppd) +{ + qib_sdma_7220_setlengen(ppd); + qib_sdma_update_7220_tail(ppd, 0); /* Set SendDmaTail */ + ppd->sdma_head_dma[0] = 0; +} + +#define DISABLES_SDMA ( \ + ERR_MASK(SDmaDisabledErr) | \ + ERR_MASK(SDmaBaseErr) | \ + ERR_MASK(SDmaTailOutOfBoundErr) | \ + ERR_MASK(SDmaOutOfBoundErr) | \ + ERR_MASK(SDma1stDescErr) | \ + ERR_MASK(SDmaRpyTagErr) | \ + ERR_MASK(SDmaGenMismatchErr) | \ + ERR_MASK(SDmaDescAddrMisalignErr) | \ + ERR_MASK(SDmaMissingDwErr) | \ + ERR_MASK(SDmaDwEnErr)) + +static void sdma_7220_errors(struct qib_pportdata *ppd, u64 errs) +{ + unsigned long flags; + struct qib_devdata *dd = ppd->dd; + char *msg; + + errs &= QLOGIC_IB_E_SDMAERRS; + + msg = dd->cspec->sdmamsgbuf; + qib_decode_7220_sdma_errs(ppd, errs, msg, + sizeof(dd->cspec->sdmamsgbuf)); + spin_lock_irqsave(&ppd->sdma_lock, flags); + + if (errs & ERR_MASK(SendBufMisuseErr)) { + unsigned long sbuf[3]; + + sbuf[0] = qib_read_kreg64(dd, kr_sendbuffererror); + sbuf[1] = qib_read_kreg64(dd, kr_sendbuffererror + 1); + sbuf[2] = qib_read_kreg64(dd, kr_sendbuffererror + 2); + + qib_dev_err(ppd->dd, + "IB%u:%u SendBufMisuse: %04lx %016lx %016lx\n", + ppd->dd->unit, ppd->port, sbuf[2], sbuf[1], + sbuf[0]); + } + + if (errs & ERR_MASK(SDmaUnexpDataErr)) + qib_dev_err(dd, "IB%u:%u SDmaUnexpData\n", ppd->dd->unit, + ppd->port); + + switch (ppd->sdma_state.current_state) { + case qib_sdma_state_s00_hw_down: + /* not expecting any interrupts */ + break; + + case qib_sdma_state_s10_hw_start_up_wait: + /* handled in intr path */ + break; + + case qib_sdma_state_s20_idle: + /* not expecting any interrupts */ + break; + + case qib_sdma_state_s30_sw_clean_up_wait: + /* not expecting any interrupts */ + break; + + case qib_sdma_state_s40_hw_clean_up_wait: + if (errs & ERR_MASK(SDmaDisabledErr)) + __qib_sdma_process_event(ppd, + qib_sdma_event_e50_hw_cleaned); + break; + + case qib_sdma_state_s50_hw_halt_wait: + /* handled in intr path */ + break; + + case qib_sdma_state_s99_running: + if (errs & DISABLES_SDMA) + __qib_sdma_process_event(ppd, + qib_sdma_event_e7220_err_halted); + break; + } + + spin_unlock_irqrestore(&ppd->sdma_lock, flags); +} + +/* + * Decode the error status into strings, deciding whether to always + * print * it or not depending on "normal packet errors" vs everything + * else. Return 1 if "real" errors, otherwise 0 if only packet + * errors, so caller can decide what to print with the string. 
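A minimal sketch (wrapper invented for illustration) of how the SDMA code is expected to drive the qib_7220_sdma_sendctrl() helper defined earlier in this hunk; per its comment, the call happens with interrupts disabled and sdma_lock held:

static void example_sdma_enable(struct qib_pportdata *ppd)
{
	/* caller already holds ppd->sdma_lock with interrupts disabled */
	qib_7220_sdma_sendctrl(ppd, QIB_SDMA_SENDCTRL_OP_ENABLE |
				    QIB_SDMA_SENDCTRL_OP_INTENABLE);
	/* OP_HALT is not passed, so SDmaHalt is cleared and the engine runs */
}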
+ */ +static int qib_decode_7220_err(struct qib_devdata *dd, char *buf, size_t blen, + u64 err) +{ + int iserr = 1; + + *buf = '\0'; + if (err & QLOGIC_IB_E_PKTERRS) { + if (!(err & ~QLOGIC_IB_E_PKTERRS)) + iserr = 0; + if ((err & ERR_MASK(RcvICRCErr)) && + !(err & (ERR_MASK(RcvVCRCErr) | ERR_MASK(RcvEBPErr)))) + strlcat(buf, "CRC ", blen); + if (!iserr) + goto done; + } + if (err & ERR_MASK(RcvHdrLenErr)) + strlcat(buf, "rhdrlen ", blen); + if (err & ERR_MASK(RcvBadTidErr)) + strlcat(buf, "rbadtid ", blen); + if (err & ERR_MASK(RcvBadVersionErr)) + strlcat(buf, "rbadversion ", blen); + if (err & ERR_MASK(RcvHdrErr)) + strlcat(buf, "rhdr ", blen); + if (err & ERR_MASK(SendSpecialTriggerErr)) + strlcat(buf, "sendspecialtrigger ", blen); + if (err & ERR_MASK(RcvLongPktLenErr)) + strlcat(buf, "rlongpktlen ", blen); + if (err & ERR_MASK(RcvMaxPktLenErr)) + strlcat(buf, "rmaxpktlen ", blen); + if (err & ERR_MASK(RcvMinPktLenErr)) + strlcat(buf, "rminpktlen ", blen); + if (err & ERR_MASK(SendMinPktLenErr)) + strlcat(buf, "sminpktlen ", blen); + if (err & ERR_MASK(RcvFormatErr)) + strlcat(buf, "rformaterr ", blen); + if (err & ERR_MASK(RcvUnsupportedVLErr)) + strlcat(buf, "runsupvl ", blen); + if (err & ERR_MASK(RcvUnexpectedCharErr)) + strlcat(buf, "runexpchar ", blen); + if (err & ERR_MASK(RcvIBFlowErr)) + strlcat(buf, "ribflow ", blen); + if (err & ERR_MASK(SendUnderRunErr)) + strlcat(buf, "sunderrun ", blen); + if (err & ERR_MASK(SendPioArmLaunchErr)) + strlcat(buf, "spioarmlaunch ", blen); + if (err & ERR_MASK(SendUnexpectedPktNumErr)) + strlcat(buf, "sunexperrpktnum ", blen); + if (err & ERR_MASK(SendDroppedSmpPktErr)) + strlcat(buf, "sdroppedsmppkt ", blen); + if (err & ERR_MASK(SendMaxPktLenErr)) + strlcat(buf, "smaxpktlen ", blen); + if (err & ERR_MASK(SendUnsupportedVLErr)) + strlcat(buf, "sunsupVL ", blen); + if (err & ERR_MASK(InvalidAddrErr)) + strlcat(buf, "invalidaddr ", blen); + if (err & ERR_MASK(RcvEgrFullErr)) + strlcat(buf, "rcvegrfull ", blen); + if (err & ERR_MASK(RcvHdrFullErr)) + strlcat(buf, "rcvhdrfull ", blen); + if (err & ERR_MASK(IBStatusChanged)) + strlcat(buf, "ibcstatuschg ", blen); + if (err & ERR_MASK(RcvIBLostLinkErr)) + strlcat(buf, "riblostlink ", blen); + if (err & ERR_MASK(HardwareErr)) + strlcat(buf, "hardware ", blen); + if (err & ERR_MASK(ResetNegated)) + strlcat(buf, "reset ", blen); + if (err & QLOGIC_IB_E_SDMAERRS) + qib_decode_7220_sdma_errs(dd->pport, err, buf, blen); + if (err & ERR_MASK(InvalidEEPCmd)) + strlcat(buf, "invalideepromcmd ", blen); +done: + return iserr; +} + +static void reenable_7220_chase(unsigned long opaque) +{ + struct qib_pportdata *ppd = (struct qib_pportdata *)opaque; + + ppd->cpspec->chase_timer.expires = 0; + qib_set_ib_7220_lstate(ppd, QLOGIC_IB_IBCC_LINKCMD_DOWN, + QLOGIC_IB_IBCC_LINKINITCMD_POLL); +} + +static void handle_7220_chase(struct qib_pportdata *ppd, u64 ibcst) +{ + u8 ibclt; + unsigned long tnow; + + ibclt = (u8)SYM_FIELD(ibcst, IBCStatus, LinkTrainingState); + + /* + * Detect and handle the state chase issue, where we can + * get stuck if we are unlucky on timing on both sides of + * the link. If we are, we disable, set a timer, and + * then re-enable. 
+ */ + switch (ibclt) { + case IB_7220_LT_STATE_CFGRCVFCFG: + case IB_7220_LT_STATE_CFGWAITRMT: + case IB_7220_LT_STATE_TXREVLANES: + case IB_7220_LT_STATE_CFGENH: + tnow = jiffies; + if (ppd->cpspec->chase_end && + time_after(tnow, ppd->cpspec->chase_end)) { + ppd->cpspec->chase_end = 0; + qib_set_ib_7220_lstate(ppd, + QLOGIC_IB_IBCC_LINKCMD_DOWN, + QLOGIC_IB_IBCC_LINKINITCMD_DISABLE); + ppd->cpspec->chase_timer.expires = jiffies + + QIB_CHASE_DIS_TIME; + add_timer(&ppd->cpspec->chase_timer); + } else if (!ppd->cpspec->chase_end) + ppd->cpspec->chase_end = tnow + QIB_CHASE_TIME; + break; + + default: + ppd->cpspec->chase_end = 0; + break; + } +} + +static void handle_7220_errors(struct qib_devdata *dd, u64 errs) +{ + char *msg; + u64 ignore_this_time = 0; + u64 iserr = 0; + int log_idx; + struct qib_pportdata *ppd = dd->pport; + u64 mask; + + /* don't report errors that are masked */ + errs &= dd->cspec->errormask; + msg = dd->cspec->emsgbuf; + + /* do these first, they are most important */ + if (errs & ERR_MASK(HardwareErr)) + qib_7220_handle_hwerrors(dd, msg, sizeof(dd->cspec->emsgbuf)); + else + for (log_idx = 0; log_idx < QIB_EEP_LOG_CNT; ++log_idx) + if (errs & dd->eep_st_masks[log_idx].errs_to_log) + qib_inc_eeprom_err(dd, log_idx, 1); + + if (errs & QLOGIC_IB_E_SDMAERRS) + sdma_7220_errors(ppd, errs); + + if (errs & ~IB_E_BITSEXTANT) + qib_dev_err(dd, + "error interrupt with unknown errors %llx set\n", + (unsigned long long) (errs & ~IB_E_BITSEXTANT)); + + if (errs & E_SUM_ERRS) { + qib_disarm_7220_senderrbufs(ppd); + if ((errs & E_SUM_LINK_PKTERRS) && + !(ppd->lflags & QIBL_LINKACTIVE)) { + /* + * This can happen when trying to bring the link + * up, but the IB link changes state at the "wrong" + * time. The IB logic then complains that the packet + * isn't valid. We don't want to confuse people, so + * we just don't print them, except at debug + */ + ignore_this_time = errs & E_SUM_LINK_PKTERRS; + } + } else if ((errs & E_SUM_LINK_PKTERRS) && + !(ppd->lflags & QIBL_LINKACTIVE)) { + /* + * This can happen when SMA is trying to bring the link + * up, but the IB link changes state at the "wrong" time. + * The IB logic then complains that the packet isn't + * valid. We don't want to confuse people, so we just + * don't print them, except at debug + */ + ignore_this_time = errs & E_SUM_LINK_PKTERRS; + } + + qib_write_kreg(dd, kr_errclear, errs); + + errs &= ~ignore_this_time; + if (!errs) + goto done; + + /* + * The ones we mask off are handled specially below + * or above. Also mask SDMADISABLED by default as it + * is too chatty. + */ + mask = ERR_MASK(IBStatusChanged) | + ERR_MASK(RcvEgrFullErr) | ERR_MASK(RcvHdrFullErr) | + ERR_MASK(HardwareErr) | ERR_MASK(SDmaDisabledErr); + + qib_decode_7220_err(dd, msg, sizeof(dd->cspec->emsgbuf), errs & ~mask); + + if (errs & E_SUM_PKTERRS) + qib_stats.sps_rcverrs++; + if (errs & E_SUM_ERRS) + qib_stats.sps_txerrs++; + iserr = errs & ~(E_SUM_PKTERRS | QLOGIC_IB_E_PKTERRS | + ERR_MASK(SDmaDisabledErr)); + + if (errs & ERR_MASK(IBStatusChanged)) { + u64 ibcs; + + ibcs = qib_read_kreg64(dd, kr_ibcstatus); + if (!(ppd->lflags & QIBL_IB_AUTONEG_INPROG)) + handle_7220_chase(ppd, ibcs); + + /* Update our picture of width and speed from chip */ + ppd->link_width_active = + ((ibcs >> IBA7220_LINKWIDTH_SHIFT) & 1) ? + IB_WIDTH_4X : IB_WIDTH_1X; + ppd->link_speed_active = + ((ibcs >> IBA7220_LINKSPEED_SHIFT) & 1) ? 
+ QIB_IB_DDR : QIB_IB_SDR;
+
+ /*
+ * Since going into a recovery state causes the link state
+ * to go down and since recovery is transitory, it is better
+ * if we "miss" ever seeing the link training state go into
+ * recovery (i.e., ignore this transition for link state
+ * special handling purposes) without updating lastibcstat.
+ */
+ if (qib_7220_phys_portstate(ibcs) !=
+ IB_PHYSPORTSTATE_LINK_ERR_RECOVER)
+ qib_handle_e_ibstatuschanged(ppd, ibcs);
+ }
+
+ if (errs & ERR_MASK(ResetNegated)) {
+ qib_dev_err(dd,
+ "Got reset, requires re-init (unload and reload driver)\n");
+ dd->flags &= ~QIB_INITTED; /* needs re-init */
+ /* mark as having had error */
+ *dd->devstatusp |= QIB_STATUS_HWERROR;
+ *dd->pport->statusp &= ~QIB_STATUS_IB_CONF;
+ }
+
+ if (*msg && iserr)
+ qib_dev_porterr(dd, ppd->port, "%s error\n", msg);
+
+ if (ppd->state_wanted & ppd->lflags)
+ wake_up_interruptible(&ppd->state_wait);
+
+ /*
+ * If there were hdrq or egrfull errors, wake up any processes
+ * waiting in poll. We used to try to check which contexts had
+ * the overflow, but given the cost of that and the chip reads
+ * to support it, it's better to just wake everybody up if we
+ * get an overflow; waiters can poll again if it's not them.
+ */
+ if (errs & (ERR_MASK(RcvEgrFullErr) | ERR_MASK(RcvHdrFullErr))) {
+ qib_handle_urcv(dd, ~0U);
+ if (errs & ERR_MASK(RcvEgrFullErr))
+ qib_stats.sps_buffull++;
+ else
+ qib_stats.sps_hdrfull++;
+ }
+done:
+ return;
+}
+
+/* enable/disable chip from delivering interrupts */
+static void qib_7220_set_intr_state(struct qib_devdata *dd, u32 enable)
+{
+ if (enable) {
+ if (dd->flags & QIB_BADINTR)
+ return;
+ qib_write_kreg(dd, kr_intmask, ~0ULL);
+ /* force re-interrupt of any pending interrupts. */
+ qib_write_kreg(dd, kr_intclear, 0ULL);
+ } else
+ qib_write_kreg(dd, kr_intmask, 0ULL);
+}
+
+/*
+ * Try to cleanup as much as possible for anything that might have gone
+ * wrong while in freeze mode, such as pio buffers being written by user
+ * processes (causing armlaunch), send errors due to going into freeze mode,
+ * etc., and try to avoid causing extra interrupts while doing so.
+ * Forcibly update the in-memory pioavail register copies after cleanup
+ * because the chip won't do it while in freeze mode (the register values
+ * themselves are kept correct).
+ * Make sure that we don't lose any important interrupts by using the chip
+ * feature that says that writing 0 to a bit in *clear that is set in
+ * *status will cause an interrupt to be generated again (if allowed by
+ * the *mask value).
+ * This is in chip-specific code because of all of the register accesses,
+ * even though the details are similar on most chips.
+ */
+static void qib_7220_clear_freeze(struct qib_devdata *dd)
+{
+ /* disable error interrupts, to avoid confusion */
+ qib_write_kreg(dd, kr_errmask, 0ULL);
+
+ /* also disable interrupts; errormask is sometimes overwritten */
+ qib_7220_set_intr_state(dd, 0);
+
+ qib_cancel_sends(dd->pport);
+
+ /* clear the freeze, and be sure chip saw it */
+ qib_write_kreg(dd, kr_control, dd->control);
+ qib_read_kreg32(dd, kr_scratch);
+
+ /* force in-memory update now we are out of freeze */
+ qib_force_pio_avail_update(dd);
+
+ /*
+ * force new interrupt if any hwerr, error or interrupt bits are
+ * still set, and clear "safe" send packet errors related to freeze
+ * and cancelling sends. Re-enable error interrupts before possible
+ * force of re-interrupt on pending interrupts.
+ */ + qib_write_kreg(dd, kr_hwerrclear, 0ULL); + qib_write_kreg(dd, kr_errclear, E_SPKT_ERRS_IGNORE); + qib_write_kreg(dd, kr_errmask, dd->cspec->errormask); + qib_7220_set_intr_state(dd, 1); +} + +/** + * qib_7220_handle_hwerrors - display hardware errors. + * @dd: the qlogic_ib device + * @msg: the output buffer + * @msgl: the size of the output buffer + * + * Use same msg buffer as regular errors to avoid excessive stack + * use. Most hardware errors are catastrophic, but for right now, + * we'll print them and continue. We reuse the same message buffer as + * handle_7220_errors() to avoid excessive stack usage. + */ +static void qib_7220_handle_hwerrors(struct qib_devdata *dd, char *msg, + size_t msgl) +{ + u64 hwerrs; + u32 bits, ctrl; + int isfatal = 0; + char *bitsmsg; + int log_idx; + + hwerrs = qib_read_kreg64(dd, kr_hwerrstatus); + if (!hwerrs) + goto bail; + if (hwerrs == ~0ULL) { + qib_dev_err(dd, + "Read of hardware error status failed (all bits set); ignoring\n"); + goto bail; + } + qib_stats.sps_hwerrs++; + + /* + * Always clear the error status register, except MEMBISTFAIL, + * regardless of whether we continue or stop using the chip. + * We want that set so we know it failed, even across driver reload. + * We'll still ignore it in the hwerrmask. We do this partly for + * diagnostics, but also for support. + */ + qib_write_kreg(dd, kr_hwerrclear, + hwerrs & ~HWE_MASK(PowerOnBISTFailed)); + + hwerrs &= dd->cspec->hwerrmask; + + /* We log some errors to EEPROM, check if we have any of those. */ + for (log_idx = 0; log_idx < QIB_EEP_LOG_CNT; ++log_idx) + if (hwerrs & dd->eep_st_masks[log_idx].hwerrs_to_log) + qib_inc_eeprom_err(dd, log_idx, 1); + if (hwerrs & ~(TXEMEMPARITYERR_PIOBUF | TXEMEMPARITYERR_PIOPBC | + RXE_PARITY)) + qib_devinfo(dd->pcidev, + "Hardware error: hwerr=0x%llx (cleared)\n", + (unsigned long long) hwerrs); + + if (hwerrs & ~IB_HWE_BITSEXTANT) + qib_dev_err(dd, + "hwerror interrupt with unknown errors %llx set\n", + (unsigned long long) (hwerrs & ~IB_HWE_BITSEXTANT)); + + if (hwerrs & QLOGIC_IB_HWE_IB_UC_MEMORYPARITYERR) + qib_sd7220_clr_ibpar(dd); + + ctrl = qib_read_kreg32(dd, kr_control); + if ((ctrl & QLOGIC_IB_C_FREEZEMODE) && !dd->diag_client) { + /* + * Parity errors in send memory are recoverable by h/w + * just do housekeeping, exit freeze mode and continue. 
+ */
+ if (hwerrs & (TXEMEMPARITYERR_PIOBUF |
+ TXEMEMPARITYERR_PIOPBC)) {
+ qib_7220_txe_recover(dd);
+ hwerrs &= ~(TXEMEMPARITYERR_PIOBUF |
+ TXEMEMPARITYERR_PIOPBC);
+ }
+ if (hwerrs)
+ isfatal = 1;
+ else
+ qib_7220_clear_freeze(dd);
+
+ *msg = '\0';
+
+ if (hwerrs & HWE_MASK(PowerOnBISTFailed)) {
+ isfatal = 1;
+ strlcat(msg,
+ "[Memory BIST test failed, InfiniPath hardware unusable]",
+ msgl);
+ /* ignore from now on, so disable until driver reloaded */
+ dd->cspec->hwerrmask &= ~HWE_MASK(PowerOnBISTFailed);
+ qib_write_kreg(dd, kr_hwerrmask, dd->cspec->hwerrmask);
+ }
+
+ qib_format_hwerrors(hwerrs, qib_7220_hwerror_msgs,
+ ARRAY_SIZE(qib_7220_hwerror_msgs), msg, msgl);
+
+ bitsmsg = dd->cspec->bitsmsgbuf;
+ if (hwerrs & (QLOGIC_IB_HWE_PCIEMEMPARITYERR_MASK <<
+ QLOGIC_IB_HWE_PCIEMEMPARITYERR_SHIFT)) {
+ bits = (u32) ((hwerrs >>
+ QLOGIC_IB_HWE_PCIEMEMPARITYERR_SHIFT) &
+ QLOGIC_IB_HWE_PCIEMEMPARITYERR_MASK);
+ snprintf(bitsmsg, sizeof(dd->cspec->bitsmsgbuf),
+ "[PCIe Mem Parity Errs %x] ", bits);
+ strlcat(msg, bitsmsg, msgl);
+ }
+
+#define _QIB_PLL_FAIL (QLOGIC_IB_HWE_COREPLL_FBSLIP | \
+ QLOGIC_IB_HWE_COREPLL_RFSLIP)
+
+ if (hwerrs & _QIB_PLL_FAIL) {
+ isfatal = 1;
+ snprintf(bitsmsg, sizeof(dd->cspec->bitsmsgbuf),
+ "[PLL failed (%llx), InfiniPath hardware unusable]",
+ (unsigned long long) hwerrs & _QIB_PLL_FAIL);
+ strlcat(msg, bitsmsg, msgl);
+ /* ignore from now on, so disable until driver reloaded */
+ dd->cspec->hwerrmask &= ~(hwerrs & _QIB_PLL_FAIL);
+ qib_write_kreg(dd, kr_hwerrmask, dd->cspec->hwerrmask);
+ }
+
+ if (hwerrs & QLOGIC_IB_HWE_SERDESPLLFAILED) {
+ /*
+ * If it occurs, it is left masked since the external
+ * interface is unused.
+ */
+ dd->cspec->hwerrmask &= ~QLOGIC_IB_HWE_SERDESPLLFAILED;
+ qib_write_kreg(dd, kr_hwerrmask, dd->cspec->hwerrmask);
+ }
+
+ qib_dev_err(dd, "%s hardware error\n", msg);
+
+ if (isfatal && !dd->diag_client) {
+ qib_dev_err(dd,
+ "Fatal Hardware Error, no longer usable, SN %.16s\n",
+ dd->serial);
+ /*
+ * For /sys status file and user programs to print; if no
+ * trailing brace is copied, we'll know it was truncated.
+ */
+ if (dd->freezemsg)
+ snprintf(dd->freezemsg, dd->freezelen,
+ "{%s}", msg);
+ qib_disable_after_error(dd);
+ }
+bail:;
+}
+
+/**
+ * qib_7220_init_hwerrors - enable hardware errors
+ * @dd: the qlogic_ib device
+ *
+ * now that we have finished initializing everything that might reasonably
+ * cause a hardware error, and cleared those error bits as they occur,
+ * we can enable hardware errors in the mask (potentially enabling
+ * freeze mode), and enable hardware errors as errors (along with
+ * everything else) in errormask
+ */
+static void qib_7220_init_hwerrors(struct qib_devdata *dd)
+{
+ u64 val;
+ u64 extsval;
+
+ extsval = qib_read_kreg64(dd, kr_extstatus);
+
+ if (!(extsval & (QLOGIC_IB_EXTS_MEMBIST_ENDTEST |
+ QLOGIC_IB_EXTS_MEMBIST_DISABLED)))
+ qib_dev_err(dd, "MemBIST did not complete!\n");
+ if (extsval & QLOGIC_IB_EXTS_MEMBIST_DISABLED)
+ qib_devinfo(dd->pcidev, "MemBIST is disabled.\n");
+
+ val = ~0ULL; /* default to all hwerrors become interrupts, */
+
+ val &= ~QLOGIC_IB_HWE_IB_UC_MEMORYPARITYERR;
+ dd->cspec->hwerrmask = val;
+
+ qib_write_kreg(dd, kr_hwerrclear, ~HWE_MASK(PowerOnBISTFailed));
+ qib_write_kreg(dd, kr_hwerrmask, dd->cspec->hwerrmask);
+
+ /* clear all */
+ qib_write_kreg(dd, kr_errclear, ~0ULL);
+ /* enable errors that are masked, at least this first time.
*/ + qib_write_kreg(dd, kr_errmask, ~0ULL); + dd->cspec->errormask = qib_read_kreg64(dd, kr_errmask); + /* clear any interrupts up to this point (ints still not enabled) */ + qib_write_kreg(dd, kr_intclear, ~0ULL); +} + +/* + * Disable and enable the armlaunch error. Used for PIO bandwidth testing + * on chips that are count-based, rather than trigger-based. There is no + * reference counting, but that's also fine, given the intended use. + * Only chip-specific because it's all register accesses + */ +static void qib_set_7220_armlaunch(struct qib_devdata *dd, u32 enable) +{ + if (enable) { + qib_write_kreg(dd, kr_errclear, ERR_MASK(SendPioArmLaunchErr)); + dd->cspec->errormask |= ERR_MASK(SendPioArmLaunchErr); + } else + dd->cspec->errormask &= ~ERR_MASK(SendPioArmLaunchErr); + qib_write_kreg(dd, kr_errmask, dd->cspec->errormask); +} + +/* + * Formerly took parameter in pre-shifted, + * pre-merged form with LinkCmd and LinkInitCmd + * together, and assuming the zero was NOP. + */ +static void qib_set_ib_7220_lstate(struct qib_pportdata *ppd, u16 linkcmd, + u16 linitcmd) +{ + u64 mod_wd; + struct qib_devdata *dd = ppd->dd; + unsigned long flags; + + if (linitcmd == QLOGIC_IB_IBCC_LINKINITCMD_DISABLE) { + /* + * If we are told to disable, note that so link-recovery + * code does not attempt to bring us back up. + */ + spin_lock_irqsave(&ppd->lflags_lock, flags); + ppd->lflags |= QIBL_IB_LINK_DISABLED; + spin_unlock_irqrestore(&ppd->lflags_lock, flags); + } else if (linitcmd || linkcmd == QLOGIC_IB_IBCC_LINKCMD_DOWN) { + /* + * Any other linkinitcmd will lead to LINKDOWN and then + * to INIT (if all is well), so clear flag to let + * link-recovery code attempt to bring us back up. + */ + spin_lock_irqsave(&ppd->lflags_lock, flags); + ppd->lflags &= ~QIBL_IB_LINK_DISABLED; + spin_unlock_irqrestore(&ppd->lflags_lock, flags); + } + + mod_wd = (linkcmd << IBA7220_IBCC_LINKCMD_SHIFT) | + (linitcmd << QLOGIC_IB_IBCC_LINKINITCMD_SHIFT); + + qib_write_kreg(dd, kr_ibcctrl, ppd->cpspec->ibcctrl | mod_wd); + /* write to chip to prevent back-to-back writes of ibc reg */ + qib_write_kreg(dd, kr_scratch, 0); +} + +/* + * All detailed interaction with the SerDes has been moved to qib_sd7220.c + * + * The portion of IBA7220-specific bringup_serdes() that actually deals with + * registers and memory within the SerDes itself is qib_sd7220_init(). + */ + +/** + * qib_7220_bringup_serdes - bring up the serdes + * @ppd: physical port on the qlogic_ib device + */ +static int qib_7220_bringup_serdes(struct qib_pportdata *ppd) +{ + struct qib_devdata *dd = ppd->dd; + u64 val, prev_val, guid, ibc; + int ret = 0; + + /* Put IBC in reset, sends disabled */ + dd->control &= ~QLOGIC_IB_C_LINKENABLE; + qib_write_kreg(dd, kr_control, 0ULL); + + if (qib_compat_ddr_negotiate) { + ppd->cpspec->ibdeltainprog = 1; + ppd->cpspec->ibsymsnap = read_7220_creg32(dd, cr_ibsymbolerr); + ppd->cpspec->iblnkerrsnap = + read_7220_creg32(dd, cr_iblinkerrrecov); + } + + /* flowcontrolwatermark is in units of KBytes */ + ibc = 0x5ULL << SYM_LSB(IBCCtrl, FlowCtrlWaterMark); + /* + * How often flowctrl sent. More or less in usecs; balance against + * watermark value, so that in theory senders always get a flow + * control update in time to not let the IB link go idle. + */ + ibc |= 0x3ULL << SYM_LSB(IBCCtrl, FlowCtrlPeriod); + /* max error tolerance */ + ibc |= 0xfULL << SYM_LSB(IBCCtrl, PhyerrThreshold); + /* use "real" buffer space for */ + ibc |= 4ULL << SYM_LSB(IBCCtrl, CreditScale); + /* IB credit flow control. 
*/ + ibc |= 0xfULL << SYM_LSB(IBCCtrl, OverrunThreshold); + /* + * set initial max size pkt IBC will send, including ICRC; it's the + * PIO buffer size in dwords, less 1; also see qib_set_mtu() + */ + ibc |= ((u64)(ppd->ibmaxlen >> 2) + 1) << SYM_LSB(IBCCtrl, MaxPktLen); + ppd->cpspec->ibcctrl = ibc; /* without linkcmd or linkinitcmd! */ + + /* initially come up waiting for TS1, without sending anything. */ + val = ppd->cpspec->ibcctrl | (QLOGIC_IB_IBCC_LINKINITCMD_DISABLE << + QLOGIC_IB_IBCC_LINKINITCMD_SHIFT); + qib_write_kreg(dd, kr_ibcctrl, val); + + if (!ppd->cpspec->ibcddrctrl) { + /* not on re-init after reset */ + ppd->cpspec->ibcddrctrl = qib_read_kreg64(dd, kr_ibcddrctrl); + + if (ppd->link_speed_enabled == (QIB_IB_SDR | QIB_IB_DDR)) + ppd->cpspec->ibcddrctrl |= + IBA7220_IBC_SPEED_AUTONEG_MASK | + IBA7220_IBC_IBTA_1_2_MASK; + else + ppd->cpspec->ibcddrctrl |= + ppd->link_speed_enabled == QIB_IB_DDR ? + IBA7220_IBC_SPEED_DDR : IBA7220_IBC_SPEED_SDR; + if ((ppd->link_width_enabled & (IB_WIDTH_1X | IB_WIDTH_4X)) == + (IB_WIDTH_1X | IB_WIDTH_4X)) + ppd->cpspec->ibcddrctrl |= IBA7220_IBC_WIDTH_AUTONEG; + else + ppd->cpspec->ibcddrctrl |= + ppd->link_width_enabled == IB_WIDTH_4X ? + IBA7220_IBC_WIDTH_4X_ONLY : + IBA7220_IBC_WIDTH_1X_ONLY; + + /* always enable these on driver reload, not sticky */ + ppd->cpspec->ibcddrctrl |= + IBA7220_IBC_RXPOL_MASK << IBA7220_IBC_RXPOL_SHIFT; + ppd->cpspec->ibcddrctrl |= + IBA7220_IBC_HRTBT_MASK << IBA7220_IBC_HRTBT_SHIFT; + + /* enable automatic lane reversal detection for receive */ + ppd->cpspec->ibcddrctrl |= IBA7220_IBC_LANE_REV_SUPPORTED; + } else + /* write to chip to prevent back-to-back writes of ibc reg */ + qib_write_kreg(dd, kr_scratch, 0); + + qib_write_kreg(dd, kr_ibcddrctrl, ppd->cpspec->ibcddrctrl); + qib_write_kreg(dd, kr_scratch, 0); + + qib_write_kreg(dd, kr_ncmodectrl, 0Ull); + qib_write_kreg(dd, kr_scratch, 0); + + ret = qib_sd7220_init(dd); + + val = qib_read_kreg64(dd, kr_xgxs_cfg); + prev_val = val; + val |= QLOGIC_IB_XGXS_FC_SAFE; + if (val != prev_val) { + qib_write_kreg(dd, kr_xgxs_cfg, val); + qib_read_kreg32(dd, kr_scratch); + } + if (val & QLOGIC_IB_XGXS_RESET) + val &= ~QLOGIC_IB_XGXS_RESET; + if (val != prev_val) + qib_write_kreg(dd, kr_xgxs_cfg, val); + + /* first time through, set port guid */ + if (!ppd->guid) + ppd->guid = dd->base_guid; + guid = be64_to_cpu(ppd->guid); + + qib_write_kreg(dd, kr_hrtbt_guid, guid); + if (!ret) { + dd->control |= QLOGIC_IB_C_LINKENABLE; + qib_write_kreg(dd, kr_control, dd->control); + } else + /* write to chip to prevent back-to-back writes of ibc reg */ + qib_write_kreg(dd, kr_scratch, 0); + return ret; +} + +/** + * qib_7220_quiet_serdes - set serdes to txidle + * @ppd: physical port of the qlogic_ib device + * Called when driver is being unloaded + */ +static void qib_7220_quiet_serdes(struct qib_pportdata *ppd) +{ + u64 val; + struct qib_devdata *dd = ppd->dd; + unsigned long flags; + + /* disable IBC */ + dd->control &= ~QLOGIC_IB_C_LINKENABLE; + qib_write_kreg(dd, kr_control, + dd->control | QLOGIC_IB_C_FREEZEMODE); + + ppd->cpspec->chase_end = 0; + if (ppd->cpspec->chase_timer.data) /* if initted */ + del_timer_sync(&ppd->cpspec->chase_timer); + + if (ppd->cpspec->ibsymdelta || ppd->cpspec->iblnkerrdelta || + ppd->cpspec->ibdeltainprog) { + u64 diagc; + + /* enable counter writes */ + diagc = qib_read_kreg64(dd, kr_hwdiagctrl); + qib_write_kreg(dd, kr_hwdiagctrl, + diagc | SYM_MASK(HwDiagCtrl, CounterWrEnable)); + + if (ppd->cpspec->ibsymdelta || ppd->cpspec->ibdeltainprog) { 
+ val = read_7220_creg32(dd, cr_ibsymbolerr); + if (ppd->cpspec->ibdeltainprog) + val -= val - ppd->cpspec->ibsymsnap; + val -= ppd->cpspec->ibsymdelta; + write_7220_creg(dd, cr_ibsymbolerr, val); + } + if (ppd->cpspec->iblnkerrdelta || ppd->cpspec->ibdeltainprog) { + val = read_7220_creg32(dd, cr_iblinkerrrecov); + if (ppd->cpspec->ibdeltainprog) + val -= val - ppd->cpspec->iblnkerrsnap; + val -= ppd->cpspec->iblnkerrdelta; + write_7220_creg(dd, cr_iblinkerrrecov, val); + } + + /* and disable counter writes */ + qib_write_kreg(dd, kr_hwdiagctrl, diagc); + } + qib_set_ib_7220_lstate(ppd, 0, QLOGIC_IB_IBCC_LINKINITCMD_DISABLE); + + spin_lock_irqsave(&ppd->lflags_lock, flags); + ppd->lflags &= ~QIBL_IB_AUTONEG_INPROG; + spin_unlock_irqrestore(&ppd->lflags_lock, flags); + wake_up(&ppd->cpspec->autoneg_wait); + cancel_delayed_work_sync(&ppd->cpspec->autoneg_work); + + shutdown_7220_relock_poll(ppd->dd); + val = qib_read_kreg64(ppd->dd, kr_xgxs_cfg); + val |= QLOGIC_IB_XGXS_RESET; + qib_write_kreg(ppd->dd, kr_xgxs_cfg, val); +} + +/** + * qib_setup_7220_setextled - set the state of the two external LEDs + * @dd: the qlogic_ib device + * @on: whether the link is up or not + * + * The exact combo of LEDs if on is true is determined by looking + * at the ibcstatus. + * + * These LEDs indicate the physical and logical state of IB link. + * For this chip (at least with recommended board pinouts), LED1 + * is Yellow (logical state) and LED2 is Green (physical state), + * + * Note: We try to match the Mellanox HCA LED behavior as best + * we can. Green indicates physical link state is OK (something is + * plugged in, and we can train). + * Amber indicates the link is logically up (ACTIVE). + * Mellanox further blinks the amber LED to indicate data packet + * activity, but we have no hardware support for that, so it would + * require waking up every 10-20 msecs and checking the counters + * on the chip, and then turning the LED off if appropriate. That's + * visible overhead, so not something we will do. + * + */ +static void qib_setup_7220_setextled(struct qib_pportdata *ppd, u32 on) +{ + struct qib_devdata *dd = ppd->dd; + u64 extctl, ledblink = 0, val, lst, ltst; + unsigned long flags; + + /* + * The diags use the LED to indicate diag info, so we leave + * the external LED alone when the diags are running. + */ + if (dd->diag_client) + return; + + if (ppd->led_override) { + ltst = (ppd->led_override & QIB_LED_PHYS) ? + IB_PHYSPORTSTATE_LINKUP : IB_PHYSPORTSTATE_DISABLED, + lst = (ppd->led_override & QIB_LED_LOG) ? + IB_PORT_ACTIVE : IB_PORT_DOWN; + } else if (on) { + val = qib_read_kreg64(dd, kr_ibcstatus); + ltst = qib_7220_phys_portstate(val); + lst = qib_7220_iblink_state(val); + } else { + ltst = 0; + lst = 0; + } + + spin_lock_irqsave(&dd->cspec->gpio_lock, flags); + extctl = dd->cspec->extctrl & ~(SYM_MASK(EXTCtrl, LEDPriPortGreenOn) | + SYM_MASK(EXTCtrl, LEDPriPortYellowOn)); + if (ltst == IB_PHYSPORTSTATE_LINKUP) { + extctl |= SYM_MASK(EXTCtrl, LEDPriPortGreenOn); + /* + * counts are in chip clock (4ns) periods. 
+ * This is 1/16 sec (66.6ms) on, + * 3/16 sec (187.5 ms) off, with packets rcvd + */ + ledblink = ((66600 * 1000UL / 4) << IBA7220_LEDBLINK_ON_SHIFT) + | ((187500 * 1000UL / 4) << IBA7220_LEDBLINK_OFF_SHIFT); + } + if (lst == IB_PORT_ACTIVE) + extctl |= SYM_MASK(EXTCtrl, LEDPriPortYellowOn); + dd->cspec->extctrl = extctl; + qib_write_kreg(dd, kr_extctrl, extctl); + spin_unlock_irqrestore(&dd->cspec->gpio_lock, flags); + + if (ledblink) /* blink the LED on packet receive */ + qib_write_kreg(dd, kr_rcvpktledcnt, ledblink); +} + +static void qib_7220_free_irq(struct qib_devdata *dd) +{ + if (dd->cspec->irq) { + free_irq(dd->cspec->irq, dd); + dd->cspec->irq = 0; + } + qib_nomsi(dd); +} + +/* + * qib_setup_7220_cleanup - clean up any per-chip chip-specific stuff + * @dd: the qlogic_ib device + * + * This is called during driver unload. + * + */ +static void qib_setup_7220_cleanup(struct qib_devdata *dd) +{ + qib_7220_free_irq(dd); + kfree(dd->cspec->cntrs); + kfree(dd->cspec->portcntrs); +} + +/* + * This is only called for SDmaInt. + * SDmaDisabled is handled on the error path. + */ +static void sdma_7220_intr(struct qib_pportdata *ppd, u64 istat) +{ + unsigned long flags; + + spin_lock_irqsave(&ppd->sdma_lock, flags); + + switch (ppd->sdma_state.current_state) { + case qib_sdma_state_s00_hw_down: + break; + + case qib_sdma_state_s10_hw_start_up_wait: + __qib_sdma_process_event(ppd, qib_sdma_event_e20_hw_started); + break; + + case qib_sdma_state_s20_idle: + break; + + case qib_sdma_state_s30_sw_clean_up_wait: + break; + + case qib_sdma_state_s40_hw_clean_up_wait: + break; + + case qib_sdma_state_s50_hw_halt_wait: + __qib_sdma_process_event(ppd, qib_sdma_event_e60_hw_halted); + break; + + case qib_sdma_state_s99_running: + /* too chatty to print here */ + __qib_sdma_intr(ppd); + break; + } + spin_unlock_irqrestore(&ppd->sdma_lock, flags); +} + +static void qib_wantpiobuf_7220_intr(struct qib_devdata *dd, u32 needint) +{ + unsigned long flags; + + spin_lock_irqsave(&dd->sendctrl_lock, flags); + if (needint) { + if (!(dd->sendctrl & SYM_MASK(SendCtrl, SendBufAvailUpd))) + goto done; + /* + * blip the availupd off, next write will be on, so + * we ensure an avail update, regardless of threshold or + * buffers becoming free, whenever we want an interrupt + */ + qib_write_kreg(dd, kr_sendctrl, dd->sendctrl & + ~SYM_MASK(SendCtrl, SendBufAvailUpd)); + qib_write_kreg(dd, kr_scratch, 0ULL); + dd->sendctrl |= SYM_MASK(SendCtrl, SendIntBufAvail); + } else + dd->sendctrl &= ~SYM_MASK(SendCtrl, SendIntBufAvail); + qib_write_kreg(dd, kr_sendctrl, dd->sendctrl); + qib_write_kreg(dd, kr_scratch, 0ULL); +done: + spin_unlock_irqrestore(&dd->sendctrl_lock, flags); +} + +/* + * Handle errors and unusual events first, separate function + * to improve cache hits for fast path interrupt handling. + */ +static noinline void unlikely_7220_intr(struct qib_devdata *dd, u64 istat) +{ + if (unlikely(istat & ~QLOGIC_IB_I_BITSEXTANT)) + qib_dev_err(dd, + "interrupt with unknown interrupts %Lx set\n", + istat & ~QLOGIC_IB_I_BITSEXTANT); + + if (istat & QLOGIC_IB_I_GPIO) { + u32 gpiostatus; + + /* + * Boards for this chip currently don't use GPIO interrupts, + * so clear by writing GPIOstatus to GPIOclear, and complain + * to alert developer. To avoid endless repeats, clear + * the bits in the mask, since there is some kind of + * programming error or chip problem. 
+ */ + gpiostatus = qib_read_kreg32(dd, kr_gpio_status); + /* + * In theory, writing GPIOstatus to GPIOclear could + * have a bad side-effect on some diagnostic that wanted + * to poll for a status-change, but the various shadows + * make that problematic at best. Diags will just suppress + * all GPIO interrupts during such tests. + */ + qib_write_kreg(dd, kr_gpio_clear, gpiostatus); + + if (gpiostatus) { + const u32 mask = qib_read_kreg32(dd, kr_gpio_mask); + u32 gpio_irq = mask & gpiostatus; + + /* + * A bit set in status and (chip) Mask register + * would cause an interrupt. Since we are not + * expecting any, report it. Also check that the + * chip reflects our shadow, report issues, + * and refresh from the shadow. + */ + /* + * Clear any troublemakers, and update chip + * from shadow + */ + dd->cspec->gpio_mask &= ~gpio_irq; + qib_write_kreg(dd, kr_gpio_mask, dd->cspec->gpio_mask); + } + } + + if (istat & QLOGIC_IB_I_ERROR) { + u64 estat; + + qib_stats.sps_errints++; + estat = qib_read_kreg64(dd, kr_errstatus); + if (!estat) + qib_devinfo(dd->pcidev, + "error interrupt (%Lx), but no error bits set!\n", + istat); + else + handle_7220_errors(dd, estat); + } +} + +static irqreturn_t qib_7220intr(int irq, void *data) +{ + struct qib_devdata *dd = data; + irqreturn_t ret; + u64 istat; + u64 ctxtrbits; + u64 rmask; + unsigned i; + + if ((dd->flags & (QIB_PRESENT | QIB_BADINTR)) != QIB_PRESENT) { + /* + * This return value is not great, but we do not want the + * interrupt core code to remove our interrupt handler + * because we don't appear to be handling an interrupt + * during a chip reset. + */ + ret = IRQ_HANDLED; + goto bail; + } + + istat = qib_read_kreg64(dd, kr_intstatus); + + if (unlikely(!istat)) { + ret = IRQ_NONE; /* not our interrupt, or already handled */ + goto bail; + } + if (unlikely(istat == -1)) { + qib_bad_intrstatus(dd); + /* don't know if it was our interrupt or not */ + ret = IRQ_NONE; + goto bail; + } + + this_cpu_inc(*dd->int_counter); + if (unlikely(istat & (~QLOGIC_IB_I_BITSEXTANT | + QLOGIC_IB_I_GPIO | QLOGIC_IB_I_ERROR))) + unlikely_7220_intr(dd, istat); + + /* + * Clear the interrupt bits we found set, relatively early, so we + * "know" know the chip will have seen this by the time we process + * the queue, and will re-interrupt if necessary. The processor + * itself won't take the interrupt again until we return. + */ + qib_write_kreg(dd, kr_intclear, istat); + + /* + * Handle kernel receive queues before checking for pio buffers + * available since receives can overflow; piobuf waiters can afford + * a few extra cycles, since they were waiting anyway. + */ + ctxtrbits = istat & + ((QLOGIC_IB_I_RCVAVAIL_MASK << QLOGIC_IB_I_RCVAVAIL_SHIFT) | + (QLOGIC_IB_I_RCVURG_MASK << QLOGIC_IB_I_RCVURG_SHIFT)); + if (ctxtrbits) { + rmask = (1ULL << QLOGIC_IB_I_RCVAVAIL_SHIFT) | + (1ULL << QLOGIC_IB_I_RCVURG_SHIFT); + for (i = 0; i < dd->first_user_ctxt; i++) { + if (ctxtrbits & rmask) { + ctxtrbits &= ~rmask; + qib_kreceive(dd->rcd[i], NULL, NULL); + } + rmask <<= 1; + } + if (ctxtrbits) { + ctxtrbits = + (ctxtrbits >> QLOGIC_IB_I_RCVAVAIL_SHIFT) | + (ctxtrbits >> QLOGIC_IB_I_RCVURG_SHIFT); + qib_handle_urcv(dd, ctxtrbits); + } + } + + /* only call for SDmaInt */ + if (istat & QLOGIC_IB_I_SDMAINT) + sdma_7220_intr(dd->pport, istat); + + if ((istat & QLOGIC_IB_I_SPIOBUFAVAIL) && (dd->flags & QIB_INITTED)) + qib_ib_piobufavail(dd); + + ret = IRQ_HANDLED; +bail: + return ret; +} + +/* + * Set up our chip-specific interrupt handler. 
+ * The interrupt type has already been setup, so + * we just need to do the registration and error checking. + * If we are using MSI interrupts, we may fall back to + * INTx later, if the interrupt handler doesn't get called + * within 1/2 second (see verify_interrupt()). + */ +static void qib_setup_7220_interrupt(struct qib_devdata *dd) +{ + if (!dd->cspec->irq) + qib_dev_err(dd, + "irq is 0, BIOS error? Interrupts won't work\n"); + else { + int ret = request_irq(dd->cspec->irq, qib_7220intr, + dd->msi_lo ? 0 : IRQF_SHARED, + QIB_DRV_NAME, dd); + + if (ret) + qib_dev_err(dd, + "Couldn't setup %s interrupt (irq=%d): %d\n", + dd->msi_lo ? "MSI" : "INTx", + dd->cspec->irq, ret); + } +} + +/** + * qib_7220_boardname - fill in the board name + * @dd: the qlogic_ib device + * + * info is based on the board revision register + */ +static void qib_7220_boardname(struct qib_devdata *dd) +{ + char *n; + u32 boardid, namelen; + + boardid = SYM_FIELD(dd->revision, Revision, + BoardID); + + switch (boardid) { + case 1: + n = "InfiniPath_QLE7240"; + break; + case 2: + n = "InfiniPath_QLE7280"; + break; + default: + qib_dev_err(dd, "Unknown 7220 board with ID %u\n", boardid); + n = "Unknown_InfiniPath_7220"; + break; + } + + namelen = strlen(n) + 1; + dd->boardname = kmalloc(namelen, GFP_KERNEL); + if (!dd->boardname) + qib_dev_err(dd, "Failed allocation for board name: %s\n", n); + else + snprintf(dd->boardname, namelen, "%s", n); + + if (dd->majrev != 5 || !dd->minrev || dd->minrev > 2) + qib_dev_err(dd, + "Unsupported InfiniPath hardware revision %u.%u!\n", + dd->majrev, dd->minrev); + + snprintf(dd->boardversion, sizeof(dd->boardversion), + "ChipABI %u.%u, %s, InfiniPath%u %u.%u, SW Compat %u\n", + QIB_CHIP_VERS_MAJ, QIB_CHIP_VERS_MIN, dd->boardname, + (unsigned)SYM_FIELD(dd->revision, Revision_R, Arch), + dd->majrev, dd->minrev, + (unsigned)SYM_FIELD(dd->revision, Revision_R, SW)); +} + +/* + * This routine sleeps, so it can only be called from user context, not + * from interrupt context. + */ +static int qib_setup_7220_reset(struct qib_devdata *dd) +{ + u64 val; + int i; + int ret; + u16 cmdval; + u8 int_line, clinesz; + unsigned long flags; + + qib_pcie_getcmd(dd, &cmdval, &int_line, &clinesz); + + /* Use dev_err so it shows up in logs, etc. */ + qib_dev_err(dd, "Resetting InfiniPath unit %u\n", dd->unit); + + /* no interrupts till re-initted */ + qib_7220_set_intr_state(dd, 0); + + dd->pport->cpspec->ibdeltainprog = 0; + dd->pport->cpspec->ibsymdelta = 0; + dd->pport->cpspec->iblnkerrdelta = 0; + + /* + * Keep chip from being accessed until we are ready. Use + * writeq() directly, to allow the write even though QIB_PRESENT + * isn't set. + */ + dd->flags &= ~(QIB_INITTED | QIB_PRESENT); + /* so we check interrupts work again */ + dd->z_int_counter = qib_int_counter(dd); + val = dd->control | QLOGIC_IB_C_RESET; + writeq(val, &dd->kregbase[kr_control]); + mb(); /* prevent compiler reordering around actual reset */ + + for (i = 1; i <= 5; i++) { + /* + * Allow MBIST, etc. to complete; longer on each retry. + * We sometimes get machine checks from bus timeout if no + * response, so for now, make it *really* long. + */ + msleep(1000 + (1 + i) * 2000); + + qib_pcie_reenable(dd, cmdval, int_line, clinesz); + + /* + * Use readq directly, so we don't need to mark it as PRESENT + * until we get a successful indication that all is well. 
+ */ + val = readq(&dd->kregbase[kr_revision]); + if (val == dd->revision) { + dd->flags |= QIB_PRESENT; /* it's back */ + ret = qib_reinit_intr(dd); + goto bail; + } + } + ret = 0; /* failed */ + +bail: + if (ret) { + if (qib_pcie_params(dd, dd->lbus_width, NULL, NULL)) + qib_dev_err(dd, + "Reset failed to setup PCIe or interrupts; continuing anyway\n"); + + /* hold IBC in reset, no sends, etc till later */ + qib_write_kreg(dd, kr_control, 0ULL); + + /* clear the reset error, init error/hwerror mask */ + qib_7220_init_hwerrors(dd); + + /* do setup similar to speed or link-width changes */ + if (dd->pport->cpspec->ibcddrctrl & IBA7220_IBC_IBTA_1_2_MASK) + dd->cspec->presets_needed = 1; + spin_lock_irqsave(&dd->pport->lflags_lock, flags); + dd->pport->lflags |= QIBL_IB_FORCE_NOTIFY; + dd->pport->lflags &= ~QIBL_IB_AUTONEG_FAILED; + spin_unlock_irqrestore(&dd->pport->lflags_lock, flags); + } + + return ret; +} + +/** + * qib_7220_put_tid - write a TID to the chip + * @dd: the qlogic_ib device + * @tidptr: pointer to the expected TID (in chip) to update + * @tidtype: 0 for eager, 1 for expected + * @pa: physical address of in memory buffer; tidinvalid if freeing + */ +static void qib_7220_put_tid(struct qib_devdata *dd, u64 __iomem *tidptr, + u32 type, unsigned long pa) +{ + if (pa != dd->tidinvalid) { + u64 chippa = pa >> IBA7220_TID_PA_SHIFT; + + /* paranoia checks */ + if (pa != (chippa << IBA7220_TID_PA_SHIFT)) { + qib_dev_err(dd, "Physaddr %lx not 2KB aligned!\n", + pa); + return; + } + if (chippa >= (1UL << IBA7220_TID_SZ_SHIFT)) { + qib_dev_err(dd, + "Physical page address 0x%lx larger than supported\n", + pa); + return; + } + + if (type == RCVHQ_RCV_TYPE_EAGER) + chippa |= dd->tidtemplate; + else /* for now, always full 4KB page */ + chippa |= IBA7220_TID_SZ_4K; + pa = chippa; + } + writeq(pa, tidptr); + mmiowb(); +} + +/** + * qib_7220_clear_tids - clear all TID entries for a ctxt, expected and eager + * @dd: the qlogic_ib device + * @ctxt: the ctxt + * + * clear all TID entries for a ctxt, expected and eager. + * Used from qib_close(). 
On this chip, TIDs are only 32 bits, + * not 64, but they are still on 64 bit boundaries, so tidbase + * is declared as u64 * for the pointer math, even though we write 32 bits + */ +static void qib_7220_clear_tids(struct qib_devdata *dd, + struct qib_ctxtdata *rcd) +{ + u64 __iomem *tidbase; + unsigned long tidinv; + u32 ctxt; + int i; + + if (!dd->kregbase || !rcd) + return; + + ctxt = rcd->ctxt; + + tidinv = dd->tidinvalid; + tidbase = (u64 __iomem *) + ((char __iomem *)(dd->kregbase) + + dd->rcvtidbase + + ctxt * dd->rcvtidcnt * sizeof(*tidbase)); + + for (i = 0; i < dd->rcvtidcnt; i++) + qib_7220_put_tid(dd, &tidbase[i], RCVHQ_RCV_TYPE_EXPECTED, + tidinv); + + tidbase = (u64 __iomem *) + ((char __iomem *)(dd->kregbase) + + dd->rcvegrbase + + rcd->rcvegr_tid_base * sizeof(*tidbase)); + + for (i = 0; i < rcd->rcvegrcnt; i++) + qib_7220_put_tid(dd, &tidbase[i], RCVHQ_RCV_TYPE_EAGER, + tidinv); +} + +/** + * qib_7220_tidtemplate - setup constants for TID updates + * @dd: the qlogic_ib device + * + * We setup stuff that we use a lot, to avoid calculating each time + */ +static void qib_7220_tidtemplate(struct qib_devdata *dd) +{ + if (dd->rcvegrbufsize == 2048) + dd->tidtemplate = IBA7220_TID_SZ_2K; + else if (dd->rcvegrbufsize == 4096) + dd->tidtemplate = IBA7220_TID_SZ_4K; + dd->tidinvalid = 0; +} + +/** + * qib_init_7220_get_base_info - set chip-specific flags for user code + * @rcd: the qlogic_ib ctxt + * @kbase: qib_base_info pointer + * + * We set the PCIE flag because the lower bandwidth on PCIe vs + * HyperTransport can affect some user packet algorithims. + */ +static int qib_7220_get_base_info(struct qib_ctxtdata *rcd, + struct qib_base_info *kinfo) +{ + kinfo->spi_runtime_flags |= QIB_RUNTIME_PCIE | + QIB_RUNTIME_NODMA_RTAIL | QIB_RUNTIME_SDMA; + + if (rcd->dd->flags & QIB_USE_SPCL_TRIG) + kinfo->spi_runtime_flags |= QIB_RUNTIME_SPECIAL_TRIGGER; + + return 0; +} + +static struct qib_message_header * +qib_7220_get_msgheader(struct qib_devdata *dd, __le32 *rhf_addr) +{ + u32 offset = qib_hdrget_offset(rhf_addr); + + return (struct qib_message_header *) + (rhf_addr - dd->rhf_offset + offset); +} + +static void qib_7220_config_ctxts(struct qib_devdata *dd) +{ + unsigned long flags; + u32 nchipctxts; + + nchipctxts = qib_read_kreg32(dd, kr_portcnt); + dd->cspec->numctxts = nchipctxts; + if (qib_n_krcv_queues > 1) { + dd->qpn_mask = 0x3e; + dd->first_user_ctxt = qib_n_krcv_queues * dd->num_pports; + if (dd->first_user_ctxt > nchipctxts) + dd->first_user_ctxt = nchipctxts; + } else + dd->first_user_ctxt = dd->num_pports; + dd->n_krcv_queues = dd->first_user_ctxt; + + if (!qib_cfgctxts) { + int nctxts = dd->first_user_ctxt + num_online_cpus(); + + if (nctxts <= 5) + dd->ctxtcnt = 5; + else if (nctxts <= 9) + dd->ctxtcnt = 9; + else if (nctxts <= nchipctxts) + dd->ctxtcnt = nchipctxts; + } else if (qib_cfgctxts <= nchipctxts) + dd->ctxtcnt = qib_cfgctxts; + if (!dd->ctxtcnt) /* none of the above, set to max */ + dd->ctxtcnt = nchipctxts; + + /* + * Chip can be configured for 5, 9, or 17 ctxts, and choice + * affects number of eager TIDs per ctxt (1K, 2K, 4K). + * Lock to be paranoid about later motion, etc. 
+ */ + spin_lock_irqsave(&dd->cspec->rcvmod_lock, flags); + if (dd->ctxtcnt > 9) + dd->rcvctrl |= 2ULL << IBA7220_R_CTXTCFG_SHIFT; + else if (dd->ctxtcnt > 5) + dd->rcvctrl |= 1ULL << IBA7220_R_CTXTCFG_SHIFT; + /* else configure for default 5 receive ctxts */ + if (dd->qpn_mask) + dd->rcvctrl |= 1ULL << QIB_7220_RcvCtrl_RcvQPMapEnable_LSB; + qib_write_kreg(dd, kr_rcvctrl, dd->rcvctrl); + spin_unlock_irqrestore(&dd->cspec->rcvmod_lock, flags); + + /* kr_rcvegrcnt changes based on the number of contexts enabled */ + dd->cspec->rcvegrcnt = qib_read_kreg32(dd, kr_rcvegrcnt); + dd->rcvhdrcnt = max(dd->cspec->rcvegrcnt, IBA7220_KRCVEGRCNT); +} + +static int qib_7220_get_ib_cfg(struct qib_pportdata *ppd, int which) +{ + int lsb, ret = 0; + u64 maskr; /* right-justified mask */ + + switch (which) { + case QIB_IB_CFG_LWID_ENB: /* Get allowed Link-width */ + ret = ppd->link_width_enabled; + goto done; + + case QIB_IB_CFG_LWID: /* Get currently active Link-width */ + ret = ppd->link_width_active; + goto done; + + case QIB_IB_CFG_SPD_ENB: /* Get allowed Link speeds */ + ret = ppd->link_speed_enabled; + goto done; + + case QIB_IB_CFG_SPD: /* Get current Link spd */ + ret = ppd->link_speed_active; + goto done; + + case QIB_IB_CFG_RXPOL_ENB: /* Get Auto-RX-polarity enable */ + lsb = IBA7220_IBC_RXPOL_SHIFT; + maskr = IBA7220_IBC_RXPOL_MASK; + break; + + case QIB_IB_CFG_LREV_ENB: /* Get Auto-Lane-reversal enable */ + lsb = IBA7220_IBC_LREV_SHIFT; + maskr = IBA7220_IBC_LREV_MASK; + break; + + case QIB_IB_CFG_LINKLATENCY: + ret = qib_read_kreg64(ppd->dd, kr_ibcddrstatus) + & IBA7220_DDRSTAT_LINKLAT_MASK; + goto done; + + case QIB_IB_CFG_OP_VLS: + ret = ppd->vls_operational; + goto done; + + case QIB_IB_CFG_VL_HIGH_CAP: + ret = 0; + goto done; + + case QIB_IB_CFG_VL_LOW_CAP: + ret = 0; + goto done; + + case QIB_IB_CFG_OVERRUN_THRESH: /* IB overrun threshold */ + ret = SYM_FIELD(ppd->cpspec->ibcctrl, IBCCtrl, + OverrunThreshold); + goto done; + + case QIB_IB_CFG_PHYERR_THRESH: /* IB PHY error threshold */ + ret = SYM_FIELD(ppd->cpspec->ibcctrl, IBCCtrl, + PhyerrThreshold); + goto done; + + case QIB_IB_CFG_LINKDEFAULT: /* IB link default (sleep/poll) */ + /* will only take effect when the link state changes */ + ret = (ppd->cpspec->ibcctrl & + SYM_MASK(IBCCtrl, LinkDownDefaultState)) ? + IB_LINKINITCMD_SLEEP : IB_LINKINITCMD_POLL; + goto done; + + case QIB_IB_CFG_HRTBT: /* Get Heartbeat off/enable/auto */ + lsb = IBA7220_IBC_HRTBT_SHIFT; + maskr = IBA7220_IBC_HRTBT_MASK; + break; + + case QIB_IB_CFG_PMA_TICKS: + /* + * 0x00 = 10x link transfer rate or 4 nsec. for 2.5Gbs + * Since the clock is always 250MHz, the value is 1 or 0. + */ + ret = (ppd->link_speed_active == QIB_IB_DDR); + goto done; + + default: + ret = -EINVAL; + goto done; + } + ret = (int)((ppd->cpspec->ibcddrctrl >> lsb) & maskr); +done: + return ret; +} + +static int qib_7220_set_ib_cfg(struct qib_pportdata *ppd, int which, u32 val) +{ + struct qib_devdata *dd = ppd->dd; + u64 maskr; /* right-justified mask */ + int lsb, ret = 0, setforce = 0; + u16 lcmd, licmd; + unsigned long flags; + u32 tmp = 0; + + switch (which) { + case QIB_IB_CFG_LIDLMC: + /* + * Set LID and LMC. Combined to avoid possible hazard + * caller puts LMC in 16MSbits, DLID in 16LSbits of val + */ + lsb = IBA7220_IBC_DLIDLMC_SHIFT; + maskr = IBA7220_IBC_DLIDLMC_MASK; + break; + + case QIB_IB_CFG_LWID_ENB: /* set allowed Link-width */ + /* + * As with speed, only write the actual register if + * the link is currently down, otherwise takes effect + * on next link change. 
+ */
+ ppd->link_width_enabled = val;
+ if (!(ppd->lflags & QIBL_LINKDOWN))
+ goto bail;
+ /*
+ * We set the QIBL_IB_FORCE_NOTIFY bit so updown
+ * will get called because we want to update
+ * link_width_active, and the change may not take
+ * effect for some time (if we are in POLL), so this
+ * flag will force the updown routine to be called
+ * on the next ibstatuschange down interrupt, even
+ * if it's not a down->up transition.
+ */
+ val--; /* convert from IB to chip */
+ maskr = IBA7220_IBC_WIDTH_MASK;
+ lsb = IBA7220_IBC_WIDTH_SHIFT;
+ setforce = 1;
+ break;
+
+ case QIB_IB_CFG_SPD_ENB: /* set allowed Link speeds */
+ /*
+ * If we turn off IB1.2, need to preset SerDes defaults,
+ * but not right now. Set a flag for the next time
+ * we command the link down. As with width, only write the
+ * actual register if the link is currently down, otherwise
+ * takes effect on next link change. Since setting is being
+ * explicitly requested (via MAD or sysfs), clear autoneg
+ * failure status if speed autoneg is enabled.
+ */
+ ppd->link_speed_enabled = val;
+ if ((ppd->cpspec->ibcddrctrl & IBA7220_IBC_IBTA_1_2_MASK) &&
+ !(val & (val - 1)))
+ dd->cspec->presets_needed = 1;
+ if (!(ppd->lflags & QIBL_LINKDOWN))
+ goto bail;
+ /*
+ * We set the QIBL_IB_FORCE_NOTIFY bit so updown
+ * will get called because we want to update
+ * link_speed_active, and the change may not take
+ * effect for some time (if we are in POLL), so this
+ * flag will force the updown routine to be called
+ * on the next ibstatuschange down interrupt, even
+ * if it's not a down->up transition.
+ */
+ if (val == (QIB_IB_SDR | QIB_IB_DDR)) {
+ val = IBA7220_IBC_SPEED_AUTONEG_MASK |
+ IBA7220_IBC_IBTA_1_2_MASK;
+ spin_lock_irqsave(&ppd->lflags_lock, flags);
+ ppd->lflags &= ~QIBL_IB_AUTONEG_FAILED;
+ spin_unlock_irqrestore(&ppd->lflags_lock, flags);
+ } else
+ val = val == QIB_IB_DDR ?
+ IBA7220_IBC_SPEED_DDR : IBA7220_IBC_SPEED_SDR; + maskr = IBA7220_IBC_SPEED_AUTONEG_MASK | + IBA7220_IBC_IBTA_1_2_MASK; + /* IBTA 1.2 mode + speed bits are contiguous */ + lsb = SYM_LSB(IBCDDRCtrl, IB_ENHANCED_MODE); + setforce = 1; + break; + + case QIB_IB_CFG_RXPOL_ENB: /* set Auto-RX-polarity enable */ + lsb = IBA7220_IBC_RXPOL_SHIFT; + maskr = IBA7220_IBC_RXPOL_MASK; + break; + + case QIB_IB_CFG_LREV_ENB: /* set Auto-Lane-reversal enable */ + lsb = IBA7220_IBC_LREV_SHIFT; + maskr = IBA7220_IBC_LREV_MASK; + break; + + case QIB_IB_CFG_OVERRUN_THRESH: /* IB overrun threshold */ + maskr = SYM_FIELD(ppd->cpspec->ibcctrl, IBCCtrl, + OverrunThreshold); + if (maskr != val) { + ppd->cpspec->ibcctrl &= + ~SYM_MASK(IBCCtrl, OverrunThreshold); + ppd->cpspec->ibcctrl |= (u64) val << + SYM_LSB(IBCCtrl, OverrunThreshold); + qib_write_kreg(dd, kr_ibcctrl, ppd->cpspec->ibcctrl); + qib_write_kreg(dd, kr_scratch, 0); + } + goto bail; + + case QIB_IB_CFG_PHYERR_THRESH: /* IB PHY error threshold */ + maskr = SYM_FIELD(ppd->cpspec->ibcctrl, IBCCtrl, + PhyerrThreshold); + if (maskr != val) { + ppd->cpspec->ibcctrl &= + ~SYM_MASK(IBCCtrl, PhyerrThreshold); + ppd->cpspec->ibcctrl |= (u64) val << + SYM_LSB(IBCCtrl, PhyerrThreshold); + qib_write_kreg(dd, kr_ibcctrl, ppd->cpspec->ibcctrl); + qib_write_kreg(dd, kr_scratch, 0); + } + goto bail; + + case QIB_IB_CFG_PKEYS: /* update pkeys */ + maskr = (u64) ppd->pkeys[0] | ((u64) ppd->pkeys[1] << 16) | + ((u64) ppd->pkeys[2] << 32) | + ((u64) ppd->pkeys[3] << 48); + qib_write_kreg(dd, kr_partitionkey, maskr); + goto bail; + + case QIB_IB_CFG_LINKDEFAULT: /* IB link default (sleep/poll) */ + /* will only take effect when the link state changes */ + if (val == IB_LINKINITCMD_POLL) + ppd->cpspec->ibcctrl &= + ~SYM_MASK(IBCCtrl, LinkDownDefaultState); + else /* SLEEP */ + ppd->cpspec->ibcctrl |= + SYM_MASK(IBCCtrl, LinkDownDefaultState); + qib_write_kreg(dd, kr_ibcctrl, ppd->cpspec->ibcctrl); + qib_write_kreg(dd, kr_scratch, 0); + goto bail; + + case QIB_IB_CFG_MTU: /* update the MTU in IBC */ + /* + * Update our housekeeping variables, and set IBC max + * size, same as init code; max IBC is max we allow in + * buffer, less the qword pbc, plus 1 for ICRC, in dwords + * Set even if it's unchanged, print debug message only + * on changes. 
+ */
+ val = (ppd->ibmaxlen >> 2) + 1;
+ ppd->cpspec->ibcctrl &= ~SYM_MASK(IBCCtrl, MaxPktLen);
+ ppd->cpspec->ibcctrl |= (u64)val << SYM_LSB(IBCCtrl, MaxPktLen);
+ qib_write_kreg(dd, kr_ibcctrl, ppd->cpspec->ibcctrl);
+ qib_write_kreg(dd, kr_scratch, 0);
+ goto bail;
+
+ case QIB_IB_CFG_LSTATE: /* set the IB link state */
+ switch (val & 0xffff0000) {
+ case IB_LINKCMD_DOWN:
+ lcmd = QLOGIC_IB_IBCC_LINKCMD_DOWN;
+ if (!ppd->cpspec->ibdeltainprog &&
+ qib_compat_ddr_negotiate) {
+ ppd->cpspec->ibdeltainprog = 1;
+ ppd->cpspec->ibsymsnap =
+ read_7220_creg32(dd, cr_ibsymbolerr);
+ ppd->cpspec->iblnkerrsnap =
+ read_7220_creg32(dd, cr_iblinkerrrecov);
+ }
+ break;
+
+ case IB_LINKCMD_ARMED:
+ lcmd = QLOGIC_IB_IBCC_LINKCMD_ARMED;
+ break;
+
+ case IB_LINKCMD_ACTIVE:
+ lcmd = QLOGIC_IB_IBCC_LINKCMD_ACTIVE;
+ break;
+
+ default:
+ ret = -EINVAL;
+ qib_dev_err(dd, "bad linkcmd req 0x%x\n", val >> 16);
+ goto bail;
+ }
+ switch (val & 0xffff) {
+ case IB_LINKINITCMD_NOP:
+ licmd = 0;
+ break;
+
+ case IB_LINKINITCMD_POLL:
+ licmd = QLOGIC_IB_IBCC_LINKINITCMD_POLL;
+ break;
+
+ case IB_LINKINITCMD_SLEEP:
+ licmd = QLOGIC_IB_IBCC_LINKINITCMD_SLEEP;
+ break;
+
+ case IB_LINKINITCMD_DISABLE:
+ licmd = QLOGIC_IB_IBCC_LINKINITCMD_DISABLE;
+ ppd->cpspec->chase_end = 0;
+ /*
+ * stop state chase counter and timer, if running.
+ * wait for pending timer, but don't clear .data (ppd)!
+ */
+ if (ppd->cpspec->chase_timer.expires) {
+ del_timer_sync(&ppd->cpspec->chase_timer);
+ ppd->cpspec->chase_timer.expires = 0;
+ }
+ break;
+
+ default:
+ ret = -EINVAL;
+ qib_dev_err(dd, "bad linkinitcmd req 0x%x\n",
+ val & 0xffff);
+ goto bail;
+ }
+ qib_set_ib_7220_lstate(ppd, lcmd, licmd);
+
+ maskr = IBA7220_IBC_WIDTH_MASK;
+ lsb = IBA7220_IBC_WIDTH_SHIFT;
+ tmp = (ppd->cpspec->ibcddrctrl >> lsb) & maskr;
+ /* If the width active on the chip does not match the
+ * width in the shadow register, write the new active
+ * width to the chip.
+ * We don't have to worry about speed as the speed is taken
+ * care of by set_7220_ibspeed_fast called by ib_updown.
+ */
+ if (ppd->link_width_enabled-1 != tmp) {
+ ppd->cpspec->ibcddrctrl &= ~(maskr << lsb);
+ ppd->cpspec->ibcddrctrl |=
+ (((u64)(ppd->link_width_enabled-1) & maskr) <<
+ lsb);
+ qib_write_kreg(dd, kr_ibcddrctrl,
+ ppd->cpspec->ibcddrctrl);
+ qib_write_kreg(dd, kr_scratch, 0);
+ spin_lock_irqsave(&ppd->lflags_lock, flags);
+ ppd->lflags |= QIBL_IB_FORCE_NOTIFY;
+ spin_unlock_irqrestore(&ppd->lflags_lock, flags);
+ }
+ goto bail;
+
+ case QIB_IB_CFG_HRTBT: /* set Heartbeat off/enable/auto */
+ if (val > IBA7220_IBC_HRTBT_MASK) {
+ ret = -EINVAL;
+ goto bail;
+ }
+ lsb = IBA7220_IBC_HRTBT_SHIFT;
+ maskr = IBA7220_IBC_HRTBT_MASK;
+ break;
+
+ default:
+ ret = -EINVAL;
+ goto bail;
+ }
+ ppd->cpspec->ibcddrctrl &= ~(maskr << lsb);
+ ppd->cpspec->ibcddrctrl |= (((u64) val & maskr) << lsb);
+ qib_write_kreg(dd, kr_ibcddrctrl, ppd->cpspec->ibcddrctrl);
+ qib_write_kreg(dd, kr_scratch, 0);
+ if (setforce) {
+ spin_lock_irqsave(&ppd->lflags_lock, flags);
+ ppd->lflags |= QIBL_IB_FORCE_NOTIFY;
+ spin_unlock_irqrestore(&ppd->lflags_lock, flags);
+ }
+bail:
+ return ret;
+}
+
+static int qib_7220_set_loopback(struct qib_pportdata *ppd, const char *what)
+{
+ int ret = 0;
+ u64 val, ddr;
+
+ if (!strncmp(what, "ibc", 3)) {
+ ppd->cpspec->ibcctrl |= SYM_MASK(IBCCtrl, Loopback);
+ val = 0; /* disable heart beat, so link will come up */
+ qib_devinfo(ppd->dd->pcidev, "Enabling IB%u:%u IBC loopback\n",
+ ppd->dd->unit, ppd->port);
+ } else if (!strncmp(what, "off", 3)) {
+ ppd->cpspec->ibcctrl &= ~SYM_MASK(IBCCtrl, Loopback);
+ /* enable heart beat again */
+ val = IBA7220_IBC_HRTBT_MASK << IBA7220_IBC_HRTBT_SHIFT;
+ qib_devinfo(ppd->dd->pcidev,
+ "Disabling IB%u:%u IBC loopback (normal)\n",
+ ppd->dd->unit, ppd->port);
+ } else
+ ret = -EINVAL;
+ if (!ret) {
+ qib_write_kreg(ppd->dd, kr_ibcctrl, ppd->cpspec->ibcctrl);
+ ddr = ppd->cpspec->ibcddrctrl & ~(IBA7220_IBC_HRTBT_MASK
+ << IBA7220_IBC_HRTBT_SHIFT);
+ ppd->cpspec->ibcddrctrl = ddr | val;
+ qib_write_kreg(ppd->dd, kr_ibcddrctrl,
+ ppd->cpspec->ibcddrctrl);
+ qib_write_kreg(ppd->dd, kr_scratch, 0);
+ }
+ return ret;
+}
+
+static void qib_update_7220_usrhead(struct qib_ctxtdata *rcd, u64 hd,
+ u32 updegr, u32 egrhd, u32 npkts)
+{
+ if (updegr)
+ qib_write_ureg(rcd->dd, ur_rcvegrindexhead, egrhd, rcd->ctxt);
+ mmiowb();
+ qib_write_ureg(rcd->dd, ur_rcvhdrhead, hd, rcd->ctxt);
+ mmiowb();
+}
+
+static u32 qib_7220_hdrqempty(struct qib_ctxtdata *rcd)
+{
+ u32 head, tail;
+
+ head = qib_read_ureg32(rcd->dd, ur_rcvhdrhead, rcd->ctxt);
+ if (rcd->rcvhdrtail_kvaddr)
+ tail = qib_get_rcvhdrtail(rcd);
+ else
+ tail = qib_read_ureg32(rcd->dd, ur_rcvhdrtail, rcd->ctxt);
+ return head == tail;
+}
+
+/*
+ * Modify the RCVCTRL register in chip-specific way. This
+ * is a function because bit positions and (future) register
+ * location is chip-specific, but the needed operations are
+ * generic. The op argument is a bit-mask because we often want to
+ * do multiple modifications.
+ */
+static void rcvctrl_7220_mod(struct qib_pportdata *ppd, unsigned int op,
+ int ctxt)
+{
+ struct qib_devdata *dd = ppd->dd;
+ u64 mask, val;
+ unsigned long flags;
+
+ spin_lock_irqsave(&dd->cspec->rcvmod_lock, flags);
+ if (op & QIB_RCVCTRL_TAILUPD_ENB)
+ dd->rcvctrl |= (1ULL << IBA7220_R_TAILUPD_SHIFT);
+ if (op & QIB_RCVCTRL_TAILUPD_DIS)
+ dd->rcvctrl &= ~(1ULL << IBA7220_R_TAILUPD_SHIFT);
+ if (op & QIB_RCVCTRL_PKEY_ENB)
+ dd->rcvctrl &= ~(1ULL << IBA7220_R_PKEY_DIS_SHIFT);
+ if (op & QIB_RCVCTRL_PKEY_DIS)
+ dd->rcvctrl |= (1ULL << IBA7220_R_PKEY_DIS_SHIFT);
+ if (ctxt < 0)
+ mask = (1ULL << dd->ctxtcnt) - 1;
+ else
+ mask = (1ULL << ctxt);
+ if (op & QIB_RCVCTRL_CTXT_ENB) {
+ /* always done for specific ctxt */
+ dd->rcvctrl |= (mask << SYM_LSB(RcvCtrl, PortEnable));
+ if (!(dd->flags & QIB_NODMA_RTAIL))
+ dd->rcvctrl |= 1ULL << IBA7220_R_TAILUPD_SHIFT;
+ /* Write these registers before the context is enabled. */
+ qib_write_kreg_ctxt(dd, kr_rcvhdrtailaddr, ctxt,
+ dd->rcd[ctxt]->rcvhdrqtailaddr_phys);
+ qib_write_kreg_ctxt(dd, kr_rcvhdraddr, ctxt,
+ dd->rcd[ctxt]->rcvhdrq_phys);
+ dd->rcd[ctxt]->seq_cnt = 1;
+ }
+ if (op & QIB_RCVCTRL_CTXT_DIS)
+ dd->rcvctrl &= ~(mask << SYM_LSB(RcvCtrl, PortEnable));
+ if (op & QIB_RCVCTRL_INTRAVAIL_ENB)
+ dd->rcvctrl |= (mask << IBA7220_R_INTRAVAIL_SHIFT);
+ if (op & QIB_RCVCTRL_INTRAVAIL_DIS)
+ dd->rcvctrl &= ~(mask << IBA7220_R_INTRAVAIL_SHIFT);
+ qib_write_kreg(dd, kr_rcvctrl, dd->rcvctrl);
+ if ((op & QIB_RCVCTRL_INTRAVAIL_ENB) && dd->rhdrhead_intr_off) {
+ /* arm rcv interrupt */
+ val = qib_read_ureg32(dd, ur_rcvhdrhead, ctxt) |
+ dd->rhdrhead_intr_off;
+ qib_write_ureg(dd, ur_rcvhdrhead, val, ctxt);
+ }
+ if (op & QIB_RCVCTRL_CTXT_ENB) {
+ /*
+ * Init the context registers also; if we were
+ * disabled, tail and head should both be zero
+ * already from the enable, but since we don't
+ * know, we have to do it explicitly.
+ */
+ val = qib_read_ureg32(dd, ur_rcvegrindextail, ctxt);
+ qib_write_ureg(dd, ur_rcvegrindexhead, val, ctxt);
+
+ val = qib_read_ureg32(dd, ur_rcvhdrtail, ctxt);
+ dd->rcd[ctxt]->head = val;
+ /* If kctxt, interrupt on next receive. */
+ if (ctxt < dd->first_user_ctxt)
+ val |= dd->rhdrhead_intr_off;
+ qib_write_ureg(dd, ur_rcvhdrhead, val, ctxt);
+ }
+ if (op & QIB_RCVCTRL_CTXT_DIS) {
+ if (ctxt >= 0) {
+ qib_write_kreg_ctxt(dd, kr_rcvhdrtailaddr, ctxt, 0);
+ qib_write_kreg_ctxt(dd, kr_rcvhdraddr, ctxt, 0);
+ } else {
+ unsigned i;
+
+ for (i = 0; i < dd->cfgctxts; i++) {
+ qib_write_kreg_ctxt(dd, kr_rcvhdrtailaddr,
+ i, 0);
+ qib_write_kreg_ctxt(dd, kr_rcvhdraddr, i, 0);
+ }
+ }
+ }
+ spin_unlock_irqrestore(&dd->cspec->rcvmod_lock, flags);
+}
+
+/*
+ * Modify the SENDCTRL register in chip-specific way. This
+ * is a function because there may be multiple such registers with
+ * slightly different layouts. To start, we assume the
+ * "canonical" register layout of the first chips.
+ * Chip requires no back-back sendctrl writes, so write + * scratch register after writing sendctrl + */ +static void sendctrl_7220_mod(struct qib_pportdata *ppd, u32 op) +{ + struct qib_devdata *dd = ppd->dd; + u64 tmp_dd_sendctrl; + unsigned long flags; + + spin_lock_irqsave(&dd->sendctrl_lock, flags); + + /* First the ones that are "sticky", saved in shadow */ + if (op & QIB_SENDCTRL_CLEAR) + dd->sendctrl = 0; + if (op & QIB_SENDCTRL_SEND_DIS) + dd->sendctrl &= ~SYM_MASK(SendCtrl, SPioEnable); + else if (op & QIB_SENDCTRL_SEND_ENB) { + dd->sendctrl |= SYM_MASK(SendCtrl, SPioEnable); + if (dd->flags & QIB_USE_SPCL_TRIG) + dd->sendctrl |= SYM_MASK(SendCtrl, + SSpecialTriggerEn); + } + if (op & QIB_SENDCTRL_AVAIL_DIS) + dd->sendctrl &= ~SYM_MASK(SendCtrl, SendBufAvailUpd); + else if (op & QIB_SENDCTRL_AVAIL_ENB) + dd->sendctrl |= SYM_MASK(SendCtrl, SendBufAvailUpd); + + if (op & QIB_SENDCTRL_DISARM_ALL) { + u32 i, last; + + tmp_dd_sendctrl = dd->sendctrl; + /* + * disarm any that are not yet launched, disabling sends + * and updates until done. + */ + last = dd->piobcnt2k + dd->piobcnt4k; + tmp_dd_sendctrl &= + ~(SYM_MASK(SendCtrl, SPioEnable) | + SYM_MASK(SendCtrl, SendBufAvailUpd)); + for (i = 0; i < last; i++) { + qib_write_kreg(dd, kr_sendctrl, + tmp_dd_sendctrl | + SYM_MASK(SendCtrl, Disarm) | i); + qib_write_kreg(dd, kr_scratch, 0); + } + } + + tmp_dd_sendctrl = dd->sendctrl; + + if (op & QIB_SENDCTRL_FLUSH) + tmp_dd_sendctrl |= SYM_MASK(SendCtrl, Abort); + if (op & QIB_SENDCTRL_DISARM) + tmp_dd_sendctrl |= SYM_MASK(SendCtrl, Disarm) | + ((op & QIB_7220_SendCtrl_DisarmPIOBuf_RMASK) << + SYM_LSB(SendCtrl, DisarmPIOBuf)); + if ((op & QIB_SENDCTRL_AVAIL_BLIP) && + (dd->sendctrl & SYM_MASK(SendCtrl, SendBufAvailUpd))) + tmp_dd_sendctrl &= ~SYM_MASK(SendCtrl, SendBufAvailUpd); + + qib_write_kreg(dd, kr_sendctrl, tmp_dd_sendctrl); + qib_write_kreg(dd, kr_scratch, 0); + + if (op & QIB_SENDCTRL_AVAIL_BLIP) { + qib_write_kreg(dd, kr_sendctrl, dd->sendctrl); + qib_write_kreg(dd, kr_scratch, 0); + } + + spin_unlock_irqrestore(&dd->sendctrl_lock, flags); + + if (op & QIB_SENDCTRL_FLUSH) { + u32 v; + /* + * ensure writes have hit chip, then do a few + * more reads, to allow DMA of pioavail registers + * to occur, so in-memory copy is in sync with + * the chip. Not always safe to sleep. 
+ */ + v = qib_read_kreg32(dd, kr_scratch); + qib_write_kreg(dd, kr_scratch, v); + v = qib_read_kreg32(dd, kr_scratch); + qib_write_kreg(dd, kr_scratch, v); + qib_read_kreg32(dd, kr_scratch); + } +} + +/** + * qib_portcntr_7220 - read a per-port counter + * @dd: the qlogic_ib device + * @creg: the counter to snapshot + */ +static u64 qib_portcntr_7220(struct qib_pportdata *ppd, u32 reg) +{ + u64 ret = 0ULL; + struct qib_devdata *dd = ppd->dd; + u16 creg; + /* 0xffff for unimplemented or synthesized counters */ + static const u16 xlator[] = { + [QIBPORTCNTR_PKTSEND] = cr_pktsend, + [QIBPORTCNTR_WORDSEND] = cr_wordsend, + [QIBPORTCNTR_PSXMITDATA] = cr_psxmitdatacount, + [QIBPORTCNTR_PSXMITPKTS] = cr_psxmitpktscount, + [QIBPORTCNTR_PSXMITWAIT] = cr_psxmitwaitcount, + [QIBPORTCNTR_SENDSTALL] = cr_sendstall, + [QIBPORTCNTR_PKTRCV] = cr_pktrcv, + [QIBPORTCNTR_PSRCVDATA] = cr_psrcvdatacount, + [QIBPORTCNTR_PSRCVPKTS] = cr_psrcvpktscount, + [QIBPORTCNTR_RCVEBP] = cr_rcvebp, + [QIBPORTCNTR_RCVOVFL] = cr_rcvovfl, + [QIBPORTCNTR_WORDRCV] = cr_wordrcv, + [QIBPORTCNTR_RXDROPPKT] = cr_rxdroppkt, + [QIBPORTCNTR_RXLOCALPHYERR] = cr_rxotherlocalphyerr, + [QIBPORTCNTR_RXVLERR] = cr_rxvlerr, + [QIBPORTCNTR_ERRICRC] = cr_erricrc, + [QIBPORTCNTR_ERRVCRC] = cr_errvcrc, + [QIBPORTCNTR_ERRLPCRC] = cr_errlpcrc, + [QIBPORTCNTR_BADFORMAT] = cr_badformat, + [QIBPORTCNTR_ERR_RLEN] = cr_err_rlen, + [QIBPORTCNTR_IBSYMBOLERR] = cr_ibsymbolerr, + [QIBPORTCNTR_INVALIDRLEN] = cr_invalidrlen, + [QIBPORTCNTR_UNSUPVL] = cr_txunsupvl, + [QIBPORTCNTR_EXCESSBUFOVFL] = cr_excessbufferovfl, + [QIBPORTCNTR_ERRLINK] = cr_errlink, + [QIBPORTCNTR_IBLINKDOWN] = cr_iblinkdown, + [QIBPORTCNTR_IBLINKERRRECOV] = cr_iblinkerrrecov, + [QIBPORTCNTR_LLI] = cr_locallinkintegrityerr, + [QIBPORTCNTR_PSINTERVAL] = cr_psinterval, + [QIBPORTCNTR_PSSTART] = cr_psstart, + [QIBPORTCNTR_PSSTAT] = cr_psstat, + [QIBPORTCNTR_VL15PKTDROP] = cr_vl15droppedpkt, + [QIBPORTCNTR_ERRPKEY] = cr_errpkey, + [QIBPORTCNTR_KHDROVFL] = 0xffff, + }; + + if (reg >= ARRAY_SIZE(xlator)) { + qib_devinfo(ppd->dd->pcidev, + "Unimplemented portcounter %u\n", reg); + goto done; + } + creg = xlator[reg]; + + if (reg == QIBPORTCNTR_KHDROVFL) { + int i; + + /* sum over all kernel contexts */ + for (i = 0; i < dd->first_user_ctxt; i++) + ret += read_7220_creg32(dd, cr_portovfl + i); + } + if (creg == 0xffff) + goto done; + + /* + * only fast incrementing counters are 64bit; use 32 bit reads to + * avoid two independent reads when on opteron + */ + if ((creg == cr_wordsend || creg == cr_wordrcv || + creg == cr_pktsend || creg == cr_pktrcv)) + ret = read_7220_creg(dd, creg); + else + ret = read_7220_creg32(dd, creg); + if (creg == cr_ibsymbolerr) { + if (dd->pport->cpspec->ibdeltainprog) + ret -= ret - ppd->cpspec->ibsymsnap; + ret -= dd->pport->cpspec->ibsymdelta; + } else if (creg == cr_iblinkerrrecov) { + if (dd->pport->cpspec->ibdeltainprog) + ret -= ret - ppd->cpspec->iblnkerrsnap; + ret -= dd->pport->cpspec->iblnkerrdelta; + } +done: + return ret; +} + +/* + * Device counter names (not port-specific), one line per stat, + * single string. Used by utilities like ipathstats to print the stats + * in a way which works for different versions of drivers, without changing + * the utility. Names need to be 12 chars or less (w/o newline), for proper + * display by utility. + * Non-error counters are first. + * Start of "error" conters is indicated by a leading "E " on the first + * "error" counter, and doesn't count in label length. 
+ * The EgrOvfl list needs to be last so we truncate them at the configured + * context count for the device. + * cntr7220indices contains the corresponding register indices. + */ +static const char cntr7220names[] = + "Interrupts\n" + "HostBusStall\n" + "E RxTIDFull\n" + "RxTIDInvalid\n" + "Ctxt0EgrOvfl\n" + "Ctxt1EgrOvfl\n" + "Ctxt2EgrOvfl\n" + "Ctxt3EgrOvfl\n" + "Ctxt4EgrOvfl\n" + "Ctxt5EgrOvfl\n" + "Ctxt6EgrOvfl\n" + "Ctxt7EgrOvfl\n" + "Ctxt8EgrOvfl\n" + "Ctxt9EgrOvfl\n" + "Ctx10EgrOvfl\n" + "Ctx11EgrOvfl\n" + "Ctx12EgrOvfl\n" + "Ctx13EgrOvfl\n" + "Ctx14EgrOvfl\n" + "Ctx15EgrOvfl\n" + "Ctx16EgrOvfl\n"; + +static const size_t cntr7220indices[] = { + cr_lbint, + cr_lbflowstall, + cr_errtidfull, + cr_errtidvalid, + cr_portovfl + 0, + cr_portovfl + 1, + cr_portovfl + 2, + cr_portovfl + 3, + cr_portovfl + 4, + cr_portovfl + 5, + cr_portovfl + 6, + cr_portovfl + 7, + cr_portovfl + 8, + cr_portovfl + 9, + cr_portovfl + 10, + cr_portovfl + 11, + cr_portovfl + 12, + cr_portovfl + 13, + cr_portovfl + 14, + cr_portovfl + 15, + cr_portovfl + 16, +}; + +/* + * same as cntr7220names and cntr7220indices, but for port-specific counters. + * portcntr7220indices is somewhat complicated by some registers needing + * adjustments of various kinds, and those are ORed with _PORT_VIRT_FLAG + */ +static const char portcntr7220names[] = + "TxPkt\n" + "TxFlowPkt\n" + "TxWords\n" + "RxPkt\n" + "RxFlowPkt\n" + "RxWords\n" + "TxFlowStall\n" + "TxDmaDesc\n" /* 7220 and 7322-only */ + "E RxDlidFltr\n" /* 7220 and 7322-only */ + "IBStatusChng\n" + "IBLinkDown\n" + "IBLnkRecov\n" + "IBRxLinkErr\n" + "IBSymbolErr\n" + "RxLLIErr\n" + "RxBadFormat\n" + "RxBadLen\n" + "RxBufOvrfl\n" + "RxEBP\n" + "RxFlowCtlErr\n" + "RxICRCerr\n" + "RxLPCRCerr\n" + "RxVCRCerr\n" + "RxInvalLen\n" + "RxInvalPKey\n" + "RxPktDropped\n" + "TxBadLength\n" + "TxDropped\n" + "TxInvalLen\n" + "TxUnderrun\n" + "TxUnsupVL\n" + "RxLclPhyErr\n" /* 7220 and 7322-only */ + "RxVL15Drop\n" /* 7220 and 7322-only */ + "RxVlErr\n" /* 7220 and 7322-only */ + "XcessBufOvfl\n" /* 7220 and 7322-only */ + ; + +#define _PORT_VIRT_FLAG 0x8000 /* "virtual", need adjustments */ +static const size_t portcntr7220indices[] = { + QIBPORTCNTR_PKTSEND | _PORT_VIRT_FLAG, + cr_pktsendflow, + QIBPORTCNTR_WORDSEND | _PORT_VIRT_FLAG, + QIBPORTCNTR_PKTRCV | _PORT_VIRT_FLAG, + cr_pktrcvflowctrl, + QIBPORTCNTR_WORDRCV | _PORT_VIRT_FLAG, + QIBPORTCNTR_SENDSTALL | _PORT_VIRT_FLAG, + cr_txsdmadesc, + cr_rxdlidfltr, + cr_ibstatuschange, + QIBPORTCNTR_IBLINKDOWN | _PORT_VIRT_FLAG, + QIBPORTCNTR_IBLINKERRRECOV | _PORT_VIRT_FLAG, + QIBPORTCNTR_ERRLINK | _PORT_VIRT_FLAG, + QIBPORTCNTR_IBSYMBOLERR | _PORT_VIRT_FLAG, + QIBPORTCNTR_LLI | _PORT_VIRT_FLAG, + QIBPORTCNTR_BADFORMAT | _PORT_VIRT_FLAG, + QIBPORTCNTR_ERR_RLEN | _PORT_VIRT_FLAG, + QIBPORTCNTR_RCVOVFL | _PORT_VIRT_FLAG, + QIBPORTCNTR_RCVEBP | _PORT_VIRT_FLAG, + cr_rcvflowctrl_err, + QIBPORTCNTR_ERRICRC | _PORT_VIRT_FLAG, + QIBPORTCNTR_ERRLPCRC | _PORT_VIRT_FLAG, + QIBPORTCNTR_ERRVCRC | _PORT_VIRT_FLAG, + QIBPORTCNTR_INVALIDRLEN | _PORT_VIRT_FLAG, + QIBPORTCNTR_ERRPKEY | _PORT_VIRT_FLAG, + QIBPORTCNTR_RXDROPPKT | _PORT_VIRT_FLAG, + cr_invalidslen, + cr_senddropped, + cr_errslen, + cr_sendunderrun, + cr_txunsupvl, + QIBPORTCNTR_RXLOCALPHYERR | _PORT_VIRT_FLAG, + QIBPORTCNTR_VL15PKTDROP | _PORT_VIRT_FLAG, + QIBPORTCNTR_RXVLERR | _PORT_VIRT_FLAG, + QIBPORTCNTR_EXCESSBUFOVFL | _PORT_VIRT_FLAG, +}; + +/* do all the setup to make the counter reads efficient later */ +static void init_7220_cntrnames(struct qib_devdata *dd) +{ + int i, j = 0; + 
char *s; + + for (i = 0, s = (char *)cntr7220names; s && j <= dd->cfgctxts; + i++) { + /* we always have at least one counter before the egrovfl */ + if (!j && !strncmp("Ctxt0EgrOvfl", s + 1, 12)) + j = 1; + s = strchr(s + 1, '\n'); + if (s && j) + j++; + } + dd->cspec->ncntrs = i; + if (!s) + /* full list; size is without terminating null */ + dd->cspec->cntrnamelen = sizeof(cntr7220names) - 1; + else + dd->cspec->cntrnamelen = 1 + s - cntr7220names; + dd->cspec->cntrs = kmalloc(dd->cspec->ncntrs + * sizeof(u64), GFP_KERNEL); + if (!dd->cspec->cntrs) + qib_dev_err(dd, "Failed allocation for counters\n"); + + for (i = 0, s = (char *)portcntr7220names; s; i++) + s = strchr(s + 1, '\n'); + dd->cspec->nportcntrs = i - 1; + dd->cspec->portcntrnamelen = sizeof(portcntr7220names) - 1; + dd->cspec->portcntrs = kmalloc(dd->cspec->nportcntrs + * sizeof(u64), GFP_KERNEL); + if (!dd->cspec->portcntrs) + qib_dev_err(dd, "Failed allocation for portcounters\n"); +} + +static u32 qib_read_7220cntrs(struct qib_devdata *dd, loff_t pos, char **namep, + u64 **cntrp) +{ + u32 ret; + + if (!dd->cspec->cntrs) { + ret = 0; + goto done; + } + + if (namep) { + *namep = (char *)cntr7220names; + ret = dd->cspec->cntrnamelen; + if (pos >= ret) + ret = 0; /* final read after getting everything */ + } else { + u64 *cntr = dd->cspec->cntrs; + int i; + + ret = dd->cspec->ncntrs * sizeof(u64); + if (!cntr || pos >= ret) { + /* everything read, or couldn't get memory */ + ret = 0; + goto done; + } + + *cntrp = cntr; + for (i = 0; i < dd->cspec->ncntrs; i++) + *cntr++ = read_7220_creg32(dd, cntr7220indices[i]); + } +done: + return ret; +} + +static u32 qib_read_7220portcntrs(struct qib_devdata *dd, loff_t pos, u32 port, + char **namep, u64 **cntrp) +{ + u32 ret; + + if (!dd->cspec->portcntrs) { + ret = 0; + goto done; + } + if (namep) { + *namep = (char *)portcntr7220names; + ret = dd->cspec->portcntrnamelen; + if (pos >= ret) + ret = 0; /* final read after getting everything */ + } else { + u64 *cntr = dd->cspec->portcntrs; + struct qib_pportdata *ppd = &dd->pport[port]; + int i; + + ret = dd->cspec->nportcntrs * sizeof(u64); + if (!cntr || pos >= ret) { + /* everything read, or couldn't get memory */ + ret = 0; + goto done; + } + *cntrp = cntr; + for (i = 0; i < dd->cspec->nportcntrs; i++) { + if (portcntr7220indices[i] & _PORT_VIRT_FLAG) + *cntr++ = qib_portcntr_7220(ppd, + portcntr7220indices[i] & + ~_PORT_VIRT_FLAG); + else + *cntr++ = read_7220_creg32(dd, + portcntr7220indices[i]); + } + } +done: + return ret; +} + +/** + * qib_get_7220_faststats - get word counters from chip before they overflow + * @opaque - contains a pointer to the qlogic_ib device qib_devdata + * + * This needs more work; in particular, decision on whether we really + * need traffic_wds done the way it is + * called from add_timer + */ +static void qib_get_7220_faststats(unsigned long opaque) +{ + struct qib_devdata *dd = (struct qib_devdata *) opaque; + struct qib_pportdata *ppd = dd->pport; + unsigned long flags; + u64 traffic_wds; + + /* + * don't access the chip while running diags, or memory diags can + * fail + */ + if (!(dd->flags & QIB_INITTED) || dd->diag_client) + /* but re-arm the timer, for diags case; won't hurt other */ + goto done; + + /* + * We now try to maintain an activity timer, based on traffic + * exceeding a threshold, so we need to check the word-counts + * even if they are 64-bit. 
+ */ + traffic_wds = qib_portcntr_7220(ppd, cr_wordsend) + + qib_portcntr_7220(ppd, cr_wordrcv); + spin_lock_irqsave(&dd->eep_st_lock, flags); + traffic_wds -= dd->traffic_wds; + dd->traffic_wds += traffic_wds; + spin_unlock_irqrestore(&dd->eep_st_lock, flags); +done: + mod_timer(&dd->stats_timer, jiffies + HZ * ACTIVITY_TIMER); +} + +/* + * If we are using MSI, try to fallback to INTx. + */ +static int qib_7220_intr_fallback(struct qib_devdata *dd) +{ + if (!dd->msi_lo) + return 0; + + qib_devinfo(dd->pcidev, + "MSI interrupt not detected, trying INTx interrupts\n"); + qib_7220_free_irq(dd); + qib_enable_intx(dd->pcidev); + /* + * Some newer kernels require free_irq before disable_msi, + * and irq can be changed during disable and INTx enable + * and we need to therefore use the pcidev->irq value, + * not our saved MSI value. + */ + dd->cspec->irq = dd->pcidev->irq; + qib_setup_7220_interrupt(dd); + return 1; +} + +/* + * Reset the XGXS (between serdes and IBC). Slightly less intrusive + * than resetting the IBC or external link state, and useful in some + * cases to cause some retraining. To do this right, we reset IBC + * as well. + */ +static void qib_7220_xgxs_reset(struct qib_pportdata *ppd) +{ + u64 val, prev_val; + struct qib_devdata *dd = ppd->dd; + + prev_val = qib_read_kreg64(dd, kr_xgxs_cfg); + val = prev_val | QLOGIC_IB_XGXS_RESET; + prev_val &= ~QLOGIC_IB_XGXS_RESET; /* be sure */ + qib_write_kreg(dd, kr_control, + dd->control & ~QLOGIC_IB_C_LINKENABLE); + qib_write_kreg(dd, kr_xgxs_cfg, val); + qib_read_kreg32(dd, kr_scratch); + qib_write_kreg(dd, kr_xgxs_cfg, prev_val); + qib_write_kreg(dd, kr_control, dd->control); +} + +/* + * For this chip, we want to use the same buffer every time + * when we are trying to bring the link up (they are always VL15 + * packets). At that link state the packet should always go out immediately + * (or at least be discarded at the tx interface if the link is down). + * If it doesn't, and the buffer isn't available, that means some other + * sender has gotten ahead of us, and is preventing our packet from going + * out. In that case, we flush all packets, and try again. If that still + * fails, we fail the request, and hope things work the next time around. + * + * We don't need very complicated heuristics on whether the packet had + * time to go out or not, since even at SDR 1X, it goes out in very short + * time periods, covered by the chip reads done here and as part of the + * flush. + */ +static u32 __iomem *get_7220_link_buf(struct qib_pportdata *ppd, u32 *bnum) +{ + u32 __iomem *buf; + u32 lbuf = ppd->dd->cspec->lastbuf_for_pio; + int do_cleanup; + unsigned long flags; + + /* + * always blip to get avail list updated, since it's almost + * always needed, and is fairly cheap. 
+ */ + sendctrl_7220_mod(ppd->dd->pport, QIB_SENDCTRL_AVAIL_BLIP); + qib_read_kreg64(ppd->dd, kr_scratch); /* extra chip flush */ + buf = qib_getsendbuf_range(ppd->dd, bnum, lbuf, lbuf); + if (buf) + goto done; + + spin_lock_irqsave(&ppd->sdma_lock, flags); + if (ppd->sdma_state.current_state == qib_sdma_state_s20_idle && + ppd->sdma_state.current_state != qib_sdma_state_s00_hw_down) { + __qib_sdma_process_event(ppd, qib_sdma_event_e00_go_hw_down); + do_cleanup = 0; + } else { + do_cleanup = 1; + qib_7220_sdma_hw_clean_up(ppd); + } + spin_unlock_irqrestore(&ppd->sdma_lock, flags); + + if (do_cleanup) { + qib_read_kreg64(ppd->dd, kr_scratch); /* extra chip flush */ + buf = qib_getsendbuf_range(ppd->dd, bnum, lbuf, lbuf); + } +done: + return buf; +} + +/* + * This code for non-IBTA-compliant IB speed negotiation is only known to + * work for the SDR to DDR transition, and only between an HCA and a switch + * with recent firmware. It is based on observed heuristics, rather than + * actual knowledge of the non-compliant speed negotiation. + * It has a number of hard-coded fields, since the hope is to rewrite this + * when a spec is available on how the negoation is intended to work. + */ +static void autoneg_7220_sendpkt(struct qib_pportdata *ppd, u32 *hdr, + u32 dcnt, u32 *data) +{ + int i; + u64 pbc; + u32 __iomem *piobuf; + u32 pnum; + struct qib_devdata *dd = ppd->dd; + + i = 0; + pbc = 7 + dcnt + 1; /* 7 dword header, dword data, icrc */ + pbc |= PBC_7220_VL15_SEND; + while (!(piobuf = get_7220_link_buf(ppd, &pnum))) { + if (i++ > 5) + return; + udelay(2); + } + sendctrl_7220_mod(dd->pport, QIB_SENDCTRL_DISARM_BUF(pnum)); + writeq(pbc, piobuf); + qib_flush_wc(); + qib_pio_copy(piobuf + 2, hdr, 7); + qib_pio_copy(piobuf + 9, data, dcnt); + if (dd->flags & QIB_USE_SPCL_TRIG) { + u32 spcl_off = (pnum >= dd->piobcnt2k) ? 2047 : 1023; + + qib_flush_wc(); + __raw_writel(0xaebecede, piobuf + spcl_off); + } + qib_flush_wc(); + qib_sendbuf_done(dd, pnum); +} + +/* + * _start packet gets sent twice at start, _done gets sent twice at end + */ +static void autoneg_7220_send(struct qib_pportdata *ppd, int which) +{ + struct qib_devdata *dd = ppd->dd; + static u32 swapped; + u32 dw, i, hcnt, dcnt, *data; + static u32 hdr[7] = { 0xf002ffff, 0x48ffff, 0x6400abba }; + static u32 madpayload_start[0x40] = { + 0x1810103, 0x1, 0x0, 0x0, 0x2c90000, 0x2c9, 0x0, 0x0, + 0xffffffff, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, + 0x1, 0x1388, 0x15e, 0x1, /* rest 0's */ + }; + static u32 madpayload_done[0x40] = { + 0x1810103, 0x1, 0x0, 0x0, 0x2c90000, 0x2c9, 0x0, 0x0, + 0xffffffff, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, + 0x40000001, 0x1388, 0x15e, /* rest 0's */ + }; + + dcnt = ARRAY_SIZE(madpayload_start); + hcnt = ARRAY_SIZE(hdr); + if (!swapped) { + /* for maintainability, do it at runtime */ + for (i = 0; i < hcnt; i++) { + dw = (__force u32) cpu_to_be32(hdr[i]); + hdr[i] = dw; + } + for (i = 0; i < dcnt; i++) { + dw = (__force u32) cpu_to_be32(madpayload_start[i]); + madpayload_start[i] = dw; + dw = (__force u32) cpu_to_be32(madpayload_done[i]); + madpayload_done[i] = dw; + } + swapped = 1; + } + + data = which ? madpayload_done : madpayload_start; + + autoneg_7220_sendpkt(ppd, hdr, dcnt, data); + qib_read_kreg64(dd, kr_scratch); + udelay(2); + autoneg_7220_sendpkt(ppd, hdr, dcnt, data); + qib_read_kreg64(dd, kr_scratch); + udelay(2); +} + +/* + * Do the absolute minimum to cause an IB speed change, and make it + * ready, but don't actually trigger the change. 
The caller will + * do that when ready (if link is in Polling training state, it will + * happen immediately, otherwise when link next goes down) + * + * This routine should only be used as part of the DDR autonegotation + * code for devices that are not compliant with IB 1.2 (or code that + * fixes things up for same). + * + * When link has gone down, and autoneg enabled, or autoneg has + * failed and we give up until next time we set both speeds, and + * then we want IBTA enabled as well as "use max enabled speed. + */ +static void set_7220_ibspeed_fast(struct qib_pportdata *ppd, u32 speed) +{ + ppd->cpspec->ibcddrctrl &= ~(IBA7220_IBC_SPEED_AUTONEG_MASK | + IBA7220_IBC_IBTA_1_2_MASK); + + if (speed == (QIB_IB_SDR | QIB_IB_DDR)) + ppd->cpspec->ibcddrctrl |= IBA7220_IBC_SPEED_AUTONEG_MASK | + IBA7220_IBC_IBTA_1_2_MASK; + else + ppd->cpspec->ibcddrctrl |= speed == QIB_IB_DDR ? + IBA7220_IBC_SPEED_DDR : IBA7220_IBC_SPEED_SDR; + + qib_write_kreg(ppd->dd, kr_ibcddrctrl, ppd->cpspec->ibcddrctrl); + qib_write_kreg(ppd->dd, kr_scratch, 0); +} + +/* + * This routine is only used when we are not talking to another + * IB 1.2-compliant device that we think can do DDR. + * (This includes all existing switch chips as of Oct 2007.) + * 1.2-compliant devices go directly to DDR prior to reaching INIT + */ +static void try_7220_autoneg(struct qib_pportdata *ppd) +{ + unsigned long flags; + + /* + * Required for older non-IB1.2 DDR switches. Newer + * non-IB-compliant switches don't need it, but so far, + * aren't bothered by it either. "Magic constant" + */ + qib_write_kreg(ppd->dd, kr_ncmodectrl, 0x3b9dc07); + + spin_lock_irqsave(&ppd->lflags_lock, flags); + ppd->lflags |= QIBL_IB_AUTONEG_INPROG; + spin_unlock_irqrestore(&ppd->lflags_lock, flags); + autoneg_7220_send(ppd, 0); + set_7220_ibspeed_fast(ppd, QIB_IB_DDR); + + toggle_7220_rclkrls(ppd->dd); + /* 2 msec is minimum length of a poll cycle */ + queue_delayed_work(ib_wq, &ppd->cpspec->autoneg_work, + msecs_to_jiffies(2)); +} + +/* + * Handle the empirically determined mechanism for auto-negotiation + * of DDR speed with switches. + */ +static void autoneg_7220_work(struct work_struct *work) +{ + struct qib_pportdata *ppd; + struct qib_devdata *dd; + u64 startms; + u32 i; + unsigned long flags; + + ppd = &container_of(work, struct qib_chippport_specific, + autoneg_work.work)->pportdata; + dd = ppd->dd; + + startms = jiffies_to_msecs(jiffies); + + /* + * Busy wait for this first part, it should be at most a + * few hundred usec, since we scheduled ourselves for 2msec. + */ + for (i = 0; i < 25; i++) { + if (SYM_FIELD(ppd->lastibcstat, IBCStatus, LinkTrainingState) + == IB_7220_LT_STATE_POLLQUIET) { + qib_set_linkstate(ppd, QIB_IB_LINKDOWN_DISABLE); + break; + } + udelay(100); + } + + if (!(ppd->lflags & QIBL_IB_AUTONEG_INPROG)) + goto done; /* we got there early or told to stop */ + + /* we expect this to timeout */ + if (wait_event_timeout(ppd->cpspec->autoneg_wait, + !(ppd->lflags & QIBL_IB_AUTONEG_INPROG), + msecs_to_jiffies(90))) + goto done; + + toggle_7220_rclkrls(dd); + + /* we expect this to timeout */ + if (wait_event_timeout(ppd->cpspec->autoneg_wait, + !(ppd->lflags & QIBL_IB_AUTONEG_INPROG), + msecs_to_jiffies(1700))) + goto done; + + set_7220_ibspeed_fast(ppd, QIB_IB_SDR); + toggle_7220_rclkrls(dd); + + /* + * Wait up to 250 msec for link to train and get to INIT at DDR; + * this should terminate early. 
+ */ + wait_event_timeout(ppd->cpspec->autoneg_wait, + !(ppd->lflags & QIBL_IB_AUTONEG_INPROG), + msecs_to_jiffies(250)); +done: + if (ppd->lflags & QIBL_IB_AUTONEG_INPROG) { + spin_lock_irqsave(&ppd->lflags_lock, flags); + ppd->lflags &= ~QIBL_IB_AUTONEG_INPROG; + if (dd->cspec->autoneg_tries == AUTONEG_TRIES) { + ppd->lflags |= QIBL_IB_AUTONEG_FAILED; + dd->cspec->autoneg_tries = 0; + } + spin_unlock_irqrestore(&ppd->lflags_lock, flags); + set_7220_ibspeed_fast(ppd, ppd->link_speed_enabled); + } +} + +static u32 qib_7220_iblink_state(u64 ibcs) +{ + u32 state = (u32)SYM_FIELD(ibcs, IBCStatus, LinkState); + + switch (state) { + case IB_7220_L_STATE_INIT: + state = IB_PORT_INIT; + break; + case IB_7220_L_STATE_ARM: + state = IB_PORT_ARMED; + break; + case IB_7220_L_STATE_ACTIVE: + /* fall through */ + case IB_7220_L_STATE_ACT_DEFER: + state = IB_PORT_ACTIVE; + break; + default: /* fall through */ + case IB_7220_L_STATE_DOWN: + state = IB_PORT_DOWN; + break; + } + return state; +} + +/* returns the IBTA port state, rather than the IBC link training state */ +static u8 qib_7220_phys_portstate(u64 ibcs) +{ + u8 state = (u8)SYM_FIELD(ibcs, IBCStatus, LinkTrainingState); + return qib_7220_physportstate[state]; +} + +static int qib_7220_ib_updown(struct qib_pportdata *ppd, int ibup, u64 ibcs) +{ + int ret = 0, symadj = 0; + struct qib_devdata *dd = ppd->dd; + unsigned long flags; + + spin_lock_irqsave(&ppd->lflags_lock, flags); + ppd->lflags &= ~QIBL_IB_FORCE_NOTIFY; + spin_unlock_irqrestore(&ppd->lflags_lock, flags); + + if (!ibup) { + /* + * When the link goes down we don't want AEQ running, so it + * won't interfere with IBC training, etc., and we need + * to go back to the static SerDes preset values. + */ + if (!(ppd->lflags & (QIBL_IB_AUTONEG_FAILED | + QIBL_IB_AUTONEG_INPROG))) + set_7220_ibspeed_fast(ppd, ppd->link_speed_enabled); + if (!(ppd->lflags & QIBL_IB_AUTONEG_INPROG)) { + qib_sd7220_presets(dd); + qib_cancel_sends(ppd); /* initial disarm, etc. 
*/ + spin_lock_irqsave(&ppd->sdma_lock, flags); + if (__qib_sdma_running(ppd)) + __qib_sdma_process_event(ppd, + qib_sdma_event_e70_go_idle); + spin_unlock_irqrestore(&ppd->sdma_lock, flags); + } + /* this might better in qib_sd7220_presets() */ + set_7220_relock_poll(dd, ibup); + } else { + if (qib_compat_ddr_negotiate && + !(ppd->lflags & (QIBL_IB_AUTONEG_FAILED | + QIBL_IB_AUTONEG_INPROG)) && + ppd->link_speed_active == QIB_IB_SDR && + (ppd->link_speed_enabled & (QIB_IB_DDR | QIB_IB_SDR)) == + (QIB_IB_DDR | QIB_IB_SDR) && + dd->cspec->autoneg_tries < AUTONEG_TRIES) { + /* we are SDR, and DDR auto-negotiation enabled */ + ++dd->cspec->autoneg_tries; + if (!ppd->cpspec->ibdeltainprog) { + ppd->cpspec->ibdeltainprog = 1; + ppd->cpspec->ibsymsnap = read_7220_creg32(dd, + cr_ibsymbolerr); + ppd->cpspec->iblnkerrsnap = read_7220_creg32(dd, + cr_iblinkerrrecov); + } + try_7220_autoneg(ppd); + ret = 1; /* no other IB status change processing */ + } else if ((ppd->lflags & QIBL_IB_AUTONEG_INPROG) && + ppd->link_speed_active == QIB_IB_SDR) { + autoneg_7220_send(ppd, 1); + set_7220_ibspeed_fast(ppd, QIB_IB_DDR); + udelay(2); + toggle_7220_rclkrls(dd); + ret = 1; /* no other IB status change processing */ + } else { + if ((ppd->lflags & QIBL_IB_AUTONEG_INPROG) && + (ppd->link_speed_active & QIB_IB_DDR)) { + spin_lock_irqsave(&ppd->lflags_lock, flags); + ppd->lflags &= ~(QIBL_IB_AUTONEG_INPROG | + QIBL_IB_AUTONEG_FAILED); + spin_unlock_irqrestore(&ppd->lflags_lock, + flags); + dd->cspec->autoneg_tries = 0; + /* re-enable SDR, for next link down */ + set_7220_ibspeed_fast(ppd, + ppd->link_speed_enabled); + wake_up(&ppd->cpspec->autoneg_wait); + symadj = 1; + } else if (ppd->lflags & QIBL_IB_AUTONEG_FAILED) { + /* + * Clear autoneg failure flag, and do setup + * so we'll try next time link goes down and + * back to INIT (possibly connected to a + * different device). + */ + spin_lock_irqsave(&ppd->lflags_lock, flags); + ppd->lflags &= ~QIBL_IB_AUTONEG_FAILED; + spin_unlock_irqrestore(&ppd->lflags_lock, + flags); + ppd->cpspec->ibcddrctrl |= + IBA7220_IBC_IBTA_1_2_MASK; + qib_write_kreg(dd, kr_ncmodectrl, 0); + symadj = 1; + } + } + + if (!(ppd->lflags & QIBL_IB_AUTONEG_INPROG)) + symadj = 1; + + if (!ret) { + ppd->delay_mult = rate_to_delay + [(ibcs >> IBA7220_LINKSPEED_SHIFT) & 1] + [(ibcs >> IBA7220_LINKWIDTH_SHIFT) & 1]; + + set_7220_relock_poll(dd, ibup); + spin_lock_irqsave(&ppd->sdma_lock, flags); + /* + * Unlike 7322, the 7220 needs this, due to lack of + * interrupt in some cases when we have sdma active + * when the link goes down. 
+ */ + if (ppd->sdma_state.current_state != + qib_sdma_state_s20_idle) + __qib_sdma_process_event(ppd, + qib_sdma_event_e00_go_hw_down); + spin_unlock_irqrestore(&ppd->sdma_lock, flags); + } + } + + if (symadj) { + if (ppd->cpspec->ibdeltainprog) { + ppd->cpspec->ibdeltainprog = 0; + ppd->cpspec->ibsymdelta += read_7220_creg32(ppd->dd, + cr_ibsymbolerr) - ppd->cpspec->ibsymsnap; + ppd->cpspec->iblnkerrdelta += read_7220_creg32(ppd->dd, + cr_iblinkerrrecov) - ppd->cpspec->iblnkerrsnap; + } + } else if (!ibup && qib_compat_ddr_negotiate && + !ppd->cpspec->ibdeltainprog && + !(ppd->lflags & QIBL_IB_AUTONEG_INPROG)) { + ppd->cpspec->ibdeltainprog = 1; + ppd->cpspec->ibsymsnap = read_7220_creg32(ppd->dd, + cr_ibsymbolerr); + ppd->cpspec->iblnkerrsnap = read_7220_creg32(ppd->dd, + cr_iblinkerrrecov); + } + + if (!ret) + qib_setup_7220_setextled(ppd, ibup); + return ret; +} + +/* + * Does read/modify/write to appropriate registers to + * set output and direction bits selected by mask. + * these are in their canonical postions (e.g. lsb of + * dir will end up in D48 of extctrl on existing chips). + * returns contents of GP Inputs. + */ +static int gpio_7220_mod(struct qib_devdata *dd, u32 out, u32 dir, u32 mask) +{ + u64 read_val, new_out; + unsigned long flags; + + if (mask) { + /* some bits being written, lock access to GPIO */ + dir &= mask; + out &= mask; + spin_lock_irqsave(&dd->cspec->gpio_lock, flags); + dd->cspec->extctrl &= ~((u64)mask << SYM_LSB(EXTCtrl, GPIOOe)); + dd->cspec->extctrl |= ((u64) dir << SYM_LSB(EXTCtrl, GPIOOe)); + new_out = (dd->cspec->gpio_out & ~mask) | out; + + qib_write_kreg(dd, kr_extctrl, dd->cspec->extctrl); + qib_write_kreg(dd, kr_gpio_out, new_out); + dd->cspec->gpio_out = new_out; + spin_unlock_irqrestore(&dd->cspec->gpio_lock, flags); + } + /* + * It is unlikely that a read at this time would get valid + * data on a pin whose direction line was set in the same + * call to this function. We include the read here because + * that allows us to potentially combine a change on one pin with + * a read on another, and because the old code did something like + * this. + */ + read_val = qib_read_kreg64(dd, kr_extstatus); + return SYM_FIELD(read_val, EXTStatus, GPIOIn); +} + +/* + * Read fundamental info we need to use the chip. These are + * the registers that describe chip capabilities, and are + * saved in shadow registers. 
+ */ +static void get_7220_chip_params(struct qib_devdata *dd) +{ + u64 val; + u32 piobufs; + int mtu; + + dd->uregbase = qib_read_kreg32(dd, kr_userregbase); + + dd->rcvtidcnt = qib_read_kreg32(dd, kr_rcvtidcnt); + dd->rcvtidbase = qib_read_kreg32(dd, kr_rcvtidbase); + dd->rcvegrbase = qib_read_kreg32(dd, kr_rcvegrbase); + dd->palign = qib_read_kreg32(dd, kr_palign); + dd->piobufbase = qib_read_kreg64(dd, kr_sendpiobufbase); + dd->pio2k_bufbase = dd->piobufbase & 0xffffffff; + + val = qib_read_kreg64(dd, kr_sendpiosize); + dd->piosize2k = val & ~0U; + dd->piosize4k = val >> 32; + + mtu = ib_mtu_enum_to_int(qib_ibmtu); + if (mtu == -1) + mtu = QIB_DEFAULT_MTU; + dd->pport->ibmtu = (u32)mtu; + + val = qib_read_kreg64(dd, kr_sendpiobufcnt); + dd->piobcnt2k = val & ~0U; + dd->piobcnt4k = val >> 32; + /* these may be adjusted in init_chip_wc_pat() */ + dd->pio2kbase = (u32 __iomem *) + ((char __iomem *) dd->kregbase + dd->pio2k_bufbase); + if (dd->piobcnt4k) { + dd->pio4kbase = (u32 __iomem *) + ((char __iomem *) dd->kregbase + + (dd->piobufbase >> 32)); + /* + * 4K buffers take 2 pages; we use roundup just to be + * paranoid; we calculate it once here, rather than on + * ever buf allocate + */ + dd->align4k = ALIGN(dd->piosize4k, dd->palign); + } + + piobufs = dd->piobcnt4k + dd->piobcnt2k; + + dd->pioavregs = ALIGN(piobufs, sizeof(u64) * BITS_PER_BYTE / 2) / + (sizeof(u64) * BITS_PER_BYTE / 2); +} + +/* + * The chip base addresses in cspec and cpspec have to be set + * after possible init_chip_wc_pat(), rather than in + * qib_get_7220_chip_params(), so split out as separate function + */ +static void set_7220_baseaddrs(struct qib_devdata *dd) +{ + u32 cregbase; + /* init after possible re-map in init_chip_wc_pat() */ + cregbase = qib_read_kreg32(dd, kr_counterregbase); + dd->cspec->cregbase = (u64 __iomem *) + ((char __iomem *) dd->kregbase + cregbase); + + dd->egrtidbase = (u64 __iomem *) + ((char __iomem *) dd->kregbase + dd->rcvegrbase); +} + + +#define SENDCTRL_SHADOWED (SYM_MASK(SendCtrl, SendIntBufAvail) | \ + SYM_MASK(SendCtrl, SPioEnable) | \ + SYM_MASK(SendCtrl, SSpecialTriggerEn) | \ + SYM_MASK(SendCtrl, SendBufAvailUpd) | \ + SYM_MASK(SendCtrl, AvailUpdThld) | \ + SYM_MASK(SendCtrl, SDmaEnable) | \ + SYM_MASK(SendCtrl, SDmaIntEnable) | \ + SYM_MASK(SendCtrl, SDmaHalt) | \ + SYM_MASK(SendCtrl, SDmaSingleDescriptor)) + +static int sendctrl_hook(struct qib_devdata *dd, + const struct diag_observer *op, + u32 offs, u64 *data, u64 mask, int only_32) +{ + unsigned long flags; + unsigned idx = offs / sizeof(u64); + u64 local_data, all_bits; + + if (idx != kr_sendctrl) { + qib_dev_err(dd, "SendCtrl Hook called with offs %X, %s-bit\n", + offs, only_32 ? "32" : "64"); + return 0; + } + + all_bits = ~0ULL; + if (only_32) + all_bits >>= 32; + spin_lock_irqsave(&dd->sendctrl_lock, flags); + if ((mask & all_bits) != all_bits) { + /* + * At least some mask bits are zero, so we need + * to read. The judgement call is whether from + * reg or shadow. First-cut: read reg, and complain + * if any bits which should be shadowed are different + * from their shadowed value. 
+ */ + if (only_32) + local_data = (u64)qib_read_kreg32(dd, idx); + else + local_data = qib_read_kreg64(dd, idx); + qib_dev_err(dd, "Sendctrl -> %X, Shad -> %X\n", + (u32)local_data, (u32)dd->sendctrl); + if ((local_data & SENDCTRL_SHADOWED) != + (dd->sendctrl & SENDCTRL_SHADOWED)) + qib_dev_err(dd, "Sendctrl read: %X shadow is %X\n", + (u32)local_data, (u32) dd->sendctrl); + *data = (local_data & ~mask) | (*data & mask); + } + if (mask) { + /* + * At least some mask bits are one, so we need + * to write, but only shadow some bits. + */ + u64 sval, tval; /* Shadowed, transient */ + + /* + * New shadow val is bits we don't want to touch, + * ORed with bits we do, that are intended for shadow. + */ + sval = (dd->sendctrl & ~mask); + sval |= *data & SENDCTRL_SHADOWED & mask; + dd->sendctrl = sval; + tval = sval | (*data & ~SENDCTRL_SHADOWED & mask); + qib_dev_err(dd, "Sendctrl <- %X, Shad <- %X\n", + (u32)tval, (u32)sval); + qib_write_kreg(dd, kr_sendctrl, tval); + qib_write_kreg(dd, kr_scratch, 0Ull); + } + spin_unlock_irqrestore(&dd->sendctrl_lock, flags); + + return only_32 ? 4 : 8; +} + +static const struct diag_observer sendctrl_observer = { + sendctrl_hook, kr_sendctrl * sizeof(u64), + kr_sendctrl * sizeof(u64) +}; + +/* + * write the final few registers that depend on some of the + * init setup. Done late in init, just before bringing up + * the serdes. + */ +static int qib_late_7220_initreg(struct qib_devdata *dd) +{ + int ret = 0; + u64 val; + + qib_write_kreg(dd, kr_rcvhdrentsize, dd->rcvhdrentsize); + qib_write_kreg(dd, kr_rcvhdrsize, dd->rcvhdrsize); + qib_write_kreg(dd, kr_rcvhdrcnt, dd->rcvhdrcnt); + qib_write_kreg(dd, kr_sendpioavailaddr, dd->pioavailregs_phys); + val = qib_read_kreg64(dd, kr_sendpioavailaddr); + if (val != dd->pioavailregs_phys) { + qib_dev_err(dd, + "Catastrophic software error, SendPIOAvailAddr written as %lx, read back as %llx\n", + (unsigned long) dd->pioavailregs_phys, + (unsigned long long) val); + ret = -EINVAL; + } + qib_register_observer(dd, &sendctrl_observer); + return ret; +} + +static int qib_init_7220_variables(struct qib_devdata *dd) +{ + struct qib_chippport_specific *cpspec; + struct qib_pportdata *ppd; + int ret = 0; + u32 sbufs, updthresh; + + cpspec = (struct qib_chippport_specific *)(dd + 1); + ppd = &cpspec->pportdata; + dd->pport = ppd; + dd->num_pports = 1; + + dd->cspec = (struct qib_chip_specific *)(cpspec + dd->num_pports); + ppd->cpspec = cpspec; + + spin_lock_init(&dd->cspec->sdepb_lock); + spin_lock_init(&dd->cspec->rcvmod_lock); + spin_lock_init(&dd->cspec->gpio_lock); + + /* we haven't yet set QIB_PRESENT, so use read directly */ + dd->revision = readq(&dd->kregbase[kr_revision]); + + if ((dd->revision & 0xffffffffU) == 0xffffffffU) { + qib_dev_err(dd, + "Revision register read failure, giving up initialization\n"); + ret = -ENODEV; + goto bail; + } + dd->flags |= QIB_PRESENT; /* now register routines work */ + + dd->majrev = (u8) SYM_FIELD(dd->revision, Revision_R, + ChipRevMajor); + dd->minrev = (u8) SYM_FIELD(dd->revision, Revision_R, + ChipRevMinor); + + get_7220_chip_params(dd); + qib_7220_boardname(dd); + + /* + * GPIO bits for TWSI data and clock, + * used for serial EEPROM. + */ + dd->gpio_sda_num = _QIB_GPIO_SDA_NUM; + dd->gpio_scl_num = _QIB_GPIO_SCL_NUM; + dd->twsi_eeprom_dev = QIB_TWSI_EEPROM_DEV; + + dd->flags |= QIB_HAS_INTX | QIB_HAS_LINK_LATENCY | + QIB_NODMA_RTAIL | QIB_HAS_THRESH_UPDATE; + dd->flags |= qib_special_trigger ? 
+ QIB_USE_SPCL_TRIG : QIB_HAS_SEND_DMA; + + /* + * EEPROM error log 0 is TXE Parity errors. 1 is RXE Parity. + * 2 is Some Misc, 3 is reserved for future. + */ + dd->eep_st_masks[0].hwerrs_to_log = HWE_MASK(TXEMemParityErr); + + dd->eep_st_masks[1].hwerrs_to_log = HWE_MASK(RXEMemParityErr); + + dd->eep_st_masks[2].errs_to_log = ERR_MASK(ResetNegated); + + init_waitqueue_head(&cpspec->autoneg_wait); + INIT_DELAYED_WORK(&cpspec->autoneg_work, autoneg_7220_work); + + ret = qib_init_pportdata(ppd, dd, 0, 1); + if (ret) + goto bail; + ppd->link_width_supported = IB_WIDTH_1X | IB_WIDTH_4X; + ppd->link_speed_supported = QIB_IB_SDR | QIB_IB_DDR; + + ppd->link_width_enabled = ppd->link_width_supported; + ppd->link_speed_enabled = ppd->link_speed_supported; + /* + * Set the initial values to reasonable default, will be set + * for real when link is up. + */ + ppd->link_width_active = IB_WIDTH_4X; + ppd->link_speed_active = QIB_IB_SDR; + ppd->delay_mult = rate_to_delay[0][1]; + ppd->vls_supported = IB_VL_VL0; + ppd->vls_operational = ppd->vls_supported; + + if (!qib_mini_init) + qib_write_kreg(dd, kr_rcvbthqp, QIB_KD_QP); + + init_timer(&ppd->cpspec->chase_timer); + ppd->cpspec->chase_timer.function = reenable_7220_chase; + ppd->cpspec->chase_timer.data = (unsigned long)ppd; + + qib_num_cfg_vls = 1; /* if any 7220's, only one VL */ + + dd->rcvhdrentsize = QIB_RCVHDR_ENTSIZE; + dd->rcvhdrsize = QIB_DFLT_RCVHDRSIZE; + dd->rhf_offset = + dd->rcvhdrentsize - sizeof(u64) / sizeof(u32); + + /* we always allocate at least 2048 bytes for eager buffers */ + ret = ib_mtu_enum_to_int(qib_ibmtu); + dd->rcvegrbufsize = ret != -1 ? max(ret, 2048) : QIB_DEFAULT_MTU; + BUG_ON(!is_power_of_2(dd->rcvegrbufsize)); + dd->rcvegrbufsize_shift = ilog2(dd->rcvegrbufsize); + + qib_7220_tidtemplate(dd); + + /* + * We can request a receive interrupt for 1 or + * more packets from current offset. For now, we set this + * up for a single packet. + */ + dd->rhdrhead_intr_off = 1ULL << 32; + + /* setup the stats timer; the add_timer is done at end of init */ + init_timer(&dd->stats_timer); + dd->stats_timer.function = qib_get_7220_faststats; + dd->stats_timer.data = (unsigned long) dd; + dd->stats_timer.expires = jiffies + ACTIVITY_TIMER * HZ; + + /* + * Control[4] has been added to change the arbitration within + * the SDMA engine between favoring data fetches over descriptor + * fetches. qib_sdma_fetch_arb==0 gives data fetches priority. + */ + if (qib_sdma_fetch_arb) + dd->control |= 1 << 4; + + dd->ureg_align = 0x10000; /* 64KB alignment */ + + dd->piosize2kmax_dwords = (dd->piosize2k >> 2)-1; + qib_7220_config_ctxts(dd); + qib_set_ctxtcnt(dd); /* needed for PAT setup */ + + ret = init_chip_wc_pat(dd, 0); + if (ret) + goto bail; + set_7220_baseaddrs(dd); /* set chip access pointers now */ + + ret = 0; + if (qib_mini_init) + goto bail; + + ret = qib_create_ctxts(dd); + init_7220_cntrnames(dd); + + /* use all of 4KB buffers for the kernel SDMA, zero if !SDMA. + * reserve the update threshold amount for other kernel use, such + * as sending SMI, MAD, and ACKs, or 3, whichever is greater, + * unless we aren't enabling SDMA, in which case we want to use + * all the 4k bufs for the kernel. + * if this was less than the update threshold, we could wait + * a long time for an update. Coded this way because we + * sometimes change the update threshold for various reasons, + * and we want this to remain robust. 
+ */ + updthresh = 8U; /* update threshold */ + if (dd->flags & QIB_HAS_SEND_DMA) { + dd->cspec->sdmabufcnt = dd->piobcnt4k; + sbufs = updthresh > 3 ? updthresh : 3; + } else { + dd->cspec->sdmabufcnt = 0; + sbufs = dd->piobcnt4k; + } + + dd->cspec->lastbuf_for_pio = dd->piobcnt2k + dd->piobcnt4k - + dd->cspec->sdmabufcnt; + dd->lastctxt_piobuf = dd->cspec->lastbuf_for_pio - sbufs; + dd->cspec->lastbuf_for_pio--; /* range is <= , not < */ + dd->last_pio = dd->cspec->lastbuf_for_pio; + dd->pbufsctxt = dd->lastctxt_piobuf / + (dd->cfgctxts - dd->first_user_ctxt); + + /* + * if we are at 16 user contexts, we will have one 7 sbufs + * per context, so drop the update threshold to match. We + * want to update before we actually run out, at low pbufs/ctxt + * so give ourselves some margin + */ + if ((dd->pbufsctxt - 2) < updthresh) + updthresh = dd->pbufsctxt - 2; + + dd->cspec->updthresh_dflt = updthresh; + dd->cspec->updthresh = updthresh; + + /* before full enable, no interrupts, no locking needed */ + dd->sendctrl |= (updthresh & SYM_RMASK(SendCtrl, AvailUpdThld)) + << SYM_LSB(SendCtrl, AvailUpdThld); + + dd->psxmitwait_supported = 1; + dd->psxmitwait_check_rate = QIB_7220_PSXMITWAIT_CHECK_RATE; +bail: + return ret; +} + +static u32 __iomem *qib_7220_getsendbuf(struct qib_pportdata *ppd, u64 pbc, + u32 *pbufnum) +{ + u32 first, last, plen = pbc & QIB_PBC_LENGTH_MASK; + struct qib_devdata *dd = ppd->dd; + u32 __iomem *buf; + + if (((pbc >> 32) & PBC_7220_VL15_SEND_CTRL) && + !(ppd->lflags & (QIBL_IB_AUTONEG_INPROG | QIBL_LINKACTIVE))) + buf = get_7220_link_buf(ppd, pbufnum); + else { + if ((plen + 1) > dd->piosize2kmax_dwords) + first = dd->piobcnt2k; + else + first = 0; + /* try 4k if all 2k busy, so same last for both sizes */ + last = dd->cspec->lastbuf_for_pio; + buf = qib_getsendbuf_range(dd, pbufnum, first, last); + } + return buf; +} + +/* these 2 "counters" are really control registers, and are always RW */ +static void qib_set_cntr_7220_sample(struct qib_pportdata *ppd, u32 intv, + u32 start) +{ + write_7220_creg(ppd->dd, cr_psinterval, intv); + write_7220_creg(ppd->dd, cr_psstart, start); +} + +/* + * NOTE: no real attempt is made to generalize the SDMA stuff. + * At some point "soon" we will have a new more generalized + * set of sdma interface, and then we'll clean this up. 
+ */ + +/* Must be called with sdma_lock held, or before init finished */ +static void qib_sdma_update_7220_tail(struct qib_pportdata *ppd, u16 tail) +{ + /* Commit writes to memory and advance the tail on the chip */ + wmb(); + ppd->sdma_descq_tail = tail; + qib_write_kreg(ppd->dd, kr_senddmatail, tail); +} + +static void qib_sdma_set_7220_desc_cnt(struct qib_pportdata *ppd, unsigned cnt) +{ +} + +static struct sdma_set_state_action sdma_7220_action_table[] = { + [qib_sdma_state_s00_hw_down] = { + .op_enable = 0, + .op_intenable = 0, + .op_halt = 0, + .go_s99_running_tofalse = 1, + }, + [qib_sdma_state_s10_hw_start_up_wait] = { + .op_enable = 1, + .op_intenable = 1, + .op_halt = 1, + }, + [qib_sdma_state_s20_idle] = { + .op_enable = 1, + .op_intenable = 1, + .op_halt = 1, + }, + [qib_sdma_state_s30_sw_clean_up_wait] = { + .op_enable = 0, + .op_intenable = 1, + .op_halt = 0, + }, + [qib_sdma_state_s40_hw_clean_up_wait] = { + .op_enable = 1, + .op_intenable = 1, + .op_halt = 1, + }, + [qib_sdma_state_s50_hw_halt_wait] = { + .op_enable = 1, + .op_intenable = 1, + .op_halt = 1, + }, + [qib_sdma_state_s99_running] = { + .op_enable = 1, + .op_intenable = 1, + .op_halt = 0, + .go_s99_running_totrue = 1, + }, +}; + +static void qib_7220_sdma_init_early(struct qib_pportdata *ppd) +{ + ppd->sdma_state.set_state_action = sdma_7220_action_table; +} + +static int init_sdma_7220_regs(struct qib_pportdata *ppd) +{ + struct qib_devdata *dd = ppd->dd; + unsigned i, n; + u64 senddmabufmask[3] = { 0 }; + + /* Set SendDmaBase */ + qib_write_kreg(dd, kr_senddmabase, ppd->sdma_descq_phys); + qib_sdma_7220_setlengen(ppd); + qib_sdma_update_7220_tail(ppd, 0); /* Set SendDmaTail */ + /* Set SendDmaHeadAddr */ + qib_write_kreg(dd, kr_senddmaheadaddr, ppd->sdma_head_phys); + + /* + * Reserve all the former "kernel" piobufs, using high number range + * so we get as many 4K buffers as possible + */ + n = dd->piobcnt2k + dd->piobcnt4k; + i = n - dd->cspec->sdmabufcnt; + + for (; i < n; ++i) { + unsigned word = i / 64; + unsigned bit = i & 63; + + BUG_ON(word >= 3); + senddmabufmask[word] |= 1ULL << bit; + } + qib_write_kreg(dd, kr_senddmabufmask0, senddmabufmask[0]); + qib_write_kreg(dd, kr_senddmabufmask1, senddmabufmask[1]); + qib_write_kreg(dd, kr_senddmabufmask2, senddmabufmask[2]); + + ppd->sdma_state.first_sendbuf = i; + ppd->sdma_state.last_sendbuf = n; + + return 0; +} + +/* sdma_lock must be held */ +static u16 qib_sdma_7220_gethead(struct qib_pportdata *ppd) +{ + struct qib_devdata *dd = ppd->dd; + int sane; + int use_dmahead; + u16 swhead; + u16 swtail; + u16 cnt; + u16 hwhead; + + use_dmahead = __qib_sdma_running(ppd) && + (dd->flags & QIB_HAS_SDMA_TIMEOUT); +retry: + hwhead = use_dmahead ? 
+ (u16)le64_to_cpu(*ppd->sdma_head_dma) : + (u16)qib_read_kreg32(dd, kr_senddmahead); + + swhead = ppd->sdma_descq_head; + swtail = ppd->sdma_descq_tail; + cnt = ppd->sdma_descq_cnt; + + if (swhead < swtail) { + /* not wrapped */ + sane = (hwhead >= swhead) & (hwhead <= swtail); + } else if (swhead > swtail) { + /* wrapped around */ + sane = ((hwhead >= swhead) && (hwhead < cnt)) || + (hwhead <= swtail); + } else { + /* empty */ + sane = (hwhead == swhead); + } + + if (unlikely(!sane)) { + if (use_dmahead) { + /* try one more time, directly from the register */ + use_dmahead = 0; + goto retry; + } + /* assume no progress */ + hwhead = swhead; + } + + return hwhead; +} + +static int qib_sdma_7220_busy(struct qib_pportdata *ppd) +{ + u64 hwstatus = qib_read_kreg64(ppd->dd, kr_senddmastatus); + + return (hwstatus & SYM_MASK(SendDmaStatus, ScoreBoardDrainInProg)) || + (hwstatus & SYM_MASK(SendDmaStatus, AbortInProg)) || + (hwstatus & SYM_MASK(SendDmaStatus, InternalSDmaEnable)) || + !(hwstatus & SYM_MASK(SendDmaStatus, ScbEmpty)); +} + +/* + * Compute the amount of delay before sending the next packet if the + * port's send rate differs from the static rate set for the QP. + * Since the delay affects this packet but the amount of the delay is + * based on the length of the previous packet, use the last delay computed + * and save the delay count for this packet to be used next time + * we get here. + */ +static u32 qib_7220_setpbc_control(struct qib_pportdata *ppd, u32 plen, + u8 srate, u8 vl) +{ + u8 snd_mult = ppd->delay_mult; + u8 rcv_mult = ib_rate_to_delay[srate]; + u32 ret = ppd->cpspec->last_delay_mult; + + ppd->cpspec->last_delay_mult = (rcv_mult > snd_mult) ? + (plen * (rcv_mult - snd_mult) + 1) >> 1 : 0; + + /* Indicate VL15, if necessary */ + if (vl == 15) + ret |= PBC_7220_VL15_SEND_CTRL; + return ret; +} + +static void qib_7220_initvl15_bufs(struct qib_devdata *dd) +{ +} + +static void qib_7220_init_ctxt(struct qib_ctxtdata *rcd) +{ + if (!rcd->ctxt) { + rcd->rcvegrcnt = IBA7220_KRCVEGRCNT; + rcd->rcvegr_tid_base = 0; + } else { + rcd->rcvegrcnt = rcd->dd->cspec->rcvegrcnt; + rcd->rcvegr_tid_base = IBA7220_KRCVEGRCNT + + (rcd->ctxt - 1) * rcd->rcvegrcnt; + } +} + +static void qib_7220_txchk_change(struct qib_devdata *dd, u32 start, + u32 len, u32 which, struct qib_ctxtdata *rcd) +{ + int i; + unsigned long flags; + + switch (which) { + case TXCHK_CHG_TYPE_KERN: + /* see if we need to raise avail update threshold */ + spin_lock_irqsave(&dd->uctxt_lock, flags); + for (i = dd->first_user_ctxt; + dd->cspec->updthresh != dd->cspec->updthresh_dflt + && i < dd->cfgctxts; i++) + if (dd->rcd[i] && dd->rcd[i]->subctxt_cnt && + ((dd->rcd[i]->piocnt / dd->rcd[i]->subctxt_cnt) - 1) + < dd->cspec->updthresh_dflt) + break; + spin_unlock_irqrestore(&dd->uctxt_lock, flags); + if (i == dd->cfgctxts) { + spin_lock_irqsave(&dd->sendctrl_lock, flags); + dd->cspec->updthresh = dd->cspec->updthresh_dflt; + dd->sendctrl &= ~SYM_MASK(SendCtrl, AvailUpdThld); + dd->sendctrl |= (dd->cspec->updthresh & + SYM_RMASK(SendCtrl, AvailUpdThld)) << + SYM_LSB(SendCtrl, AvailUpdThld); + spin_unlock_irqrestore(&dd->sendctrl_lock, flags); + sendctrl_7220_mod(dd->pport, QIB_SENDCTRL_AVAIL_BLIP); + } + break; + case TXCHK_CHG_TYPE_USER: + spin_lock_irqsave(&dd->sendctrl_lock, flags); + if (rcd && rcd->subctxt_cnt && ((rcd->piocnt + / rcd->subctxt_cnt) - 1) < dd->cspec->updthresh) { + dd->cspec->updthresh = (rcd->piocnt / + rcd->subctxt_cnt) - 1; + dd->sendctrl &= ~SYM_MASK(SendCtrl, AvailUpdThld); + dd->sendctrl |= 
(dd->cspec->updthresh & + SYM_RMASK(SendCtrl, AvailUpdThld)) + << SYM_LSB(SendCtrl, AvailUpdThld); + spin_unlock_irqrestore(&dd->sendctrl_lock, flags); + sendctrl_7220_mod(dd->pport, QIB_SENDCTRL_AVAIL_BLIP); + } else + spin_unlock_irqrestore(&dd->sendctrl_lock, flags); + break; + } +} + +static void writescratch(struct qib_devdata *dd, u32 val) +{ + qib_write_kreg(dd, kr_scratch, val); +} + +#define VALID_TS_RD_REG_MASK 0xBF +/** + * qib_7220_tempsense_read - read register of temp sensor via TWSI + * @dd: the qlogic_ib device + * @regnum: register to read from + * + * returns reg contents (0..255) or < 0 for error + */ +static int qib_7220_tempsense_rd(struct qib_devdata *dd, int regnum) +{ + int ret; + u8 rdata; + + if (regnum > 7) { + ret = -EINVAL; + goto bail; + } + + /* return a bogus value for (the one) register we do not have */ + if (!((1 << regnum) & VALID_TS_RD_REG_MASK)) { + ret = 0; + goto bail; + } + + ret = mutex_lock_interruptible(&dd->eep_lock); + if (ret) + goto bail; + + ret = qib_twsi_blk_rd(dd, QIB_TWSI_TEMP_DEV, regnum, &rdata, 1); + if (!ret) + ret = rdata; + + mutex_unlock(&dd->eep_lock); + + /* + * There are three possibilities here: + * ret is actual value (0..255) + * ret is -ENXIO or -EINVAL from twsi code or this file + * ret is -EINTR from mutex_lock_interruptible. + */ +bail: + return ret; +} + +#ifdef CONFIG_INFINIBAND_QIB_DCA +static int qib_7220_notify_dca(struct qib_devdata *dd, unsigned long event) +{ + return 0; +} +#endif + +/* Dummy function, as 7220 boards never disable EEPROM Write */ +static int qib_7220_eeprom_wen(struct qib_devdata *dd, int wen) +{ + return 1; +} + +/** + * qib_init_iba7220_funcs - set up the chip-specific function pointers + * @dev: the pci_dev for qlogic_ib device + * @ent: pci_device_id struct for this dev + * + * This is global, and is called directly at init to set up the + * chip-specific function pointers for later use. 
+ */ +struct qib_devdata *qib_init_iba7220_funcs(struct pci_dev *pdev, + const struct pci_device_id *ent) +{ + struct qib_devdata *dd; + int ret; + u32 boardid, minwidth; + + dd = qib_alloc_devdata(pdev, sizeof(struct qib_chip_specific) + + sizeof(struct qib_chippport_specific)); + if (IS_ERR(dd)) + goto bail; + + dd->f_bringup_serdes = qib_7220_bringup_serdes; + dd->f_cleanup = qib_setup_7220_cleanup; + dd->f_clear_tids = qib_7220_clear_tids; + dd->f_free_irq = qib_7220_free_irq; + dd->f_get_base_info = qib_7220_get_base_info; + dd->f_get_msgheader = qib_7220_get_msgheader; + dd->f_getsendbuf = qib_7220_getsendbuf; + dd->f_gpio_mod = gpio_7220_mod; + dd->f_eeprom_wen = qib_7220_eeprom_wen; + dd->f_hdrqempty = qib_7220_hdrqempty; + dd->f_ib_updown = qib_7220_ib_updown; + dd->f_init_ctxt = qib_7220_init_ctxt; + dd->f_initvl15_bufs = qib_7220_initvl15_bufs; + dd->f_intr_fallback = qib_7220_intr_fallback; + dd->f_late_initreg = qib_late_7220_initreg; + dd->f_setpbc_control = qib_7220_setpbc_control; + dd->f_portcntr = qib_portcntr_7220; + dd->f_put_tid = qib_7220_put_tid; + dd->f_quiet_serdes = qib_7220_quiet_serdes; + dd->f_rcvctrl = rcvctrl_7220_mod; + dd->f_read_cntrs = qib_read_7220cntrs; + dd->f_read_portcntrs = qib_read_7220portcntrs; + dd->f_reset = qib_setup_7220_reset; + dd->f_init_sdma_regs = init_sdma_7220_regs; + dd->f_sdma_busy = qib_sdma_7220_busy; + dd->f_sdma_gethead = qib_sdma_7220_gethead; + dd->f_sdma_sendctrl = qib_7220_sdma_sendctrl; + dd->f_sdma_set_desc_cnt = qib_sdma_set_7220_desc_cnt; + dd->f_sdma_update_tail = qib_sdma_update_7220_tail; + dd->f_sdma_hw_clean_up = qib_7220_sdma_hw_clean_up; + dd->f_sdma_hw_start_up = qib_7220_sdma_hw_start_up; + dd->f_sdma_init_early = qib_7220_sdma_init_early; + dd->f_sendctrl = sendctrl_7220_mod; + dd->f_set_armlaunch = qib_set_7220_armlaunch; + dd->f_set_cntr_sample = qib_set_cntr_7220_sample; + dd->f_iblink_state = qib_7220_iblink_state; + dd->f_ibphys_portstate = qib_7220_phys_portstate; + dd->f_get_ib_cfg = qib_7220_get_ib_cfg; + dd->f_set_ib_cfg = qib_7220_set_ib_cfg; + dd->f_set_ib_loopback = qib_7220_set_loopback; + dd->f_set_intr_state = qib_7220_set_intr_state; + dd->f_setextled = qib_setup_7220_setextled; + dd->f_txchk_change = qib_7220_txchk_change; + dd->f_update_usrhead = qib_update_7220_usrhead; + dd->f_wantpiobuf_intr = qib_wantpiobuf_7220_intr; + dd->f_xgxs_reset = qib_7220_xgxs_reset; + dd->f_writescratch = writescratch; + dd->f_tempsense_rd = qib_7220_tempsense_rd; +#ifdef CONFIG_INFINIBAND_QIB_DCA + dd->f_notify_dca = qib_7220_notify_dca; +#endif + /* + * Do remaining pcie setup and save pcie values in dd. + * Any error printing is already done by the init code. + * On return, we have the chip mapped, but chip registers + * are not set up until start of qib_init_7220_variables. 
+ */ + ret = qib_pcie_ddinit(dd, pdev, ent); + if (ret < 0) + goto bail_free; + + /* initialize chip-specific variables */ + ret = qib_init_7220_variables(dd); + if (ret) + goto bail_cleanup; + + if (qib_mini_init) + goto bail; + + boardid = SYM_FIELD(dd->revision, Revision, + BoardID); + switch (boardid) { + case 0: + case 2: + case 10: + case 12: + minwidth = 16; /* x16 capable boards */ + break; + default: + minwidth = 8; /* x8 capable boards */ + break; + } + if (qib_pcie_params(dd, minwidth, NULL, NULL)) + qib_dev_err(dd, + "Failed to setup PCIe or interrupts; continuing anyway\n"); + + /* save IRQ for possible later use */ + dd->cspec->irq = pdev->irq; + + if (qib_read_kreg64(dd, kr_hwerrstatus) & + QLOGIC_IB_HWE_SERDESPLLFAILED) + qib_write_kreg(dd, kr_hwerrclear, + QLOGIC_IB_HWE_SERDESPLLFAILED); + + /* setup interrupt handler (interrupt type handled above) */ + qib_setup_7220_interrupt(dd); + qib_7220_init_hwerrors(dd); + + /* clear diagctrl register, in case diags were running and crashed */ + qib_write_kreg(dd, kr_hwdiagctrl, 0); + + goto bail; + +bail_cleanup: + qib_pcie_ddcleanup(dd); +bail_free: + qib_free_devdata(dd); + dd = ERR_PTR(ret); +bail: + return dd; +} diff --git a/kernel/drivers/infiniband/hw/qib/qib_iba7322.c b/kernel/drivers/infiniband/hw/qib/qib_iba7322.c new file mode 100644 index 000000000..f32b4628e --- /dev/null +++ b/kernel/drivers/infiniband/hw/qib/qib_iba7322.c @@ -0,0 +1,8573 @@ +/* + * Copyright (c) 2012 Intel Corporation. All rights reserved. + * Copyright (c) 2008 - 2012 QLogic Corporation. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */
+
+/*
+ * This file contains all of the code that is specific to the
+ * InfiniPath 7322 chip
+ */
+
+#include <linux/interrupt.h>
+#include <linux/pci.h>
+#include <linux/delay.h>
+#include <linux/io.h>
+#include <linux/jiffies.h>
+#include <linux/module.h>
+#include <rdma/ib_verbs.h>
+#include <rdma/ib_smi.h>
+#ifdef CONFIG_INFINIBAND_QIB_DCA
+#include <linux/dca.h>
+#endif
+
+#include "qib.h"
+#include "qib_7322_regs.h"
+#include "qib_qsfp.h"
+
+#include "qib_mad.h"
+#include "qib_verbs.h"
+
+#undef pr_fmt
+#define pr_fmt(fmt) QIB_DRV_NAME " " fmt
+
+static void qib_setup_7322_setextled(struct qib_pportdata *, u32);
+static void qib_7322_handle_hwerrors(struct qib_devdata *, char *, size_t);
+static void sendctrl_7322_mod(struct qib_pportdata *ppd, u32 op);
+static irqreturn_t qib_7322intr(int irq, void *data);
+static irqreturn_t qib_7322bufavail(int irq, void *data);
+static irqreturn_t sdma_intr(int irq, void *data);
+static irqreturn_t sdma_idle_intr(int irq, void *data);
+static irqreturn_t sdma_progress_intr(int irq, void *data);
+static irqreturn_t sdma_cleanup_intr(int irq, void *data);
+static void qib_7322_txchk_change(struct qib_devdata *, u32, u32, u32,
+	struct qib_ctxtdata *rcd);
+static u8 qib_7322_phys_portstate(u64);
+static u32 qib_7322_iblink_state(u64);
+static void qib_set_ib_7322_lstate(struct qib_pportdata *ppd, u16 linkcmd,
+	u16 linitcmd);
+static void force_h1(struct qib_pportdata *);
+static void adj_tx_serdes(struct qib_pportdata *);
+static u32 qib_7322_setpbc_control(struct qib_pportdata *, u32, u8, u8);
+static void qib_7322_mini_pcs_reset(struct qib_pportdata *);
+
+static u32 ahb_mod(struct qib_devdata *, int, int, int, u32, u32);
+static void ibsd_wr_allchans(struct qib_pportdata *, int, unsigned, unsigned);
+static void serdes_7322_los_enable(struct qib_pportdata *, int);
+static int serdes_7322_init_old(struct qib_pportdata *);
+static int serdes_7322_init_new(struct qib_pportdata *);
+static void dump_sdma_7322_state(struct qib_pportdata *);
+
+#define BMASK(msb, lsb) (((1 << ((msb) + 1 - (lsb))) - 1) << (lsb))
+
+/* LE2 serdes values for different cases */
+#define LE2_DEFAULT 5
+#define LE2_5m 4
+#define LE2_QME 0
+
+/* Below is special-purpose, so only really works for the IB SerDes blocks. */
+#define IBSD(hw_pidx) (hw_pidx + 2)
+
+/* these are variables for documentation and experimentation purposes */
+static const unsigned rcv_int_timeout = 375;
+static const unsigned rcv_int_count = 16;
+static const unsigned sdma_idle_cnt = 64;
+
+/* Time to stop altering Rx Equalization parameters, after link up. */
+#define RXEQ_DISABLE_MSECS 2500
+
+/*
+ * Number of VLs we are configured to use (to allow for more
+ * credits per vl, etc.)
+ */ +ushort qib_num_cfg_vls = 2; +module_param_named(num_vls, qib_num_cfg_vls, ushort, S_IRUGO); +MODULE_PARM_DESC(num_vls, "Set number of Virtual Lanes to use (1-8)"); + +static ushort qib_chase = 1; +module_param_named(chase, qib_chase, ushort, S_IRUGO); +MODULE_PARM_DESC(chase, "Enable state chase handling"); + +static ushort qib_long_atten = 10; /* 10 dB ~= 5m length */ +module_param_named(long_attenuation, qib_long_atten, ushort, S_IRUGO); +MODULE_PARM_DESC(long_attenuation, + "attenuation cutoff (dB) for long copper cable setup"); + +static ushort qib_singleport; +module_param_named(singleport, qib_singleport, ushort, S_IRUGO); +MODULE_PARM_DESC(singleport, "Use only IB port 1; more per-port buffer space"); + +static ushort qib_krcvq01_no_msi; +module_param_named(krcvq01_no_msi, qib_krcvq01_no_msi, ushort, S_IRUGO); +MODULE_PARM_DESC(krcvq01_no_msi, "No MSI for kctx < 2"); + +/* + * Receive header queue sizes + */ +static unsigned qib_rcvhdrcnt; +module_param_named(rcvhdrcnt, qib_rcvhdrcnt, uint, S_IRUGO); +MODULE_PARM_DESC(rcvhdrcnt, "receive header count"); + +static unsigned qib_rcvhdrsize; +module_param_named(rcvhdrsize, qib_rcvhdrsize, uint, S_IRUGO); +MODULE_PARM_DESC(rcvhdrsize, "receive header size in 32-bit words"); + +static unsigned qib_rcvhdrentsize; +module_param_named(rcvhdrentsize, qib_rcvhdrentsize, uint, S_IRUGO); +MODULE_PARM_DESC(rcvhdrentsize, "receive header entry size in 32-bit words"); + +#define MAX_ATTEN_LEN 64 /* plenty for any real system */ +/* for read back, default index is ~5m copper cable */ +static char txselect_list[MAX_ATTEN_LEN] = "10"; +static struct kparam_string kp_txselect = { + .string = txselect_list, + .maxlen = MAX_ATTEN_LEN +}; +static int setup_txselect(const char *, struct kernel_param *); +module_param_call(txselect, setup_txselect, param_get_string, + &kp_txselect, S_IWUSR | S_IRUGO); +MODULE_PARM_DESC(txselect, + "Tx serdes indices (for no QSFP or invalid QSFP data)"); + +#define BOARD_QME7342 5 +#define BOARD_QMH7342 6 +#define BOARD_QMH7360 9 +#define IS_QMH(dd) (SYM_FIELD((dd)->revision, Revision, BoardID) == \ + BOARD_QMH7342) +#define IS_QME(dd) (SYM_FIELD((dd)->revision, Revision, BoardID) == \ + BOARD_QME7342) + +#define KREG_IDX(regname) (QIB_7322_##regname##_OFFS / sizeof(u64)) + +#define KREG_IBPORT_IDX(regname) ((QIB_7322_##regname##_0_OFFS / sizeof(u64))) + +#define MASK_ACROSS(lsb, msb) \ + (((1ULL << ((msb) + 1 - (lsb))) - 1) << (lsb)) + +#define SYM_RMASK(regname, fldname) ((u64) \ + QIB_7322_##regname##_##fldname##_RMASK) + +#define SYM_MASK(regname, fldname) ((u64) \ + QIB_7322_##regname##_##fldname##_RMASK << \ + QIB_7322_##regname##_##fldname##_LSB) + +#define SYM_FIELD(value, regname, fldname) ((u64) \ + (((value) >> SYM_LSB(regname, fldname)) & \ + SYM_RMASK(regname, fldname))) + +/* useful for things like LaFifoEmpty_0...7, TxCreditOK_0...7, etc. 
*/ +#define SYM_FIELD_ACROSS(value, regname, fldname, nbits) \ + (((value) >> SYM_LSB(regname, fldname)) & MASK_ACROSS(0, nbits)) + +#define HWE_MASK(fldname) SYM_MASK(HwErrMask, fldname##Mask) +#define ERR_MASK(fldname) SYM_MASK(ErrMask, fldname##Mask) +#define ERR_MASK_N(fldname) SYM_MASK(ErrMask_0, fldname##Mask) +#define INT_MASK(fldname) SYM_MASK(IntMask, fldname##IntMask) +#define INT_MASK_P(fldname, port) SYM_MASK(IntMask, fldname##IntMask##_##port) +/* Below because most, but not all, fields of IntMask have that full suffix */ +#define INT_MASK_PM(fldname, port) SYM_MASK(IntMask, fldname##Mask##_##port) + + +#define SYM_LSB(regname, fldname) (QIB_7322_##regname##_##fldname##_LSB) + +/* + * the size bits give us 2^N, in KB units. 0 marks as invalid, + * and 7 is reserved. We currently use only 2KB and 4KB + */ +#define IBA7322_TID_SZ_SHIFT QIB_7322_RcvTIDArray0_RT_BufSize_LSB +#define IBA7322_TID_SZ_2K (1UL<kregbase || !(dd->flags & QIB_PRESENT)) + return 0; + return readl(regno + (u64 __iomem *)( + (dd->ureg_align * ctxt) + (dd->userbase ? + (char __iomem *)dd->userbase : + (char __iomem *)dd->kregbase + dd->uregbase))); +} + +/** + * qib_read_ureg - read virtualized per-context register + * @dd: device + * @regno: register number + * @ctxt: context number + * + * Return the contents of a register that is virtualized to be per context. + * Returns -1 on errors (not distinguishable from valid contents at + * runtime; we may add a separate error variable at some point). + */ +static inline u64 qib_read_ureg(const struct qib_devdata *dd, + enum qib_ureg regno, int ctxt) +{ + + if (!dd->kregbase || !(dd->flags & QIB_PRESENT)) + return 0; + return readq(regno + (u64 __iomem *)( + (dd->ureg_align * ctxt) + (dd->userbase ? + (char __iomem *)dd->userbase : + (char __iomem *)dd->kregbase + dd->uregbase))); +} + +/** + * qib_write_ureg - write virtualized per-context register + * @dd: device + * @regno: register number + * @value: value + * @ctxt: context + * + * Write the contents of a register that is virtualized to be per context. + */ +static inline void qib_write_ureg(const struct qib_devdata *dd, + enum qib_ureg regno, u64 value, int ctxt) +{ + u64 __iomem *ubase; + + if (dd->userbase) + ubase = (u64 __iomem *) + ((char __iomem *) dd->userbase + + dd->ureg_align * ctxt); + else + ubase = (u64 __iomem *) + (dd->uregbase + + (char __iomem *) dd->kregbase + + dd->ureg_align * ctxt); + + if (dd->kregbase && (dd->flags & QIB_PRESENT)) + writeq(value, &ubase[regno]); +} + +static inline u32 qib_read_kreg32(const struct qib_devdata *dd, + const u32 regno) +{ + if (!dd->kregbase || !(dd->flags & QIB_PRESENT)) + return -1; + return readl((u32 __iomem *) &dd->kregbase[regno]); +} + +static inline u64 qib_read_kreg64(const struct qib_devdata *dd, + const u32 regno) +{ + if (!dd->kregbase || !(dd->flags & QIB_PRESENT)) + return -1; + return readq(&dd->kregbase[regno]); +} + +static inline void qib_write_kreg(const struct qib_devdata *dd, + const u32 regno, u64 value) +{ + if (dd->kregbase && (dd->flags & QIB_PRESENT)) + writeq(value, &dd->kregbase[regno]); +} + +/* + * not many sanity checks for the port-specific kernel register routines, + * since they are only used when it's known to be safe. 
+*/ +static inline u64 qib_read_kreg_port(const struct qib_pportdata *ppd, + const u16 regno) +{ + if (!ppd->cpspec->kpregbase || !(ppd->dd->flags & QIB_PRESENT)) + return 0ULL; + return readq(&ppd->cpspec->kpregbase[regno]); +} + +static inline void qib_write_kreg_port(const struct qib_pportdata *ppd, + const u16 regno, u64 value) +{ + if (ppd->cpspec && ppd->dd && ppd->cpspec->kpregbase && + (ppd->dd->flags & QIB_PRESENT)) + writeq(value, &ppd->cpspec->kpregbase[regno]); +} + +/** + * qib_write_kreg_ctxt - write a device's per-ctxt 64-bit kernel register + * @dd: the qlogic_ib device + * @regno: the register number to write + * @ctxt: the context containing the register + * @value: the value to write + */ +static inline void qib_write_kreg_ctxt(const struct qib_devdata *dd, + const u16 regno, unsigned ctxt, + u64 value) +{ + qib_write_kreg(dd, regno + ctxt, value); +} + +static inline u64 read_7322_creg(const struct qib_devdata *dd, u16 regno) +{ + if (!dd->cspec->cregbase || !(dd->flags & QIB_PRESENT)) + return 0; + return readq(&dd->cspec->cregbase[regno]); + + +} + +static inline u32 read_7322_creg32(const struct qib_devdata *dd, u16 regno) +{ + if (!dd->cspec->cregbase || !(dd->flags & QIB_PRESENT)) + return 0; + return readl(&dd->cspec->cregbase[regno]); + + +} + +static inline void write_7322_creg_port(const struct qib_pportdata *ppd, + u16 regno, u64 value) +{ + if (ppd->cpspec && ppd->cpspec->cpregbase && + (ppd->dd->flags & QIB_PRESENT)) + writeq(value, &ppd->cpspec->cpregbase[regno]); +} + +static inline u64 read_7322_creg_port(const struct qib_pportdata *ppd, + u16 regno) +{ + if (!ppd->cpspec || !ppd->cpspec->cpregbase || + !(ppd->dd->flags & QIB_PRESENT)) + return 0; + return readq(&ppd->cpspec->cpregbase[regno]); +} + +static inline u32 read_7322_creg32_port(const struct qib_pportdata *ppd, + u16 regno) +{ + if (!ppd->cpspec || !ppd->cpspec->cpregbase || + !(ppd->dd->flags & QIB_PRESENT)) + return 0; + return readl(&ppd->cpspec->cpregbase[regno]); +} + +/* bits in Control register */ +#define QLOGIC_IB_C_RESET SYM_MASK(Control, SyncReset) +#define QLOGIC_IB_C_SDMAFETCHPRIOEN SYM_MASK(Control, SDmaDescFetchPriorityEn) + +/* bits in general interrupt regs */ +#define QIB_I_RCVURG_LSB SYM_LSB(IntMask, RcvUrg0IntMask) +#define QIB_I_RCVURG_RMASK MASK_ACROSS(0, 17) +#define QIB_I_RCVURG_MASK (QIB_I_RCVURG_RMASK << QIB_I_RCVURG_LSB) +#define QIB_I_RCVAVAIL_LSB SYM_LSB(IntMask, RcvAvail0IntMask) +#define QIB_I_RCVAVAIL_RMASK MASK_ACROSS(0, 17) +#define QIB_I_RCVAVAIL_MASK (QIB_I_RCVAVAIL_RMASK << QIB_I_RCVAVAIL_LSB) +#define QIB_I_C_ERROR INT_MASK(Err) + +#define QIB_I_SPIOSENT (INT_MASK_P(SendDone, 0) | INT_MASK_P(SendDone, 1)) +#define QIB_I_SPIOBUFAVAIL INT_MASK(SendBufAvail) +#define QIB_I_GPIO INT_MASK(AssertGPIO) +#define QIB_I_P_SDMAINT(pidx) \ + (INT_MASK_P(SDma, pidx) | INT_MASK_P(SDmaIdle, pidx) | \ + INT_MASK_P(SDmaProgress, pidx) | \ + INT_MASK_PM(SDmaCleanupDone, pidx)) + +/* Interrupt bits that are "per port" */ +#define QIB_I_P_BITSEXTANT(pidx) \ + (INT_MASK_P(Err, pidx) | INT_MASK_P(SendDone, pidx) | \ + INT_MASK_P(SDma, pidx) | INT_MASK_P(SDmaIdle, pidx) | \ + INT_MASK_P(SDmaProgress, pidx) | \ + INT_MASK_PM(SDmaCleanupDone, pidx)) + +/* Interrupt bits that are common to a device */ +/* currently unused: QIB_I_SPIOSENT */ +#define QIB_I_C_BITSEXTANT \ + (QIB_I_RCVURG_MASK | QIB_I_RCVAVAIL_MASK | \ + QIB_I_SPIOSENT | \ + QIB_I_C_ERROR | QIB_I_SPIOBUFAVAIL | QIB_I_GPIO) + +#define QIB_I_BITSEXTANT (QIB_I_C_BITSEXTANT | \ + QIB_I_P_BITSEXTANT(0) | 
QIB_I_P_BITSEXTANT(1)) + +/* + * Error bits that are "per port". + */ +#define QIB_E_P_IBSTATUSCHANGED ERR_MASK_N(IBStatusChanged) +#define QIB_E_P_SHDR ERR_MASK_N(SHeadersErr) +#define QIB_E_P_VL15_BUF_MISUSE ERR_MASK_N(VL15BufMisuseErr) +#define QIB_E_P_SND_BUF_MISUSE ERR_MASK_N(SendBufMisuseErr) +#define QIB_E_P_SUNSUPVL ERR_MASK_N(SendUnsupportedVLErr) +#define QIB_E_P_SUNEXP_PKTNUM ERR_MASK_N(SendUnexpectedPktNumErr) +#define QIB_E_P_SDROP_DATA ERR_MASK_N(SendDroppedDataPktErr) +#define QIB_E_P_SDROP_SMP ERR_MASK_N(SendDroppedSmpPktErr) +#define QIB_E_P_SPKTLEN ERR_MASK_N(SendPktLenErr) +#define QIB_E_P_SUNDERRUN ERR_MASK_N(SendUnderRunErr) +#define QIB_E_P_SMAXPKTLEN ERR_MASK_N(SendMaxPktLenErr) +#define QIB_E_P_SMINPKTLEN ERR_MASK_N(SendMinPktLenErr) +#define QIB_E_P_RIBLOSTLINK ERR_MASK_N(RcvIBLostLinkErr) +#define QIB_E_P_RHDR ERR_MASK_N(RcvHdrErr) +#define QIB_E_P_RHDRLEN ERR_MASK_N(RcvHdrLenErr) +#define QIB_E_P_RBADTID ERR_MASK_N(RcvBadTidErr) +#define QIB_E_P_RBADVERSION ERR_MASK_N(RcvBadVersionErr) +#define QIB_E_P_RIBFLOW ERR_MASK_N(RcvIBFlowErr) +#define QIB_E_P_REBP ERR_MASK_N(RcvEBPErr) +#define QIB_E_P_RUNSUPVL ERR_MASK_N(RcvUnsupportedVLErr) +#define QIB_E_P_RUNEXPCHAR ERR_MASK_N(RcvUnexpectedCharErr) +#define QIB_E_P_RSHORTPKTLEN ERR_MASK_N(RcvShortPktLenErr) +#define QIB_E_P_RLONGPKTLEN ERR_MASK_N(RcvLongPktLenErr) +#define QIB_E_P_RMAXPKTLEN ERR_MASK_N(RcvMaxPktLenErr) +#define QIB_E_P_RMINPKTLEN ERR_MASK_N(RcvMinPktLenErr) +#define QIB_E_P_RICRC ERR_MASK_N(RcvICRCErr) +#define QIB_E_P_RVCRC ERR_MASK_N(RcvVCRCErr) +#define QIB_E_P_RFORMATERR ERR_MASK_N(RcvFormatErr) + +#define QIB_E_P_SDMA1STDESC ERR_MASK_N(SDma1stDescErr) +#define QIB_E_P_SDMABASE ERR_MASK_N(SDmaBaseErr) +#define QIB_E_P_SDMADESCADDRMISALIGN ERR_MASK_N(SDmaDescAddrMisalignErr) +#define QIB_E_P_SDMADWEN ERR_MASK_N(SDmaDwEnErr) +#define QIB_E_P_SDMAGENMISMATCH ERR_MASK_N(SDmaGenMismatchErr) +#define QIB_E_P_SDMAHALT ERR_MASK_N(SDmaHaltErr) +#define QIB_E_P_SDMAMISSINGDW ERR_MASK_N(SDmaMissingDwErr) +#define QIB_E_P_SDMAOUTOFBOUND ERR_MASK_N(SDmaOutOfBoundErr) +#define QIB_E_P_SDMARPYTAG ERR_MASK_N(SDmaRpyTagErr) +#define QIB_E_P_SDMATAILOUTOFBOUND ERR_MASK_N(SDmaTailOutOfBoundErr) +#define QIB_E_P_SDMAUNEXPDATA ERR_MASK_N(SDmaUnexpDataErr) + +/* Error bits that are common to a device */ +#define QIB_E_RESET ERR_MASK(ResetNegated) +#define QIB_E_HARDWARE ERR_MASK(HardwareErr) +#define QIB_E_INVALIDADDR ERR_MASK(InvalidAddrErr) + + +/* + * Per chip (rather than per-port) errors. Most either do + * nothing but trigger a print (because they self-recover, or + * always occur in tandem with other errors that handle the + * issue), or because they indicate errors with no recovery, + * but we want to know that they happened. + */ +#define QIB_E_SBUF_VL15_MISUSE ERR_MASK(SBufVL15MisUseErr) +#define QIB_E_BADEEP ERR_MASK(InvalidEEPCmd) +#define QIB_E_VLMISMATCH ERR_MASK(SendVLMismatchErr) +#define QIB_E_ARMLAUNCH ERR_MASK(SendArmLaunchErr) +#define QIB_E_SPCLTRIG ERR_MASK(SendSpecialTriggerErr) +#define QIB_E_RRCVHDRFULL ERR_MASK(RcvHdrFullErr) +#define QIB_E_RRCVEGRFULL ERR_MASK(RcvEgrFullErr) +#define QIB_E_RCVCTXTSHARE ERR_MASK(RcvContextShareErr) + +/* SDMA chip errors (not per port) + * QIB_E_SDMA_BUF_DUP needs no special handling, because we will also get + * the SDMAHALT error immediately, so we just print the dup error via the + * E_AUTO mechanism. This is true of most of the per-port fatal errors + * as well, but since this is port-independent, by definition, it's + * handled a bit differently. 
SDMA_VL15 and SDMA_WRONG_PORT are per + * packet send errors, and so are handled in the same manner as other + * per-packet errors. + */ +#define QIB_E_SDMA_VL15 ERR_MASK(SDmaVL15Err) +#define QIB_E_SDMA_WRONG_PORT ERR_MASK(SDmaWrongPortErr) +#define QIB_E_SDMA_BUF_DUP ERR_MASK(SDmaBufMaskDuplicateErr) + +/* + * Below functionally equivalent to legacy QLOGIC_IB_E_PKTERRS + * it is used to print "common" packet errors. + */ +#define QIB_E_P_PKTERRS (QIB_E_P_SPKTLEN |\ + QIB_E_P_SDROP_DATA | QIB_E_P_RVCRC |\ + QIB_E_P_RICRC | QIB_E_P_RSHORTPKTLEN |\ + QIB_E_P_VL15_BUF_MISUSE | QIB_E_P_SHDR | \ + QIB_E_P_REBP) + +/* Error Bits that Packet-related (Receive, per-port) */ +#define QIB_E_P_RPKTERRS (\ + QIB_E_P_RHDRLEN | QIB_E_P_RBADTID | \ + QIB_E_P_RBADVERSION | QIB_E_P_RHDR | \ + QIB_E_P_RLONGPKTLEN | QIB_E_P_RSHORTPKTLEN |\ + QIB_E_P_RMAXPKTLEN | QIB_E_P_RMINPKTLEN | \ + QIB_E_P_RFORMATERR | QIB_E_P_RUNSUPVL | \ + QIB_E_P_RUNEXPCHAR | QIB_E_P_RIBFLOW | QIB_E_P_REBP) + +/* + * Error bits that are Send-related (per port) + * (ARMLAUNCH excluded from E_SPKTERRS because it gets special handling). + * All of these potentially need to have a buffer disarmed + */ +#define QIB_E_P_SPKTERRS (\ + QIB_E_P_SUNEXP_PKTNUM |\ + QIB_E_P_SDROP_DATA | QIB_E_P_SDROP_SMP |\ + QIB_E_P_SMAXPKTLEN |\ + QIB_E_P_VL15_BUF_MISUSE | QIB_E_P_SHDR | \ + QIB_E_P_SMINPKTLEN | QIB_E_P_SPKTLEN | \ + QIB_E_P_SND_BUF_MISUSE | QIB_E_P_SUNSUPVL) + +#define QIB_E_SPKTERRS ( \ + QIB_E_SBUF_VL15_MISUSE | QIB_E_VLMISMATCH | \ + ERR_MASK_N(SendUnsupportedVLErr) | \ + QIB_E_SPCLTRIG | QIB_E_SDMA_VL15 | QIB_E_SDMA_WRONG_PORT) + +#define QIB_E_P_SDMAERRS ( \ + QIB_E_P_SDMAHALT | \ + QIB_E_P_SDMADESCADDRMISALIGN | \ + QIB_E_P_SDMAUNEXPDATA | \ + QIB_E_P_SDMAMISSINGDW | \ + QIB_E_P_SDMADWEN | \ + QIB_E_P_SDMARPYTAG | \ + QIB_E_P_SDMA1STDESC | \ + QIB_E_P_SDMABASE | \ + QIB_E_P_SDMATAILOUTOFBOUND | \ + QIB_E_P_SDMAOUTOFBOUND | \ + QIB_E_P_SDMAGENMISMATCH) + +/* + * This sets some bits more than once, but makes it more obvious which + * bits are not handled under other categories, and the repeat definition + * is not a problem. + */ +#define QIB_E_P_BITSEXTANT ( \ + QIB_E_P_SPKTERRS | QIB_E_P_PKTERRS | QIB_E_P_RPKTERRS | \ + QIB_E_P_RIBLOSTLINK | QIB_E_P_IBSTATUSCHANGED | \ + QIB_E_P_SND_BUF_MISUSE | QIB_E_P_SUNDERRUN | \ + QIB_E_P_SHDR | QIB_E_P_VL15_BUF_MISUSE | QIB_E_P_SDMAERRS \ + ) + +/* + * These are errors that can occur when the link + * changes state while a packet is being sent or received. This doesn't + * cover things like EBP or VCRC that can be the result of a sending + * having the link change state, so we receive a "known bad" packet. + * All of these are "per port", so renamed: + */ +#define QIB_E_P_LINK_PKTERRS (\ + QIB_E_P_SDROP_DATA | QIB_E_P_SDROP_SMP |\ + QIB_E_P_SMINPKTLEN | QIB_E_P_SPKTLEN |\ + QIB_E_P_RSHORTPKTLEN | QIB_E_P_RMINPKTLEN |\ + QIB_E_P_RUNEXPCHAR) + +/* + * This sets some bits more than once, but makes it more obvious which + * bits are not handled under other categories (such as QIB_E_SPKTERRS), + * and the repeat definition is not a problem. 
+ */ +#define QIB_E_C_BITSEXTANT (\ + QIB_E_HARDWARE | QIB_E_INVALIDADDR | QIB_E_BADEEP |\ + QIB_E_ARMLAUNCH | QIB_E_VLMISMATCH | QIB_E_RRCVHDRFULL |\ + QIB_E_RRCVEGRFULL | QIB_E_RESET | QIB_E_SBUF_VL15_MISUSE) + +/* Likewise Neuter E_SPKT_ERRS_IGNORE */ +#define E_SPKT_ERRS_IGNORE 0 + +#define QIB_EXTS_MEMBIST_DISABLED \ + SYM_MASK(EXTStatus, MemBISTDisabled) +#define QIB_EXTS_MEMBIST_ENDTEST \ + SYM_MASK(EXTStatus, MemBISTEndTest) + +#define QIB_E_SPIOARMLAUNCH \ + ERR_MASK(SendArmLaunchErr) + +#define IBA7322_IBCC_LINKINITCMD_MASK SYM_RMASK(IBCCtrlA_0, LinkInitCmd) +#define IBA7322_IBCC_LINKCMD_SHIFT SYM_LSB(IBCCtrlA_0, LinkCmd) + +/* + * IBTA_1_2 is set when multiple speeds are enabled (normal), + * and also if forced QDR (only QDR enabled). It's enabled for the + * forced QDR case so that scrambling will be enabled by the TS3 + * exchange, when supported by both sides of the link. + */ +#define IBA7322_IBC_IBTA_1_2_MASK SYM_MASK(IBCCtrlB_0, IB_ENHANCED_MODE) +#define IBA7322_IBC_MAX_SPEED_MASK SYM_MASK(IBCCtrlB_0, SD_SPEED) +#define IBA7322_IBC_SPEED_QDR SYM_MASK(IBCCtrlB_0, SD_SPEED_QDR) +#define IBA7322_IBC_SPEED_DDR SYM_MASK(IBCCtrlB_0, SD_SPEED_DDR) +#define IBA7322_IBC_SPEED_SDR SYM_MASK(IBCCtrlB_0, SD_SPEED_SDR) +#define IBA7322_IBC_SPEED_MASK (SYM_MASK(IBCCtrlB_0, SD_SPEED_SDR) | \ + SYM_MASK(IBCCtrlB_0, SD_SPEED_DDR) | SYM_MASK(IBCCtrlB_0, SD_SPEED_QDR)) +#define IBA7322_IBC_SPEED_LSB SYM_LSB(IBCCtrlB_0, SD_SPEED_SDR) + +#define IBA7322_LEDBLINK_OFF_SHIFT SYM_LSB(RcvPktLEDCnt_0, OFFperiod) +#define IBA7322_LEDBLINK_ON_SHIFT SYM_LSB(RcvPktLEDCnt_0, ONperiod) + +#define IBA7322_IBC_WIDTH_AUTONEG SYM_MASK(IBCCtrlB_0, IB_NUM_CHANNELS) +#define IBA7322_IBC_WIDTH_4X_ONLY (1<> \ + SYM_LSB(IBCCtrlB_0, HRTBT_ENB)) +#define IBA7322_IBC_HRTBT_LSB SYM_LSB(IBCCtrlB_0, HRTBT_ENB) + +#define IBA7322_REDIRECT_VEC_PER_REG 12 + +#define IBA7322_SENDCHK_PKEY SYM_MASK(SendCheckControl_0, PKey_En) +#define IBA7322_SENDCHK_BTHQP SYM_MASK(SendCheckControl_0, BTHQP_En) +#define IBA7322_SENDCHK_SLID SYM_MASK(SendCheckControl_0, SLID_En) +#define IBA7322_SENDCHK_RAW_IPV6 SYM_MASK(SendCheckControl_0, RawIPV6_En) +#define IBA7322_SENDCHK_MINSZ SYM_MASK(SendCheckControl_0, PacketTooSmall_En) + +#define AUTONEG_TRIES 3 /* sequential retries to negotiate DDR */ + +#define HWE_AUTO(fldname) { .mask = SYM_MASK(HwErrMask, fldname##Mask), \ + .msg = #fldname , .sz = sizeof(#fldname) } +#define HWE_AUTO_P(fldname, port) { .mask = SYM_MASK(HwErrMask, \ + fldname##Mask##_##port), .msg = #fldname , .sz = sizeof(#fldname) } +static const struct qib_hwerror_msgs qib_7322_hwerror_msgs[] = { + HWE_AUTO_P(IBSerdesPClkNotDetect, 1), + HWE_AUTO_P(IBSerdesPClkNotDetect, 0), + HWE_AUTO(PCIESerdesPClkNotDetect), + HWE_AUTO(PowerOnBISTFailed), + HWE_AUTO(TempsenseTholdReached), + HWE_AUTO(MemoryErr), + HWE_AUTO(PCIeBusParityErr), + HWE_AUTO(PcieCplTimeout), + HWE_AUTO(PciePoisonedTLP), + HWE_AUTO_P(SDmaMemReadErr, 1), + HWE_AUTO_P(SDmaMemReadErr, 0), + HWE_AUTO_P(IBCBusFromSPCParityErr, 1), + HWE_AUTO_P(IBCBusToSPCParityErr, 1), + HWE_AUTO_P(IBCBusFromSPCParityErr, 0), + HWE_AUTO(statusValidNoEop), + HWE_AUTO(LATriggered), + { .mask = 0, .sz = 0 } +}; + +#define E_AUTO(fldname) { .mask = SYM_MASK(ErrMask, fldname##Mask), \ + .msg = #fldname, .sz = sizeof(#fldname) } +#define E_P_AUTO(fldname) { .mask = SYM_MASK(ErrMask_0, fldname##Mask), \ + .msg = #fldname, .sz = sizeof(#fldname) } +static const struct qib_hwerror_msgs qib_7322error_msgs[] = { + E_AUTO(RcvEgrFullErr), + E_AUTO(RcvHdrFullErr), + E_AUTO(ResetNegated), + 
E_AUTO(HardwareErr), + E_AUTO(InvalidAddrErr), + E_AUTO(SDmaVL15Err), + E_AUTO(SBufVL15MisUseErr), + E_AUTO(InvalidEEPCmd), + E_AUTO(RcvContextShareErr), + E_AUTO(SendVLMismatchErr), + E_AUTO(SendArmLaunchErr), + E_AUTO(SendSpecialTriggerErr), + E_AUTO(SDmaWrongPortErr), + E_AUTO(SDmaBufMaskDuplicateErr), + { .mask = 0, .sz = 0 } +}; + +static const struct qib_hwerror_msgs qib_7322p_error_msgs[] = { + E_P_AUTO(IBStatusChanged), + E_P_AUTO(SHeadersErr), + E_P_AUTO(VL15BufMisuseErr), + /* + * SDmaHaltErr is not really an error, make it clearer; + */ + {.mask = SYM_MASK(ErrMask_0, SDmaHaltErrMask), .msg = "SDmaHalted", + .sz = 11}, + E_P_AUTO(SDmaDescAddrMisalignErr), + E_P_AUTO(SDmaUnexpDataErr), + E_P_AUTO(SDmaMissingDwErr), + E_P_AUTO(SDmaDwEnErr), + E_P_AUTO(SDmaRpyTagErr), + E_P_AUTO(SDma1stDescErr), + E_P_AUTO(SDmaBaseErr), + E_P_AUTO(SDmaTailOutOfBoundErr), + E_P_AUTO(SDmaOutOfBoundErr), + E_P_AUTO(SDmaGenMismatchErr), + E_P_AUTO(SendBufMisuseErr), + E_P_AUTO(SendUnsupportedVLErr), + E_P_AUTO(SendUnexpectedPktNumErr), + E_P_AUTO(SendDroppedDataPktErr), + E_P_AUTO(SendDroppedSmpPktErr), + E_P_AUTO(SendPktLenErr), + E_P_AUTO(SendUnderRunErr), + E_P_AUTO(SendMaxPktLenErr), + E_P_AUTO(SendMinPktLenErr), + E_P_AUTO(RcvIBLostLinkErr), + E_P_AUTO(RcvHdrErr), + E_P_AUTO(RcvHdrLenErr), + E_P_AUTO(RcvBadTidErr), + E_P_AUTO(RcvBadVersionErr), + E_P_AUTO(RcvIBFlowErr), + E_P_AUTO(RcvEBPErr), + E_P_AUTO(RcvUnsupportedVLErr), + E_P_AUTO(RcvUnexpectedCharErr), + E_P_AUTO(RcvShortPktLenErr), + E_P_AUTO(RcvLongPktLenErr), + E_P_AUTO(RcvMaxPktLenErr), + E_P_AUTO(RcvMinPktLenErr), + E_P_AUTO(RcvICRCErr), + E_P_AUTO(RcvVCRCErr), + E_P_AUTO(RcvFormatErr), + { .mask = 0, .sz = 0 } +}; + +/* + * Below generates "auto-message" for interrupts not specific to any port or + * context + */ +#define INTR_AUTO(fldname) { .mask = SYM_MASK(IntMask, fldname##Mask), \ + .msg = #fldname, .sz = sizeof(#fldname) } +/* Below generates "auto-message" for interrupts specific to a port */ +#define INTR_AUTO_P(fldname) { .mask = MASK_ACROSS(\ + SYM_LSB(IntMask, fldname##Mask##_0), \ + SYM_LSB(IntMask, fldname##Mask##_1)), \ + .msg = #fldname "_P", .sz = sizeof(#fldname "_P") } +/* For some reason, the SerDesTrimDone bits are reversed */ +#define INTR_AUTO_PI(fldname) { .mask = MASK_ACROSS(\ + SYM_LSB(IntMask, fldname##Mask##_1), \ + SYM_LSB(IntMask, fldname##Mask##_0)), \ + .msg = #fldname "_P", .sz = sizeof(#fldname "_P") } +/* + * Below generates "auto-message" for interrupts specific to a context, + * with ctxt-number appended + */ +#define INTR_AUTO_C(fldname) { .mask = MASK_ACROSS(\ + SYM_LSB(IntMask, fldname##0IntMask), \ + SYM_LSB(IntMask, fldname##17IntMask)), \ + .msg = #fldname "_C", .sz = sizeof(#fldname "_C") } + +static const struct qib_hwerror_msgs qib_7322_intr_msgs[] = { + INTR_AUTO_P(SDmaInt), + INTR_AUTO_P(SDmaProgressInt), + INTR_AUTO_P(SDmaIdleInt), + INTR_AUTO_P(SDmaCleanupDone), + INTR_AUTO_C(RcvUrg), + INTR_AUTO_P(ErrInt), + INTR_AUTO(ErrInt), /* non-port-specific errs */ + INTR_AUTO(AssertGPIOInt), + INTR_AUTO_P(SendDoneInt), + INTR_AUTO(SendBufAvailInt), + INTR_AUTO_C(RcvAvail), + { .mask = 0, .sz = 0 } +}; + +#define TXSYMPTOM_AUTO_P(fldname) \ + { .mask = SYM_MASK(SendHdrErrSymptom_0, fldname), \ + .msg = #fldname, .sz = sizeof(#fldname) } +static const struct qib_hwerror_msgs hdrchk_msgs[] = { + TXSYMPTOM_AUTO_P(NonKeyPacket), + TXSYMPTOM_AUTO_P(GRHFail), + TXSYMPTOM_AUTO_P(PkeyFail), + TXSYMPTOM_AUTO_P(QPFail), + TXSYMPTOM_AUTO_P(SLIDFail), + TXSYMPTOM_AUTO_P(RawIPV6), + 
TXSYMPTOM_AUTO_P(PacketTooSmall), + { .mask = 0, .sz = 0 } +}; + +#define IBA7322_HDRHEAD_PKTINT_SHIFT 32 /* interrupt cnt in upper 32 bits */ + +/* + * Called when we might have an error that is specific to a particular + * PIO buffer, and may need to cancel that buffer, so it can be re-used, + * because we don't need to force the update of pioavail + */ +static void qib_disarm_7322_senderrbufs(struct qib_pportdata *ppd) +{ + struct qib_devdata *dd = ppd->dd; + u32 i; + int any; + u32 piobcnt = dd->piobcnt2k + dd->piobcnt4k + NUM_VL15_BUFS; + u32 regcnt = (piobcnt + BITS_PER_LONG - 1) / BITS_PER_LONG; + unsigned long sbuf[4]; + + /* + * It's possible that sendbuffererror could have bits set; might + * have already done this as a result of hardware error handling. + */ + any = 0; + for (i = 0; i < regcnt; ++i) { + sbuf[i] = qib_read_kreg64(dd, kr_sendbuffererror + i); + if (sbuf[i]) { + any = 1; + qib_write_kreg(dd, kr_sendbuffererror + i, sbuf[i]); + } + } + + if (any) + qib_disarm_piobufs_set(dd, sbuf, piobcnt); +} + +/* No txe_recover yet, if ever */ + +/* No decode__errors yet */ +static void err_decode(char *msg, size_t len, u64 errs, + const struct qib_hwerror_msgs *msp) +{ + u64 these, lmask; + int took, multi, n = 0; + + while (errs && msp && msp->mask) { + multi = (msp->mask & (msp->mask - 1)); + while (errs & msp->mask) { + these = (errs & msp->mask); + lmask = (these & (these - 1)) ^ these; + if (len) { + if (n++) { + /* separate the strings */ + *msg++ = ','; + len--; + } + BUG_ON(!msp->sz); + /* msp->sz counts the nul */ + took = min_t(size_t, msp->sz - (size_t)1, len); + memcpy(msg, msp->msg, took); + len -= took; + msg += took; + if (len) + *msg = '\0'; + } + errs &= ~lmask; + if (len && multi) { + /* More than one bit this mask */ + int idx = -1; + + while (lmask & msp->mask) { + ++idx; + lmask >>= 1; + } + took = scnprintf(msg, len, "_%d", idx); + len -= took; + msg += took; + } + } + ++msp; + } + /* If some bits are left, show in hex. */ + if (len && errs) + snprintf(msg, len, "%sMORE:%llX", n ? "," : "", + (unsigned long long) errs); +} + +/* only called if r1 set */ +static void flush_fifo(struct qib_pportdata *ppd) +{ + struct qib_devdata *dd = ppd->dd; + u32 __iomem *piobuf; + u32 bufn; + u32 *hdr; + u64 pbc; + const unsigned hdrwords = 7; + static struct qib_ib_header ibhdr = { + .lrh[0] = cpu_to_be16(0xF000 | QIB_LRH_BTH), + .lrh[1] = IB_LID_PERMISSIVE, + .lrh[2] = cpu_to_be16(hdrwords + SIZE_OF_CRC), + .lrh[3] = IB_LID_PERMISSIVE, + .u.oth.bth[0] = cpu_to_be32( + (IB_OPCODE_UD_SEND_ONLY << 24) | QIB_DEFAULT_P_KEY), + .u.oth.bth[1] = cpu_to_be32(0), + .u.oth.bth[2] = cpu_to_be32(0), + .u.oth.u.ud.deth[0] = cpu_to_be32(0), + .u.oth.u.ud.deth[1] = cpu_to_be32(0), + }; + + /* + * Send a dummy VL15 packet to flush the launch FIFO. + * This will not actually be sent since the TxeBypassIbc bit is set. + */ + pbc = PBC_7322_VL15_SEND | + (((u64)ppd->hw_pidx) << (PBC_PORT_SEL_LSB + 32)) | + (hdrwords + SIZE_OF_CRC); + piobuf = qib_7322_getsendbuf(ppd, pbc, &bufn); + if (!piobuf) + return; + writeq(pbc, piobuf); + hdr = (u32 *) &ibhdr; + if (dd->flags & QIB_PIO_FLUSH_WC) { + qib_flush_wc(); + qib_pio_copy(piobuf + 2, hdr, hdrwords - 1); + qib_flush_wc(); + __raw_writel(hdr[hdrwords - 1], piobuf + hdrwords + 1); + qib_flush_wc(); + } else + qib_pio_copy(piobuf + 2, hdr, hdrwords); + qib_sendbuf_done(dd, bufn); +} + +/* + * This is called with interrupts disabled and sdma_lock held. 
+ */ +static void qib_7322_sdma_sendctrl(struct qib_pportdata *ppd, unsigned op) +{ + struct qib_devdata *dd = ppd->dd; + u64 set_sendctrl = 0; + u64 clr_sendctrl = 0; + + if (op & QIB_SDMA_SENDCTRL_OP_ENABLE) + set_sendctrl |= SYM_MASK(SendCtrl_0, SDmaEnable); + else + clr_sendctrl |= SYM_MASK(SendCtrl_0, SDmaEnable); + + if (op & QIB_SDMA_SENDCTRL_OP_INTENABLE) + set_sendctrl |= SYM_MASK(SendCtrl_0, SDmaIntEnable); + else + clr_sendctrl |= SYM_MASK(SendCtrl_0, SDmaIntEnable); + + if (op & QIB_SDMA_SENDCTRL_OP_HALT) + set_sendctrl |= SYM_MASK(SendCtrl_0, SDmaHalt); + else + clr_sendctrl |= SYM_MASK(SendCtrl_0, SDmaHalt); + + if (op & QIB_SDMA_SENDCTRL_OP_DRAIN) + set_sendctrl |= SYM_MASK(SendCtrl_0, TxeBypassIbc) | + SYM_MASK(SendCtrl_0, TxeAbortIbc) | + SYM_MASK(SendCtrl_0, TxeDrainRmFifo); + else + clr_sendctrl |= SYM_MASK(SendCtrl_0, TxeBypassIbc) | + SYM_MASK(SendCtrl_0, TxeAbortIbc) | + SYM_MASK(SendCtrl_0, TxeDrainRmFifo); + + spin_lock(&dd->sendctrl_lock); + + /* If we are draining everything, block sends first */ + if (op & QIB_SDMA_SENDCTRL_OP_DRAIN) { + ppd->p_sendctrl &= ~SYM_MASK(SendCtrl_0, SendEnable); + qib_write_kreg_port(ppd, krp_sendctrl, ppd->p_sendctrl); + qib_write_kreg(dd, kr_scratch, 0); + } + + ppd->p_sendctrl |= set_sendctrl; + ppd->p_sendctrl &= ~clr_sendctrl; + + if (op & QIB_SDMA_SENDCTRL_OP_CLEANUP) + qib_write_kreg_port(ppd, krp_sendctrl, + ppd->p_sendctrl | + SYM_MASK(SendCtrl_0, SDmaCleanup)); + else + qib_write_kreg_port(ppd, krp_sendctrl, ppd->p_sendctrl); + qib_write_kreg(dd, kr_scratch, 0); + + if (op & QIB_SDMA_SENDCTRL_OP_DRAIN) { + ppd->p_sendctrl |= SYM_MASK(SendCtrl_0, SendEnable); + qib_write_kreg_port(ppd, krp_sendctrl, ppd->p_sendctrl); + qib_write_kreg(dd, kr_scratch, 0); + } + + spin_unlock(&dd->sendctrl_lock); + + if ((op & QIB_SDMA_SENDCTRL_OP_DRAIN) && ppd->dd->cspec->r1) + flush_fifo(ppd); +} + +static void qib_7322_sdma_hw_clean_up(struct qib_pportdata *ppd) +{ + __qib_sdma_process_event(ppd, qib_sdma_event_e50_hw_cleaned); +} + +static void qib_sdma_7322_setlengen(struct qib_pportdata *ppd) +{ + /* + * Set SendDmaLenGen and clear and set + * the MSB of the generation count to enable generation checking + * and load the internal generation counter. + */ + qib_write_kreg_port(ppd, krp_senddmalengen, ppd->sdma_descq_cnt); + qib_write_kreg_port(ppd, krp_senddmalengen, + ppd->sdma_descq_cnt | + (1ULL << QIB_7322_SendDmaLenGen_0_Generation_MSB)); +} + +/* + * Must be called with sdma_lock held, or before init finished. + */ +static void qib_sdma_update_7322_tail(struct qib_pportdata *ppd, u16 tail) +{ + /* Commit writes to memory and advance the tail on the chip */ + wmb(); + ppd->sdma_descq_tail = tail; + qib_write_kreg_port(ppd, krp_senddmatail, tail); +} + +/* + * This is called with interrupts disabled and sdma_lock held. + */ +static void qib_7322_sdma_hw_start_up(struct qib_pportdata *ppd) +{ + /* + * Drain all FIFOs. + * The hardware doesn't require this but we do it so that verbs + * and user applications don't wait for link active to send stale + * data. 
+ */ + sendctrl_7322_mod(ppd, QIB_SENDCTRL_FLUSH); + + qib_sdma_7322_setlengen(ppd); + qib_sdma_update_7322_tail(ppd, 0); /* Set SendDmaTail */ + ppd->sdma_head_dma[0] = 0; + qib_7322_sdma_sendctrl(ppd, + ppd->sdma_state.current_op | QIB_SDMA_SENDCTRL_OP_CLEANUP); +} + +#define DISABLES_SDMA ( \ + QIB_E_P_SDMAHALT | \ + QIB_E_P_SDMADESCADDRMISALIGN | \ + QIB_E_P_SDMAMISSINGDW | \ + QIB_E_P_SDMADWEN | \ + QIB_E_P_SDMARPYTAG | \ + QIB_E_P_SDMA1STDESC | \ + QIB_E_P_SDMABASE | \ + QIB_E_P_SDMATAILOUTOFBOUND | \ + QIB_E_P_SDMAOUTOFBOUND | \ + QIB_E_P_SDMAGENMISMATCH) + +static void sdma_7322_p_errors(struct qib_pportdata *ppd, u64 errs) +{ + unsigned long flags; + struct qib_devdata *dd = ppd->dd; + + errs &= QIB_E_P_SDMAERRS; + err_decode(ppd->cpspec->sdmamsgbuf, sizeof(ppd->cpspec->sdmamsgbuf), + errs, qib_7322p_error_msgs); + + if (errs & QIB_E_P_SDMAUNEXPDATA) + qib_dev_err(dd, "IB%u:%u SDmaUnexpData\n", dd->unit, + ppd->port); + + spin_lock_irqsave(&ppd->sdma_lock, flags); + + if (errs != QIB_E_P_SDMAHALT) { + /* SDMA errors have QIB_E_P_SDMAHALT and another bit set */ + qib_dev_porterr(dd, ppd->port, + "SDMA %s 0x%016llx %s\n", + qib_sdma_state_names[ppd->sdma_state.current_state], + errs, ppd->cpspec->sdmamsgbuf); + dump_sdma_7322_state(ppd); + } + + switch (ppd->sdma_state.current_state) { + case qib_sdma_state_s00_hw_down: + break; + + case qib_sdma_state_s10_hw_start_up_wait: + if (errs & QIB_E_P_SDMAHALT) + __qib_sdma_process_event(ppd, + qib_sdma_event_e20_hw_started); + break; + + case qib_sdma_state_s20_idle: + break; + + case qib_sdma_state_s30_sw_clean_up_wait: + break; + + case qib_sdma_state_s40_hw_clean_up_wait: + if (errs & QIB_E_P_SDMAHALT) + __qib_sdma_process_event(ppd, + qib_sdma_event_e50_hw_cleaned); + break; + + case qib_sdma_state_s50_hw_halt_wait: + if (errs & QIB_E_P_SDMAHALT) + __qib_sdma_process_event(ppd, + qib_sdma_event_e60_hw_halted); + break; + + case qib_sdma_state_s99_running: + __qib_sdma_process_event(ppd, qib_sdma_event_e7322_err_halted); + __qib_sdma_process_event(ppd, qib_sdma_event_e60_hw_halted); + break; + } + + spin_unlock_irqrestore(&ppd->sdma_lock, flags); +} + +/* + * handle per-device errors (not per-port errors) + */ +static noinline void handle_7322_errors(struct qib_devdata *dd) +{ + char *msg; + u64 iserr = 0; + u64 errs; + u64 mask; + int log_idx; + + qib_stats.sps_errints++; + errs = qib_read_kreg64(dd, kr_errstatus); + if (!errs) { + qib_devinfo(dd->pcidev, + "device error interrupt, but no error bits set!\n"); + goto done; + } + + /* don't report errors that are masked */ + errs &= dd->cspec->errormask; + msg = dd->cspec->emsgbuf; + + /* do these first, they are most important */ + if (errs & QIB_E_HARDWARE) { + *msg = '\0'; + qib_7322_handle_hwerrors(dd, msg, sizeof(dd->cspec->emsgbuf)); + } else + for (log_idx = 0; log_idx < QIB_EEP_LOG_CNT; ++log_idx) + if (errs & dd->eep_st_masks[log_idx].errs_to_log) + qib_inc_eeprom_err(dd, log_idx, 1); + + if (errs & QIB_E_SPKTERRS) { + qib_disarm_7322_senderrbufs(dd->pport); + qib_stats.sps_txerrs++; + } else if (errs & QIB_E_INVALIDADDR) + qib_stats.sps_txerrs++; + else if (errs & QIB_E_ARMLAUNCH) { + qib_stats.sps_txerrs++; + qib_disarm_7322_senderrbufs(dd->pport); + } + qib_write_kreg(dd, kr_errclear, errs); + + /* + * The ones we mask off are handled specially below + * or above. Also mask SDMADISABLED by default as it + * is too chatty. 
+ */ + mask = QIB_E_HARDWARE; + *msg = '\0'; + + err_decode(msg, sizeof(dd->cspec->emsgbuf), errs & ~mask, + qib_7322error_msgs); + + /* + * Getting reset is a tragedy for all ports. Mark the device + * _and_ the ports as "offline" in way meaningful to each. + */ + if (errs & QIB_E_RESET) { + int pidx; + + qib_dev_err(dd, + "Got reset, requires re-init (unload and reload driver)\n"); + dd->flags &= ~QIB_INITTED; /* needs re-init */ + /* mark as having had error */ + *dd->devstatusp |= QIB_STATUS_HWERROR; + for (pidx = 0; pidx < dd->num_pports; ++pidx) + if (dd->pport[pidx].link_speed_supported) + *dd->pport[pidx].statusp &= ~QIB_STATUS_IB_CONF; + } + + if (*msg && iserr) + qib_dev_err(dd, "%s error\n", msg); + + /* + * If there were hdrq or egrfull errors, wake up any processes + * waiting in poll. We used to try to check which contexts had + * the overflow, but given the cost of that and the chip reads + * to support it, it's better to just wake everybody up if we + * get an overflow; waiters can poll again if it's not them. + */ + if (errs & (ERR_MASK(RcvEgrFullErr) | ERR_MASK(RcvHdrFullErr))) { + qib_handle_urcv(dd, ~0U); + if (errs & ERR_MASK(RcvEgrFullErr)) + qib_stats.sps_buffull++; + else + qib_stats.sps_hdrfull++; + } + +done: + return; +} + +static void qib_error_tasklet(unsigned long data) +{ + struct qib_devdata *dd = (struct qib_devdata *)data; + + handle_7322_errors(dd); + qib_write_kreg(dd, kr_errmask, dd->cspec->errormask); +} + +static void reenable_chase(unsigned long opaque) +{ + struct qib_pportdata *ppd = (struct qib_pportdata *)opaque; + + ppd->cpspec->chase_timer.expires = 0; + qib_set_ib_7322_lstate(ppd, QLOGIC_IB_IBCC_LINKCMD_DOWN, + QLOGIC_IB_IBCC_LINKINITCMD_POLL); +} + +static void disable_chase(struct qib_pportdata *ppd, unsigned long tnow, + u8 ibclt) +{ + ppd->cpspec->chase_end = 0; + + if (!qib_chase) + return; + + qib_set_ib_7322_lstate(ppd, QLOGIC_IB_IBCC_LINKCMD_DOWN, + QLOGIC_IB_IBCC_LINKINITCMD_DISABLE); + ppd->cpspec->chase_timer.expires = jiffies + QIB_CHASE_DIS_TIME; + add_timer(&ppd->cpspec->chase_timer); +} + +static void handle_serdes_issues(struct qib_pportdata *ppd, u64 ibcst) +{ + u8 ibclt; + unsigned long tnow; + + ibclt = (u8)SYM_FIELD(ibcst, IBCStatusA_0, LinkTrainingState); + + /* + * Detect and handle the state chase issue, where we can + * get stuck if we are unlucky on timing on both sides of + * the link. If we are, we disable, set a timer, and + * then re-enable. 
+ */
+        switch (ibclt) {
+        case IB_7322_LT_STATE_CFGRCVFCFG:
+        case IB_7322_LT_STATE_CFGWAITRMT:
+        case IB_7322_LT_STATE_TXREVLANES:
+        case IB_7322_LT_STATE_CFGENH:
+                tnow = jiffies;
+                if (ppd->cpspec->chase_end &&
+                    time_after(tnow, ppd->cpspec->chase_end))
+                        disable_chase(ppd, tnow, ibclt);
+                else if (!ppd->cpspec->chase_end)
+                        ppd->cpspec->chase_end = tnow + QIB_CHASE_TIME;
+                break;
+        default:
+                ppd->cpspec->chase_end = 0;
+                break;
+        }
+
+        if (((ibclt >= IB_7322_LT_STATE_CFGTEST &&
+              ibclt <= IB_7322_LT_STATE_CFGWAITENH) ||
+              ibclt == IB_7322_LT_STATE_LINKUP) &&
+              (ibcst & SYM_MASK(IBCStatusA_0, LinkSpeedQDR))) {
+                force_h1(ppd);
+                ppd->cpspec->qdr_reforce = 1;
+                if (!ppd->dd->cspec->r1)
+                        serdes_7322_los_enable(ppd, 0);
+        } else if (ppd->cpspec->qdr_reforce &&
+                (ibcst & SYM_MASK(IBCStatusA_0, LinkSpeedQDR)) &&
+                 (ibclt == IB_7322_LT_STATE_CFGENH ||
+                ibclt == IB_7322_LT_STATE_CFGIDLE ||
+                ibclt == IB_7322_LT_STATE_LINKUP))
+                force_h1(ppd);
+
+        if ((IS_QMH(ppd->dd) || IS_QME(ppd->dd)) &&
+            ppd->link_speed_enabled == QIB_IB_QDR &&
+            (ibclt == IB_7322_LT_STATE_CFGTEST ||
+             ibclt == IB_7322_LT_STATE_CFGENH ||
+             (ibclt >= IB_7322_LT_STATE_POLLACTIVE &&
+              ibclt <= IB_7322_LT_STATE_SLEEPQUIET)))
+                adj_tx_serdes(ppd);
+
+        if (ibclt != IB_7322_LT_STATE_LINKUP) {
+                u8 ltstate = qib_7322_phys_portstate(ibcst);
+                u8 pibclt = (u8)SYM_FIELD(ppd->lastibcstat, IBCStatusA_0,
+                                          LinkTrainingState);
+                if (!ppd->dd->cspec->r1 &&
+                    pibclt == IB_7322_LT_STATE_LINKUP &&
+                    ltstate != IB_PHYSPORTSTATE_LINK_ERR_RECOVER &&
+                    ltstate != IB_PHYSPORTSTATE_RECOVERY_RETRAIN &&
+                    ltstate != IB_PHYSPORTSTATE_RECOVERY_WAITRMT &&
+                    ltstate != IB_PHYSPORTSTATE_RECOVERY_IDLE)
+                        /* If the link went down (but not into recovery),
+                         * turn LOS back on */
+                        serdes_7322_los_enable(ppd, 1);
+                if (!ppd->cpspec->qdr_dfe_on &&
+                    ibclt <= IB_7322_LT_STATE_SLEEPQUIET) {
+                        ppd->cpspec->qdr_dfe_on = 1;
+                        ppd->cpspec->qdr_dfe_time = 0;
+                        /* On link down, reenable QDR adaptation */
+                        qib_write_kreg_port(ppd, krp_static_adapt_dis(2),
+                                            ppd->dd->cspec->r1 ?
+                                            QDR_STATIC_ADAPT_DOWN_R1 :
+                                            QDR_STATIC_ADAPT_DOWN);
+                        pr_info(
+                                "IB%u:%u re-enabled QDR adaptation ibclt %x\n",
+                                ppd->dd->unit, ppd->port, ibclt);
+                }
+        }
+}
+
+static int qib_7322_set_ib_cfg(struct qib_pportdata *, int, u32);
+
+/*
+ * This is per-pport error handling.
+ * It will likely get its own MSIx interrupt (one for each port,
+ * although just a single handler).
+ */ +static noinline void handle_7322_p_errors(struct qib_pportdata *ppd) +{ + char *msg; + u64 ignore_this_time = 0, iserr = 0, errs, fmask; + struct qib_devdata *dd = ppd->dd; + + /* do this as soon as possible */ + fmask = qib_read_kreg64(dd, kr_act_fmask); + if (!fmask) + check_7322_rxe_status(ppd); + + errs = qib_read_kreg_port(ppd, krp_errstatus); + if (!errs) + qib_devinfo(dd->pcidev, + "Port%d error interrupt, but no error bits set!\n", + ppd->port); + if (!fmask) + errs &= ~QIB_E_P_IBSTATUSCHANGED; + if (!errs) + goto done; + + msg = ppd->cpspec->epmsgbuf; + *msg = '\0'; + + if (errs & ~QIB_E_P_BITSEXTANT) { + err_decode(msg, sizeof(ppd->cpspec->epmsgbuf), + errs & ~QIB_E_P_BITSEXTANT, qib_7322p_error_msgs); + if (!*msg) + snprintf(msg, sizeof(ppd->cpspec->epmsgbuf), + "no others"); + qib_dev_porterr(dd, ppd->port, + "error interrupt with unknown errors 0x%016Lx set (and %s)\n", + (errs & ~QIB_E_P_BITSEXTANT), msg); + *msg = '\0'; + } + + if (errs & QIB_E_P_SHDR) { + u64 symptom; + + /* determine cause, then write to clear */ + symptom = qib_read_kreg_port(ppd, krp_sendhdrsymptom); + qib_write_kreg_port(ppd, krp_sendhdrsymptom, 0); + err_decode(msg, sizeof(ppd->cpspec->epmsgbuf), symptom, + hdrchk_msgs); + *msg = '\0'; + /* senderrbuf cleared in SPKTERRS below */ + } + + if (errs & QIB_E_P_SPKTERRS) { + if ((errs & QIB_E_P_LINK_PKTERRS) && + !(ppd->lflags & QIBL_LINKACTIVE)) { + /* + * This can happen when trying to bring the link + * up, but the IB link changes state at the "wrong" + * time. The IB logic then complains that the packet + * isn't valid. We don't want to confuse people, so + * we just don't print them, except at debug + */ + err_decode(msg, sizeof(ppd->cpspec->epmsgbuf), + (errs & QIB_E_P_LINK_PKTERRS), + qib_7322p_error_msgs); + *msg = '\0'; + ignore_this_time = errs & QIB_E_P_LINK_PKTERRS; + } + qib_disarm_7322_senderrbufs(ppd); + } else if ((errs & QIB_E_P_LINK_PKTERRS) && + !(ppd->lflags & QIBL_LINKACTIVE)) { + /* + * This can happen when SMA is trying to bring the link + * up, but the IB link changes state at the "wrong" time. + * The IB logic then complains that the packet isn't + * valid. We don't want to confuse people, so we just + * don't print them, except at debug + */ + err_decode(msg, sizeof(ppd->cpspec->epmsgbuf), errs, + qib_7322p_error_msgs); + ignore_this_time = errs & QIB_E_P_LINK_PKTERRS; + *msg = '\0'; + } + + qib_write_kreg_port(ppd, krp_errclear, errs); + + errs &= ~ignore_this_time; + if (!errs) + goto done; + + if (errs & QIB_E_P_RPKTERRS) + qib_stats.sps_rcverrs++; + if (errs & QIB_E_P_SPKTERRS) + qib_stats.sps_txerrs++; + + iserr = errs & ~(QIB_E_P_RPKTERRS | QIB_E_P_PKTERRS); + + if (errs & QIB_E_P_SDMAERRS) + sdma_7322_p_errors(ppd, errs); + + if (errs & QIB_E_P_IBSTATUSCHANGED) { + u64 ibcs; + u8 ltstate; + + ibcs = qib_read_kreg_port(ppd, krp_ibcstatus_a); + ltstate = qib_7322_phys_portstate(ibcs); + + if (!(ppd->lflags & QIBL_IB_AUTONEG_INPROG)) + handle_serdes_issues(ppd, ibcs); + if (!(ppd->cpspec->ibcctrl_a & + SYM_MASK(IBCCtrlA_0, IBStatIntReductionEn))) { + /* + * We got our interrupt, so init code should be + * happy and not try alternatives. Now squelch + * other "chatter" from link-negotiation (pre Init) + */ + ppd->cpspec->ibcctrl_a |= + SYM_MASK(IBCCtrlA_0, IBStatIntReductionEn); + qib_write_kreg_port(ppd, krp_ibcctrl_a, + ppd->cpspec->ibcctrl_a); + } + + /* Update our picture of width and speed from chip */ + ppd->link_width_active = + (ibcs & SYM_MASK(IBCStatusA_0, LinkWidthActive)) ? 
+ IB_WIDTH_4X : IB_WIDTH_1X; + ppd->link_speed_active = (ibcs & SYM_MASK(IBCStatusA_0, + LinkSpeedQDR)) ? QIB_IB_QDR : (ibcs & + SYM_MASK(IBCStatusA_0, LinkSpeedActive)) ? + QIB_IB_DDR : QIB_IB_SDR; + + if ((ppd->lflags & QIBL_IB_LINK_DISABLED) && ltstate != + IB_PHYSPORTSTATE_DISABLED) + qib_set_ib_7322_lstate(ppd, 0, + QLOGIC_IB_IBCC_LINKINITCMD_DISABLE); + else + /* + * Since going into a recovery state causes the link + * state to go down and since recovery is transitory, + * it is better if we "miss" ever seeing the link + * training state go into recovery (i.e., ignore this + * transition for link state special handling purposes) + * without updating lastibcstat. + */ + if (ltstate != IB_PHYSPORTSTATE_LINK_ERR_RECOVER && + ltstate != IB_PHYSPORTSTATE_RECOVERY_RETRAIN && + ltstate != IB_PHYSPORTSTATE_RECOVERY_WAITRMT && + ltstate != IB_PHYSPORTSTATE_RECOVERY_IDLE) + qib_handle_e_ibstatuschanged(ppd, ibcs); + } + if (*msg && iserr) + qib_dev_porterr(dd, ppd->port, "%s error\n", msg); + + if (ppd->state_wanted & ppd->lflags) + wake_up_interruptible(&ppd->state_wait); +done: + return; +} + +/* enable/disable chip from delivering interrupts */ +static void qib_7322_set_intr_state(struct qib_devdata *dd, u32 enable) +{ + if (enable) { + if (dd->flags & QIB_BADINTR) + return; + qib_write_kreg(dd, kr_intmask, dd->cspec->int_enable_mask); + /* cause any pending enabled interrupts to be re-delivered */ + qib_write_kreg(dd, kr_intclear, 0ULL); + if (dd->cspec->num_msix_entries) { + /* and same for MSIx */ + u64 val = qib_read_kreg64(dd, kr_intgranted); + + if (val) + qib_write_kreg(dd, kr_intgranted, val); + } + } else + qib_write_kreg(dd, kr_intmask, 0ULL); +} + +/* + * Try to cleanup as much as possible for anything that might have gone + * wrong while in freeze mode, such as pio buffers being written by user + * processes (causing armlaunch), send errors due to going into freeze mode, + * etc., and try to avoid causing extra interrupts while doing so. + * Forcibly update the in-memory pioavail register copies after cleanup + * because the chip won't do it while in freeze mode (the register values + * themselves are kept correct). + * Make sure that we don't lose any important interrupts by using the chip + * feature that says that writing 0 to a bit in *clear that is set in + * *status will cause an interrupt to be generated again (if allowed by + * the *mask value). + * This is in chip-specific code because of all of the register accesses, + * even though the details are similar on most chips. + */ +static void qib_7322_clear_freeze(struct qib_devdata *dd) +{ + int pidx; + + /* disable error interrupts, to avoid confusion */ + qib_write_kreg(dd, kr_errmask, 0ULL); + + for (pidx = 0; pidx < dd->num_pports; ++pidx) + if (dd->pport[pidx].link_speed_supported) + qib_write_kreg_port(dd->pport + pidx, krp_errmask, + 0ULL); + + /* also disable interrupts; errormask is sometimes overwriten */ + qib_7322_set_intr_state(dd, 0); + + /* clear the freeze, and be sure chip saw it */ + qib_write_kreg(dd, kr_control, dd->control); + qib_read_kreg32(dd, kr_scratch); + + /* + * Force new interrupt if any hwerr, error or interrupt bits are + * still set, and clear "safe" send packet errors related to freeze + * and cancelling sends. Re-enable error interrupts before possible + * force of re-interrupt on pending interrupts. 
+ */ + qib_write_kreg(dd, kr_hwerrclear, 0ULL); + qib_write_kreg(dd, kr_errclear, E_SPKT_ERRS_IGNORE); + qib_write_kreg(dd, kr_errmask, dd->cspec->errormask); + /* We need to purge per-port errs and reset mask, too */ + for (pidx = 0; pidx < dd->num_pports; ++pidx) { + if (!dd->pport[pidx].link_speed_supported) + continue; + qib_write_kreg_port(dd->pport + pidx, krp_errclear, ~0Ull); + qib_write_kreg_port(dd->pport + pidx, krp_errmask, ~0Ull); + } + qib_7322_set_intr_state(dd, 1); +} + +/* no error handling to speak of */ +/** + * qib_7322_handle_hwerrors - display hardware errors. + * @dd: the qlogic_ib device + * @msg: the output buffer + * @msgl: the size of the output buffer + * + * Use same msg buffer as regular errors to avoid excessive stack + * use. Most hardware errors are catastrophic, but for right now, + * we'll print them and continue. We reuse the same message buffer as + * qib_handle_errors() to avoid excessive stack usage. + */ +static void qib_7322_handle_hwerrors(struct qib_devdata *dd, char *msg, + size_t msgl) +{ + u64 hwerrs; + u32 ctrl; + int isfatal = 0; + + hwerrs = qib_read_kreg64(dd, kr_hwerrstatus); + if (!hwerrs) + goto bail; + if (hwerrs == ~0ULL) { + qib_dev_err(dd, + "Read of hardware error status failed (all bits set); ignoring\n"); + goto bail; + } + qib_stats.sps_hwerrs++; + + /* Always clear the error status register, except BIST fail */ + qib_write_kreg(dd, kr_hwerrclear, hwerrs & + ~HWE_MASK(PowerOnBISTFailed)); + + hwerrs &= dd->cspec->hwerrmask; + + /* no EEPROM logging, yet */ + + if (hwerrs) + qib_devinfo(dd->pcidev, + "Hardware error: hwerr=0x%llx (cleared)\n", + (unsigned long long) hwerrs); + + ctrl = qib_read_kreg32(dd, kr_control); + if ((ctrl & SYM_MASK(Control, FreezeMode)) && !dd->diag_client) { + /* + * No recovery yet... + */ + if ((hwerrs & ~HWE_MASK(LATriggered)) || + dd->cspec->stay_in_freeze) { + /* + * If any set that we aren't ignoring only make the + * complaint once, in case it's stuck or recurring, + * and we get here multiple times + * Force link down, so switch knows, and + * LEDs are turned off. + */ + if (dd->flags & QIB_INITTED) + isfatal = 1; + } else + qib_7322_clear_freeze(dd); + } + + if (hwerrs & HWE_MASK(PowerOnBISTFailed)) { + isfatal = 1; + strlcpy(msg, + "[Memory BIST test failed, InfiniPath hardware unusable]", + msgl); + /* ignore from now on, so disable until driver reloaded */ + dd->cspec->hwerrmask &= ~HWE_MASK(PowerOnBISTFailed); + qib_write_kreg(dd, kr_hwerrmask, dd->cspec->hwerrmask); + } + + err_decode(msg, msgl, hwerrs, qib_7322_hwerror_msgs); + + /* Ignore esoteric PLL failures et al. */ + + qib_dev_err(dd, "%s hardware error\n", msg); + + if (hwerrs & + (SYM_MASK(HwErrMask, SDmaMemReadErrMask_0) | + SYM_MASK(HwErrMask, SDmaMemReadErrMask_1))) { + int pidx = 0; + int err; + unsigned long flags; + struct qib_pportdata *ppd = dd->pport; + + for (; pidx < dd->num_pports; ++pidx, ppd++) { + err = 0; + if (pidx == 0 && (hwerrs & + SYM_MASK(HwErrMask, SDmaMemReadErrMask_0))) + err++; + if (pidx == 1 && (hwerrs & + SYM_MASK(HwErrMask, SDmaMemReadErrMask_1))) + err++; + if (err) { + spin_lock_irqsave(&ppd->sdma_lock, flags); + dump_sdma_7322_state(ppd); + spin_unlock_irqrestore(&ppd->sdma_lock, flags); + } + } + } + + if (isfatal && !dd->diag_client) { + qib_dev_err(dd, + "Fatal Hardware Error, no longer usable, SN %.16s\n", + dd->serial); + /* + * for /sys status file and user programs to print; if no + * trailing brace is copied, we'll know it was truncated. 
+ */ + if (dd->freezemsg) + snprintf(dd->freezemsg, dd->freezelen, + "{%s}", msg); + qib_disable_after_error(dd); + } +bail:; +} + +/** + * qib_7322_init_hwerrors - enable hardware errors + * @dd: the qlogic_ib device + * + * now that we have finished initializing everything that might reasonably + * cause a hardware error, and cleared those errors bits as they occur, + * we can enable hardware errors in the mask (potentially enabling + * freeze mode), and enable hardware errors as errors (along with + * everything else) in errormask + */ +static void qib_7322_init_hwerrors(struct qib_devdata *dd) +{ + int pidx; + u64 extsval; + + extsval = qib_read_kreg64(dd, kr_extstatus); + if (!(extsval & (QIB_EXTS_MEMBIST_DISABLED | + QIB_EXTS_MEMBIST_ENDTEST))) + qib_dev_err(dd, "MemBIST did not complete!\n"); + + /* never clear BIST failure, so reported on each driver load */ + qib_write_kreg(dd, kr_hwerrclear, ~HWE_MASK(PowerOnBISTFailed)); + qib_write_kreg(dd, kr_hwerrmask, dd->cspec->hwerrmask); + + /* clear all */ + qib_write_kreg(dd, kr_errclear, ~0ULL); + /* enable errors that are masked, at least this first time. */ + qib_write_kreg(dd, kr_errmask, ~0ULL); + dd->cspec->errormask = qib_read_kreg64(dd, kr_errmask); + for (pidx = 0; pidx < dd->num_pports; ++pidx) + if (dd->pport[pidx].link_speed_supported) + qib_write_kreg_port(dd->pport + pidx, krp_errmask, + ~0ULL); +} + +/* + * Disable and enable the armlaunch error. Used for PIO bandwidth testing + * on chips that are count-based, rather than trigger-based. There is no + * reference counting, but that's also fine, given the intended use. + * Only chip-specific because it's all register accesses + */ +static void qib_set_7322_armlaunch(struct qib_devdata *dd, u32 enable) +{ + if (enable) { + qib_write_kreg(dd, kr_errclear, QIB_E_SPIOARMLAUNCH); + dd->cspec->errormask |= QIB_E_SPIOARMLAUNCH; + } else + dd->cspec->errormask &= ~QIB_E_SPIOARMLAUNCH; + qib_write_kreg(dd, kr_errmask, dd->cspec->errormask); +} + +/* + * Formerly took parameter in pre-shifted, + * pre-merged form with LinkCmd and LinkInitCmd + * together, and assuming the zero was NOP. + */ +static void qib_set_ib_7322_lstate(struct qib_pportdata *ppd, u16 linkcmd, + u16 linitcmd) +{ + u64 mod_wd; + struct qib_devdata *dd = ppd->dd; + unsigned long flags; + + if (linitcmd == QLOGIC_IB_IBCC_LINKINITCMD_DISABLE) { + /* + * If we are told to disable, note that so link-recovery + * code does not attempt to bring us back up. + * Also reset everything that we can, so we start + * completely clean when re-enabled (before we + * actually issue the disable to the IBC) + */ + qib_7322_mini_pcs_reset(ppd); + spin_lock_irqsave(&ppd->lflags_lock, flags); + ppd->lflags |= QIBL_IB_LINK_DISABLED; + spin_unlock_irqrestore(&ppd->lflags_lock, flags); + } else if (linitcmd || linkcmd == QLOGIC_IB_IBCC_LINKCMD_DOWN) { + /* + * Any other linkinitcmd will lead to LINKDOWN and then + * to INIT (if all is well), so clear flag to let + * link-recovery code attempt to bring us back up. + */ + spin_lock_irqsave(&ppd->lflags_lock, flags); + ppd->lflags &= ~QIBL_IB_LINK_DISABLED; + spin_unlock_irqrestore(&ppd->lflags_lock, flags); + /* + * Clear status change interrupt reduction so the + * new state is seen. 
+ */ + ppd->cpspec->ibcctrl_a &= + ~SYM_MASK(IBCCtrlA_0, IBStatIntReductionEn); + } + + mod_wd = (linkcmd << IBA7322_IBCC_LINKCMD_SHIFT) | + (linitcmd << QLOGIC_IB_IBCC_LINKINITCMD_SHIFT); + + qib_write_kreg_port(ppd, krp_ibcctrl_a, ppd->cpspec->ibcctrl_a | + mod_wd); + /* write to chip to prevent back-to-back writes of ibc reg */ + qib_write_kreg(dd, kr_scratch, 0); + +} + +/* + * The total RCV buffer memory is 64KB, used for both ports, and is + * in units of 64 bytes (same as IB flow control credit unit). + * The consumedVL unit in the same registers are in 32 byte units! + * So, a VL15 packet needs 4.50 IB credits, and 9 rx buffer chunks, + * and we can therefore allocate just 9 IB credits for 2 VL15 packets + * in krp_rxcreditvl15, rather than 10. + */ +#define RCV_BUF_UNITSZ 64 +#define NUM_RCV_BUF_UNITS(dd) ((64 * 1024) / (RCV_BUF_UNITSZ * dd->num_pports)) + +static void set_vls(struct qib_pportdata *ppd) +{ + int i, numvls, totcred, cred_vl, vl0extra; + struct qib_devdata *dd = ppd->dd; + u64 val; + + numvls = qib_num_vls(ppd->vls_operational); + + /* + * Set up per-VL credits. Below is kluge based on these assumptions: + * 1) port is disabled at the time early_init is called. + * 2) give VL15 17 credits, for two max-plausible packets. + * 3) Give VL0-N the rest, with any rounding excess used for VL0 + */ + /* 2 VL15 packets @ 288 bytes each (including IB headers) */ + totcred = NUM_RCV_BUF_UNITS(dd); + cred_vl = (2 * 288 + RCV_BUF_UNITSZ - 1) / RCV_BUF_UNITSZ; + totcred -= cred_vl; + qib_write_kreg_port(ppd, krp_rxcreditvl15, (u64) cred_vl); + cred_vl = totcred / numvls; + vl0extra = totcred - cred_vl * numvls; + qib_write_kreg_port(ppd, krp_rxcreditvl0, cred_vl + vl0extra); + for (i = 1; i < numvls; i++) + qib_write_kreg_port(ppd, krp_rxcreditvl0 + i, cred_vl); + for (; i < 8; i++) /* no buffer space for other VLs */ + qib_write_kreg_port(ppd, krp_rxcreditvl0 + i, 0); + + /* Notify IBC that credits need to be recalculated */ + val = qib_read_kreg_port(ppd, krp_ibsdtestiftx); + val |= SYM_MASK(IB_SDTEST_IF_TX_0, CREDIT_CHANGE); + qib_write_kreg_port(ppd, krp_ibsdtestiftx, val); + qib_write_kreg(dd, kr_scratch, 0ULL); + val &= ~SYM_MASK(IB_SDTEST_IF_TX_0, CREDIT_CHANGE); + qib_write_kreg_port(ppd, krp_ibsdtestiftx, val); + + for (i = 0; i < numvls; i++) + val = qib_read_kreg_port(ppd, krp_rxcreditvl0 + i); + val = qib_read_kreg_port(ppd, krp_rxcreditvl15); + + /* Change the number of operational VLs */ + ppd->cpspec->ibcctrl_a = (ppd->cpspec->ibcctrl_a & + ~SYM_MASK(IBCCtrlA_0, NumVLane)) | + ((u64)(numvls - 1) << SYM_LSB(IBCCtrlA_0, NumVLane)); + qib_write_kreg_port(ppd, krp_ibcctrl_a, ppd->cpspec->ibcctrl_a); + qib_write_kreg(dd, kr_scratch, 0ULL); +} + +/* + * The code that deals with actual SerDes is in serdes_7322_init(). + * Compared to the code for iba7220, it is minimal. + */ +static int serdes_7322_init(struct qib_pportdata *ppd); + +/** + * qib_7322_bringup_serdes - bring up the serdes + * @ppd: physical port on the qlogic_ib device + */ +static int qib_7322_bringup_serdes(struct qib_pportdata *ppd) +{ + struct qib_devdata *dd = ppd->dd; + u64 val, guid, ibc; + unsigned long flags; + int ret = 0; + + /* + * SerDes model not in Pd, but still need to + * set up much of IBCCtrl and IBCDDRCtrl; move elsewhere + * eventually. 
+ */ + /* Put IBC in reset, sends disabled (should be in reset already) */ + ppd->cpspec->ibcctrl_a &= ~SYM_MASK(IBCCtrlA_0, IBLinkEn); + qib_write_kreg_port(ppd, krp_ibcctrl_a, ppd->cpspec->ibcctrl_a); + qib_write_kreg(dd, kr_scratch, 0ULL); + + /* ensure previous Tx parameters are not still forced */ + qib_write_kreg_port(ppd, krp_tx_deemph_override, + SYM_MASK(IBSD_TX_DEEMPHASIS_OVERRIDE_0, + reset_tx_deemphasis_override)); + + if (qib_compat_ddr_negotiate) { + ppd->cpspec->ibdeltainprog = 1; + ppd->cpspec->ibsymsnap = read_7322_creg32_port(ppd, + crp_ibsymbolerr); + ppd->cpspec->iblnkerrsnap = read_7322_creg32_port(ppd, + crp_iblinkerrrecov); + } + + /* flowcontrolwatermark is in units of KBytes */ + ibc = 0x5ULL << SYM_LSB(IBCCtrlA_0, FlowCtrlWaterMark); + /* + * Flow control is sent this often, even if no changes in + * buffer space occur. Units are 128ns for this chip. + * Set to 3usec. + */ + ibc |= 24ULL << SYM_LSB(IBCCtrlA_0, FlowCtrlPeriod); + /* max error tolerance */ + ibc |= 0xfULL << SYM_LSB(IBCCtrlA_0, PhyerrThreshold); + /* IB credit flow control. */ + ibc |= 0xfULL << SYM_LSB(IBCCtrlA_0, OverrunThreshold); + /* + * set initial max size pkt IBC will send, including ICRC; it's the + * PIO buffer size in dwords, less 1; also see qib_set_mtu() + */ + ibc |= ((u64)(ppd->ibmaxlen >> 2) + 1) << + SYM_LSB(IBCCtrlA_0, MaxPktLen); + ppd->cpspec->ibcctrl_a = ibc; /* without linkcmd or linkinitcmd! */ + + /* + * Reset the PCS interface to the serdes (and also ibc, which is still + * in reset from above). Writes new value of ibcctrl_a as last step. + */ + qib_7322_mini_pcs_reset(ppd); + + if (!ppd->cpspec->ibcctrl_b) { + unsigned lse = ppd->link_speed_enabled; + + /* + * Not on re-init after reset, establish shadow + * and force initial config. + */ + ppd->cpspec->ibcctrl_b = qib_read_kreg_port(ppd, + krp_ibcctrl_b); + ppd->cpspec->ibcctrl_b &= ~(IBA7322_IBC_SPEED_QDR | + IBA7322_IBC_SPEED_DDR | + IBA7322_IBC_SPEED_SDR | + IBA7322_IBC_WIDTH_AUTONEG | + SYM_MASK(IBCCtrlB_0, IB_LANE_REV_SUPPORTED)); + if (lse & (lse - 1)) /* Muliple speeds enabled */ + ppd->cpspec->ibcctrl_b |= + (lse << IBA7322_IBC_SPEED_LSB) | + IBA7322_IBC_IBTA_1_2_MASK | + IBA7322_IBC_MAX_SPEED_MASK; + else + ppd->cpspec->ibcctrl_b |= (lse == QIB_IB_QDR) ? + IBA7322_IBC_SPEED_QDR | + IBA7322_IBC_IBTA_1_2_MASK : + (lse == QIB_IB_DDR) ? + IBA7322_IBC_SPEED_DDR : + IBA7322_IBC_SPEED_SDR; + if ((ppd->link_width_enabled & (IB_WIDTH_1X | IB_WIDTH_4X)) == + (IB_WIDTH_1X | IB_WIDTH_4X)) + ppd->cpspec->ibcctrl_b |= IBA7322_IBC_WIDTH_AUTONEG; + else + ppd->cpspec->ibcctrl_b |= + ppd->link_width_enabled == IB_WIDTH_4X ? 
+ IBA7322_IBC_WIDTH_4X_ONLY : + IBA7322_IBC_WIDTH_1X_ONLY; + + /* always enable these on driver reload, not sticky */ + ppd->cpspec->ibcctrl_b |= (IBA7322_IBC_RXPOL_MASK | + IBA7322_IBC_HRTBT_MASK); + } + qib_write_kreg_port(ppd, krp_ibcctrl_b, ppd->cpspec->ibcctrl_b); + + /* setup so we have more time at CFGTEST to change H1 */ + val = qib_read_kreg_port(ppd, krp_ibcctrl_c); + val &= ~SYM_MASK(IBCCtrlC_0, IB_FRONT_PORCH); + val |= 0xfULL << SYM_LSB(IBCCtrlC_0, IB_FRONT_PORCH); + qib_write_kreg_port(ppd, krp_ibcctrl_c, val); + + serdes_7322_init(ppd); + + guid = be64_to_cpu(ppd->guid); + if (!guid) { + if (dd->base_guid) + guid = be64_to_cpu(dd->base_guid) + ppd->port - 1; + ppd->guid = cpu_to_be64(guid); + } + + qib_write_kreg_port(ppd, krp_hrtbt_guid, guid); + /* write to chip to prevent back-to-back writes of ibc reg */ + qib_write_kreg(dd, kr_scratch, 0); + + /* Enable port */ + ppd->cpspec->ibcctrl_a |= SYM_MASK(IBCCtrlA_0, IBLinkEn); + set_vls(ppd); + + /* initially come up DISABLED, without sending anything. */ + val = ppd->cpspec->ibcctrl_a | (QLOGIC_IB_IBCC_LINKINITCMD_DISABLE << + QLOGIC_IB_IBCC_LINKINITCMD_SHIFT); + qib_write_kreg_port(ppd, krp_ibcctrl_a, val); + qib_write_kreg(dd, kr_scratch, 0ULL); + /* clear the linkinit cmds */ + ppd->cpspec->ibcctrl_a = val & ~SYM_MASK(IBCCtrlA_0, LinkInitCmd); + + /* be paranoid against later code motion, etc. */ + spin_lock_irqsave(&dd->cspec->rcvmod_lock, flags); + ppd->p_rcvctrl |= SYM_MASK(RcvCtrl_0, RcvIBPortEnable); + qib_write_kreg_port(ppd, krp_rcvctrl, ppd->p_rcvctrl); + spin_unlock_irqrestore(&dd->cspec->rcvmod_lock, flags); + + /* Also enable IBSTATUSCHG interrupt. */ + val = qib_read_kreg_port(ppd, krp_errmask); + qib_write_kreg_port(ppd, krp_errmask, + val | ERR_MASK_N(IBStatusChanged)); + + /* Always zero until we start messing with SerDes for real */ + return ret; +} + +/** + * qib_7322_quiet_serdes - set serdes to txidle + * @dd: the qlogic_ib device + * Called when driver is being unloaded + */ +static void qib_7322_mini_quiet_serdes(struct qib_pportdata *ppd) +{ + u64 val; + unsigned long flags; + + qib_set_ib_7322_lstate(ppd, 0, QLOGIC_IB_IBCC_LINKINITCMD_DISABLE); + + spin_lock_irqsave(&ppd->lflags_lock, flags); + ppd->lflags &= ~QIBL_IB_AUTONEG_INPROG; + spin_unlock_irqrestore(&ppd->lflags_lock, flags); + wake_up(&ppd->cpspec->autoneg_wait); + cancel_delayed_work_sync(&ppd->cpspec->autoneg_work); + if (ppd->dd->cspec->r1) + cancel_delayed_work_sync(&ppd->cpspec->ipg_work); + + ppd->cpspec->chase_end = 0; + if (ppd->cpspec->chase_timer.data) /* if initted */ + del_timer_sync(&ppd->cpspec->chase_timer); + + /* + * Despite the name, actually disables IBC as well. Do it when + * we are as sure as possible that no more packets can be + * received, following the down and the PCS reset. + * The actual disabling happens in qib_7322_mini_pci_reset(), + * along with the PCS being reset. + */ + ppd->cpspec->ibcctrl_a &= ~SYM_MASK(IBCCtrlA_0, IBLinkEn); + qib_7322_mini_pcs_reset(ppd); + + /* + * Update the adjusted counters so the adjustment persists + * across driver reload. 
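+ *
+ * A worked example with made-up numbers: if crp_ibsymbolerr read 40
+ * when the snapshot was taken (ibsymsnap), reads 100 now, and 10
+ * errors had already been hidden (ibsymdelta), the counter is written
+ * back as 40 - 10 = 30 while a snapshot is in progress, or as
+ * 100 - 10 = 90 otherwise; the link-error-recovery counter is handled
+ * the same way.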
+ */ + if (ppd->cpspec->ibsymdelta || ppd->cpspec->iblnkerrdelta || + ppd->cpspec->ibdeltainprog || ppd->cpspec->iblnkdowndelta) { + struct qib_devdata *dd = ppd->dd; + u64 diagc; + + /* enable counter writes */ + diagc = qib_read_kreg64(dd, kr_hwdiagctrl); + qib_write_kreg(dd, kr_hwdiagctrl, + diagc | SYM_MASK(HwDiagCtrl, CounterWrEnable)); + + if (ppd->cpspec->ibsymdelta || ppd->cpspec->ibdeltainprog) { + val = read_7322_creg32_port(ppd, crp_ibsymbolerr); + if (ppd->cpspec->ibdeltainprog) + val -= val - ppd->cpspec->ibsymsnap; + val -= ppd->cpspec->ibsymdelta; + write_7322_creg_port(ppd, crp_ibsymbolerr, val); + } + if (ppd->cpspec->iblnkerrdelta || ppd->cpspec->ibdeltainprog) { + val = read_7322_creg32_port(ppd, crp_iblinkerrrecov); + if (ppd->cpspec->ibdeltainprog) + val -= val - ppd->cpspec->iblnkerrsnap; + val -= ppd->cpspec->iblnkerrdelta; + write_7322_creg_port(ppd, crp_iblinkerrrecov, val); + } + if (ppd->cpspec->iblnkdowndelta) { + val = read_7322_creg32_port(ppd, crp_iblinkdown); + val += ppd->cpspec->iblnkdowndelta; + write_7322_creg_port(ppd, crp_iblinkdown, val); + } + /* + * No need to save ibmalfdelta since IB perfcounters + * are cleared on driver reload. + */ + + /* and disable counter writes */ + qib_write_kreg(dd, kr_hwdiagctrl, diagc); + } +} + +/** + * qib_setup_7322_setextled - set the state of the two external LEDs + * @ppd: physical port on the qlogic_ib device + * @on: whether the link is up or not + * + * The exact combo of LEDs if on is true is determined by looking + * at the ibcstatus. + * + * These LEDs indicate the physical and logical state of IB link. + * For this chip (at least with recommended board pinouts), LED1 + * is Yellow (logical state) and LED2 is Green (physical state), + * + * Note: We try to match the Mellanox HCA LED behavior as best + * we can. Green indicates physical link state is OK (something is + * plugged in, and we can train). + * Amber indicates the link is logically up (ACTIVE). + * Mellanox further blinks the amber LED to indicate data packet + * activity, but we have no hardware support for that, so it would + * require waking up every 10-20 msecs and checking the counters + * on the chip, and then turning the LED off if appropriate. That's + * visible overhead, so not something we will do. + */ +static void qib_setup_7322_setextled(struct qib_pportdata *ppd, u32 on) +{ + struct qib_devdata *dd = ppd->dd; + u64 extctl, ledblink = 0, val; + unsigned long flags; + int yel, grn; + + /* + * The diags use the LED to indicate diag info, so we leave + * the external LED alone when the diags are running. + */ + if (dd->diag_client) + return; + + /* Allow override of LED display for, e.g. Locating system in rack */ + if (ppd->led_override) { + grn = (ppd->led_override & QIB_LED_PHYS); + yel = (ppd->led_override & QIB_LED_LOG); + } else if (on) { + val = qib_read_kreg_port(ppd, krp_ibcstatus_a); + grn = qib_7322_phys_portstate(val) == + IB_PHYSPORTSTATE_LINKUP; + yel = qib_7322_iblink_state(val) == IB_PORT_ACTIVE; + } else { + grn = 0; + yel = 0; + } + + spin_lock_irqsave(&dd->cspec->gpio_lock, flags); + extctl = dd->cspec->extctrl & (ppd->port == 1 ? + ~ExtLED_IB1_MASK : ~ExtLED_IB2_MASK); + if (grn) { + extctl |= ppd->port == 1 ? ExtLED_IB1_GRN : ExtLED_IB2_GRN; + /* + * Counts are in chip clock (4ns) periods. + * This is 1/16 sec (66.6ms) on, + * 3/16 sec (187.5 ms) off, with packets rcvd. 
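+ *
+ * In tick terms: 66.6 ms on is 66,600,000 ns / 4 ns per tick, or
+ * 16,650,000 chip-clock periods, and 187.5 ms off is 187,500,000 / 4,
+ * or 46,875,000 periods; those are the two values packed into
+ * krp_rcvpktledcnt below.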
+ */ + ledblink = ((66600 * 1000UL / 4) << IBA7322_LEDBLINK_ON_SHIFT) | + ((187500 * 1000UL / 4) << IBA7322_LEDBLINK_OFF_SHIFT); + } + if (yel) + extctl |= ppd->port == 1 ? ExtLED_IB1_YEL : ExtLED_IB2_YEL; + dd->cspec->extctrl = extctl; + qib_write_kreg(dd, kr_extctrl, dd->cspec->extctrl); + spin_unlock_irqrestore(&dd->cspec->gpio_lock, flags); + + if (ledblink) /* blink the LED on packet receive */ + qib_write_kreg_port(ppd, krp_rcvpktledcnt, ledblink); +} + +#ifdef CONFIG_INFINIBAND_QIB_DCA + +static int qib_7322_notify_dca(struct qib_devdata *dd, unsigned long event) +{ + switch (event) { + case DCA_PROVIDER_ADD: + if (dd->flags & QIB_DCA_ENABLED) + break; + if (!dca_add_requester(&dd->pcidev->dev)) { + qib_devinfo(dd->pcidev, "DCA enabled\n"); + dd->flags |= QIB_DCA_ENABLED; + qib_setup_dca(dd); + } + break; + case DCA_PROVIDER_REMOVE: + if (dd->flags & QIB_DCA_ENABLED) { + dca_remove_requester(&dd->pcidev->dev); + dd->flags &= ~QIB_DCA_ENABLED; + dd->cspec->dca_ctrl = 0; + qib_write_kreg(dd, KREG_IDX(DCACtrlA), + dd->cspec->dca_ctrl); + } + break; + } + return 0; +} + +static void qib_update_rhdrq_dca(struct qib_ctxtdata *rcd, int cpu) +{ + struct qib_devdata *dd = rcd->dd; + struct qib_chip_specific *cspec = dd->cspec; + + if (!(dd->flags & QIB_DCA_ENABLED)) + return; + if (cspec->rhdr_cpu[rcd->ctxt] != cpu) { + const struct dca_reg_map *rmp; + + cspec->rhdr_cpu[rcd->ctxt] = cpu; + rmp = &dca_rcvhdr_reg_map[rcd->ctxt]; + cspec->dca_rcvhdr_ctrl[rmp->shadow_inx] &= rmp->mask; + cspec->dca_rcvhdr_ctrl[rmp->shadow_inx] |= + (u64) dca3_get_tag(&dd->pcidev->dev, cpu) << rmp->lsb; + qib_devinfo(dd->pcidev, + "Ctxt %d cpu %d dca %llx\n", rcd->ctxt, cpu, + (long long) cspec->dca_rcvhdr_ctrl[rmp->shadow_inx]); + qib_write_kreg(dd, rmp->regno, + cspec->dca_rcvhdr_ctrl[rmp->shadow_inx]); + cspec->dca_ctrl |= SYM_MASK(DCACtrlA, RcvHdrqDCAEnable); + qib_write_kreg(dd, KREG_IDX(DCACtrlA), cspec->dca_ctrl); + } +} + +static void qib_update_sdma_dca(struct qib_pportdata *ppd, int cpu) +{ + struct qib_devdata *dd = ppd->dd; + struct qib_chip_specific *cspec = dd->cspec; + unsigned pidx = ppd->port - 1; + + if (!(dd->flags & QIB_DCA_ENABLED)) + return; + if (cspec->sdma_cpu[pidx] != cpu) { + cspec->sdma_cpu[pidx] = cpu; + cspec->dca_rcvhdr_ctrl[4] &= ~(ppd->hw_pidx ? + SYM_MASK(DCACtrlF, SendDma1DCAOPH) : + SYM_MASK(DCACtrlF, SendDma0DCAOPH)); + cspec->dca_rcvhdr_ctrl[4] |= + (u64) dca3_get_tag(&dd->pcidev->dev, cpu) << + (ppd->hw_pidx ? + SYM_LSB(DCACtrlF, SendDma1DCAOPH) : + SYM_LSB(DCACtrlF, SendDma0DCAOPH)); + qib_devinfo(dd->pcidev, + "sdma %d cpu %d dca %llx\n", ppd->hw_pidx, cpu, + (long long) cspec->dca_rcvhdr_ctrl[4]); + qib_write_kreg(dd, KREG_IDX(DCACtrlF), + cspec->dca_rcvhdr_ctrl[4]); + cspec->dca_ctrl |= ppd->hw_pidx ? 
+ SYM_MASK(DCACtrlA, SendDMAHead1DCAEnable) : + SYM_MASK(DCACtrlA, SendDMAHead0DCAEnable); + qib_write_kreg(dd, KREG_IDX(DCACtrlA), cspec->dca_ctrl); + } +} + +static void qib_setup_dca(struct qib_devdata *dd) +{ + struct qib_chip_specific *cspec = dd->cspec; + int i; + + for (i = 0; i < ARRAY_SIZE(cspec->rhdr_cpu); i++) + cspec->rhdr_cpu[i] = -1; + for (i = 0; i < ARRAY_SIZE(cspec->sdma_cpu); i++) + cspec->sdma_cpu[i] = -1; + cspec->dca_rcvhdr_ctrl[0] = + (1ULL << SYM_LSB(DCACtrlB, RcvHdrq0DCAXfrCnt)) | + (1ULL << SYM_LSB(DCACtrlB, RcvHdrq1DCAXfrCnt)) | + (1ULL << SYM_LSB(DCACtrlB, RcvHdrq2DCAXfrCnt)) | + (1ULL << SYM_LSB(DCACtrlB, RcvHdrq3DCAXfrCnt)); + cspec->dca_rcvhdr_ctrl[1] = + (1ULL << SYM_LSB(DCACtrlC, RcvHdrq4DCAXfrCnt)) | + (1ULL << SYM_LSB(DCACtrlC, RcvHdrq5DCAXfrCnt)) | + (1ULL << SYM_LSB(DCACtrlC, RcvHdrq6DCAXfrCnt)) | + (1ULL << SYM_LSB(DCACtrlC, RcvHdrq7DCAXfrCnt)); + cspec->dca_rcvhdr_ctrl[2] = + (1ULL << SYM_LSB(DCACtrlD, RcvHdrq8DCAXfrCnt)) | + (1ULL << SYM_LSB(DCACtrlD, RcvHdrq9DCAXfrCnt)) | + (1ULL << SYM_LSB(DCACtrlD, RcvHdrq10DCAXfrCnt)) | + (1ULL << SYM_LSB(DCACtrlD, RcvHdrq11DCAXfrCnt)); + cspec->dca_rcvhdr_ctrl[3] = + (1ULL << SYM_LSB(DCACtrlE, RcvHdrq12DCAXfrCnt)) | + (1ULL << SYM_LSB(DCACtrlE, RcvHdrq13DCAXfrCnt)) | + (1ULL << SYM_LSB(DCACtrlE, RcvHdrq14DCAXfrCnt)) | + (1ULL << SYM_LSB(DCACtrlE, RcvHdrq15DCAXfrCnt)); + cspec->dca_rcvhdr_ctrl[4] = + (1ULL << SYM_LSB(DCACtrlF, RcvHdrq16DCAXfrCnt)) | + (1ULL << SYM_LSB(DCACtrlF, RcvHdrq17DCAXfrCnt)); + for (i = 0; i < ARRAY_SIZE(cspec->sdma_cpu); i++) + qib_write_kreg(dd, KREG_IDX(DCACtrlB) + i, + cspec->dca_rcvhdr_ctrl[i]); + for (i = 0; i < cspec->num_msix_entries; i++) + setup_dca_notifier(dd, &cspec->msix_entries[i]); +} + +static void qib_irq_notifier_notify(struct irq_affinity_notify *notify, + const cpumask_t *mask) +{ + struct qib_irq_notify *n = + container_of(notify, struct qib_irq_notify, notify); + int cpu = cpumask_first(mask); + + if (n->rcv) { + struct qib_ctxtdata *rcd = (struct qib_ctxtdata *)n->arg; + + qib_update_rhdrq_dca(rcd, cpu); + } else { + struct qib_pportdata *ppd = (struct qib_pportdata *)n->arg; + + qib_update_sdma_dca(ppd, cpu); + } +} + +static void qib_irq_notifier_release(struct kref *ref) +{ + struct qib_irq_notify *n = + container_of(ref, struct qib_irq_notify, notify.kref); + struct qib_devdata *dd; + + if (n->rcv) { + struct qib_ctxtdata *rcd = (struct qib_ctxtdata *)n->arg; + + dd = rcd->dd; + } else { + struct qib_pportdata *ppd = (struct qib_pportdata *)n->arg; + + dd = ppd->dd; + } + qib_devinfo(dd->pcidev, + "release on HCA notify 0x%p n 0x%p\n", ref, n); + kfree(n); +} +#endif + +/* + * Disable MSIx interrupt if enabled, call generic MSIx code + * to cleanup, and clear pending MSIx interrupts. + * Used for fallback to INTx, after reset, and when MSIx setup fails. 
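+ * Per vector, the DCA notifier (when that support is built in) is
+ * dropped, the affinity hint cleared, the cpumask freed and the irq
+ * released; only then is the MSIx capability itself torn down via
+ * qib_nomsix() and anything still set in kr_intgranted acked.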
+ */ +static void qib_7322_nomsix(struct qib_devdata *dd) +{ + u64 intgranted; + int n; + + dd->cspec->main_int_mask = ~0ULL; + n = dd->cspec->num_msix_entries; + if (n) { + int i; + + dd->cspec->num_msix_entries = 0; + for (i = 0; i < n; i++) { +#ifdef CONFIG_INFINIBAND_QIB_DCA + reset_dca_notifier(dd, &dd->cspec->msix_entries[i]); +#endif + irq_set_affinity_hint( + dd->cspec->msix_entries[i].msix.vector, NULL); + free_cpumask_var(dd->cspec->msix_entries[i].mask); + free_irq(dd->cspec->msix_entries[i].msix.vector, + dd->cspec->msix_entries[i].arg); + } + qib_nomsix(dd); + } + /* make sure no MSIx interrupts are left pending */ + intgranted = qib_read_kreg64(dd, kr_intgranted); + if (intgranted) + qib_write_kreg(dd, kr_intgranted, intgranted); +} + +static void qib_7322_free_irq(struct qib_devdata *dd) +{ + if (dd->cspec->irq) { + free_irq(dd->cspec->irq, dd); + dd->cspec->irq = 0; + } + qib_7322_nomsix(dd); +} + +static void qib_setup_7322_cleanup(struct qib_devdata *dd) +{ + int i; + +#ifdef CONFIG_INFINIBAND_QIB_DCA + if (dd->flags & QIB_DCA_ENABLED) { + dca_remove_requester(&dd->pcidev->dev); + dd->flags &= ~QIB_DCA_ENABLED; + dd->cspec->dca_ctrl = 0; + qib_write_kreg(dd, KREG_IDX(DCACtrlA), dd->cspec->dca_ctrl); + } +#endif + + qib_7322_free_irq(dd); + kfree(dd->cspec->cntrs); + kfree(dd->cspec->sendchkenable); + kfree(dd->cspec->sendgrhchk); + kfree(dd->cspec->sendibchk); + kfree(dd->cspec->msix_entries); + for (i = 0; i < dd->num_pports; i++) { + unsigned long flags; + u32 mask = QSFP_GPIO_MOD_PRS_N | + (QSFP_GPIO_MOD_PRS_N << QSFP_GPIO_PORT2_SHIFT); + + kfree(dd->pport[i].cpspec->portcntrs); + if (dd->flags & QIB_HAS_QSFP) { + spin_lock_irqsave(&dd->cspec->gpio_lock, flags); + dd->cspec->gpio_mask &= ~mask; + qib_write_kreg(dd, kr_gpio_mask, dd->cspec->gpio_mask); + spin_unlock_irqrestore(&dd->cspec->gpio_lock, flags); + qib_qsfp_deinit(&dd->pport[i].cpspec->qsfp_data); + } + if (dd->pport[i].ibport_data.smi_ah) + ib_destroy_ah(&dd->pport[i].ibport_data.smi_ah->ibah); + } +} + +/* handle SDMA interrupts */ +static void sdma_7322_intr(struct qib_devdata *dd, u64 istat) +{ + struct qib_pportdata *ppd0 = &dd->pport[0]; + struct qib_pportdata *ppd1 = &dd->pport[1]; + u64 intr0 = istat & (INT_MASK_P(SDma, 0) | + INT_MASK_P(SDmaIdle, 0) | INT_MASK_P(SDmaProgress, 0)); + u64 intr1 = istat & (INT_MASK_P(SDma, 1) | + INT_MASK_P(SDmaIdle, 1) | INT_MASK_P(SDmaProgress, 1)); + + if (intr0) + qib_sdma_intr(ppd0); + if (intr1) + qib_sdma_intr(ppd1); + + if (istat & INT_MASK_PM(SDmaCleanupDone, 0)) + qib_sdma_process_event(ppd0, qib_sdma_event_e20_hw_started); + if (istat & INT_MASK_PM(SDmaCleanupDone, 1)) + qib_sdma_process_event(ppd1, qib_sdma_event_e20_hw_started); +} + +/* + * Set or clear the Send buffer available interrupt enable bit. + */ +static void qib_wantpiobuf_7322_intr(struct qib_devdata *dd, u32 needint) +{ + unsigned long flags; + + spin_lock_irqsave(&dd->sendctrl_lock, flags); + if (needint) + dd->sendctrl |= SYM_MASK(SendCtrl, SendIntBufAvail); + else + dd->sendctrl &= ~SYM_MASK(SendCtrl, SendIntBufAvail); + qib_write_kreg(dd, kr_sendctrl, dd->sendctrl); + qib_write_kreg(dd, kr_scratch, 0ULL); + spin_unlock_irqrestore(&dd->sendctrl_lock, flags); +} + +/* + * Somehow got an interrupt with reserved bits set in interrupt status. + * Print a message so we know it happened, then clear them. 
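+ * The offending bits are also removed from the interrupt enable mask
+ * (kr_intmask is rewritten as int_enable_mask & ~kills), so a stuck
+ * reserved bit cannot keep re-interrupting.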
+ * keep mainline interrupt handler cache-friendly + */ +static noinline void unknown_7322_ibits(struct qib_devdata *dd, u64 istat) +{ + u64 kills; + char msg[128]; + + kills = istat & ~QIB_I_BITSEXTANT; + qib_dev_err(dd, + "Clearing reserved interrupt(s) 0x%016llx: %s\n", + (unsigned long long) kills, msg); + qib_write_kreg(dd, kr_intmask, (dd->cspec->int_enable_mask & ~kills)); +} + +/* keep mainline interrupt handler cache-friendly */ +static noinline void unknown_7322_gpio_intr(struct qib_devdata *dd) +{ + u32 gpiostatus; + int handled = 0; + int pidx; + + /* + * Boards for this chip currently don't use GPIO interrupts, + * so clear by writing GPIOstatus to GPIOclear, and complain + * to developer. To avoid endless repeats, clear + * the bits in the mask, since there is some kind of + * programming error or chip problem. + */ + gpiostatus = qib_read_kreg32(dd, kr_gpio_status); + /* + * In theory, writing GPIOstatus to GPIOclear could + * have a bad side-effect on some diagnostic that wanted + * to poll for a status-change, but the various shadows + * make that problematic at best. Diags will just suppress + * all GPIO interrupts during such tests. + */ + qib_write_kreg(dd, kr_gpio_clear, gpiostatus); + /* + * Check for QSFP MOD_PRS changes + * only works for single port if IB1 != pidx1 + */ + for (pidx = 0; pidx < dd->num_pports && (dd->flags & QIB_HAS_QSFP); + ++pidx) { + struct qib_pportdata *ppd; + struct qib_qsfp_data *qd; + u32 mask; + + if (!dd->pport[pidx].link_speed_supported) + continue; + mask = QSFP_GPIO_MOD_PRS_N; + ppd = dd->pport + pidx; + mask <<= (QSFP_GPIO_PORT2_SHIFT * ppd->hw_pidx); + if (gpiostatus & dd->cspec->gpio_mask & mask) { + u64 pins; + + qd = &ppd->cpspec->qsfp_data; + gpiostatus &= ~mask; + pins = qib_read_kreg64(dd, kr_extstatus); + pins >>= SYM_LSB(EXTStatus, GPIOIn); + if (!(pins & mask)) { + ++handled; + qd->t_insert = jiffies; + queue_work(ib_wq, &qd->work); + } + } + } + + if (gpiostatus && !handled) { + const u32 mask = qib_read_kreg32(dd, kr_gpio_mask); + u32 gpio_irq = mask & gpiostatus; + + /* + * Clear any troublemakers, and update chip from shadow + */ + dd->cspec->gpio_mask &= ~gpio_irq; + qib_write_kreg(dd, kr_gpio_mask, dd->cspec->gpio_mask); + } +} + +/* + * Handle errors and unusual events first, separate function + * to improve cache hits for fast path interrupt handling. + */ +static noinline void unlikely_7322_intr(struct qib_devdata *dd, u64 istat) +{ + if (istat & ~QIB_I_BITSEXTANT) + unknown_7322_ibits(dd, istat); + if (istat & QIB_I_GPIO) + unknown_7322_gpio_intr(dd); + if (istat & QIB_I_C_ERROR) { + qib_write_kreg(dd, kr_errmask, 0ULL); + tasklet_schedule(&dd->error_tasklet); + } + if (istat & INT_MASK_P(Err, 0) && dd->rcd[0]) + handle_7322_p_errors(dd->rcd[0]->ppd); + if (istat & INT_MASK_P(Err, 1) && dd->rcd[1]) + handle_7322_p_errors(dd->rcd[1]->ppd); +} + +/* + * Dynamically adjust the rcv int timeout for a context based on incoming + * packet rate. + */ +static void adjust_rcv_timeout(struct qib_ctxtdata *rcd, int npkts) +{ + struct qib_devdata *dd = rcd->dd; + u32 timeout = dd->cspec->rcvavail_timeout[rcd->ctxt]; + + /* + * Dynamically adjust idle timeout on chip + * based on number of packets processed. 
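+ * A busy context (npkts at or above the rcv_int_count threshold)
+ * doubles its timeout, capped at rcv_int_timeout; a quiet one halves
+ * it while it is still above 2. As a purely illustrative example,
+ * with rcv_int_count at 64 an interrupt that found 10 packets halves
+ * the timeout and one that found 100 doubles it.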
+ */ + if (npkts < rcv_int_count && timeout > 2) + timeout >>= 1; + else if (npkts >= rcv_int_count && timeout < rcv_int_timeout) + timeout = min(timeout << 1, rcv_int_timeout); + else + return; + + dd->cspec->rcvavail_timeout[rcd->ctxt] = timeout; + qib_write_kreg(dd, kr_rcvavailtimeout + rcd->ctxt, timeout); +} + +/* + * This is the main interrupt handler. + * It will normally only be used for low frequency interrupts but may + * have to handle all interrupts if INTx is enabled or fewer than normal + * MSIx interrupts were allocated. + * This routine should ignore the interrupt bits for any of the + * dedicated MSIx handlers. + */ +static irqreturn_t qib_7322intr(int irq, void *data) +{ + struct qib_devdata *dd = data; + irqreturn_t ret; + u64 istat; + u64 ctxtrbits; + u64 rmask; + unsigned i; + u32 npkts; + + if ((dd->flags & (QIB_PRESENT | QIB_BADINTR)) != QIB_PRESENT) { + /* + * This return value is not great, but we do not want the + * interrupt core code to remove our interrupt handler + * because we don't appear to be handling an interrupt + * during a chip reset. + */ + ret = IRQ_HANDLED; + goto bail; + } + + istat = qib_read_kreg64(dd, kr_intstatus); + + if (unlikely(istat == ~0ULL)) { + qib_bad_intrstatus(dd); + qib_dev_err(dd, "Interrupt status all f's, skipping\n"); + /* don't know if it was our interrupt or not */ + ret = IRQ_NONE; + goto bail; + } + + istat &= dd->cspec->main_int_mask; + if (unlikely(!istat)) { + /* already handled, or shared and not us */ + ret = IRQ_NONE; + goto bail; + } + + this_cpu_inc(*dd->int_counter); + + /* handle "errors" of various kinds first, device ahead of port */ + if (unlikely(istat & (~QIB_I_BITSEXTANT | QIB_I_GPIO | + QIB_I_C_ERROR | INT_MASK_P(Err, 0) | + INT_MASK_P(Err, 1)))) + unlikely_7322_intr(dd, istat); + + /* + * Clear the interrupt bits we found set, relatively early, so we + * "know" know the chip will have seen this by the time we process + * the queue, and will re-interrupt if necessary. The processor + * itself won't take the interrupt again until we return. + */ + qib_write_kreg(dd, kr_intclear, istat); + + /* + * Handle kernel receive queues before checking for pio buffers + * available since receives can overflow; piobuf waiters can afford + * a few extra cycles, since they were waiting anyway. + */ + ctxtrbits = istat & (QIB_I_RCVAVAIL_MASK | QIB_I_RCVURG_MASK); + if (ctxtrbits) { + rmask = (1ULL << QIB_I_RCVAVAIL_LSB) | + (1ULL << QIB_I_RCVURG_LSB); + for (i = 0; i < dd->first_user_ctxt; i++) { + if (ctxtrbits & rmask) { + ctxtrbits &= ~rmask; + if (dd->rcd[i]) + qib_kreceive(dd->rcd[i], NULL, &npkts); + } + rmask <<= 1; + } + if (ctxtrbits) { + ctxtrbits = (ctxtrbits >> QIB_I_RCVAVAIL_LSB) | + (ctxtrbits >> QIB_I_RCVURG_LSB); + qib_handle_urcv(dd, ctxtrbits); + } + } + + if (istat & (QIB_I_P_SDMAINT(0) | QIB_I_P_SDMAINT(1))) + sdma_7322_intr(dd, istat); + + if ((istat & QIB_I_SPIOBUFAVAIL) && (dd->flags & QIB_INITTED)) + qib_ib_piobufavail(dd); + + ret = IRQ_HANDLED; +bail: + return ret; +} + +/* + * Dedicated receive packet available interrupt handler. + */ +static irqreturn_t qib_7322pintr(int irq, void *data) +{ + struct qib_ctxtdata *rcd = data; + struct qib_devdata *dd = rcd->dd; + u32 npkts; + + if ((dd->flags & (QIB_PRESENT | QIB_BADINTR)) != QIB_PRESENT) + /* + * This return value is not great, but we do not want the + * interrupt core code to remove our interrupt handler + * because we don't appear to be handling an interrupt + * during a chip reset. 
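+ * (Returning IRQ_NONE while the chip is resetting would presumably
+ * look like a run of spurious interrupts to the core and could get
+ * the line disabled, hence IRQ_HANDLED even though nothing was done.)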
+ */ + return IRQ_HANDLED; + + this_cpu_inc(*dd->int_counter); + + /* Clear the interrupt bit we expect to be set. */ + qib_write_kreg(dd, kr_intclear, ((1ULL << QIB_I_RCVAVAIL_LSB) | + (1ULL << QIB_I_RCVURG_LSB)) << rcd->ctxt); + + qib_kreceive(rcd, NULL, &npkts); + + return IRQ_HANDLED; +} + +/* + * Dedicated Send buffer available interrupt handler. + */ +static irqreturn_t qib_7322bufavail(int irq, void *data) +{ + struct qib_devdata *dd = data; + + if ((dd->flags & (QIB_PRESENT | QIB_BADINTR)) != QIB_PRESENT) + /* + * This return value is not great, but we do not want the + * interrupt core code to remove our interrupt handler + * because we don't appear to be handling an interrupt + * during a chip reset. + */ + return IRQ_HANDLED; + + this_cpu_inc(*dd->int_counter); + + /* Clear the interrupt bit we expect to be set. */ + qib_write_kreg(dd, kr_intclear, QIB_I_SPIOBUFAVAIL); + + /* qib_ib_piobufavail() will clear the want PIO interrupt if needed */ + if (dd->flags & QIB_INITTED) + qib_ib_piobufavail(dd); + else + qib_wantpiobuf_7322_intr(dd, 0); + + return IRQ_HANDLED; +} + +/* + * Dedicated Send DMA interrupt handler. + */ +static irqreturn_t sdma_intr(int irq, void *data) +{ + struct qib_pportdata *ppd = data; + struct qib_devdata *dd = ppd->dd; + + if ((dd->flags & (QIB_PRESENT | QIB_BADINTR)) != QIB_PRESENT) + /* + * This return value is not great, but we do not want the + * interrupt core code to remove our interrupt handler + * because we don't appear to be handling an interrupt + * during a chip reset. + */ + return IRQ_HANDLED; + + this_cpu_inc(*dd->int_counter); + + /* Clear the interrupt bit we expect to be set. */ + qib_write_kreg(dd, kr_intclear, ppd->hw_pidx ? + INT_MASK_P(SDma, 1) : INT_MASK_P(SDma, 0)); + qib_sdma_intr(ppd); + + return IRQ_HANDLED; +} + +/* + * Dedicated Send DMA idle interrupt handler. + */ +static irqreturn_t sdma_idle_intr(int irq, void *data) +{ + struct qib_pportdata *ppd = data; + struct qib_devdata *dd = ppd->dd; + + if ((dd->flags & (QIB_PRESENT | QIB_BADINTR)) != QIB_PRESENT) + /* + * This return value is not great, but we do not want the + * interrupt core code to remove our interrupt handler + * because we don't appear to be handling an interrupt + * during a chip reset. + */ + return IRQ_HANDLED; + + this_cpu_inc(*dd->int_counter); + + /* Clear the interrupt bit we expect to be set. */ + qib_write_kreg(dd, kr_intclear, ppd->hw_pidx ? + INT_MASK_P(SDmaIdle, 1) : INT_MASK_P(SDmaIdle, 0)); + qib_sdma_intr(ppd); + + return IRQ_HANDLED; +} + +/* + * Dedicated Send DMA progress interrupt handler. + */ +static irqreturn_t sdma_progress_intr(int irq, void *data) +{ + struct qib_pportdata *ppd = data; + struct qib_devdata *dd = ppd->dd; + + if ((dd->flags & (QIB_PRESENT | QIB_BADINTR)) != QIB_PRESENT) + /* + * This return value is not great, but we do not want the + * interrupt core code to remove our interrupt handler + * because we don't appear to be handling an interrupt + * during a chip reset. + */ + return IRQ_HANDLED; + + this_cpu_inc(*dd->int_counter); + + /* Clear the interrupt bit we expect to be set. */ + qib_write_kreg(dd, kr_intclear, ppd->hw_pidx ? + INT_MASK_P(SDmaProgress, 1) : + INT_MASK_P(SDmaProgress, 0)); + qib_sdma_intr(ppd); + + return IRQ_HANDLED; +} + +/* + * Dedicated Send DMA cleanup interrupt handler. 
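+ * Acks the per-port SDmaCleanupDone bit and feeds the
+ * qib_sdma_event_e20_hw_started event to the SDMA state machine,
+ * mirroring what the shared handler does in sdma_7322_intr().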
+ */ +static irqreturn_t sdma_cleanup_intr(int irq, void *data) +{ + struct qib_pportdata *ppd = data; + struct qib_devdata *dd = ppd->dd; + + if ((dd->flags & (QIB_PRESENT | QIB_BADINTR)) != QIB_PRESENT) + /* + * This return value is not great, but we do not want the + * interrupt core code to remove our interrupt handler + * because we don't appear to be handling an interrupt + * during a chip reset. + */ + return IRQ_HANDLED; + + this_cpu_inc(*dd->int_counter); + + /* Clear the interrupt bit we expect to be set. */ + qib_write_kreg(dd, kr_intclear, ppd->hw_pidx ? + INT_MASK_PM(SDmaCleanupDone, 1) : + INT_MASK_PM(SDmaCleanupDone, 0)); + qib_sdma_process_event(ppd, qib_sdma_event_e20_hw_started); + + return IRQ_HANDLED; +} + +#ifdef CONFIG_INFINIBAND_QIB_DCA + +static void reset_dca_notifier(struct qib_devdata *dd, struct qib_msix_entry *m) +{ + if (!m->dca) + return; + qib_devinfo(dd->pcidev, + "Disabling notifier on HCA %d irq %d\n", + dd->unit, + m->msix.vector); + irq_set_affinity_notifier( + m->msix.vector, + NULL); + m->notifier = NULL; +} + +static void setup_dca_notifier(struct qib_devdata *dd, struct qib_msix_entry *m) +{ + struct qib_irq_notify *n; + + if (!m->dca) + return; + n = kzalloc(sizeof(*n), GFP_KERNEL); + if (n) { + int ret; + + m->notifier = n; + n->notify.irq = m->msix.vector; + n->notify.notify = qib_irq_notifier_notify; + n->notify.release = qib_irq_notifier_release; + n->arg = m->arg; + n->rcv = m->rcv; + qib_devinfo(dd->pcidev, + "set notifier irq %d rcv %d notify %p\n", + n->notify.irq, n->rcv, &n->notify); + ret = irq_set_affinity_notifier( + n->notify.irq, + &n->notify); + if (ret) { + m->notifier = NULL; + kfree(n); + } + } +} + +#endif + +/* + * Set up our chip-specific interrupt handler. + * The interrupt type has already been setup, so + * we just need to do the registration and error checking. + * If we are using MSIx interrupts, we may fall back to + * INTx later, if the interrupt handler doesn't get called + * within 1/2 second (see verify_interrupt()). + */ +static void qib_setup_7322_interrupt(struct qib_devdata *dd, int clearpend) +{ + int ret, i, msixnum; + u64 redirect[6]; + u64 mask; + const struct cpumask *local_mask; + int firstcpu, secondcpu = 0, currrcvcpu = 0; + + if (!dd->num_pports) + return; + + if (clearpend) { + /* + * if not switching interrupt types, be sure interrupts are + * disabled, and then clear anything pending at this point, + * because we are starting clean. + */ + qib_7322_set_intr_state(dd, 0); + + /* clear the reset error, init error/hwerror mask */ + qib_7322_init_hwerrors(dd); + + /* clear any interrupt bits that might be set */ + qib_write_kreg(dd, kr_intclear, ~0ULL); + + /* make sure no pending MSIx intr, and clear diag reg */ + qib_write_kreg(dd, kr_intgranted, ~0ULL); + qib_write_kreg(dd, kr_vecclr_wo_int, ~0ULL); + } + + if (!dd->cspec->num_msix_entries) { + /* Try to get INTx interrupt */ +try_intx: + if (!dd->pcidev->irq) { + qib_dev_err(dd, + "irq is 0, BIOS error? 
Interrupts won't work\n"); + goto bail; + } + ret = request_irq(dd->pcidev->irq, qib_7322intr, + IRQF_SHARED, QIB_DRV_NAME, dd); + if (ret) { + qib_dev_err(dd, + "Couldn't setup INTx interrupt (irq=%d): %d\n", + dd->pcidev->irq, ret); + goto bail; + } + dd->cspec->irq = dd->pcidev->irq; + dd->cspec->main_int_mask = ~0ULL; + goto bail; + } + + /* Try to get MSIx interrupts */ + memset(redirect, 0, sizeof(redirect)); + mask = ~0ULL; + msixnum = 0; + local_mask = cpumask_of_pcibus(dd->pcidev->bus); + firstcpu = cpumask_first(local_mask); + if (firstcpu >= nr_cpu_ids || + cpumask_weight(local_mask) == num_online_cpus()) { + local_mask = topology_core_cpumask(0); + firstcpu = cpumask_first(local_mask); + } + if (firstcpu < nr_cpu_ids) { + secondcpu = cpumask_next(firstcpu, local_mask); + if (secondcpu >= nr_cpu_ids) + secondcpu = firstcpu; + currrcvcpu = secondcpu; + } + for (i = 0; msixnum < dd->cspec->num_msix_entries; i++) { + irq_handler_t handler; + void *arg; + u64 val; + int lsb, reg, sh; +#ifdef CONFIG_INFINIBAND_QIB_DCA + int dca = 0; +#endif + + dd->cspec->msix_entries[msixnum]. + name[sizeof(dd->cspec->msix_entries[msixnum].name) - 1] + = '\0'; + if (i < ARRAY_SIZE(irq_table)) { + if (irq_table[i].port) { + /* skip if for a non-configured port */ + if (irq_table[i].port > dd->num_pports) + continue; + arg = dd->pport + irq_table[i].port - 1; + } else + arg = dd; +#ifdef CONFIG_INFINIBAND_QIB_DCA + dca = irq_table[i].dca; +#endif + lsb = irq_table[i].lsb; + handler = irq_table[i].handler; + snprintf(dd->cspec->msix_entries[msixnum].name, + sizeof(dd->cspec->msix_entries[msixnum].name) + - 1, + QIB_DRV_NAME "%d%s", dd->unit, + irq_table[i].name); + } else { + unsigned ctxt; + + ctxt = i - ARRAY_SIZE(irq_table); + /* per krcvq context receive interrupt */ + arg = dd->rcd[ctxt]; + if (!arg) + continue; + if (qib_krcvq01_no_msi && ctxt < 2) + continue; +#ifdef CONFIG_INFINIBAND_QIB_DCA + dca = 1; +#endif + lsb = QIB_I_RCVAVAIL_LSB + ctxt; + handler = qib_7322pintr; + snprintf(dd->cspec->msix_entries[msixnum].name, + sizeof(dd->cspec->msix_entries[msixnum].name) + - 1, + QIB_DRV_NAME "%d (kctx)", dd->unit); + } + ret = request_irq( + dd->cspec->msix_entries[msixnum].msix.vector, + handler, 0, dd->cspec->msix_entries[msixnum].name, + arg); + if (ret) { + /* + * Shouldn't happen since the enable said we could + * have as many as we are trying to setup here. 
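+ * If it does happen, the MSIx vectors requested so far are torn down
+ * via qib_7322_nomsix() and we retry with a single shared INTx
+ * handler (the try_intx path above).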
+ */ + qib_dev_err(dd, + "Couldn't setup MSIx interrupt (vec=%d, irq=%d): %d\n", + msixnum, + dd->cspec->msix_entries[msixnum].msix.vector, + ret); + qib_7322_nomsix(dd); + goto try_intx; + } + dd->cspec->msix_entries[msixnum].arg = arg; +#ifdef CONFIG_INFINIBAND_QIB_DCA + dd->cspec->msix_entries[msixnum].dca = dca; + dd->cspec->msix_entries[msixnum].rcv = + handler == qib_7322pintr; +#endif + if (lsb >= 0) { + reg = lsb / IBA7322_REDIRECT_VEC_PER_REG; + sh = (lsb % IBA7322_REDIRECT_VEC_PER_REG) * + SYM_LSB(IntRedirect0, vec1); + mask &= ~(1ULL << lsb); + redirect[reg] |= ((u64) msixnum) << sh; + } + val = qib_read_kreg64(dd, 2 * msixnum + 1 + + (QIB_7322_MsixTable_OFFS / sizeof(u64))); + if (firstcpu < nr_cpu_ids && + zalloc_cpumask_var( + &dd->cspec->msix_entries[msixnum].mask, + GFP_KERNEL)) { + if (handler == qib_7322pintr) { + cpumask_set_cpu(currrcvcpu, + dd->cspec->msix_entries[msixnum].mask); + currrcvcpu = cpumask_next(currrcvcpu, + local_mask); + if (currrcvcpu >= nr_cpu_ids) + currrcvcpu = secondcpu; + } else { + cpumask_set_cpu(firstcpu, + dd->cspec->msix_entries[msixnum].mask); + } + irq_set_affinity_hint( + dd->cspec->msix_entries[msixnum].msix.vector, + dd->cspec->msix_entries[msixnum].mask); + } + msixnum++; + } + /* Initialize the vector mapping */ + for (i = 0; i < ARRAY_SIZE(redirect); i++) + qib_write_kreg(dd, kr_intredirect + i, redirect[i]); + dd->cspec->main_int_mask = mask; + tasklet_init(&dd->error_tasklet, qib_error_tasklet, + (unsigned long)dd); +bail:; +} + +/** + * qib_7322_boardname - fill in the board name and note features + * @dd: the qlogic_ib device + * + * info will be based on the board revision register + */ +static unsigned qib_7322_boardname(struct qib_devdata *dd) +{ + /* Will need enumeration of board-types here */ + char *n; + u32 boardid, namelen; + unsigned features = DUAL_PORT_CAP; + + boardid = SYM_FIELD(dd->revision, Revision, BoardID); + + switch (boardid) { + case 0: + n = "InfiniPath_QLE7342_Emulation"; + break; + case 1: + n = "InfiniPath_QLE7340"; + dd->flags |= QIB_HAS_QSFP; + features = PORT_SPD_CAP; + break; + case 2: + n = "InfiniPath_QLE7342"; + dd->flags |= QIB_HAS_QSFP; + break; + case 3: + n = "InfiniPath_QMI7342"; + break; + case 4: + n = "InfiniPath_Unsupported7342"; + qib_dev_err(dd, "Unsupported version of QMH7342\n"); + features = 0; + break; + case BOARD_QMH7342: + n = "InfiniPath_QMH7342"; + features = 0x24; + break; + case BOARD_QME7342: + n = "InfiniPath_QME7342"; + break; + case 8: + n = "InfiniPath_QME7362"; + dd->flags |= QIB_HAS_QSFP; + break; + case BOARD_QMH7360: + n = "Intel IB QDR 1P FLR-QSFP Adptr"; + dd->flags |= QIB_HAS_QSFP; + break; + case 15: + n = "InfiniPath_QLE7342_TEST"; + dd->flags |= QIB_HAS_QSFP; + break; + default: + n = "InfiniPath_QLE73xy_UNKNOWN"; + qib_dev_err(dd, "Unknown 7322 board type %u\n", boardid); + break; + } + dd->board_atten = 1; /* index into txdds_Xdr */ + + namelen = strlen(n) + 1; + dd->boardname = kmalloc(namelen, GFP_KERNEL); + if (!dd->boardname) + qib_dev_err(dd, "Failed allocation for board name: %s\n", n); + else + snprintf(dd->boardname, namelen, "%s", n); + + snprintf(dd->boardversion, sizeof(dd->boardversion), + "ChipABI %u.%u, %s, InfiniPath%u %u.%u, SW Compat %u\n", + QIB_CHIP_VERS_MAJ, QIB_CHIP_VERS_MIN, dd->boardname, + (unsigned)SYM_FIELD(dd->revision, Revision_R, Arch), + dd->majrev, dd->minrev, + (unsigned)SYM_FIELD(dd->revision, Revision_R, SW)); + + if (qib_singleport && (features >> PORT_SPD_CAP_SHIFT) & PORT_SPD_CAP) { + qib_devinfo(dd->pcidev, + "IB%u: Forced 
to single port mode by module parameter\n", + dd->unit); + features &= PORT_SPD_CAP; + } + + return features; +} + +/* + * This routine sleeps, so it can only be called from user context, not + * from interrupt context. + */ +static int qib_do_7322_reset(struct qib_devdata *dd) +{ + u64 val; + u64 *msix_vecsave; + int i, msix_entries, ret = 1; + u16 cmdval; + u8 int_line, clinesz; + unsigned long flags; + + /* Use dev_err so it shows up in logs, etc. */ + qib_dev_err(dd, "Resetting InfiniPath unit %u\n", dd->unit); + + qib_pcie_getcmd(dd, &cmdval, &int_line, &clinesz); + + msix_entries = dd->cspec->num_msix_entries; + + /* no interrupts till re-initted */ + qib_7322_set_intr_state(dd, 0); + + if (msix_entries) { + qib_7322_nomsix(dd); + /* can be up to 512 bytes, too big for stack */ + msix_vecsave = kmalloc(2 * dd->cspec->num_msix_entries * + sizeof(u64), GFP_KERNEL); + if (!msix_vecsave) + qib_dev_err(dd, "No mem to save MSIx data\n"); + } else + msix_vecsave = NULL; + + /* + * Core PCI (as of 2.6.18) doesn't save or rewrite the full vector + * info that is set up by the BIOS, so we have to save and restore + * it ourselves. There is some risk something could change it, + * after we save it, but since we have disabled the MSIx, it + * shouldn't be touched... + */ + for (i = 0; i < msix_entries; i++) { + u64 vecaddr, vecdata; + + vecaddr = qib_read_kreg64(dd, 2 * i + + (QIB_7322_MsixTable_OFFS / sizeof(u64))); + vecdata = qib_read_kreg64(dd, 1 + 2 * i + + (QIB_7322_MsixTable_OFFS / sizeof(u64))); + if (msix_vecsave) { + msix_vecsave[2 * i] = vecaddr; + /* save it without the masked bit set */ + msix_vecsave[1 + 2 * i] = vecdata & ~0x100000000ULL; + } + } + + dd->pport->cpspec->ibdeltainprog = 0; + dd->pport->cpspec->ibsymdelta = 0; + dd->pport->cpspec->iblnkerrdelta = 0; + dd->pport->cpspec->ibmalfdelta = 0; + /* so we check interrupts work again */ + dd->z_int_counter = qib_int_counter(dd); + + /* + * Keep chip from being accessed until we are ready. Use + * writeq() directly, to allow the write even though QIB_PRESENT + * isn't set. + */ + dd->flags &= ~(QIB_INITTED | QIB_PRESENT | QIB_BADINTR); + dd->flags |= QIB_DOING_RESET; + val = dd->control | QLOGIC_IB_C_RESET; + writeq(val, &dd->kregbase[kr_control]); + + for (i = 1; i <= 5; i++) { + /* + * Allow MBIST, etc. to complete; longer on each retry. + * We sometimes get machine checks from bus timeout if no + * response, so for now, make it *really* long. + */ + msleep(1000 + (1 + i) * 3000); + + qib_pcie_reenable(dd, cmdval, int_line, clinesz); + + /* + * Use readq directly, so we don't need to mark it as PRESENT + * until we get a successful indication that all is well. + */ + val = readq(&dd->kregbase[kr_revision]); + if (val == dd->revision) + break; + if (i == 5) { + qib_dev_err(dd, + "Failed to initialize after reset, unusable\n"); + ret = 0; + goto bail; + } + } + + dd->flags |= QIB_PRESENT; /* it's back */ + + if (msix_entries) { + /* restore the MSIx vector address and data if saved above */ + for (i = 0; i < msix_entries; i++) { + dd->cspec->msix_entries[i].msix.entry = i; + if (!msix_vecsave || !msix_vecsave[2 * i]) + continue; + qib_write_kreg(dd, 2 * i + + (QIB_7322_MsixTable_OFFS / sizeof(u64)), + msix_vecsave[2 * i]); + qib_write_kreg(dd, 1 + 2 * i + + (QIB_7322_MsixTable_OFFS / sizeof(u64)), + msix_vecsave[1 + 2 * i]); + } + } + + /* initialize the remaining registers. 
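+ * Each port's init registers are rewritten, then the device-wide
+ * ones; PCIe parameters and the interrupt setup are re-established
+ * afterwards, and every port is flagged QIBL_IB_FORCE_NOTIFY.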
*/ + for (i = 0; i < dd->num_pports; ++i) + write_7322_init_portregs(&dd->pport[i]); + write_7322_initregs(dd); + + if (qib_pcie_params(dd, dd->lbus_width, + &dd->cspec->num_msix_entries, + dd->cspec->msix_entries)) + qib_dev_err(dd, + "Reset failed to setup PCIe or interrupts; continuing anyway\n"); + + qib_setup_7322_interrupt(dd, 1); + + for (i = 0; i < dd->num_pports; ++i) { + struct qib_pportdata *ppd = &dd->pport[i]; + + spin_lock_irqsave(&ppd->lflags_lock, flags); + ppd->lflags |= QIBL_IB_FORCE_NOTIFY; + ppd->lflags &= ~QIBL_IB_AUTONEG_FAILED; + spin_unlock_irqrestore(&ppd->lflags_lock, flags); + } + +bail: + dd->flags &= ~QIB_DOING_RESET; /* OK or not, no longer resetting */ + kfree(msix_vecsave); + return ret; +} + +/** + * qib_7322_put_tid - write a TID to the chip + * @dd: the qlogic_ib device + * @tidptr: pointer to the expected TID (in chip) to update + * @tidtype: 0 for eager, 1 for expected + * @pa: physical address of in memory buffer; tidinvalid if freeing + */ +static void qib_7322_put_tid(struct qib_devdata *dd, u64 __iomem *tidptr, + u32 type, unsigned long pa) +{ + if (!(dd->flags & QIB_PRESENT)) + return; + if (pa != dd->tidinvalid) { + u64 chippa = pa >> IBA7322_TID_PA_SHIFT; + + /* paranoia checks */ + if (pa != (chippa << IBA7322_TID_PA_SHIFT)) { + qib_dev_err(dd, "Physaddr %lx not 2KB aligned!\n", + pa); + return; + } + if (chippa >= (1UL << IBA7322_TID_SZ_SHIFT)) { + qib_dev_err(dd, + "Physical page address 0x%lx larger than supported\n", + pa); + return; + } + + if (type == RCVHQ_RCV_TYPE_EAGER) + chippa |= dd->tidtemplate; + else /* for now, always full 4KB page */ + chippa |= IBA7322_TID_SZ_4K; + pa = chippa; + } + writeq(pa, tidptr); + mmiowb(); +} + +/** + * qib_7322_clear_tids - clear all TID entries for a ctxt, expected and eager + * @dd: the qlogic_ib device + * @ctxt: the ctxt + * + * clear all TID entries for a ctxt, expected and eager. + * Used from qib_close(). + */ +static void qib_7322_clear_tids(struct qib_devdata *dd, + struct qib_ctxtdata *rcd) +{ + u64 __iomem *tidbase; + unsigned long tidinv; + u32 ctxt; + int i; + + if (!dd->kregbase || !rcd) + return; + + ctxt = rcd->ctxt; + + tidinv = dd->tidinvalid; + tidbase = (u64 __iomem *) + ((char __iomem *) dd->kregbase + + dd->rcvtidbase + + ctxt * dd->rcvtidcnt * sizeof(*tidbase)); + + for (i = 0; i < dd->rcvtidcnt; i++) + qib_7322_put_tid(dd, &tidbase[i], RCVHQ_RCV_TYPE_EXPECTED, + tidinv); + + tidbase = (u64 __iomem *) + ((char __iomem *) dd->kregbase + + dd->rcvegrbase + + rcd->rcvegr_tid_base * sizeof(*tidbase)); + + for (i = 0; i < rcd->rcvegrcnt; i++) + qib_7322_put_tid(dd, &tidbase[i], RCVHQ_RCV_TYPE_EAGER, + tidinv); +} + +/** + * qib_7322_tidtemplate - setup constants for TID updates + * @dd: the qlogic_ib device + * + * We setup stuff that we use a lot, to avoid calculating each time + */ +static void qib_7322_tidtemplate(struct qib_devdata *dd) +{ + /* + * For now, we always allocate 4KB buffers (at init) so we can + * receive max size packets. We may want a module parameter to + * specify 2KB or 4KB and/or make it per port instead of per device + * for those who want to reduce memory footprint. Note that the + * rcvhdrentsize size must be large enough to hold the largest + * IB header (currently 96 bytes) that we expect to handle (plus of + * course the 2 dwords of RHF). 
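+ * In dword terms that is 96 / 4 = 24 dwords of header plus the
+ * 2 dwords of RHF, i.e. at least 26 dwords per entry.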
+ */ + if (dd->rcvegrbufsize == 2048) + dd->tidtemplate = IBA7322_TID_SZ_2K; + else if (dd->rcvegrbufsize == 4096) + dd->tidtemplate = IBA7322_TID_SZ_4K; + dd->tidinvalid = 0; +} + +/** + * qib_init_7322_get_base_info - set chip-specific flags for user code + * @rcd: the qlogic_ib ctxt + * @kbase: qib_base_info pointer + * + * We set the PCIE flag because the lower bandwidth on PCIe vs + * HyperTransport can affect some user packet algorithims. + */ + +static int qib_7322_get_base_info(struct qib_ctxtdata *rcd, + struct qib_base_info *kinfo) +{ + kinfo->spi_runtime_flags |= QIB_RUNTIME_CTXT_MSB_IN_QP | + QIB_RUNTIME_PCIE | QIB_RUNTIME_NODMA_RTAIL | + QIB_RUNTIME_HDRSUPP | QIB_RUNTIME_SDMA; + if (rcd->dd->cspec->r1) + kinfo->spi_runtime_flags |= QIB_RUNTIME_RCHK; + if (rcd->dd->flags & QIB_USE_SPCL_TRIG) + kinfo->spi_runtime_flags |= QIB_RUNTIME_SPECIAL_TRIGGER; + + return 0; +} + +static struct qib_message_header * +qib_7322_get_msgheader(struct qib_devdata *dd, __le32 *rhf_addr) +{ + u32 offset = qib_hdrget_offset(rhf_addr); + + return (struct qib_message_header *) + (rhf_addr - dd->rhf_offset + offset); +} + +/* + * Configure number of contexts. + */ +static void qib_7322_config_ctxts(struct qib_devdata *dd) +{ + unsigned long flags; + u32 nchipctxts; + + nchipctxts = qib_read_kreg32(dd, kr_contextcnt); + dd->cspec->numctxts = nchipctxts; + if (qib_n_krcv_queues > 1 && dd->num_pports) { + dd->first_user_ctxt = NUM_IB_PORTS + + (qib_n_krcv_queues - 1) * dd->num_pports; + if (dd->first_user_ctxt > nchipctxts) + dd->first_user_ctxt = nchipctxts; + dd->n_krcv_queues = dd->first_user_ctxt / dd->num_pports; + } else { + dd->first_user_ctxt = NUM_IB_PORTS; + dd->n_krcv_queues = 1; + } + + if (!qib_cfgctxts) { + int nctxts = dd->first_user_ctxt + num_online_cpus(); + + if (nctxts <= 6) + dd->ctxtcnt = 6; + else if (nctxts <= 10) + dd->ctxtcnt = 10; + else if (nctxts <= nchipctxts) + dd->ctxtcnt = nchipctxts; + } else if (qib_cfgctxts < dd->num_pports) + dd->ctxtcnt = dd->num_pports; + else if (qib_cfgctxts <= nchipctxts) + dd->ctxtcnt = qib_cfgctxts; + if (!dd->ctxtcnt) /* none of the above, set to max */ + dd->ctxtcnt = nchipctxts; + + /* + * Chip can be configured for 6, 10, or 18 ctxts, and choice + * affects number of eager TIDs per ctxt (1K, 2K, 4K). + * Lock to be paranoid about later motion, etc. + */ + spin_lock_irqsave(&dd->cspec->rcvmod_lock, flags); + if (dd->ctxtcnt > 10) + dd->rcvctrl |= 2ULL << SYM_LSB(RcvCtrl, ContextCfg); + else if (dd->ctxtcnt > 6) + dd->rcvctrl |= 1ULL << SYM_LSB(RcvCtrl, ContextCfg); + /* else configure for default 6 receive ctxts */ + + /* The XRC opcode is 5. */ + dd->rcvctrl |= 5ULL << SYM_LSB(RcvCtrl, XrcTypeCode); + + /* + * RcvCtrl *must* be written here so that the + * chip understands how to change rcvegrcnt below. + */ + qib_write_kreg(dd, kr_rcvctrl, dd->rcvctrl); + spin_unlock_irqrestore(&dd->cspec->rcvmod_lock, flags); + + /* kr_rcvegrcnt changes based on the number of contexts enabled */ + dd->cspec->rcvegrcnt = qib_read_kreg32(dd, kr_rcvegrcnt); + if (qib_rcvhdrcnt) + dd->rcvhdrcnt = max(dd->cspec->rcvegrcnt, qib_rcvhdrcnt); + else + dd->rcvhdrcnt = 2 * max(dd->cspec->rcvegrcnt, + dd->num_pports > 1 ? 
1024U : 2048U); +} + +static int qib_7322_get_ib_cfg(struct qib_pportdata *ppd, int which) +{ + + int lsb, ret = 0; + u64 maskr; /* right-justified mask */ + + switch (which) { + + case QIB_IB_CFG_LWID_ENB: /* Get allowed Link-width */ + ret = ppd->link_width_enabled; + goto done; + + case QIB_IB_CFG_LWID: /* Get currently active Link-width */ + ret = ppd->link_width_active; + goto done; + + case QIB_IB_CFG_SPD_ENB: /* Get allowed Link speeds */ + ret = ppd->link_speed_enabled; + goto done; + + case QIB_IB_CFG_SPD: /* Get current Link spd */ + ret = ppd->link_speed_active; + goto done; + + case QIB_IB_CFG_RXPOL_ENB: /* Get Auto-RX-polarity enable */ + lsb = SYM_LSB(IBCCtrlB_0, IB_POLARITY_REV_SUPP); + maskr = SYM_RMASK(IBCCtrlB_0, IB_POLARITY_REV_SUPP); + break; + + case QIB_IB_CFG_LREV_ENB: /* Get Auto-Lane-reversal enable */ + lsb = SYM_LSB(IBCCtrlB_0, IB_LANE_REV_SUPPORTED); + maskr = SYM_RMASK(IBCCtrlB_0, IB_LANE_REV_SUPPORTED); + break; + + case QIB_IB_CFG_LINKLATENCY: + ret = qib_read_kreg_port(ppd, krp_ibcstatus_b) & + SYM_MASK(IBCStatusB_0, LinkRoundTripLatency); + goto done; + + case QIB_IB_CFG_OP_VLS: + ret = ppd->vls_operational; + goto done; + + case QIB_IB_CFG_VL_HIGH_CAP: + ret = 16; + goto done; + + case QIB_IB_CFG_VL_LOW_CAP: + ret = 16; + goto done; + + case QIB_IB_CFG_OVERRUN_THRESH: /* IB overrun threshold */ + ret = SYM_FIELD(ppd->cpspec->ibcctrl_a, IBCCtrlA_0, + OverrunThreshold); + goto done; + + case QIB_IB_CFG_PHYERR_THRESH: /* IB PHY error threshold */ + ret = SYM_FIELD(ppd->cpspec->ibcctrl_a, IBCCtrlA_0, + PhyerrThreshold); + goto done; + + case QIB_IB_CFG_LINKDEFAULT: /* IB link default (sleep/poll) */ + /* will only take effect when the link state changes */ + ret = (ppd->cpspec->ibcctrl_a & + SYM_MASK(IBCCtrlA_0, LinkDownDefaultState)) ? + IB_LINKINITCMD_SLEEP : IB_LINKINITCMD_POLL; + goto done; + + case QIB_IB_CFG_HRTBT: /* Get Heartbeat off/enable/auto */ + lsb = IBA7322_IBC_HRTBT_LSB; + maskr = IBA7322_IBC_HRTBT_RMASK; /* OR of AUTO and ENB */ + break; + + case QIB_IB_CFG_PMA_TICKS: + /* + * 0x00 = 10x link transfer rate or 4 nsec. for 2.5Gbs + * Since the clock is always 250MHz, the value is 3, 1 or 0. + */ + if (ppd->link_speed_active == QIB_IB_QDR) + ret = 3; + else if (ppd->link_speed_active == QIB_IB_DDR) + ret = 1; + else + ret = 0; + goto done; + + default: + ret = -EINVAL; + goto done; + } + ret = (int)((ppd->cpspec->ibcctrl_b >> lsb) & maskr); +done: + return ret; +} + +/* + * Below again cribbed liberally from older version. Do not lean + * heavily on it. + */ +#define IBA7322_IBC_DLIDLMC_SHIFT QIB_7322_IBCCtrlB_0_IB_DLID_LSB +#define IBA7322_IBC_DLIDLMC_MASK (QIB_7322_IBCCtrlB_0_IB_DLID_RMASK \ + | (QIB_7322_IBCCtrlB_0_IB_DLID_MASK_RMASK << 16)) + +static int qib_7322_set_ib_cfg(struct qib_pportdata *ppd, int which, u32 val) +{ + struct qib_devdata *dd = ppd->dd; + u64 maskr; /* right-justified mask */ + int lsb, ret = 0; + u16 lcmd, licmd; + unsigned long flags; + + switch (which) { + case QIB_IB_CFG_LIDLMC: + /* + * Set LID and LMC. Combined to avoid possible hazard + * caller puts LMC in 16MSbits, DLID in 16LSbits of val + */ + lsb = IBA7322_IBC_DLIDLMC_SHIFT; + maskr = IBA7322_IBC_DLIDLMC_MASK; + /* + * For header-checking, the SLID in the packet will + * be masked with SendIBSLMCMask, and compared + * with SendIBSLIDAssignMask. Make sure we do not + * set any bits not covered by the mask, or we get + * false-positives. 
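+ * Concretely, krp_sendslidmask below is written with the top half of
+ * val (clipped to SendIBSLMCMask) and krp_sendslid with the LID ANDed
+ * against that same top half (clipped to SendIBSLIDAssignMask), so no
+ * bits outside the mask can appear in the compare value.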
+ */ + qib_write_kreg_port(ppd, krp_sendslid, + val & (val >> 16) & SendIBSLIDAssignMask); + qib_write_kreg_port(ppd, krp_sendslidmask, + (val >> 16) & SendIBSLMCMask); + break; + + case QIB_IB_CFG_LWID_ENB: /* set allowed Link-width */ + ppd->link_width_enabled = val; + /* convert IB value to chip register value */ + if (val == IB_WIDTH_1X) + val = 0; + else if (val == IB_WIDTH_4X) + val = 1; + else + val = 3; + maskr = SYM_RMASK(IBCCtrlB_0, IB_NUM_CHANNELS); + lsb = SYM_LSB(IBCCtrlB_0, IB_NUM_CHANNELS); + break; + + case QIB_IB_CFG_SPD_ENB: /* set allowed Link speeds */ + /* + * As with width, only write the actual register if the + * link is currently down, otherwise takes effect on next + * link change. Since setting is being explicitly requested + * (via MAD or sysfs), clear autoneg failure status if speed + * autoneg is enabled. + */ + ppd->link_speed_enabled = val; + val <<= IBA7322_IBC_SPEED_LSB; + maskr = IBA7322_IBC_SPEED_MASK | IBA7322_IBC_IBTA_1_2_MASK | + IBA7322_IBC_MAX_SPEED_MASK; + if (val & (val - 1)) { + /* Muliple speeds enabled */ + val |= IBA7322_IBC_IBTA_1_2_MASK | + IBA7322_IBC_MAX_SPEED_MASK; + spin_lock_irqsave(&ppd->lflags_lock, flags); + ppd->lflags &= ~QIBL_IB_AUTONEG_FAILED; + spin_unlock_irqrestore(&ppd->lflags_lock, flags); + } else if (val & IBA7322_IBC_SPEED_QDR) + val |= IBA7322_IBC_IBTA_1_2_MASK; + /* IBTA 1.2 mode + min/max + speed bits are contiguous */ + lsb = SYM_LSB(IBCCtrlB_0, IB_ENHANCED_MODE); + break; + + case QIB_IB_CFG_RXPOL_ENB: /* set Auto-RX-polarity enable */ + lsb = SYM_LSB(IBCCtrlB_0, IB_POLARITY_REV_SUPP); + maskr = SYM_RMASK(IBCCtrlB_0, IB_POLARITY_REV_SUPP); + break; + + case QIB_IB_CFG_LREV_ENB: /* set Auto-Lane-reversal enable */ + lsb = SYM_LSB(IBCCtrlB_0, IB_LANE_REV_SUPPORTED); + maskr = SYM_RMASK(IBCCtrlB_0, IB_LANE_REV_SUPPORTED); + break; + + case QIB_IB_CFG_OVERRUN_THRESH: /* IB overrun threshold */ + maskr = SYM_FIELD(ppd->cpspec->ibcctrl_a, IBCCtrlA_0, + OverrunThreshold); + if (maskr != val) { + ppd->cpspec->ibcctrl_a &= + ~SYM_MASK(IBCCtrlA_0, OverrunThreshold); + ppd->cpspec->ibcctrl_a |= (u64) val << + SYM_LSB(IBCCtrlA_0, OverrunThreshold); + qib_write_kreg_port(ppd, krp_ibcctrl_a, + ppd->cpspec->ibcctrl_a); + qib_write_kreg(dd, kr_scratch, 0ULL); + } + goto bail; + + case QIB_IB_CFG_PHYERR_THRESH: /* IB PHY error threshold */ + maskr = SYM_FIELD(ppd->cpspec->ibcctrl_a, IBCCtrlA_0, + PhyerrThreshold); + if (maskr != val) { + ppd->cpspec->ibcctrl_a &= + ~SYM_MASK(IBCCtrlA_0, PhyerrThreshold); + ppd->cpspec->ibcctrl_a |= (u64) val << + SYM_LSB(IBCCtrlA_0, PhyerrThreshold); + qib_write_kreg_port(ppd, krp_ibcctrl_a, + ppd->cpspec->ibcctrl_a); + qib_write_kreg(dd, kr_scratch, 0ULL); + } + goto bail; + + case QIB_IB_CFG_PKEYS: /* update pkeys */ + maskr = (u64) ppd->pkeys[0] | ((u64) ppd->pkeys[1] << 16) | + ((u64) ppd->pkeys[2] << 32) | + ((u64) ppd->pkeys[3] << 48); + qib_write_kreg_port(ppd, krp_partitionkey, maskr); + goto bail; + + case QIB_IB_CFG_LINKDEFAULT: /* IB link default (sleep/poll) */ + /* will only take effect when the link state changes */ + if (val == IB_LINKINITCMD_POLL) + ppd->cpspec->ibcctrl_a &= + ~SYM_MASK(IBCCtrlA_0, LinkDownDefaultState); + else /* SLEEP */ + ppd->cpspec->ibcctrl_a |= + SYM_MASK(IBCCtrlA_0, LinkDownDefaultState); + qib_write_kreg_port(ppd, krp_ibcctrl_a, ppd->cpspec->ibcctrl_a); + qib_write_kreg(dd, kr_scratch, 0ULL); + goto bail; + + case QIB_IB_CFG_MTU: /* update the MTU in IBC */ + /* + * Update our housekeeping variables, and set IBC max + * size, same as init code; max IBC is max 
we allow in + * buffer, less the qword pbc, plus 1 for ICRC, in dwords + * Set even if it's unchanged, print debug message only + * on changes. + */ + val = (ppd->ibmaxlen >> 2) + 1; + ppd->cpspec->ibcctrl_a &= ~SYM_MASK(IBCCtrlA_0, MaxPktLen); + ppd->cpspec->ibcctrl_a |= (u64)val << + SYM_LSB(IBCCtrlA_0, MaxPktLen); + qib_write_kreg_port(ppd, krp_ibcctrl_a, + ppd->cpspec->ibcctrl_a); + qib_write_kreg(dd, kr_scratch, 0ULL); + goto bail; + + case QIB_IB_CFG_LSTATE: /* set the IB link state */ + switch (val & 0xffff0000) { + case IB_LINKCMD_DOWN: + lcmd = QLOGIC_IB_IBCC_LINKCMD_DOWN; + ppd->cpspec->ibmalfusesnap = 1; + ppd->cpspec->ibmalfsnap = read_7322_creg32_port(ppd, + crp_errlink); + if (!ppd->cpspec->ibdeltainprog && + qib_compat_ddr_negotiate) { + ppd->cpspec->ibdeltainprog = 1; + ppd->cpspec->ibsymsnap = + read_7322_creg32_port(ppd, + crp_ibsymbolerr); + ppd->cpspec->iblnkerrsnap = + read_7322_creg32_port(ppd, + crp_iblinkerrrecov); + } + break; + + case IB_LINKCMD_ARMED: + lcmd = QLOGIC_IB_IBCC_LINKCMD_ARMED; + if (ppd->cpspec->ibmalfusesnap) { + ppd->cpspec->ibmalfusesnap = 0; + ppd->cpspec->ibmalfdelta += + read_7322_creg32_port(ppd, + crp_errlink) - + ppd->cpspec->ibmalfsnap; + } + break; + + case IB_LINKCMD_ACTIVE: + lcmd = QLOGIC_IB_IBCC_LINKCMD_ACTIVE; + break; + + default: + ret = -EINVAL; + qib_dev_err(dd, "bad linkcmd req 0x%x\n", val >> 16); + goto bail; + } + switch (val & 0xffff) { + case IB_LINKINITCMD_NOP: + licmd = 0; + break; + + case IB_LINKINITCMD_POLL: + licmd = QLOGIC_IB_IBCC_LINKINITCMD_POLL; + break; + + case IB_LINKINITCMD_SLEEP: + licmd = QLOGIC_IB_IBCC_LINKINITCMD_SLEEP; + break; + + case IB_LINKINITCMD_DISABLE: + licmd = QLOGIC_IB_IBCC_LINKINITCMD_DISABLE; + ppd->cpspec->chase_end = 0; + /* + * stop state chase counter and timer, if running. + * wait forpending timer, but don't clear .data (ppd)! + */ + if (ppd->cpspec->chase_timer.expires) { + del_timer_sync(&ppd->cpspec->chase_timer); + ppd->cpspec->chase_timer.expires = 0; + } + break; + + default: + ret = -EINVAL; + qib_dev_err(dd, "bad linkinitcmd req 0x%x\n", + val & 0xffff); + goto bail; + } + qib_set_ib_7322_lstate(ppd, lcmd, licmd); + goto bail; + + case QIB_IB_CFG_OP_VLS: + if (ppd->vls_operational != val) { + ppd->vls_operational = val; + set_vls(ppd); + } + goto bail; + + case QIB_IB_CFG_VL_HIGH_LIMIT: + qib_write_kreg_port(ppd, krp_highprio_limit, val); + goto bail; + + case QIB_IB_CFG_HRTBT: /* set Heartbeat off/enable/auto */ + if (val > 3) { + ret = -EINVAL; + goto bail; + } + lsb = IBA7322_IBC_HRTBT_LSB; + maskr = IBA7322_IBC_HRTBT_RMASK; /* OR of AUTO and ENB */ + break; + + case QIB_IB_CFG_PORT: + /* val is the port number of the switch we are connected to. 
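+ * Only the rev1 silicon path does anything with this: any pending
+ * ipg_work is cancelled and ipg_tries reset, presumably because a
+ * change of peer invalidates the inter-packet-gap tuning in progress.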
*/ + if (ppd->dd->cspec->r1) { + cancel_delayed_work(&ppd->cpspec->ipg_work); + ppd->cpspec->ipg_tries = 0; + } + goto bail; + + default: + ret = -EINVAL; + goto bail; + } + ppd->cpspec->ibcctrl_b &= ~(maskr << lsb); + ppd->cpspec->ibcctrl_b |= (((u64) val & maskr) << lsb); + qib_write_kreg_port(ppd, krp_ibcctrl_b, ppd->cpspec->ibcctrl_b); + qib_write_kreg(dd, kr_scratch, 0); +bail: + return ret; +} + +static int qib_7322_set_loopback(struct qib_pportdata *ppd, const char *what) +{ + int ret = 0; + u64 val, ctrlb; + + /* only IBC loopback, may add serdes and xgxs loopbacks later */ + if (!strncmp(what, "ibc", 3)) { + ppd->cpspec->ibcctrl_a |= SYM_MASK(IBCCtrlA_0, + Loopback); + val = 0; /* disable heart beat, so link will come up */ + qib_devinfo(ppd->dd->pcidev, "Enabling IB%u:%u IBC loopback\n", + ppd->dd->unit, ppd->port); + } else if (!strncmp(what, "off", 3)) { + ppd->cpspec->ibcctrl_a &= ~SYM_MASK(IBCCtrlA_0, + Loopback); + /* enable heart beat again */ + val = IBA7322_IBC_HRTBT_RMASK << IBA7322_IBC_HRTBT_LSB; + qib_devinfo(ppd->dd->pcidev, + "Disabling IB%u:%u IBC loopback (normal)\n", + ppd->dd->unit, ppd->port); + } else + ret = -EINVAL; + if (!ret) { + qib_write_kreg_port(ppd, krp_ibcctrl_a, + ppd->cpspec->ibcctrl_a); + ctrlb = ppd->cpspec->ibcctrl_b & ~(IBA7322_IBC_HRTBT_MASK + << IBA7322_IBC_HRTBT_LSB); + ppd->cpspec->ibcctrl_b = ctrlb | val; + qib_write_kreg_port(ppd, krp_ibcctrl_b, + ppd->cpspec->ibcctrl_b); + qib_write_kreg(ppd->dd, kr_scratch, 0); + } + return ret; +} + +static void get_vl_weights(struct qib_pportdata *ppd, unsigned regno, + struct ib_vl_weight_elem *vl) +{ + unsigned i; + + for (i = 0; i < 16; i++, regno++, vl++) { + u32 val = qib_read_kreg_port(ppd, regno); + + vl->vl = (val >> SYM_LSB(LowPriority0_0, VirtualLane)) & + SYM_RMASK(LowPriority0_0, VirtualLane); + vl->weight = (val >> SYM_LSB(LowPriority0_0, Weight)) & + SYM_RMASK(LowPriority0_0, Weight); + } +} + +static void set_vl_weights(struct qib_pportdata *ppd, unsigned regno, + struct ib_vl_weight_elem *vl) +{ + unsigned i; + + for (i = 0; i < 16; i++, regno++, vl++) { + u64 val; + + val = ((vl->vl & SYM_RMASK(LowPriority0_0, VirtualLane)) << + SYM_LSB(LowPriority0_0, VirtualLane)) | + ((vl->weight & SYM_RMASK(LowPriority0_0, Weight)) << + SYM_LSB(LowPriority0_0, Weight)); + qib_write_kreg_port(ppd, regno, val); + } + if (!(ppd->p_sendctrl & SYM_MASK(SendCtrl_0, IBVLArbiterEn))) { + struct qib_devdata *dd = ppd->dd; + unsigned long flags; + + spin_lock_irqsave(&dd->sendctrl_lock, flags); + ppd->p_sendctrl |= SYM_MASK(SendCtrl_0, IBVLArbiterEn); + qib_write_kreg_port(ppd, krp_sendctrl, ppd->p_sendctrl); + qib_write_kreg(dd, kr_scratch, 0); + spin_unlock_irqrestore(&dd->sendctrl_lock, flags); + } +} + +static int qib_7322_get_ib_table(struct qib_pportdata *ppd, int which, void *t) +{ + switch (which) { + case QIB_IB_TBL_VL_HIGH_ARB: + get_vl_weights(ppd, krp_highprio_0, t); + break; + + case QIB_IB_TBL_VL_LOW_ARB: + get_vl_weights(ppd, krp_lowprio_0, t); + break; + + default: + return -EINVAL; + } + return 0; +} + +static int qib_7322_set_ib_table(struct qib_pportdata *ppd, int which, void *t) +{ + switch (which) { + case QIB_IB_TBL_VL_HIGH_ARB: + set_vl_weights(ppd, krp_highprio_0, t); + break; + + case QIB_IB_TBL_VL_LOW_ARB: + set_vl_weights(ppd, krp_lowprio_0, t); + break; + + default: + return -EINVAL; + } + return 0; +} + +static void qib_update_7322_usrhead(struct qib_ctxtdata *rcd, u64 hd, + u32 updegr, u32 egrhd, u32 npkts) +{ + /* + * Need to write timeout register before updating rcvhdrhead to 
ensure + * that the timer is enabled on reception of a packet. + */ + if (hd >> IBA7322_HDRHEAD_PKTINT_SHIFT) + adjust_rcv_timeout(rcd, npkts); + if (updegr) + qib_write_ureg(rcd->dd, ur_rcvegrindexhead, egrhd, rcd->ctxt); + mmiowb(); + qib_write_ureg(rcd->dd, ur_rcvhdrhead, hd, rcd->ctxt); + qib_write_ureg(rcd->dd, ur_rcvhdrhead, hd, rcd->ctxt); + mmiowb(); +} + +static u32 qib_7322_hdrqempty(struct qib_ctxtdata *rcd) +{ + u32 head, tail; + + head = qib_read_ureg32(rcd->dd, ur_rcvhdrhead, rcd->ctxt); + if (rcd->rcvhdrtail_kvaddr) + tail = qib_get_rcvhdrtail(rcd); + else + tail = qib_read_ureg32(rcd->dd, ur_rcvhdrtail, rcd->ctxt); + return head == tail; +} + +#define RCVCTRL_COMMON_MODS (QIB_RCVCTRL_CTXT_ENB | \ + QIB_RCVCTRL_CTXT_DIS | \ + QIB_RCVCTRL_TIDFLOW_ENB | \ + QIB_RCVCTRL_TIDFLOW_DIS | \ + QIB_RCVCTRL_TAILUPD_ENB | \ + QIB_RCVCTRL_TAILUPD_DIS | \ + QIB_RCVCTRL_INTRAVAIL_ENB | \ + QIB_RCVCTRL_INTRAVAIL_DIS | \ + QIB_RCVCTRL_BP_ENB | \ + QIB_RCVCTRL_BP_DIS) + +#define RCVCTRL_PORT_MODS (QIB_RCVCTRL_CTXT_ENB | \ + QIB_RCVCTRL_CTXT_DIS | \ + QIB_RCVCTRL_PKEY_DIS | \ + QIB_RCVCTRL_PKEY_ENB) + +/* + * Modify the RCVCTRL register in chip-specific way. This + * is a function because bit positions and (future) register + * location is chip-specifc, but the needed operations are + * generic. is a bit-mask because we often want to + * do multiple modifications. + */ +static void rcvctrl_7322_mod(struct qib_pportdata *ppd, unsigned int op, + int ctxt) +{ + struct qib_devdata *dd = ppd->dd; + struct qib_ctxtdata *rcd; + u64 mask, val; + unsigned long flags; + + spin_lock_irqsave(&dd->cspec->rcvmod_lock, flags); + + if (op & QIB_RCVCTRL_TIDFLOW_ENB) + dd->rcvctrl |= SYM_MASK(RcvCtrl, TidFlowEnable); + if (op & QIB_RCVCTRL_TIDFLOW_DIS) + dd->rcvctrl &= ~SYM_MASK(RcvCtrl, TidFlowEnable); + if (op & QIB_RCVCTRL_TAILUPD_ENB) + dd->rcvctrl |= SYM_MASK(RcvCtrl, TailUpd); + if (op & QIB_RCVCTRL_TAILUPD_DIS) + dd->rcvctrl &= ~SYM_MASK(RcvCtrl, TailUpd); + if (op & QIB_RCVCTRL_PKEY_ENB) + ppd->p_rcvctrl &= ~SYM_MASK(RcvCtrl_0, RcvPartitionKeyDisable); + if (op & QIB_RCVCTRL_PKEY_DIS) + ppd->p_rcvctrl |= SYM_MASK(RcvCtrl_0, RcvPartitionKeyDisable); + if (ctxt < 0) { + mask = (1ULL << dd->ctxtcnt) - 1; + rcd = NULL; + } else { + mask = (1ULL << ctxt); + rcd = dd->rcd[ctxt]; + } + if ((op & QIB_RCVCTRL_CTXT_ENB) && rcd) { + ppd->p_rcvctrl |= + (mask << SYM_LSB(RcvCtrl_0, ContextEnableKernel)); + if (!(dd->flags & QIB_NODMA_RTAIL)) { + op |= QIB_RCVCTRL_TAILUPD_ENB; /* need reg write */ + dd->rcvctrl |= SYM_MASK(RcvCtrl, TailUpd); + } + /* Write these registers before the context is enabled. */ + qib_write_kreg_ctxt(dd, krc_rcvhdrtailaddr, ctxt, + rcd->rcvhdrqtailaddr_phys); + qib_write_kreg_ctxt(dd, krc_rcvhdraddr, ctxt, + rcd->rcvhdrq_phys); + rcd->seq_cnt = 1; + } + if (op & QIB_RCVCTRL_CTXT_DIS) + ppd->p_rcvctrl &= + ~(mask << SYM_LSB(RcvCtrl_0, ContextEnableKernel)); + if (op & QIB_RCVCTRL_BP_ENB) + dd->rcvctrl |= mask << SYM_LSB(RcvCtrl, dontDropRHQFull); + if (op & QIB_RCVCTRL_BP_DIS) + dd->rcvctrl &= ~(mask << SYM_LSB(RcvCtrl, dontDropRHQFull)); + if (op & QIB_RCVCTRL_INTRAVAIL_ENB) + dd->rcvctrl |= (mask << SYM_LSB(RcvCtrl, IntrAvail)); + if (op & QIB_RCVCTRL_INTRAVAIL_DIS) + dd->rcvctrl &= ~(mask << SYM_LSB(RcvCtrl, IntrAvail)); + /* + * Decide which registers to write depending on the ops enabled. + * Special case is "flush" (no bits set at all) + * which needs to write both. 
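+	 * (A "flush" call with op == 0 thus rewrites both kr_rcvctrl and
+	 * krp_rcvctrl from their shadow copies without changing any bits.)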
+ */ + if (op == 0 || (op & RCVCTRL_COMMON_MODS)) + qib_write_kreg(dd, kr_rcvctrl, dd->rcvctrl); + if (op == 0 || (op & RCVCTRL_PORT_MODS)) + qib_write_kreg_port(ppd, krp_rcvctrl, ppd->p_rcvctrl); + if ((op & QIB_RCVCTRL_CTXT_ENB) && dd->rcd[ctxt]) { + /* + * Init the context registers also; if we were + * disabled, tail and head should both be zero + * already from the enable, but since we don't + * know, we have to do it explicitly. + */ + val = qib_read_ureg32(dd, ur_rcvegrindextail, ctxt); + qib_write_ureg(dd, ur_rcvegrindexhead, val, ctxt); + + /* be sure enabling write seen; hd/tl should be 0 */ + (void) qib_read_kreg32(dd, kr_scratch); + val = qib_read_ureg32(dd, ur_rcvhdrtail, ctxt); + dd->rcd[ctxt]->head = val; + /* If kctxt, interrupt on next receive. */ + if (ctxt < dd->first_user_ctxt) + val |= dd->rhdrhead_intr_off; + qib_write_ureg(dd, ur_rcvhdrhead, val, ctxt); + } else if ((op & QIB_RCVCTRL_INTRAVAIL_ENB) && + dd->rcd[ctxt] && dd->rhdrhead_intr_off) { + /* arm rcv interrupt */ + val = dd->rcd[ctxt]->head | dd->rhdrhead_intr_off; + qib_write_ureg(dd, ur_rcvhdrhead, val, ctxt); + } + if (op & QIB_RCVCTRL_CTXT_DIS) { + unsigned f; + + /* Now that the context is disabled, clear these registers. */ + if (ctxt >= 0) { + qib_write_kreg_ctxt(dd, krc_rcvhdrtailaddr, ctxt, 0); + qib_write_kreg_ctxt(dd, krc_rcvhdraddr, ctxt, 0); + for (f = 0; f < NUM_TIDFLOWS_CTXT; f++) + qib_write_ureg(dd, ur_rcvflowtable + f, + TIDFLOW_ERRBITS, ctxt); + } else { + unsigned i; + + for (i = 0; i < dd->cfgctxts; i++) { + qib_write_kreg_ctxt(dd, krc_rcvhdrtailaddr, + i, 0); + qib_write_kreg_ctxt(dd, krc_rcvhdraddr, i, 0); + for (f = 0; f < NUM_TIDFLOWS_CTXT; f++) + qib_write_ureg(dd, ur_rcvflowtable + f, + TIDFLOW_ERRBITS, i); + } + } + } + spin_unlock_irqrestore(&dd->cspec->rcvmod_lock, flags); +} + +/* + * Modify the SENDCTRL register in chip-specific way. This + * is a function where there are multiple such registers with + * slightly different layouts. + * The chip doesn't allow back-to-back sendctrl writes, so write + * the scratch register after writing sendctrl. + * + * Which register is written depends on the operation. + * Most operate on the common register, while + * SEND_ENB and SEND_DIS operate on the per-port ones. 
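+ * (An op of 0 rewrites both the common and per-port registers from
+ * their shadow copies, as with rcvctrl_7322_mod() above.)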
+ * SEND_ENB is included in common because it can change SPCL_TRIG + */ +#define SENDCTRL_COMMON_MODS (\ + QIB_SENDCTRL_CLEAR | \ + QIB_SENDCTRL_AVAIL_DIS | \ + QIB_SENDCTRL_AVAIL_ENB | \ + QIB_SENDCTRL_AVAIL_BLIP | \ + QIB_SENDCTRL_DISARM | \ + QIB_SENDCTRL_DISARM_ALL | \ + QIB_SENDCTRL_SEND_ENB) + +#define SENDCTRL_PORT_MODS (\ + QIB_SENDCTRL_CLEAR | \ + QIB_SENDCTRL_SEND_ENB | \ + QIB_SENDCTRL_SEND_DIS | \ + QIB_SENDCTRL_FLUSH) + +static void sendctrl_7322_mod(struct qib_pportdata *ppd, u32 op) +{ + struct qib_devdata *dd = ppd->dd; + u64 tmp_dd_sendctrl; + unsigned long flags; + + spin_lock_irqsave(&dd->sendctrl_lock, flags); + + /* First the dd ones that are "sticky", saved in shadow */ + if (op & QIB_SENDCTRL_CLEAR) + dd->sendctrl = 0; + if (op & QIB_SENDCTRL_AVAIL_DIS) + dd->sendctrl &= ~SYM_MASK(SendCtrl, SendBufAvailUpd); + else if (op & QIB_SENDCTRL_AVAIL_ENB) { + dd->sendctrl |= SYM_MASK(SendCtrl, SendBufAvailUpd); + if (dd->flags & QIB_USE_SPCL_TRIG) + dd->sendctrl |= SYM_MASK(SendCtrl, SpecialTriggerEn); + } + + /* Then the ppd ones that are "sticky", saved in shadow */ + if (op & QIB_SENDCTRL_SEND_DIS) + ppd->p_sendctrl &= ~SYM_MASK(SendCtrl_0, SendEnable); + else if (op & QIB_SENDCTRL_SEND_ENB) + ppd->p_sendctrl |= SYM_MASK(SendCtrl_0, SendEnable); + + if (op & QIB_SENDCTRL_DISARM_ALL) { + u32 i, last; + + tmp_dd_sendctrl = dd->sendctrl; + last = dd->piobcnt2k + dd->piobcnt4k + NUM_VL15_BUFS; + /* + * Disarm any buffers that are not yet launched, + * disabling updates until done. + */ + tmp_dd_sendctrl &= ~SYM_MASK(SendCtrl, SendBufAvailUpd); + for (i = 0; i < last; i++) { + qib_write_kreg(dd, kr_sendctrl, + tmp_dd_sendctrl | + SYM_MASK(SendCtrl, Disarm) | i); + qib_write_kreg(dd, kr_scratch, 0); + } + } + + if (op & QIB_SENDCTRL_FLUSH) { + u64 tmp_ppd_sendctrl = ppd->p_sendctrl; + + /* + * Now drain all the fifos. The Abort bit should never be + * needed, so for now, at least, we don't use it. + */ + tmp_ppd_sendctrl |= + SYM_MASK(SendCtrl_0, TxeDrainRmFifo) | + SYM_MASK(SendCtrl_0, TxeDrainLaFifo) | + SYM_MASK(SendCtrl_0, TxeBypassIbc); + qib_write_kreg_port(ppd, krp_sendctrl, tmp_ppd_sendctrl); + qib_write_kreg(dd, kr_scratch, 0); + } + + tmp_dd_sendctrl = dd->sendctrl; + + if (op & QIB_SENDCTRL_DISARM) + tmp_dd_sendctrl |= SYM_MASK(SendCtrl, Disarm) | + ((op & QIB_7322_SendCtrl_DisarmSendBuf_RMASK) << + SYM_LSB(SendCtrl, DisarmSendBuf)); + if ((op & QIB_SENDCTRL_AVAIL_BLIP) && + (dd->sendctrl & SYM_MASK(SendCtrl, SendBufAvailUpd))) + tmp_dd_sendctrl &= ~SYM_MASK(SendCtrl, SendBufAvailUpd); + + if (op == 0 || (op & SENDCTRL_COMMON_MODS)) { + qib_write_kreg(dd, kr_sendctrl, tmp_dd_sendctrl); + qib_write_kreg(dd, kr_scratch, 0); + } + + if (op == 0 || (op & SENDCTRL_PORT_MODS)) { + qib_write_kreg_port(ppd, krp_sendctrl, ppd->p_sendctrl); + qib_write_kreg(dd, kr_scratch, 0); + } + + if (op & QIB_SENDCTRL_AVAIL_BLIP) { + qib_write_kreg(dd, kr_sendctrl, dd->sendctrl); + qib_write_kreg(dd, kr_scratch, 0); + } + + spin_unlock_irqrestore(&dd->sendctrl_lock, flags); + + if (op & QIB_SENDCTRL_FLUSH) { + u32 v; + /* + * ensure writes have hit chip, then do a few + * more reads, to allow DMA of pioavail registers + * to occur, so in-memory copy is in sync with + * the chip. Not always safe to sleep. 
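+		 * Hence the scratch register read/write pairs below
+		 * instead of a sleep or delay.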
+ */ + v = qib_read_kreg32(dd, kr_scratch); + qib_write_kreg(dd, kr_scratch, v); + v = qib_read_kreg32(dd, kr_scratch); + qib_write_kreg(dd, kr_scratch, v); + qib_read_kreg32(dd, kr_scratch); + } +} + +#define _PORT_VIRT_FLAG 0x8000U /* "virtual", need adjustments */ +#define _PORT_64BIT_FLAG 0x10000U /* not "virtual", but 64bit */ +#define _PORT_CNTR_IDXMASK 0x7fffU /* mask off flags above */ + +/** + * qib_portcntr_7322 - read a per-port chip counter + * @ppd: the qlogic_ib pport + * @creg: the counter to read (not a chip offset) + */ +static u64 qib_portcntr_7322(struct qib_pportdata *ppd, u32 reg) +{ + struct qib_devdata *dd = ppd->dd; + u64 ret = 0ULL; + u16 creg; + /* 0xffff for unimplemented or synthesized counters */ + static const u32 xlator[] = { + [QIBPORTCNTR_PKTSEND] = crp_pktsend | _PORT_64BIT_FLAG, + [QIBPORTCNTR_WORDSEND] = crp_wordsend | _PORT_64BIT_FLAG, + [QIBPORTCNTR_PSXMITDATA] = crp_psxmitdatacount, + [QIBPORTCNTR_PSXMITPKTS] = crp_psxmitpktscount, + [QIBPORTCNTR_PSXMITWAIT] = crp_psxmitwaitcount, + [QIBPORTCNTR_SENDSTALL] = crp_sendstall, + [QIBPORTCNTR_PKTRCV] = crp_pktrcv | _PORT_64BIT_FLAG, + [QIBPORTCNTR_PSRCVDATA] = crp_psrcvdatacount, + [QIBPORTCNTR_PSRCVPKTS] = crp_psrcvpktscount, + [QIBPORTCNTR_RCVEBP] = crp_rcvebp, + [QIBPORTCNTR_RCVOVFL] = crp_rcvovfl, + [QIBPORTCNTR_WORDRCV] = crp_wordrcv | _PORT_64BIT_FLAG, + [QIBPORTCNTR_RXDROPPKT] = 0xffff, /* not needed for 7322 */ + [QIBPORTCNTR_RXLOCALPHYERR] = crp_rxotherlocalphyerr, + [QIBPORTCNTR_RXVLERR] = crp_rxvlerr, + [QIBPORTCNTR_ERRICRC] = crp_erricrc, + [QIBPORTCNTR_ERRVCRC] = crp_errvcrc, + [QIBPORTCNTR_ERRLPCRC] = crp_errlpcrc, + [QIBPORTCNTR_BADFORMAT] = crp_badformat, + [QIBPORTCNTR_ERR_RLEN] = crp_err_rlen, + [QIBPORTCNTR_IBSYMBOLERR] = crp_ibsymbolerr, + [QIBPORTCNTR_INVALIDRLEN] = crp_invalidrlen, + [QIBPORTCNTR_UNSUPVL] = crp_txunsupvl, + [QIBPORTCNTR_EXCESSBUFOVFL] = crp_excessbufferovfl, + [QIBPORTCNTR_ERRLINK] = crp_errlink, + [QIBPORTCNTR_IBLINKDOWN] = crp_iblinkdown, + [QIBPORTCNTR_IBLINKERRRECOV] = crp_iblinkerrrecov, + [QIBPORTCNTR_LLI] = crp_locallinkintegrityerr, + [QIBPORTCNTR_VL15PKTDROP] = crp_vl15droppedpkt, + [QIBPORTCNTR_ERRPKEY] = crp_errpkey, + /* + * the next 3 aren't really counters, but were implemented + * as counters in older chips, so still get accessed as + * though they were counters from this code. + */ + [QIBPORTCNTR_PSINTERVAL] = krp_psinterval, + [QIBPORTCNTR_PSSTART] = krp_psstart, + [QIBPORTCNTR_PSSTAT] = krp_psstat, + /* pseudo-counter, summed for all ports */ + [QIBPORTCNTR_KHDROVFL] = 0xffff, + }; + + if (reg >= ARRAY_SIZE(xlator)) { + qib_devinfo(ppd->dd->pcidev, + "Unimplemented portcounter %u\n", reg); + goto done; + } + creg = xlator[reg] & _PORT_CNTR_IDXMASK; + + /* handle non-counters and special cases first */ + if (reg == QIBPORTCNTR_KHDROVFL) { + int i; + + /* sum over all kernel contexts (skip if mini_init) */ + for (i = 0; dd->rcd && i < dd->first_user_ctxt; i++) { + struct qib_ctxtdata *rcd = dd->rcd[i]; + + if (!rcd || rcd->ppd != ppd) + continue; + ret += read_7322_creg32(dd, cr_base_egrovfl + i); + } + goto done; + } else if (reg == QIBPORTCNTR_RXDROPPKT) { + /* + * Used as part of the synthesis of port_rcv_errors + * in the verbs code for IBTA counters. Not needed for 7322, + * because all the errors are already counted by other cntrs. 
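+		 * (Its xlator entry is 0xffff; we simply return the
+		 * initial 0.)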
+ */ + goto done; + } else if (reg == QIBPORTCNTR_PSINTERVAL || + reg == QIBPORTCNTR_PSSTART || reg == QIBPORTCNTR_PSSTAT) { + /* were counters in older chips, now per-port kernel regs */ + ret = qib_read_kreg_port(ppd, creg); + goto done; + } + + /* + * Only fast increment counters are 64 bits; use 32 bit reads to + * avoid two independent reads when on Opteron. + */ + if (xlator[reg] & _PORT_64BIT_FLAG) + ret = read_7322_creg_port(ppd, creg); + else + ret = read_7322_creg32_port(ppd, creg); + if (creg == crp_ibsymbolerr) { + if (ppd->cpspec->ibdeltainprog) + ret -= ret - ppd->cpspec->ibsymsnap; + ret -= ppd->cpspec->ibsymdelta; + } else if (creg == crp_iblinkerrrecov) { + if (ppd->cpspec->ibdeltainprog) + ret -= ret - ppd->cpspec->iblnkerrsnap; + ret -= ppd->cpspec->iblnkerrdelta; + } else if (creg == crp_errlink) + ret -= ppd->cpspec->ibmalfdelta; + else if (creg == crp_iblinkdown) + ret += ppd->cpspec->iblnkdowndelta; +done: + return ret; +} + +/* + * Device counter names (not port-specific), one line per stat, + * single string. Used by utilities like ipathstats to print the stats + * in a way which works for different versions of drivers, without changing + * the utility. Names need to be 12 chars or less (w/o newline), for proper + * display by utility. + * Non-error counters are first. + * Start of "error" conters is indicated by a leading "E " on the first + * "error" counter, and doesn't count in label length. + * The EgrOvfl list needs to be last so we truncate them at the configured + * context count for the device. + * cntr7322indices contains the corresponding register indices. + */ +static const char cntr7322names[] = + "Interrupts\n" + "HostBusStall\n" + "E RxTIDFull\n" + "RxTIDInvalid\n" + "RxTIDFloDrop\n" /* 7322 only */ + "Ctxt0EgrOvfl\n" + "Ctxt1EgrOvfl\n" + "Ctxt2EgrOvfl\n" + "Ctxt3EgrOvfl\n" + "Ctxt4EgrOvfl\n" + "Ctxt5EgrOvfl\n" + "Ctxt6EgrOvfl\n" + "Ctxt7EgrOvfl\n" + "Ctxt8EgrOvfl\n" + "Ctxt9EgrOvfl\n" + "Ctx10EgrOvfl\n" + "Ctx11EgrOvfl\n" + "Ctx12EgrOvfl\n" + "Ctx13EgrOvfl\n" + "Ctx14EgrOvfl\n" + "Ctx15EgrOvfl\n" + "Ctx16EgrOvfl\n" + "Ctx17EgrOvfl\n" + ; + +static const u32 cntr7322indices[] = { + cr_lbint | _PORT_64BIT_FLAG, + cr_lbstall | _PORT_64BIT_FLAG, + cr_tidfull, + cr_tidinvalid, + cr_rxtidflowdrop, + cr_base_egrovfl + 0, + cr_base_egrovfl + 1, + cr_base_egrovfl + 2, + cr_base_egrovfl + 3, + cr_base_egrovfl + 4, + cr_base_egrovfl + 5, + cr_base_egrovfl + 6, + cr_base_egrovfl + 7, + cr_base_egrovfl + 8, + cr_base_egrovfl + 9, + cr_base_egrovfl + 10, + cr_base_egrovfl + 11, + cr_base_egrovfl + 12, + cr_base_egrovfl + 13, + cr_base_egrovfl + 14, + cr_base_egrovfl + 15, + cr_base_egrovfl + 16, + cr_base_egrovfl + 17, +}; + +/* + * same as cntr7322names and cntr7322indices, but for port-specific counters. 
+ * portcntr7322indices is somewhat complicated by some registers needing + * adjustments of various kinds, and those are ORed with _PORT_VIRT_FLAG + */ +static const char portcntr7322names[] = + "TxPkt\n" + "TxFlowPkt\n" + "TxWords\n" + "RxPkt\n" + "RxFlowPkt\n" + "RxWords\n" + "TxFlowStall\n" + "TxDmaDesc\n" /* 7220 and 7322-only */ + "E RxDlidFltr\n" /* 7220 and 7322-only */ + "IBStatusChng\n" + "IBLinkDown\n" + "IBLnkRecov\n" + "IBRxLinkErr\n" + "IBSymbolErr\n" + "RxLLIErr\n" + "RxBadFormat\n" + "RxBadLen\n" + "RxBufOvrfl\n" + "RxEBP\n" + "RxFlowCtlErr\n" + "RxICRCerr\n" + "RxLPCRCerr\n" + "RxVCRCerr\n" + "RxInvalLen\n" + "RxInvalPKey\n" + "RxPktDropped\n" + "TxBadLength\n" + "TxDropped\n" + "TxInvalLen\n" + "TxUnderrun\n" + "TxUnsupVL\n" + "RxLclPhyErr\n" /* 7220 and 7322-only from here down */ + "RxVL15Drop\n" + "RxVlErr\n" + "XcessBufOvfl\n" + "RxQPBadCtxt\n" /* 7322-only from here down */ + "TXBadHeader\n" + ; + +static const u32 portcntr7322indices[] = { + QIBPORTCNTR_PKTSEND | _PORT_VIRT_FLAG, + crp_pktsendflow, + QIBPORTCNTR_WORDSEND | _PORT_VIRT_FLAG, + QIBPORTCNTR_PKTRCV | _PORT_VIRT_FLAG, + crp_pktrcvflowctrl, + QIBPORTCNTR_WORDRCV | _PORT_VIRT_FLAG, + QIBPORTCNTR_SENDSTALL | _PORT_VIRT_FLAG, + crp_txsdmadesc | _PORT_64BIT_FLAG, + crp_rxdlidfltr, + crp_ibstatuschange, + QIBPORTCNTR_IBLINKDOWN | _PORT_VIRT_FLAG, + QIBPORTCNTR_IBLINKERRRECOV | _PORT_VIRT_FLAG, + QIBPORTCNTR_ERRLINK | _PORT_VIRT_FLAG, + QIBPORTCNTR_IBSYMBOLERR | _PORT_VIRT_FLAG, + QIBPORTCNTR_LLI | _PORT_VIRT_FLAG, + QIBPORTCNTR_BADFORMAT | _PORT_VIRT_FLAG, + QIBPORTCNTR_ERR_RLEN | _PORT_VIRT_FLAG, + QIBPORTCNTR_RCVOVFL | _PORT_VIRT_FLAG, + QIBPORTCNTR_RCVEBP | _PORT_VIRT_FLAG, + crp_rcvflowctrlviol, + QIBPORTCNTR_ERRICRC | _PORT_VIRT_FLAG, + QIBPORTCNTR_ERRLPCRC | _PORT_VIRT_FLAG, + QIBPORTCNTR_ERRVCRC | _PORT_VIRT_FLAG, + QIBPORTCNTR_INVALIDRLEN | _PORT_VIRT_FLAG, + QIBPORTCNTR_ERRPKEY | _PORT_VIRT_FLAG, + QIBPORTCNTR_RXDROPPKT | _PORT_VIRT_FLAG, + crp_txminmaxlenerr, + crp_txdroppedpkt, + crp_txlenerr, + crp_txunderrun, + crp_txunsupvl, + QIBPORTCNTR_RXLOCALPHYERR | _PORT_VIRT_FLAG, + QIBPORTCNTR_VL15PKTDROP | _PORT_VIRT_FLAG, + QIBPORTCNTR_RXVLERR | _PORT_VIRT_FLAG, + QIBPORTCNTR_EXCESSBUFOVFL | _PORT_VIRT_FLAG, + crp_rxqpinvalidctxt, + crp_txhdrerr, +}; + +/* do all the setup to make the counter reads efficient later */ +static void init_7322_cntrnames(struct qib_devdata *dd) +{ + int i, j = 0; + char *s; + + for (i = 0, s = (char *)cntr7322names; s && j <= dd->cfgctxts; + i++) { + /* we always have at least one counter before the egrovfl */ + if (!j && !strncmp("Ctxt0EgrOvfl", s + 1, 12)) + j = 1; + s = strchr(s + 1, '\n'); + if (s && j) + j++; + } + dd->cspec->ncntrs = i; + if (!s) + /* full list; size is without terminating null */ + dd->cspec->cntrnamelen = sizeof(cntr7322names) - 1; + else + dd->cspec->cntrnamelen = 1 + s - cntr7322names; + dd->cspec->cntrs = kmalloc(dd->cspec->ncntrs + * sizeof(u64), GFP_KERNEL); + if (!dd->cspec->cntrs) + qib_dev_err(dd, "Failed allocation for counters\n"); + + for (i = 0, s = (char *)portcntr7322names; s; i++) + s = strchr(s + 1, '\n'); + dd->cspec->nportcntrs = i - 1; + dd->cspec->portcntrnamelen = sizeof(portcntr7322names) - 1; + for (i = 0; i < dd->num_pports; ++i) { + dd->pport[i].cpspec->portcntrs = kmalloc(dd->cspec->nportcntrs + * sizeof(u64), GFP_KERNEL); + if (!dd->pport[i].cpspec->portcntrs) + qib_dev_err(dd, + "Failed allocation for portcounters\n"); + } +} + +static u32 qib_read_7322cntrs(struct qib_devdata *dd, loff_t pos, char **namep, + u64 **cntrp) 
+{ + u32 ret; + + if (namep) { + ret = dd->cspec->cntrnamelen; + if (pos >= ret) + ret = 0; /* final read after getting everything */ + else + *namep = (char *) cntr7322names; + } else { + u64 *cntr = dd->cspec->cntrs; + int i; + + ret = dd->cspec->ncntrs * sizeof(u64); + if (!cntr || pos >= ret) { + /* everything read, or couldn't get memory */ + ret = 0; + goto done; + } + *cntrp = cntr; + for (i = 0; i < dd->cspec->ncntrs; i++) + if (cntr7322indices[i] & _PORT_64BIT_FLAG) + *cntr++ = read_7322_creg(dd, + cntr7322indices[i] & + _PORT_CNTR_IDXMASK); + else + *cntr++ = read_7322_creg32(dd, + cntr7322indices[i]); + } +done: + return ret; +} + +static u32 qib_read_7322portcntrs(struct qib_devdata *dd, loff_t pos, u32 port, + char **namep, u64 **cntrp) +{ + u32 ret; + + if (namep) { + ret = dd->cspec->portcntrnamelen; + if (pos >= ret) + ret = 0; /* final read after getting everything */ + else + *namep = (char *)portcntr7322names; + } else { + struct qib_pportdata *ppd = &dd->pport[port]; + u64 *cntr = ppd->cpspec->portcntrs; + int i; + + ret = dd->cspec->nportcntrs * sizeof(u64); + if (!cntr || pos >= ret) { + /* everything read, or couldn't get memory */ + ret = 0; + goto done; + } + *cntrp = cntr; + for (i = 0; i < dd->cspec->nportcntrs; i++) { + if (portcntr7322indices[i] & _PORT_VIRT_FLAG) + *cntr++ = qib_portcntr_7322(ppd, + portcntr7322indices[i] & + _PORT_CNTR_IDXMASK); + else if (portcntr7322indices[i] & _PORT_64BIT_FLAG) + *cntr++ = read_7322_creg_port(ppd, + portcntr7322indices[i] & + _PORT_CNTR_IDXMASK); + else + *cntr++ = read_7322_creg32_port(ppd, + portcntr7322indices[i]); + } + } +done: + return ret; +} + +/** + * qib_get_7322_faststats - get word counters from chip before they overflow + * @opaque - contains a pointer to the qlogic_ib device qib_devdata + * + * VESTIGIAL IBA7322 has no "small fast counters", so the only + * real purpose of this function is to maintain the notion of + * "active time", which in turn is only logged into the eeprom, + * which we don;t have, yet, for 7322-based boards. + * + * called from add_timer + */ +static void qib_get_7322_faststats(unsigned long opaque) +{ + struct qib_devdata *dd = (struct qib_devdata *) opaque; + struct qib_pportdata *ppd; + unsigned long flags; + u64 traffic_wds; + int pidx; + + for (pidx = 0; pidx < dd->num_pports; ++pidx) { + ppd = dd->pport + pidx; + + /* + * If port isn't enabled or not operational ports, or + * diags is running (can cause memory diags to fail) + * skip this port this time. + */ + if (!ppd->link_speed_supported || !(dd->flags & QIB_INITTED) + || dd->diag_client) + continue; + + /* + * Maintain an activity timer, based on traffic + * exceeding a threshold, so we need to check the word-counts + * even if they are 64-bit. + */ + traffic_wds = qib_portcntr_7322(ppd, QIBPORTCNTR_WORDRCV) + + qib_portcntr_7322(ppd, QIBPORTCNTR_WORDSEND); + spin_lock_irqsave(&ppd->dd->eep_st_lock, flags); + traffic_wds -= ppd->dd->traffic_wds; + ppd->dd->traffic_wds += traffic_wds; + spin_unlock_irqrestore(&ppd->dd->eep_st_lock, flags); + if (ppd->cpspec->qdr_dfe_on && (ppd->link_speed_active & + QIB_IB_QDR) && + (ppd->lflags & (QIBL_LINKINIT | QIBL_LINKARMED | + QIBL_LINKACTIVE)) && + ppd->cpspec->qdr_dfe_time && + time_is_before_jiffies(ppd->cpspec->qdr_dfe_time)) { + ppd->cpspec->qdr_dfe_on = 0; + + qib_write_kreg_port(ppd, krp_static_adapt_dis(2), + ppd->dd->cspec->r1 ? 
+ QDR_STATIC_ADAPT_INIT_R1 : + QDR_STATIC_ADAPT_INIT); + force_h1(ppd); + } + } + mod_timer(&dd->stats_timer, jiffies + HZ * ACTIVITY_TIMER); +} + +/* + * If we were using MSIx, try to fallback to INTx. + */ +static int qib_7322_intr_fallback(struct qib_devdata *dd) +{ + if (!dd->cspec->num_msix_entries) + return 0; /* already using INTx */ + + qib_devinfo(dd->pcidev, + "MSIx interrupt not detected, trying INTx interrupts\n"); + qib_7322_nomsix(dd); + qib_enable_intx(dd->pcidev); + qib_setup_7322_interrupt(dd, 0); + return 1; +} + +/* + * Reset the XGXS (between serdes and IBC). Slightly less intrusive + * than resetting the IBC or external link state, and useful in some + * cases to cause some retraining. To do this right, we reset IBC + * as well, then return to previous state (which may be still in reset) + * NOTE: some callers of this "know" this writes the current value + * of cpspec->ibcctrl_a as part of it's operation, so if that changes, + * check all callers. + */ +static void qib_7322_mini_pcs_reset(struct qib_pportdata *ppd) +{ + u64 val; + struct qib_devdata *dd = ppd->dd; + const u64 reset_bits = SYM_MASK(IBPCSConfig_0, xcv_rreset) | + SYM_MASK(IBPCSConfig_0, xcv_treset) | + SYM_MASK(IBPCSConfig_0, tx_rx_reset); + + val = qib_read_kreg_port(ppd, krp_ib_pcsconfig); + qib_write_kreg(dd, kr_hwerrmask, + dd->cspec->hwerrmask & ~HWE_MASK(statusValidNoEop)); + qib_write_kreg_port(ppd, krp_ibcctrl_a, + ppd->cpspec->ibcctrl_a & + ~SYM_MASK(IBCCtrlA_0, IBLinkEn)); + + qib_write_kreg_port(ppd, krp_ib_pcsconfig, val | reset_bits); + qib_read_kreg32(dd, kr_scratch); + qib_write_kreg_port(ppd, krp_ib_pcsconfig, val & ~reset_bits); + qib_write_kreg_port(ppd, krp_ibcctrl_a, ppd->cpspec->ibcctrl_a); + qib_write_kreg(dd, kr_scratch, 0ULL); + qib_write_kreg(dd, kr_hwerrclear, + SYM_MASK(HwErrClear, statusValidNoEopClear)); + qib_write_kreg(dd, kr_hwerrmask, dd->cspec->hwerrmask); +} + +/* + * This code for non-IBTA-compliant IB speed negotiation is only known to + * work for the SDR to DDR transition, and only between an HCA and a switch + * with recent firmware. It is based on observed heuristics, rather than + * actual knowledge of the non-compliant speed negotiation. + * It has a number of hard-coded fields, since the hope is to rewrite this + * when a spec is available on how the negoation is intended to work. + */ +static void autoneg_7322_sendpkt(struct qib_pportdata *ppd, u32 *hdr, + u32 dcnt, u32 *data) +{ + int i; + u64 pbc; + u32 __iomem *piobuf; + u32 pnum, control, len; + struct qib_devdata *dd = ppd->dd; + + i = 0; + len = 7 + dcnt + 1; /* 7 dword header, dword data, icrc */ + control = qib_7322_setpbc_control(ppd, len, 0, 15); + pbc = ((u64) control << 32) | len; + while (!(piobuf = qib_7322_getsendbuf(ppd, pbc, &pnum))) { + if (i++ > 15) + return; + udelay(2); + } + /* disable header check on this packet, since it can't be valid */ + dd->f_txchk_change(dd, pnum, 1, TXCHK_CHG_TYPE_DIS1, NULL); + writeq(pbc, piobuf); + qib_flush_wc(); + qib_pio_copy(piobuf + 2, hdr, 7); + qib_pio_copy(piobuf + 9, data, dcnt); + if (dd->flags & QIB_USE_SPCL_TRIG) { + u32 spcl_off = (pnum >= dd->piobcnt2k) ? 
2047 : 1023; + + qib_flush_wc(); + __raw_writel(0xaebecede, piobuf + spcl_off); + } + qib_flush_wc(); + qib_sendbuf_done(dd, pnum); + /* and re-enable hdr check */ + dd->f_txchk_change(dd, pnum, 1, TXCHK_CHG_TYPE_ENAB1, NULL); +} + +/* + * _start packet gets sent twice at start, _done gets sent twice at end + */ +static void qib_autoneg_7322_send(struct qib_pportdata *ppd, int which) +{ + struct qib_devdata *dd = ppd->dd; + static u32 swapped; + u32 dw, i, hcnt, dcnt, *data; + static u32 hdr[7] = { 0xf002ffff, 0x48ffff, 0x6400abba }; + static u32 madpayload_start[0x40] = { + 0x1810103, 0x1, 0x0, 0x0, 0x2c90000, 0x2c9, 0x0, 0x0, + 0xffffffff, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, + 0x1, 0x1388, 0x15e, 0x1, /* rest 0's */ + }; + static u32 madpayload_done[0x40] = { + 0x1810103, 0x1, 0x0, 0x0, 0x2c90000, 0x2c9, 0x0, 0x0, + 0xffffffff, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, + 0x40000001, 0x1388, 0x15e, /* rest 0's */ + }; + + dcnt = ARRAY_SIZE(madpayload_start); + hcnt = ARRAY_SIZE(hdr); + if (!swapped) { + /* for maintainability, do it at runtime */ + for (i = 0; i < hcnt; i++) { + dw = (__force u32) cpu_to_be32(hdr[i]); + hdr[i] = dw; + } + for (i = 0; i < dcnt; i++) { + dw = (__force u32) cpu_to_be32(madpayload_start[i]); + madpayload_start[i] = dw; + dw = (__force u32) cpu_to_be32(madpayload_done[i]); + madpayload_done[i] = dw; + } + swapped = 1; + } + + data = which ? madpayload_done : madpayload_start; + + autoneg_7322_sendpkt(ppd, hdr, dcnt, data); + qib_read_kreg64(dd, kr_scratch); + udelay(2); + autoneg_7322_sendpkt(ppd, hdr, dcnt, data); + qib_read_kreg64(dd, kr_scratch); + udelay(2); +} + +/* + * Do the absolute minimum to cause an IB speed change, and make it + * ready, but don't actually trigger the change. The caller will + * do that when ready (if link is in Polling training state, it will + * happen immediately, otherwise when link next goes down) + * + * This routine should only be used as part of the DDR autonegotation + * code for devices that are not compliant with IB 1.2 (or code that + * fixes things up for same). + * + * When link has gone down, and autoneg enabled, or autoneg has + * failed and we give up until next time we set both speeds, and + * then we want IBTA enabled as well as "use max enabled speed. + */ +static void set_7322_ibspeed_fast(struct qib_pportdata *ppd, u32 speed) +{ + u64 newctrlb; + + newctrlb = ppd->cpspec->ibcctrl_b & ~(IBA7322_IBC_SPEED_MASK | + IBA7322_IBC_IBTA_1_2_MASK | + IBA7322_IBC_MAX_SPEED_MASK); + + if (speed & (speed - 1)) /* multiple speeds */ + newctrlb |= (speed << IBA7322_IBC_SPEED_LSB) | + IBA7322_IBC_IBTA_1_2_MASK | + IBA7322_IBC_MAX_SPEED_MASK; + else + newctrlb |= speed == QIB_IB_QDR ? + IBA7322_IBC_SPEED_QDR | IBA7322_IBC_IBTA_1_2_MASK : + ((speed == QIB_IB_DDR ? + IBA7322_IBC_SPEED_DDR : IBA7322_IBC_SPEED_SDR)); + + if (newctrlb == ppd->cpspec->ibcctrl_b) + return; + + ppd->cpspec->ibcctrl_b = newctrlb; + qib_write_kreg_port(ppd, krp_ibcctrl_b, ppd->cpspec->ibcctrl_b); + qib_write_kreg(ppd->dd, kr_scratch, 0); +} + +/* + * This routine is only used when we are not talking to another + * IB 1.2-compliant device that we think can do DDR. + * (This includes all existing switch chips as of Oct 2007.) 
+ * 1.2-compliant devices go directly to DDR prior to reaching INIT + */ +static void try_7322_autoneg(struct qib_pportdata *ppd) +{ + unsigned long flags; + + spin_lock_irqsave(&ppd->lflags_lock, flags); + ppd->lflags |= QIBL_IB_AUTONEG_INPROG; + spin_unlock_irqrestore(&ppd->lflags_lock, flags); + qib_autoneg_7322_send(ppd, 0); + set_7322_ibspeed_fast(ppd, QIB_IB_DDR); + qib_7322_mini_pcs_reset(ppd); + /* 2 msec is minimum length of a poll cycle */ + queue_delayed_work(ib_wq, &ppd->cpspec->autoneg_work, + msecs_to_jiffies(2)); +} + +/* + * Handle the empirically determined mechanism for auto-negotiation + * of DDR speed with switches. + */ +static void autoneg_7322_work(struct work_struct *work) +{ + struct qib_pportdata *ppd; + struct qib_devdata *dd; + u64 startms; + u32 i; + unsigned long flags; + + ppd = container_of(work, struct qib_chippport_specific, + autoneg_work.work)->ppd; + dd = ppd->dd; + + startms = jiffies_to_msecs(jiffies); + + /* + * Busy wait for this first part, it should be at most a + * few hundred usec, since we scheduled ourselves for 2msec. + */ + for (i = 0; i < 25; i++) { + if (SYM_FIELD(ppd->lastibcstat, IBCStatusA_0, LinkState) + == IB_7322_LT_STATE_POLLQUIET) { + qib_set_linkstate(ppd, QIB_IB_LINKDOWN_DISABLE); + break; + } + udelay(100); + } + + if (!(ppd->lflags & QIBL_IB_AUTONEG_INPROG)) + goto done; /* we got there early or told to stop */ + + /* we expect this to timeout */ + if (wait_event_timeout(ppd->cpspec->autoneg_wait, + !(ppd->lflags & QIBL_IB_AUTONEG_INPROG), + msecs_to_jiffies(90))) + goto done; + qib_7322_mini_pcs_reset(ppd); + + /* we expect this to timeout */ + if (wait_event_timeout(ppd->cpspec->autoneg_wait, + !(ppd->lflags & QIBL_IB_AUTONEG_INPROG), + msecs_to_jiffies(1700))) + goto done; + qib_7322_mini_pcs_reset(ppd); + + set_7322_ibspeed_fast(ppd, QIB_IB_SDR); + + /* + * Wait up to 250 msec for link to train and get to INIT at DDR; + * this should terminate early. + */ + wait_event_timeout(ppd->cpspec->autoneg_wait, + !(ppd->lflags & QIBL_IB_AUTONEG_INPROG), + msecs_to_jiffies(250)); +done: + if (ppd->lflags & QIBL_IB_AUTONEG_INPROG) { + spin_lock_irqsave(&ppd->lflags_lock, flags); + ppd->lflags &= ~QIBL_IB_AUTONEG_INPROG; + if (ppd->cpspec->autoneg_tries == AUTONEG_TRIES) { + ppd->lflags |= QIBL_IB_AUTONEG_FAILED; + ppd->cpspec->autoneg_tries = 0; + } + spin_unlock_irqrestore(&ppd->lflags_lock, flags); + set_7322_ibspeed_fast(ppd, ppd->link_speed_enabled); + } +} + +/* + * This routine is used to request IPG set in the QLogic switch. + * Only called if r1. 
+ */ +static void try_7322_ipg(struct qib_pportdata *ppd) +{ + struct qib_ibport *ibp = &ppd->ibport_data; + struct ib_mad_send_buf *send_buf; + struct ib_mad_agent *agent; + struct ib_smp *smp; + unsigned delay; + int ret; + + agent = ibp->send_agent; + if (!agent) + goto retry; + + send_buf = ib_create_send_mad(agent, 0, 0, 0, IB_MGMT_MAD_HDR, + IB_MGMT_MAD_DATA, GFP_ATOMIC); + if (IS_ERR(send_buf)) + goto retry; + + if (!ibp->smi_ah) { + struct ib_ah *ah; + + ah = qib_create_qp0_ah(ibp, be16_to_cpu(IB_LID_PERMISSIVE)); + if (IS_ERR(ah)) + ret = PTR_ERR(ah); + else { + send_buf->ah = ah; + ibp->smi_ah = to_iah(ah); + ret = 0; + } + } else { + send_buf->ah = &ibp->smi_ah->ibah; + ret = 0; + } + + smp = send_buf->mad; + smp->base_version = IB_MGMT_BASE_VERSION; + smp->mgmt_class = IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE; + smp->class_version = 1; + smp->method = IB_MGMT_METHOD_SEND; + smp->hop_cnt = 1; + smp->attr_id = QIB_VENDOR_IPG; + smp->attr_mod = 0; + + if (!ret) + ret = ib_post_send_mad(send_buf, NULL); + if (ret) + ib_free_send_mad(send_buf); +retry: + delay = 2 << ppd->cpspec->ipg_tries; + queue_delayed_work(ib_wq, &ppd->cpspec->ipg_work, + msecs_to_jiffies(delay)); +} + +/* + * Timeout handler for setting IPG. + * Only called if r1. + */ +static void ipg_7322_work(struct work_struct *work) +{ + struct qib_pportdata *ppd; + + ppd = container_of(work, struct qib_chippport_specific, + ipg_work.work)->ppd; + if ((ppd->lflags & (QIBL_LINKINIT | QIBL_LINKARMED | QIBL_LINKACTIVE)) + && ++ppd->cpspec->ipg_tries <= 10) + try_7322_ipg(ppd); +} + +static u32 qib_7322_iblink_state(u64 ibcs) +{ + u32 state = (u32)SYM_FIELD(ibcs, IBCStatusA_0, LinkState); + + switch (state) { + case IB_7322_L_STATE_INIT: + state = IB_PORT_INIT; + break; + case IB_7322_L_STATE_ARM: + state = IB_PORT_ARMED; + break; + case IB_7322_L_STATE_ACTIVE: + /* fall through */ + case IB_7322_L_STATE_ACT_DEFER: + state = IB_PORT_ACTIVE; + break; + default: /* fall through */ + case IB_7322_L_STATE_DOWN: + state = IB_PORT_DOWN; + break; + } + return state; +} + +/* returns the IBTA port state, rather than the IBC link training state */ +static u8 qib_7322_phys_portstate(u64 ibcs) +{ + u8 state = (u8)SYM_FIELD(ibcs, IBCStatusA_0, LinkTrainingState); + return qib_7322_physportstate[state]; +} + +static int qib_7322_ib_updown(struct qib_pportdata *ppd, int ibup, u64 ibcs) +{ + int ret = 0, symadj = 0; + unsigned long flags; + int mult; + + spin_lock_irqsave(&ppd->lflags_lock, flags); + ppd->lflags &= ~QIBL_IB_FORCE_NOTIFY; + spin_unlock_irqrestore(&ppd->lflags_lock, flags); + + /* Update our picture of width and speed from chip */ + if (ibcs & SYM_MASK(IBCStatusA_0, LinkSpeedQDR)) { + ppd->link_speed_active = QIB_IB_QDR; + mult = 4; + } else if (ibcs & SYM_MASK(IBCStatusA_0, LinkSpeedActive)) { + ppd->link_speed_active = QIB_IB_DDR; + mult = 2; + } else { + ppd->link_speed_active = QIB_IB_SDR; + mult = 1; + } + if (ibcs & SYM_MASK(IBCStatusA_0, LinkWidthActive)) { + ppd->link_width_active = IB_WIDTH_4X; + mult *= 4; + } else + ppd->link_width_active = IB_WIDTH_1X; + ppd->delay_mult = ib_rate_to_delay[mult_to_ib_rate(mult)]; + + if (!ibup) { + u64 clr; + + /* Link went down. 
*/ + /* do IPG MAD again after linkdown, even if last time failed */ + ppd->cpspec->ipg_tries = 0; + clr = qib_read_kreg_port(ppd, krp_ibcstatus_b) & + (SYM_MASK(IBCStatusB_0, heartbeat_timed_out) | + SYM_MASK(IBCStatusB_0, heartbeat_crosstalk)); + if (clr) + qib_write_kreg_port(ppd, krp_ibcstatus_b, clr); + if (!(ppd->lflags & (QIBL_IB_AUTONEG_FAILED | + QIBL_IB_AUTONEG_INPROG))) + set_7322_ibspeed_fast(ppd, ppd->link_speed_enabled); + if (!(ppd->lflags & QIBL_IB_AUTONEG_INPROG)) { + struct qib_qsfp_data *qd = + &ppd->cpspec->qsfp_data; + /* unlock the Tx settings, speed may change */ + qib_write_kreg_port(ppd, krp_tx_deemph_override, + SYM_MASK(IBSD_TX_DEEMPHASIS_OVERRIDE_0, + reset_tx_deemphasis_override)); + qib_cancel_sends(ppd); + /* on link down, ensure sane pcs state */ + qib_7322_mini_pcs_reset(ppd); + /* schedule the qsfp refresh which should turn the link + off */ + if (ppd->dd->flags & QIB_HAS_QSFP) { + qd->t_insert = jiffies; + queue_work(ib_wq, &qd->work); + } + spin_lock_irqsave(&ppd->sdma_lock, flags); + if (__qib_sdma_running(ppd)) + __qib_sdma_process_event(ppd, + qib_sdma_event_e70_go_idle); + spin_unlock_irqrestore(&ppd->sdma_lock, flags); + } + clr = read_7322_creg32_port(ppd, crp_iblinkdown); + if (clr == ppd->cpspec->iblnkdownsnap) + ppd->cpspec->iblnkdowndelta++; + } else { + if (qib_compat_ddr_negotiate && + !(ppd->lflags & (QIBL_IB_AUTONEG_FAILED | + QIBL_IB_AUTONEG_INPROG)) && + ppd->link_speed_active == QIB_IB_SDR && + (ppd->link_speed_enabled & QIB_IB_DDR) + && ppd->cpspec->autoneg_tries < AUTONEG_TRIES) { + /* we are SDR, and auto-negotiation enabled */ + ++ppd->cpspec->autoneg_tries; + if (!ppd->cpspec->ibdeltainprog) { + ppd->cpspec->ibdeltainprog = 1; + ppd->cpspec->ibsymdelta += + read_7322_creg32_port(ppd, + crp_ibsymbolerr) - + ppd->cpspec->ibsymsnap; + ppd->cpspec->iblnkerrdelta += + read_7322_creg32_port(ppd, + crp_iblinkerrrecov) - + ppd->cpspec->iblnkerrsnap; + } + try_7322_autoneg(ppd); + ret = 1; /* no other IB status change processing */ + } else if ((ppd->lflags & QIBL_IB_AUTONEG_INPROG) && + ppd->link_speed_active == QIB_IB_SDR) { + qib_autoneg_7322_send(ppd, 1); + set_7322_ibspeed_fast(ppd, QIB_IB_DDR); + qib_7322_mini_pcs_reset(ppd); + udelay(2); + ret = 1; /* no other IB status change processing */ + } else if ((ppd->lflags & QIBL_IB_AUTONEG_INPROG) && + (ppd->link_speed_active & QIB_IB_DDR)) { + spin_lock_irqsave(&ppd->lflags_lock, flags); + ppd->lflags &= ~(QIBL_IB_AUTONEG_INPROG | + QIBL_IB_AUTONEG_FAILED); + spin_unlock_irqrestore(&ppd->lflags_lock, flags); + ppd->cpspec->autoneg_tries = 0; + /* re-enable SDR, for next link down */ + set_7322_ibspeed_fast(ppd, ppd->link_speed_enabled); + wake_up(&ppd->cpspec->autoneg_wait); + symadj = 1; + } else if (ppd->lflags & QIBL_IB_AUTONEG_FAILED) { + /* + * Clear autoneg failure flag, and do setup + * so we'll try next time link goes down and + * back to INIT (possibly connected to a + * different device). 
+ */ + spin_lock_irqsave(&ppd->lflags_lock, flags); + ppd->lflags &= ~QIBL_IB_AUTONEG_FAILED; + spin_unlock_irqrestore(&ppd->lflags_lock, flags); + ppd->cpspec->ibcctrl_b |= IBA7322_IBC_IBTA_1_2_MASK; + symadj = 1; + } + if (!(ppd->lflags & QIBL_IB_AUTONEG_INPROG)) { + symadj = 1; + if (ppd->dd->cspec->r1 && ppd->cpspec->ipg_tries <= 10) + try_7322_ipg(ppd); + if (!ppd->cpspec->recovery_init) + setup_7322_link_recovery(ppd, 0); + ppd->cpspec->qdr_dfe_time = jiffies + + msecs_to_jiffies(QDR_DFE_DISABLE_DELAY); + } + ppd->cpspec->ibmalfusesnap = 0; + ppd->cpspec->ibmalfsnap = read_7322_creg32_port(ppd, + crp_errlink); + } + if (symadj) { + ppd->cpspec->iblnkdownsnap = + read_7322_creg32_port(ppd, crp_iblinkdown); + if (ppd->cpspec->ibdeltainprog) { + ppd->cpspec->ibdeltainprog = 0; + ppd->cpspec->ibsymdelta += read_7322_creg32_port(ppd, + crp_ibsymbolerr) - ppd->cpspec->ibsymsnap; + ppd->cpspec->iblnkerrdelta += read_7322_creg32_port(ppd, + crp_iblinkerrrecov) - ppd->cpspec->iblnkerrsnap; + } + } else if (!ibup && qib_compat_ddr_negotiate && + !ppd->cpspec->ibdeltainprog && + !(ppd->lflags & QIBL_IB_AUTONEG_INPROG)) { + ppd->cpspec->ibdeltainprog = 1; + ppd->cpspec->ibsymsnap = read_7322_creg32_port(ppd, + crp_ibsymbolerr); + ppd->cpspec->iblnkerrsnap = read_7322_creg32_port(ppd, + crp_iblinkerrrecov); + } + + if (!ret) + qib_setup_7322_setextled(ppd, ibup); + return ret; +} + +/* + * Does read/modify/write to appropriate registers to + * set output and direction bits selected by mask. + * these are in their canonical postions (e.g. lsb of + * dir will end up in D48 of extctrl on existing chips). + * returns contents of GP Inputs. + */ +static int gpio_7322_mod(struct qib_devdata *dd, u32 out, u32 dir, u32 mask) +{ + u64 read_val, new_out; + unsigned long flags; + + if (mask) { + /* some bits being written, lock access to GPIO */ + dir &= mask; + out &= mask; + spin_lock_irqsave(&dd->cspec->gpio_lock, flags); + dd->cspec->extctrl &= ~((u64)mask << SYM_LSB(EXTCtrl, GPIOOe)); + dd->cspec->extctrl |= ((u64) dir << SYM_LSB(EXTCtrl, GPIOOe)); + new_out = (dd->cspec->gpio_out & ~mask) | out; + + qib_write_kreg(dd, kr_extctrl, dd->cspec->extctrl); + qib_write_kreg(dd, kr_gpio_out, new_out); + dd->cspec->gpio_out = new_out; + spin_unlock_irqrestore(&dd->cspec->gpio_lock, flags); + } + /* + * It is unlikely that a read at this time would get valid + * data on a pin whose direction line was set in the same + * call to this function. We include the read here because + * that allows us to potentially combine a change on one pin with + * a read on another, and because the old code did something like + * this. + */ + read_val = qib_read_kreg64(dd, kr_extstatus); + return SYM_FIELD(read_val, EXTStatus, GPIOIn); +} + +/* Enable writes to config EEPROM, if possible. Returns previous state */ +static int qib_7322_eeprom_wen(struct qib_devdata *dd, int wen) +{ + int prev_wen; + u32 mask; + + mask = 1 << QIB_EEPROM_WEN_NUM; + prev_wen = ~gpio_7322_mod(dd, 0, 0, 0) >> QIB_EEPROM_WEN_NUM; + gpio_7322_mod(dd, wen ? 0 : mask, mask, mask); + + return prev_wen & 1; +} + +/* + * Read fundamental info we need to use the chip. These are + * the registers that describe chip capabilities, and are + * saved in shadow registers. 
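+ * For example, kr_sendpiobufcnt packs the 2K buffer count in its low
+ * 32 bits and the 4K count in the high 32 bits; both halves are split
+ * out into the devdata shadow fields below.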
+ */ +static void get_7322_chip_params(struct qib_devdata *dd) +{ + u64 val; + u32 piobufs; + int mtu; + + dd->palign = qib_read_kreg32(dd, kr_pagealign); + + dd->uregbase = qib_read_kreg32(dd, kr_userregbase); + + dd->rcvtidcnt = qib_read_kreg32(dd, kr_rcvtidcnt); + dd->rcvtidbase = qib_read_kreg32(dd, kr_rcvtidbase); + dd->rcvegrbase = qib_read_kreg32(dd, kr_rcvegrbase); + dd->piobufbase = qib_read_kreg64(dd, kr_sendpiobufbase); + dd->pio2k_bufbase = dd->piobufbase & 0xffffffff; + + val = qib_read_kreg64(dd, kr_sendpiobufcnt); + dd->piobcnt2k = val & ~0U; + dd->piobcnt4k = val >> 32; + val = qib_read_kreg64(dd, kr_sendpiosize); + dd->piosize2k = val & ~0U; + dd->piosize4k = val >> 32; + + mtu = ib_mtu_enum_to_int(qib_ibmtu); + if (mtu == -1) + mtu = QIB_DEFAULT_MTU; + dd->pport[0].ibmtu = (u32)mtu; + dd->pport[1].ibmtu = (u32)mtu; + + /* these may be adjusted in init_chip_wc_pat() */ + dd->pio2kbase = (u32 __iomem *) + ((char __iomem *) dd->kregbase + dd->pio2k_bufbase); + dd->pio4kbase = (u32 __iomem *) + ((char __iomem *) dd->kregbase + + (dd->piobufbase >> 32)); + /* + * 4K buffers take 2 pages; we use roundup just to be + * paranoid; we calculate it once here, rather than on + * ever buf allocate + */ + dd->align4k = ALIGN(dd->piosize4k, dd->palign); + + piobufs = dd->piobcnt4k + dd->piobcnt2k + NUM_VL15_BUFS; + + dd->pioavregs = ALIGN(piobufs, sizeof(u64) * BITS_PER_BYTE / 2) / + (sizeof(u64) * BITS_PER_BYTE / 2); +} + +/* + * The chip base addresses in cspec and cpspec have to be set + * after possible init_chip_wc_pat(), rather than in + * get_7322_chip_params(), so split out as separate function + */ +static void qib_7322_set_baseaddrs(struct qib_devdata *dd) +{ + u32 cregbase; + + cregbase = qib_read_kreg32(dd, kr_counterregbase); + + dd->cspec->cregbase = (u64 __iomem *)(cregbase + + (char __iomem *)dd->kregbase); + + dd->egrtidbase = (u64 __iomem *) + ((char __iomem *) dd->kregbase + dd->rcvegrbase); + + /* port registers are defined as relative to base of chip */ + dd->pport[0].cpspec->kpregbase = + (u64 __iomem *)((char __iomem *)dd->kregbase); + dd->pport[1].cpspec->kpregbase = + (u64 __iomem *)(dd->palign + + (char __iomem *)dd->kregbase); + dd->pport[0].cpspec->cpregbase = + (u64 __iomem *)(qib_read_kreg_port(&dd->pport[0], + kr_counterregbase) + (char __iomem *)dd->kregbase); + dd->pport[1].cpspec->cpregbase = + (u64 __iomem *)(qib_read_kreg_port(&dd->pport[1], + kr_counterregbase) + (char __iomem *)dd->kregbase); +} + +/* + * This is a fairly special-purpose observer, so we only support + * the port-specific parts of SendCtrl + */ + +#define SENDCTRL_SHADOWED (SYM_MASK(SendCtrl_0, SendEnable) | \ + SYM_MASK(SendCtrl_0, SDmaEnable) | \ + SYM_MASK(SendCtrl_0, SDmaIntEnable) | \ + SYM_MASK(SendCtrl_0, SDmaSingleDescriptor) | \ + SYM_MASK(SendCtrl_0, SDmaHalt) | \ + SYM_MASK(SendCtrl_0, IBVLArbiterEn) | \ + SYM_MASK(SendCtrl_0, ForceCreditUpToDate)) + +static int sendctrl_hook(struct qib_devdata *dd, + const struct diag_observer *op, u32 offs, + u64 *data, u64 mask, int only_32) +{ + unsigned long flags; + unsigned idx; + unsigned pidx; + struct qib_pportdata *ppd = NULL; + u64 local_data, all_bits; + + /* + * The fixed correspondence between Physical ports and pports is + * severed. We need to hunt for the ppd that corresponds + * to the offset we got. And we have to do that without admitting + * we know the stride, apparently. 
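+	 * So we compute each port's krp_sendctrl offset from its
+	 * kpregbase and compare it with the offset the diag layer
+	 * handed us.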
+ */ + for (pidx = 0; pidx < dd->num_pports; ++pidx) { + u64 __iomem *psptr; + u32 psoffs; + + ppd = dd->pport + pidx; + if (!ppd->cpspec->kpregbase) + continue; + + psptr = ppd->cpspec->kpregbase + krp_sendctrl; + psoffs = (u32) (psptr - dd->kregbase) * sizeof(*psptr); + if (psoffs == offs) + break; + } + + /* If pport is not being managed by driver, just avoid shadows. */ + if (pidx >= dd->num_pports) + ppd = NULL; + + /* In any case, "idx" is flat index in kreg space */ + idx = offs / sizeof(u64); + + all_bits = ~0ULL; + if (only_32) + all_bits >>= 32; + + spin_lock_irqsave(&dd->sendctrl_lock, flags); + if (!ppd || (mask & all_bits) != all_bits) { + /* + * At least some mask bits are zero, so we need + * to read. The judgement call is whether from + * reg or shadow. First-cut: read reg, and complain + * if any bits which should be shadowed are different + * from their shadowed value. + */ + if (only_32) + local_data = (u64)qib_read_kreg32(dd, idx); + else + local_data = qib_read_kreg64(dd, idx); + *data = (local_data & ~mask) | (*data & mask); + } + if (mask) { + /* + * At least some mask bits are one, so we need + * to write, but only shadow some bits. + */ + u64 sval, tval; /* Shadowed, transient */ + + /* + * New shadow val is bits we don't want to touch, + * ORed with bits we do, that are intended for shadow. + */ + if (ppd) { + sval = ppd->p_sendctrl & ~mask; + sval |= *data & SENDCTRL_SHADOWED & mask; + ppd->p_sendctrl = sval; + } else + sval = *data & SENDCTRL_SHADOWED & mask; + tval = sval | (*data & ~SENDCTRL_SHADOWED & mask); + qib_write_kreg(dd, idx, tval); + qib_write_kreg(dd, kr_scratch, 0Ull); + } + spin_unlock_irqrestore(&dd->sendctrl_lock, flags); + return only_32 ? 4 : 8; +} + +static const struct diag_observer sendctrl_0_observer = { + sendctrl_hook, KREG_IDX(SendCtrl_0) * sizeof(u64), + KREG_IDX(SendCtrl_0) * sizeof(u64) +}; + +static const struct diag_observer sendctrl_1_observer = { + sendctrl_hook, KREG_IDX(SendCtrl_1) * sizeof(u64), + KREG_IDX(SendCtrl_1) * sizeof(u64) +}; + +static ushort sdma_fetch_prio = 8; +module_param_named(sdma_fetch_prio, sdma_fetch_prio, ushort, S_IRUGO); +MODULE_PARM_DESC(sdma_fetch_prio, "SDMA descriptor fetch priority"); + +/* Besides logging QSFP events, we set appropriate TxDDS values */ +static void init_txdds_table(struct qib_pportdata *ppd, int override); + +static void qsfp_7322_event(struct work_struct *work) +{ + struct qib_qsfp_data *qd; + struct qib_pportdata *ppd; + unsigned long pwrup; + unsigned long flags; + int ret; + u32 le2; + + qd = container_of(work, struct qib_qsfp_data, work); + ppd = qd->ppd; + pwrup = qd->t_insert + + msecs_to_jiffies(QSFP_PWR_LAG_MSEC - QSFP_MODPRS_LAG_MSEC); + + /* Delay for 20 msecs to allow ModPrs resistor to setup */ + mdelay(QSFP_MODPRS_LAG_MSEC); + + if (!qib_qsfp_mod_present(ppd)) { + ppd->cpspec->qsfp_data.modpresent = 0; + /* Set the physical link to disabled */ + qib_set_ib_7322_lstate(ppd, 0, + QLOGIC_IB_IBCC_LINKINITCMD_DISABLE); + spin_lock_irqsave(&ppd->lflags_lock, flags); + ppd->lflags &= ~QIBL_LINKV; + spin_unlock_irqrestore(&ppd->lflags_lock, flags); + } else { + /* + * Some QSFP's not only do not respond until the full power-up + * time, but may behave badly if we try. So hold off responding + * to insertion. 
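+		 * (pwrup, computed above from t_insert, is the earliest
+		 * time we attempt to read the module.)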
+ */ + while (1) { + if (time_is_before_jiffies(pwrup)) + break; + msleep(20); + } + + ret = qib_refresh_qsfp_cache(ppd, &qd->cache); + + /* + * Need to change LE2 back to defaults if we couldn't + * read the cable type (to handle cable swaps), so do this + * even on failure to read cable information. We don't + * get here for QME, so IS_QME check not needed here. + */ + if (!ret && !ppd->dd->cspec->r1) { + if (QSFP_IS_ACTIVE_FAR(qd->cache.tech)) + le2 = LE2_QME; + else if (qd->cache.atten[1] >= qib_long_atten && + QSFP_IS_CU(qd->cache.tech)) + le2 = LE2_5m; + else + le2 = LE2_DEFAULT; + } else + le2 = LE2_DEFAULT; + ibsd_wr_allchans(ppd, 13, (le2 << 7), BMASK(9, 7)); + /* + * We always change parameteters, since we can choose + * values for cables without eeproms, and the cable may have + * changed from a cable with full or partial eeprom content + * to one with partial or no content. + */ + init_txdds_table(ppd, 0); + /* The physical link is being re-enabled only when the + * previous state was DISABLED and the VALID bit is not + * set. This should only happen when the cable has been + * physically pulled. */ + if (!ppd->cpspec->qsfp_data.modpresent && + (ppd->lflags & (QIBL_LINKV | QIBL_IB_LINK_DISABLED))) { + ppd->cpspec->qsfp_data.modpresent = 1; + qib_set_ib_7322_lstate(ppd, 0, + QLOGIC_IB_IBCC_LINKINITCMD_SLEEP); + spin_lock_irqsave(&ppd->lflags_lock, flags); + ppd->lflags |= QIBL_LINKV; + spin_unlock_irqrestore(&ppd->lflags_lock, flags); + } + } +} + +/* + * There is little we can do but complain to the user if QSFP + * initialization fails. + */ +static void qib_init_7322_qsfp(struct qib_pportdata *ppd) +{ + unsigned long flags; + struct qib_qsfp_data *qd = &ppd->cpspec->qsfp_data; + struct qib_devdata *dd = ppd->dd; + u64 mod_prs_bit = QSFP_GPIO_MOD_PRS_N; + + mod_prs_bit <<= (QSFP_GPIO_PORT2_SHIFT * ppd->hw_pidx); + qd->ppd = ppd; + qib_qsfp_init(qd, qsfp_7322_event); + spin_lock_irqsave(&dd->cspec->gpio_lock, flags); + dd->cspec->extctrl |= (mod_prs_bit << SYM_LSB(EXTCtrl, GPIOInvert)); + dd->cspec->gpio_mask |= mod_prs_bit; + qib_write_kreg(dd, kr_extctrl, dd->cspec->extctrl); + qib_write_kreg(dd, kr_gpio_mask, dd->cspec->gpio_mask); + spin_unlock_irqrestore(&dd->cspec->gpio_lock, flags); +} + +/* + * called at device initialization time, and also if the txselect + * module parameter is changed. This is used for cables that don't + * have valid QSFP EEPROMs (not present, or attenuation is zero). + * We initialize to the default, then if there is a specific + * unit,port match, we use that (and set it immediately, for the + * current speed, if the link is at INIT or better). + * String format is "default# unit#,port#=# ... u,p=#", separators must + * be a SPACE character. A newline terminates. The u,p=# tuples may + * optionally have "u,p=#,#", where the final # is the H1 value + * The last specific match is used (actually, all are used, but last + * one is the one that winds up set); if none at all, fall back on default. 
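+ * For example, "5 0,1=9 0,2=9,11" selects entry 5 as the default,
+ * entry 9 for unit 0 ports 1 and 2, and additionally overrides H1 to
+ * 11 on port 2.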
+ */ +static void set_no_qsfp_atten(struct qib_devdata *dd, int change) +{ + char *nxt, *str; + u32 pidx, unit, port, deflt, h1; + unsigned long val; + int any = 0, seth1; + int txdds_size; + + str = txselect_list; + + /* default number is validated in setup_txselect() */ + deflt = simple_strtoul(str, &nxt, 0); + for (pidx = 0; pidx < dd->num_pports; ++pidx) + dd->pport[pidx].cpspec->no_eep = deflt; + + txdds_size = TXDDS_TABLE_SZ + TXDDS_EXTRA_SZ; + if (IS_QME(dd) || IS_QMH(dd)) + txdds_size += TXDDS_MFG_SZ; + + while (*nxt && nxt[1]) { + str = ++nxt; + unit = simple_strtoul(str, &nxt, 0); + if (nxt == str || !*nxt || *nxt != ',') { + while (*nxt && *nxt++ != ' ') /* skip to next, if any */ + ; + continue; + } + str = ++nxt; + port = simple_strtoul(str, &nxt, 0); + if (nxt == str || *nxt != '=') { + while (*nxt && *nxt++ != ' ') /* skip to next, if any */ + ; + continue; + } + str = ++nxt; + val = simple_strtoul(str, &nxt, 0); + if (nxt == str) { + while (*nxt && *nxt++ != ' ') /* skip to next, if any */ + ; + continue; + } + if (val >= txdds_size) + continue; + seth1 = 0; + h1 = 0; /* gcc thinks it might be used uninitted */ + if (*nxt == ',' && nxt[1]) { + str = ++nxt; + h1 = (u32)simple_strtoul(str, &nxt, 0); + if (nxt == str) + while (*nxt && *nxt++ != ' ') /* skip */ + ; + else + seth1 = 1; + } + for (pidx = 0; dd->unit == unit && pidx < dd->num_pports; + ++pidx) { + struct qib_pportdata *ppd = &dd->pport[pidx]; + + if (ppd->port != port || !ppd->link_speed_supported) + continue; + ppd->cpspec->no_eep = val; + if (seth1) + ppd->cpspec->h1_val = h1; + /* now change the IBC and serdes, overriding generic */ + init_txdds_table(ppd, 1); + /* Re-enable the physical state machine on mezz boards + * now that the correct settings have been set. + * QSFP boards are handles by the QSFP event handler */ + if (IS_QMH(dd) || IS_QME(dd)) + qib_set_ib_7322_lstate(ppd, 0, + QLOGIC_IB_IBCC_LINKINITCMD_SLEEP); + any++; + } + if (*nxt == '\n') + break; /* done */ + } + if (change && !any) { + /* no specific setting, use the default. + * Change the IBC and serdes, but since it's + * general, don't override specific settings. + */ + for (pidx = 0; pidx < dd->num_pports; ++pidx) + if (dd->pport[pidx].link_speed_supported) + init_txdds_table(&dd->pport[pidx], 0); + } +} + +/* handle the txselect parameter changing */ +static int setup_txselect(const char *str, struct kernel_param *kp) +{ + struct qib_devdata *dd; + unsigned long val; + char *n; + + if (strlen(str) >= MAX_ATTEN_LEN) { + pr_info("txselect_values string too long\n"); + return -ENOSPC; + } + val = simple_strtoul(str, &n, 0); + if (n == str || val >= (TXDDS_TABLE_SZ + TXDDS_EXTRA_SZ + + TXDDS_MFG_SZ)) { + pr_info("txselect_values must start with a number < %d\n", + TXDDS_TABLE_SZ + TXDDS_EXTRA_SZ + TXDDS_MFG_SZ); + return -EINVAL; + } + strcpy(txselect_list, str); + + list_for_each_entry(dd, &qib_dev_list, list) + if (dd->deviceid == PCI_DEVICE_ID_QLOGIC_IB_7322) + set_no_qsfp_atten(dd, 1); + return 0; +} + +/* + * Write the final few registers that depend on some of the + * init setup. Done late in init, just before bringing up + * the serdes. 
+ */ +static int qib_late_7322_initreg(struct qib_devdata *dd) +{ + int ret = 0, n; + u64 val; + + qib_write_kreg(dd, kr_rcvhdrentsize, dd->rcvhdrentsize); + qib_write_kreg(dd, kr_rcvhdrsize, dd->rcvhdrsize); + qib_write_kreg(dd, kr_rcvhdrcnt, dd->rcvhdrcnt); + qib_write_kreg(dd, kr_sendpioavailaddr, dd->pioavailregs_phys); + val = qib_read_kreg64(dd, kr_sendpioavailaddr); + if (val != dd->pioavailregs_phys) { + qib_dev_err(dd, + "Catastrophic software error, SendPIOAvailAddr written as %lx, read back as %llx\n", + (unsigned long) dd->pioavailregs_phys, + (unsigned long long) val); + ret = -EINVAL; + } + + n = dd->piobcnt2k + dd->piobcnt4k + NUM_VL15_BUFS; + qib_7322_txchk_change(dd, 0, n, TXCHK_CHG_TYPE_KERN, NULL); + /* driver sends get pkey, lid, etc. checking also, to catch bugs */ + qib_7322_txchk_change(dd, 0, n, TXCHK_CHG_TYPE_ENAB1, NULL); + + qib_register_observer(dd, &sendctrl_0_observer); + qib_register_observer(dd, &sendctrl_1_observer); + + dd->control &= ~QLOGIC_IB_C_SDMAFETCHPRIOEN; + qib_write_kreg(dd, kr_control, dd->control); + /* + * Set SendDmaFetchPriority and init Tx params, including + * QSFP handler on boards that have QSFP. + * First set our default attenuation entry for cables that + * don't have valid attenuation. + */ + set_no_qsfp_atten(dd, 0); + for (n = 0; n < dd->num_pports; ++n) { + struct qib_pportdata *ppd = dd->pport + n; + + qib_write_kreg_port(ppd, krp_senddmaprioritythld, + sdma_fetch_prio & 0xf); + /* Initialize qsfp if present on board. */ + if (dd->flags & QIB_HAS_QSFP) + qib_init_7322_qsfp(ppd); + } + dd->control |= QLOGIC_IB_C_SDMAFETCHPRIOEN; + qib_write_kreg(dd, kr_control, dd->control); + + return ret; +} + +/* per IB port errors. */ +#define SENDCTRL_PIBP (MASK_ACROSS(0, 1) | MASK_ACROSS(3, 3) | \ + MASK_ACROSS(8, 15)) +#define RCVCTRL_PIBP (MASK_ACROSS(0, 17) | MASK_ACROSS(39, 41)) +#define ERRS_PIBP (MASK_ACROSS(57, 58) | MASK_ACROSS(54, 54) | \ + MASK_ACROSS(36, 49) | MASK_ACROSS(29, 34) | MASK_ACROSS(14, 17) | \ + MASK_ACROSS(0, 11)) + +/* + * Write the initialization per-port registers that need to be done at + * driver load and after reset completes (i.e., that aren't done as part + * of other init procedures called from qib_init.c). + * Some of these should be redundant on reset, but play safe. + */ +static void write_7322_init_portregs(struct qib_pportdata *ppd) +{ + u64 val; + int i; + + if (!ppd->link_speed_supported) { + /* no buffer credits for this port */ + for (i = 1; i < 8; i++) + qib_write_kreg_port(ppd, krp_rxcreditvl0 + i, 0); + qib_write_kreg_port(ppd, krp_ibcctrl_b, 0); + qib_write_kreg(ppd->dd, kr_scratch, 0); + return; + } + + /* + * Set the number of supported virtual lanes in IBC, + * for flow control packet handling on unsupported VLs + */ + val = qib_read_kreg_port(ppd, krp_ibsdtestiftx); + val &= ~SYM_MASK(IB_SDTEST_IF_TX_0, VL_CAP); + val |= (u64)(ppd->vls_supported - 1) << + SYM_LSB(IB_SDTEST_IF_TX_0, VL_CAP); + qib_write_kreg_port(ppd, krp_ibsdtestiftx, val); + + qib_write_kreg_port(ppd, krp_rcvbthqp, QIB_KD_QP); + + /* enable tx header checking */ + qib_write_kreg_port(ppd, krp_sendcheckcontrol, IBA7322_SENDCHK_PKEY | + IBA7322_SENDCHK_BTHQP | IBA7322_SENDCHK_SLID | + IBA7322_SENDCHK_RAW_IPV6 | IBA7322_SENDCHK_MINSZ); + + qib_write_kreg_port(ppd, krp_ncmodectrl, + SYM_MASK(IBNCModeCtrl_0, ScrambleCapLocal)); + + /* + * Unconditionally clear the bufmask bits. If SDMA is + * enabled, we'll set them appropriately later. 
+ */ + qib_write_kreg_port(ppd, krp_senddmabufmask0, 0); + qib_write_kreg_port(ppd, krp_senddmabufmask1, 0); + qib_write_kreg_port(ppd, krp_senddmabufmask2, 0); + if (ppd->dd->cspec->r1) + ppd->p_sendctrl |= SYM_MASK(SendCtrl_0, ForceCreditUpToDate); +} + +/* + * Write the initialization per-device registers that need to be done at + * driver load and after reset completes (i.e., that aren't done as part + * of other init procedures called from qib_init.c). Also write per-port + * registers that are affected by overall device config, such as QP mapping + * Some of these should be redundant on reset, but play safe. + */ +static void write_7322_initregs(struct qib_devdata *dd) +{ + struct qib_pportdata *ppd; + int i, pidx; + u64 val; + + /* Set Multicast QPs received by port 2 to map to context one. */ + qib_write_kreg(dd, KREG_IDX(RcvQPMulticastContext_1), 1); + + for (pidx = 0; pidx < dd->num_pports; ++pidx) { + unsigned n, regno; + unsigned long flags; + + if (dd->n_krcv_queues < 2 || + !dd->pport[pidx].link_speed_supported) + continue; + + ppd = &dd->pport[pidx]; + + /* be paranoid against later code motion, etc. */ + spin_lock_irqsave(&dd->cspec->rcvmod_lock, flags); + ppd->p_rcvctrl |= SYM_MASK(RcvCtrl_0, RcvQPMapEnable); + spin_unlock_irqrestore(&dd->cspec->rcvmod_lock, flags); + + /* Initialize QP to context mapping */ + regno = krp_rcvqpmaptable; + val = 0; + if (dd->num_pports > 1) + n = dd->first_user_ctxt / dd->num_pports; + else + n = dd->first_user_ctxt - 1; + for (i = 0; i < 32; ) { + unsigned ctxt; + + if (dd->num_pports > 1) + ctxt = (i % n) * dd->num_pports + pidx; + else if (i % n) + ctxt = (i % n) + 1; + else + ctxt = ppd->hw_pidx; + val |= ctxt << (5 * (i % 6)); + i++; + if (i % 6 == 0) { + qib_write_kreg_port(ppd, regno, val); + val = 0; + regno++; + } + } + qib_write_kreg_port(ppd, regno, val); + } + + /* + * Setup up interrupt mitigation for kernel contexts, but + * not user contexts (user contexts use interrupts when + * stalled waiting for any packet, so want those interrupts + * right away). + */ + for (i = 0; i < dd->first_user_ctxt; i++) { + dd->cspec->rcvavail_timeout[i] = rcv_int_timeout; + qib_write_kreg(dd, kr_rcvavailtimeout + i, rcv_int_timeout); + } + + /* + * Initialize as (disabled) rcvflow tables. Application code + * will setup each flow as it uses the flow. + * Doesn't clear any of the error bits that might be set. + */ + val = TIDFLOW_ERRBITS; /* these are W1C */ + for (i = 0; i < dd->cfgctxts; i++) { + int flow; + + for (flow = 0; flow < NUM_TIDFLOWS_CTXT; flow++) + qib_write_ureg(dd, ur_rcvflowtable+flow, val, i); + } + + /* + * dual cards init to dual port recovery, single port cards to + * the one port. 
Dual port cards may later adjust to 1 port, + * and then back to dual port if both ports are connected + * */ + if (dd->num_pports) + setup_7322_link_recovery(dd->pport, dd->num_pports > 1); +} + +static int qib_init_7322_variables(struct qib_devdata *dd) +{ + struct qib_pportdata *ppd; + unsigned features, pidx, sbufcnt; + int ret, mtu; + u32 sbufs, updthresh; + resource_size_t vl15off; + + /* pport structs are contiguous, allocated after devdata */ + ppd = (struct qib_pportdata *)(dd + 1); + dd->pport = ppd; + ppd[0].dd = dd; + ppd[1].dd = dd; + + dd->cspec = (struct qib_chip_specific *)(ppd + 2); + + ppd[0].cpspec = (struct qib_chippport_specific *)(dd->cspec + 1); + ppd[1].cpspec = &ppd[0].cpspec[1]; + ppd[0].cpspec->ppd = &ppd[0]; /* for autoneg_7322_work() */ + ppd[1].cpspec->ppd = &ppd[1]; /* for autoneg_7322_work() */ + + spin_lock_init(&dd->cspec->rcvmod_lock); + spin_lock_init(&dd->cspec->gpio_lock); + + /* we haven't yet set QIB_PRESENT, so use read directly */ + dd->revision = readq(&dd->kregbase[kr_revision]); + + if ((dd->revision & 0xffffffffU) == 0xffffffffU) { + qib_dev_err(dd, + "Revision register read failure, giving up initialization\n"); + ret = -ENODEV; + goto bail; + } + dd->flags |= QIB_PRESENT; /* now register routines work */ + + dd->majrev = (u8) SYM_FIELD(dd->revision, Revision_R, ChipRevMajor); + dd->minrev = (u8) SYM_FIELD(dd->revision, Revision_R, ChipRevMinor); + dd->cspec->r1 = dd->minrev == 1; + + get_7322_chip_params(dd); + features = qib_7322_boardname(dd); + + /* now that piobcnt2k and 4k set, we can allocate these */ + sbufcnt = dd->piobcnt2k + dd->piobcnt4k + + NUM_VL15_BUFS + BITS_PER_LONG - 1; + sbufcnt /= BITS_PER_LONG; + dd->cspec->sendchkenable = kmalloc(sbufcnt * + sizeof(*dd->cspec->sendchkenable), GFP_KERNEL); + dd->cspec->sendgrhchk = kmalloc(sbufcnt * + sizeof(*dd->cspec->sendgrhchk), GFP_KERNEL); + dd->cspec->sendibchk = kmalloc(sbufcnt * + sizeof(*dd->cspec->sendibchk), GFP_KERNEL); + if (!dd->cspec->sendchkenable || !dd->cspec->sendgrhchk || + !dd->cspec->sendibchk) { + qib_dev_err(dd, "Failed allocation for hdrchk bitmaps\n"); + ret = -ENOMEM; + goto bail; + } + + ppd = dd->pport; + + /* + * GPIO bits for TWSI data and clock, + * used for serial EEPROM. + */ + dd->gpio_sda_num = _QIB_GPIO_SDA_NUM; + dd->gpio_scl_num = _QIB_GPIO_SCL_NUM; + dd->twsi_eeprom_dev = QIB_TWSI_EEPROM_DEV; + + dd->flags |= QIB_HAS_INTX | QIB_HAS_LINK_LATENCY | + QIB_NODMA_RTAIL | QIB_HAS_VLSUPP | QIB_HAS_HDRSUPP | + QIB_HAS_THRESH_UPDATE | + (sdma_idle_cnt ? QIB_HAS_SDMA_TIMEOUT : 0); + dd->flags |= qib_special_trigger ? + QIB_USE_SPCL_TRIG : QIB_HAS_SEND_DMA; + + /* + * Setup initial values. These may change when PAT is enabled, but + * we need these to do initial chip register accesses. 
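+	 * (qib_7322_set_baseaddrs() is called a second time further
+	 * down, once init_chip_wc_pat() has run.)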
+ */ + qib_7322_set_baseaddrs(dd); + + mtu = ib_mtu_enum_to_int(qib_ibmtu); + if (mtu == -1) + mtu = QIB_DEFAULT_MTU; + + dd->cspec->int_enable_mask = QIB_I_BITSEXTANT; + /* all hwerrors become interrupts, unless special purposed */ + dd->cspec->hwerrmask = ~0ULL; + /* link_recovery setup causes these errors, so ignore them, + * other than clearing them when they occur */ + dd->cspec->hwerrmask &= + ~(SYM_MASK(HwErrMask, IBSerdesPClkNotDetectMask_0) | + SYM_MASK(HwErrMask, IBSerdesPClkNotDetectMask_1) | + HWE_MASK(LATriggered)); + + for (pidx = 0; pidx < NUM_IB_PORTS; ++pidx) { + struct qib_chippport_specific *cp = ppd->cpspec; + + ppd->link_speed_supported = features & PORT_SPD_CAP; + features >>= PORT_SPD_CAP_SHIFT; + if (!ppd->link_speed_supported) { + /* single port mode (7340, or configured) */ + dd->skip_kctxt_mask |= 1 << pidx; + if (pidx == 0) { + /* Make sure port is disabled. */ + qib_write_kreg_port(ppd, krp_rcvctrl, 0); + qib_write_kreg_port(ppd, krp_ibcctrl_a, 0); + ppd[0] = ppd[1]; + dd->cspec->hwerrmask &= ~(SYM_MASK(HwErrMask, + IBSerdesPClkNotDetectMask_0) + | SYM_MASK(HwErrMask, + SDmaMemReadErrMask_0)); + dd->cspec->int_enable_mask &= ~( + SYM_MASK(IntMask, SDmaCleanupDoneMask_0) | + SYM_MASK(IntMask, SDmaIdleIntMask_0) | + SYM_MASK(IntMask, SDmaProgressIntMask_0) | + SYM_MASK(IntMask, SDmaIntMask_0) | + SYM_MASK(IntMask, ErrIntMask_0) | + SYM_MASK(IntMask, SendDoneIntMask_0)); + } else { + /* Make sure port is disabled. */ + qib_write_kreg_port(ppd, krp_rcvctrl, 0); + qib_write_kreg_port(ppd, krp_ibcctrl_a, 0); + dd->cspec->hwerrmask &= ~(SYM_MASK(HwErrMask, + IBSerdesPClkNotDetectMask_1) + | SYM_MASK(HwErrMask, + SDmaMemReadErrMask_1)); + dd->cspec->int_enable_mask &= ~( + SYM_MASK(IntMask, SDmaCleanupDoneMask_1) | + SYM_MASK(IntMask, SDmaIdleIntMask_1) | + SYM_MASK(IntMask, SDmaProgressIntMask_1) | + SYM_MASK(IntMask, SDmaIntMask_1) | + SYM_MASK(IntMask, ErrIntMask_1) | + SYM_MASK(IntMask, SendDoneIntMask_1)); + } + continue; + } + + dd->num_pports++; + ret = qib_init_pportdata(ppd, dd, pidx, dd->num_pports); + if (ret) { + dd->num_pports--; + goto bail; + } + + ppd->link_width_supported = IB_WIDTH_1X | IB_WIDTH_4X; + ppd->link_width_enabled = IB_WIDTH_4X; + ppd->link_speed_enabled = ppd->link_speed_supported; + /* + * Set the initial values to reasonable default, will be set + * for real when link is up. + */ + ppd->link_width_active = IB_WIDTH_4X; + ppd->link_speed_active = QIB_IB_SDR; + ppd->delay_mult = ib_rate_to_delay[IB_RATE_10_GBPS]; + switch (qib_num_cfg_vls) { + case 1: + ppd->vls_supported = IB_VL_VL0; + break; + case 2: + ppd->vls_supported = IB_VL_VL0_1; + break; + default: + qib_devinfo(dd->pcidev, + "Invalid num_vls %u, using 4 VLs\n", + qib_num_cfg_vls); + qib_num_cfg_vls = 4; + /* fall through */ + case 4: + ppd->vls_supported = IB_VL_VL0_3; + break; + case 8: + if (mtu <= 2048) + ppd->vls_supported = IB_VL_VL0_7; + else { + qib_devinfo(dd->pcidev, + "Invalid num_vls %u for MTU %d , using 4 VLs\n", + qib_num_cfg_vls, mtu); + ppd->vls_supported = IB_VL_VL0_3; + qib_num_cfg_vls = 4; + } + break; + } + ppd->vls_operational = ppd->vls_supported; + + init_waitqueue_head(&cp->autoneg_wait); + INIT_DELAYED_WORK(&cp->autoneg_work, + autoneg_7322_work); + if (ppd->dd->cspec->r1) + INIT_DELAYED_WORK(&cp->ipg_work, ipg_7322_work); + + /* + * For Mez and similar cards, no qsfp info, so do + * the "cable info" setup here. Can be overridden + * in adapter-specific routines. 
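+		 * (The no_eep index chosen below selects one of the
+		 * txdds "extra" entries via find_best_ent().)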
+ */ + if (!(dd->flags & QIB_HAS_QSFP)) { + if (!IS_QMH(dd) && !IS_QME(dd)) + qib_devinfo(dd->pcidev, + "IB%u:%u: Unknown mezzanine card type\n", + dd->unit, ppd->port); + cp->h1_val = IS_QMH(dd) ? H1_FORCE_QMH : H1_FORCE_QME; + /* + * Choose center value as default tx serdes setting + * until changed through module parameter. + */ + ppd->cpspec->no_eep = IS_QMH(dd) ? + TXDDS_TABLE_SZ + 2 : TXDDS_TABLE_SZ + 4; + } else + cp->h1_val = H1_FORCE_VAL; + + /* Avoid writes to chip for mini_init */ + if (!qib_mini_init) + write_7322_init_portregs(ppd); + + init_timer(&cp->chase_timer); + cp->chase_timer.function = reenable_chase; + cp->chase_timer.data = (unsigned long)ppd; + + ppd++; + } + + dd->rcvhdrentsize = qib_rcvhdrentsize ? + qib_rcvhdrentsize : QIB_RCVHDR_ENTSIZE; + dd->rcvhdrsize = qib_rcvhdrsize ? + qib_rcvhdrsize : QIB_DFLT_RCVHDRSIZE; + dd->rhf_offset = dd->rcvhdrentsize - sizeof(u64) / sizeof(u32); + + /* we always allocate at least 2048 bytes for eager buffers */ + dd->rcvegrbufsize = max(mtu, 2048); + BUG_ON(!is_power_of_2(dd->rcvegrbufsize)); + dd->rcvegrbufsize_shift = ilog2(dd->rcvegrbufsize); + + qib_7322_tidtemplate(dd); + + /* + * We can request a receive interrupt for 1 or + * more packets from current offset. + */ + dd->rhdrhead_intr_off = + (u64) rcv_int_count << IBA7322_HDRHEAD_PKTINT_SHIFT; + + /* setup the stats timer; the add_timer is done at end of init */ + init_timer(&dd->stats_timer); + dd->stats_timer.function = qib_get_7322_faststats; + dd->stats_timer.data = (unsigned long) dd; + + dd->ureg_align = 0x10000; /* 64KB alignment */ + + dd->piosize2kmax_dwords = dd->piosize2k >> 2; + + qib_7322_config_ctxts(dd); + qib_set_ctxtcnt(dd); + + /* + * We do not set WC on the VL15 buffers to avoid + * a rare problem with unaligned writes from + * interrupt-flushed store buffers, so we need + * to map those separately here. We can't solve + * this for the rarely used mtrr case. + */ + ret = init_chip_wc_pat(dd, 0); + if (ret) + goto bail; + + /* vl15 buffers start just after the 4k buffers */ + vl15off = dd->physaddr + (dd->piobufbase >> 32) + + dd->piobcnt4k * dd->align4k; + dd->piovl15base = ioremap_nocache(vl15off, + NUM_VL15_BUFS * dd->align4k); + if (!dd->piovl15base) { + ret = -ENOMEM; + goto bail; + } + + qib_7322_set_baseaddrs(dd); /* set chip access pointers now */ + + ret = 0; + if (qib_mini_init) + goto bail; + if (!dd->num_pports) { + qib_dev_err(dd, "No ports enabled, giving up initialization\n"); + goto bail; /* no error, so can still figure out why err */ + } + + write_7322_initregs(dd); + ret = qib_create_ctxts(dd); + init_7322_cntrnames(dd); + + updthresh = 8U; /* update threshold */ + + /* use all of 4KB buffers for the kernel SDMA, zero if !SDMA. + * reserve the update threshold amount for other kernel use, such + * as sending SMI, MAD, and ACKs, or 3, whichever is greater, + * unless we aren't enabling SDMA, in which case we want to use + * all the 4k bufs for the kernel. + * if this was less than the update threshold, we could wait + * a long time for an update. Coded this way because we + * sometimes change the update threshold for various reasons, + * and we want this to remain robust. + */ + if (dd->flags & QIB_HAS_SEND_DMA) { + dd->cspec->sdmabufcnt = dd->piobcnt4k; + sbufs = updthresh > 3 ? 
updthresh : 3; + } else { + dd->cspec->sdmabufcnt = 0; + sbufs = dd->piobcnt4k; + } + dd->cspec->lastbuf_for_pio = dd->piobcnt2k + dd->piobcnt4k - + dd->cspec->sdmabufcnt; + dd->lastctxt_piobuf = dd->cspec->lastbuf_for_pio - sbufs; + dd->cspec->lastbuf_for_pio--; /* range is <= , not < */ + dd->last_pio = dd->cspec->lastbuf_for_pio; + dd->pbufsctxt = (dd->cfgctxts > dd->first_user_ctxt) ? + dd->lastctxt_piobuf / (dd->cfgctxts - dd->first_user_ctxt) : 0; + + /* + * If we have 16 user contexts, we will have 7 sbufs + * per context, so reduce the update threshold to match. We + * want to update before we actually run out, at low pbufs/ctxt + * so give ourselves some margin. + */ + if (dd->pbufsctxt >= 2 && dd->pbufsctxt - 2 < updthresh) + updthresh = dd->pbufsctxt - 2; + dd->cspec->updthresh_dflt = updthresh; + dd->cspec->updthresh = updthresh; + + /* before full enable, no interrupts, no locking needed */ + dd->sendctrl |= ((updthresh & SYM_RMASK(SendCtrl, AvailUpdThld)) + << SYM_LSB(SendCtrl, AvailUpdThld)) | + SYM_MASK(SendCtrl, SendBufAvailPad64Byte); + + dd->psxmitwait_supported = 1; + dd->psxmitwait_check_rate = QIB_7322_PSXMITWAIT_CHECK_RATE; +bail: + if (!dd->ctxtcnt) + dd->ctxtcnt = 1; /* for other initialization code */ + + return ret; +} + +static u32 __iomem *qib_7322_getsendbuf(struct qib_pportdata *ppd, u64 pbc, + u32 *pbufnum) +{ + u32 first, last, plen = pbc & QIB_PBC_LENGTH_MASK; + struct qib_devdata *dd = ppd->dd; + + /* last is same for 2k and 4k, because we use 4k if all 2k busy */ + if (pbc & PBC_7322_VL15_SEND) { + first = dd->piobcnt2k + dd->piobcnt4k + ppd->hw_pidx; + last = first; + } else { + if ((plen + 1) > dd->piosize2kmax_dwords) + first = dd->piobcnt2k; + else + first = 0; + last = dd->cspec->lastbuf_for_pio; + } + return qib_getsendbuf_range(dd, pbufnum, first, last); +} + +static void qib_set_cntr_7322_sample(struct qib_pportdata *ppd, u32 intv, + u32 start) +{ + qib_write_kreg_port(ppd, krp_psinterval, intv); + qib_write_kreg_port(ppd, krp_psstart, start); +} + +/* + * Must be called with sdma_lock held, or before init finished. 
+ */ +static void qib_sdma_set_7322_desc_cnt(struct qib_pportdata *ppd, unsigned cnt) +{ + qib_write_kreg_port(ppd, krp_senddmadesccnt, cnt); +} + +/* + * sdma_lock should be acquired before calling this routine + */ +static void dump_sdma_7322_state(struct qib_pportdata *ppd) +{ + u64 reg, reg1, reg2; + + reg = qib_read_kreg_port(ppd, krp_senddmastatus); + qib_dev_porterr(ppd->dd, ppd->port, + "SDMA senddmastatus: 0x%016llx\n", reg); + + reg = qib_read_kreg_port(ppd, krp_sendctrl); + qib_dev_porterr(ppd->dd, ppd->port, + "SDMA sendctrl: 0x%016llx\n", reg); + + reg = qib_read_kreg_port(ppd, krp_senddmabase); + qib_dev_porterr(ppd->dd, ppd->port, + "SDMA senddmabase: 0x%016llx\n", reg); + + reg = qib_read_kreg_port(ppd, krp_senddmabufmask0); + reg1 = qib_read_kreg_port(ppd, krp_senddmabufmask1); + reg2 = qib_read_kreg_port(ppd, krp_senddmabufmask2); + qib_dev_porterr(ppd->dd, ppd->port, + "SDMA senddmabufmask 0:%llx 1:%llx 2:%llx\n", + reg, reg1, reg2); + + /* get bufuse bits, clear them, and print them again if non-zero */ + reg = qib_read_kreg_port(ppd, krp_senddmabuf_use0); + qib_write_kreg_port(ppd, krp_senddmabuf_use0, reg); + reg1 = qib_read_kreg_port(ppd, krp_senddmabuf_use1); + qib_write_kreg_port(ppd, krp_senddmabuf_use0, reg1); + reg2 = qib_read_kreg_port(ppd, krp_senddmabuf_use2); + qib_write_kreg_port(ppd, krp_senddmabuf_use0, reg2); + /* 0 and 1 should always be zero, so print as short form */ + qib_dev_porterr(ppd->dd, ppd->port, + "SDMA current senddmabuf_use 0:%llx 1:%llx 2:%llx\n", + reg, reg1, reg2); + reg = qib_read_kreg_port(ppd, krp_senddmabuf_use0); + reg1 = qib_read_kreg_port(ppd, krp_senddmabuf_use1); + reg2 = qib_read_kreg_port(ppd, krp_senddmabuf_use2); + /* 0 and 1 should always be zero, so print as short form */ + qib_dev_porterr(ppd->dd, ppd->port, + "SDMA cleared senddmabuf_use 0:%llx 1:%llx 2:%llx\n", + reg, reg1, reg2); + + reg = qib_read_kreg_port(ppd, krp_senddmatail); + qib_dev_porterr(ppd->dd, ppd->port, + "SDMA senddmatail: 0x%016llx\n", reg); + + reg = qib_read_kreg_port(ppd, krp_senddmahead); + qib_dev_porterr(ppd->dd, ppd->port, + "SDMA senddmahead: 0x%016llx\n", reg); + + reg = qib_read_kreg_port(ppd, krp_senddmaheadaddr); + qib_dev_porterr(ppd->dd, ppd->port, + "SDMA senddmaheadaddr: 0x%016llx\n", reg); + + reg = qib_read_kreg_port(ppd, krp_senddmalengen); + qib_dev_porterr(ppd->dd, ppd->port, + "SDMA senddmalengen: 0x%016llx\n", reg); + + reg = qib_read_kreg_port(ppd, krp_senddmadesccnt); + qib_dev_porterr(ppd->dd, ppd->port, + "SDMA senddmadesccnt: 0x%016llx\n", reg); + + reg = qib_read_kreg_port(ppd, krp_senddmaidlecnt); + qib_dev_porterr(ppd->dd, ppd->port, + "SDMA senddmaidlecnt: 0x%016llx\n", reg); + + reg = qib_read_kreg_port(ppd, krp_senddmaprioritythld); + qib_dev_porterr(ppd->dd, ppd->port, + "SDMA senddmapriorityhld: 0x%016llx\n", reg); + + reg = qib_read_kreg_port(ppd, krp_senddmareloadcnt); + qib_dev_porterr(ppd->dd, ppd->port, + "SDMA senddmareloadcnt: 0x%016llx\n", reg); + + dump_sdma_state(ppd); +} + +static struct sdma_set_state_action sdma_7322_action_table[] = { + [qib_sdma_state_s00_hw_down] = { + .go_s99_running_tofalse = 1, + .op_enable = 0, + .op_intenable = 0, + .op_halt = 0, + .op_drain = 0, + }, + [qib_sdma_state_s10_hw_start_up_wait] = { + .op_enable = 0, + .op_intenable = 1, + .op_halt = 1, + .op_drain = 0, + }, + [qib_sdma_state_s20_idle] = { + .op_enable = 1, + .op_intenable = 1, + .op_halt = 1, + .op_drain = 0, + }, + [qib_sdma_state_s30_sw_clean_up_wait] = { + .op_enable = 0, + .op_intenable = 1, + .op_halt = 1, + 
.op_drain = 0, + }, + [qib_sdma_state_s40_hw_clean_up_wait] = { + .op_enable = 1, + .op_intenable = 1, + .op_halt = 1, + .op_drain = 0, + }, + [qib_sdma_state_s50_hw_halt_wait] = { + .op_enable = 1, + .op_intenable = 1, + .op_halt = 1, + .op_drain = 1, + }, + [qib_sdma_state_s99_running] = { + .op_enable = 1, + .op_intenable = 1, + .op_halt = 0, + .op_drain = 0, + .go_s99_running_totrue = 1, + }, +}; + +static void qib_7322_sdma_init_early(struct qib_pportdata *ppd) +{ + ppd->sdma_state.set_state_action = sdma_7322_action_table; +} + +static int init_sdma_7322_regs(struct qib_pportdata *ppd) +{ + struct qib_devdata *dd = ppd->dd; + unsigned lastbuf, erstbuf; + u64 senddmabufmask[3] = { 0 }; + int n, ret = 0; + + qib_write_kreg_port(ppd, krp_senddmabase, ppd->sdma_descq_phys); + qib_sdma_7322_setlengen(ppd); + qib_sdma_update_7322_tail(ppd, 0); /* Set SendDmaTail */ + qib_write_kreg_port(ppd, krp_senddmareloadcnt, sdma_idle_cnt); + qib_write_kreg_port(ppd, krp_senddmadesccnt, 0); + qib_write_kreg_port(ppd, krp_senddmaheadaddr, ppd->sdma_head_phys); + + if (dd->num_pports) + n = dd->cspec->sdmabufcnt / dd->num_pports; /* no remainder */ + else + n = dd->cspec->sdmabufcnt; /* failsafe for init */ + erstbuf = (dd->piobcnt2k + dd->piobcnt4k) - + ((dd->num_pports == 1 || ppd->port == 2) ? n : + dd->cspec->sdmabufcnt); + lastbuf = erstbuf + n; + + ppd->sdma_state.first_sendbuf = erstbuf; + ppd->sdma_state.last_sendbuf = lastbuf; + for (; erstbuf < lastbuf; ++erstbuf) { + unsigned word = erstbuf / BITS_PER_LONG; + unsigned bit = erstbuf & (BITS_PER_LONG - 1); + + BUG_ON(word >= 3); + senddmabufmask[word] |= 1ULL << bit; + } + qib_write_kreg_port(ppd, krp_senddmabufmask0, senddmabufmask[0]); + qib_write_kreg_port(ppd, krp_senddmabufmask1, senddmabufmask[1]); + qib_write_kreg_port(ppd, krp_senddmabufmask2, senddmabufmask[2]); + return ret; +} + +/* sdma_lock must be held */ +static u16 qib_sdma_7322_gethead(struct qib_pportdata *ppd) +{ + struct qib_devdata *dd = ppd->dd; + int sane; + int use_dmahead; + u16 swhead; + u16 swtail; + u16 cnt; + u16 hwhead; + + use_dmahead = __qib_sdma_running(ppd) && + (dd->flags & QIB_HAS_SDMA_TIMEOUT); +retry: + hwhead = use_dmahead ? + (u16) le64_to_cpu(*ppd->sdma_head_dma) : + (u16) qib_read_kreg_port(ppd, krp_senddmahead); + + swhead = ppd->sdma_descq_head; + swtail = ppd->sdma_descq_tail; + cnt = ppd->sdma_descq_cnt; + + if (swhead < swtail) + /* not wrapped */ + sane = (hwhead >= swhead) & (hwhead <= swtail); + else if (swhead > swtail) + /* wrapped around */ + sane = ((hwhead >= swhead) && (hwhead < cnt)) || + (hwhead <= swtail); + else + /* empty */ + sane = (hwhead == swhead); + + if (unlikely(!sane)) { + if (use_dmahead) { + /* try one more time, directly from the register */ + use_dmahead = 0; + goto retry; + } + /* proceed as if no progress */ + hwhead = swhead; + } + + return hwhead; +} + +static int qib_sdma_7322_busy(struct qib_pportdata *ppd) +{ + u64 hwstatus = qib_read_kreg_port(ppd, krp_senddmastatus); + + return (hwstatus & SYM_MASK(SendDmaStatus_0, ScoreBoardDrainInProg)) || + (hwstatus & SYM_MASK(SendDmaStatus_0, HaltInProg)) || + !(hwstatus & SYM_MASK(SendDmaStatus_0, InternalSDmaHalt)) || + !(hwstatus & SYM_MASK(SendDmaStatus_0, ScbEmpty)); +} + +/* + * Compute the amount of delay before sending the next packet if the + * port's send rate differs from the static rate set for the QP. + * The delay affects the next packet and the amount of the delay is + * based on the length of the this packet. 
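+ * (The delay field is ((plen + 1) >> 1) times the port's delay_mult,
+ * and is applied only when the static rate's multiplier exceeds the
+ * port's; otherwise it is zero.)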
+ */ +static u32 qib_7322_setpbc_control(struct qib_pportdata *ppd, u32 plen, + u8 srate, u8 vl) +{ + u8 snd_mult = ppd->delay_mult; + u8 rcv_mult = ib_rate_to_delay[srate]; + u32 ret; + + ret = rcv_mult > snd_mult ? ((plen + 1) >> 1) * snd_mult : 0; + + /* Indicate VL15, else set the VL in the control word */ + if (vl == 15) + ret |= PBC_7322_VL15_SEND_CTRL; + else + ret |= vl << PBC_VL_NUM_LSB; + ret |= ((u32)(ppd->hw_pidx)) << PBC_PORT_SEL_LSB; + + return ret; +} + +/* + * Enable the per-port VL15 send buffers for use. + * They follow the rest of the buffers, without a config parameter. + * This was in initregs, but that is done before the shadow + * is set up, and this has to be done after the shadow is + * set up. + */ +static void qib_7322_initvl15_bufs(struct qib_devdata *dd) +{ + unsigned vl15bufs; + + vl15bufs = dd->piobcnt2k + dd->piobcnt4k; + qib_chg_pioavailkernel(dd, vl15bufs, NUM_VL15_BUFS, + TXCHK_CHG_TYPE_KERN, NULL); +} + +static void qib_7322_init_ctxt(struct qib_ctxtdata *rcd) +{ + if (rcd->ctxt < NUM_IB_PORTS) { + if (rcd->dd->num_pports > 1) { + rcd->rcvegrcnt = KCTXT0_EGRCNT / 2; + rcd->rcvegr_tid_base = rcd->ctxt ? rcd->rcvegrcnt : 0; + } else { + rcd->rcvegrcnt = KCTXT0_EGRCNT; + rcd->rcvegr_tid_base = 0; + } + } else { + rcd->rcvegrcnt = rcd->dd->cspec->rcvegrcnt; + rcd->rcvegr_tid_base = KCTXT0_EGRCNT + + (rcd->ctxt - NUM_IB_PORTS) * rcd->rcvegrcnt; + } +} + +#define QTXSLEEPS 5000 +static void qib_7322_txchk_change(struct qib_devdata *dd, u32 start, + u32 len, u32 which, struct qib_ctxtdata *rcd) +{ + int i; + const int last = start + len - 1; + const int lastr = last / BITS_PER_LONG; + u32 sleeps = 0; + int wait = rcd != NULL; + unsigned long flags; + + while (wait) { + unsigned long shadow; + int cstart, previ = -1; + + /* + * when flipping from kernel to user, we can't change + * the checking type if the buffer is allocated to the + * driver. It's OK the other direction, because it's + * from close, and we have just disarm'ed all the + * buffers. All the kernel to kernel changes are also + * OK. + */ + for (cstart = start; cstart <= last; cstart++) { + i = ((2 * cstart) + QLOGIC_IB_SENDPIOAVAIL_BUSY_SHIFT) + / BITS_PER_LONG; + if (i != previ) { + shadow = (unsigned long) + le64_to_cpu(dd->pioavailregs_dma[i]); + previ = i; + } + if (test_bit(((2 * cstart) + + QLOGIC_IB_SENDPIOAVAIL_BUSY_SHIFT) + % BITS_PER_LONG, &shadow)) + break; + } + + if (cstart > last) + break; + + if (sleeps == QTXSLEEPS) + break; + /* make sure we see an updated copy next time around */ + sendctrl_7322_mod(dd->pport, QIB_SENDCTRL_AVAIL_BLIP); + sleeps++; + msleep(20); + } + + switch (which) { + case TXCHK_CHG_TYPE_DIS1: + /* + * disable checking on a range; used by diags; just + * one buffer, but still written generically + */ + for (i = start; i <= last; i++) + clear_bit(i, dd->cspec->sendchkenable); + break; + + case TXCHK_CHG_TYPE_ENAB1: + /* + * (re)enable checking on a range; used by diags; just + * one buffer, but still written generically; read + * scratch to be sure buffer actually triggered, not + * just flushed from processor. 
+ */ + qib_read_kreg32(dd, kr_scratch); + for (i = start; i <= last; i++) + set_bit(i, dd->cspec->sendchkenable); + break; + + case TXCHK_CHG_TYPE_KERN: + /* usable by kernel */ + for (i = start; i <= last; i++) { + set_bit(i, dd->cspec->sendibchk); + clear_bit(i, dd->cspec->sendgrhchk); + } + spin_lock_irqsave(&dd->uctxt_lock, flags); + /* see if we need to raise avail update threshold */ + for (i = dd->first_user_ctxt; + dd->cspec->updthresh != dd->cspec->updthresh_dflt + && i < dd->cfgctxts; i++) + if (dd->rcd[i] && dd->rcd[i]->subctxt_cnt && + ((dd->rcd[i]->piocnt / dd->rcd[i]->subctxt_cnt) - 1) + < dd->cspec->updthresh_dflt) + break; + spin_unlock_irqrestore(&dd->uctxt_lock, flags); + if (i == dd->cfgctxts) { + spin_lock_irqsave(&dd->sendctrl_lock, flags); + dd->cspec->updthresh = dd->cspec->updthresh_dflt; + dd->sendctrl &= ~SYM_MASK(SendCtrl, AvailUpdThld); + dd->sendctrl |= (dd->cspec->updthresh & + SYM_RMASK(SendCtrl, AvailUpdThld)) << + SYM_LSB(SendCtrl, AvailUpdThld); + spin_unlock_irqrestore(&dd->sendctrl_lock, flags); + sendctrl_7322_mod(dd->pport, QIB_SENDCTRL_AVAIL_BLIP); + } + break; + + case TXCHK_CHG_TYPE_USER: + /* for user process */ + for (i = start; i <= last; i++) { + clear_bit(i, dd->cspec->sendibchk); + set_bit(i, dd->cspec->sendgrhchk); + } + spin_lock_irqsave(&dd->sendctrl_lock, flags); + if (rcd && rcd->subctxt_cnt && ((rcd->piocnt + / rcd->subctxt_cnt) - 1) < dd->cspec->updthresh) { + dd->cspec->updthresh = (rcd->piocnt / + rcd->subctxt_cnt) - 1; + dd->sendctrl &= ~SYM_MASK(SendCtrl, AvailUpdThld); + dd->sendctrl |= (dd->cspec->updthresh & + SYM_RMASK(SendCtrl, AvailUpdThld)) + << SYM_LSB(SendCtrl, AvailUpdThld); + spin_unlock_irqrestore(&dd->sendctrl_lock, flags); + sendctrl_7322_mod(dd->pport, QIB_SENDCTRL_AVAIL_BLIP); + } else + spin_unlock_irqrestore(&dd->sendctrl_lock, flags); + break; + + default: + break; + } + + for (i = start / BITS_PER_LONG; which >= 2 && i <= lastr; ++i) + qib_write_kreg(dd, kr_sendcheckmask + i, + dd->cspec->sendchkenable[i]); + + for (i = start / BITS_PER_LONG; which < 2 && i <= lastr; ++i) { + qib_write_kreg(dd, kr_sendgrhcheckmask + i, + dd->cspec->sendgrhchk[i]); + qib_write_kreg(dd, kr_sendibpktmask + i, + dd->cspec->sendibchk[i]); + } + + /* + * Be sure whatever we did was seen by the chip and acted upon, + * before we return. Mostly important for which >= 2. + */ + qib_read_kreg32(dd, kr_scratch); +} + + +/* useful for trigger analyzers, etc. */ +static void writescratch(struct qib_devdata *dd, u32 val) +{ + qib_write_kreg(dd, kr_scratch, val); +} + +/* Dummy for now, use chip regs soon */ +static int qib_7322_tempsense_rd(struct qib_devdata *dd, int regnum) +{ + return -ENXIO; +} + +/** + * qib_init_iba7322_funcs - set up the chip-specific function pointers + * @dev: the pci_dev for qlogic_ib device + * @ent: pci_device_id struct for this dev + * + * Also allocates, inits, and returns the devdata struct for this + * device instance + * + * This is global, and is called directly at init to set up the + * chip-specific function pointers for later use. 
+ */ +struct qib_devdata *qib_init_iba7322_funcs(struct pci_dev *pdev, + const struct pci_device_id *ent) +{ + struct qib_devdata *dd; + int ret, i; + u32 tabsize, actual_cnt = 0; + + dd = qib_alloc_devdata(pdev, + NUM_IB_PORTS * sizeof(struct qib_pportdata) + + sizeof(struct qib_chip_specific) + + NUM_IB_PORTS * sizeof(struct qib_chippport_specific)); + if (IS_ERR(dd)) + goto bail; + + dd->f_bringup_serdes = qib_7322_bringup_serdes; + dd->f_cleanup = qib_setup_7322_cleanup; + dd->f_clear_tids = qib_7322_clear_tids; + dd->f_free_irq = qib_7322_free_irq; + dd->f_get_base_info = qib_7322_get_base_info; + dd->f_get_msgheader = qib_7322_get_msgheader; + dd->f_getsendbuf = qib_7322_getsendbuf; + dd->f_gpio_mod = gpio_7322_mod; + dd->f_eeprom_wen = qib_7322_eeprom_wen; + dd->f_hdrqempty = qib_7322_hdrqempty; + dd->f_ib_updown = qib_7322_ib_updown; + dd->f_init_ctxt = qib_7322_init_ctxt; + dd->f_initvl15_bufs = qib_7322_initvl15_bufs; + dd->f_intr_fallback = qib_7322_intr_fallback; + dd->f_late_initreg = qib_late_7322_initreg; + dd->f_setpbc_control = qib_7322_setpbc_control; + dd->f_portcntr = qib_portcntr_7322; + dd->f_put_tid = qib_7322_put_tid; + dd->f_quiet_serdes = qib_7322_mini_quiet_serdes; + dd->f_rcvctrl = rcvctrl_7322_mod; + dd->f_read_cntrs = qib_read_7322cntrs; + dd->f_read_portcntrs = qib_read_7322portcntrs; + dd->f_reset = qib_do_7322_reset; + dd->f_init_sdma_regs = init_sdma_7322_regs; + dd->f_sdma_busy = qib_sdma_7322_busy; + dd->f_sdma_gethead = qib_sdma_7322_gethead; + dd->f_sdma_sendctrl = qib_7322_sdma_sendctrl; + dd->f_sdma_set_desc_cnt = qib_sdma_set_7322_desc_cnt; + dd->f_sdma_update_tail = qib_sdma_update_7322_tail; + dd->f_sendctrl = sendctrl_7322_mod; + dd->f_set_armlaunch = qib_set_7322_armlaunch; + dd->f_set_cntr_sample = qib_set_cntr_7322_sample; + dd->f_iblink_state = qib_7322_iblink_state; + dd->f_ibphys_portstate = qib_7322_phys_portstate; + dd->f_get_ib_cfg = qib_7322_get_ib_cfg; + dd->f_set_ib_cfg = qib_7322_set_ib_cfg; + dd->f_set_ib_loopback = qib_7322_set_loopback; + dd->f_get_ib_table = qib_7322_get_ib_table; + dd->f_set_ib_table = qib_7322_set_ib_table; + dd->f_set_intr_state = qib_7322_set_intr_state; + dd->f_setextled = qib_setup_7322_setextled; + dd->f_txchk_change = qib_7322_txchk_change; + dd->f_update_usrhead = qib_update_7322_usrhead; + dd->f_wantpiobuf_intr = qib_wantpiobuf_7322_intr; + dd->f_xgxs_reset = qib_7322_mini_pcs_reset; + dd->f_sdma_hw_clean_up = qib_7322_sdma_hw_clean_up; + dd->f_sdma_hw_start_up = qib_7322_sdma_hw_start_up; + dd->f_sdma_init_early = qib_7322_sdma_init_early; + dd->f_writescratch = writescratch; + dd->f_tempsense_rd = qib_7322_tempsense_rd; +#ifdef CONFIG_INFINIBAND_QIB_DCA + dd->f_notify_dca = qib_7322_notify_dca; +#endif + /* + * Do remaining PCIe setup and save PCIe values in dd. + * Any error printing is already done by the init code. + * On return, we have the chip mapped, but chip registers + * are not set up until start of qib_init_7322_variables. + */ + ret = qib_pcie_ddinit(dd, pdev, ent); + if (ret < 0) + goto bail_free; + + /* initialize chip-specific variables */ + ret = qib_init_7322_variables(dd); + if (ret) + goto bail_cleanup; + + if (qib_mini_init || !dd->num_pports) + goto bail; + + /* + * Determine number of vectors we want; depends on port count + * and number of configured kernel receive queues actually used. + * Should also depend on whether sdma is enabled or not, but + * that's such a rare testing case it's not worth worrying about. 
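+	 * (tabsize starts as first_user_ctxt + ARRAY_SIZE(irq_table)
+	 * and is then trimmed to the count of entries actually needed.)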
+ */ + tabsize = dd->first_user_ctxt + ARRAY_SIZE(irq_table); + for (i = 0; i < tabsize; i++) + if ((i < ARRAY_SIZE(irq_table) && + irq_table[i].port <= dd->num_pports) || + (i >= ARRAY_SIZE(irq_table) && + dd->rcd[i - ARRAY_SIZE(irq_table)])) + actual_cnt++; + /* reduce by ctxt's < 2 */ + if (qib_krcvq01_no_msi) + actual_cnt -= dd->num_pports; + + tabsize = actual_cnt; + dd->cspec->msix_entries = kzalloc(tabsize * + sizeof(struct qib_msix_entry), GFP_KERNEL); + if (!dd->cspec->msix_entries) { + qib_dev_err(dd, "No memory for MSIx table\n"); + tabsize = 0; + } + for (i = 0; i < tabsize; i++) + dd->cspec->msix_entries[i].msix.entry = i; + + if (qib_pcie_params(dd, 8, &tabsize, dd->cspec->msix_entries)) + qib_dev_err(dd, + "Failed to setup PCIe or interrupts; continuing anyway\n"); + /* may be less than we wanted, if not enough available */ + dd->cspec->num_msix_entries = tabsize; + + /* setup interrupt handler */ + qib_setup_7322_interrupt(dd, 1); + + /* clear diagctrl register, in case diags were running and crashed */ + qib_write_kreg(dd, kr_hwdiagctrl, 0); +#ifdef CONFIG_INFINIBAND_QIB_DCA + if (!dca_add_requester(&pdev->dev)) { + qib_devinfo(dd->pcidev, "DCA enabled\n"); + dd->flags |= QIB_DCA_ENABLED; + qib_setup_dca(dd); + } +#endif + goto bail; + +bail_cleanup: + qib_pcie_ddcleanup(dd); +bail_free: + qib_free_devdata(dd); + dd = ERR_PTR(ret); +bail: + return dd; +} + +/* + * Set the table entry at the specified index from the table specifed. + * There are 3 * TXDDS_TABLE_SZ entries in all per port, with the first + * TXDDS_TABLE_SZ for SDR, the next for DDR, and the last for QDR. + * 'idx' below addresses the correct entry, while its 4 LSBs select the + * corresponding entry (one of TXDDS_TABLE_SZ) from the selected table. + */ +#define DDS_ENT_AMP_LSB 14 +#define DDS_ENT_MAIN_LSB 9 +#define DDS_ENT_POST_LSB 5 +#define DDS_ENT_PRE_XTRA_LSB 3 +#define DDS_ENT_PRE_LSB 0 + +/* + * Set one entry in the TxDDS table for spec'd port + * ridx picks one of the entries, while tp points + * to the appropriate table entry. + */ +static void set_txdds(struct qib_pportdata *ppd, int ridx, + const struct txdds_ent *tp) +{ + struct qib_devdata *dd = ppd->dd; + u32 pack_ent; + int regidx; + + /* Get correct offset in chip-space, and in source table */ + regidx = KREG_IBPORT_IDX(IBSD_DDS_MAP_TABLE) + ridx; + /* + * We do not use qib_write_kreg_port() because it was intended + * only for registers in the lower "port specific" pages. + * So do index calculation by hand. 
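+	 * (For the second port, hw_pidx adds dd->palign / sizeof(u64)
+	 * to the register index, as computed below.)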
+ */ + if (ppd->hw_pidx) + regidx += (dd->palign / sizeof(u64)); + + pack_ent = tp->amp << DDS_ENT_AMP_LSB; + pack_ent |= tp->main << DDS_ENT_MAIN_LSB; + pack_ent |= tp->pre << DDS_ENT_PRE_LSB; + pack_ent |= tp->post << DDS_ENT_POST_LSB; + qib_write_kreg(dd, regidx, pack_ent); + /* Prevent back-to-back writes by hitting scratch */ + qib_write_kreg(ppd->dd, kr_scratch, 0); +} + +static const struct vendor_txdds_ent vendor_txdds[] = { + { /* Amphenol 1m 30awg NoEq */ + { 0x41, 0x50, 0x48 }, "584470002 ", + { 10, 0, 0, 5 }, { 10, 0, 0, 9 }, { 7, 1, 0, 13 }, + }, + { /* Amphenol 3m 28awg NoEq */ + { 0x41, 0x50, 0x48 }, "584470004 ", + { 0, 0, 0, 8 }, { 0, 0, 0, 11 }, { 0, 1, 7, 15 }, + }, + { /* Finisar 3m OM2 Optical */ + { 0x00, 0x90, 0x65 }, "FCBG410QB1C03-QL", + { 0, 0, 0, 3 }, { 0, 0, 0, 4 }, { 0, 0, 0, 13 }, + }, + { /* Finisar 30m OM2 Optical */ + { 0x00, 0x90, 0x65 }, "FCBG410QB1C30-QL", + { 0, 0, 0, 1 }, { 0, 0, 0, 5 }, { 0, 0, 0, 11 }, + }, + { /* Finisar Default OM2 Optical */ + { 0x00, 0x90, 0x65 }, NULL, + { 0, 0, 0, 2 }, { 0, 0, 0, 5 }, { 0, 0, 0, 12 }, + }, + { /* Gore 1m 30awg NoEq */ + { 0x00, 0x21, 0x77 }, "QSN3300-1 ", + { 0, 0, 0, 6 }, { 0, 0, 0, 9 }, { 0, 1, 0, 15 }, + }, + { /* Gore 2m 30awg NoEq */ + { 0x00, 0x21, 0x77 }, "QSN3300-2 ", + { 0, 0, 0, 8 }, { 0, 0, 0, 10 }, { 0, 1, 7, 15 }, + }, + { /* Gore 1m 28awg NoEq */ + { 0x00, 0x21, 0x77 }, "QSN3800-1 ", + { 0, 0, 0, 6 }, { 0, 0, 0, 8 }, { 0, 1, 0, 15 }, + }, + { /* Gore 3m 28awg NoEq */ + { 0x00, 0x21, 0x77 }, "QSN3800-3 ", + { 0, 0, 0, 9 }, { 0, 0, 0, 13 }, { 0, 1, 7, 15 }, + }, + { /* Gore 5m 24awg Eq */ + { 0x00, 0x21, 0x77 }, "QSN7000-5 ", + { 0, 0, 0, 7 }, { 0, 0, 0, 9 }, { 0, 1, 3, 15 }, + }, + { /* Gore 7m 24awg Eq */ + { 0x00, 0x21, 0x77 }, "QSN7000-7 ", + { 0, 0, 0, 9 }, { 0, 0, 0, 11 }, { 0, 2, 6, 15 }, + }, + { /* Gore 5m 26awg Eq */ + { 0x00, 0x21, 0x77 }, "QSN7600-5 ", + { 0, 0, 0, 8 }, { 0, 0, 0, 11 }, { 0, 1, 9, 13 }, + }, + { /* Gore 7m 26awg Eq */ + { 0x00, 0x21, 0x77 }, "QSN7600-7 ", + { 0, 0, 0, 8 }, { 0, 0, 0, 11 }, { 10, 1, 8, 15 }, + }, + { /* Intersil 12m 24awg Active */ + { 0x00, 0x30, 0xB4 }, "QLX4000CQSFP1224", + { 0, 0, 0, 2 }, { 0, 0, 0, 5 }, { 0, 3, 0, 9 }, + }, + { /* Intersil 10m 28awg Active */ + { 0x00, 0x30, 0xB4 }, "QLX4000CQSFP1028", + { 0, 0, 0, 6 }, { 0, 0, 0, 4 }, { 0, 2, 0, 2 }, + }, + { /* Intersil 7m 30awg Active */ + { 0x00, 0x30, 0xB4 }, "QLX4000CQSFP0730", + { 0, 0, 0, 6 }, { 0, 0, 0, 4 }, { 0, 1, 0, 3 }, + }, + { /* Intersil 5m 32awg Active */ + { 0x00, 0x30, 0xB4 }, "QLX4000CQSFP0532", + { 0, 0, 0, 6 }, { 0, 0, 0, 6 }, { 0, 2, 0, 8 }, + }, + { /* Intersil Default Active */ + { 0x00, 0x30, 0xB4 }, NULL, + { 0, 0, 0, 6 }, { 0, 0, 0, 5 }, { 0, 2, 0, 5 }, + }, + { /* Luxtera 20m Active Optical */ + { 0x00, 0x25, 0x63 }, NULL, + { 0, 0, 0, 5 }, { 0, 0, 0, 8 }, { 0, 2, 0, 12 }, + }, + { /* Molex 1M Cu loopback */ + { 0x00, 0x09, 0x3A }, "74763-0025 ", + { 2, 2, 6, 15 }, { 2, 2, 6, 15 }, { 2, 2, 6, 15 }, + }, + { /* Molex 2m 28awg NoEq */ + { 0x00, 0x09, 0x3A }, "74757-2201 ", + { 0, 0, 0, 6 }, { 0, 0, 0, 9 }, { 0, 1, 1, 15 }, + }, +}; + +static const struct txdds_ent txdds_sdr[TXDDS_TABLE_SZ] = { + /* amp, pre, main, post */ + { 2, 2, 15, 6 }, /* Loopback */ + { 0, 0, 0, 1 }, /* 2 dB */ + { 0, 0, 0, 2 }, /* 3 dB */ + { 0, 0, 0, 3 }, /* 4 dB */ + { 0, 0, 0, 4 }, /* 5 dB */ + { 0, 0, 0, 5 }, /* 6 dB */ + { 0, 0, 0, 6 }, /* 7 dB */ + { 0, 0, 0, 7 }, /* 8 dB */ + { 0, 0, 0, 8 }, /* 9 dB */ + { 0, 0, 0, 9 }, /* 10 dB */ + { 0, 0, 0, 10 }, /* 11 dB */ + { 0, 0, 0, 11 }, /* 12 dB */ 
+ { 0, 0, 0, 12 }, /* 13 dB */ + { 0, 0, 0, 13 }, /* 14 dB */ + { 0, 0, 0, 14 }, /* 15 dB */ + { 0, 0, 0, 15 }, /* 16 dB */ +}; + +static const struct txdds_ent txdds_ddr[TXDDS_TABLE_SZ] = { + /* amp, pre, main, post */ + { 2, 2, 15, 6 }, /* Loopback */ + { 0, 0, 0, 8 }, /* 2 dB */ + { 0, 0, 0, 8 }, /* 3 dB */ + { 0, 0, 0, 9 }, /* 4 dB */ + { 0, 0, 0, 9 }, /* 5 dB */ + { 0, 0, 0, 10 }, /* 6 dB */ + { 0, 0, 0, 10 }, /* 7 dB */ + { 0, 0, 0, 11 }, /* 8 dB */ + { 0, 0, 0, 11 }, /* 9 dB */ + { 0, 0, 0, 12 }, /* 10 dB */ + { 0, 0, 0, 12 }, /* 11 dB */ + { 0, 0, 0, 13 }, /* 12 dB */ + { 0, 0, 0, 13 }, /* 13 dB */ + { 0, 0, 0, 14 }, /* 14 dB */ + { 0, 0, 0, 14 }, /* 15 dB */ + { 0, 0, 0, 15 }, /* 16 dB */ +}; + +static const struct txdds_ent txdds_qdr[TXDDS_TABLE_SZ] = { + /* amp, pre, main, post */ + { 2, 2, 15, 6 }, /* Loopback */ + { 0, 1, 0, 7 }, /* 2 dB (also QMH7342) */ + { 0, 1, 0, 9 }, /* 3 dB (also QMH7342) */ + { 0, 1, 0, 11 }, /* 4 dB */ + { 0, 1, 0, 13 }, /* 5 dB */ + { 0, 1, 0, 15 }, /* 6 dB */ + { 0, 1, 3, 15 }, /* 7 dB */ + { 0, 1, 7, 15 }, /* 8 dB */ + { 0, 1, 7, 15 }, /* 9 dB */ + { 0, 1, 8, 15 }, /* 10 dB */ + { 0, 1, 9, 15 }, /* 11 dB */ + { 0, 1, 10, 15 }, /* 12 dB */ + { 0, 2, 6, 15 }, /* 13 dB */ + { 0, 2, 7, 15 }, /* 14 dB */ + { 0, 2, 8, 15 }, /* 15 dB */ + { 0, 2, 9, 15 }, /* 16 dB */ +}; + +/* + * extra entries for use with txselect, for indices >= TXDDS_TABLE_SZ. + * These are mostly used for mez cards going through connectors + * and backplane traces, but can be used to add other "unusual" + * table values as well. + */ +static const struct txdds_ent txdds_extra_sdr[TXDDS_EXTRA_SZ] = { + /* amp, pre, main, post */ + { 0, 0, 0, 1 }, /* QMH7342 backplane settings */ + { 0, 0, 0, 1 }, /* QMH7342 backplane settings */ + { 0, 0, 0, 2 }, /* QMH7342 backplane settings */ + { 0, 0, 0, 2 }, /* QMH7342 backplane settings */ + { 0, 0, 0, 3 }, /* QMH7342 backplane settings */ + { 0, 0, 0, 4 }, /* QMH7342 backplane settings */ + { 0, 1, 4, 15 }, /* QME7342 backplane settings 1.0 */ + { 0, 1, 3, 15 }, /* QME7342 backplane settings 1.0 */ + { 0, 1, 0, 12 }, /* QME7342 backplane settings 1.0 */ + { 0, 1, 0, 11 }, /* QME7342 backplane settings 1.0 */ + { 0, 1, 0, 9 }, /* QME7342 backplane settings 1.0 */ + { 0, 1, 0, 14 }, /* QME7342 backplane settings 1.0 */ + { 0, 1, 2, 15 }, /* QME7342 backplane settings 1.0 */ + { 0, 1, 0, 11 }, /* QME7342 backplane settings 1.1 */ + { 0, 1, 0, 7 }, /* QME7342 backplane settings 1.1 */ + { 0, 1, 0, 9 }, /* QME7342 backplane settings 1.1 */ + { 0, 1, 0, 6 }, /* QME7342 backplane settings 1.1 */ + { 0, 1, 0, 8 }, /* QME7342 backplane settings 1.1 */ +}; + +static const struct txdds_ent txdds_extra_ddr[TXDDS_EXTRA_SZ] = { + /* amp, pre, main, post */ + { 0, 0, 0, 7 }, /* QMH7342 backplane settings */ + { 0, 0, 0, 7 }, /* QMH7342 backplane settings */ + { 0, 0, 0, 8 }, /* QMH7342 backplane settings */ + { 0, 0, 0, 8 }, /* QMH7342 backplane settings */ + { 0, 0, 0, 9 }, /* QMH7342 backplane settings */ + { 0, 0, 0, 10 }, /* QMH7342 backplane settings */ + { 0, 1, 4, 15 }, /* QME7342 backplane settings 1.0 */ + { 0, 1, 3, 15 }, /* QME7342 backplane settings 1.0 */ + { 0, 1, 0, 12 }, /* QME7342 backplane settings 1.0 */ + { 0, 1, 0, 11 }, /* QME7342 backplane settings 1.0 */ + { 0, 1, 0, 9 }, /* QME7342 backplane settings 1.0 */ + { 0, 1, 0, 14 }, /* QME7342 backplane settings 1.0 */ + { 0, 1, 2, 15 }, /* QME7342 backplane settings 1.0 */ + { 0, 1, 0, 11 }, /* QME7342 backplane settings 1.1 */ + { 0, 1, 0, 7 }, /* QME7342 backplane settings 1.1 */ + { 0, 
1, 0, 9 }, /* QME7342 backplane settings 1.1 */ + { 0, 1, 0, 6 }, /* QME7342 backplane settings 1.1 */ + { 0, 1, 0, 8 }, /* QME7342 backplane settings 1.1 */ +}; + +static const struct txdds_ent txdds_extra_qdr[TXDDS_EXTRA_SZ] = { + /* amp, pre, main, post */ + { 0, 1, 0, 4 }, /* QMH7342 backplane settings */ + { 0, 1, 0, 5 }, /* QMH7342 backplane settings */ + { 0, 1, 0, 6 }, /* QMH7342 backplane settings */ + { 0, 1, 0, 8 }, /* QMH7342 backplane settings */ + { 0, 1, 0, 10 }, /* QMH7342 backplane settings */ + { 0, 1, 0, 12 }, /* QMH7342 backplane settings */ + { 0, 1, 4, 15 }, /* QME7342 backplane settings 1.0 */ + { 0, 1, 3, 15 }, /* QME7342 backplane settings 1.0 */ + { 0, 1, 0, 12 }, /* QME7342 backplane settings 1.0 */ + { 0, 1, 0, 11 }, /* QME7342 backplane settings 1.0 */ + { 0, 1, 0, 9 }, /* QME7342 backplane settings 1.0 */ + { 0, 1, 0, 14 }, /* QME7342 backplane settings 1.0 */ + { 0, 1, 2, 15 }, /* QME7342 backplane settings 1.0 */ + { 0, 1, 0, 11 }, /* QME7342 backplane settings 1.1 */ + { 0, 1, 0, 7 }, /* QME7342 backplane settings 1.1 */ + { 0, 1, 0, 9 }, /* QME7342 backplane settings 1.1 */ + { 0, 1, 0, 6 }, /* QME7342 backplane settings 1.1 */ + { 0, 1, 0, 8 }, /* QME7342 backplane settings 1.1 */ +}; + +static const struct txdds_ent txdds_extra_mfg[TXDDS_MFG_SZ] = { + /* amp, pre, main, post */ + { 0, 0, 0, 0 }, /* QME7342 mfg settings */ + { 0, 0, 0, 6 }, /* QME7342 P2 mfg settings */ +}; + +static const struct txdds_ent *get_atten_table(const struct txdds_ent *txdds, + unsigned atten) +{ + /* + * The attenuation table starts at 2dB for entry 1, + * with entry 0 being the loopback entry. + */ + if (atten <= 2) + atten = 1; + else if (atten > TXDDS_TABLE_SZ) + atten = TXDDS_TABLE_SZ - 1; + else + atten--; + return txdds + atten; +} + +/* + * if override is set, the module parameter txselect has a value + * for this specific port, so use it, rather than our normal mechanism. + */ +static void find_best_ent(struct qib_pportdata *ppd, + const struct txdds_ent **sdr_dds, + const struct txdds_ent **ddr_dds, + const struct txdds_ent **qdr_dds, int override) +{ + struct qib_qsfp_cache *qd = &ppd->cpspec->qsfp_data.cache; + int idx; + + /* Search table of known cables */ + for (idx = 0; !override && idx < ARRAY_SIZE(vendor_txdds); ++idx) { + const struct vendor_txdds_ent *v = vendor_txdds + idx; + + if (!memcmp(v->oui, qd->oui, QSFP_VOUI_LEN) && + (!v->partnum || + !memcmp(v->partnum, qd->partnum, QSFP_PN_LEN))) { + *sdr_dds = &v->sdr; + *ddr_dds = &v->ddr; + *qdr_dds = &v->qdr; + return; + } + } + + /* Active cables don't have attenuation so we only set SERDES + * settings to account for the attenuation of the board traces. */ + if (!override && QSFP_IS_ACTIVE(qd->tech)) { + *sdr_dds = txdds_sdr + ppd->dd->board_atten; + *ddr_dds = txdds_ddr + ppd->dd->board_atten; + *qdr_dds = txdds_qdr + ppd->dd->board_atten; + return; + } + + if (!override && QSFP_HAS_ATTEN(qd->tech) && (qd->atten[0] || + qd->atten[1])) { + *sdr_dds = get_atten_table(txdds_sdr, qd->atten[0]); + *ddr_dds = get_atten_table(txdds_ddr, qd->atten[0]); + *qdr_dds = get_atten_table(txdds_qdr, qd->atten[1]); + return; + } else if (ppd->cpspec->no_eep < TXDDS_TABLE_SZ) { + /* + * If we have no (or incomplete) data from the cable + * EEPROM, or no QSFP, or override is set, use the + * module parameter value to index into the attentuation + * table. 
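+		 * (no_eep values below TXDDS_TABLE_SZ use the main
+		 * SDR/DDR/QDR tables; larger values fall through to the
+		 * "extra" and mfg tables handled below.)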
+ */ + idx = ppd->cpspec->no_eep; + *sdr_dds = &txdds_sdr[idx]; + *ddr_dds = &txdds_ddr[idx]; + *qdr_dds = &txdds_qdr[idx]; + } else if (ppd->cpspec->no_eep < (TXDDS_TABLE_SZ + TXDDS_EXTRA_SZ)) { + /* similar to above, but index into the "extra" table. */ + idx = ppd->cpspec->no_eep - TXDDS_TABLE_SZ; + *sdr_dds = &txdds_extra_sdr[idx]; + *ddr_dds = &txdds_extra_ddr[idx]; + *qdr_dds = &txdds_extra_qdr[idx]; + } else if ((IS_QME(ppd->dd) || IS_QMH(ppd->dd)) && + ppd->cpspec->no_eep < (TXDDS_TABLE_SZ + TXDDS_EXTRA_SZ + + TXDDS_MFG_SZ)) { + idx = ppd->cpspec->no_eep - (TXDDS_TABLE_SZ + TXDDS_EXTRA_SZ); + pr_info("IB%u:%u use idx %u into txdds_mfg\n", + ppd->dd->unit, ppd->port, idx); + *sdr_dds = &txdds_extra_mfg[idx]; + *ddr_dds = &txdds_extra_mfg[idx]; + *qdr_dds = &txdds_extra_mfg[idx]; + } else { + /* this shouldn't happen, it's range checked */ + *sdr_dds = txdds_sdr + qib_long_atten; + *ddr_dds = txdds_ddr + qib_long_atten; + *qdr_dds = txdds_qdr + qib_long_atten; + } +} + +static void init_txdds_table(struct qib_pportdata *ppd, int override) +{ + const struct txdds_ent *sdr_dds, *ddr_dds, *qdr_dds; + struct txdds_ent *dds; + int idx; + int single_ent = 0; + + find_best_ent(ppd, &sdr_dds, &ddr_dds, &qdr_dds, override); + + /* for mez cards or override, use the selected value for all entries */ + if (!(ppd->dd->flags & QIB_HAS_QSFP) || override) + single_ent = 1; + + /* Fill in the first entry with the best entry found. */ + set_txdds(ppd, 0, sdr_dds); + set_txdds(ppd, TXDDS_TABLE_SZ, ddr_dds); + set_txdds(ppd, 2 * TXDDS_TABLE_SZ, qdr_dds); + if (ppd->lflags & (QIBL_LINKINIT | QIBL_LINKARMED | + QIBL_LINKACTIVE)) { + dds = (struct txdds_ent *)(ppd->link_speed_active == + QIB_IB_QDR ? qdr_dds : + (ppd->link_speed_active == + QIB_IB_DDR ? ddr_dds : sdr_dds)); + write_tx_serdes_param(ppd, dds); + } + + /* Fill in the remaining entries with the default table values. */ + for (idx = 1; idx < ARRAY_SIZE(txdds_sdr); ++idx) { + set_txdds(ppd, idx, single_ent ? sdr_dds : txdds_sdr + idx); + set_txdds(ppd, idx + TXDDS_TABLE_SZ, + single_ent ? ddr_dds : txdds_ddr + idx); + set_txdds(ppd, idx + 2 * TXDDS_TABLE_SZ, + single_ent ? qdr_dds : txdds_qdr + idx); + } +} + +#define KR_AHB_ACC KREG_IDX(ahb_access_ctrl) +#define KR_AHB_TRANS KREG_IDX(ahb_transaction_reg) +#define AHB_TRANS_RDY SYM_MASK(ahb_transaction_reg, ahb_rdy) +#define AHB_ADDR_LSB SYM_LSB(ahb_transaction_reg, ahb_address) +#define AHB_DATA_LSB SYM_LSB(ahb_transaction_reg, ahb_data) +#define AHB_WR SYM_MASK(ahb_transaction_reg, write_not_read) +#define AHB_TRANS_TRIES 10 + +/* + * The chan argument is 0=chan0, 1=chan1, 2=pll, 3=chan2, 4=chan4, + * 5=subsystem which is why most calls have "chan + chan >> 1" + * for the channel argument. + */ +static u32 ahb_mod(struct qib_devdata *dd, int quad, int chan, int addr, + u32 data, u32 mask) +{ + u32 rd_data, wr_data, sz_mask; + u64 trans, acc, prev_acc; + u32 ret = 0xBAD0BAD; + int tries; + + prev_acc = qib_read_kreg64(dd, KR_AHB_ACC); + /* From this point on, make sure we return access */ + acc = (quad << 1) | 1; + qib_write_kreg(dd, KR_AHB_ACC, acc); + + for (tries = 1; tries < AHB_TRANS_TRIES; ++tries) { + trans = qib_read_kreg64(dd, KR_AHB_TRANS); + if (trans & AHB_TRANS_RDY) + break; + } + if (tries >= AHB_TRANS_TRIES) { + qib_dev_err(dd, "No ahb_rdy in %d tries\n", AHB_TRANS_TRIES); + goto bail; + } + + /* If mask is not all 1s, we need to read, but different SerDes + * entities have different sizes + */ + sz_mask = (1UL << ((quad == 1) ? 
32 : 16)) - 1; + wr_data = data & mask & sz_mask; + if ((~mask & sz_mask) != 0) { + trans = ((chan << 6) | addr) << (AHB_ADDR_LSB + 1); + qib_write_kreg(dd, KR_AHB_TRANS, trans); + + for (tries = 1; tries < AHB_TRANS_TRIES; ++tries) { + trans = qib_read_kreg64(dd, KR_AHB_TRANS); + if (trans & AHB_TRANS_RDY) + break; + } + if (tries >= AHB_TRANS_TRIES) { + qib_dev_err(dd, "No Rd ahb_rdy in %d tries\n", + AHB_TRANS_TRIES); + goto bail; + } + /* Re-read in case host split reads and read data first */ + trans = qib_read_kreg64(dd, KR_AHB_TRANS); + rd_data = (uint32_t)(trans >> AHB_DATA_LSB); + wr_data |= (rd_data & ~mask & sz_mask); + } + + /* If mask is not zero, we need to write. */ + if (mask & sz_mask) { + trans = ((chan << 6) | addr) << (AHB_ADDR_LSB + 1); + trans |= ((uint64_t)wr_data << AHB_DATA_LSB); + trans |= AHB_WR; + qib_write_kreg(dd, KR_AHB_TRANS, trans); + + for (tries = 1; tries < AHB_TRANS_TRIES; ++tries) { + trans = qib_read_kreg64(dd, KR_AHB_TRANS); + if (trans & AHB_TRANS_RDY) + break; + } + if (tries >= AHB_TRANS_TRIES) { + qib_dev_err(dd, "No Wr ahb_rdy in %d tries\n", + AHB_TRANS_TRIES); + goto bail; + } + } + ret = wr_data; +bail: + qib_write_kreg(dd, KR_AHB_ACC, prev_acc); + return ret; +} + +static void ibsd_wr_allchans(struct qib_pportdata *ppd, int addr, unsigned data, + unsigned mask) +{ + struct qib_devdata *dd = ppd->dd; + int chan; + u32 rbc; + + for (chan = 0; chan < SERDES_CHANS; ++chan) { + ahb_mod(dd, IBSD(ppd->hw_pidx), (chan + (chan >> 1)), addr, + data, mask); + rbc = ahb_mod(dd, IBSD(ppd->hw_pidx), (chan + (chan >> 1)), + addr, 0, 0); + } +} + +static void serdes_7322_los_enable(struct qib_pportdata *ppd, int enable) +{ + u64 data = qib_read_kreg_port(ppd, krp_serdesctrl); + u8 state = SYM_FIELD(data, IBSerdesCtrl_0, RXLOSEN); + + if (enable && !state) { + pr_info("IB%u:%u Turning LOS on\n", + ppd->dd->unit, ppd->port); + data |= SYM_MASK(IBSerdesCtrl_0, RXLOSEN); + } else if (!enable && state) { + pr_info("IB%u:%u Turning LOS off\n", + ppd->dd->unit, ppd->port); + data &= ~SYM_MASK(IBSerdesCtrl_0, RXLOSEN); + } + qib_write_kreg_port(ppd, krp_serdesctrl, data); +} + +static int serdes_7322_init(struct qib_pportdata *ppd) +{ + int ret = 0; + + if (ppd->dd->cspec->r1) + ret = serdes_7322_init_old(ppd); + else + ret = serdes_7322_init_new(ppd); + return ret; +} + +static int serdes_7322_init_old(struct qib_pportdata *ppd) +{ + u32 le_val; + + /* + * Initialize the Tx DDS tables. Also done every QSFP event, + * for adapters with QSFP + */ + init_txdds_table(ppd, 0); + + /* ensure no tx overrides from earlier driver loads */ + qib_write_kreg_port(ppd, krp_tx_deemph_override, + SYM_MASK(IBSD_TX_DEEMPHASIS_OVERRIDE_0, + reset_tx_deemphasis_override)); + + /* Patch some SerDes defaults to "Better for IB" */ + /* Timing Loop Bandwidth: cdr_timing[11:9] = 0 */ + ibsd_wr_allchans(ppd, 2, 0, BMASK(11, 9)); + + /* Termination: rxtermctrl_r2d addr 11 bits [12:11] = 1 */ + ibsd_wr_allchans(ppd, 11, (1 << 11), BMASK(12, 11)); + /* Enable LE2: rxle2en_r2a addr 13 bit [6] = 1 */ + ibsd_wr_allchans(ppd, 13, (1 << 6), (1 << 6)); + + /* May be overridden in qsfp_7322_event */ + le_val = IS_QME(ppd->dd) ? LE2_QME : LE2_DEFAULT; + ibsd_wr_allchans(ppd, 13, (le_val << 7), BMASK(9, 7)); + + /* enable LE1 adaptation for all but QME, which is disabled */ + le_val = IS_QME(ppd->dd) ? 
0 : 1; + ibsd_wr_allchans(ppd, 13, (le_val << 5), (1 << 5)); + + /* Clear cmode-override, may be set from older driver */ + ahb_mod(ppd->dd, IBSD(ppd->hw_pidx), 5, 10, 0 << 14, 1 << 14); + + /* Timing Recovery: rxtapsel addr 5 bits [9:8] = 0 */ + ibsd_wr_allchans(ppd, 5, (0 << 8), BMASK(9, 8)); + + /* setup LoS params; these are subsystem, so chan == 5 */ + /* LoS filter threshold_count on, ch 0-3, set to 8 */ + ahb_mod(ppd->dd, IBSD(ppd->hw_pidx), 5, 5, 8 << 11, BMASK(14, 11)); + ahb_mod(ppd->dd, IBSD(ppd->hw_pidx), 5, 7, 8 << 4, BMASK(7, 4)); + ahb_mod(ppd->dd, IBSD(ppd->hw_pidx), 5, 8, 8 << 11, BMASK(14, 11)); + ahb_mod(ppd->dd, IBSD(ppd->hw_pidx), 5, 10, 8 << 4, BMASK(7, 4)); + + /* LoS filter threshold_count off, ch 0-3, set to 4 */ + ahb_mod(ppd->dd, IBSD(ppd->hw_pidx), 5, 6, 4 << 0, BMASK(3, 0)); + ahb_mod(ppd->dd, IBSD(ppd->hw_pidx), 5, 7, 4 << 8, BMASK(11, 8)); + ahb_mod(ppd->dd, IBSD(ppd->hw_pidx), 5, 9, 4 << 0, BMASK(3, 0)); + ahb_mod(ppd->dd, IBSD(ppd->hw_pidx), 5, 10, 4 << 8, BMASK(11, 8)); + + /* LoS filter select enabled */ + ahb_mod(ppd->dd, IBSD(ppd->hw_pidx), 5, 9, 1 << 15, 1 << 15); + + /* LoS target data: SDR=4, DDR=2, QDR=1 */ + ibsd_wr_allchans(ppd, 14, (1 << 3), BMASK(5, 3)); /* QDR */ + ibsd_wr_allchans(ppd, 20, (2 << 10), BMASK(12, 10)); /* DDR */ + ibsd_wr_allchans(ppd, 20, (4 << 13), BMASK(15, 13)); /* SDR */ + + serdes_7322_los_enable(ppd, 1); + + /* rxbistena; set 0 to avoid effects of it switch later */ + ibsd_wr_allchans(ppd, 9, 0 << 15, 1 << 15); + + /* Configure 4 DFE taps, and only they adapt */ + ibsd_wr_allchans(ppd, 16, 0 << 0, BMASK(1, 0)); + + /* gain hi stop 32 (22) (6:1) lo stop 7 (10:7) target 22 (13) (15:11) */ + le_val = (ppd->dd->cspec->r1 || IS_QME(ppd->dd)) ? 0xb6c0 : 0x6bac; + ibsd_wr_allchans(ppd, 21, le_val, 0xfffe); + + /* + * Set receive adaptation mode. SDR and DDR adaptation are + * always on, and QDR is initially enabled; later disabled. + */ + qib_write_kreg_port(ppd, krp_static_adapt_dis(0), 0ULL); + qib_write_kreg_port(ppd, krp_static_adapt_dis(1), 0ULL); + qib_write_kreg_port(ppd, krp_static_adapt_dis(2), + ppd->dd->cspec->r1 ? 
+ QDR_STATIC_ADAPT_DOWN_R1 : QDR_STATIC_ADAPT_DOWN); + ppd->cpspec->qdr_dfe_on = 1; + + /* FLoop LOS gate: PPM filter enabled */ + ibsd_wr_allchans(ppd, 38, 0 << 10, 1 << 10); + + /* rx offset center enabled */ + ibsd_wr_allchans(ppd, 12, 1 << 4, 1 << 4); + + if (!ppd->dd->cspec->r1) { + ibsd_wr_allchans(ppd, 12, 1 << 12, 1 << 12); + ibsd_wr_allchans(ppd, 12, 2 << 8, 0x0f << 8); + } + + /* Set the frequency loop bandwidth to 15 */ + ibsd_wr_allchans(ppd, 2, 15 << 5, BMASK(8, 5)); + + return 0; +} + +static int serdes_7322_init_new(struct qib_pportdata *ppd) +{ + unsigned long tend; + u32 le_val, rxcaldone; + int chan, chan_done = (1 << SERDES_CHANS) - 1; + + /* Clear cmode-override, may be set from older driver */ + ahb_mod(ppd->dd, IBSD(ppd->hw_pidx), 5, 10, 0 << 14, 1 << 14); + + /* ensure no tx overrides from earlier driver loads */ + qib_write_kreg_port(ppd, krp_tx_deemph_override, + SYM_MASK(IBSD_TX_DEEMPHASIS_OVERRIDE_0, + reset_tx_deemphasis_override)); + + /* START OF LSI SUGGESTED SERDES BRINGUP */ + /* Reset - Calibration Setup */ + /* Stop DFE adaptaion */ + ibsd_wr_allchans(ppd, 1, 0, BMASK(9, 1)); + /* Disable LE1 */ + ibsd_wr_allchans(ppd, 13, 0, BMASK(5, 5)); + /* Disable autoadapt for LE1 */ + ibsd_wr_allchans(ppd, 1, 0, BMASK(15, 15)); + /* Disable LE2 */ + ibsd_wr_allchans(ppd, 13, 0, BMASK(6, 6)); + /* Disable VGA */ + ibsd_wr_allchans(ppd, 5, 0, BMASK(0, 0)); + /* Disable AFE Offset Cancel */ + ibsd_wr_allchans(ppd, 12, 0, BMASK(12, 12)); + /* Disable Timing Loop */ + ibsd_wr_allchans(ppd, 2, 0, BMASK(3, 3)); + /* Disable Frequency Loop */ + ibsd_wr_allchans(ppd, 2, 0, BMASK(4, 4)); + /* Disable Baseline Wander Correction */ + ibsd_wr_allchans(ppd, 13, 0, BMASK(13, 13)); + /* Disable RX Calibration */ + ibsd_wr_allchans(ppd, 4, 0, BMASK(10, 10)); + /* Disable RX Offset Calibration */ + ibsd_wr_allchans(ppd, 12, 0, BMASK(4, 4)); + /* Select BB CDR */ + ibsd_wr_allchans(ppd, 2, (1 << 15), BMASK(15, 15)); + /* CDR Step Size */ + ibsd_wr_allchans(ppd, 5, 0, BMASK(9, 8)); + /* Enable phase Calibration */ + ibsd_wr_allchans(ppd, 12, (1 << 5), BMASK(5, 5)); + /* DFE Bandwidth [2:14-12] */ + ibsd_wr_allchans(ppd, 2, (4 << 12), BMASK(14, 12)); + /* DFE Config (4 taps only) */ + ibsd_wr_allchans(ppd, 16, 0, BMASK(1, 0)); + /* Gain Loop Bandwidth */ + if (!ppd->dd->cspec->r1) { + ibsd_wr_allchans(ppd, 12, 1 << 12, BMASK(12, 12)); + ibsd_wr_allchans(ppd, 12, 2 << 8, BMASK(11, 8)); + } else { + ibsd_wr_allchans(ppd, 19, (3 << 11), BMASK(13, 11)); + } + /* Baseline Wander Correction Gain [13:4-0] (leave as default) */ + /* Baseline Wander Correction Gain [3:7-5] (leave as default) */ + /* Data Rate Select [5:7-6] (leave as default) */ + /* RX Parallel Word Width [3:10-8] (leave as default) */ + + /* RX REST */ + /* Single- or Multi-channel reset */ + /* RX Analog reset */ + /* RX Digital reset */ + ibsd_wr_allchans(ppd, 0, 0, BMASK(15, 13)); + msleep(20); + /* RX Analog reset */ + ibsd_wr_allchans(ppd, 0, (1 << 14), BMASK(14, 14)); + msleep(20); + /* RX Digital reset */ + ibsd_wr_allchans(ppd, 0, (1 << 13), BMASK(13, 13)); + msleep(20); + + /* setup LoS params; these are subsystem, so chan == 5 */ + /* LoS filter threshold_count on, ch 0-3, set to 8 */ + ahb_mod(ppd->dd, IBSD(ppd->hw_pidx), 5, 5, 8 << 11, BMASK(14, 11)); + ahb_mod(ppd->dd, IBSD(ppd->hw_pidx), 5, 7, 8 << 4, BMASK(7, 4)); + ahb_mod(ppd->dd, IBSD(ppd->hw_pidx), 5, 8, 8 << 11, BMASK(14, 11)); + ahb_mod(ppd->dd, IBSD(ppd->hw_pidx), 5, 10, 8 << 4, BMASK(7, 4)); + + /* LoS filter threshold_count off, ch 0-3, set to 4 */ + 
ahb_mod(ppd->dd, IBSD(ppd->hw_pidx), 5, 6, 4 << 0, BMASK(3, 0)); + ahb_mod(ppd->dd, IBSD(ppd->hw_pidx), 5, 7, 4 << 8, BMASK(11, 8)); + ahb_mod(ppd->dd, IBSD(ppd->hw_pidx), 5, 9, 4 << 0, BMASK(3, 0)); + ahb_mod(ppd->dd, IBSD(ppd->hw_pidx), 5, 10, 4 << 8, BMASK(11, 8)); + + /* LoS filter select enabled */ + ahb_mod(ppd->dd, IBSD(ppd->hw_pidx), 5, 9, 1 << 15, 1 << 15); + + /* LoS target data: SDR=4, DDR=2, QDR=1 */ + ibsd_wr_allchans(ppd, 14, (1 << 3), BMASK(5, 3)); /* QDR */ + ibsd_wr_allchans(ppd, 20, (2 << 10), BMASK(12, 10)); /* DDR */ + ibsd_wr_allchans(ppd, 20, (4 << 13), BMASK(15, 13)); /* SDR */ + + /* Turn on LOS on initial SERDES init */ + serdes_7322_los_enable(ppd, 1); + /* FLoop LOS gate: PPM filter enabled */ + ibsd_wr_allchans(ppd, 38, 0 << 10, 1 << 10); + + /* RX LATCH CALIBRATION */ + /* Enable Eyefinder Phase Calibration latch */ + ibsd_wr_allchans(ppd, 15, 1, BMASK(0, 0)); + /* Enable RX Offset Calibration latch */ + ibsd_wr_allchans(ppd, 12, (1 << 4), BMASK(4, 4)); + msleep(20); + /* Start Calibration */ + ibsd_wr_allchans(ppd, 4, (1 << 10), BMASK(10, 10)); + tend = jiffies + msecs_to_jiffies(500); + while (chan_done && !time_is_before_jiffies(tend)) { + msleep(20); + for (chan = 0; chan < SERDES_CHANS; ++chan) { + rxcaldone = ahb_mod(ppd->dd, IBSD(ppd->hw_pidx), + (chan + (chan >> 1)), + 25, 0, 0); + if ((~rxcaldone & (u32)BMASK(9, 9)) == 0 && + (~chan_done & (1 << chan)) == 0) + chan_done &= ~(1 << chan); + } + } + if (chan_done) { + pr_info("Serdes %d calibration not done after .5 sec: 0x%x\n", + IBSD(ppd->hw_pidx), chan_done); + } else { + for (chan = 0; chan < SERDES_CHANS; ++chan) { + rxcaldone = ahb_mod(ppd->dd, IBSD(ppd->hw_pidx), + (chan + (chan >> 1)), + 25, 0, 0); + if ((~rxcaldone & (u32)BMASK(10, 10)) == 0) + pr_info("Serdes %d chan %d calibration failed\n", + IBSD(ppd->hw_pidx), chan); + } + } + + /* Turn off Calibration */ + ibsd_wr_allchans(ppd, 4, 0, BMASK(10, 10)); + msleep(20); + + /* BRING RX UP */ + /* Set LE2 value (May be overridden in qsfp_7322_event) */ + le_val = IS_QME(ppd->dd) ? LE2_QME : LE2_DEFAULT; + ibsd_wr_allchans(ppd, 13, (le_val << 7), BMASK(9, 7)); + /* Set LE2 Loop bandwidth */ + ibsd_wr_allchans(ppd, 3, (7 << 5), BMASK(7, 5)); + /* Enable LE2 */ + ibsd_wr_allchans(ppd, 13, (1 << 6), BMASK(6, 6)); + msleep(20); + /* Enable H0 only */ + ibsd_wr_allchans(ppd, 1, 1, BMASK(9, 1)); + /* gain hi stop 32 (22) (6:1) lo stop 7 (10:7) target 22 (13) (15:11) */ + le_val = (ppd->dd->cspec->r1 || IS_QME(ppd->dd)) ? 0xb6c0 : 0x6bac; + ibsd_wr_allchans(ppd, 21, le_val, 0xfffe); + /* Enable VGA */ + ibsd_wr_allchans(ppd, 5, 0, BMASK(0, 0)); + msleep(20); + /* Set Frequency Loop Bandwidth */ + ibsd_wr_allchans(ppd, 2, (15 << 5), BMASK(8, 5)); + /* Enable Frequency Loop */ + ibsd_wr_allchans(ppd, 2, (1 << 4), BMASK(4, 4)); + /* Set Timing Loop Bandwidth */ + ibsd_wr_allchans(ppd, 2, 0, BMASK(11, 9)); + /* Enable Timing Loop */ + ibsd_wr_allchans(ppd, 2, (1 << 3), BMASK(3, 3)); + msleep(50); + /* Enable DFE + * Set receive adaptation mode. SDR and DDR adaptation are + * always on, and QDR is initially enabled; later disabled. + */ + qib_write_kreg_port(ppd, krp_static_adapt_dis(0), 0ULL); + qib_write_kreg_port(ppd, krp_static_adapt_dis(1), 0ULL); + qib_write_kreg_port(ppd, krp_static_adapt_dis(2), + ppd->dd->cspec->r1 ? 
+ QDR_STATIC_ADAPT_DOWN_R1 : QDR_STATIC_ADAPT_DOWN); + ppd->cpspec->qdr_dfe_on = 1; + /* Disable LE1 */ + ibsd_wr_allchans(ppd, 13, (0 << 5), (1 << 5)); + /* Disable auto adapt for LE1 */ + ibsd_wr_allchans(ppd, 1, (0 << 15), BMASK(15, 15)); + msleep(20); + /* Enable AFE Offset Cancel */ + ibsd_wr_allchans(ppd, 12, (1 << 12), BMASK(12, 12)); + /* Enable Baseline Wander Correction */ + ibsd_wr_allchans(ppd, 12, (1 << 13), BMASK(13, 13)); + /* Termination: rxtermctrl_r2d addr 11 bits [12:11] = 1 */ + ibsd_wr_allchans(ppd, 11, (1 << 11), BMASK(12, 11)); + /* VGA output common mode */ + ibsd_wr_allchans(ppd, 12, (3 << 2), BMASK(3, 2)); + + /* + * Initialize the Tx DDS tables. Also done every QSFP event, + * for adapters with QSFP + */ + init_txdds_table(ppd, 0); + + return 0; +} + +/* start adjust QMH serdes parameters */ + +static void set_man_code(struct qib_pportdata *ppd, int chan, int code) +{ + ahb_mod(ppd->dd, IBSD(ppd->hw_pidx), (chan + (chan >> 1)), + 9, code << 9, 0x3f << 9); +} + +static void set_man_mode_h1(struct qib_pportdata *ppd, int chan, + int enable, u32 tapenable) +{ + if (enable) + ahb_mod(ppd->dd, IBSD(ppd->hw_pidx), (chan + (chan >> 1)), + 1, 3 << 10, 0x1f << 10); + else + ahb_mod(ppd->dd, IBSD(ppd->hw_pidx), (chan + (chan >> 1)), + 1, 0, 0x1f << 10); +} + +/* Set clock to 1, 0, 1, 0 */ +static void clock_man(struct qib_pportdata *ppd, int chan) +{ + ahb_mod(ppd->dd, IBSD(ppd->hw_pidx), (chan + (chan >> 1)), + 4, 0x4000, 0x4000); + ahb_mod(ppd->dd, IBSD(ppd->hw_pidx), (chan + (chan >> 1)), + 4, 0, 0x4000); + ahb_mod(ppd->dd, IBSD(ppd->hw_pidx), (chan + (chan >> 1)), + 4, 0x4000, 0x4000); + ahb_mod(ppd->dd, IBSD(ppd->hw_pidx), (chan + (chan >> 1)), + 4, 0, 0x4000); +} + +/* + * write the current Tx serdes pre,post,main,amp settings into the serdes. + * The caller must pass the settings appropriate for the current speed, + * or not care if they are correct for the current speed. + */ +static void write_tx_serdes_param(struct qib_pportdata *ppd, + struct txdds_ent *txdds) +{ + u64 deemph; + + deemph = qib_read_kreg_port(ppd, krp_tx_deemph_override); + /* field names for amp, main, post, pre, respectively */ + deemph &= ~(SYM_MASK(IBSD_TX_DEEMPHASIS_OVERRIDE_0, txampcntl_d2a) | + SYM_MASK(IBSD_TX_DEEMPHASIS_OVERRIDE_0, txc0_ena) | + SYM_MASK(IBSD_TX_DEEMPHASIS_OVERRIDE_0, txcp1_ena) | + SYM_MASK(IBSD_TX_DEEMPHASIS_OVERRIDE_0, txcn1_ena)); + + deemph |= SYM_MASK(IBSD_TX_DEEMPHASIS_OVERRIDE_0, + tx_override_deemphasis_select); + deemph |= (txdds->amp & SYM_RMASK(IBSD_TX_DEEMPHASIS_OVERRIDE_0, + txampcntl_d2a)) << SYM_LSB(IBSD_TX_DEEMPHASIS_OVERRIDE_0, + txampcntl_d2a); + deemph |= (txdds->main & SYM_RMASK(IBSD_TX_DEEMPHASIS_OVERRIDE_0, + txc0_ena)) << SYM_LSB(IBSD_TX_DEEMPHASIS_OVERRIDE_0, + txc0_ena); + deemph |= (txdds->post & SYM_RMASK(IBSD_TX_DEEMPHASIS_OVERRIDE_0, + txcp1_ena)) << SYM_LSB(IBSD_TX_DEEMPHASIS_OVERRIDE_0, + txcp1_ena); + deemph |= (txdds->pre & SYM_RMASK(IBSD_TX_DEEMPHASIS_OVERRIDE_0, + txcn1_ena)) << SYM_LSB(IBSD_TX_DEEMPHASIS_OVERRIDE_0, + txcn1_ena); + qib_write_kreg_port(ppd, krp_tx_deemph_override, deemph); +} + +/* + * Set the parameters for mez cards on link bounce, so they are + * always exactly what was requested. Similar logic to init_txdds + * but does just the serdes. + */ +static void adj_tx_serdes(struct qib_pportdata *ppd) +{ + const struct txdds_ent *sdr_dds, *ddr_dds, *qdr_dds; + struct txdds_ent *dds; + + find_best_ent(ppd, &sdr_dds, &ddr_dds, &qdr_dds, 1); + dds = (struct txdds_ent *)(ppd->link_speed_active == QIB_IB_QDR ? 
+ qdr_dds : (ppd->link_speed_active == QIB_IB_DDR ? + ddr_dds : sdr_dds)); + write_tx_serdes_param(ppd, dds); +} + +/* set QDR forced value for H1, if needed */ +static void force_h1(struct qib_pportdata *ppd) +{ + int chan; + + ppd->cpspec->qdr_reforce = 0; + if (!ppd->dd->cspec->r1) + return; + + for (chan = 0; chan < SERDES_CHANS; chan++) { + set_man_mode_h1(ppd, chan, 1, 0); + set_man_code(ppd, chan, ppd->cpspec->h1_val); + clock_man(ppd, chan); + set_man_mode_h1(ppd, chan, 0, 0); + } +} + +#define SJA_EN SYM_MASK(SPC_JTAG_ACCESS_REG, SPC_JTAG_ACCESS_EN) +#define BISTEN_LSB SYM_LSB(SPC_JTAG_ACCESS_REG, bist_en) + +#define R_OPCODE_LSB 3 +#define R_OP_NOP 0 +#define R_OP_SHIFT 2 +#define R_OP_UPDATE 3 +#define R_TDI_LSB 2 +#define R_TDO_LSB 1 +#define R_RDY 1 + +static int qib_r_grab(struct qib_devdata *dd) +{ + u64 val = SJA_EN; + + qib_write_kreg(dd, kr_r_access, val); + qib_read_kreg32(dd, kr_scratch); + return 0; +} + +/* qib_r_wait_for_rdy() not only waits for the ready bit, it + * returns the current state of R_TDO + */ +static int qib_r_wait_for_rdy(struct qib_devdata *dd) +{ + u64 val; + int timeout; + + for (timeout = 0; timeout < 100 ; ++timeout) { + val = qib_read_kreg32(dd, kr_r_access); + if (val & R_RDY) + return (val >> R_TDO_LSB) & 1; + } + return -1; +} + +static int qib_r_shift(struct qib_devdata *dd, int bisten, + int len, u8 *inp, u8 *outp) +{ + u64 valbase, val; + int ret, pos; + + valbase = SJA_EN | (bisten << BISTEN_LSB) | + (R_OP_SHIFT << R_OPCODE_LSB); + ret = qib_r_wait_for_rdy(dd); + if (ret < 0) + goto bail; + for (pos = 0; pos < len; ++pos) { + val = valbase; + if (outp) { + outp[pos >> 3] &= ~(1 << (pos & 7)); + outp[pos >> 3] |= (ret << (pos & 7)); + } + if (inp) { + int tdi = inp[pos >> 3] >> (pos & 7); + + val |= ((tdi & 1) << R_TDI_LSB); + } + qib_write_kreg(dd, kr_r_access, val); + qib_read_kreg32(dd, kr_scratch); + ret = qib_r_wait_for_rdy(dd); + if (ret < 0) + break; + } + /* Restore to NOP between operations. */ + val = SJA_EN | (bisten << BISTEN_LSB); + qib_write_kreg(dd, kr_r_access, val); + qib_read_kreg32(dd, kr_scratch); + ret = qib_r_wait_for_rdy(dd); + + if (ret >= 0) + ret = pos; +bail: + return ret; +} + +static int qib_r_update(struct qib_devdata *dd, int bisten) +{ + u64 val; + int ret; + + val = SJA_EN | (bisten << BISTEN_LSB) | (R_OP_UPDATE << R_OPCODE_LSB); + ret = qib_r_wait_for_rdy(dd); + if (ret >= 0) { + qib_write_kreg(dd, kr_r_access, val); + qib_read_kreg32(dd, kr_scratch); + } + return ret; +} + +#define BISTEN_PORT_SEL 15 +#define LEN_PORT_SEL 625 +#define BISTEN_AT 17 +#define LEN_AT 156 +#define BISTEN_ETM 16 +#define LEN_ETM 632 + +#define BIT2BYTE(x) (((x) + BITS_PER_BYTE - 1) / BITS_PER_BYTE) + +/* these are common for all IB port use cases. 
*/ +static u8 reset_at[BIT2BYTE(LEN_AT)] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x20, 0x00, +}; +static u8 reset_atetm[BIT2BYTE(LEN_ETM)] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x80, 0xe3, 0x81, 0x73, 0x3c, 0x70, 0x8e, + 0x07, 0xce, 0xf1, 0xc0, 0x39, 0x1e, 0x38, 0xc7, 0x03, 0xe7, + 0x78, 0xe0, 0x1c, 0x0f, 0x9c, 0x7f, 0x80, 0x73, 0x0f, 0x70, + 0xde, 0x01, 0xce, 0x39, 0xc0, 0xf9, 0x06, 0x38, 0xd7, 0x00, + 0xe7, 0x19, 0xe0, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, +}; +static u8 at[BIT2BYTE(LEN_AT)] = { + 0x00, 0x00, 0x18, 0x00, 0x00, 0x00, 0x18, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x20, 0x00, +}; + +/* used for IB1 or IB2, only one in use */ +static u8 atetm_1port[BIT2BYTE(LEN_ETM)] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x10, 0xf2, 0x80, 0x83, 0x1e, 0x38, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x50, 0xf4, 0x41, 0x00, 0x18, 0x78, 0xc8, 0x03, + 0x07, 0x7b, 0xa0, 0x3e, 0x00, 0x02, 0x00, 0x00, 0x18, 0x00, + 0x18, 0x00, 0x00, 0x00, 0x00, 0x4b, 0x00, 0x00, 0x00, +}; + +/* used when both IB1 and IB2 are in use */ +static u8 atetm_2port[BIT2BYTE(LEN_ETM)] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x79, + 0xc0, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0xf8, 0x80, 0x83, 0x1e, 0x38, 0xe0, 0x03, 0x05, + 0x7b, 0xa0, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x80, + 0xa2, 0x0f, 0x50, 0xf4, 0x41, 0x00, 0x18, 0x78, 0xd1, 0x07, + 0x02, 0x7c, 0x80, 0x3e, 0x00, 0x02, 0x00, 0x00, 0x3e, 0x00, + 0x02, 0x00, 0x00, 0x00, 0x00, 0x64, 0x00, 0x00, 0x00, +}; + +/* used when only IB1 is in use */ +static u8 portsel_port1[BIT2BYTE(LEN_PORT_SEL)] = { + 0x32, 0x65, 0xa4, 0x7b, 0x10, 0x98, 0xdc, 0xfe, 0x13, 0x13, + 0x13, 0x13, 0x13, 0x13, 0x13, 0x13, 0x73, 0x0c, 0x0c, 0x0c, + 0x0c, 0x0c, 0x13, 0x13, 0x13, 0x13, 0x13, 0x13, 0x13, 0x13, + 0x13, 0x78, 0x78, 0x13, 0x13, 0x13, 0x13, 0x13, 0x13, 0x13, + 0x14, 0x14, 0x14, 0x14, 0x14, 0x14, 0x14, 0x14, 0x74, 0x32, + 0x32, 0x32, 0x32, 0x32, 0x14, 0x14, 0x14, 0x14, 0x14, 0x14, + 0x14, 0x14, 0x14, 0x14, 0x14, 0x14, 0x14, 0x14, 0x14, 0x14, + 0x14, 0x14, 0x9f, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, +}; + +/* used when only IB2 is in use */ +static u8 portsel_port2[BIT2BYTE(LEN_PORT_SEL)] = { + 0x32, 0x65, 0xa4, 0x7b, 0x10, 0x98, 0xdc, 0xfe, 0x39, 0x39, + 0x39, 0x39, 0x39, 0x39, 0x39, 0x39, 0x73, 0x32, 0x32, 0x32, + 0x32, 0x32, 0x39, 0x39, 0x39, 0x39, 0x39, 0x39, 0x39, 0x39, + 0x39, 0x78, 0x78, 0x39, 0x39, 0x39, 0x39, 0x39, 0x39, 0x39, + 0x3a, 0x3a, 0x3a, 0x3a, 0x3a, 0x3a, 0x3a, 0x3a, 0x74, 0x32, + 0x32, 0x32, 0x32, 0x32, 0x3a, 0x3a, 0x3a, 0x3a, 0x3a, 0x3a, + 0x3a, 0x3a, 0x3a, 0x3a, 0x3a, 0x3a, 0x3a, 0x3a, 0x3a, 0x3a, + 0x3a, 0x3a, 0x9f, 0x01, 0x00, 0x00, 0x00, 0x00, 0x01, +}; + +/* used when both IB1 and IB2 are in use */ +static u8 portsel_2port[BIT2BYTE(LEN_PORT_SEL)] = { + 0x32, 0xba, 0x54, 0x76, 0x10, 0x98, 0xdc, 0xfe, 0x13, 0x13, + 0x13, 0x13, 0x13, 0x13, 0x13, 0x13, 0x73, 0x0c, 0x0c, 0x0c, + 0x0c, 0x0c, 0x13, 0x13, 0x13, 0x13, 0x13, 0x13, 0x13, 0x13, + 0x13, 0x13, 0x13, 0x13, 0x13, 0x13, 0x13, 
0x13, 0x13, 0x13, + 0x14, 0x14, 0x14, 0x14, 0x14, 0x14, 0x14, 0x14, 0x74, 0x32, + 0x32, 0x32, 0x32, 0x32, 0x14, 0x14, 0x14, 0x14, 0x14, 0x3a, + 0x3a, 0x14, 0x14, 0x14, 0x14, 0x14, 0x14, 0x14, 0x14, 0x14, + 0x14, 0x14, 0x9f, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, +}; + +/* + * Do setup to properly handle IB link recovery; if port is zero, we + * are initializing to cover both ports; otherwise we are initializing + * to cover a single port card, or the port has reached INIT and we may + * need to switch coverage types. + */ +static void setup_7322_link_recovery(struct qib_pportdata *ppd, u32 both) +{ + u8 *portsel, *etm; + struct qib_devdata *dd = ppd->dd; + + if (!ppd->dd->cspec->r1) + return; + if (!both) { + dd->cspec->recovery_ports_initted++; + ppd->cpspec->recovery_init = 1; + } + if (!both && dd->cspec->recovery_ports_initted == 1) { + portsel = ppd->port == 1 ? portsel_port1 : portsel_port2; + etm = atetm_1port; + } else { + portsel = portsel_2port; + etm = atetm_2port; + } + + if (qib_r_grab(dd) < 0 || + qib_r_shift(dd, BISTEN_ETM, LEN_ETM, reset_atetm, NULL) < 0 || + qib_r_update(dd, BISTEN_ETM) < 0 || + qib_r_shift(dd, BISTEN_AT, LEN_AT, reset_at, NULL) < 0 || + qib_r_update(dd, BISTEN_AT) < 0 || + qib_r_shift(dd, BISTEN_PORT_SEL, LEN_PORT_SEL, + portsel, NULL) < 0 || + qib_r_update(dd, BISTEN_PORT_SEL) < 0 || + qib_r_shift(dd, BISTEN_AT, LEN_AT, at, NULL) < 0 || + qib_r_update(dd, BISTEN_AT) < 0 || + qib_r_shift(dd, BISTEN_ETM, LEN_ETM, etm, NULL) < 0 || + qib_r_update(dd, BISTEN_ETM) < 0) + qib_dev_err(dd, "Failed IB link recovery setup\n"); +} + +static void check_7322_rxe_status(struct qib_pportdata *ppd) +{ + struct qib_devdata *dd = ppd->dd; + u64 fmask; + + if (dd->cspec->recovery_ports_initted != 1) + return; /* rest doesn't apply to dualport */ + qib_write_kreg(dd, kr_control, dd->control | + SYM_MASK(Control, FreezeMode)); + (void)qib_read_kreg64(dd, kr_scratch); + udelay(3); /* ibcreset asserted 400ns, be sure that's over */ + fmask = qib_read_kreg64(dd, kr_act_fmask); + if (!fmask) { + /* + * require a powercycle before we'll work again, and make + * sure we get no more interrupts, and don't turn off + * freeze. + */ + ppd->dd->cspec->stay_in_freeze = 1; + qib_7322_set_intr_state(ppd->dd, 0); + qib_write_kreg(dd, kr_fmask, 0ULL); + qib_dev_err(dd, "HCA unusable until powercycled\n"); + return; /* eventually reset */ + } + + qib_write_kreg(ppd->dd, kr_hwerrclear, + SYM_MASK(HwErrClear, IBSerdesPClkNotDetectClear_1)); + + /* don't do the full clear_freeze(), not needed for this */ + qib_write_kreg(dd, kr_control, dd->control); + qib_read_kreg32(dd, kr_scratch); + /* take IBC out of reset */ + if (ppd->link_speed_supported) { + ppd->cpspec->ibcctrl_a &= + ~SYM_MASK(IBCCtrlA_0, IBStatIntReductionEn); + qib_write_kreg_port(ppd, krp_ibcctrl_a, + ppd->cpspec->ibcctrl_a); + qib_read_kreg32(dd, kr_scratch); + if (ppd->lflags & QIBL_IB_LINK_DISABLED) + qib_set_ib_7322_lstate(ppd, 0, + QLOGIC_IB_IBCC_LINKINITCMD_DISABLE); + } +} diff --git a/kernel/drivers/infiniband/hw/qib/qib_init.c b/kernel/drivers/infiniband/hw/qib/qib_init.c new file mode 100644 index 000000000..7e00470ad --- /dev/null +++ b/kernel/drivers/infiniband/hw/qib/qib_init.c @@ -0,0 +1,1847 @@ +/* + * Copyright (c) 2012, 2013 Intel Corporation. All rights reserved. + * Copyright (c) 2006 - 2012 QLogic Corporation. All rights reserved. + * Copyright (c) 2003, 2004, 2005, 2006 PathScale, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. 
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include +#include +#include +#include +#include +#include +#include +#ifdef CONFIG_INFINIBAND_QIB_DCA +#include +#endif + +#include "qib.h" +#include "qib_common.h" +#include "qib_mad.h" +#ifdef CONFIG_DEBUG_FS +#include "qib_debugfs.h" +#include "qib_verbs.h" +#endif + +#undef pr_fmt +#define pr_fmt(fmt) QIB_DRV_NAME ": " fmt + +/* + * min buffers we want to have per context, after driver + */ +#define QIB_MIN_USER_CTXT_BUFCNT 7 + +#define QLOGIC_IB_R_SOFTWARE_MASK 0xFF +#define QLOGIC_IB_R_SOFTWARE_SHIFT 24 +#define QLOGIC_IB_R_EMULATOR_MASK (1ULL<<62) + +/* + * Number of ctxts we are configured to use (to allow for more pio + * buffers per ctxt, etc.) Zero means use chip value. + */ +ushort qib_cfgctxts; +module_param_named(cfgctxts, qib_cfgctxts, ushort, S_IRUGO); +MODULE_PARM_DESC(cfgctxts, "Set max number of contexts to use"); + +unsigned qib_numa_aware; +module_param_named(numa_aware, qib_numa_aware, uint, S_IRUGO); +MODULE_PARM_DESC(numa_aware, + "0 -> PSM allocation close to HCA, 1 -> PSM allocation local to process"); + +/* + * If set, do not write to any regs if avoidable, hack to allow + * check for deranged default register values. 
+ */ +ushort qib_mini_init; +module_param_named(mini_init, qib_mini_init, ushort, S_IRUGO); +MODULE_PARM_DESC(mini_init, "If set, do minimal diag init"); + +unsigned qib_n_krcv_queues; +module_param_named(krcvqs, qib_n_krcv_queues, uint, S_IRUGO); +MODULE_PARM_DESC(krcvqs, "number of kernel receive queues per IB port"); + +unsigned qib_cc_table_size; +module_param_named(cc_table_size, qib_cc_table_size, uint, S_IRUGO); +MODULE_PARM_DESC(cc_table_size, "Congestion control table entries 0 (CCA disabled - default), min = 128, max = 1984"); + +static void verify_interrupt(unsigned long); + +static struct idr qib_unit_table; +u32 qib_cpulist_count; +unsigned long *qib_cpulist; + +/* set number of contexts we'll actually use */ +void qib_set_ctxtcnt(struct qib_devdata *dd) +{ + if (!qib_cfgctxts) { + dd->cfgctxts = dd->first_user_ctxt + num_online_cpus(); + if (dd->cfgctxts > dd->ctxtcnt) + dd->cfgctxts = dd->ctxtcnt; + } else if (qib_cfgctxts < dd->num_pports) + dd->cfgctxts = dd->ctxtcnt; + else if (qib_cfgctxts <= dd->ctxtcnt) + dd->cfgctxts = qib_cfgctxts; + else + dd->cfgctxts = dd->ctxtcnt; + dd->freectxts = (dd->first_user_ctxt > dd->cfgctxts) ? 0 : + dd->cfgctxts - dd->first_user_ctxt; +} + +/* + * Common code for creating the receive context array. + */ +int qib_create_ctxts(struct qib_devdata *dd) +{ + unsigned i; + int local_node_id = pcibus_to_node(dd->pcidev->bus); + + if (local_node_id < 0) + local_node_id = numa_node_id(); + dd->assigned_node_id = local_node_id; + + /* + * Allocate full ctxtcnt array, rather than just cfgctxts, because + * cleanup iterates across all possible ctxts. + */ + dd->rcd = kcalloc(dd->ctxtcnt, sizeof(*dd->rcd), GFP_KERNEL); + if (!dd->rcd) { + qib_dev_err(dd, + "Unable to allocate ctxtdata array, failing\n"); + return -ENOMEM; + } + + /* create (one or more) kctxt */ + for (i = 0; i < dd->first_user_ctxt; ++i) { + struct qib_pportdata *ppd; + struct qib_ctxtdata *rcd; + + if (dd->skip_kctxt_mask & (1 << i)) + continue; + + ppd = dd->pport + (i % dd->num_pports); + + rcd = qib_create_ctxtdata(ppd, i, dd->assigned_node_id); + if (!rcd) { + qib_dev_err(dd, + "Unable to allocate ctxtdata for Kernel ctxt, failing\n"); + kfree(dd->rcd); + dd->rcd = NULL; + return -ENOMEM; + } + rcd->pkeys[0] = QIB_DEFAULT_P_KEY; + rcd->seq_cnt = 1; + } + return 0; +} + +/* + * Common code for user and kernel context setup. + */ +struct qib_ctxtdata *qib_create_ctxtdata(struct qib_pportdata *ppd, u32 ctxt, + int node_id) +{ + struct qib_devdata *dd = ppd->dd; + struct qib_ctxtdata *rcd; + + rcd = kzalloc_node(sizeof(*rcd), GFP_KERNEL, node_id); + if (rcd) { + INIT_LIST_HEAD(&rcd->qp_wait_list); + rcd->node_id = node_id; + rcd->ppd = ppd; + rcd->dd = dd; + rcd->cnt = 1; + rcd->ctxt = ctxt; + dd->rcd[ctxt] = rcd; +#ifdef CONFIG_DEBUG_FS + if (ctxt < dd->first_user_ctxt) { /* N/A for PSM contexts */ + rcd->opstats = kzalloc_node(sizeof(*rcd->opstats), + GFP_KERNEL, node_id); + if (!rcd->opstats) { + kfree(rcd); + qib_dev_err(dd, + "Unable to allocate per ctxt stats buffer\n"); + return NULL; + } + } +#endif + dd->f_init_ctxt(rcd); + + /* + * To avoid wasting a lot of memory, we allocate 32KB chunks + * of physically contiguous memory, advance through it until + * used up and then allocate more. Of course, we need + * memory to store those extra pointers, now. 32KB seems to + * be the most that is "safe" under memory pressure + * (creating large files and then copying them over + * NFS while doing lots of MPI jobs). 
The OOM killer can + * get invoked, even though we say we can sleep and this can + * cause significant system problems.... + */ + rcd->rcvegrbuf_size = 0x8000; + rcd->rcvegrbufs_perchunk = + rcd->rcvegrbuf_size / dd->rcvegrbufsize; + rcd->rcvegrbuf_chunks = (rcd->rcvegrcnt + + rcd->rcvegrbufs_perchunk - 1) / + rcd->rcvegrbufs_perchunk; + BUG_ON(!is_power_of_2(rcd->rcvegrbufs_perchunk)); + rcd->rcvegrbufs_perchunk_shift = + ilog2(rcd->rcvegrbufs_perchunk); + } + return rcd; +} + +/* + * Common code for initializing the physical port structure. + */ +int qib_init_pportdata(struct qib_pportdata *ppd, struct qib_devdata *dd, + u8 hw_pidx, u8 port) +{ + int size; + + ppd->dd = dd; + ppd->hw_pidx = hw_pidx; + ppd->port = port; /* IB port number, not index */ + + spin_lock_init(&ppd->sdma_lock); + spin_lock_init(&ppd->lflags_lock); + spin_lock_init(&ppd->cc_shadow_lock); + init_waitqueue_head(&ppd->state_wait); + + init_timer(&ppd->symerr_clear_timer); + ppd->symerr_clear_timer.function = qib_clear_symerror_on_linkup; + ppd->symerr_clear_timer.data = (unsigned long)ppd; + + ppd->qib_wq = NULL; + ppd->ibport_data.pmastats = + alloc_percpu(struct qib_pma_counters); + if (!ppd->ibport_data.pmastats) + return -ENOMEM; + + if (qib_cc_table_size < IB_CCT_MIN_ENTRIES) + goto bail; + + ppd->cc_supported_table_entries = min(max_t(int, qib_cc_table_size, + IB_CCT_MIN_ENTRIES), IB_CCT_ENTRIES*IB_CC_TABLE_CAP_DEFAULT); + + ppd->cc_max_table_entries = + ppd->cc_supported_table_entries/IB_CCT_ENTRIES; + + size = IB_CC_TABLE_CAP_DEFAULT * sizeof(struct ib_cc_table_entry) + * IB_CCT_ENTRIES; + ppd->ccti_entries = kzalloc(size, GFP_KERNEL); + if (!ppd->ccti_entries) { + qib_dev_err(dd, + "failed to allocate congestion control table for port %d!\n", + port); + goto bail; + } + + size = IB_CC_CCS_ENTRIES * sizeof(struct ib_cc_congestion_entry); + ppd->congestion_entries = kzalloc(size, GFP_KERNEL); + if (!ppd->congestion_entries) { + qib_dev_err(dd, + "failed to allocate congestion setting list for port %d!\n", + port); + goto bail_1; + } + + size = sizeof(struct cc_table_shadow); + ppd->ccti_entries_shadow = kzalloc(size, GFP_KERNEL); + if (!ppd->ccti_entries_shadow) { + qib_dev_err(dd, + "failed to allocate shadow ccti list for port %d!\n", + port); + goto bail_2; + } + + size = sizeof(struct ib_cc_congestion_setting_attr); + ppd->congestion_entries_shadow = kzalloc(size, GFP_KERNEL); + if (!ppd->congestion_entries_shadow) { + qib_dev_err(dd, + "failed to allocate shadow congestion setting list for port %d!\n", + port); + goto bail_3; + } + + return 0; + +bail_3: + kfree(ppd->ccti_entries_shadow); + ppd->ccti_entries_shadow = NULL; +bail_2: + kfree(ppd->congestion_entries); + ppd->congestion_entries = NULL; +bail_1: + kfree(ppd->ccti_entries); + ppd->ccti_entries = NULL; +bail: + /* User is intentionally disabling the congestion control agent */ + if (!qib_cc_table_size) + return 0; + + if (qib_cc_table_size < IB_CCT_MIN_ENTRIES) { + qib_cc_table_size = 0; + qib_dev_err(dd, + "Congestion Control table size %d less than minimum %d for port %d\n", + qib_cc_table_size, IB_CCT_MIN_ENTRIES, port); + } + + qib_dev_err(dd, "Congestion Control Agent disabled for port %d\n", + port); + return 0; +} + +static int init_pioavailregs(struct qib_devdata *dd) +{ + int ret, pidx; + u64 *status_page; + + dd->pioavailregs_dma = dma_alloc_coherent( + &dd->pcidev->dev, PAGE_SIZE, &dd->pioavailregs_phys, + GFP_KERNEL); + if (!dd->pioavailregs_dma) { + qib_dev_err(dd, + "failed to allocate PIOavail reg area in memory\n"); + ret = 
-ENOMEM; + goto done; + } + + /* + * We really want L2 cache aligned, but for current CPUs of + * interest, they are the same. + */ + status_page = (u64 *) + ((char *) dd->pioavailregs_dma + + ((2 * L1_CACHE_BYTES + + dd->pioavregs * sizeof(u64)) & ~L1_CACHE_BYTES)); + /* device status comes first, for backwards compatibility */ + dd->devstatusp = status_page; + *status_page++ = 0; + for (pidx = 0; pidx < dd->num_pports; ++pidx) { + dd->pport[pidx].statusp = status_page; + *status_page++ = 0; + } + + /* + * Setup buffer to hold freeze and other messages, accessible to + * apps, following statusp. This is per-unit, not per port. + */ + dd->freezemsg = (char *) status_page; + *dd->freezemsg = 0; + /* length of msg buffer is "whatever is left" */ + ret = (char *) status_page - (char *) dd->pioavailregs_dma; + dd->freezelen = PAGE_SIZE - ret; + + ret = 0; + +done: + return ret; +} + +/** + * init_shadow_tids - allocate the shadow TID array + * @dd: the qlogic_ib device + * + * allocate the shadow TID array, so we can qib_munlock previous + * entries. It may make more sense to move the pageshadow to the + * ctxt data structure, so we only allocate memory for ctxts actually + * in use, since we at 8k per ctxt, now. + * We don't want failures here to prevent use of the driver/chip, + * so no return value. + */ +static void init_shadow_tids(struct qib_devdata *dd) +{ + struct page **pages; + dma_addr_t *addrs; + + pages = vzalloc(dd->cfgctxts * dd->rcvtidcnt * sizeof(struct page *)); + if (!pages) { + qib_dev_err(dd, + "failed to allocate shadow page * array, no expected sends!\n"); + goto bail; + } + + addrs = vzalloc(dd->cfgctxts * dd->rcvtidcnt * sizeof(dma_addr_t)); + if (!addrs) { + qib_dev_err(dd, + "failed to allocate shadow dma handle array, no expected sends!\n"); + goto bail_free; + } + + dd->pageshadow = pages; + dd->physshadow = addrs; + return; + +bail_free: + vfree(pages); +bail: + dd->pageshadow = NULL; +} + +/* + * Do initialization for device that is only needed on + * first detect, not on resets. 
+ */ +static int loadtime_init(struct qib_devdata *dd) +{ + int ret = 0; + + if (((dd->revision >> QLOGIC_IB_R_SOFTWARE_SHIFT) & + QLOGIC_IB_R_SOFTWARE_MASK) != QIB_CHIP_SWVERSION) { + qib_dev_err(dd, + "Driver only handles version %d, chip swversion is %d (%llx), failng\n", + QIB_CHIP_SWVERSION, + (int)(dd->revision >> + QLOGIC_IB_R_SOFTWARE_SHIFT) & + QLOGIC_IB_R_SOFTWARE_MASK, + (unsigned long long) dd->revision); + ret = -ENOSYS; + goto done; + } + + if (dd->revision & QLOGIC_IB_R_EMULATOR_MASK) + qib_devinfo(dd->pcidev, "%s", dd->boardversion); + + spin_lock_init(&dd->pioavail_lock); + spin_lock_init(&dd->sendctrl_lock); + spin_lock_init(&dd->uctxt_lock); + spin_lock_init(&dd->qib_diag_trans_lock); + spin_lock_init(&dd->eep_st_lock); + mutex_init(&dd->eep_lock); + + if (qib_mini_init) + goto done; + + ret = init_pioavailregs(dd); + init_shadow_tids(dd); + + qib_get_eeprom_info(dd); + + /* setup time (don't start yet) to verify we got interrupt */ + init_timer(&dd->intrchk_timer); + dd->intrchk_timer.function = verify_interrupt; + dd->intrchk_timer.data = (unsigned long) dd; + + ret = qib_cq_init(dd); +done: + return ret; +} + +/** + * init_after_reset - re-initialize after a reset + * @dd: the qlogic_ib device + * + * sanity check at least some of the values after reset, and + * ensure no receive or transmit (explicitly, in case reset + * failed + */ +static int init_after_reset(struct qib_devdata *dd) +{ + int i; + + /* + * Ensure chip does no sends or receives, tail updates, or + * pioavail updates while we re-initialize. This is mostly + * for the driver data structures, not chip registers. + */ + for (i = 0; i < dd->num_pports; ++i) { + /* + * ctxt == -1 means "all contexts". Only really safe for + * _dis_abling things, as here. + */ + dd->f_rcvctrl(dd->pport + i, QIB_RCVCTRL_CTXT_DIS | + QIB_RCVCTRL_INTRAVAIL_DIS | + QIB_RCVCTRL_TAILUPD_DIS, -1); + /* Redundant across ports for some, but no big deal. */ + dd->f_sendctrl(dd->pport + i, QIB_SENDCTRL_SEND_DIS | + QIB_SENDCTRL_AVAIL_DIS); + } + + return 0; +} + +static void enable_chip(struct qib_devdata *dd) +{ + u64 rcvmask; + int i; + + /* + * Enable PIO send, and update of PIOavail regs to memory. + */ + for (i = 0; i < dd->num_pports; ++i) + dd->f_sendctrl(dd->pport + i, QIB_SENDCTRL_SEND_ENB | + QIB_SENDCTRL_AVAIL_ENB); + /* + * Enable kernel ctxts' receive and receive interrupt. + * Other ctxts done as user opens and inits them. + */ + rcvmask = QIB_RCVCTRL_CTXT_ENB | QIB_RCVCTRL_INTRAVAIL_ENB; + rcvmask |= (dd->flags & QIB_NODMA_RTAIL) ? + QIB_RCVCTRL_TAILUPD_DIS : QIB_RCVCTRL_TAILUPD_ENB; + for (i = 0; dd->rcd && i < dd->first_user_ctxt; ++i) { + struct qib_ctxtdata *rcd = dd->rcd[i]; + + if (rcd) + dd->f_rcvctrl(rcd->ppd, rcvmask, i); + } +} + +static void verify_interrupt(unsigned long opaque) +{ + struct qib_devdata *dd = (struct qib_devdata *) opaque; + u64 int_counter; + + if (!dd) + return; /* being torn down */ + + /* + * If we don't have a lid or any interrupts, let the user know and + * don't bother checking again. + */ + int_counter = qib_int_counter(dd) - dd->z_int_counter; + if (int_counter == 0) { + if (!dd->f_intr_fallback(dd)) + dev_err(&dd->pcidev->dev, + "No interrupts detected, not usable.\n"); + else /* re-arm the timer to see if fallback works */ + mod_timer(&dd->intrchk_timer, jiffies + HZ/2); + } +} + +static void init_piobuf_state(struct qib_devdata *dd) +{ + int i, pidx; + u32 uctxts; + + /* + * Ensure all buffers are free, and fifos empty. Buffers + * are common, so only do once for port 0. 
+ * + * After enable and qib_chg_pioavailkernel so we can safely + * enable pioavail updates and PIOENABLE. After this, packets + * are ready and able to go out. + */ + dd->f_sendctrl(dd->pport, QIB_SENDCTRL_DISARM_ALL); + for (pidx = 0; pidx < dd->num_pports; ++pidx) + dd->f_sendctrl(dd->pport + pidx, QIB_SENDCTRL_FLUSH); + + /* + * If not all sendbufs are used, add the one to each of the lower + * numbered contexts. pbufsctxt and lastctxt_piobuf are + * calculated in chip-specific code because it may cause some + * chip-specific adjustments to be made. + */ + uctxts = dd->cfgctxts - dd->first_user_ctxt; + dd->ctxts_extrabuf = dd->pbufsctxt ? + dd->lastctxt_piobuf - (dd->pbufsctxt * uctxts) : 0; + + /* + * Set up the shadow copies of the piobufavail registers, + * which we compare against the chip registers for now, and + * the in memory DMA'ed copies of the registers. + * By now pioavail updates to memory should have occurred, so + * copy them into our working/shadow registers; this is in + * case something went wrong with abort, but mostly to get the + * initial values of the generation bit correct. + */ + for (i = 0; i < dd->pioavregs; i++) { + __le64 tmp; + + tmp = dd->pioavailregs_dma[i]; + /* + * Don't need to worry about pioavailkernel here + * because we will call qib_chg_pioavailkernel() later + * in initialization, to busy out buffers as needed. + */ + dd->pioavailshadow[i] = le64_to_cpu(tmp); + } + while (i < ARRAY_SIZE(dd->pioavailshadow)) + dd->pioavailshadow[i++] = 0; /* for debugging sanity */ + + /* after pioavailshadow is setup */ + qib_chg_pioavailkernel(dd, 0, dd->piobcnt2k + dd->piobcnt4k, + TXCHK_CHG_TYPE_KERN, NULL); + dd->f_initvl15_bufs(dd); +} + +/** + * qib_create_workqueues - create per port workqueues + * @dd: the qlogic_ib device + */ +static int qib_create_workqueues(struct qib_devdata *dd) +{ + int pidx; + struct qib_pportdata *ppd; + + for (pidx = 0; pidx < dd->num_pports; ++pidx) { + ppd = dd->pport + pidx; + if (!ppd->qib_wq) { + char wq_name[8]; /* 3 + 2 + 1 + 1 + 1 */ + + snprintf(wq_name, sizeof(wq_name), "qib%d_%d", + dd->unit, pidx); + ppd->qib_wq = + create_singlethread_workqueue(wq_name); + if (!ppd->qib_wq) + goto wq_error; + } + } + return 0; +wq_error: + pr_err("create_singlethread_workqueue failed for port %d\n", + pidx + 1); + for (pidx = 0; pidx < dd->num_pports; ++pidx) { + ppd = dd->pport + pidx; + if (ppd->qib_wq) { + destroy_workqueue(ppd->qib_wq); + ppd->qib_wq = NULL; + } + } + return -ENOMEM; +} + +static void qib_free_pportdata(struct qib_pportdata *ppd) +{ + free_percpu(ppd->ibport_data.pmastats); + ppd->ibport_data.pmastats = NULL; +} + +/** + * qib_init - do the actual initialization sequence on the chip + * @dd: the qlogic_ib device + * @reinit: reinitializing, so don't allocate new memory + * + * Do the actual initialization sequence on the chip. This is done + * both from the init routine called from the PCI infrastructure, and + * when we reset the chip, or detect that it was reset internally, + * or it's administratively re-enabled. + * + * Memory allocation here and in called routines is only done in + * the first case (reinit == 0). We have to be careful, because even + * without memory allocation, we need to re-write all the chip registers + * TIDs, etc. after the reset or enable has completed. 
+ */ +int qib_init(struct qib_devdata *dd, int reinit) +{ + int ret = 0, pidx, lastfail = 0; + u32 portok = 0; + unsigned i; + struct qib_ctxtdata *rcd; + struct qib_pportdata *ppd; + unsigned long flags; + + /* Set linkstate to unknown, so we can watch for a transition. */ + for (pidx = 0; pidx < dd->num_pports; ++pidx) { + ppd = dd->pport + pidx; + spin_lock_irqsave(&ppd->lflags_lock, flags); + ppd->lflags &= ~(QIBL_LINKACTIVE | QIBL_LINKARMED | + QIBL_LINKDOWN | QIBL_LINKINIT | + QIBL_LINKV); + spin_unlock_irqrestore(&ppd->lflags_lock, flags); + } + + if (reinit) + ret = init_after_reset(dd); + else + ret = loadtime_init(dd); + if (ret) + goto done; + + /* Bypass most chip-init, to get to device creation */ + if (qib_mini_init) + return 0; + + ret = dd->f_late_initreg(dd); + if (ret) + goto done; + + /* dd->rcd can be NULL if early init failed */ + for (i = 0; dd->rcd && i < dd->first_user_ctxt; ++i) { + /* + * Set up the (kernel) rcvhdr queue and egr TIDs. If doing + * re-init, the simplest way to handle this is to free + * existing, and re-allocate. + * Need to re-create rest of ctxt 0 ctxtdata as well. + */ + rcd = dd->rcd[i]; + if (!rcd) + continue; + + lastfail = qib_create_rcvhdrq(dd, rcd); + if (!lastfail) + lastfail = qib_setup_eagerbufs(rcd); + if (lastfail) { + qib_dev_err(dd, + "failed to allocate kernel ctxt's rcvhdrq and/or egr bufs\n"); + continue; + } + } + + for (pidx = 0; pidx < dd->num_pports; ++pidx) { + int mtu; + + if (lastfail) + ret = lastfail; + ppd = dd->pport + pidx; + mtu = ib_mtu_enum_to_int(qib_ibmtu); + if (mtu == -1) { + mtu = QIB_DEFAULT_MTU; + qib_ibmtu = 0; /* don't leave invalid value */ + } + /* set max we can ever have for this driver load */ + ppd->init_ibmaxlen = min(mtu > 2048 ? + dd->piosize4k : dd->piosize2k, + dd->rcvegrbufsize + + (dd->rcvhdrentsize << 2)); + /* + * Have to initialize ibmaxlen, but this will normally + * change immediately in qib_set_mtu(). + */ + ppd->ibmaxlen = ppd->init_ibmaxlen; + qib_set_mtu(ppd, mtu); + + spin_lock_irqsave(&ppd->lflags_lock, flags); + ppd->lflags |= QIBL_IB_LINK_DISABLED; + spin_unlock_irqrestore(&ppd->lflags_lock, flags); + + lastfail = dd->f_bringup_serdes(ppd); + if (lastfail) { + qib_devinfo(dd->pcidev, + "Failed to bringup IB port %u\n", ppd->port); + lastfail = -ENETDOWN; + continue; + } + + portok++; + } + + if (!portok) { + /* none of the ports initialized */ + if (!ret && lastfail) + ret = lastfail; + else if (!ret) + ret = -ENETDOWN; + /* but continue on, so we can debug cause */ + } + + enable_chip(dd); + + init_piobuf_state(dd); + +done: + if (!ret) { + /* chip is OK for user apps; mark it as initialized */ + for (pidx = 0; pidx < dd->num_pports; ++pidx) { + ppd = dd->pport + pidx; + /* + * Set status even if port serdes is not initialized + * so that diags will work. + */ + *ppd->statusp |= QIB_STATUS_CHIP_PRESENT | + QIB_STATUS_INITTED; + if (!ppd->link_speed_enabled) + continue; + if (dd->flags & QIB_HAS_SEND_DMA) + ret = qib_setup_sdma(ppd); + init_timer(&ppd->hol_timer); + ppd->hol_timer.function = qib_hol_event; + ppd->hol_timer.data = (unsigned long)ppd; + ppd->hol_state = QIB_HOL_UP; + } + + /* now we can enable all interrupts from the chip */ + dd->f_set_intr_state(dd, 1); + + /* + * Setup to verify we get an interrupt, and fallback + * to an alternate if necessary and possible. 
+ */ + mod_timer(&dd->intrchk_timer, jiffies + HZ/2); + /* start stats retrieval timer */ + mod_timer(&dd->stats_timer, jiffies + HZ * ACTIVITY_TIMER); + } + + /* if ret is non-zero, we probably should do some cleanup here... */ + return ret; +} + +/* + * These next two routines are placeholders in case we don't have per-arch + * code for controlling write combining. If explicit control of write + * combining is not available, performance will probably be awful. + */ + +int __attribute__((weak)) qib_enable_wc(struct qib_devdata *dd) +{ + return -EOPNOTSUPP; +} + +void __attribute__((weak)) qib_disable_wc(struct qib_devdata *dd) +{ +} + +static inline struct qib_devdata *__qib_lookup(int unit) +{ + return idr_find(&qib_unit_table, unit); +} + +struct qib_devdata *qib_lookup(int unit) +{ + struct qib_devdata *dd; + unsigned long flags; + + spin_lock_irqsave(&qib_devs_lock, flags); + dd = __qib_lookup(unit); + spin_unlock_irqrestore(&qib_devs_lock, flags); + + return dd; +} + +/* + * Stop the timers during unit shutdown, or after an error late + * in initialization. + */ +static void qib_stop_timers(struct qib_devdata *dd) +{ + struct qib_pportdata *ppd; + int pidx; + + if (dd->stats_timer.data) { + del_timer_sync(&dd->stats_timer); + dd->stats_timer.data = 0; + } + if (dd->intrchk_timer.data) { + del_timer_sync(&dd->intrchk_timer); + dd->intrchk_timer.data = 0; + } + for (pidx = 0; pidx < dd->num_pports; ++pidx) { + ppd = dd->pport + pidx; + if (ppd->hol_timer.data) + del_timer_sync(&ppd->hol_timer); + if (ppd->led_override_timer.data) { + del_timer_sync(&ppd->led_override_timer); + atomic_set(&ppd->led_override_timer_active, 0); + } + if (ppd->symerr_clear_timer.data) + del_timer_sync(&ppd->symerr_clear_timer); + } +} + +/** + * qib_shutdown_device - shut down a device + * @dd: the qlogic_ib device + * + * This is called to make the device quiet when we are about to + * unload the driver, and also when the device is administratively + * disabled. It does not free any data structures. + * Everything it does has to be setup again by qib_init(dd, 1) + */ +static void qib_shutdown_device(struct qib_devdata *dd) +{ + struct qib_pportdata *ppd; + unsigned pidx; + + for (pidx = 0; pidx < dd->num_pports; ++pidx) { + ppd = dd->pport + pidx; + + spin_lock_irq(&ppd->lflags_lock); + ppd->lflags &= ~(QIBL_LINKDOWN | QIBL_LINKINIT | + QIBL_LINKARMED | QIBL_LINKACTIVE | + QIBL_LINKV); + spin_unlock_irq(&ppd->lflags_lock); + *ppd->statusp &= ~(QIB_STATUS_IB_CONF | QIB_STATUS_IB_READY); + } + dd->flags &= ~QIB_INITTED; + + /* mask interrupts, but not errors */ + dd->f_set_intr_state(dd, 0); + + for (pidx = 0; pidx < dd->num_pports; ++pidx) { + ppd = dd->pport + pidx; + dd->f_rcvctrl(ppd, QIB_RCVCTRL_TAILUPD_DIS | + QIB_RCVCTRL_CTXT_DIS | + QIB_RCVCTRL_INTRAVAIL_DIS | + QIB_RCVCTRL_PKEY_ENB, -1); + /* + * Gracefully stop all sends allowing any in progress to + * trickle out first. + */ + dd->f_sendctrl(ppd, QIB_SENDCTRL_CLEAR); + } + + /* + * Enough for anything that's going to trickle out to have actually + * done so. + */ + udelay(20); + + for (pidx = 0; pidx < dd->num_pports; ++pidx) { + ppd = dd->pport + pidx; + dd->f_setextled(ppd, 0); /* make sure LEDs are off */ + + if (dd->flags & QIB_HAS_SEND_DMA) + qib_teardown_sdma(ppd); + + dd->f_sendctrl(ppd, QIB_SENDCTRL_AVAIL_DIS | + QIB_SENDCTRL_SEND_DIS); + /* + * Clear SerdesEnable. + * We can't count on interrupts since we are stopping. 
+ */ + dd->f_quiet_serdes(ppd); + + if (ppd->qib_wq) { + destroy_workqueue(ppd->qib_wq); + ppd->qib_wq = NULL; + } + qib_free_pportdata(ppd); + } + +} + +/** + * qib_free_ctxtdata - free a context's allocated data + * @dd: the qlogic_ib device + * @rcd: the ctxtdata structure + * + * free up any allocated data for a context + * This should not touch anything that would affect a simultaneous + * re-allocation of context data, because it is called after qib_mutex + * is released (and can be called from reinit as well). + * It should never change any chip state, or global driver state. + */ +void qib_free_ctxtdata(struct qib_devdata *dd, struct qib_ctxtdata *rcd) +{ + if (!rcd) + return; + + if (rcd->rcvhdrq) { + dma_free_coherent(&dd->pcidev->dev, rcd->rcvhdrq_size, + rcd->rcvhdrq, rcd->rcvhdrq_phys); + rcd->rcvhdrq = NULL; + if (rcd->rcvhdrtail_kvaddr) { + dma_free_coherent(&dd->pcidev->dev, PAGE_SIZE, + rcd->rcvhdrtail_kvaddr, + rcd->rcvhdrqtailaddr_phys); + rcd->rcvhdrtail_kvaddr = NULL; + } + } + if (rcd->rcvegrbuf) { + unsigned e; + + for (e = 0; e < rcd->rcvegrbuf_chunks; e++) { + void *base = rcd->rcvegrbuf[e]; + size_t size = rcd->rcvegrbuf_size; + + dma_free_coherent(&dd->pcidev->dev, size, + base, rcd->rcvegrbuf_phys[e]); + } + kfree(rcd->rcvegrbuf); + rcd->rcvegrbuf = NULL; + kfree(rcd->rcvegrbuf_phys); + rcd->rcvegrbuf_phys = NULL; + rcd->rcvegrbuf_chunks = 0; + } + + kfree(rcd->tid_pg_list); + vfree(rcd->user_event_mask); + vfree(rcd->subctxt_uregbase); + vfree(rcd->subctxt_rcvegrbuf); + vfree(rcd->subctxt_rcvhdr_base); +#ifdef CONFIG_DEBUG_FS + kfree(rcd->opstats); + rcd->opstats = NULL; +#endif + kfree(rcd); +} + +/* + * Perform a PIO buffer bandwidth write test, to verify proper system + * configuration. Even when all the setup calls work, occasionally + * BIOS or other issues can prevent write combining from working, or + * can cause other bandwidth problems to the chip. + * + * This test simply writes the same buffer over and over again, and + * measures close to the peak bandwidth to the chip (not testing + * data bandwidth to the wire). On chips that use an address-based + * trigger to send packets to the wire, this is easy. On chips that + * use a count to trigger, we want to make sure that the packet doesn't + * go out on the wire, or trigger flow control checks. + */ +static void qib_verify_pioperf(struct qib_devdata *dd) +{ + u32 pbnum, cnt, lcnt; + u32 __iomem *piobuf; + u32 *addr; + u64 msecs, emsecs; + + piobuf = dd->f_getsendbuf(dd->pport, 0ULL, &pbnum); + if (!piobuf) { + qib_devinfo(dd->pcidev, + "No PIObufs for checking perf, skipping\n"); + return; + } + + /* + * Enough to give us a reasonable test, less than piobuf size, and + * likely multiple of store buffer length. + */ + cnt = 1024; + + addr = vmalloc(cnt); + if (!addr) { + qib_devinfo(dd->pcidev, + "Couldn't get memory for checking PIO perf, skipping\n"); + goto done; + } + + preempt_disable(); /* we want reasonably accurate elapsed time */ + msecs = 1 + jiffies_to_msecs(jiffies); + for (lcnt = 0; lcnt < 10000U; lcnt++) { + /* wait until we cross msec boundary */ + if (jiffies_to_msecs(jiffies) >= msecs) + break; + udelay(1); + } + + dd->f_set_armlaunch(dd, 0); + + /* + * length 0, no dwords actually sent + */ + writeq(0, piobuf); + qib_flush_wc(); + + /* + * This is only roughly accurate, since even with preempt we + * still take interrupts that could take a while. Running for + * >= 5 msec seems to get us "close enough" to accurate values. 
+ */ + msecs = jiffies_to_msecs(jiffies); + for (emsecs = lcnt = 0; emsecs <= 5UL; lcnt++) { + qib_pio_copy(piobuf + 64, addr, cnt >> 2); + emsecs = jiffies_to_msecs(jiffies) - msecs; + } + + /* 1 GiB/sec, slightly over IB SDR line rate */ + if (lcnt < (emsecs * 1024U)) + qib_dev_err(dd, + "Performance problem: bandwidth to PIO buffers is only %u MiB/sec\n", + lcnt / (u32) emsecs); + + preempt_enable(); + + vfree(addr); + +done: + /* disarm piobuf, so it's available again */ + dd->f_sendctrl(dd->pport, QIB_SENDCTRL_DISARM_BUF(pbnum)); + qib_sendbuf_done(dd, pbnum); + dd->f_set_armlaunch(dd, 1); +} + +void qib_free_devdata(struct qib_devdata *dd) +{ + unsigned long flags; + + spin_lock_irqsave(&qib_devs_lock, flags); + idr_remove(&qib_unit_table, dd->unit); + list_del(&dd->list); + spin_unlock_irqrestore(&qib_devs_lock, flags); + +#ifdef CONFIG_DEBUG_FS + qib_dbg_ibdev_exit(&dd->verbs_dev); +#endif + free_percpu(dd->int_counter); + ib_dealloc_device(&dd->verbs_dev.ibdev); +} + +u64 qib_int_counter(struct qib_devdata *dd) +{ + int cpu; + u64 int_counter = 0; + + for_each_possible_cpu(cpu) + int_counter += *per_cpu_ptr(dd->int_counter, cpu); + return int_counter; +} + +u64 qib_sps_ints(void) +{ + unsigned long flags; + struct qib_devdata *dd; + u64 sps_ints = 0; + + spin_lock_irqsave(&qib_devs_lock, flags); + list_for_each_entry(dd, &qib_dev_list, list) { + sps_ints += qib_int_counter(dd); + } + spin_unlock_irqrestore(&qib_devs_lock, flags); + return sps_ints; +} + +/* + * Allocate our primary per-unit data structure. Must be done via verbs + * allocator, because the verbs cleanup process both does cleanup and + * free of the data structure. + * "extra" is for chip-specific data. + * + * Use the idr mechanism to get a unit number for this unit. + */ +struct qib_devdata *qib_alloc_devdata(struct pci_dev *pdev, size_t extra) +{ + unsigned long flags; + struct qib_devdata *dd; + int ret; + + dd = (struct qib_devdata *) ib_alloc_device(sizeof(*dd) + extra); + if (!dd) + return ERR_PTR(-ENOMEM); + + INIT_LIST_HEAD(&dd->list); + + idr_preload(GFP_KERNEL); + spin_lock_irqsave(&qib_devs_lock, flags); + + ret = idr_alloc(&qib_unit_table, dd, 0, 0, GFP_NOWAIT); + if (ret >= 0) { + dd->unit = ret; + list_add(&dd->list, &qib_dev_list); + } + + spin_unlock_irqrestore(&qib_devs_lock, flags); + idr_preload_end(); + + if (ret < 0) { + qib_early_err(&pdev->dev, + "Could not allocate unit ID: error %d\n", -ret); + goto bail; + } + dd->int_counter = alloc_percpu(u64); + if (!dd->int_counter) { + ret = -ENOMEM; + qib_early_err(&pdev->dev, + "Could not allocate per-cpu int_counter\n"); + goto bail; + } + + if (!qib_cpulist_count) { + u32 count = num_online_cpus(); + + qib_cpulist = kzalloc(BITS_TO_LONGS(count) * + sizeof(long), GFP_KERNEL); + if (qib_cpulist) + qib_cpulist_count = count; + else + qib_early_err(&pdev->dev, + "Could not alloc cpulist info, cpu affinity might be wrong\n"); + } +#ifdef CONFIG_DEBUG_FS + qib_dbg_ibdev_init(&dd->verbs_dev); +#endif + return dd; +bail: + if (!list_empty(&dd->list)) + list_del_init(&dd->list); + ib_dealloc_device(&dd->verbs_dev.ibdev); + return ERR_PTR(ret); +} + +/* + * Called from freeze mode handlers, and from PCI error + * reporting code. Should be paranoid about state of + * system and data structures. 
+ */ +void qib_disable_after_error(struct qib_devdata *dd) +{ + if (dd->flags & QIB_INITTED) { + u32 pidx; + + dd->flags &= ~QIB_INITTED; + if (dd->pport) + for (pidx = 0; pidx < dd->num_pports; ++pidx) { + struct qib_pportdata *ppd; + + ppd = dd->pport + pidx; + if (dd->flags & QIB_PRESENT) { + qib_set_linkstate(ppd, + QIB_IB_LINKDOWN_DISABLE); + dd->f_setextled(ppd, 0); + } + *ppd->statusp &= ~QIB_STATUS_IB_READY; + } + } + + /* + * Mark as having had an error for driver, and also + * for /sys and status word mapped to user programs. + * This marks unit as not usable, until reset. + */ + if (dd->devstatusp) + *dd->devstatusp |= QIB_STATUS_HWERROR; +} + +static void qib_remove_one(struct pci_dev *); +static int qib_init_one(struct pci_dev *, const struct pci_device_id *); + +#define DRIVER_LOAD_MSG "Intel " QIB_DRV_NAME " loaded: " +#define PFX QIB_DRV_NAME ": " + +static const struct pci_device_id qib_pci_tbl[] = { + { PCI_DEVICE(PCI_VENDOR_ID_PATHSCALE, PCI_DEVICE_ID_QLOGIC_IB_6120) }, + { PCI_DEVICE(PCI_VENDOR_ID_QLOGIC, PCI_DEVICE_ID_QLOGIC_IB_7220) }, + { PCI_DEVICE(PCI_VENDOR_ID_QLOGIC, PCI_DEVICE_ID_QLOGIC_IB_7322) }, + { 0, } +}; + +MODULE_DEVICE_TABLE(pci, qib_pci_tbl); + +static struct pci_driver qib_driver = { + .name = QIB_DRV_NAME, + .probe = qib_init_one, + .remove = qib_remove_one, + .id_table = qib_pci_tbl, + .err_handler = &qib_pci_err_handler, +}; + +#ifdef CONFIG_INFINIBAND_QIB_DCA + +static int qib_notify_dca(struct notifier_block *, unsigned long, void *); +static struct notifier_block dca_notifier = { + .notifier_call = qib_notify_dca, + .next = NULL, + .priority = 0 +}; + +static int qib_notify_dca_device(struct device *device, void *data) +{ + struct qib_devdata *dd = dev_get_drvdata(device); + unsigned long event = *(unsigned long *)data; + + return dd->f_notify_dca(dd, event); +} + +static int qib_notify_dca(struct notifier_block *nb, unsigned long event, + void *p) +{ + int rval; + + rval = driver_for_each_device(&qib_driver.driver, NULL, + &event, qib_notify_dca_device); + return rval ? NOTIFY_BAD : NOTIFY_DONE; +} + +#endif + +/* + * Do all the generic driver unit- and chip-independent memory + * allocation and initialization. + */ +static int __init qib_ib_init(void) +{ + int ret; + + ret = qib_dev_init(); + if (ret) + goto bail; + + /* + * These must be called before the driver is registered with + * the PCI subsystem. + */ + idr_init(&qib_unit_table); + +#ifdef CONFIG_INFINIBAND_QIB_DCA + dca_register_notify(&dca_notifier); +#endif +#ifdef CONFIG_DEBUG_FS + qib_dbg_init(); +#endif + ret = pci_register_driver(&qib_driver); + if (ret < 0) { + pr_err("Unable to register driver: error %d\n", -ret); + goto bail_dev; + } + + /* not fatal if it doesn't work */ + if (qib_init_qibfs()) + pr_err("Unable to register ipathfs\n"); + goto bail; /* all OK */ + +bail_dev: +#ifdef CONFIG_INFINIBAND_QIB_DCA + dca_unregister_notify(&dca_notifier); +#endif +#ifdef CONFIG_DEBUG_FS + qib_dbg_exit(); +#endif + idr_destroy(&qib_unit_table); + qib_dev_cleanup(); +bail: + return ret; +} + +module_init(qib_ib_init); + +/* + * Do the non-unit driver cleanup, memory free, etc. at unload. 
+ */ +static void __exit qib_ib_cleanup(void) +{ + int ret; + + ret = qib_exit_qibfs(); + if (ret) + pr_err( + "Unable to cleanup counter filesystem: error %d\n", + -ret); + +#ifdef CONFIG_INFINIBAND_QIB_DCA + dca_unregister_notify(&dca_notifier); +#endif + pci_unregister_driver(&qib_driver); +#ifdef CONFIG_DEBUG_FS + qib_dbg_exit(); +#endif + + qib_cpulist_count = 0; + kfree(qib_cpulist); + + idr_destroy(&qib_unit_table); + qib_dev_cleanup(); +} + +module_exit(qib_ib_cleanup); + +/* this can only be called after a successful initialization */ +static void cleanup_device_data(struct qib_devdata *dd) +{ + int ctxt; + int pidx; + struct qib_ctxtdata **tmp; + unsigned long flags; + + /* users can't do anything more with chip */ + for (pidx = 0; pidx < dd->num_pports; ++pidx) { + if (dd->pport[pidx].statusp) + *dd->pport[pidx].statusp &= ~QIB_STATUS_CHIP_PRESENT; + + spin_lock(&dd->pport[pidx].cc_shadow_lock); + + kfree(dd->pport[pidx].congestion_entries); + dd->pport[pidx].congestion_entries = NULL; + kfree(dd->pport[pidx].ccti_entries); + dd->pport[pidx].ccti_entries = NULL; + kfree(dd->pport[pidx].ccti_entries_shadow); + dd->pport[pidx].ccti_entries_shadow = NULL; + kfree(dd->pport[pidx].congestion_entries_shadow); + dd->pport[pidx].congestion_entries_shadow = NULL; + + spin_unlock(&dd->pport[pidx].cc_shadow_lock); + } + + qib_disable_wc(dd); + + if (dd->pioavailregs_dma) { + dma_free_coherent(&dd->pcidev->dev, PAGE_SIZE, + (void *) dd->pioavailregs_dma, + dd->pioavailregs_phys); + dd->pioavailregs_dma = NULL; + } + + if (dd->pageshadow) { + struct page **tmpp = dd->pageshadow; + dma_addr_t *tmpd = dd->physshadow; + int i; + + for (ctxt = 0; ctxt < dd->cfgctxts; ctxt++) { + int ctxt_tidbase = ctxt * dd->rcvtidcnt; + int maxtid = ctxt_tidbase + dd->rcvtidcnt; + + for (i = ctxt_tidbase; i < maxtid; i++) { + if (!tmpp[i]) + continue; + pci_unmap_page(dd->pcidev, tmpd[i], + PAGE_SIZE, PCI_DMA_FROMDEVICE); + qib_release_user_pages(&tmpp[i], 1); + tmpp[i] = NULL; + } + } + + dd->pageshadow = NULL; + vfree(tmpp); + dd->physshadow = NULL; + vfree(tmpd); + } + + /* + * Free any resources still in use (usually just kernel contexts) + * at unload; we do for ctxtcnt, because that's what we allocate. + * We acquire lock to be really paranoid that rcd isn't being + * accessed from some interrupt-related code (that should not happen, + * but best to be sure). + */ + spin_lock_irqsave(&dd->uctxt_lock, flags); + tmp = dd->rcd; + dd->rcd = NULL; + spin_unlock_irqrestore(&dd->uctxt_lock, flags); + for (ctxt = 0; tmp && ctxt < dd->ctxtcnt; ctxt++) { + struct qib_ctxtdata *rcd = tmp[ctxt]; + + tmp[ctxt] = NULL; /* debugging paranoia */ + qib_free_ctxtdata(dd, rcd); + } + kfree(tmp); + kfree(dd->boardname); + qib_cq_exit(dd); +} + +/* + * Clean up on unit shutdown, or error during unit load after + * successful initialization. + */ +static void qib_postinit_cleanup(struct qib_devdata *dd) +{ + /* + * Clean up chip-specific stuff. + * We check for NULL here, because it's outside + * the kregbase check, and we need to call it + * after the free_irq. Thus it's possible that + * the function pointers were never initialized. 
+ */ + if (dd->f_cleanup) + dd->f_cleanup(dd); + + qib_pcie_ddcleanup(dd); + + cleanup_device_data(dd); + + qib_free_devdata(dd); +} + +static int qib_init_one(struct pci_dev *pdev, const struct pci_device_id *ent) +{ + int ret, j, pidx, initfail; + struct qib_devdata *dd = NULL; + + ret = qib_pcie_init(pdev, ent); + if (ret) + goto bail; + + /* + * Do device-specific initialiation, function table setup, dd + * allocation, etc. + */ + switch (ent->device) { + case PCI_DEVICE_ID_QLOGIC_IB_6120: +#ifdef CONFIG_PCI_MSI + dd = qib_init_iba6120_funcs(pdev, ent); +#else + qib_early_err(&pdev->dev, + "Intel PCIE device 0x%x cannot work if CONFIG_PCI_MSI is not enabled\n", + ent->device); + dd = ERR_PTR(-ENODEV); +#endif + break; + + case PCI_DEVICE_ID_QLOGIC_IB_7220: + dd = qib_init_iba7220_funcs(pdev, ent); + break; + + case PCI_DEVICE_ID_QLOGIC_IB_7322: + dd = qib_init_iba7322_funcs(pdev, ent); + break; + + default: + qib_early_err(&pdev->dev, + "Failing on unknown Intel deviceid 0x%x\n", + ent->device); + ret = -ENODEV; + } + + if (IS_ERR(dd)) + ret = PTR_ERR(dd); + if (ret) + goto bail; /* error already printed */ + + ret = qib_create_workqueues(dd); + if (ret) + goto bail; + + /* do the generic initialization */ + initfail = qib_init(dd, 0); + + ret = qib_register_ib_device(dd); + + /* + * Now ready for use. this should be cleared whenever we + * detect a reset, or initiate one. If earlier failure, + * we still create devices, so diags, etc. can be used + * to determine cause of problem. + */ + if (!qib_mini_init && !initfail && !ret) + dd->flags |= QIB_INITTED; + + j = qib_device_create(dd); + if (j) + qib_dev_err(dd, "Failed to create /dev devices: %d\n", -j); + j = qibfs_add(dd); + if (j) + qib_dev_err(dd, "Failed filesystem setup for counters: %d\n", + -j); + + if (qib_mini_init || initfail || ret) { + qib_stop_timers(dd); + flush_workqueue(ib_wq); + for (pidx = 0; pidx < dd->num_pports; ++pidx) + dd->f_quiet_serdes(dd->pport + pidx); + if (qib_mini_init) + goto bail; + if (!j) { + (void) qibfs_remove(dd); + qib_device_remove(dd); + } + if (!ret) + qib_unregister_ib_device(dd); + qib_postinit_cleanup(dd); + if (initfail) + ret = initfail; + goto bail; + } + + ret = qib_enable_wc(dd); + if (ret) { + qib_dev_err(dd, + "Write combining not enabled (err %d): performance may be poor\n", + -ret); + ret = 0; + } + + qib_verify_pioperf(dd); +bail: + return ret; +} + +static void qib_remove_one(struct pci_dev *pdev) +{ + struct qib_devdata *dd = pci_get_drvdata(pdev); + int ret; + + /* unregister from IB core */ + qib_unregister_ib_device(dd); + + /* + * Disable the IB link, disable interrupts on the device, + * clear dma engines, etc. + */ + if (!qib_mini_init) + qib_shutdown_device(dd); + + qib_stop_timers(dd); + + /* wait until all of our (qsfp) queue_work() calls complete */ + flush_workqueue(ib_wq); + + ret = qibfs_remove(dd); + if (ret) + qib_dev_err(dd, "Failed counters filesystem cleanup: %d\n", + -ret); + + qib_device_remove(dd); + + qib_postinit_cleanup(dd); +} + +/** + * qib_create_rcvhdrq - create a receive header queue + * @dd: the qlogic_ib device + * @rcd: the context data + * + * This must be contiguous memory (from an i/o perspective), and must be + * DMA'able (which means for some systems, it will go through an IOMMU, + * or be forced into a low address range). 
+ */ +int qib_create_rcvhdrq(struct qib_devdata *dd, struct qib_ctxtdata *rcd) +{ + unsigned amt; + int old_node_id; + + if (!rcd->rcvhdrq) { + dma_addr_t phys_hdrqtail; + gfp_t gfp_flags; + + amt = ALIGN(dd->rcvhdrcnt * dd->rcvhdrentsize * + sizeof(u32), PAGE_SIZE); + gfp_flags = (rcd->ctxt >= dd->first_user_ctxt) ? + GFP_USER : GFP_KERNEL; + + old_node_id = dev_to_node(&dd->pcidev->dev); + set_dev_node(&dd->pcidev->dev, rcd->node_id); + rcd->rcvhdrq = dma_alloc_coherent( + &dd->pcidev->dev, amt, &rcd->rcvhdrq_phys, + gfp_flags | __GFP_COMP); + set_dev_node(&dd->pcidev->dev, old_node_id); + + if (!rcd->rcvhdrq) { + qib_dev_err(dd, + "attempt to allocate %d bytes for ctxt %u rcvhdrq failed\n", + amt, rcd->ctxt); + goto bail; + } + + if (rcd->ctxt >= dd->first_user_ctxt) { + rcd->user_event_mask = vmalloc_user(PAGE_SIZE); + if (!rcd->user_event_mask) + goto bail_free_hdrq; + } + + if (!(dd->flags & QIB_NODMA_RTAIL)) { + set_dev_node(&dd->pcidev->dev, rcd->node_id); + rcd->rcvhdrtail_kvaddr = dma_alloc_coherent( + &dd->pcidev->dev, PAGE_SIZE, &phys_hdrqtail, + gfp_flags); + set_dev_node(&dd->pcidev->dev, old_node_id); + if (!rcd->rcvhdrtail_kvaddr) + goto bail_free; + rcd->rcvhdrqtailaddr_phys = phys_hdrqtail; + } + + rcd->rcvhdrq_size = amt; + } + + /* clear for security and sanity on each use */ + memset(rcd->rcvhdrq, 0, rcd->rcvhdrq_size); + if (rcd->rcvhdrtail_kvaddr) + memset(rcd->rcvhdrtail_kvaddr, 0, PAGE_SIZE); + return 0; + +bail_free: + qib_dev_err(dd, + "attempt to allocate 1 page for ctxt %u rcvhdrqtailaddr failed\n", + rcd->ctxt); + vfree(rcd->user_event_mask); + rcd->user_event_mask = NULL; +bail_free_hdrq: + dma_free_coherent(&dd->pcidev->dev, amt, rcd->rcvhdrq, + rcd->rcvhdrq_phys); + rcd->rcvhdrq = NULL; +bail: + return -ENOMEM; +} + +/** + * allocate eager buffers, both kernel and user contexts. + * @rcd: the context we are setting up. + * + * Allocate the eager TID buffers and program them into hip. + * They are no longer completely contiguous, we do multiple allocation + * calls. Otherwise we get the OOM code involved, by asking for too + * much per call, with disastrous results on some kernels. + */ +int qib_setup_eagerbufs(struct qib_ctxtdata *rcd) +{ + struct qib_devdata *dd = rcd->dd; + unsigned e, egrcnt, egrperchunk, chunk, egrsize, egroff; + size_t size; + gfp_t gfp_flags; + int old_node_id; + + /* + * GFP_USER, but without GFP_FS, so buffer cache can be + * coalesced (we hope); otherwise, even at order 4, + * heavy filesystem activity makes these fail, and we can + * use compound pages. 
+ */ + gfp_flags = __GFP_WAIT | __GFP_IO | __GFP_COMP; + + egrcnt = rcd->rcvegrcnt; + egroff = rcd->rcvegr_tid_base; + egrsize = dd->rcvegrbufsize; + + chunk = rcd->rcvegrbuf_chunks; + egrperchunk = rcd->rcvegrbufs_perchunk; + size = rcd->rcvegrbuf_size; + if (!rcd->rcvegrbuf) { + rcd->rcvegrbuf = + kzalloc_node(chunk * sizeof(rcd->rcvegrbuf[0]), + GFP_KERNEL, rcd->node_id); + if (!rcd->rcvegrbuf) + goto bail; + } + if (!rcd->rcvegrbuf_phys) { + rcd->rcvegrbuf_phys = + kmalloc_node(chunk * sizeof(rcd->rcvegrbuf_phys[0]), + GFP_KERNEL, rcd->node_id); + if (!rcd->rcvegrbuf_phys) + goto bail_rcvegrbuf; + } + for (e = 0; e < rcd->rcvegrbuf_chunks; e++) { + if (rcd->rcvegrbuf[e]) + continue; + + old_node_id = dev_to_node(&dd->pcidev->dev); + set_dev_node(&dd->pcidev->dev, rcd->node_id); + rcd->rcvegrbuf[e] = + dma_alloc_coherent(&dd->pcidev->dev, size, + &rcd->rcvegrbuf_phys[e], + gfp_flags); + set_dev_node(&dd->pcidev->dev, old_node_id); + if (!rcd->rcvegrbuf[e]) + goto bail_rcvegrbuf_phys; + } + + rcd->rcvegr_phys = rcd->rcvegrbuf_phys[0]; + + for (e = chunk = 0; chunk < rcd->rcvegrbuf_chunks; chunk++) { + dma_addr_t pa = rcd->rcvegrbuf_phys[chunk]; + unsigned i; + + /* clear for security and sanity on each use */ + memset(rcd->rcvegrbuf[chunk], 0, size); + + for (i = 0; e < egrcnt && i < egrperchunk; e++, i++) { + dd->f_put_tid(dd, e + egroff + + (u64 __iomem *) + ((char __iomem *) + dd->kregbase + + dd->rcvegrbase), + RCVHQ_RCV_TYPE_EAGER, pa); + pa += egrsize; + } + cond_resched(); /* don't hog the cpu */ + } + + return 0; + +bail_rcvegrbuf_phys: + for (e = 0; e < rcd->rcvegrbuf_chunks && rcd->rcvegrbuf[e]; e++) + dma_free_coherent(&dd->pcidev->dev, size, + rcd->rcvegrbuf[e], rcd->rcvegrbuf_phys[e]); + kfree(rcd->rcvegrbuf_phys); + rcd->rcvegrbuf_phys = NULL; +bail_rcvegrbuf: + kfree(rcd->rcvegrbuf); + rcd->rcvegrbuf = NULL; +bail: + return -ENOMEM; +} + +/* + * Note: Changes to this routine should be mirrored + * for the diagnostics routine qib_remap_ioaddr32(). + * There is also related code for VL15 buffers in qib_init_7322_variables(). + * The teardown code that unmaps is in qib_pcie_ddcleanup() + */ +int init_chip_wc_pat(struct qib_devdata *dd, u32 vl15buflen) +{ + u64 __iomem *qib_kregbase = NULL; + void __iomem *qib_piobase = NULL; + u64 __iomem *qib_userbase = NULL; + u64 qib_kreglen; + u64 qib_pio2koffset = dd->piobufbase & 0xffffffff; + u64 qib_pio4koffset = dd->piobufbase >> 32; + u64 qib_pio2klen = dd->piobcnt2k * dd->palign; + u64 qib_pio4klen = dd->piobcnt4k * dd->align4k; + u64 qib_physaddr = dd->physaddr; + u64 qib_piolen; + u64 qib_userlen = 0; + + /* + * Free the old mapping because the kernel will try to reuse the + * old mapping and not create a new mapping with the + * write combining attribute. 
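+ *
+ * The register space is then remapped uncached with ioremap_nocache(),
+ * while the PIO buffers get their own write-combining mapping from
+ * ioremap_wc().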
+ */ + iounmap(dd->kregbase); + dd->kregbase = NULL; + + /* + * Assumes chip address space looks like: + * - kregs + sregs + cregs + uregs (in any order) + * - piobufs (2K and 4K bufs in either order) + * or: + * - kregs + sregs + cregs (in any order) + * - piobufs (2K and 4K bufs in either order) + * - uregs + */ + if (dd->piobcnt4k == 0) { + qib_kreglen = qib_pio2koffset; + qib_piolen = qib_pio2klen; + } else if (qib_pio2koffset < qib_pio4koffset) { + qib_kreglen = qib_pio2koffset; + qib_piolen = qib_pio4koffset + qib_pio4klen - qib_kreglen; + } else { + qib_kreglen = qib_pio4koffset; + qib_piolen = qib_pio2koffset + qib_pio2klen - qib_kreglen; + } + qib_piolen += vl15buflen; + /* Map just the configured ports (not all hw ports) */ + if (dd->uregbase > qib_kreglen) + qib_userlen = dd->ureg_align * dd->cfgctxts; + + /* Sanity checks passed, now create the new mappings */ + qib_kregbase = ioremap_nocache(qib_physaddr, qib_kreglen); + if (!qib_kregbase) + goto bail; + + qib_piobase = ioremap_wc(qib_physaddr + qib_kreglen, qib_piolen); + if (!qib_piobase) + goto bail_kregbase; + + if (qib_userlen) { + qib_userbase = ioremap_nocache(qib_physaddr + dd->uregbase, + qib_userlen); + if (!qib_userbase) + goto bail_piobase; + } + + dd->kregbase = qib_kregbase; + dd->kregend = (u64 __iomem *) + ((char __iomem *) qib_kregbase + qib_kreglen); + dd->piobase = qib_piobase; + dd->pio2kbase = (void __iomem *) + (((char __iomem *) dd->piobase) + + qib_pio2koffset - qib_kreglen); + if (dd->piobcnt4k) + dd->pio4kbase = (void __iomem *) + (((char __iomem *) dd->piobase) + + qib_pio4koffset - qib_kreglen); + if (qib_userlen) + /* ureg will now be accessed relative to dd->userbase */ + dd->userbase = qib_userbase; + return 0; + +bail_piobase: + iounmap(qib_piobase); +bail_kregbase: + iounmap(qib_kregbase); +bail: + return -ENOMEM; +} diff --git a/kernel/drivers/infiniband/hw/qib/qib_intr.c b/kernel/drivers/infiniband/hw/qib/qib_intr.c new file mode 100644 index 000000000..086616d07 --- /dev/null +++ b/kernel/drivers/infiniband/hw/qib/qib_intr.c @@ -0,0 +1,240 @@ +/* + * Copyright (c) 2006, 2007, 2008, 2009, 2010 QLogic Corporation. + * All rights reserved. + * Copyright (c) 2003, 2004, 2005, 2006 PathScale, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include +#include + +#include "qib.h" +#include "qib_common.h" + +/** + * qib_format_hwmsg - format a single hwerror message + * @msg message buffer + * @msgl length of message buffer + * @hwmsg message to add to message buffer + */ +static void qib_format_hwmsg(char *msg, size_t msgl, const char *hwmsg) +{ + strlcat(msg, "[", msgl); + strlcat(msg, hwmsg, msgl); + strlcat(msg, "]", msgl); +} + +/** + * qib_format_hwerrors - format hardware error messages for display + * @hwerrs hardware errors bit vector + * @hwerrmsgs hardware error descriptions + * @nhwerrmsgs number of hwerrmsgs + * @msg message buffer + * @msgl message buffer length + */ +void qib_format_hwerrors(u64 hwerrs, const struct qib_hwerror_msgs *hwerrmsgs, + size_t nhwerrmsgs, char *msg, size_t msgl) +{ + int i; + + for (i = 0; i < nhwerrmsgs; i++) + if (hwerrs & hwerrmsgs[i].mask) + qib_format_hwmsg(msg, msgl, hwerrmsgs[i].msg); +} + +static void signal_ib_event(struct qib_pportdata *ppd, enum ib_event_type ev) +{ + struct ib_event event; + struct qib_devdata *dd = ppd->dd; + + event.device = &dd->verbs_dev.ibdev; + event.element.port_num = ppd->port; + event.event = ev; + ib_dispatch_event(&event); +} + +void qib_handle_e_ibstatuschanged(struct qib_pportdata *ppd, u64 ibcs) +{ + struct qib_devdata *dd = ppd->dd; + unsigned long flags; + u32 lstate; + u8 ltstate; + enum ib_event_type ev = 0; + + lstate = dd->f_iblink_state(ibcs); /* linkstate */ + ltstate = dd->f_ibphys_portstate(ibcs); + + /* + * If linkstate transitions into INIT from any of the various down + * states, or if it transitions from any of the up (INIT or better) + * states into any of the down states (except link recovery), then + * call the chip-specific code to take appropriate actions. + * + * ppd->lflags could be 0 if this is the first time the interrupt + * handlers has been called but the link is already up. 
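+ *
+ * The rest of the handler updates ppd->lflags under lflags_lock,
+ * dispatches IB_EVENT_PORT_ERR or IB_EVENT_PORT_ACTIVE as appropriate,
+ * and, when the link goes active on hardware with send DMA, kicks the
+ * sdma engine with the e30_go_running event.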
+ */ + if (lstate >= IB_PORT_INIT && + (!ppd->lflags || (ppd->lflags & QIBL_LINKDOWN)) && + ltstate == IB_PHYSPORTSTATE_LINKUP) { + /* transitioned to UP */ + if (dd->f_ib_updown(ppd, 1, ibcs)) + goto skip_ibchange; /* chip-code handled */ + } else if (ppd->lflags & (QIBL_LINKINIT | QIBL_LINKARMED | + QIBL_LINKACTIVE | QIBL_IB_FORCE_NOTIFY)) { + if (ltstate != IB_PHYSPORTSTATE_LINKUP && + ltstate <= IB_PHYSPORTSTATE_CFG_TRAIN && + dd->f_ib_updown(ppd, 0, ibcs)) + goto skip_ibchange; /* chip-code handled */ + qib_set_uevent_bits(ppd, _QIB_EVENT_LINKDOWN_BIT); + } + + if (lstate != IB_PORT_DOWN) { + /* lstate is INIT, ARMED, or ACTIVE */ + if (lstate != IB_PORT_ACTIVE) { + *ppd->statusp &= ~QIB_STATUS_IB_READY; + if (ppd->lflags & QIBL_LINKACTIVE) + ev = IB_EVENT_PORT_ERR; + spin_lock_irqsave(&ppd->lflags_lock, flags); + if (lstate == IB_PORT_ARMED) { + ppd->lflags |= QIBL_LINKARMED | QIBL_LINKV; + ppd->lflags &= ~(QIBL_LINKINIT | + QIBL_LINKDOWN | QIBL_LINKACTIVE); + } else { + ppd->lflags |= QIBL_LINKINIT | QIBL_LINKV; + ppd->lflags &= ~(QIBL_LINKARMED | + QIBL_LINKDOWN | QIBL_LINKACTIVE); + } + spin_unlock_irqrestore(&ppd->lflags_lock, flags); + /* start a 75msec timer to clear symbol errors */ + mod_timer(&ppd->symerr_clear_timer, + msecs_to_jiffies(75)); + } else if (ltstate == IB_PHYSPORTSTATE_LINKUP && + !(ppd->lflags & QIBL_LINKACTIVE)) { + /* active, but not active defered */ + qib_hol_up(ppd); /* useful only for 6120 now */ + *ppd->statusp |= + QIB_STATUS_IB_READY | QIB_STATUS_IB_CONF; + qib_clear_symerror_on_linkup((unsigned long)ppd); + spin_lock_irqsave(&ppd->lflags_lock, flags); + ppd->lflags |= QIBL_LINKACTIVE | QIBL_LINKV; + ppd->lflags &= ~(QIBL_LINKINIT | + QIBL_LINKDOWN | QIBL_LINKARMED); + spin_unlock_irqrestore(&ppd->lflags_lock, flags); + if (dd->flags & QIB_HAS_SEND_DMA) + qib_sdma_process_event(ppd, + qib_sdma_event_e30_go_running); + ev = IB_EVENT_PORT_ACTIVE; + dd->f_setextled(ppd, 1); + } + } else { /* down */ + if (ppd->lflags & QIBL_LINKACTIVE) + ev = IB_EVENT_PORT_ERR; + spin_lock_irqsave(&ppd->lflags_lock, flags); + ppd->lflags |= QIBL_LINKDOWN | QIBL_LINKV; + ppd->lflags &= ~(QIBL_LINKINIT | + QIBL_LINKACTIVE | QIBL_LINKARMED); + spin_unlock_irqrestore(&ppd->lflags_lock, flags); + *ppd->statusp &= ~QIB_STATUS_IB_READY; + } + +skip_ibchange: + ppd->lastibcstat = ibcs; + if (ev) + signal_ib_event(ppd, ev); +} + +void qib_clear_symerror_on_linkup(unsigned long opaque) +{ + struct qib_pportdata *ppd = (struct qib_pportdata *)opaque; + + if (ppd->lflags & QIBL_LINKACTIVE) + return; + + ppd->ibport_data.z_symbol_error_counter = + ppd->dd->f_portcntr(ppd, QIBPORTCNTR_IBSYMBOLERR); +} + +/* + * Handle receive interrupts for user ctxts; this means a user + * process was waiting for a packet to arrive, and didn't want + * to poll. 
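+ *
+ * ctxtr is a bitmask with one bit per context.  For each set bit we wake
+ * any sleeper on that context's wait queue; for a plain receive wait we
+ * also turn off further "interrupt available" notifications, while an
+ * urgent-packet wait just bumps the urgent count.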
+ */ +void qib_handle_urcv(struct qib_devdata *dd, u64 ctxtr) +{ + struct qib_ctxtdata *rcd; + unsigned long flags; + int i; + + spin_lock_irqsave(&dd->uctxt_lock, flags); + for (i = dd->first_user_ctxt; dd->rcd && i < dd->cfgctxts; i++) { + if (!(ctxtr & (1ULL << i))) + continue; + rcd = dd->rcd[i]; + if (!rcd || !rcd->cnt) + continue; + + if (test_and_clear_bit(QIB_CTXT_WAITING_RCV, &rcd->flag)) { + wake_up_interruptible(&rcd->wait); + dd->f_rcvctrl(rcd->ppd, QIB_RCVCTRL_INTRAVAIL_DIS, + rcd->ctxt); + } else if (test_and_clear_bit(QIB_CTXT_WAITING_URG, + &rcd->flag)) { + rcd->urgent++; + wake_up_interruptible(&rcd->wait); + } + } + spin_unlock_irqrestore(&dd->uctxt_lock, flags); +} + +void qib_bad_intrstatus(struct qib_devdata *dd) +{ + static int allbits; + + /* separate routine, for better optimization of qib_intr() */ + + /* + * We print the message and disable interrupts, in hope of + * having a better chance of debugging the problem. + */ + qib_dev_err(dd, + "Read of chip interrupt status failed disabling interrupts\n"); + if (allbits++) { + /* disable interrupt delivery, something is very wrong */ + if (allbits == 2) + dd->f_set_intr_state(dd, 0); + if (allbits == 3) { + qib_dev_err(dd, + "2nd bad interrupt status, unregistering interrupts\n"); + dd->flags |= QIB_BADINTR; + dd->flags &= ~QIB_INITTED; + dd->f_free_irq(dd); + } + } +} diff --git a/kernel/drivers/infiniband/hw/qib/qib_keys.c b/kernel/drivers/infiniband/hw/qib/qib_keys.c new file mode 100644 index 000000000..ad843c786 --- /dev/null +++ b/kernel/drivers/infiniband/hw/qib/qib_keys.c @@ -0,0 +1,387 @@ +/* + * Copyright (c) 2006, 2007, 2009 QLogic Corporation. All rights reserved. + * Copyright (c) 2005, 2006 PathScale, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "qib.h" + +/** + * qib_alloc_lkey - allocate an lkey + * @mr: memory region that this lkey protects + * @dma_region: 0->normal key, 1->restricted DMA key + * + * Returns 0 if successful, otherwise returns -errno. + * + * Increments mr reference count as required. + * + * Sets the lkey field mr for non-dma regions. 
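+ * The generated lkey encodes the table slot in its top bits and a
+ * per-table generation count starting at bit 8; it is never zero, since
+ * zero is reserved for the unrestricted (DMA) case.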
+ * + */ + +int qib_alloc_lkey(struct qib_mregion *mr, int dma_region) +{ + unsigned long flags; + u32 r; + u32 n; + int ret = 0; + struct qib_ibdev *dev = to_idev(mr->pd->device); + struct qib_lkey_table *rkt = &dev->lk_table; + + spin_lock_irqsave(&rkt->lock, flags); + + /* special case for dma_mr lkey == 0 */ + if (dma_region) { + struct qib_mregion *tmr; + + tmr = rcu_access_pointer(dev->dma_mr); + if (!tmr) { + qib_get_mr(mr); + rcu_assign_pointer(dev->dma_mr, mr); + mr->lkey_published = 1; + } + goto success; + } + + /* Find the next available LKEY */ + r = rkt->next; + n = r; + for (;;) { + if (rkt->table[r] == NULL) + break; + r = (r + 1) & (rkt->max - 1); + if (r == n) + goto bail; + } + rkt->next = (r + 1) & (rkt->max - 1); + /* + * Make sure lkey is never zero which is reserved to indicate an + * unrestricted LKEY. + */ + rkt->gen++; + mr->lkey = (r << (32 - ib_qib_lkey_table_size)) | + ((((1 << (24 - ib_qib_lkey_table_size)) - 1) & rkt->gen) + << 8); + if (mr->lkey == 0) { + mr->lkey |= 1 << 8; + rkt->gen++; + } + qib_get_mr(mr); + rcu_assign_pointer(rkt->table[r], mr); + mr->lkey_published = 1; +success: + spin_unlock_irqrestore(&rkt->lock, flags); +out: + return ret; +bail: + spin_unlock_irqrestore(&rkt->lock, flags); + ret = -ENOMEM; + goto out; +} + +/** + * qib_free_lkey - free an lkey + * @mr: mr to free from tables + */ +void qib_free_lkey(struct qib_mregion *mr) +{ + unsigned long flags; + u32 lkey = mr->lkey; + u32 r; + struct qib_ibdev *dev = to_idev(mr->pd->device); + struct qib_lkey_table *rkt = &dev->lk_table; + + spin_lock_irqsave(&rkt->lock, flags); + if (!mr->lkey_published) + goto out; + if (lkey == 0) + RCU_INIT_POINTER(dev->dma_mr, NULL); + else { + r = lkey >> (32 - ib_qib_lkey_table_size); + RCU_INIT_POINTER(rkt->table[r], NULL); + } + qib_put_mr(mr); + mr->lkey_published = 0; +out: + spin_unlock_irqrestore(&rkt->lock, flags); +} + +/** + * qib_lkey_ok - check IB SGE for validity and initialize + * @rkt: table containing lkey to check SGE against + * @pd: protection domain + * @isge: outgoing internal SGE + * @sge: SGE to check + * @acc: access flags + * + * Return 1 if valid and successful, otherwise returns 0. + * + * increments the reference count upon success + * + * Check the IB SGE for validity and initialize our internal version + * of it. + */ +int qib_lkey_ok(struct qib_lkey_table *rkt, struct qib_pd *pd, + struct qib_sge *isge, struct ib_sge *sge, int acc) +{ + struct qib_mregion *mr; + unsigned n, m; + size_t off; + + /* + * We use LKEY == zero for kernel virtual addresses + * (see qib_get_dma_mr and qib_dma.c). 
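+ *
+ * The table lookup runs under rcu_read_lock(), and the region's
+ * reference count is taken with atomic_inc_not_zero() before the read
+ * lock is dropped, so an MR that is concurrently being freed is never
+ * handed back to the caller.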
+ */ + rcu_read_lock(); + if (sge->lkey == 0) { + struct qib_ibdev *dev = to_idev(pd->ibpd.device); + + if (pd->user) + goto bail; + mr = rcu_dereference(dev->dma_mr); + if (!mr) + goto bail; + if (unlikely(!atomic_inc_not_zero(&mr->refcount))) + goto bail; + rcu_read_unlock(); + + isge->mr = mr; + isge->vaddr = (void *) sge->addr; + isge->length = sge->length; + isge->sge_length = sge->length; + isge->m = 0; + isge->n = 0; + goto ok; + } + mr = rcu_dereference( + rkt->table[(sge->lkey >> (32 - ib_qib_lkey_table_size))]); + if (unlikely(!mr || mr->lkey != sge->lkey || mr->pd != &pd->ibpd)) + goto bail; + + off = sge->addr - mr->user_base; + if (unlikely(sge->addr < mr->user_base || + off + sge->length > mr->length || + (mr->access_flags & acc) != acc)) + goto bail; + if (unlikely(!atomic_inc_not_zero(&mr->refcount))) + goto bail; + rcu_read_unlock(); + + off += mr->offset; + if (mr->page_shift) { + /* + page sizes are uniform power of 2 so no loop is necessary + entries_spanned_by_off is the number of times the loop below + would have executed. + */ + size_t entries_spanned_by_off; + + entries_spanned_by_off = off >> mr->page_shift; + off -= (entries_spanned_by_off << mr->page_shift); + m = entries_spanned_by_off/QIB_SEGSZ; + n = entries_spanned_by_off%QIB_SEGSZ; + } else { + m = 0; + n = 0; + while (off >= mr->map[m]->segs[n].length) { + off -= mr->map[m]->segs[n].length; + n++; + if (n >= QIB_SEGSZ) { + m++; + n = 0; + } + } + } + isge->mr = mr; + isge->vaddr = mr->map[m]->segs[n].vaddr + off; + isge->length = mr->map[m]->segs[n].length - off; + isge->sge_length = sge->length; + isge->m = m; + isge->n = n; +ok: + return 1; +bail: + rcu_read_unlock(); + return 0; +} + +/** + * qib_rkey_ok - check the IB virtual address, length, and RKEY + * @qp: qp for validation + * @sge: SGE state + * @len: length of data + * @vaddr: virtual address to place data + * @rkey: rkey to check + * @acc: access flags + * + * Return 1 if successful, otherwise 0. + * + * increments the reference count upon success + */ +int qib_rkey_ok(struct qib_qp *qp, struct qib_sge *sge, + u32 len, u64 vaddr, u32 rkey, int acc) +{ + struct qib_lkey_table *rkt = &to_idev(qp->ibqp.device)->lk_table; + struct qib_mregion *mr; + unsigned n, m; + size_t off; + + /* + * We use RKEY == zero for kernel virtual addresses + * (see qib_get_dma_mr and qib_dma.c). + */ + rcu_read_lock(); + if (rkey == 0) { + struct qib_pd *pd = to_ipd(qp->ibqp.pd); + struct qib_ibdev *dev = to_idev(pd->ibpd.device); + + if (pd->user) + goto bail; + mr = rcu_dereference(dev->dma_mr); + if (!mr) + goto bail; + if (unlikely(!atomic_inc_not_zero(&mr->refcount))) + goto bail; + rcu_read_unlock(); + + sge->mr = mr; + sge->vaddr = (void *) vaddr; + sge->length = len; + sge->sge_length = len; + sge->m = 0; + sge->n = 0; + goto ok; + } + + mr = rcu_dereference( + rkt->table[(rkey >> (32 - ib_qib_lkey_table_size))]); + if (unlikely(!mr || mr->lkey != rkey || qp->ibqp.pd != mr->pd)) + goto bail; + + off = vaddr - mr->iova; + if (unlikely(vaddr < mr->iova || off + len > mr->length || + (mr->access_flags & acc) == 0)) + goto bail; + if (unlikely(!atomic_inc_not_zero(&mr->refcount))) + goto bail; + rcu_read_unlock(); + + off += mr->offset; + if (mr->page_shift) { + /* + page sizes are uniform power of 2 so no loop is necessary + entries_spanned_by_off is the number of times the loop below + would have executed. 
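+ The result is split into m (the map chunk, entries / QIB_SEGSZ) and
+ n (the segment within that chunk, entries % QIB_SEGSZ).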
+ */ + size_t entries_spanned_by_off; + + entries_spanned_by_off = off >> mr->page_shift; + off -= (entries_spanned_by_off << mr->page_shift); + m = entries_spanned_by_off/QIB_SEGSZ; + n = entries_spanned_by_off%QIB_SEGSZ; + } else { + m = 0; + n = 0; + while (off >= mr->map[m]->segs[n].length) { + off -= mr->map[m]->segs[n].length; + n++; + if (n >= QIB_SEGSZ) { + m++; + n = 0; + } + } + } + sge->mr = mr; + sge->vaddr = mr->map[m]->segs[n].vaddr + off; + sge->length = mr->map[m]->segs[n].length - off; + sge->sge_length = len; + sge->m = m; + sge->n = n; +ok: + return 1; +bail: + rcu_read_unlock(); + return 0; +} + +/* + * Initialize the memory region specified by the work reqeust. + */ +int qib_fast_reg_mr(struct qib_qp *qp, struct ib_send_wr *wr) +{ + struct qib_lkey_table *rkt = &to_idev(qp->ibqp.device)->lk_table; + struct qib_pd *pd = to_ipd(qp->ibqp.pd); + struct qib_mregion *mr; + u32 rkey = wr->wr.fast_reg.rkey; + unsigned i, n, m; + int ret = -EINVAL; + unsigned long flags; + u64 *page_list; + size_t ps; + + spin_lock_irqsave(&rkt->lock, flags); + if (pd->user || rkey == 0) + goto bail; + + mr = rcu_dereference_protected( + rkt->table[(rkey >> (32 - ib_qib_lkey_table_size))], + lockdep_is_held(&rkt->lock)); + if (unlikely(mr == NULL || qp->ibqp.pd != mr->pd)) + goto bail; + + if (wr->wr.fast_reg.page_list_len > mr->max_segs) + goto bail; + + ps = 1UL << wr->wr.fast_reg.page_shift; + if (wr->wr.fast_reg.length > ps * wr->wr.fast_reg.page_list_len) + goto bail; + + mr->user_base = wr->wr.fast_reg.iova_start; + mr->iova = wr->wr.fast_reg.iova_start; + mr->lkey = rkey; + mr->length = wr->wr.fast_reg.length; + mr->access_flags = wr->wr.fast_reg.access_flags; + page_list = wr->wr.fast_reg.page_list->page_list; + m = 0; + n = 0; + for (i = 0; i < wr->wr.fast_reg.page_list_len; i++) { + mr->map[m]->segs[n].vaddr = (void *) page_list[i]; + mr->map[m]->segs[n].length = ps; + if (++n == QIB_SEGSZ) { + m++; + n = 0; + } + } + + ret = 0; +bail: + spin_unlock_irqrestore(&rkt->lock, flags); + return ret; +} diff --git a/kernel/drivers/infiniband/hw/qib/qib_mad.c b/kernel/drivers/infiniband/hw/qib/qib_mad.c new file mode 100644 index 000000000..395f4046d --- /dev/null +++ b/kernel/drivers/infiniband/hw/qib/qib_mad.c @@ -0,0 +1,2533 @@ +/* + * Copyright (c) 2012 Intel Corporation. All rights reserved. + * Copyright (c) 2006 - 2012 QLogic Corporation. All rights reserved. + * Copyright (c) 2005, 2006 PathScale, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include + +#include "qib.h" +#include "qib_mad.h" + +static int reply(struct ib_smp *smp) +{ + /* + * The verbs framework will handle the directed/LID route + * packet changes. + */ + smp->method = IB_MGMT_METHOD_GET_RESP; + if (smp->mgmt_class == IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE) + smp->status |= IB_SMP_DIRECTION; + return IB_MAD_RESULT_SUCCESS | IB_MAD_RESULT_REPLY; +} + +static int reply_failure(struct ib_smp *smp) +{ + /* + * The verbs framework will handle the directed/LID route + * packet changes. + */ + smp->method = IB_MGMT_METHOD_GET_RESP; + if (smp->mgmt_class == IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE) + smp->status |= IB_SMP_DIRECTION; + return IB_MAD_RESULT_FAILURE | IB_MAD_RESULT_REPLY; +} + +static void qib_send_trap(struct qib_ibport *ibp, void *data, unsigned len) +{ + struct ib_mad_send_buf *send_buf; + struct ib_mad_agent *agent; + struct ib_smp *smp; + int ret; + unsigned long flags; + unsigned long timeout; + + agent = ibp->send_agent; + if (!agent) + return; + + /* o14-3.2.1 */ + if (!(ppd_from_ibp(ibp)->lflags & QIBL_LINKACTIVE)) + return; + + /* o14-2 */ + if (ibp->trap_timeout && time_before(jiffies, ibp->trap_timeout)) + return; + + send_buf = ib_create_send_mad(agent, 0, 0, 0, IB_MGMT_MAD_HDR, + IB_MGMT_MAD_DATA, GFP_ATOMIC); + if (IS_ERR(send_buf)) + return; + + smp = send_buf->mad; + smp->base_version = IB_MGMT_BASE_VERSION; + smp->mgmt_class = IB_MGMT_CLASS_SUBN_LID_ROUTED; + smp->class_version = 1; + smp->method = IB_MGMT_METHOD_TRAP; + ibp->tid++; + smp->tid = cpu_to_be64(ibp->tid); + smp->attr_id = IB_SMP_ATTR_NOTICE; + /* o14-1: smp->mkey = 0; */ + memcpy(smp->data, data, len); + + spin_lock_irqsave(&ibp->lock, flags); + if (!ibp->sm_ah) { + if (ibp->sm_lid != be16_to_cpu(IB_LID_PERMISSIVE)) { + struct ib_ah *ah; + + ah = qib_create_qp0_ah(ibp, ibp->sm_lid); + if (IS_ERR(ah)) + ret = PTR_ERR(ah); + else { + send_buf->ah = ah; + ibp->sm_ah = to_iah(ah); + ret = 0; + } + } else + ret = -EINVAL; + } else { + send_buf->ah = &ibp->sm_ah->ibah; + ret = 0; + } + spin_unlock_irqrestore(&ibp->lock, flags); + + if (!ret) + ret = ib_post_send_mad(send_buf, NULL); + if (!ret) { + /* 4.096 usec. */ + timeout = (4096 * (1UL << ibp->subnet_timeout)) / 1000; + ibp->trap_timeout = jiffies + usecs_to_jiffies(timeout); + } else { + ib_free_send_mad(send_buf); + ibp->trap_timeout = 0; + } +} + +/* + * Send a bad [PQ]_Key trap (ch. 14.3.8). 
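+ * The trap is sent as a 257/258 security notice carrying the offending
+ * key, the SL, both QPNs and both LIDs; the port's P_Key or Q_Key
+ * violation counter is bumped as well.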
+ */ +void qib_bad_pqkey(struct qib_ibport *ibp, __be16 trap_num, u32 key, u32 sl, + u32 qp1, u32 qp2, __be16 lid1, __be16 lid2) +{ + struct ib_mad_notice_attr data; + + if (trap_num == IB_NOTICE_TRAP_BAD_PKEY) + ibp->pkey_violations++; + else + ibp->qkey_violations++; + ibp->n_pkt_drops++; + + /* Send violation trap */ + data.generic_type = IB_NOTICE_TYPE_SECURITY; + data.prod_type_msb = 0; + data.prod_type_lsb = IB_NOTICE_PROD_CA; + data.trap_num = trap_num; + data.issuer_lid = cpu_to_be16(ppd_from_ibp(ibp)->lid); + data.toggle_count = 0; + memset(&data.details, 0, sizeof(data.details)); + data.details.ntc_257_258.lid1 = lid1; + data.details.ntc_257_258.lid2 = lid2; + data.details.ntc_257_258.key = cpu_to_be32(key); + data.details.ntc_257_258.sl_qp1 = cpu_to_be32((sl << 28) | qp1); + data.details.ntc_257_258.qp2 = cpu_to_be32(qp2); + + qib_send_trap(ibp, &data, sizeof(data)); +} + +/* + * Send a bad M_Key trap (ch. 14.3.9). + */ +static void qib_bad_mkey(struct qib_ibport *ibp, struct ib_smp *smp) +{ + struct ib_mad_notice_attr data; + + /* Send violation trap */ + data.generic_type = IB_NOTICE_TYPE_SECURITY; + data.prod_type_msb = 0; + data.prod_type_lsb = IB_NOTICE_PROD_CA; + data.trap_num = IB_NOTICE_TRAP_BAD_MKEY; + data.issuer_lid = cpu_to_be16(ppd_from_ibp(ibp)->lid); + data.toggle_count = 0; + memset(&data.details, 0, sizeof(data.details)); + data.details.ntc_256.lid = data.issuer_lid; + data.details.ntc_256.method = smp->method; + data.details.ntc_256.attr_id = smp->attr_id; + data.details.ntc_256.attr_mod = smp->attr_mod; + data.details.ntc_256.mkey = smp->mkey; + if (smp->mgmt_class == IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE) { + u8 hop_cnt; + + data.details.ntc_256.dr_slid = smp->dr_slid; + data.details.ntc_256.dr_trunc_hop = IB_NOTICE_TRAP_DR_NOTICE; + hop_cnt = smp->hop_cnt; + if (hop_cnt > ARRAY_SIZE(data.details.ntc_256.dr_rtn_path)) { + data.details.ntc_256.dr_trunc_hop |= + IB_NOTICE_TRAP_DR_TRUNC; + hop_cnt = ARRAY_SIZE(data.details.ntc_256.dr_rtn_path); + } + data.details.ntc_256.dr_trunc_hop |= hop_cnt; + memcpy(data.details.ntc_256.dr_rtn_path, smp->return_path, + hop_cnt); + } + + qib_send_trap(ibp, &data, sizeof(data)); +} + +/* + * Send a Port Capability Mask Changed trap (ch. 14.3.11). + */ +void qib_cap_mask_chg(struct qib_ibport *ibp) +{ + struct ib_mad_notice_attr data; + + data.generic_type = IB_NOTICE_TYPE_INFO; + data.prod_type_msb = 0; + data.prod_type_lsb = IB_NOTICE_PROD_CA; + data.trap_num = IB_NOTICE_TRAP_CAP_MASK_CHG; + data.issuer_lid = cpu_to_be16(ppd_from_ibp(ibp)->lid); + data.toggle_count = 0; + memset(&data.details, 0, sizeof(data.details)); + data.details.ntc_144.lid = data.issuer_lid; + data.details.ntc_144.new_cap_mask = cpu_to_be32(ibp->port_cap_flags); + + qib_send_trap(ibp, &data, sizeof(data)); +} + +/* + * Send a System Image GUID Changed trap (ch. 14.3.12). + */ +void qib_sys_guid_chg(struct qib_ibport *ibp) +{ + struct ib_mad_notice_attr data; + + data.generic_type = IB_NOTICE_TYPE_INFO; + data.prod_type_msb = 0; + data.prod_type_lsb = IB_NOTICE_PROD_CA; + data.trap_num = IB_NOTICE_TRAP_SYS_GUID_CHG; + data.issuer_lid = cpu_to_be16(ppd_from_ibp(ibp)->lid); + data.toggle_count = 0; + memset(&data.details, 0, sizeof(data.details)); + data.details.ntc_145.lid = data.issuer_lid; + data.details.ntc_145.new_sys_guid = ib_qib_sys_image_guid; + + qib_send_trap(ibp, &data, sizeof(data)); +} + +/* + * Send a Node Description Changed trap (ch. 14.3.13). 
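+ * This reuses the notice 144 format: trap_num stays
+ * IB_NOTICE_TRAP_CAP_MASK_CHG, with local_changes set and
+ * IB_NOTICE_TRAP_NODE_DESC_CHG in the change flags.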
+ */ +void qib_node_desc_chg(struct qib_ibport *ibp) +{ + struct ib_mad_notice_attr data; + + data.generic_type = IB_NOTICE_TYPE_INFO; + data.prod_type_msb = 0; + data.prod_type_lsb = IB_NOTICE_PROD_CA; + data.trap_num = IB_NOTICE_TRAP_CAP_MASK_CHG; + data.issuer_lid = cpu_to_be16(ppd_from_ibp(ibp)->lid); + data.toggle_count = 0; + memset(&data.details, 0, sizeof(data.details)); + data.details.ntc_144.lid = data.issuer_lid; + data.details.ntc_144.local_changes = 1; + data.details.ntc_144.change_flags = IB_NOTICE_TRAP_NODE_DESC_CHG; + + qib_send_trap(ibp, &data, sizeof(data)); +} + +static int subn_get_nodedescription(struct ib_smp *smp, + struct ib_device *ibdev) +{ + if (smp->attr_mod) + smp->status |= IB_SMP_INVALID_FIELD; + + memcpy(smp->data, ibdev->node_desc, sizeof(smp->data)); + + return reply(smp); +} + +static int subn_get_nodeinfo(struct ib_smp *smp, struct ib_device *ibdev, + u8 port) +{ + struct ib_node_info *nip = (struct ib_node_info *)&smp->data; + struct qib_devdata *dd = dd_from_ibdev(ibdev); + u32 vendor, majrev, minrev; + unsigned pidx = port - 1; /* IB number port from 1, hdw from 0 */ + + /* GUID 0 is illegal */ + if (smp->attr_mod || pidx >= dd->num_pports || + dd->pport[pidx].guid == 0) + smp->status |= IB_SMP_INVALID_FIELD; + else + nip->port_guid = dd->pport[pidx].guid; + + nip->base_version = 1; + nip->class_version = 1; + nip->node_type = 1; /* channel adapter */ + nip->num_ports = ibdev->phys_port_cnt; + /* This is already in network order */ + nip->sys_guid = ib_qib_sys_image_guid; + nip->node_guid = dd->pport->guid; /* Use first-port GUID as node */ + nip->partition_cap = cpu_to_be16(qib_get_npkeys(dd)); + nip->device_id = cpu_to_be16(dd->deviceid); + majrev = dd->majrev; + minrev = dd->minrev; + nip->revision = cpu_to_be32((majrev << 16) | minrev); + nip->local_port_num = port; + vendor = dd->vendorid; + nip->vendor_id[0] = QIB_SRC_OUI_1; + nip->vendor_id[1] = QIB_SRC_OUI_2; + nip->vendor_id[2] = QIB_SRC_OUI_3; + + return reply(smp); +} + +static int subn_get_guidinfo(struct ib_smp *smp, struct ib_device *ibdev, + u8 port) +{ + struct qib_devdata *dd = dd_from_ibdev(ibdev); + u32 startgx = 8 * be32_to_cpu(smp->attr_mod); + __be64 *p = (__be64 *) smp->data; + unsigned pidx = port - 1; /* IB number port from 1, hdw from 0 */ + + /* 32 blocks of 8 64-bit GUIDs per block */ + + memset(smp->data, 0, sizeof(smp->data)); + + if (startgx == 0 && pidx < dd->num_pports) { + struct qib_pportdata *ppd = dd->pport + pidx; + struct qib_ibport *ibp = &ppd->ibport_data; + __be64 g = ppd->guid; + unsigned i; + + /* GUID 0 is illegal */ + if (g == 0) + smp->status |= IB_SMP_INVALID_FIELD; + else { + /* The first is a copy of the read-only HW GUID. */ + p[0] = g; + for (i = 1; i < QIB_GUIDS_PER_PORT; i++) + p[i] = ibp->guids[i - 1]; + } + } else + smp->status |= IB_SMP_INVALID_FIELD; + + return reply(smp); +} + +static void set_link_width_enabled(struct qib_pportdata *ppd, u32 w) +{ + (void) ppd->dd->f_set_ib_cfg(ppd, QIB_IB_CFG_LWID_ENB, w); +} + +static void set_link_speed_enabled(struct qib_pportdata *ppd, u32 s) +{ + (void) ppd->dd->f_set_ib_cfg(ppd, QIB_IB_CFG_SPD_ENB, s); +} + +static int get_overrunthreshold(struct qib_pportdata *ppd) +{ + return ppd->dd->f_get_ib_cfg(ppd, QIB_IB_CFG_OVERRUN_THRESH); +} + +/** + * set_overrunthreshold - set the overrun threshold + * @ppd: the physical port data + * @n: the new threshold + * + * Note that this will only take effect when the link state changes. 
+ */ +static int set_overrunthreshold(struct qib_pportdata *ppd, unsigned n) +{ + (void) ppd->dd->f_set_ib_cfg(ppd, QIB_IB_CFG_OVERRUN_THRESH, + (u32)n); + return 0; +} + +static int get_phyerrthreshold(struct qib_pportdata *ppd) +{ + return ppd->dd->f_get_ib_cfg(ppd, QIB_IB_CFG_PHYERR_THRESH); +} + +/** + * set_phyerrthreshold - set the physical error threshold + * @ppd: the physical port data + * @n: the new threshold + * + * Note that this will only take effect when the link state changes. + */ +static int set_phyerrthreshold(struct qib_pportdata *ppd, unsigned n) +{ + (void) ppd->dd->f_set_ib_cfg(ppd, QIB_IB_CFG_PHYERR_THRESH, + (u32)n); + return 0; +} + +/** + * get_linkdowndefaultstate - get the default linkdown state + * @ppd: the physical port data + * + * Returns zero if the default is POLL, 1 if the default is SLEEP. + */ +static int get_linkdowndefaultstate(struct qib_pportdata *ppd) +{ + return ppd->dd->f_get_ib_cfg(ppd, QIB_IB_CFG_LINKDEFAULT) == + IB_LINKINITCMD_SLEEP; +} + +static int check_mkey(struct qib_ibport *ibp, struct ib_smp *smp, int mad_flags) +{ + int valid_mkey = 0; + int ret = 0; + + /* Is the mkey in the process of expiring? */ + if (ibp->mkey_lease_timeout && + time_after_eq(jiffies, ibp->mkey_lease_timeout)) { + /* Clear timeout and mkey protection field. */ + ibp->mkey_lease_timeout = 0; + ibp->mkeyprot = 0; + } + + if ((mad_flags & IB_MAD_IGNORE_MKEY) || ibp->mkey == 0 || + ibp->mkey == smp->mkey) + valid_mkey = 1; + + /* Unset lease timeout on any valid Get/Set/TrapRepress */ + if (valid_mkey && ibp->mkey_lease_timeout && + (smp->method == IB_MGMT_METHOD_GET || + smp->method == IB_MGMT_METHOD_SET || + smp->method == IB_MGMT_METHOD_TRAP_REPRESS)) + ibp->mkey_lease_timeout = 0; + + if (!valid_mkey) { + switch (smp->method) { + case IB_MGMT_METHOD_GET: + /* Bad mkey not a violation below level 2 */ + if (ibp->mkeyprot < 2) + break; + case IB_MGMT_METHOD_SET: + case IB_MGMT_METHOD_TRAP_REPRESS: + if (ibp->mkey_violations != 0xFFFF) + ++ibp->mkey_violations; + if (!ibp->mkey_lease_timeout && ibp->mkey_lease_period) + ibp->mkey_lease_timeout = jiffies + + ibp->mkey_lease_period * HZ; + /* Generate a trap notice. */ + qib_bad_mkey(ibp, smp); + ret = 1; + } + } + + return ret; +} + +static int subn_get_portinfo(struct ib_smp *smp, struct ib_device *ibdev, + u8 port) +{ + struct qib_devdata *dd; + struct qib_pportdata *ppd; + struct qib_ibport *ibp; + struct ib_port_info *pip = (struct ib_port_info *)smp->data; + u8 mtu; + int ret; + u32 state; + u32 port_num = be32_to_cpu(smp->attr_mod); + + if (port_num == 0) + port_num = port; + else { + if (port_num > ibdev->phys_port_cnt) { + smp->status |= IB_SMP_INVALID_FIELD; + ret = reply(smp); + goto bail; + } + if (port_num != port) { + ibp = to_iport(ibdev, port_num); + ret = check_mkey(ibp, smp, 0); + if (ret) { + ret = IB_MAD_RESULT_FAILURE; + goto bail; + } + } + } + + dd = dd_from_ibdev(ibdev); + /* IB numbers ports from 1, hdw from 0 */ + ppd = dd->pport + (port_num - 1); + ibp = &ppd->ibport_data; + + /* Clear all fields. Only set the non-zero fields. */ + memset(smp->data, 0, sizeof(smp->data)); + + /* Only return the mkey if the protection field allows it. 
*/ + if (!(smp->method == IB_MGMT_METHOD_GET && + ibp->mkey != smp->mkey && + ibp->mkeyprot == 1)) + pip->mkey = ibp->mkey; + pip->gid_prefix = ibp->gid_prefix; + pip->lid = cpu_to_be16(ppd->lid); + pip->sm_lid = cpu_to_be16(ibp->sm_lid); + pip->cap_mask = cpu_to_be32(ibp->port_cap_flags); + /* pip->diag_code; */ + pip->mkey_lease_period = cpu_to_be16(ibp->mkey_lease_period); + pip->local_port_num = port; + pip->link_width_enabled = ppd->link_width_enabled; + pip->link_width_supported = ppd->link_width_supported; + pip->link_width_active = ppd->link_width_active; + state = dd->f_iblink_state(ppd->lastibcstat); + pip->linkspeed_portstate = ppd->link_speed_supported << 4 | state; + + pip->portphysstate_linkdown = + (dd->f_ibphys_portstate(ppd->lastibcstat) << 4) | + (get_linkdowndefaultstate(ppd) ? 1 : 2); + pip->mkeyprot_resv_lmc = (ibp->mkeyprot << 6) | ppd->lmc; + pip->linkspeedactive_enabled = (ppd->link_speed_active << 4) | + ppd->link_speed_enabled; + switch (ppd->ibmtu) { + default: /* something is wrong; fall through */ + case 4096: + mtu = IB_MTU_4096; + break; + case 2048: + mtu = IB_MTU_2048; + break; + case 1024: + mtu = IB_MTU_1024; + break; + case 512: + mtu = IB_MTU_512; + break; + case 256: + mtu = IB_MTU_256; + break; + } + pip->neighbormtu_mastersmsl = (mtu << 4) | ibp->sm_sl; + pip->vlcap_inittype = ppd->vls_supported << 4; /* InitType = 0 */ + pip->vl_high_limit = ibp->vl_high_limit; + pip->vl_arb_high_cap = + dd->f_get_ib_cfg(ppd, QIB_IB_CFG_VL_HIGH_CAP); + pip->vl_arb_low_cap = + dd->f_get_ib_cfg(ppd, QIB_IB_CFG_VL_LOW_CAP); + /* InitTypeReply = 0 */ + pip->inittypereply_mtucap = qib_ibmtu ? qib_ibmtu : IB_MTU_4096; + /* HCAs ignore VLStallCount and HOQLife */ + /* pip->vlstallcnt_hoqlife; */ + pip->operationalvl_pei_peo_fpi_fpo = + dd->f_get_ib_cfg(ppd, QIB_IB_CFG_OP_VLS) << 4; + pip->mkey_violations = cpu_to_be16(ibp->mkey_violations); + /* P_KeyViolations are counted by hardware. */ + pip->pkey_violations = cpu_to_be16(ibp->pkey_violations); + pip->qkey_violations = cpu_to_be16(ibp->qkey_violations); + /* Only the hardware GUID is supported for now */ + pip->guid_cap = QIB_GUIDS_PER_PORT; + pip->clientrereg_resv_subnetto = ibp->subnet_timeout; + /* 32.768 usec. response time (guessing) */ + pip->resv_resptimevalue = 3; + pip->localphyerrors_overrunerrors = + (get_phyerrthreshold(ppd) << 4) | + get_overrunthreshold(ppd); + /* pip->max_credit_hint; */ + if (ibp->port_cap_flags & IB_PORT_LINK_LATENCY_SUP) { + u32 v; + + v = dd->f_get_ib_cfg(ppd, QIB_IB_CFG_LINKLATENCY); + pip->link_roundtrip_latency[0] = v >> 16; + pip->link_roundtrip_latency[1] = v >> 8; + pip->link_roundtrip_latency[2] = v; + } + + ret = reply(smp); + +bail: + return ret; +} + +/** + * get_pkeys - return the PKEY table + * @dd: the qlogic_ib device + * @port: the IB port number + * @pkeys: the pkey table is placed here + */ +static int get_pkeys(struct qib_devdata *dd, u8 port, u16 *pkeys) +{ + struct qib_pportdata *ppd = dd->pport + port - 1; + /* + * always a kernel context, no locking needed. + * If we get here with ppd setup, no need to check + * that pd is valid. 
+ */ + struct qib_ctxtdata *rcd = dd->rcd[ppd->hw_pidx]; + + memcpy(pkeys, rcd->pkeys, sizeof(rcd->pkeys)); + + return 0; +} + +static int subn_get_pkeytable(struct ib_smp *smp, struct ib_device *ibdev, + u8 port) +{ + u32 startpx = 32 * (be32_to_cpu(smp->attr_mod) & 0xffff); + u16 *p = (u16 *) smp->data; + __be16 *q = (__be16 *) smp->data; + + /* 64 blocks of 32 16-bit P_Key entries */ + + memset(smp->data, 0, sizeof(smp->data)); + if (startpx == 0) { + struct qib_devdata *dd = dd_from_ibdev(ibdev); + unsigned i, n = qib_get_npkeys(dd); + + get_pkeys(dd, port, p); + + for (i = 0; i < n; i++) + q[i] = cpu_to_be16(p[i]); + } else + smp->status |= IB_SMP_INVALID_FIELD; + + return reply(smp); +} + +static int subn_set_guidinfo(struct ib_smp *smp, struct ib_device *ibdev, + u8 port) +{ + struct qib_devdata *dd = dd_from_ibdev(ibdev); + u32 startgx = 8 * be32_to_cpu(smp->attr_mod); + __be64 *p = (__be64 *) smp->data; + unsigned pidx = port - 1; /* IB number port from 1, hdw from 0 */ + + /* 32 blocks of 8 64-bit GUIDs per block */ + + if (startgx == 0 && pidx < dd->num_pports) { + struct qib_pportdata *ppd = dd->pport + pidx; + struct qib_ibport *ibp = &ppd->ibport_data; + unsigned i; + + /* The first entry is read-only. */ + for (i = 1; i < QIB_GUIDS_PER_PORT; i++) + ibp->guids[i - 1] = p[i]; + } else + smp->status |= IB_SMP_INVALID_FIELD; + + /* The only GUID we support is the first read-only entry. */ + return subn_get_guidinfo(smp, ibdev, port); +} + +/** + * subn_set_portinfo - set port information + * @smp: the incoming SM packet + * @ibdev: the infiniband device + * @port: the port on the device + * + * Set Portinfo (see ch. 14.2.5.6). + */ +static int subn_set_portinfo(struct ib_smp *smp, struct ib_device *ibdev, + u8 port) +{ + struct ib_port_info *pip = (struct ib_port_info *)smp->data; + struct ib_event event; + struct qib_devdata *dd; + struct qib_pportdata *ppd; + struct qib_ibport *ibp; + u8 clientrereg = (pip->clientrereg_resv_subnetto & 0x80); + unsigned long flags; + u16 lid, smlid; + u8 lwe; + u8 lse; + u8 state; + u8 vls; + u8 msl; + u16 lstate; + int ret, ore, mtu; + u32 port_num = be32_to_cpu(smp->attr_mod); + + if (port_num == 0) + port_num = port; + else { + if (port_num > ibdev->phys_port_cnt) + goto err; + /* Port attributes can only be set on the receiving port */ + if (port_num != port) + goto get_only; + } + + dd = dd_from_ibdev(ibdev); + /* IB numbers ports from 1, hdw from 0 */ + ppd = dd->pport + (port_num - 1); + ibp = &ppd->ibport_data; + event.device = ibdev; + event.element.port_num = port; + + ibp->mkey = pip->mkey; + ibp->gid_prefix = pip->gid_prefix; + ibp->mkey_lease_period = be16_to_cpu(pip->mkey_lease_period); + + lid = be16_to_cpu(pip->lid); + /* Must be a valid unicast LID address. */ + if (lid == 0 || lid >= QIB_MULTICAST_LID_BASE) + smp->status |= IB_SMP_INVALID_FIELD; + else if (ppd->lid != lid || ppd->lmc != (pip->mkeyprot_resv_lmc & 7)) { + if (ppd->lid != lid) + qib_set_uevent_bits(ppd, _QIB_EVENT_LID_CHANGE_BIT); + if (ppd->lmc != (pip->mkeyprot_resv_lmc & 7)) + qib_set_uevent_bits(ppd, _QIB_EVENT_LMC_CHANGE_BIT); + qib_set_lid(ppd, lid, pip->mkeyprot_resv_lmc & 7); + event.event = IB_EVENT_LID_CHANGE; + ib_dispatch_event(&event); + } + + smlid = be16_to_cpu(pip->sm_lid); + msl = pip->neighbormtu_mastersmsl & 0xF; + /* Must be a valid unicast LID address. 
*/ + if (smlid == 0 || smlid >= QIB_MULTICAST_LID_BASE) + smp->status |= IB_SMP_INVALID_FIELD; + else if (smlid != ibp->sm_lid || msl != ibp->sm_sl) { + spin_lock_irqsave(&ibp->lock, flags); + if (ibp->sm_ah) { + if (smlid != ibp->sm_lid) + ibp->sm_ah->attr.dlid = smlid; + if (msl != ibp->sm_sl) + ibp->sm_ah->attr.sl = msl; + } + spin_unlock_irqrestore(&ibp->lock, flags); + if (smlid != ibp->sm_lid) + ibp->sm_lid = smlid; + if (msl != ibp->sm_sl) + ibp->sm_sl = msl; + event.event = IB_EVENT_SM_CHANGE; + ib_dispatch_event(&event); + } + + /* Allow 1x or 4x to be set (see 14.2.6.6). */ + lwe = pip->link_width_enabled; + if (lwe) { + if (lwe == 0xFF) + set_link_width_enabled(ppd, ppd->link_width_supported); + else if (lwe >= 16 || (lwe & ~ppd->link_width_supported)) + smp->status |= IB_SMP_INVALID_FIELD; + else if (lwe != ppd->link_width_enabled) + set_link_width_enabled(ppd, lwe); + } + + lse = pip->linkspeedactive_enabled & 0xF; + if (lse) { + /* + * The IB 1.2 spec. only allows link speed values + * 1, 3, 5, 7, 15. 1.2.1 extended to allow specific + * speeds. + */ + if (lse == 15) + set_link_speed_enabled(ppd, + ppd->link_speed_supported); + else if (lse >= 8 || (lse & ~ppd->link_speed_supported)) + smp->status |= IB_SMP_INVALID_FIELD; + else if (lse != ppd->link_speed_enabled) + set_link_speed_enabled(ppd, lse); + } + + /* Set link down default state. */ + switch (pip->portphysstate_linkdown & 0xF) { + case 0: /* NOP */ + break; + case 1: /* SLEEP */ + (void) dd->f_set_ib_cfg(ppd, QIB_IB_CFG_LINKDEFAULT, + IB_LINKINITCMD_SLEEP); + break; + case 2: /* POLL */ + (void) dd->f_set_ib_cfg(ppd, QIB_IB_CFG_LINKDEFAULT, + IB_LINKINITCMD_POLL); + break; + default: + smp->status |= IB_SMP_INVALID_FIELD; + } + + ibp->mkeyprot = pip->mkeyprot_resv_lmc >> 6; + ibp->vl_high_limit = pip->vl_high_limit; + (void) dd->f_set_ib_cfg(ppd, QIB_IB_CFG_VL_HIGH_LIMIT, + ibp->vl_high_limit); + + mtu = ib_mtu_enum_to_int((pip->neighbormtu_mastersmsl >> 4) & 0xF); + if (mtu == -1) + smp->status |= IB_SMP_INVALID_FIELD; + else + qib_set_mtu(ppd, mtu); + + /* Set operational VLs */ + vls = (pip->operationalvl_pei_peo_fpi_fpo >> 4) & 0xF; + if (vls) { + if (vls > ppd->vls_supported) + smp->status |= IB_SMP_INVALID_FIELD; + else + (void) dd->f_set_ib_cfg(ppd, QIB_IB_CFG_OP_VLS, vls); + } + + if (pip->mkey_violations == 0) + ibp->mkey_violations = 0; + + if (pip->pkey_violations == 0) + ibp->pkey_violations = 0; + + if (pip->qkey_violations == 0) + ibp->qkey_violations = 0; + + ore = pip->localphyerrors_overrunerrors; + if (set_phyerrthreshold(ppd, (ore >> 4) & 0xF)) + smp->status |= IB_SMP_INVALID_FIELD; + + if (set_overrunthreshold(ppd, (ore & 0xF))) + smp->status |= IB_SMP_INVALID_FIELD; + + ibp->subnet_timeout = pip->clientrereg_resv_subnetto & 0x1F; + + /* + * Do the port state change now that the other link parameters + * have been set. + * Changing the port physical state only makes sense if the link + * is down or is being set to down. + */ + state = pip->linkspeed_portstate & 0xF; + lstate = (pip->portphysstate_linkdown >> 4) & 0xF; + if (lstate && !(state == IB_PORT_DOWN || state == IB_PORT_NOP)) + smp->status |= IB_SMP_INVALID_FIELD; + + /* + * Only state changes of DOWN, ARM, and ACTIVE are valid + * and must be in the correct state to take effect (see 7.2.6). 
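+ *
+ * For DOWN, the 4-bit link-down physical state selects between the
+ * ONLY, SLEEP, LINKDOWN and DISABLE variants; any other value is
+ * flagged as an invalid field.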
+ */ + switch (state) { + case IB_PORT_NOP: + if (lstate == 0) + break; + /* FALLTHROUGH */ + case IB_PORT_DOWN: + if (lstate == 0) + lstate = QIB_IB_LINKDOWN_ONLY; + else if (lstate == 1) + lstate = QIB_IB_LINKDOWN_SLEEP; + else if (lstate == 2) + lstate = QIB_IB_LINKDOWN; + else if (lstate == 3) + lstate = QIB_IB_LINKDOWN_DISABLE; + else { + smp->status |= IB_SMP_INVALID_FIELD; + break; + } + spin_lock_irqsave(&ppd->lflags_lock, flags); + ppd->lflags &= ~QIBL_LINKV; + spin_unlock_irqrestore(&ppd->lflags_lock, flags); + qib_set_linkstate(ppd, lstate); + /* + * Don't send a reply if the response would be sent + * through the disabled port. + */ + if (lstate == QIB_IB_LINKDOWN_DISABLE && smp->hop_cnt) { + ret = IB_MAD_RESULT_SUCCESS | IB_MAD_RESULT_CONSUMED; + goto done; + } + qib_wait_linkstate(ppd, QIBL_LINKV, 10); + break; + case IB_PORT_ARMED: + qib_set_linkstate(ppd, QIB_IB_LINKARM); + break; + case IB_PORT_ACTIVE: + qib_set_linkstate(ppd, QIB_IB_LINKACTIVE); + break; + default: + smp->status |= IB_SMP_INVALID_FIELD; + } + + if (clientrereg) { + event.event = IB_EVENT_CLIENT_REREGISTER; + ib_dispatch_event(&event); + } + + ret = subn_get_portinfo(smp, ibdev, port); + + /* restore re-reg bit per o14-12.2.1 */ + pip->clientrereg_resv_subnetto |= clientrereg; + + goto get_only; + +err: + smp->status |= IB_SMP_INVALID_FIELD; +get_only: + ret = subn_get_portinfo(smp, ibdev, port); +done: + return ret; +} + +/** + * rm_pkey - decrecment the reference count for the given PKEY + * @dd: the qlogic_ib device + * @key: the PKEY index + * + * Return true if this was the last reference and the hardware table entry + * needs to be changed. + */ +static int rm_pkey(struct qib_pportdata *ppd, u16 key) +{ + int i; + int ret; + + for (i = 0; i < ARRAY_SIZE(ppd->pkeys); i++) { + if (ppd->pkeys[i] != key) + continue; + if (atomic_dec_and_test(&ppd->pkeyrefs[i])) { + ppd->pkeys[i] = 0; + ret = 1; + goto bail; + } + break; + } + + ret = 0; + +bail: + return ret; +} + +/** + * add_pkey - add the given PKEY to the hardware table + * @dd: the qlogic_ib device + * @key: the PKEY + * + * Return an error code if unable to add the entry, zero if no change, + * or 1 if the hardware PKEY register needs to be updated. + */ +static int add_pkey(struct qib_pportdata *ppd, u16 key) +{ + int i; + u16 lkey = key & 0x7FFF; + int any = 0; + int ret; + + if (lkey == 0x7FFF) { + ret = 0; + goto bail; + } + + /* Look for an empty slot or a matching PKEY. */ + for (i = 0; i < ARRAY_SIZE(ppd->pkeys); i++) { + if (!ppd->pkeys[i]) { + any++; + continue; + } + /* If it matches exactly, try to increment the ref count */ + if (ppd->pkeys[i] == key) { + if (atomic_inc_return(&ppd->pkeyrefs[i]) > 1) { + ret = 0; + goto bail; + } + /* Lost the race. Look for an empty slot below. */ + atomic_dec(&ppd->pkeyrefs[i]); + any++; + } + /* + * It makes no sense to have both the limited and unlimited + * PKEY set at the same time since the unlimited one will + * disable the limited one. + */ + if ((ppd->pkeys[i] & 0x7FFF) == lkey) { + ret = -EEXIST; + goto bail; + } + } + if (!any) { + ret = -EBUSY; + goto bail; + } + for (i = 0; i < ARRAY_SIZE(ppd->pkeys); i++) { + if (!ppd->pkeys[i] && + atomic_inc_return(&ppd->pkeyrefs[i]) == 1) { + /* for qibstats, etc. 
*/ + ppd->pkeys[i] = key; + ret = 1; + goto bail; + } + } + ret = -EBUSY; + +bail: + return ret; +} + +/** + * set_pkeys - set the PKEY table for ctxt 0 + * @dd: the qlogic_ib device + * @port: the IB port number + * @pkeys: the PKEY table + */ +static int set_pkeys(struct qib_devdata *dd, u8 port, u16 *pkeys) +{ + struct qib_pportdata *ppd; + struct qib_ctxtdata *rcd; + int i; + int changed = 0; + + /* + * IB port one/two always maps to context zero/one, + * always a kernel context, no locking needed + * If we get here with ppd setup, no need to check + * that rcd is valid. + */ + ppd = dd->pport + (port - 1); + rcd = dd->rcd[ppd->hw_pidx]; + + for (i = 0; i < ARRAY_SIZE(rcd->pkeys); i++) { + u16 key = pkeys[i]; + u16 okey = rcd->pkeys[i]; + + if (key == okey) + continue; + /* + * The value of this PKEY table entry is changing. + * Remove the old entry in the hardware's array of PKEYs. + */ + if (okey & 0x7FFF) + changed |= rm_pkey(ppd, okey); + if (key & 0x7FFF) { + int ret = add_pkey(ppd, key); + + if (ret < 0) + key = 0; + else + changed |= ret; + } + rcd->pkeys[i] = key; + } + if (changed) { + struct ib_event event; + + (void) dd->f_set_ib_cfg(ppd, QIB_IB_CFG_PKEYS, 0); + + event.event = IB_EVENT_PKEY_CHANGE; + event.device = &dd->verbs_dev.ibdev; + event.element.port_num = port; + ib_dispatch_event(&event); + } + return 0; +} + +static int subn_set_pkeytable(struct ib_smp *smp, struct ib_device *ibdev, + u8 port) +{ + u32 startpx = 32 * (be32_to_cpu(smp->attr_mod) & 0xffff); + __be16 *p = (__be16 *) smp->data; + u16 *q = (u16 *) smp->data; + struct qib_devdata *dd = dd_from_ibdev(ibdev); + unsigned i, n = qib_get_npkeys(dd); + + for (i = 0; i < n; i++) + q[i] = be16_to_cpu(p[i]); + + if (startpx != 0 || set_pkeys(dd, port, q) != 0) + smp->status |= IB_SMP_INVALID_FIELD; + + return subn_get_pkeytable(smp, ibdev, port); +} + +static int subn_get_sl_to_vl(struct ib_smp *smp, struct ib_device *ibdev, + u8 port) +{ + struct qib_ibport *ibp = to_iport(ibdev, port); + u8 *p = (u8 *) smp->data; + unsigned i; + + memset(smp->data, 0, sizeof(smp->data)); + + if (!(ibp->port_cap_flags & IB_PORT_SL_MAP_SUP)) + smp->status |= IB_SMP_UNSUP_METHOD; + else + for (i = 0; i < ARRAY_SIZE(ibp->sl_to_vl); i += 2) + *p++ = (ibp->sl_to_vl[i] << 4) | ibp->sl_to_vl[i + 1]; + + return reply(smp); +} + +static int subn_set_sl_to_vl(struct ib_smp *smp, struct ib_device *ibdev, + u8 port) +{ + struct qib_ibport *ibp = to_iport(ibdev, port); + u8 *p = (u8 *) smp->data; + unsigned i; + + if (!(ibp->port_cap_flags & IB_PORT_SL_MAP_SUP)) { + smp->status |= IB_SMP_UNSUP_METHOD; + return reply(smp); + } + + for (i = 0; i < ARRAY_SIZE(ibp->sl_to_vl); i += 2, p++) { + ibp->sl_to_vl[i] = *p >> 4; + ibp->sl_to_vl[i + 1] = *p & 0xF; + } + qib_set_uevent_bits(ppd_from_ibp(to_iport(ibdev, port)), + _QIB_EVENT_SL2VL_CHANGE_BIT); + + return subn_get_sl_to_vl(smp, ibdev, port); +} + +static int subn_get_vl_arb(struct ib_smp *smp, struct ib_device *ibdev, + u8 port) +{ + unsigned which = be32_to_cpu(smp->attr_mod) >> 16; + struct qib_pportdata *ppd = ppd_from_ibp(to_iport(ibdev, port)); + + memset(smp->data, 0, sizeof(smp->data)); + + if (ppd->vls_supported == IB_VL_VL0) + smp->status |= IB_SMP_UNSUP_METHOD; + else if (which == IB_VLARB_LOWPRI_0_31) + (void) ppd->dd->f_get_ib_table(ppd, QIB_IB_TBL_VL_LOW_ARB, + smp->data); + else if (which == IB_VLARB_HIGHPRI_0_31) + (void) ppd->dd->f_get_ib_table(ppd, QIB_IB_TBL_VL_HIGH_ARB, + smp->data); + else + smp->status |= IB_SMP_INVALID_FIELD; + + return reply(smp); +} + +static int 
subn_set_vl_arb(struct ib_smp *smp, struct ib_device *ibdev, + u8 port) +{ + unsigned which = be32_to_cpu(smp->attr_mod) >> 16; + struct qib_pportdata *ppd = ppd_from_ibp(to_iport(ibdev, port)); + + if (ppd->vls_supported == IB_VL_VL0) + smp->status |= IB_SMP_UNSUP_METHOD; + else if (which == IB_VLARB_LOWPRI_0_31) + (void) ppd->dd->f_set_ib_table(ppd, QIB_IB_TBL_VL_LOW_ARB, + smp->data); + else if (which == IB_VLARB_HIGHPRI_0_31) + (void) ppd->dd->f_set_ib_table(ppd, QIB_IB_TBL_VL_HIGH_ARB, + smp->data); + else + smp->status |= IB_SMP_INVALID_FIELD; + + return subn_get_vl_arb(smp, ibdev, port); +} + +static int subn_trap_repress(struct ib_smp *smp, struct ib_device *ibdev, + u8 port) +{ + /* + * For now, we only send the trap once so no need to process this. + * o13-6, o13-7, + * o14-3.a4 The SMA shall not send any message in response to a valid + * SubnTrapRepress() message. + */ + return IB_MAD_RESULT_SUCCESS | IB_MAD_RESULT_CONSUMED; +} + +static int pma_get_classportinfo(struct ib_pma_mad *pmp, + struct ib_device *ibdev) +{ + struct ib_class_port_info *p = + (struct ib_class_port_info *)pmp->data; + struct qib_devdata *dd = dd_from_ibdev(ibdev); + + memset(pmp->data, 0, sizeof(pmp->data)); + + if (pmp->mad_hdr.attr_mod != 0) + pmp->mad_hdr.status |= IB_SMP_INVALID_FIELD; + + /* Note that AllPortSelect is not valid */ + p->base_version = 1; + p->class_version = 1; + p->capability_mask = IB_PMA_CLASS_CAP_EXT_WIDTH; + /* + * Set the most significant bit of CM2 to indicate support for + * congestion statistics + */ + p->reserved[0] = dd->psxmitwait_supported << 7; + /* + * Expected response time is 4.096 usec. * 2^18 == 1.073741824 sec. + */ + p->resp_time_value = 18; + + return reply((struct ib_smp *) pmp); +} + +static int pma_get_portsamplescontrol(struct ib_pma_mad *pmp, + struct ib_device *ibdev, u8 port) +{ + struct ib_pma_portsamplescontrol *p = + (struct ib_pma_portsamplescontrol *)pmp->data; + struct qib_ibdev *dev = to_idev(ibdev); + struct qib_devdata *dd = dd_from_dev(dev); + struct qib_ibport *ibp = to_iport(ibdev, port); + struct qib_pportdata *ppd = ppd_from_ibp(ibp); + unsigned long flags; + u8 port_select = p->port_select; + + memset(pmp->data, 0, sizeof(pmp->data)); + + p->port_select = port_select; + if (pmp->mad_hdr.attr_mod != 0 || port_select != port) { + pmp->mad_hdr.status |= IB_SMP_INVALID_FIELD; + goto bail; + } + spin_lock_irqsave(&ibp->lock, flags); + p->tick = dd->f_get_ib_cfg(ppd, QIB_IB_CFG_PMA_TICKS); + p->sample_status = dd->f_portcntr(ppd, QIBPORTCNTR_PSSTAT); + p->counter_width = 4; /* 32 bit counters */ + p->counter_mask0_9 = COUNTER_MASK0_9; + p->sample_start = cpu_to_be32(ibp->pma_sample_start); + p->sample_interval = cpu_to_be32(ibp->pma_sample_interval); + p->tag = cpu_to_be16(ibp->pma_tag); + p->counter_select[0] = ibp->pma_counter_select[0]; + p->counter_select[1] = ibp->pma_counter_select[1]; + p->counter_select[2] = ibp->pma_counter_select[2]; + p->counter_select[3] = ibp->pma_counter_select[3]; + p->counter_select[4] = ibp->pma_counter_select[4]; + spin_unlock_irqrestore(&ibp->lock, flags); + +bail: + return reply((struct ib_smp *) pmp); +} + +static int pma_set_portsamplescontrol(struct ib_pma_mad *pmp, + struct ib_device *ibdev, u8 port) +{ + struct ib_pma_portsamplescontrol *p = + (struct ib_pma_portsamplescontrol *)pmp->data; + struct qib_ibdev *dev = to_idev(ibdev); + struct qib_devdata *dd = dd_from_dev(dev); + struct qib_ibport *ibp = to_iport(ibdev, port); + struct qib_pportdata *ppd = ppd_from_ibp(ibp); + unsigned long flags; + u8 
status, xmit_flags; + int ret; + + if (pmp->mad_hdr.attr_mod != 0 || p->port_select != port) { + pmp->mad_hdr.status |= IB_SMP_INVALID_FIELD; + ret = reply((struct ib_smp *) pmp); + goto bail; + } + + spin_lock_irqsave(&ibp->lock, flags); + + /* Port Sampling code owns the PS* HW counters */ + xmit_flags = ppd->cong_stats.flags; + ppd->cong_stats.flags = IB_PMA_CONG_HW_CONTROL_SAMPLE; + status = dd->f_portcntr(ppd, QIBPORTCNTR_PSSTAT); + if (status == IB_PMA_SAMPLE_STATUS_DONE || + (status == IB_PMA_SAMPLE_STATUS_RUNNING && + xmit_flags == IB_PMA_CONG_HW_CONTROL_TIMER)) { + ibp->pma_sample_start = be32_to_cpu(p->sample_start); + ibp->pma_sample_interval = be32_to_cpu(p->sample_interval); + ibp->pma_tag = be16_to_cpu(p->tag); + ibp->pma_counter_select[0] = p->counter_select[0]; + ibp->pma_counter_select[1] = p->counter_select[1]; + ibp->pma_counter_select[2] = p->counter_select[2]; + ibp->pma_counter_select[3] = p->counter_select[3]; + ibp->pma_counter_select[4] = p->counter_select[4]; + dd->f_set_cntr_sample(ppd, ibp->pma_sample_interval, + ibp->pma_sample_start); + } + spin_unlock_irqrestore(&ibp->lock, flags); + + ret = pma_get_portsamplescontrol(pmp, ibdev, port); + +bail: + return ret; +} + +static u64 get_counter(struct qib_ibport *ibp, struct qib_pportdata *ppd, + __be16 sel) +{ + u64 ret; + + switch (sel) { + case IB_PMA_PORT_XMIT_DATA: + ret = ppd->dd->f_portcntr(ppd, QIBPORTCNTR_PSXMITDATA); + break; + case IB_PMA_PORT_RCV_DATA: + ret = ppd->dd->f_portcntr(ppd, QIBPORTCNTR_PSRCVDATA); + break; + case IB_PMA_PORT_XMIT_PKTS: + ret = ppd->dd->f_portcntr(ppd, QIBPORTCNTR_PSXMITPKTS); + break; + case IB_PMA_PORT_RCV_PKTS: + ret = ppd->dd->f_portcntr(ppd, QIBPORTCNTR_PSRCVPKTS); + break; + case IB_PMA_PORT_XMIT_WAIT: + ret = ppd->dd->f_portcntr(ppd, QIBPORTCNTR_PSXMITWAIT); + break; + default: + ret = 0; + } + + return ret; +} + +/* This function assumes that the xmit_wait lock is already held */ +static u64 xmit_wait_get_value_delta(struct qib_pportdata *ppd) +{ + u32 delta; + + delta = get_counter(&ppd->ibport_data, ppd, + IB_PMA_PORT_XMIT_WAIT); + return ppd->cong_stats.counter + delta; +} + +static void cache_hw_sample_counters(struct qib_pportdata *ppd) +{ + struct qib_ibport *ibp = &ppd->ibport_data; + + ppd->cong_stats.counter_cache.psxmitdata = + get_counter(ibp, ppd, IB_PMA_PORT_XMIT_DATA); + ppd->cong_stats.counter_cache.psrcvdata = + get_counter(ibp, ppd, IB_PMA_PORT_RCV_DATA); + ppd->cong_stats.counter_cache.psxmitpkts = + get_counter(ibp, ppd, IB_PMA_PORT_XMIT_PKTS); + ppd->cong_stats.counter_cache.psrcvpkts = + get_counter(ibp, ppd, IB_PMA_PORT_RCV_PKTS); + ppd->cong_stats.counter_cache.psxmitwait = + get_counter(ibp, ppd, IB_PMA_PORT_XMIT_WAIT); +} + +static u64 get_cache_hw_sample_counters(struct qib_pportdata *ppd, + __be16 sel) +{ + u64 ret; + + switch (sel) { + case IB_PMA_PORT_XMIT_DATA: + ret = ppd->cong_stats.counter_cache.psxmitdata; + break; + case IB_PMA_PORT_RCV_DATA: + ret = ppd->cong_stats.counter_cache.psrcvdata; + break; + case IB_PMA_PORT_XMIT_PKTS: + ret = ppd->cong_stats.counter_cache.psxmitpkts; + break; + case IB_PMA_PORT_RCV_PKTS: + ret = ppd->cong_stats.counter_cache.psrcvpkts; + break; + case IB_PMA_PORT_XMIT_WAIT: + ret = ppd->cong_stats.counter_cache.psxmitwait; + break; + default: + ret = 0; + } + + return ret; +} + +static int pma_get_portsamplesresult(struct ib_pma_mad *pmp, + struct ib_device *ibdev, u8 port) +{ + struct ib_pma_portsamplesresult *p = + (struct ib_pma_portsamplesresult *)pmp->data; + struct qib_ibdev *dev = to_idev(ibdev); + 
struct qib_devdata *dd = dd_from_dev(dev); + struct qib_ibport *ibp = to_iport(ibdev, port); + struct qib_pportdata *ppd = ppd_from_ibp(ibp); + unsigned long flags; + u8 status; + int i; + + memset(pmp->data, 0, sizeof(pmp->data)); + spin_lock_irqsave(&ibp->lock, flags); + p->tag = cpu_to_be16(ibp->pma_tag); + if (ppd->cong_stats.flags == IB_PMA_CONG_HW_CONTROL_TIMER) + p->sample_status = IB_PMA_SAMPLE_STATUS_DONE; + else { + status = dd->f_portcntr(ppd, QIBPORTCNTR_PSSTAT); + p->sample_status = cpu_to_be16(status); + if (status == IB_PMA_SAMPLE_STATUS_DONE) { + cache_hw_sample_counters(ppd); + ppd->cong_stats.counter = + xmit_wait_get_value_delta(ppd); + dd->f_set_cntr_sample(ppd, + QIB_CONG_TIMER_PSINTERVAL, 0); + ppd->cong_stats.flags = IB_PMA_CONG_HW_CONTROL_TIMER; + } + } + for (i = 0; i < ARRAY_SIZE(ibp->pma_counter_select); i++) + p->counter[i] = cpu_to_be32( + get_cache_hw_sample_counters( + ppd, ibp->pma_counter_select[i])); + spin_unlock_irqrestore(&ibp->lock, flags); + + return reply((struct ib_smp *) pmp); +} + +static int pma_get_portsamplesresult_ext(struct ib_pma_mad *pmp, + struct ib_device *ibdev, u8 port) +{ + struct ib_pma_portsamplesresult_ext *p = + (struct ib_pma_portsamplesresult_ext *)pmp->data; + struct qib_ibdev *dev = to_idev(ibdev); + struct qib_devdata *dd = dd_from_dev(dev); + struct qib_ibport *ibp = to_iport(ibdev, port); + struct qib_pportdata *ppd = ppd_from_ibp(ibp); + unsigned long flags; + u8 status; + int i; + + /* Port Sampling code owns the PS* HW counters */ + memset(pmp->data, 0, sizeof(pmp->data)); + spin_lock_irqsave(&ibp->lock, flags); + p->tag = cpu_to_be16(ibp->pma_tag); + if (ppd->cong_stats.flags == IB_PMA_CONG_HW_CONTROL_TIMER) + p->sample_status = IB_PMA_SAMPLE_STATUS_DONE; + else { + status = dd->f_portcntr(ppd, QIBPORTCNTR_PSSTAT); + p->sample_status = cpu_to_be16(status); + /* 64 bits */ + p->extended_width = cpu_to_be32(0x80000000); + if (status == IB_PMA_SAMPLE_STATUS_DONE) { + cache_hw_sample_counters(ppd); + ppd->cong_stats.counter = + xmit_wait_get_value_delta(ppd); + dd->f_set_cntr_sample(ppd, + QIB_CONG_TIMER_PSINTERVAL, 0); + ppd->cong_stats.flags = IB_PMA_CONG_HW_CONTROL_TIMER; + } + } + for (i = 0; i < ARRAY_SIZE(ibp->pma_counter_select); i++) + p->counter[i] = cpu_to_be64( + get_cache_hw_sample_counters( + ppd, ibp->pma_counter_select[i])); + spin_unlock_irqrestore(&ibp->lock, flags); + + return reply((struct ib_smp *) pmp); +} + +static int pma_get_portcounters(struct ib_pma_mad *pmp, + struct ib_device *ibdev, u8 port) +{ + struct ib_pma_portcounters *p = (struct ib_pma_portcounters *) + pmp->data; + struct qib_ibport *ibp = to_iport(ibdev, port); + struct qib_pportdata *ppd = ppd_from_ibp(ibp); + struct qib_verbs_counters cntrs; + u8 port_select = p->port_select; + + qib_get_counters(ppd, &cntrs); + + /* Adjust counters for any resets done. 
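+ * The chip cannot clear these counters, so the z_* values below are the snapshots saved at the last PortCounters Set and are subtracted from the running totals.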
*/ + cntrs.symbol_error_counter -= ibp->z_symbol_error_counter; + cntrs.link_error_recovery_counter -= + ibp->z_link_error_recovery_counter; + cntrs.link_downed_counter -= ibp->z_link_downed_counter; + cntrs.port_rcv_errors -= ibp->z_port_rcv_errors; + cntrs.port_rcv_remphys_errors -= ibp->z_port_rcv_remphys_errors; + cntrs.port_xmit_discards -= ibp->z_port_xmit_discards; + cntrs.port_xmit_data -= ibp->z_port_xmit_data; + cntrs.port_rcv_data -= ibp->z_port_rcv_data; + cntrs.port_xmit_packets -= ibp->z_port_xmit_packets; + cntrs.port_rcv_packets -= ibp->z_port_rcv_packets; + cntrs.local_link_integrity_errors -= + ibp->z_local_link_integrity_errors; + cntrs.excessive_buffer_overrun_errors -= + ibp->z_excessive_buffer_overrun_errors; + cntrs.vl15_dropped -= ibp->z_vl15_dropped; + cntrs.vl15_dropped += ibp->n_vl15_dropped; + + memset(pmp->data, 0, sizeof(pmp->data)); + + p->port_select = port_select; + if (pmp->mad_hdr.attr_mod != 0 || port_select != port) + pmp->mad_hdr.status |= IB_SMP_INVALID_FIELD; + + if (cntrs.symbol_error_counter > 0xFFFFUL) + p->symbol_error_counter = cpu_to_be16(0xFFFF); + else + p->symbol_error_counter = + cpu_to_be16((u16)cntrs.symbol_error_counter); + if (cntrs.link_error_recovery_counter > 0xFFUL) + p->link_error_recovery_counter = 0xFF; + else + p->link_error_recovery_counter = + (u8)cntrs.link_error_recovery_counter; + if (cntrs.link_downed_counter > 0xFFUL) + p->link_downed_counter = 0xFF; + else + p->link_downed_counter = (u8)cntrs.link_downed_counter; + if (cntrs.port_rcv_errors > 0xFFFFUL) + p->port_rcv_errors = cpu_to_be16(0xFFFF); + else + p->port_rcv_errors = + cpu_to_be16((u16) cntrs.port_rcv_errors); + if (cntrs.port_rcv_remphys_errors > 0xFFFFUL) + p->port_rcv_remphys_errors = cpu_to_be16(0xFFFF); + else + p->port_rcv_remphys_errors = + cpu_to_be16((u16)cntrs.port_rcv_remphys_errors); + if (cntrs.port_xmit_discards > 0xFFFFUL) + p->port_xmit_discards = cpu_to_be16(0xFFFF); + else + p->port_xmit_discards = + cpu_to_be16((u16)cntrs.port_xmit_discards); + if (cntrs.local_link_integrity_errors > 0xFUL) + cntrs.local_link_integrity_errors = 0xFUL; + if (cntrs.excessive_buffer_overrun_errors > 0xFUL) + cntrs.excessive_buffer_overrun_errors = 0xFUL; + p->link_overrun_errors = (cntrs.local_link_integrity_errors << 4) | + cntrs.excessive_buffer_overrun_errors; + if (cntrs.vl15_dropped > 0xFFFFUL) + p->vl15_dropped = cpu_to_be16(0xFFFF); + else + p->vl15_dropped = cpu_to_be16((u16)cntrs.vl15_dropped); + if (cntrs.port_xmit_data > 0xFFFFFFFFUL) + p->port_xmit_data = cpu_to_be32(0xFFFFFFFF); + else + p->port_xmit_data = cpu_to_be32((u32)cntrs.port_xmit_data); + if (cntrs.port_rcv_data > 0xFFFFFFFFUL) + p->port_rcv_data = cpu_to_be32(0xFFFFFFFF); + else + p->port_rcv_data = cpu_to_be32((u32)cntrs.port_rcv_data); + if (cntrs.port_xmit_packets > 0xFFFFFFFFUL) + p->port_xmit_packets = cpu_to_be32(0xFFFFFFFF); + else + p->port_xmit_packets = + cpu_to_be32((u32)cntrs.port_xmit_packets); + if (cntrs.port_rcv_packets > 0xFFFFFFFFUL) + p->port_rcv_packets = cpu_to_be32(0xFFFFFFFF); + else + p->port_rcv_packets = + cpu_to_be32((u32) cntrs.port_rcv_packets); + + return reply((struct ib_smp *) pmp); +} + +static int pma_get_portcounters_cong(struct ib_pma_mad *pmp, + struct ib_device *ibdev, u8 port) +{ + /* Congestion PMA packets start at offset 24 not 64 */ + struct ib_pma_portcounters_cong *p = + (struct ib_pma_portcounters_cong *)pmp->reserved; + struct qib_verbs_counters cntrs; + struct qib_ibport *ibp = to_iport(ibdev, port); + struct qib_pportdata *ppd = 
ppd_from_ibp(ibp); + struct qib_devdata *dd = dd_from_ppd(ppd); + u32 port_select = be32_to_cpu(pmp->mad_hdr.attr_mod) & 0xFF; + u64 xmit_wait_counter; + unsigned long flags; + + /* + * This check is performed only in the GET method because the + * SET method ends up calling this anyway. + */ + if (!dd->psxmitwait_supported) + pmp->mad_hdr.status |= IB_SMP_UNSUP_METH_ATTR; + if (port_select != port) + pmp->mad_hdr.status |= IB_SMP_INVALID_FIELD; + + qib_get_counters(ppd, &cntrs); + spin_lock_irqsave(&ppd->ibport_data.lock, flags); + xmit_wait_counter = xmit_wait_get_value_delta(ppd); + spin_unlock_irqrestore(&ppd->ibport_data.lock, flags); + + /* Adjust counters for any resets done. */ + cntrs.symbol_error_counter -= ibp->z_symbol_error_counter; + cntrs.link_error_recovery_counter -= + ibp->z_link_error_recovery_counter; + cntrs.link_downed_counter -= ibp->z_link_downed_counter; + cntrs.port_rcv_errors -= ibp->z_port_rcv_errors; + cntrs.port_rcv_remphys_errors -= + ibp->z_port_rcv_remphys_errors; + cntrs.port_xmit_discards -= ibp->z_port_xmit_discards; + cntrs.local_link_integrity_errors -= + ibp->z_local_link_integrity_errors; + cntrs.excessive_buffer_overrun_errors -= + ibp->z_excessive_buffer_overrun_errors; + cntrs.vl15_dropped -= ibp->z_vl15_dropped; + cntrs.vl15_dropped += ibp->n_vl15_dropped; + cntrs.port_xmit_data -= ibp->z_port_xmit_data; + cntrs.port_rcv_data -= ibp->z_port_rcv_data; + cntrs.port_xmit_packets -= ibp->z_port_xmit_packets; + cntrs.port_rcv_packets -= ibp->z_port_rcv_packets; + + memset(pmp->reserved, 0, sizeof(pmp->reserved) + + sizeof(pmp->data)); + + /* + * Set top 3 bits to indicate interval in picoseconds in + * remaining bits. + */ + p->port_check_rate = + cpu_to_be16((QIB_XMIT_RATE_PICO << 13) | + (dd->psxmitwait_check_rate & + ~(QIB_XMIT_RATE_PICO << 13))); + p->port_adr_events = cpu_to_be64(0); + p->port_xmit_wait = cpu_to_be64(xmit_wait_counter); + p->port_xmit_data = cpu_to_be64(cntrs.port_xmit_data); + p->port_rcv_data = cpu_to_be64(cntrs.port_rcv_data); + p->port_xmit_packets = + cpu_to_be64(cntrs.port_xmit_packets); + p->port_rcv_packets = + cpu_to_be64(cntrs.port_rcv_packets); + if (cntrs.symbol_error_counter > 0xFFFFUL) + p->symbol_error_counter = cpu_to_be16(0xFFFF); + else + p->symbol_error_counter = + cpu_to_be16( + (u16)cntrs.symbol_error_counter); + if (cntrs.link_error_recovery_counter > 0xFFUL) + p->link_error_recovery_counter = 0xFF; + else + p->link_error_recovery_counter = + (u8)cntrs.link_error_recovery_counter; + if (cntrs.link_downed_counter > 0xFFUL) + p->link_downed_counter = 0xFF; + else + p->link_downed_counter = + (u8)cntrs.link_downed_counter; + if (cntrs.port_rcv_errors > 0xFFFFUL) + p->port_rcv_errors = cpu_to_be16(0xFFFF); + else + p->port_rcv_errors = + cpu_to_be16((u16) cntrs.port_rcv_errors); + if (cntrs.port_rcv_remphys_errors > 0xFFFFUL) + p->port_rcv_remphys_errors = cpu_to_be16(0xFFFF); + else + p->port_rcv_remphys_errors = + cpu_to_be16( + (u16)cntrs.port_rcv_remphys_errors); + if (cntrs.port_xmit_discards > 0xFFFFUL) + p->port_xmit_discards = cpu_to_be16(0xFFFF); + else + p->port_xmit_discards = + cpu_to_be16((u16)cntrs.port_xmit_discards); + if (cntrs.local_link_integrity_errors > 0xFUL) + cntrs.local_link_integrity_errors = 0xFUL; + if (cntrs.excessive_buffer_overrun_errors > 0xFUL) + cntrs.excessive_buffer_overrun_errors = 0xFUL; + p->link_overrun_errors = (cntrs.local_link_integrity_errors << 4) | + cntrs.excessive_buffer_overrun_errors; + if (cntrs.vl15_dropped > 0xFFFFUL) + p->vl15_dropped = cpu_to_be16(0xFFFF); + 
else + p->vl15_dropped = cpu_to_be16((u16)cntrs.vl15_dropped); + + return reply((struct ib_smp *)pmp); +} + +static void qib_snapshot_pmacounters( + struct qib_ibport *ibp, + struct qib_pma_counters *pmacounters) +{ + struct qib_pma_counters *p; + int cpu; + + memset(pmacounters, 0, sizeof(*pmacounters)); + for_each_possible_cpu(cpu) { + p = per_cpu_ptr(ibp->pmastats, cpu); + pmacounters->n_unicast_xmit += p->n_unicast_xmit; + pmacounters->n_unicast_rcv += p->n_unicast_rcv; + pmacounters->n_multicast_xmit += p->n_multicast_xmit; + pmacounters->n_multicast_rcv += p->n_multicast_rcv; + } +} + +static int pma_get_portcounters_ext(struct ib_pma_mad *pmp, + struct ib_device *ibdev, u8 port) +{ + struct ib_pma_portcounters_ext *p = + (struct ib_pma_portcounters_ext *)pmp->data; + struct qib_ibport *ibp = to_iport(ibdev, port); + struct qib_pportdata *ppd = ppd_from_ibp(ibp); + u64 swords, rwords, spkts, rpkts, xwait; + struct qib_pma_counters pma; + u8 port_select = p->port_select; + + memset(pmp->data, 0, sizeof(pmp->data)); + + p->port_select = port_select; + if (pmp->mad_hdr.attr_mod != 0 || port_select != port) { + pmp->mad_hdr.status |= IB_SMP_INVALID_FIELD; + goto bail; + } + + qib_snapshot_counters(ppd, &swords, &rwords, &spkts, &rpkts, &xwait); + + /* Adjust counters for any resets done. */ + swords -= ibp->z_port_xmit_data; + rwords -= ibp->z_port_rcv_data; + spkts -= ibp->z_port_xmit_packets; + rpkts -= ibp->z_port_rcv_packets; + + p->port_xmit_data = cpu_to_be64(swords); + p->port_rcv_data = cpu_to_be64(rwords); + p->port_xmit_packets = cpu_to_be64(spkts); + p->port_rcv_packets = cpu_to_be64(rpkts); + + qib_snapshot_pmacounters(ibp, &pma); + + p->port_unicast_xmit_packets = cpu_to_be64(pma.n_unicast_xmit + - ibp->z_unicast_xmit); + p->port_unicast_rcv_packets = cpu_to_be64(pma.n_unicast_rcv + - ibp->z_unicast_rcv); + p->port_multicast_xmit_packets = cpu_to_be64(pma.n_multicast_xmit + - ibp->z_multicast_xmit); + p->port_multicast_rcv_packets = cpu_to_be64(pma.n_multicast_rcv + - ibp->z_multicast_rcv); + +bail: + return reply((struct ib_smp *) pmp); +} + +static int pma_set_portcounters(struct ib_pma_mad *pmp, + struct ib_device *ibdev, u8 port) +{ + struct ib_pma_portcounters *p = (struct ib_pma_portcounters *) + pmp->data; + struct qib_ibport *ibp = to_iport(ibdev, port); + struct qib_pportdata *ppd = ppd_from_ibp(ibp); + struct qib_verbs_counters cntrs; + + /* + * Since the HW doesn't support clearing counters, we save the + * current count and subtract it from future responses. 
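+ * For example, a Set with IB_PMA_SEL_SYMBOL_ERROR only records the current symbol_error_counter in z_symbol_error_counter; the next Get then reports the running value minus that snapshot.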
+ */ + qib_get_counters(ppd, &cntrs); + + if (p->counter_select & IB_PMA_SEL_SYMBOL_ERROR) + ibp->z_symbol_error_counter = cntrs.symbol_error_counter; + + if (p->counter_select & IB_PMA_SEL_LINK_ERROR_RECOVERY) + ibp->z_link_error_recovery_counter = + cntrs.link_error_recovery_counter; + + if (p->counter_select & IB_PMA_SEL_LINK_DOWNED) + ibp->z_link_downed_counter = cntrs.link_downed_counter; + + if (p->counter_select & IB_PMA_SEL_PORT_RCV_ERRORS) + ibp->z_port_rcv_errors = cntrs.port_rcv_errors; + + if (p->counter_select & IB_PMA_SEL_PORT_RCV_REMPHYS_ERRORS) + ibp->z_port_rcv_remphys_errors = + cntrs.port_rcv_remphys_errors; + + if (p->counter_select & IB_PMA_SEL_PORT_XMIT_DISCARDS) + ibp->z_port_xmit_discards = cntrs.port_xmit_discards; + + if (p->counter_select & IB_PMA_SEL_LOCAL_LINK_INTEGRITY_ERRORS) + ibp->z_local_link_integrity_errors = + cntrs.local_link_integrity_errors; + + if (p->counter_select & IB_PMA_SEL_EXCESSIVE_BUFFER_OVERRUNS) + ibp->z_excessive_buffer_overrun_errors = + cntrs.excessive_buffer_overrun_errors; + + if (p->counter_select & IB_PMA_SEL_PORT_VL15_DROPPED) { + ibp->n_vl15_dropped = 0; + ibp->z_vl15_dropped = cntrs.vl15_dropped; + } + + if (p->counter_select & IB_PMA_SEL_PORT_XMIT_DATA) + ibp->z_port_xmit_data = cntrs.port_xmit_data; + + if (p->counter_select & IB_PMA_SEL_PORT_RCV_DATA) + ibp->z_port_rcv_data = cntrs.port_rcv_data; + + if (p->counter_select & IB_PMA_SEL_PORT_XMIT_PACKETS) + ibp->z_port_xmit_packets = cntrs.port_xmit_packets; + + if (p->counter_select & IB_PMA_SEL_PORT_RCV_PACKETS) + ibp->z_port_rcv_packets = cntrs.port_rcv_packets; + + return pma_get_portcounters(pmp, ibdev, port); +} + +static int pma_set_portcounters_cong(struct ib_pma_mad *pmp, + struct ib_device *ibdev, u8 port) +{ + struct qib_ibport *ibp = to_iport(ibdev, port); + struct qib_pportdata *ppd = ppd_from_ibp(ibp); + struct qib_devdata *dd = dd_from_ppd(ppd); + struct qib_verbs_counters cntrs; + u32 counter_select = (be32_to_cpu(pmp->mad_hdr.attr_mod) >> 24) & 0xFF; + int ret = 0; + unsigned long flags; + + qib_get_counters(ppd, &cntrs); + /* Get counter values before we save them */ + ret = pma_get_portcounters_cong(pmp, ibdev, port); + + if (counter_select & IB_PMA_SEL_CONG_XMIT) { + spin_lock_irqsave(&ppd->ibport_data.lock, flags); + ppd->cong_stats.counter = 0; + dd->f_set_cntr_sample(ppd, QIB_CONG_TIMER_PSINTERVAL, + 0x0); + spin_unlock_irqrestore(&ppd->ibport_data.lock, flags); + } + if (counter_select & IB_PMA_SEL_CONG_PORT_DATA) { + ibp->z_port_xmit_data = cntrs.port_xmit_data; + ibp->z_port_rcv_data = cntrs.port_rcv_data; + ibp->z_port_xmit_packets = cntrs.port_xmit_packets; + ibp->z_port_rcv_packets = cntrs.port_rcv_packets; + } + if (counter_select & IB_PMA_SEL_CONG_ALL) { + ibp->z_symbol_error_counter = + cntrs.symbol_error_counter; + ibp->z_link_error_recovery_counter = + cntrs.link_error_recovery_counter; + ibp->z_link_downed_counter = + cntrs.link_downed_counter; + ibp->z_port_rcv_errors = cntrs.port_rcv_errors; + ibp->z_port_rcv_remphys_errors = + cntrs.port_rcv_remphys_errors; + ibp->z_port_xmit_discards = + cntrs.port_xmit_discards; + ibp->z_local_link_integrity_errors = + cntrs.local_link_integrity_errors; + ibp->z_excessive_buffer_overrun_errors = + cntrs.excessive_buffer_overrun_errors; + ibp->n_vl15_dropped = 0; + ibp->z_vl15_dropped = cntrs.vl15_dropped; + } + + return ret; +} + +static int pma_set_portcounters_ext(struct ib_pma_mad *pmp, + struct ib_device *ibdev, u8 port) +{ + struct ib_pma_portcounters *p = (struct ib_pma_portcounters *) + pmp->data; + 
struct qib_ibport *ibp = to_iport(ibdev, port); + struct qib_pportdata *ppd = ppd_from_ibp(ibp); + u64 swords, rwords, spkts, rpkts, xwait; + struct qib_pma_counters pma; + + qib_snapshot_counters(ppd, &swords, &rwords, &spkts, &rpkts, &xwait); + + if (p->counter_select & IB_PMA_SELX_PORT_XMIT_DATA) + ibp->z_port_xmit_data = swords; + + if (p->counter_select & IB_PMA_SELX_PORT_RCV_DATA) + ibp->z_port_rcv_data = rwords; + + if (p->counter_select & IB_PMA_SELX_PORT_XMIT_PACKETS) + ibp->z_port_xmit_packets = spkts; + + if (p->counter_select & IB_PMA_SELX_PORT_RCV_PACKETS) + ibp->z_port_rcv_packets = rpkts; + + qib_snapshot_pmacounters(ibp, &pma); + + if (p->counter_select & IB_PMA_SELX_PORT_UNI_XMIT_PACKETS) + ibp->z_unicast_xmit = pma.n_unicast_xmit; + + if (p->counter_select & IB_PMA_SELX_PORT_UNI_RCV_PACKETS) + ibp->z_unicast_rcv = pma.n_unicast_rcv; + + if (p->counter_select & IB_PMA_SELX_PORT_MULTI_XMIT_PACKETS) + ibp->z_multicast_xmit = pma.n_multicast_xmit; + + if (p->counter_select & IB_PMA_SELX_PORT_MULTI_RCV_PACKETS) + ibp->z_multicast_rcv = pma.n_multicast_rcv; + + return pma_get_portcounters_ext(pmp, ibdev, port); +} + +static int process_subn(struct ib_device *ibdev, int mad_flags, + u8 port, struct ib_mad *in_mad, + struct ib_mad *out_mad) +{ + struct ib_smp *smp = (struct ib_smp *)out_mad; + struct qib_ibport *ibp = to_iport(ibdev, port); + struct qib_pportdata *ppd = ppd_from_ibp(ibp); + int ret; + + *out_mad = *in_mad; + if (smp->class_version != 1) { + smp->status |= IB_SMP_UNSUP_VERSION; + ret = reply(smp); + goto bail; + } + + ret = check_mkey(ibp, smp, mad_flags); + if (ret) { + u32 port_num = be32_to_cpu(smp->attr_mod); + + /* + * If this is a get/set portinfo, we already check the + * M_Key if the MAD is for another port and the M_Key + * is OK on the receiving port. This check is needed + * to increment the error counters when the M_Key + * fails to match on *both* ports. 
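+ * The extra check_mkey() call below is made only for that counting side effect; its return value is deliberately ignored.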
+ */ + if (in_mad->mad_hdr.attr_id == IB_SMP_ATTR_PORT_INFO && + (smp->method == IB_MGMT_METHOD_GET || + smp->method == IB_MGMT_METHOD_SET) && + port_num && port_num <= ibdev->phys_port_cnt && + port != port_num) + (void) check_mkey(to_iport(ibdev, port_num), smp, 0); + ret = IB_MAD_RESULT_FAILURE; + goto bail; + } + + switch (smp->method) { + case IB_MGMT_METHOD_GET: + switch (smp->attr_id) { + case IB_SMP_ATTR_NODE_DESC: + ret = subn_get_nodedescription(smp, ibdev); + goto bail; + case IB_SMP_ATTR_NODE_INFO: + ret = subn_get_nodeinfo(smp, ibdev, port); + goto bail; + case IB_SMP_ATTR_GUID_INFO: + ret = subn_get_guidinfo(smp, ibdev, port); + goto bail; + case IB_SMP_ATTR_PORT_INFO: + ret = subn_get_portinfo(smp, ibdev, port); + goto bail; + case IB_SMP_ATTR_PKEY_TABLE: + ret = subn_get_pkeytable(smp, ibdev, port); + goto bail; + case IB_SMP_ATTR_SL_TO_VL_TABLE: + ret = subn_get_sl_to_vl(smp, ibdev, port); + goto bail; + case IB_SMP_ATTR_VL_ARB_TABLE: + ret = subn_get_vl_arb(smp, ibdev, port); + goto bail; + case IB_SMP_ATTR_SM_INFO: + if (ibp->port_cap_flags & IB_PORT_SM_DISABLED) { + ret = IB_MAD_RESULT_SUCCESS | + IB_MAD_RESULT_CONSUMED; + goto bail; + } + if (ibp->port_cap_flags & IB_PORT_SM) { + ret = IB_MAD_RESULT_SUCCESS; + goto bail; + } + /* FALLTHROUGH */ + default: + smp->status |= IB_SMP_UNSUP_METH_ATTR; + ret = reply(smp); + goto bail; + } + + case IB_MGMT_METHOD_SET: + switch (smp->attr_id) { + case IB_SMP_ATTR_GUID_INFO: + ret = subn_set_guidinfo(smp, ibdev, port); + goto bail; + case IB_SMP_ATTR_PORT_INFO: + ret = subn_set_portinfo(smp, ibdev, port); + goto bail; + case IB_SMP_ATTR_PKEY_TABLE: + ret = subn_set_pkeytable(smp, ibdev, port); + goto bail; + case IB_SMP_ATTR_SL_TO_VL_TABLE: + ret = subn_set_sl_to_vl(smp, ibdev, port); + goto bail; + case IB_SMP_ATTR_VL_ARB_TABLE: + ret = subn_set_vl_arb(smp, ibdev, port); + goto bail; + case IB_SMP_ATTR_SM_INFO: + if (ibp->port_cap_flags & IB_PORT_SM_DISABLED) { + ret = IB_MAD_RESULT_SUCCESS | + IB_MAD_RESULT_CONSUMED; + goto bail; + } + if (ibp->port_cap_flags & IB_PORT_SM) { + ret = IB_MAD_RESULT_SUCCESS; + goto bail; + } + /* FALLTHROUGH */ + default: + smp->status |= IB_SMP_UNSUP_METH_ATTR; + ret = reply(smp); + goto bail; + } + + case IB_MGMT_METHOD_TRAP_REPRESS: + if (smp->attr_id == IB_SMP_ATTR_NOTICE) + ret = subn_trap_repress(smp, ibdev, port); + else { + smp->status |= IB_SMP_UNSUP_METH_ATTR; + ret = reply(smp); + } + goto bail; + + case IB_MGMT_METHOD_TRAP: + case IB_MGMT_METHOD_REPORT: + case IB_MGMT_METHOD_REPORT_RESP: + case IB_MGMT_METHOD_GET_RESP: + /* + * The ib_mad module will call us to process responses + * before checking for other consumers. + * Just tell the caller to process it normally. 
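+ * (Returning IB_MAD_RESULT_SUCCESS without IB_MAD_RESULT_REPLY or IB_MAD_RESULT_CONSUMED leaves the MAD for the normal consumers.)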
+ */ + ret = IB_MAD_RESULT_SUCCESS; + goto bail; + + case IB_MGMT_METHOD_SEND: + if (ib_get_smp_direction(smp) && + smp->attr_id == QIB_VENDOR_IPG) { + ppd->dd->f_set_ib_cfg(ppd, QIB_IB_CFG_PORT, + smp->data[0]); + ret = IB_MAD_RESULT_SUCCESS | IB_MAD_RESULT_CONSUMED; + } else + ret = IB_MAD_RESULT_SUCCESS; + goto bail; + + default: + smp->status |= IB_SMP_UNSUP_METHOD; + ret = reply(smp); + } + +bail: + return ret; +} + +static int process_perf(struct ib_device *ibdev, u8 port, + struct ib_mad *in_mad, + struct ib_mad *out_mad) +{ + struct ib_pma_mad *pmp = (struct ib_pma_mad *)out_mad; + int ret; + + *out_mad = *in_mad; + if (pmp->mad_hdr.class_version != 1) { + pmp->mad_hdr.status |= IB_SMP_UNSUP_VERSION; + ret = reply((struct ib_smp *) pmp); + goto bail; + } + + switch (pmp->mad_hdr.method) { + case IB_MGMT_METHOD_GET: + switch (pmp->mad_hdr.attr_id) { + case IB_PMA_CLASS_PORT_INFO: + ret = pma_get_classportinfo(pmp, ibdev); + goto bail; + case IB_PMA_PORT_SAMPLES_CONTROL: + ret = pma_get_portsamplescontrol(pmp, ibdev, port); + goto bail; + case IB_PMA_PORT_SAMPLES_RESULT: + ret = pma_get_portsamplesresult(pmp, ibdev, port); + goto bail; + case IB_PMA_PORT_SAMPLES_RESULT_EXT: + ret = pma_get_portsamplesresult_ext(pmp, ibdev, port); + goto bail; + case IB_PMA_PORT_COUNTERS: + ret = pma_get_portcounters(pmp, ibdev, port); + goto bail; + case IB_PMA_PORT_COUNTERS_EXT: + ret = pma_get_portcounters_ext(pmp, ibdev, port); + goto bail; + case IB_PMA_PORT_COUNTERS_CONG: + ret = pma_get_portcounters_cong(pmp, ibdev, port); + goto bail; + default: + pmp->mad_hdr.status |= IB_SMP_UNSUP_METH_ATTR; + ret = reply((struct ib_smp *) pmp); + goto bail; + } + + case IB_MGMT_METHOD_SET: + switch (pmp->mad_hdr.attr_id) { + case IB_PMA_PORT_SAMPLES_CONTROL: + ret = pma_set_portsamplescontrol(pmp, ibdev, port); + goto bail; + case IB_PMA_PORT_COUNTERS: + ret = pma_set_portcounters(pmp, ibdev, port); + goto bail; + case IB_PMA_PORT_COUNTERS_EXT: + ret = pma_set_portcounters_ext(pmp, ibdev, port); + goto bail; + case IB_PMA_PORT_COUNTERS_CONG: + ret = pma_set_portcounters_cong(pmp, ibdev, port); + goto bail; + default: + pmp->mad_hdr.status |= IB_SMP_UNSUP_METH_ATTR; + ret = reply((struct ib_smp *) pmp); + goto bail; + } + + case IB_MGMT_METHOD_TRAP: + case IB_MGMT_METHOD_GET_RESP: + /* + * The ib_mad module will call us to process responses + * before checking for other consumers. + * Just tell the caller to process it normally. + */ + ret = IB_MAD_RESULT_SUCCESS; + goto bail; + + default: + pmp->mad_hdr.status |= IB_SMP_UNSUP_METHOD; + ret = reply((struct ib_smp *) pmp); + } + +bail: + return ret; +} + +static int cc_get_classportinfo(struct ib_cc_mad *ccp, + struct ib_device *ibdev) +{ + struct ib_cc_classportinfo_attr *p = + (struct ib_cc_classportinfo_attr *)ccp->mgmt_data; + + memset(ccp->mgmt_data, 0, sizeof(ccp->mgmt_data)); + + p->base_version = 1; + p->class_version = 1; + p->cap_mask = 0; + + /* + * Expected response time is 4.096 usec. * 2^18 == 1.073741824 sec. 
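+ * (resp_time_value is the exponent: the encoded response time is 4.096 usec * 2^resp_time_value.)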
+ */ + p->resp_time_value = 18; + + return reply((struct ib_smp *) ccp); +} + +static int cc_get_congestion_info(struct ib_cc_mad *ccp, + struct ib_device *ibdev, u8 port) +{ + struct ib_cc_info_attr *p = + (struct ib_cc_info_attr *)ccp->mgmt_data; + struct qib_ibport *ibp = to_iport(ibdev, port); + struct qib_pportdata *ppd = ppd_from_ibp(ibp); + + memset(ccp->mgmt_data, 0, sizeof(ccp->mgmt_data)); + + p->congestion_info = 0; + p->control_table_cap = ppd->cc_max_table_entries; + + return reply((struct ib_smp *) ccp); +} + +static int cc_get_congestion_setting(struct ib_cc_mad *ccp, + struct ib_device *ibdev, u8 port) +{ + int i; + struct ib_cc_congestion_setting_attr *p = + (struct ib_cc_congestion_setting_attr *)ccp->mgmt_data; + struct qib_ibport *ibp = to_iport(ibdev, port); + struct qib_pportdata *ppd = ppd_from_ibp(ibp); + struct ib_cc_congestion_entry_shadow *entries; + + memset(ccp->mgmt_data, 0, sizeof(ccp->mgmt_data)); + + spin_lock(&ppd->cc_shadow_lock); + + entries = ppd->congestion_entries_shadow->entries; + p->port_control = cpu_to_be16( + ppd->congestion_entries_shadow->port_control); + p->control_map = cpu_to_be16( + ppd->congestion_entries_shadow->control_map); + for (i = 0; i < IB_CC_CCS_ENTRIES; i++) { + p->entries[i].ccti_increase = entries[i].ccti_increase; + p->entries[i].ccti_timer = cpu_to_be16(entries[i].ccti_timer); + p->entries[i].trigger_threshold = entries[i].trigger_threshold; + p->entries[i].ccti_min = entries[i].ccti_min; + } + + spin_unlock(&ppd->cc_shadow_lock); + + return reply((struct ib_smp *) ccp); +} + +static int cc_get_congestion_control_table(struct ib_cc_mad *ccp, + struct ib_device *ibdev, u8 port) +{ + struct ib_cc_table_attr *p = + (struct ib_cc_table_attr *)ccp->mgmt_data; + struct qib_ibport *ibp = to_iport(ibdev, port); + struct qib_pportdata *ppd = ppd_from_ibp(ibp); + u32 cct_block_index = be32_to_cpu(ccp->attr_mod); + u32 max_cct_block; + u32 cct_entry; + struct ib_cc_table_entry_shadow *entries; + int i; + + /* Is the table index more than what is supported? */ + if (cct_block_index > IB_CC_TABLE_CAP_DEFAULT - 1) + goto bail; + + memset(ccp->mgmt_data, 0, sizeof(ccp->mgmt_data)); + + spin_lock(&ppd->cc_shadow_lock); + + max_cct_block = + (ppd->ccti_entries_shadow->ccti_last_entry + 1)/IB_CCT_ENTRIES; + max_cct_block = max_cct_block ? 
max_cct_block - 1 : 0; + + if (cct_block_index > max_cct_block) { + spin_unlock(&ppd->cc_shadow_lock); + goto bail; + } + + ccp->attr_mod = cpu_to_be32(cct_block_index); + + cct_entry = IB_CCT_ENTRIES * (cct_block_index + 1); + + cct_entry--; + + p->ccti_limit = cpu_to_be16(cct_entry); + + entries = &ppd->ccti_entries_shadow-> + entries[IB_CCT_ENTRIES * cct_block_index]; + cct_entry %= IB_CCT_ENTRIES; + + for (i = 0; i <= cct_entry; i++) + p->ccti_entries[i].entry = cpu_to_be16(entries[i].entry); + + spin_unlock(&ppd->cc_shadow_lock); + + return reply((struct ib_smp *) ccp); + +bail: + return reply_failure((struct ib_smp *) ccp); +} + +static int cc_set_congestion_setting(struct ib_cc_mad *ccp, + struct ib_device *ibdev, u8 port) +{ + struct ib_cc_congestion_setting_attr *p = + (struct ib_cc_congestion_setting_attr *)ccp->mgmt_data; + struct qib_ibport *ibp = to_iport(ibdev, port); + struct qib_pportdata *ppd = ppd_from_ibp(ibp); + int i; + + ppd->cc_sl_control_map = be16_to_cpu(p->control_map); + + for (i = 0; i < IB_CC_CCS_ENTRIES; i++) { + ppd->congestion_entries[i].ccti_increase = + p->entries[i].ccti_increase; + + ppd->congestion_entries[i].ccti_timer = + be16_to_cpu(p->entries[i].ccti_timer); + + ppd->congestion_entries[i].trigger_threshold = + p->entries[i].trigger_threshold; + + ppd->congestion_entries[i].ccti_min = + p->entries[i].ccti_min; + } + + return reply((struct ib_smp *) ccp); +} + +static int cc_set_congestion_control_table(struct ib_cc_mad *ccp, + struct ib_device *ibdev, u8 port) +{ + struct ib_cc_table_attr *p = + (struct ib_cc_table_attr *)ccp->mgmt_data; + struct qib_ibport *ibp = to_iport(ibdev, port); + struct qib_pportdata *ppd = ppd_from_ibp(ibp); + u32 cct_block_index = be32_to_cpu(ccp->attr_mod); + u32 cct_entry; + struct ib_cc_table_entry_shadow *entries; + int i; + + /* Is the table index more than what is supported? */ + if (cct_block_index > IB_CC_TABLE_CAP_DEFAULT - 1) + goto bail; + + /* If this packet is the first in the sequence then + * zero the total table entry count. 
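+ * (The test below treats a ccti_limit smaller than IB_CCT_ENTRIES as the start of a new table transfer.)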
+ */ + if (be16_to_cpu(p->ccti_limit) < IB_CCT_ENTRIES) + ppd->total_cct_entry = 0; + + cct_entry = (be16_to_cpu(p->ccti_limit))%IB_CCT_ENTRIES; + + /* ccti_limit is 0 to 63 */ + ppd->total_cct_entry += (cct_entry + 1); + + if (ppd->total_cct_entry > ppd->cc_supported_table_entries) + goto bail; + + ppd->ccti_limit = be16_to_cpu(p->ccti_limit); + + entries = ppd->ccti_entries + (IB_CCT_ENTRIES * cct_block_index); + + for (i = 0; i <= cct_entry; i++) + entries[i].entry = be16_to_cpu(p->ccti_entries[i].entry); + + spin_lock(&ppd->cc_shadow_lock); + + ppd->ccti_entries_shadow->ccti_last_entry = ppd->total_cct_entry - 1; + memcpy(ppd->ccti_entries_shadow->entries, ppd->ccti_entries, + (ppd->total_cct_entry * sizeof(struct ib_cc_table_entry))); + + ppd->congestion_entries_shadow->port_control = IB_CC_CCS_PC_SL_BASED; + ppd->congestion_entries_shadow->control_map = ppd->cc_sl_control_map; + memcpy(ppd->congestion_entries_shadow->entries, ppd->congestion_entries, + IB_CC_CCS_ENTRIES * sizeof(struct ib_cc_congestion_entry)); + + spin_unlock(&ppd->cc_shadow_lock); + + return reply((struct ib_smp *) ccp); + +bail: + return reply_failure((struct ib_smp *) ccp); +} + +static int check_cc_key(struct qib_ibport *ibp, + struct ib_cc_mad *ccp, int mad_flags) +{ + return 0; +} + +static int process_cc(struct ib_device *ibdev, int mad_flags, + u8 port, struct ib_mad *in_mad, + struct ib_mad *out_mad) +{ + struct ib_cc_mad *ccp = (struct ib_cc_mad *)out_mad; + struct qib_ibport *ibp = to_iport(ibdev, port); + int ret; + + *out_mad = *in_mad; + + if (ccp->class_version != 2) { + ccp->status |= IB_SMP_UNSUP_VERSION; + ret = reply((struct ib_smp *)ccp); + goto bail; + } + + ret = check_cc_key(ibp, ccp, mad_flags); + if (ret) + goto bail; + + switch (ccp->method) { + case IB_MGMT_METHOD_GET: + switch (ccp->attr_id) { + case IB_CC_ATTR_CLASSPORTINFO: + ret = cc_get_classportinfo(ccp, ibdev); + goto bail; + + case IB_CC_ATTR_CONGESTION_INFO: + ret = cc_get_congestion_info(ccp, ibdev, port); + goto bail; + + case IB_CC_ATTR_CA_CONGESTION_SETTING: + ret = cc_get_congestion_setting(ccp, ibdev, port); + goto bail; + + case IB_CC_ATTR_CONGESTION_CONTROL_TABLE: + ret = cc_get_congestion_control_table(ccp, ibdev, port); + goto bail; + + /* FALLTHROUGH */ + default: + ccp->status |= IB_SMP_UNSUP_METH_ATTR; + ret = reply((struct ib_smp *) ccp); + goto bail; + } + + case IB_MGMT_METHOD_SET: + switch (ccp->attr_id) { + case IB_CC_ATTR_CA_CONGESTION_SETTING: + ret = cc_set_congestion_setting(ccp, ibdev, port); + goto bail; + + case IB_CC_ATTR_CONGESTION_CONTROL_TABLE: + ret = cc_set_congestion_control_table(ccp, ibdev, port); + goto bail; + + /* FALLTHROUGH */ + default: + ccp->status |= IB_SMP_UNSUP_METH_ATTR; + ret = reply((struct ib_smp *) ccp); + goto bail; + } + + case IB_MGMT_METHOD_GET_RESP: + /* + * The ib_mad module will call us to process responses + * before checking for other consumers. + * Just tell the caller to process it normally. 
+ */ + ret = IB_MAD_RESULT_SUCCESS; + goto bail; + + case IB_MGMT_METHOD_TRAP: + default: + ccp->status |= IB_SMP_UNSUP_METHOD; + ret = reply((struct ib_smp *) ccp); + } + +bail: + return ret; +} + +/** + * qib_process_mad - process an incoming MAD packet + * @ibdev: the infiniband device this packet came in on + * @mad_flags: MAD flags + * @port: the port number this packet came in on + * @in_wc: the work completion entry for this packet + * @in_grh: the global route header for this packet + * @in_mad: the incoming MAD + * @out_mad: any outgoing MAD reply + * + * Returns IB_MAD_RESULT_SUCCESS if this is a MAD that we are not + * interested in processing. + * + * Note that the verbs framework has already done the MAD sanity checks, + * and hop count/pointer updating for IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE + * MADs. + * + * This is called by the ib_mad module. + */ +int qib_process_mad(struct ib_device *ibdev, int mad_flags, u8 port, + struct ib_wc *in_wc, struct ib_grh *in_grh, + struct ib_mad *in_mad, struct ib_mad *out_mad) +{ + int ret; + struct qib_ibport *ibp = to_iport(ibdev, port); + struct qib_pportdata *ppd = ppd_from_ibp(ibp); + + switch (in_mad->mad_hdr.mgmt_class) { + case IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE: + case IB_MGMT_CLASS_SUBN_LID_ROUTED: + ret = process_subn(ibdev, mad_flags, port, in_mad, out_mad); + goto bail; + + case IB_MGMT_CLASS_PERF_MGMT: + ret = process_perf(ibdev, port, in_mad, out_mad); + goto bail; + + case IB_MGMT_CLASS_CONG_MGMT: + if (!ppd->congestion_entries_shadow || + !qib_cc_table_size) { + ret = IB_MAD_RESULT_SUCCESS; + goto bail; + } + ret = process_cc(ibdev, mad_flags, port, in_mad, out_mad); + goto bail; + + default: + ret = IB_MAD_RESULT_SUCCESS; + } + +bail: + return ret; +} + +static void send_handler(struct ib_mad_agent *agent, + struct ib_mad_send_wc *mad_send_wc) +{ + ib_free_send_mad(mad_send_wc->send_buf); +} + +static void xmit_wait_timer_func(unsigned long opaque) +{ + struct qib_pportdata *ppd = (struct qib_pportdata *)opaque; + struct qib_devdata *dd = dd_from_ppd(ppd); + unsigned long flags; + u8 status; + + spin_lock_irqsave(&ppd->ibport_data.lock, flags); + if (ppd->cong_stats.flags == IB_PMA_CONG_HW_CONTROL_SAMPLE) { + status = dd->f_portcntr(ppd, QIBPORTCNTR_PSSTAT); + if (status == IB_PMA_SAMPLE_STATUS_DONE) { + /* save counter cache */ + cache_hw_sample_counters(ppd); + ppd->cong_stats.flags = IB_PMA_CONG_HW_CONTROL_TIMER; + } else + goto done; + } + ppd->cong_stats.counter = xmit_wait_get_value_delta(ppd); + dd->f_set_cntr_sample(ppd, QIB_CONG_TIMER_PSINTERVAL, 0x0); +done: + spin_unlock_irqrestore(&ppd->ibport_data.lock, flags); + mod_timer(&ppd->cong_stats.timer, jiffies + HZ); +} + +int qib_create_agents(struct qib_ibdev *dev) +{ + struct qib_devdata *dd = dd_from_dev(dev); + struct ib_mad_agent *agent; + struct qib_ibport *ibp; + int p; + int ret; + + for (p = 0; p < dd->num_pports; p++) { + ibp = &dd->pport[p].ibport_data; + agent = ib_register_mad_agent(&dev->ibdev, p + 1, IB_QPT_SMI, + NULL, 0, send_handler, + NULL, NULL, 0); + if (IS_ERR(agent)) { + ret = PTR_ERR(agent); + goto err; + } + + /* Initialize xmit_wait structure */ + dd->pport[p].cong_stats.counter = 0; + init_timer(&dd->pport[p].cong_stats.timer); + dd->pport[p].cong_stats.timer.function = xmit_wait_timer_func; + dd->pport[p].cong_stats.timer.data = + (unsigned long)(&dd->pport[p]); + dd->pport[p].cong_stats.timer.expires = 0; + add_timer(&dd->pport[p].cong_stats.timer); + + ibp->send_agent = agent; + } + + return 0; + +err: + for (p = 0; p < dd->num_pports; 
p++) { + ibp = &dd->pport[p].ibport_data; + if (ibp->send_agent) { + agent = ibp->send_agent; + ibp->send_agent = NULL; + ib_unregister_mad_agent(agent); + } + } + + return ret; +} + +void qib_free_agents(struct qib_ibdev *dev) +{ + struct qib_devdata *dd = dd_from_dev(dev); + struct ib_mad_agent *agent; + struct qib_ibport *ibp; + int p; + + for (p = 0; p < dd->num_pports; p++) { + ibp = &dd->pport[p].ibport_data; + if (ibp->send_agent) { + agent = ibp->send_agent; + ibp->send_agent = NULL; + ib_unregister_mad_agent(agent); + } + if (ibp->sm_ah) { + ib_destroy_ah(&ibp->sm_ah->ibah); + ibp->sm_ah = NULL; + } + if (dd->pport[p].cong_stats.timer.data) + del_timer_sync(&dd->pport[p].cong_stats.timer); + } +} diff --git a/kernel/drivers/infiniband/hw/qib/qib_mad.h b/kernel/drivers/infiniband/hw/qib/qib_mad.h new file mode 100644 index 000000000..941d4d50d --- /dev/null +++ b/kernel/drivers/infiniband/hw/qib/qib_mad.h @@ -0,0 +1,431 @@ +/* + * Copyright (c) 2012 Intel Corporation. All rights reserved. + * Copyright (c) 2006 - 2012 QLogic Corporation. All rights reserved. + * Copyright (c) 2005, 2006 PathScale, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#ifndef _QIB_MAD_H +#define _QIB_MAD_H + +#include + +#define IB_SMP_UNSUP_VERSION cpu_to_be16(0x0004) +#define IB_SMP_UNSUP_METHOD cpu_to_be16(0x0008) +#define IB_SMP_UNSUP_METH_ATTR cpu_to_be16(0x000C) +#define IB_SMP_INVALID_FIELD cpu_to_be16(0x001C) + +struct ib_node_info { + u8 base_version; + u8 class_version; + u8 node_type; + u8 num_ports; + __be64 sys_guid; + __be64 node_guid; + __be64 port_guid; + __be16 partition_cap; + __be16 device_id; + __be32 revision; + u8 local_port_num; + u8 vendor_id[3]; +} __packed; + +struct ib_mad_notice_attr { + u8 generic_type; + u8 prod_type_msb; + __be16 prod_type_lsb; + __be16 trap_num; + __be16 issuer_lid; + __be16 toggle_count; + + union { + struct { + u8 details[54]; + } raw_data; + + struct { + __be16 reserved; + __be16 lid; /* where violation happened */ + u8 port_num; /* where violation happened */ + } __packed ntc_129_131; + + struct { + __be16 reserved; + __be16 lid; /* LID where change occurred */ + u8 reserved2; + u8 local_changes; /* low bit - local changes */ + __be32 new_cap_mask; /* new capability mask */ + u8 reserved3; + u8 change_flags; /* low 3 bits only */ + } __packed ntc_144; + + struct { + __be16 reserved; + __be16 lid; /* lid where sys guid changed */ + __be16 reserved2; + __be64 new_sys_guid; + } __packed ntc_145; + + struct { + __be16 reserved; + __be16 lid; + __be16 dr_slid; + u8 method; + u8 reserved2; + __be16 attr_id; + __be32 attr_mod; + __be64 mkey; + u8 reserved3; + u8 dr_trunc_hop; + u8 dr_rtn_path[30]; + } __packed ntc_256; + + struct { + __be16 reserved; + __be16 lid1; + __be16 lid2; + __be32 key; + __be32 sl_qp1; /* SL: high 4 bits */ + __be32 qp2; /* high 8 bits reserved */ + union ib_gid gid1; + union ib_gid gid2; + } __packed ntc_257_258; + + } details; +}; + +/* + * Generic trap/notice types + */ +#define IB_NOTICE_TYPE_FATAL 0x80 +#define IB_NOTICE_TYPE_URGENT 0x81 +#define IB_NOTICE_TYPE_SECURITY 0x82 +#define IB_NOTICE_TYPE_SM 0x83 +#define IB_NOTICE_TYPE_INFO 0x84 + +/* + * Generic trap/notice producers + */ +#define IB_NOTICE_PROD_CA cpu_to_be16(1) +#define IB_NOTICE_PROD_SWITCH cpu_to_be16(2) +#define IB_NOTICE_PROD_ROUTER cpu_to_be16(3) +#define IB_NOTICE_PROD_CLASS_MGR cpu_to_be16(4) + +/* + * Generic trap/notice numbers + */ +#define IB_NOTICE_TRAP_LLI_THRESH cpu_to_be16(129) +#define IB_NOTICE_TRAP_EBO_THRESH cpu_to_be16(130) +#define IB_NOTICE_TRAP_FLOW_UPDATE cpu_to_be16(131) +#define IB_NOTICE_TRAP_CAP_MASK_CHG cpu_to_be16(144) +#define IB_NOTICE_TRAP_SYS_GUID_CHG cpu_to_be16(145) +#define IB_NOTICE_TRAP_BAD_MKEY cpu_to_be16(256) +#define IB_NOTICE_TRAP_BAD_PKEY cpu_to_be16(257) +#define IB_NOTICE_TRAP_BAD_QKEY cpu_to_be16(258) + +/* + * Repress trap/notice flags + */ +#define IB_NOTICE_REPRESS_LLI_THRESH (1 << 0) +#define IB_NOTICE_REPRESS_EBO_THRESH (1 << 1) +#define IB_NOTICE_REPRESS_FLOW_UPDATE (1 << 2) +#define IB_NOTICE_REPRESS_CAP_MASK_CHG (1 << 3) +#define IB_NOTICE_REPRESS_SYS_GUID_CHG (1 << 4) +#define IB_NOTICE_REPRESS_BAD_MKEY (1 << 5) +#define IB_NOTICE_REPRESS_BAD_PKEY (1 << 6) +#define IB_NOTICE_REPRESS_BAD_QKEY (1 << 7) + +/* + * Generic trap/notice other local changes flags (trap 144). + */ +#define IB_NOTICE_TRAP_LSE_CHG 0x04 /* Link Speed Enable changed */ +#define IB_NOTICE_TRAP_LWE_CHG 0x02 /* Link Width Enable changed */ +#define IB_NOTICE_TRAP_NODE_DESC_CHG 0x01 + +/* + * Generic trap/notice M_Key volation flags in dr_trunc_hop (trap 256). 
+ */ +#define IB_NOTICE_TRAP_DR_NOTICE 0x80 +#define IB_NOTICE_TRAP_DR_TRUNC 0x40 + +struct ib_vl_weight_elem { + u8 vl; /* Only low 4 bits, upper 4 bits reserved */ + u8 weight; +}; + +#define IB_VLARB_LOWPRI_0_31 1 +#define IB_VLARB_LOWPRI_32_63 2 +#define IB_VLARB_HIGHPRI_0_31 3 +#define IB_VLARB_HIGHPRI_32_63 4 + +#define IB_PMA_PORT_COUNTERS_CONG cpu_to_be16(0xFF00) + +struct ib_pma_portcounters_cong { + u8 reserved; + u8 reserved1; + __be16 port_check_rate; + __be16 symbol_error_counter; + u8 link_error_recovery_counter; + u8 link_downed_counter; + __be16 port_rcv_errors; + __be16 port_rcv_remphys_errors; + __be16 port_rcv_switch_relay_errors; + __be16 port_xmit_discards; + u8 port_xmit_constraint_errors; + u8 port_rcv_constraint_errors; + u8 reserved2; + u8 link_overrun_errors; /* LocalLink: 7:4, BufferOverrun: 3:0 */ + __be16 reserved3; + __be16 vl15_dropped; + __be64 port_xmit_data; + __be64 port_rcv_data; + __be64 port_xmit_packets; + __be64 port_rcv_packets; + __be64 port_xmit_wait; + __be64 port_adr_events; +} __packed; + +#define IB_PMA_CONG_HW_CONTROL_TIMER 0x00 +#define IB_PMA_CONG_HW_CONTROL_SAMPLE 0x01 + +#define QIB_XMIT_RATE_UNSUPPORTED 0x0 +#define QIB_XMIT_RATE_PICO 0x7 +/* number of 4nsec cycles equaling 2secs */ +#define QIB_CONG_TIMER_PSINTERVAL 0x1DCD64EC + +#define IB_PMA_SEL_CONG_ALL 0x01 +#define IB_PMA_SEL_CONG_PORT_DATA 0x02 +#define IB_PMA_SEL_CONG_XMIT 0x04 +#define IB_PMA_SEL_CONG_ROUTING 0x08 + +/* + * Congestion control class attributes + */ +#define IB_CC_ATTR_CLASSPORTINFO cpu_to_be16(0x0001) +#define IB_CC_ATTR_NOTICE cpu_to_be16(0x0002) +#define IB_CC_ATTR_CONGESTION_INFO cpu_to_be16(0x0011) +#define IB_CC_ATTR_CONGESTION_KEY_INFO cpu_to_be16(0x0012) +#define IB_CC_ATTR_CONGESTION_LOG cpu_to_be16(0x0013) +#define IB_CC_ATTR_SWITCH_CONGESTION_SETTING cpu_to_be16(0x0014) +#define IB_CC_ATTR_SWITCH_PORT_CONGESTION_SETTING cpu_to_be16(0x0015) +#define IB_CC_ATTR_CA_CONGESTION_SETTING cpu_to_be16(0x0016) +#define IB_CC_ATTR_CONGESTION_CONTROL_TABLE cpu_to_be16(0x0017) +#define IB_CC_ATTR_TIME_STAMP cpu_to_be16(0x0018) + +/* generalizations for threshold values */ +#define IB_CC_THRESHOLD_NONE 0x0 +#define IB_CC_THRESHOLD_MIN 0x1 +#define IB_CC_THRESHOLD_MAX 0xf + +/* CCA MAD header constants */ +#define IB_CC_MAD_LOGDATA_LEN 32 +#define IB_CC_MAD_MGMTDATA_LEN 192 + +struct ib_cc_mad { + u8 base_version; + u8 mgmt_class; + u8 class_version; + u8 method; + __be16 status; + __be16 class_specific; + __be64 tid; + __be16 attr_id; + __be16 resv; + __be32 attr_mod; + __be64 cckey; + + /* For CongestionLog attribute only */ + u8 log_data[IB_CC_MAD_LOGDATA_LEN]; + + u8 mgmt_data[IB_CC_MAD_MGMTDATA_LEN]; +} __packed; + +/* + * Congestion Control class portinfo capability mask bits + */ +#define IB_CC_CPI_CM_TRAP_GEN cpu_to_be16(1 << 0) +#define IB_CC_CPI_CM_GET_SET_NOTICE cpu_to_be16(1 << 1) +#define IB_CC_CPI_CM_CAP2 cpu_to_be16(1 << 2) +#define IB_CC_CPI_CM_ENHANCEDPORT0_CC cpu_to_be16(1 << 8) + +struct ib_cc_classportinfo_attr { + u8 base_version; + u8 class_version; + __be16 cap_mask; + u8 reserved[3]; + u8 resp_time_value; /* only lower 5 bits */ + union ib_gid redirect_gid; + __be32 redirect_tc_sl_fl; /* 8, 4, 20 bits respectively */ + __be16 redirect_lid; + __be16 redirect_pkey; + __be32 redirect_qp; /* only lower 24 bits */ + __be32 redirect_qkey; + union ib_gid trap_gid; + __be32 trap_tc_sl_fl; /* 8, 4, 20 bits respectively */ + __be16 trap_lid; + __be16 trap_pkey; + __be32 trap_hl_qp; /* 8, 24 bits respectively */ + __be32 trap_qkey; +} __packed; + +/* 
Congestion control traps */ +#define IB_CC_TRAP_KEY_VIOLATION 0x0000 + +struct ib_cc_trap_key_violation_attr { + __be16 source_lid; + u8 method; + u8 reserved1; + __be16 attrib_id; + __be32 attrib_mod; + __be32 qp; + __be64 cckey; + u8 sgid[16]; + u8 padding[24]; +} __packed; + +/* Congestion info flags */ +#define IB_CC_CI_FLAGS_CREDIT_STARVATION 0x1 +#define IB_CC_TABLE_CAP_DEFAULT 31 + +struct ib_cc_info_attr { + __be16 congestion_info; + u8 control_table_cap; /* Multiple of 64 entry unit CCTs */ +} __packed; + +struct ib_cc_key_info_attr { + __be64 cckey; + u8 protect; + __be16 lease_period; + __be16 violations; +} __packed; + +#define IB_CC_CL_CA_LOGEVENTS_LEN 208 + +struct ib_cc_log_attr { + u8 log_type; + u8 congestion_flags; + __be16 threshold_event_counter; + __be16 threshold_congestion_event_map; + __be16 current_time_stamp; + u8 log_events[IB_CC_CL_CA_LOGEVENTS_LEN]; +} __packed; + +#define IB_CC_CLEC_SERVICETYPE_RC 0x0 +#define IB_CC_CLEC_SERVICETYPE_UC 0x1 +#define IB_CC_CLEC_SERVICETYPE_RD 0x2 +#define IB_CC_CLEC_SERVICETYPE_UD 0x3 + +struct ib_cc_log_event { + u8 local_qp_cn_entry; + u8 remote_qp_number_cn_entry[3]; + u8 sl_cn_entry:4; + u8 service_type_cn_entry:4; + __be32 remote_lid_cn_entry; + __be32 timestamp_cn_entry; +} __packed; + +/* Sixteen congestion entries */ +#define IB_CC_CCS_ENTRIES 16 + +/* Port control flags */ +#define IB_CC_CCS_PC_SL_BASED 0x01 + +struct ib_cc_congestion_entry { + u8 ccti_increase; + __be16 ccti_timer; + u8 trigger_threshold; + u8 ccti_min; /* min CCTI for cc table */ +} __packed; + +struct ib_cc_congestion_entry_shadow { + u8 ccti_increase; + u16 ccti_timer; + u8 trigger_threshold; + u8 ccti_min; /* min CCTI for cc table */ +} __packed; + +struct ib_cc_congestion_setting_attr { + __be16 port_control; + __be16 control_map; + struct ib_cc_congestion_entry entries[IB_CC_CCS_ENTRIES]; +} __packed; + +struct ib_cc_congestion_setting_attr_shadow { + u16 port_control; + u16 control_map; + struct ib_cc_congestion_entry_shadow entries[IB_CC_CCS_ENTRIES]; +} __packed; + +#define IB_CC_TABLE_ENTRY_INCREASE_DEFAULT 1 +#define IB_CC_TABLE_ENTRY_TIMER_DEFAULT 1 + +/* 64 Congestion Control table entries in a single MAD */ +#define IB_CCT_ENTRIES 64 +#define IB_CCT_MIN_ENTRIES (IB_CCT_ENTRIES * 2) + +struct ib_cc_table_entry { + __be16 entry; /* shift:2, multiplier:14 */ +}; + +struct ib_cc_table_entry_shadow { + u16 entry; /* shift:2, multiplier:14 */ +}; + +struct ib_cc_table_attr { + __be16 ccti_limit; /* max CCTI for cc table */ + struct ib_cc_table_entry ccti_entries[IB_CCT_ENTRIES]; +} __packed; + +struct ib_cc_table_attr_shadow { + u16 ccti_limit; /* max CCTI for cc table */ + struct ib_cc_table_entry_shadow ccti_entries[IB_CCT_ENTRIES]; +} __packed; + +#define CC_TABLE_SHADOW_MAX \ + (IB_CC_TABLE_CAP_DEFAULT * IB_CCT_ENTRIES) + +struct cc_table_shadow { + u16 ccti_last_entry; + struct ib_cc_table_entry_shadow entries[CC_TABLE_SHADOW_MAX]; +} __packed; + +/* + * The PortSamplesControl.CounterMasks field is an array of 3 bit fields + * which specify the N'th counter's capabilities. See ch. 16.1.3.2. + * We support 5 counters which only count the mandatory quantities. 
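+ * For example, COUNTER_MASK(1, 0) is 1 << 27, the 3-bit field for counter 0, so COUNTER_MASK0_9 sets the fields for counters 0 through 4 to one.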
+ */ +#define COUNTER_MASK(q, n) (q << ((9 - n) * 3)) +#define COUNTER_MASK0_9 \ + cpu_to_be32(COUNTER_MASK(1, 0) | \ + COUNTER_MASK(1, 1) | \ + COUNTER_MASK(1, 2) | \ + COUNTER_MASK(1, 3) | \ + COUNTER_MASK(1, 4)) + +#endif /* _QIB_MAD_H */ diff --git a/kernel/drivers/infiniband/hw/qib/qib_mmap.c b/kernel/drivers/infiniband/hw/qib/qib_mmap.c new file mode 100644 index 000000000..146cf29a2 --- /dev/null +++ b/kernel/drivers/infiniband/hw/qib/qib_mmap.c @@ -0,0 +1,174 @@ +/* + * Copyright (c) 2006, 2007, 2008, 2009 QLogic Corporation. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include +#include +#include +#include +#include +#include + +#include "qib_verbs.h" + +/** + * qib_release_mmap_info - free mmap info structure + * @ref: a pointer to the kref within struct qib_mmap_info + */ +void qib_release_mmap_info(struct kref *ref) +{ + struct qib_mmap_info *ip = + container_of(ref, struct qib_mmap_info, ref); + struct qib_ibdev *dev = to_idev(ip->context->device); + + spin_lock_irq(&dev->pending_lock); + list_del(&ip->pending_mmaps); + spin_unlock_irq(&dev->pending_lock); + + vfree(ip->obj); + kfree(ip); +} + +/* + * open and close keep track of how many times the CQ is mapped, + * to avoid releasing it. + */ +static void qib_vma_open(struct vm_area_struct *vma) +{ + struct qib_mmap_info *ip = vma->vm_private_data; + + kref_get(&ip->ref); +} + +static void qib_vma_close(struct vm_area_struct *vma) +{ + struct qib_mmap_info *ip = vma->vm_private_data; + + kref_put(&ip->ref, qib_release_mmap_info); +} + +static struct vm_operations_struct qib_vm_ops = { + .open = qib_vma_open, + .close = qib_vma_close, +}; + +/** + * qib_mmap - create a new mmap region + * @context: the IB user context of the process making the mmap() call + * @vma: the VMA to be initialized + * Return zero if the mmap is OK. Otherwise, return an errno. + */ +int qib_mmap(struct ib_ucontext *context, struct vm_area_struct *vma) +{ + struct qib_ibdev *dev = to_idev(context->device); + unsigned long offset = vma->vm_pgoff << PAGE_SHIFT; + unsigned long size = vma->vm_end - vma->vm_start; + struct qib_mmap_info *ip, *pp; + int ret = -EINVAL; + + /* + * Search the device's list of objects waiting for a mmap call. 
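+ * Entries are matched on the (context, offset) pair that was handed back when the object was created.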
+ * Normally, this list is very short since a call to create a + * CQ, QP, or SRQ is soon followed by a call to mmap(). + */ + spin_lock_irq(&dev->pending_lock); + list_for_each_entry_safe(ip, pp, &dev->pending_mmaps, + pending_mmaps) { + /* Only the creator is allowed to mmap the object */ + if (context != ip->context || (__u64) offset != ip->offset) + continue; + /* Don't allow a mmap larger than the object. */ + if (size > ip->size) + break; + + list_del_init(&ip->pending_mmaps); + spin_unlock_irq(&dev->pending_lock); + + ret = remap_vmalloc_range(vma, ip->obj, 0); + if (ret) + goto done; + vma->vm_ops = &qib_vm_ops; + vma->vm_private_data = ip; + qib_vma_open(vma); + goto done; + } + spin_unlock_irq(&dev->pending_lock); +done: + return ret; +} + +/* + * Allocate information for qib_mmap + */ +struct qib_mmap_info *qib_create_mmap_info(struct qib_ibdev *dev, + u32 size, + struct ib_ucontext *context, + void *obj) { + struct qib_mmap_info *ip; + + ip = kmalloc(sizeof(*ip), GFP_KERNEL); + if (!ip) + goto bail; + + size = PAGE_ALIGN(size); + + spin_lock_irq(&dev->mmap_offset_lock); + if (dev->mmap_offset == 0) + dev->mmap_offset = PAGE_SIZE; + ip->offset = dev->mmap_offset; + dev->mmap_offset += size; + spin_unlock_irq(&dev->mmap_offset_lock); + + INIT_LIST_HEAD(&ip->pending_mmaps); + ip->size = size; + ip->context = context; + ip->obj = obj; + kref_init(&ip->ref); + +bail: + return ip; +} + +void qib_update_mmap_info(struct qib_ibdev *dev, struct qib_mmap_info *ip, + u32 size, void *obj) +{ + size = PAGE_ALIGN(size); + + spin_lock_irq(&dev->mmap_offset_lock); + if (dev->mmap_offset == 0) + dev->mmap_offset = PAGE_SIZE; + ip->offset = dev->mmap_offset; + dev->mmap_offset += size; + spin_unlock_irq(&dev->mmap_offset_lock); + + ip->size = size; + ip->obj = obj; +} diff --git a/kernel/drivers/infiniband/hw/qib/qib_mr.c b/kernel/drivers/infiniband/hw/qib/qib_mr.c new file mode 100644 index 000000000..c4473db46 --- /dev/null +++ b/kernel/drivers/infiniband/hw/qib/qib_mr.c @@ -0,0 +1,532 @@ +/* + * Copyright (c) 2006, 2007, 2008, 2009 QLogic Corporation. All rights reserved. + * Copyright (c) 2005, 2006 PathScale, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include +#include + +#include "qib.h" + +/* Fast memory region */ +struct qib_fmr { + struct ib_fmr ibfmr; + struct qib_mregion mr; /* must be last */ +}; + +static inline struct qib_fmr *to_ifmr(struct ib_fmr *ibfmr) +{ + return container_of(ibfmr, struct qib_fmr, ibfmr); +} + +static int init_qib_mregion(struct qib_mregion *mr, struct ib_pd *pd, + int count) +{ + int m, i = 0; + int rval = 0; + + m = (count + QIB_SEGSZ - 1) / QIB_SEGSZ; + for (; i < m; i++) { + mr->map[i] = kzalloc(sizeof(*mr->map[0]), GFP_KERNEL); + if (!mr->map[i]) + goto bail; + } + mr->mapsz = m; + init_completion(&mr->comp); + /* count returning the ptr to user */ + atomic_set(&mr->refcount, 1); + mr->pd = pd; + mr->max_segs = count; +out: + return rval; +bail: + while (i) + kfree(mr->map[--i]); + rval = -ENOMEM; + goto out; +} + +static void deinit_qib_mregion(struct qib_mregion *mr) +{ + int i = mr->mapsz; + + mr->mapsz = 0; + while (i) + kfree(mr->map[--i]); +} + + +/** + * qib_get_dma_mr - get a DMA memory region + * @pd: protection domain for this memory region + * @acc: access flags + * + * Returns the memory region on success, otherwise returns an errno. + * Note that all DMA addresses should be created via the + * struct ib_dma_mapping_ops functions (see qib_dma.c). + */ +struct ib_mr *qib_get_dma_mr(struct ib_pd *pd, int acc) +{ + struct qib_mr *mr = NULL; + struct ib_mr *ret; + int rval; + + if (to_ipd(pd)->user) { + ret = ERR_PTR(-EPERM); + goto bail; + } + + mr = kzalloc(sizeof(*mr), GFP_KERNEL); + if (!mr) { + ret = ERR_PTR(-ENOMEM); + goto bail; + } + + rval = init_qib_mregion(&mr->mr, pd, 0); + if (rval) { + ret = ERR_PTR(rval); + goto bail; + } + + + rval = qib_alloc_lkey(&mr->mr, 1); + if (rval) { + ret = ERR_PTR(rval); + goto bail_mregion; + } + + mr->mr.access_flags = acc; + ret = &mr->ibmr; +done: + return ret; + +bail_mregion: + deinit_qib_mregion(&mr->mr); +bail: + kfree(mr); + goto done; +} + +static struct qib_mr *alloc_mr(int count, struct ib_pd *pd) +{ + struct qib_mr *mr; + int rval = -ENOMEM; + int m; + + /* Allocate struct plus pointers to first level page tables. */ + m = (count + QIB_SEGSZ - 1) / QIB_SEGSZ; + mr = kzalloc(sizeof(*mr) + m * sizeof(mr->mr.map[0]), GFP_KERNEL); + if (!mr) + goto bail; + + rval = init_qib_mregion(&mr->mr, pd, count); + if (rval) + goto bail; + /* + * ib_reg_phys_mr() will initialize mr->ibmr except for + * lkey and rkey. + */ + rval = qib_alloc_lkey(&mr->mr, 0); + if (rval) + goto bail_mregion; + mr->ibmr.lkey = mr->mr.lkey; + mr->ibmr.rkey = mr->mr.lkey; +done: + return mr; + +bail_mregion: + deinit_qib_mregion(&mr->mr); +bail: + kfree(mr); + mr = ERR_PTR(rval); + goto done; +} + +/** + * qib_reg_phys_mr - register a physical memory region + * @pd: protection domain for this memory region + * @buffer_list: pointer to the list of physical buffers to register + * @num_phys_buf: the number of physical buffers to register + * @iova_start: the starting address passed over IB which maps to this MR + * + * Returns the memory region on success, otherwise returns an errno. 
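+ *
+ * The physical buffers are packed into the region's map[] segment
+ * lists, QIB_SEGSZ entries per map page, matching the sizing done in
+ * alloc_mr(); @acc is stored as mr->mr.access_flags.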
+ */ +struct ib_mr *qib_reg_phys_mr(struct ib_pd *pd, + struct ib_phys_buf *buffer_list, + int num_phys_buf, int acc, u64 *iova_start) +{ + struct qib_mr *mr; + int n, m, i; + struct ib_mr *ret; + + mr = alloc_mr(num_phys_buf, pd); + if (IS_ERR(mr)) { + ret = (struct ib_mr *)mr; + goto bail; + } + + mr->mr.user_base = *iova_start; + mr->mr.iova = *iova_start; + mr->mr.access_flags = acc; + + m = 0; + n = 0; + for (i = 0; i < num_phys_buf; i++) { + mr->mr.map[m]->segs[n].vaddr = (void *) buffer_list[i].addr; + mr->mr.map[m]->segs[n].length = buffer_list[i].size; + mr->mr.length += buffer_list[i].size; + n++; + if (n == QIB_SEGSZ) { + m++; + n = 0; + } + } + + ret = &mr->ibmr; + +bail: + return ret; +} + +/** + * qib_reg_user_mr - register a userspace memory region + * @pd: protection domain for this memory region + * @start: starting userspace address + * @length: length of region to register + * @mr_access_flags: access flags for this memory region + * @udata: unused by the QLogic_IB driver + * + * Returns the memory region on success, otherwise returns an errno. + */ +struct ib_mr *qib_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, + u64 virt_addr, int mr_access_flags, + struct ib_udata *udata) +{ + struct qib_mr *mr; + struct ib_umem *umem; + struct scatterlist *sg; + int n, m, entry; + struct ib_mr *ret; + + if (length == 0) { + ret = ERR_PTR(-EINVAL); + goto bail; + } + + umem = ib_umem_get(pd->uobject->context, start, length, + mr_access_flags, 0); + if (IS_ERR(umem)) + return (void *) umem; + + n = umem->nmap; + + mr = alloc_mr(n, pd); + if (IS_ERR(mr)) { + ret = (struct ib_mr *)mr; + ib_umem_release(umem); + goto bail; + } + + mr->mr.user_base = start; + mr->mr.iova = virt_addr; + mr->mr.length = length; + mr->mr.offset = ib_umem_offset(umem); + mr->mr.access_flags = mr_access_flags; + mr->umem = umem; + + if (is_power_of_2(umem->page_size)) + mr->mr.page_shift = ilog2(umem->page_size); + m = 0; + n = 0; + for_each_sg(umem->sg_head.sgl, sg, umem->nmap, entry) { + void *vaddr; + + vaddr = page_address(sg_page(sg)); + if (!vaddr) { + ret = ERR_PTR(-EINVAL); + goto bail; + } + mr->mr.map[m]->segs[n].vaddr = vaddr; + mr->mr.map[m]->segs[n].length = umem->page_size; + n++; + if (n == QIB_SEGSZ) { + m++; + n = 0; + } + } + ret = &mr->ibmr; + +bail: + return ret; +} + +/** + * qib_dereg_mr - unregister and free a memory region + * @ibmr: the memory region to free + * + * Returns 0 on success. + * + * Note that this is called to free MRs created by qib_get_dma_mr() + * or qib_reg_user_mr(). + */ +int qib_dereg_mr(struct ib_mr *ibmr) +{ + struct qib_mr *mr = to_imr(ibmr); + int ret = 0; + unsigned long timeout; + + qib_free_lkey(&mr->mr); + + qib_put_mr(&mr->mr); /* will set completion if last */ + timeout = wait_for_completion_timeout(&mr->mr.comp, + 5 * HZ); + if (!timeout) { + qib_get_mr(&mr->mr); + ret = -EBUSY; + goto out; + } + deinit_qib_mregion(&mr->mr); + if (mr->umem) + ib_umem_release(mr->umem); + kfree(mr); +out: + return ret; +} + +/* + * Allocate a memory region usable with the + * IB_WR_FAST_REG_MR send work request. + * + * Return the memory region on success, otherwise return an errno. 
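+ *
+ * Such an MR is paired with a page list from
+ * qib_alloc_fast_reg_page_list() below.  Roughly, a consumer would do
+ * (sketch only; the ib_* core verbs of this kernel generation are
+ * assumed):
+ *
+ *   mr = ib_alloc_fast_reg_mr(pd, npages);
+ *   pl = ib_alloc_fast_reg_page_list(pd->device, npages);
+ *   ... fill pl->page_list[], then post an IB_WR_FAST_REG_MR
+ *   work request that references mr and pl ...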
+ */ +struct ib_mr *qib_alloc_fast_reg_mr(struct ib_pd *pd, int max_page_list_len) +{ + struct qib_mr *mr; + + mr = alloc_mr(max_page_list_len, pd); + if (IS_ERR(mr)) + return (struct ib_mr *)mr; + + return &mr->ibmr; +} + +struct ib_fast_reg_page_list * +qib_alloc_fast_reg_page_list(struct ib_device *ibdev, int page_list_len) +{ + unsigned size = page_list_len * sizeof(u64); + struct ib_fast_reg_page_list *pl; + + if (size > PAGE_SIZE) + return ERR_PTR(-EINVAL); + + pl = kzalloc(sizeof(*pl), GFP_KERNEL); + if (!pl) + return ERR_PTR(-ENOMEM); + + pl->page_list = kzalloc(size, GFP_KERNEL); + if (!pl->page_list) + goto err_free; + + return pl; + +err_free: + kfree(pl); + return ERR_PTR(-ENOMEM); +} + +void qib_free_fast_reg_page_list(struct ib_fast_reg_page_list *pl) +{ + kfree(pl->page_list); + kfree(pl); +} + +/** + * qib_alloc_fmr - allocate a fast memory region + * @pd: the protection domain for this memory region + * @mr_access_flags: access flags for this memory region + * @fmr_attr: fast memory region attributes + * + * Returns the memory region on success, otherwise returns an errno. + */ +struct ib_fmr *qib_alloc_fmr(struct ib_pd *pd, int mr_access_flags, + struct ib_fmr_attr *fmr_attr) +{ + struct qib_fmr *fmr; + int m; + struct ib_fmr *ret; + int rval = -ENOMEM; + + /* Allocate struct plus pointers to first level page tables. */ + m = (fmr_attr->max_pages + QIB_SEGSZ - 1) / QIB_SEGSZ; + fmr = kzalloc(sizeof(*fmr) + m * sizeof(fmr->mr.map[0]), GFP_KERNEL); + if (!fmr) + goto bail; + + rval = init_qib_mregion(&fmr->mr, pd, fmr_attr->max_pages); + if (rval) + goto bail; + + /* + * ib_alloc_fmr() will initialize fmr->ibfmr except for lkey & + * rkey. + */ + rval = qib_alloc_lkey(&fmr->mr, 0); + if (rval) + goto bail_mregion; + fmr->ibfmr.rkey = fmr->mr.lkey; + fmr->ibfmr.lkey = fmr->mr.lkey; + /* + * Resources are allocated but no valid mapping (RKEY can't be + * used). + */ + fmr->mr.access_flags = mr_access_flags; + fmr->mr.max_segs = fmr_attr->max_pages; + fmr->mr.page_shift = fmr_attr->page_shift; + + ret = &fmr->ibfmr; +done: + return ret; + +bail_mregion: + deinit_qib_mregion(&fmr->mr); +bail: + kfree(fmr); + ret = ERR_PTR(rval); + goto done; +} + +/** + * qib_map_phys_fmr - set up a fast memory region + * @ibmfr: the fast memory region to set up + * @page_list: the list of pages to associate with the fast memory region + * @list_len: the number of pages to associate with the fast memory region + * @iova: the virtual address of the start of the fast memory region + * + * This may be called from interrupt context. 
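+ *
+ * The mapping is refused with -EBUSY while more than two references
+ * to the region are outstanding, i.e. while it may still be in use.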
+ */ + +int qib_map_phys_fmr(struct ib_fmr *ibfmr, u64 *page_list, + int list_len, u64 iova) +{ + struct qib_fmr *fmr = to_ifmr(ibfmr); + struct qib_lkey_table *rkt; + unsigned long flags; + int m, n, i; + u32 ps; + int ret; + + i = atomic_read(&fmr->mr.refcount); + if (i > 2) + return -EBUSY; + + if (list_len > fmr->mr.max_segs) { + ret = -EINVAL; + goto bail; + } + rkt = &to_idev(ibfmr->device)->lk_table; + spin_lock_irqsave(&rkt->lock, flags); + fmr->mr.user_base = iova; + fmr->mr.iova = iova; + ps = 1 << fmr->mr.page_shift; + fmr->mr.length = list_len * ps; + m = 0; + n = 0; + for (i = 0; i < list_len; i++) { + fmr->mr.map[m]->segs[n].vaddr = (void *) page_list[i]; + fmr->mr.map[m]->segs[n].length = ps; + if (++n == QIB_SEGSZ) { + m++; + n = 0; + } + } + spin_unlock_irqrestore(&rkt->lock, flags); + ret = 0; + +bail: + return ret; +} + +/** + * qib_unmap_fmr - unmap fast memory regions + * @fmr_list: the list of fast memory regions to unmap + * + * Returns 0 on success. + */ +int qib_unmap_fmr(struct list_head *fmr_list) +{ + struct qib_fmr *fmr; + struct qib_lkey_table *rkt; + unsigned long flags; + + list_for_each_entry(fmr, fmr_list, ibfmr.list) { + rkt = &to_idev(fmr->ibfmr.device)->lk_table; + spin_lock_irqsave(&rkt->lock, flags); + fmr->mr.user_base = 0; + fmr->mr.iova = 0; + fmr->mr.length = 0; + spin_unlock_irqrestore(&rkt->lock, flags); + } + return 0; +} + +/** + * qib_dealloc_fmr - deallocate a fast memory region + * @ibfmr: the fast memory region to deallocate + * + * Returns 0 on success. + */ +int qib_dealloc_fmr(struct ib_fmr *ibfmr) +{ + struct qib_fmr *fmr = to_ifmr(ibfmr); + int ret = 0; + unsigned long timeout; + + qib_free_lkey(&fmr->mr); + qib_put_mr(&fmr->mr); /* will set completion if last */ + timeout = wait_for_completion_timeout(&fmr->mr.comp, + 5 * HZ); + if (!timeout) { + qib_get_mr(&fmr->mr); + ret = -EBUSY; + goto out; + } + deinit_qib_mregion(&fmr->mr); + kfree(fmr); +out: + return ret; +} + +void mr_rcu_callback(struct rcu_head *list) +{ + struct qib_mregion *mr = container_of(list, struct qib_mregion, list); + + complete(&mr->comp); +} diff --git a/kernel/drivers/infiniband/hw/qib/qib_pcie.c b/kernel/drivers/infiniband/hw/qib/qib_pcie.c new file mode 100644 index 000000000..4758a3801 --- /dev/null +++ b/kernel/drivers/infiniband/hw/qib/qib_pcie.c @@ -0,0 +1,719 @@ +/* + * Copyright (c) 2008, 2009 QLogic Corporation. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include +#include +#include +#include +#include +#include + +#include "qib.h" + +/* + * This file contains PCIe utility routines that are common to the + * various QLogic InfiniPath adapters + */ + +/* + * Code to adjust PCIe capabilities. + * To minimize the change footprint, we call it + * from qib_pcie_params, which every chip-specific + * file calls, even though this violates some + * expectations of harmlessness. + */ +static void qib_tune_pcie_caps(struct qib_devdata *); +static void qib_tune_pcie_coalesce(struct qib_devdata *); + +/* + * Do all the common PCIe setup and initialization. + * devdata is not yet allocated, and is not allocated until after this + * routine returns success. Therefore qib_dev_err() can't be used for error + * printing. + */ +int qib_pcie_init(struct pci_dev *pdev, const struct pci_device_id *ent) +{ + int ret; + + ret = pci_enable_device(pdev); + if (ret) { + /* + * This can happen (in theory) iff: + * We did a chip reset, and then failed to reprogram the + * BAR, or the chip reset due to an internal error. We then + * unloaded the driver and reloaded it. + * + * Both reset cases set the BAR back to initial state. For + * the latter case, the AER sticky error bit at offset 0x718 + * should be set, but the Linux kernel doesn't yet know + * about that, it appears. If the original BAR was retained + * in the kernel data structures, this may be OK. + */ + qib_early_err(&pdev->dev, "pci enable failed: error %d\n", + -ret); + goto done; + } + + ret = pci_request_regions(pdev, QIB_DRV_NAME); + if (ret) { + qib_devinfo(pdev, "pci_request_regions fails: err %d\n", -ret); + goto bail; + } + + ret = pci_set_dma_mask(pdev, DMA_BIT_MASK(64)); + if (ret) { + /* + * If the 64 bit setup fails, try 32 bit. Some systems + * do not setup 64 bit maps on systems with 2GB or less + * memory installed. 
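+ * The fallback order is therefore: 64-bit streaming DMA mask, then
+ * 32-bit streaming mask, then the consistent (coherent) mask of
+ * whichever width succeeded.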
+ */ + ret = pci_set_dma_mask(pdev, DMA_BIT_MASK(32)); + if (ret) { + qib_devinfo(pdev, "Unable to set DMA mask: %d\n", ret); + goto bail; + } + ret = pci_set_consistent_dma_mask(pdev, DMA_BIT_MASK(32)); + } else + ret = pci_set_consistent_dma_mask(pdev, DMA_BIT_MASK(64)); + if (ret) { + qib_early_err(&pdev->dev, + "Unable to set DMA consistent mask: %d\n", ret); + goto bail; + } + + pci_set_master(pdev); + ret = pci_enable_pcie_error_reporting(pdev); + if (ret) { + qib_early_err(&pdev->dev, + "Unable to enable pcie error reporting: %d\n", + ret); + ret = 0; + } + goto done; + +bail: + pci_disable_device(pdev); + pci_release_regions(pdev); +done: + return ret; +} + +/* + * Do remaining PCIe setup, once dd is allocated, and save away + * fields required to re-initialize after a chip reset, or for + * various other purposes + */ +int qib_pcie_ddinit(struct qib_devdata *dd, struct pci_dev *pdev, + const struct pci_device_id *ent) +{ + unsigned long len; + resource_size_t addr; + + dd->pcidev = pdev; + pci_set_drvdata(pdev, dd); + + addr = pci_resource_start(pdev, 0); + len = pci_resource_len(pdev, 0); + +#if defined(__powerpc__) + /* There isn't a generic way to specify writethrough mappings */ + dd->kregbase = __ioremap(addr, len, _PAGE_NO_CACHE | _PAGE_WRITETHRU); +#else + dd->kregbase = ioremap_nocache(addr, len); +#endif + + if (!dd->kregbase) + return -ENOMEM; + + dd->kregend = (u64 __iomem *)((void __iomem *) dd->kregbase + len); + dd->physaddr = addr; /* used for io_remap, etc. */ + + /* + * Save BARs to rewrite after device reset. Save all 64 bits of + * BAR, just in case. + */ + dd->pcibar0 = addr; + dd->pcibar1 = addr >> 32; + dd->deviceid = ent->device; /* save for later use */ + dd->vendorid = ent->vendor; + + return 0; +} + +/* + * Do PCIe cleanup, after chip-specific cleanup, etc. Just prior + * to releasing the dd memory. + * void because none of the core pcie cleanup returns are void + */ +void qib_pcie_ddcleanup(struct qib_devdata *dd) +{ + u64 __iomem *base = (void __iomem *) dd->kregbase; + + dd->kregbase = NULL; + iounmap(base); + if (dd->piobase) + iounmap(dd->piobase); + if (dd->userbase) + iounmap(dd->userbase); + if (dd->piovl15base) + iounmap(dd->piovl15base); + + pci_disable_device(dd->pcidev); + pci_release_regions(dd->pcidev); + + pci_set_drvdata(dd->pcidev, NULL); +} + +static void qib_msix_setup(struct qib_devdata *dd, int pos, u32 *msixcnt, + struct qib_msix_entry *qib_msix_entry) +{ + int ret; + int nvec = *msixcnt; + struct msix_entry *msix_entry; + int i; + + ret = pci_msix_vec_count(dd->pcidev); + if (ret < 0) + goto do_intx; + + nvec = min(nvec, ret); + + /* We can't pass qib_msix_entry array to qib_msix_setup + * so use a dummy msix_entry array and copy the allocated + * irq back to the qib_msix_entry array. 
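+ * If pci_enable_msix_range() cannot provide even a single vector we
+ * fall all the way back to INTx via qib_enable_intx() below.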
*/ + msix_entry = kcalloc(nvec, sizeof(*msix_entry), GFP_KERNEL); + if (!msix_entry) + goto do_intx; + + for (i = 0; i < nvec; i++) + msix_entry[i] = qib_msix_entry[i].msix; + + ret = pci_enable_msix_range(dd->pcidev, msix_entry, 1, nvec); + if (ret < 0) + goto free_msix_entry; + else + nvec = ret; + + for (i = 0; i < nvec; i++) + qib_msix_entry[i].msix = msix_entry[i]; + + kfree(msix_entry); + *msixcnt = nvec; + return; + +free_msix_entry: + kfree(msix_entry); + +do_intx: + qib_dev_err( + dd, + "pci_enable_msix_range %d vectors failed: %d, falling back to INTx\n", + nvec, ret); + *msixcnt = 0; + qib_enable_intx(dd->pcidev); +} + +/** + * We save the msi lo and hi values, so we can restore them after + * chip reset (the kernel PCI infrastructure doesn't yet handle that + * correctly. + */ +static int qib_msi_setup(struct qib_devdata *dd, int pos) +{ + struct pci_dev *pdev = dd->pcidev; + u16 control; + int ret; + + ret = pci_enable_msi(pdev); + if (ret) + qib_dev_err(dd, + "pci_enable_msi failed: %d, interrupts may not work\n", + ret); + /* continue even if it fails, we may still be OK... */ + + pci_read_config_dword(pdev, pos + PCI_MSI_ADDRESS_LO, + &dd->msi_lo); + pci_read_config_dword(pdev, pos + PCI_MSI_ADDRESS_HI, + &dd->msi_hi); + pci_read_config_word(pdev, pos + PCI_MSI_FLAGS, &control); + /* now save the data (vector) info */ + pci_read_config_word(pdev, pos + ((control & PCI_MSI_FLAGS_64BIT) + ? 12 : 8), + &dd->msi_data); + return ret; +} + +int qib_pcie_params(struct qib_devdata *dd, u32 minw, u32 *nent, + struct qib_msix_entry *entry) +{ + u16 linkstat, speed; + int pos = 0, ret = 1; + + if (!pci_is_pcie(dd->pcidev)) { + qib_dev_err(dd, "Can't find PCI Express capability!\n"); + /* set up something... */ + dd->lbus_width = 1; + dd->lbus_speed = 2500; /* Gen1, 2.5GHz */ + goto bail; + } + + pos = dd->pcidev->msix_cap; + if (nent && *nent && pos) { + qib_msix_setup(dd, pos, nent, entry); + ret = 0; /* did it, either MSIx or INTx */ + } else { + pos = dd->pcidev->msi_cap; + if (pos) + ret = qib_msi_setup(dd, pos); + else + qib_dev_err(dd, "No PCI MSI or MSIx capability!\n"); + } + if (!pos) + qib_enable_intx(dd->pcidev); + + pcie_capability_read_word(dd->pcidev, PCI_EXP_LNKSTA, &linkstat); + /* + * speed is bits 0-3, linkwidth is bits 4-8 + * no defines for them in headers + */ + speed = linkstat & 0xf; + linkstat >>= 4; + linkstat &= 0x1f; + dd->lbus_width = linkstat; + + switch (speed) { + case 1: + dd->lbus_speed = 2500; /* Gen1, 2.5GHz */ + break; + case 2: + dd->lbus_speed = 5000; /* Gen1, 5GHz */ + break; + default: /* not defined, assume gen1 */ + dd->lbus_speed = 2500; + break; + } + + /* + * Check against expected pcie width and complain if "wrong" + * on first initialization, not afterwards (i.e., reset). + */ + if (minw && linkstat < minw) + qib_dev_err(dd, + "PCIe width %u (x%u HCA), performance reduced\n", + linkstat, minw); + + qib_tune_pcie_caps(dd); + + qib_tune_pcie_coalesce(dd); + +bail: + /* fill in string, even on errors */ + snprintf(dd->lbus_info, sizeof(dd->lbus_info), + "PCIe,%uMHz,x%u\n", dd->lbus_speed, dd->lbus_width); + return ret; +} + +/* + * Setup pcie interrupt stuff again after a reset. I'd like to just call + * pci_enable_msi() again for msi, but when I do that, + * the MSI enable bit doesn't get set in the command word, and + * we switch to to a different interrupt vector, which is confusing, + * so I instead just do it all inline. 
Perhaps somehow can tie this + * into the PCIe hotplug support at some point + */ +int qib_reinit_intr(struct qib_devdata *dd) +{ + int pos; + u16 control; + int ret = 0; + + /* If we aren't using MSI, don't restore it */ + if (!dd->msi_lo) + goto bail; + + pos = dd->pcidev->msi_cap; + if (!pos) { + qib_dev_err(dd, + "Can't find MSI capability, can't restore MSI settings\n"); + ret = 0; + /* nothing special for MSIx, just MSI */ + goto bail; + } + pci_write_config_dword(dd->pcidev, pos + PCI_MSI_ADDRESS_LO, + dd->msi_lo); + pci_write_config_dword(dd->pcidev, pos + PCI_MSI_ADDRESS_HI, + dd->msi_hi); + pci_read_config_word(dd->pcidev, pos + PCI_MSI_FLAGS, &control); + if (!(control & PCI_MSI_FLAGS_ENABLE)) { + control |= PCI_MSI_FLAGS_ENABLE; + pci_write_config_word(dd->pcidev, pos + PCI_MSI_FLAGS, + control); + } + /* now rewrite the data (vector) info */ + pci_write_config_word(dd->pcidev, pos + + ((control & PCI_MSI_FLAGS_64BIT) ? 12 : 8), + dd->msi_data); + ret = 1; +bail: + if (!ret && (dd->flags & QIB_HAS_INTX)) { + qib_enable_intx(dd->pcidev); + ret = 1; + } + + /* and now set the pci master bit again */ + pci_set_master(dd->pcidev); + + return ret; +} + +/* + * Disable msi interrupt if enabled, and clear msi_lo. + * This is used primarily for the fallback to INTx, but + * is also used in reinit after reset, and during cleanup. + */ +void qib_nomsi(struct qib_devdata *dd) +{ + dd->msi_lo = 0; + pci_disable_msi(dd->pcidev); +} + +/* + * Same as qib_nosmi, but for MSIx. + */ +void qib_nomsix(struct qib_devdata *dd) +{ + pci_disable_msix(dd->pcidev); +} + +/* + * Similar to pci_intx(pdev, 1), except that we make sure + * msi(x) is off. + */ +void qib_enable_intx(struct pci_dev *pdev) +{ + u16 cw, new; + int pos; + + /* first, turn on INTx */ + pci_read_config_word(pdev, PCI_COMMAND, &cw); + new = cw & ~PCI_COMMAND_INTX_DISABLE; + if (new != cw) + pci_write_config_word(pdev, PCI_COMMAND, new); + + pos = pdev->msi_cap; + if (pos) { + /* then turn off MSI */ + pci_read_config_word(pdev, pos + PCI_MSI_FLAGS, &cw); + new = cw & ~PCI_MSI_FLAGS_ENABLE; + if (new != cw) + pci_write_config_word(pdev, pos + PCI_MSI_FLAGS, new); + } + pos = pdev->msix_cap; + if (pos) { + /* then turn off MSIx */ + pci_read_config_word(pdev, pos + PCI_MSIX_FLAGS, &cw); + new = cw & ~PCI_MSIX_FLAGS_ENABLE; + if (new != cw) + pci_write_config_word(pdev, pos + PCI_MSIX_FLAGS, new); + } +} + +/* + * These two routines are helper routines for the device reset code + * to move all the pcie code out of the chip-specific driver code. 
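+ * qib_pcie_getcmd() snapshots PCI_COMMAND, the interrupt line and the
+ * cache line size before the chip reset; qib_pcie_reenable() rewrites
+ * the saved BARs plus those values afterwards and re-enables the
+ * device.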
+ */ +void qib_pcie_getcmd(struct qib_devdata *dd, u16 *cmd, u8 *iline, u8 *cline) +{ + pci_read_config_word(dd->pcidev, PCI_COMMAND, cmd); + pci_read_config_byte(dd->pcidev, PCI_INTERRUPT_LINE, iline); + pci_read_config_byte(dd->pcidev, PCI_CACHE_LINE_SIZE, cline); +} + +void qib_pcie_reenable(struct qib_devdata *dd, u16 cmd, u8 iline, u8 cline) +{ + int r; + + r = pci_write_config_dword(dd->pcidev, PCI_BASE_ADDRESS_0, + dd->pcibar0); + if (r) + qib_dev_err(dd, "rewrite of BAR0 failed: %d\n", r); + r = pci_write_config_dword(dd->pcidev, PCI_BASE_ADDRESS_1, + dd->pcibar1); + if (r) + qib_dev_err(dd, "rewrite of BAR1 failed: %d\n", r); + /* now re-enable memory access, and restore cosmetic settings */ + pci_write_config_word(dd->pcidev, PCI_COMMAND, cmd); + pci_write_config_byte(dd->pcidev, PCI_INTERRUPT_LINE, iline); + pci_write_config_byte(dd->pcidev, PCI_CACHE_LINE_SIZE, cline); + r = pci_enable_device(dd->pcidev); + if (r) + qib_dev_err(dd, + "pci_enable_device failed after reset: %d\n", r); +} + + +static int qib_pcie_coalesce; +module_param_named(pcie_coalesce, qib_pcie_coalesce, int, S_IRUGO); +MODULE_PARM_DESC(pcie_coalesce, "tune PCIe colescing on some Intel chipsets"); + +/* + * Enable PCIe completion and data coalescing, on Intel 5x00 and 7300 + * chipsets. This is known to be unsafe for some revisions of some + * of these chipsets, with some BIOS settings, and enabling it on those + * systems may result in the system crashing, and/or data corruption. + */ +static void qib_tune_pcie_coalesce(struct qib_devdata *dd) +{ + int r; + struct pci_dev *parent; + u16 devid; + u32 mask, bits, val; + + if (!qib_pcie_coalesce) + return; + + /* Find out supported and configured values for parent (root) */ + parent = dd->pcidev->bus->self; + if (parent->bus->parent) { + qib_devinfo(dd->pcidev, "Parent not root\n"); + return; + } + if (!pci_is_pcie(parent)) + return; + if (parent->vendor != 0x8086) + return; + + /* + * - bit 12: Max_rdcmp_Imt_EN: need to set to 1 + * - bit 11: COALESCE_FORCE: need to set to 0 + * - bit 10: COALESCE_EN: need to set to 1 + * (but limitations on some on some chipsets) + * + * On the Intel 5000, 5100, and 7300 chipsets, there is + * also: - bit 25:24: COALESCE_MODE, need to set to 0 + */ + devid = parent->device; + if (devid >= 0x25e2 && devid <= 0x25fa) { + /* 5000 P/V/X/Z */ + if (parent->revision <= 0xb2) + bits = 1U << 10; + else + bits = 7U << 10; + mask = (3U << 24) | (7U << 10); + } else if (devid >= 0x65e2 && devid <= 0x65fa) { + /* 5100 */ + bits = 1U << 10; + mask = (3U << 24) | (7U << 10); + } else if (devid >= 0x4021 && devid <= 0x402e) { + /* 5400 */ + bits = 7U << 10; + mask = 7U << 10; + } else if (devid >= 0x3604 && devid <= 0x360a) { + /* 7300 */ + bits = 7U << 10; + mask = (3U << 24) | (7U << 10); + } else { + /* not one of the chipsets that we know about */ + return; + } + pci_read_config_dword(parent, 0x48, &val); + val &= ~mask; + val |= bits; + r = pci_write_config_dword(parent, 0x48, val); +} + +/* + * BIOS may not set PCIe bus-utilization parameters for best performance. + * Check and optionally adjust them to maximize our throughput. 
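+ *
+ * The qib_pcie_caps module parameter below packs two limits: the low
+ * three bits cap MaxPayload and bits 4..6 cap MaxReadReq, each meaning
+ * 128 << code bytes.  For example (hypothetical value), pcie_caps=0x51
+ * caps the payload at 256 bytes (code 1) and read requests at 4096
+ * bytes (code 5).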
+ */ +static int qib_pcie_caps; +module_param_named(pcie_caps, qib_pcie_caps, int, S_IRUGO); +MODULE_PARM_DESC(pcie_caps, "Max PCIe tuning: Payload (0..3), ReadReq (4..7)"); + +static void qib_tune_pcie_caps(struct qib_devdata *dd) +{ + struct pci_dev *parent; + u16 rc_mpss, rc_mps, ep_mpss, ep_mps; + u16 rc_mrrs, ep_mrrs, max_mrrs; + + /* Find out supported and configured values for parent (root) */ + parent = dd->pcidev->bus->self; + if (!pci_is_root_bus(parent->bus)) { + qib_devinfo(dd->pcidev, "Parent not root\n"); + return; + } + + if (!pci_is_pcie(parent) || !pci_is_pcie(dd->pcidev)) + return; + + rc_mpss = parent->pcie_mpss; + rc_mps = ffs(pcie_get_mps(parent)) - 8; + /* Find out supported and configured values for endpoint (us) */ + ep_mpss = dd->pcidev->pcie_mpss; + ep_mps = ffs(pcie_get_mps(dd->pcidev)) - 8; + + /* Find max payload supported by root, endpoint */ + if (rc_mpss > ep_mpss) + rc_mpss = ep_mpss; + + /* If Supported greater than limit in module param, limit it */ + if (rc_mpss > (qib_pcie_caps & 7)) + rc_mpss = qib_pcie_caps & 7; + /* If less than (allowed, supported), bump root payload */ + if (rc_mpss > rc_mps) { + rc_mps = rc_mpss; + pcie_set_mps(parent, 128 << rc_mps); + } + /* If less than (allowed, supported), bump endpoint payload */ + if (rc_mpss > ep_mps) { + ep_mps = rc_mpss; + pcie_set_mps(dd->pcidev, 128 << ep_mps); + } + + /* + * Now the Read Request size. + * No field for max supported, but PCIe spec limits it to 4096, + * which is code '5' (log2(4096) - 7) + */ + max_mrrs = 5; + if (max_mrrs > ((qib_pcie_caps >> 4) & 7)) + max_mrrs = (qib_pcie_caps >> 4) & 7; + + max_mrrs = 128 << max_mrrs; + rc_mrrs = pcie_get_readrq(parent); + ep_mrrs = pcie_get_readrq(dd->pcidev); + + if (max_mrrs > rc_mrrs) { + rc_mrrs = max_mrrs; + pcie_set_readrq(parent, rc_mrrs); + } + if (max_mrrs > ep_mrrs) { + ep_mrrs = max_mrrs; + pcie_set_readrq(dd->pcidev, ep_mrrs); + } +} +/* End of PCIe capability tuning */ + +/* + * From here through qib_pci_err_handler definition is invoked via + * PCI error infrastructure, registered via pci + */ +static pci_ers_result_t +qib_pci_error_detected(struct pci_dev *pdev, pci_channel_state_t state) +{ + struct qib_devdata *dd = pci_get_drvdata(pdev); + pci_ers_result_t ret = PCI_ERS_RESULT_RECOVERED; + + switch (state) { + case pci_channel_io_normal: + qib_devinfo(pdev, "State Normal, ignoring\n"); + break; + + case pci_channel_io_frozen: + qib_devinfo(pdev, "State Frozen, requesting reset\n"); + pci_disable_device(pdev); + ret = PCI_ERS_RESULT_NEED_RESET; + break; + + case pci_channel_io_perm_failure: + qib_devinfo(pdev, "State Permanent Failure, disabling\n"); + if (dd) { + /* no more register accesses! 
*/ + dd->flags &= ~QIB_PRESENT; + qib_disable_after_error(dd); + } + /* else early, or other problem */ + ret = PCI_ERS_RESULT_DISCONNECT; + break; + + default: /* shouldn't happen */ + qib_devinfo(pdev, "QIB PCI errors detected (state %d)\n", + state); + break; + } + return ret; +} + +static pci_ers_result_t +qib_pci_mmio_enabled(struct pci_dev *pdev) +{ + u64 words = 0U; + struct qib_devdata *dd = pci_get_drvdata(pdev); + pci_ers_result_t ret = PCI_ERS_RESULT_RECOVERED; + + if (dd && dd->pport) { + words = dd->f_portcntr(dd->pport, QIBPORTCNTR_WORDRCV); + if (words == ~0ULL) + ret = PCI_ERS_RESULT_NEED_RESET; + } + qib_devinfo(pdev, + "QIB mmio_enabled function called, read wordscntr %Lx, returning %d\n", + words, ret); + return ret; +} + +static pci_ers_result_t +qib_pci_slot_reset(struct pci_dev *pdev) +{ + qib_devinfo(pdev, "QIB slot_reset function called, ignored\n"); + return PCI_ERS_RESULT_CAN_RECOVER; +} + +static pci_ers_result_t +qib_pci_link_reset(struct pci_dev *pdev) +{ + qib_devinfo(pdev, "QIB link_reset function called, ignored\n"); + return PCI_ERS_RESULT_CAN_RECOVER; +} + +static void +qib_pci_resume(struct pci_dev *pdev) +{ + struct qib_devdata *dd = pci_get_drvdata(pdev); + + qib_devinfo(pdev, "QIB resume function called\n"); + pci_cleanup_aer_uncorrect_error_status(pdev); + /* + * Running jobs will fail, since it's asynchronous + * unlike sysfs-requested reset. Better than + * doing nothing. + */ + qib_init(dd, 1); /* same as re-init after reset */ +} + +const struct pci_error_handlers qib_pci_err_handler = { + .error_detected = qib_pci_error_detected, + .mmio_enabled = qib_pci_mmio_enabled, + .link_reset = qib_pci_link_reset, + .slot_reset = qib_pci_slot_reset, + .resume = qib_pci_resume, +}; diff --git a/kernel/drivers/infiniband/hw/qib/qib_pio_copy.c b/kernel/drivers/infiniband/hw/qib/qib_pio_copy.c new file mode 100644 index 000000000..10b8c444d --- /dev/null +++ b/kernel/drivers/infiniband/hw/qib/qib_pio_copy.c @@ -0,0 +1,64 @@ +/* + * Copyright (c) 2009 QLogic Corporation. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include "qib.h" + +/** + * qib_pio_copy - copy data to MMIO space, in multiples of 32-bits + * @to: destination, in MMIO space (must be 64-bit aligned) + * @from: source (must be 64-bit aligned) + * @count: number of 32-bit quantities to copy + * + * Copy data from kernel space to MMIO space, in multiples of 32 bits at a + * time. Order of access is not guaranteed, nor is a memory barrier + * performed afterwards. + */ +void qib_pio_copy(void __iomem *to, const void *from, size_t count) +{ +#ifdef CONFIG_64BIT + u64 __iomem *dst = to; + const u64 *src = from; + const u64 *end = src + (count >> 1); + + while (src < end) + __raw_writeq(*src++, dst++); + if (count & 1) + __raw_writel(*(const u32 *)src, dst); +#else + u32 __iomem *dst = to; + const u32 *src = from; + const u32 *end = src + count; + + while (src < end) + __raw_writel(*src++, dst++); +#endif +} diff --git a/kernel/drivers/infiniband/hw/qib/qib_qp.c b/kernel/drivers/infiniband/hw/qib/qib_qp.c new file mode 100644 index 000000000..4fa88ba29 --- /dev/null +++ b/kernel/drivers/infiniband/hw/qib/qib_qp.c @@ -0,0 +1,1376 @@ +/* + * Copyright (c) 2012, 2013 Intel Corporation. All rights reserved. + * Copyright (c) 2006 - 2012 QLogic Corporation. * All rights reserved. + * Copyright (c) 2005, 2006 PathScale, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include +#include +#include +#ifdef CONFIG_DEBUG_FS +#include +#endif + +#include "qib.h" + +#define BITS_PER_PAGE (PAGE_SIZE*BITS_PER_BYTE) +#define BITS_PER_PAGE_MASK (BITS_PER_PAGE-1) + +static inline unsigned mk_qpn(struct qib_qpn_table *qpt, + struct qpn_map *map, unsigned off) +{ + return (map - qpt->map) * BITS_PER_PAGE + off; +} + +static inline unsigned find_next_offset(struct qib_qpn_table *qpt, + struct qpn_map *map, unsigned off, + unsigned n) +{ + if (qpt->mask) { + off++; + if (((off & qpt->mask) >> 1) >= n) + off = (off | qpt->mask) + 2; + } else + off = find_next_zero_bit(map->page, BITS_PER_PAGE, off); + return off; +} + +/* + * Convert the AETH credit code into the number of credits. 
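+ *
+ * The 5-bit credit code simply indexes the table below: e.g. code 0x7
+ * stands for 12 RWQE credits and code 0x1E for 32768, while
+ * QIB_AETH_CREDIT_INVAL is reserved to mean "no credit information"
+ * (see qib_compute_aeth() further down).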
+ */ +static u32 credit_table[31] = { + 0, /* 0 */ + 1, /* 1 */ + 2, /* 2 */ + 3, /* 3 */ + 4, /* 4 */ + 6, /* 5 */ + 8, /* 6 */ + 12, /* 7 */ + 16, /* 8 */ + 24, /* 9 */ + 32, /* A */ + 48, /* B */ + 64, /* C */ + 96, /* D */ + 128, /* E */ + 192, /* F */ + 256, /* 10 */ + 384, /* 11 */ + 512, /* 12 */ + 768, /* 13 */ + 1024, /* 14 */ + 1536, /* 15 */ + 2048, /* 16 */ + 3072, /* 17 */ + 4096, /* 18 */ + 6144, /* 19 */ + 8192, /* 1A */ + 12288, /* 1B */ + 16384, /* 1C */ + 24576, /* 1D */ + 32768 /* 1E */ +}; + +static void get_map_page(struct qib_qpn_table *qpt, struct qpn_map *map) +{ + unsigned long page = get_zeroed_page(GFP_KERNEL); + + /* + * Free the page if someone raced with us installing it. + */ + + spin_lock(&qpt->lock); + if (map->page) + free_page(page); + else + map->page = (void *)page; + spin_unlock(&qpt->lock); +} + +/* + * Allocate the next available QPN or + * zero/one for QP type IB_QPT_SMI/IB_QPT_GSI. + */ +static int alloc_qpn(struct qib_devdata *dd, struct qib_qpn_table *qpt, + enum ib_qp_type type, u8 port) +{ + u32 i, offset, max_scan, qpn; + struct qpn_map *map; + u32 ret; + + if (type == IB_QPT_SMI || type == IB_QPT_GSI) { + unsigned n; + + ret = type == IB_QPT_GSI; + n = 1 << (ret + 2 * (port - 1)); + spin_lock(&qpt->lock); + if (qpt->flags & n) + ret = -EINVAL; + else + qpt->flags |= n; + spin_unlock(&qpt->lock); + goto bail; + } + + qpn = qpt->last + 2; + if (qpn >= QPN_MAX) + qpn = 2; + if (qpt->mask && ((qpn & qpt->mask) >> 1) >= dd->n_krcv_queues) + qpn = (qpn | qpt->mask) + 2; + offset = qpn & BITS_PER_PAGE_MASK; + map = &qpt->map[qpn / BITS_PER_PAGE]; + max_scan = qpt->nmaps - !offset; + for (i = 0;;) { + if (unlikely(!map->page)) { + get_map_page(qpt, map); + if (unlikely(!map->page)) + break; + } + do { + if (!test_and_set_bit(offset, map->page)) { + qpt->last = qpn; + ret = qpn; + goto bail; + } + offset = find_next_offset(qpt, map, offset, + dd->n_krcv_queues); + qpn = mk_qpn(qpt, map, offset); + /* + * This test differs from alloc_pidmap(). + * If find_next_offset() does find a zero + * bit, we don't need to check for QPN + * wrapping around past our starting QPN. + * We just need to be sure we don't loop + * forever. + */ + } while (offset < BITS_PER_PAGE && qpn < QPN_MAX); + /* + * In order to keep the number of pages allocated to a + * minimum, we scan the all existing pages before increasing + * the size of the bitmap table. + */ + if (++i > max_scan) { + if (qpt->nmaps == QPNMAP_ENTRIES) + break; + map = &qpt->map[qpt->nmaps++]; + offset = 0; + } else if (map < &qpt->map[qpt->nmaps]) { + ++map; + offset = 0; + } else { + map = &qpt->map[0]; + offset = 2; + } + qpn = mk_qpn(qpt, map, offset); + } + + ret = -ENOMEM; + +bail: + return ret; +} + +static void free_qpn(struct qib_qpn_table *qpt, u32 qpn) +{ + struct qpn_map *map; + + map = qpt->map + qpn / BITS_PER_PAGE; + if (map->page) + clear_bit(qpn & BITS_PER_PAGE_MASK, map->page); +} + +static inline unsigned qpn_hash(struct qib_ibdev *dev, u32 qpn) +{ + return jhash_1word(qpn, dev->qp_rnd) & + (dev->qp_table_size - 1); +} + + +/* + * Put the QP into the hash table. + * The hash table holds a reference to the QP. 
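+ * QP0 and QP1 live in the per-port pointers ibp->qp0/ibp->qp1 rather
+ * than in the hash chains; all other QPs are pushed onto the head of
+ * their qpn_hash() bucket under dev->qpt_lock.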
+ */ +static void insert_qp(struct qib_ibdev *dev, struct qib_qp *qp) +{ + struct qib_ibport *ibp = to_iport(qp->ibqp.device, qp->port_num); + unsigned long flags; + unsigned n = qpn_hash(dev, qp->ibqp.qp_num); + + atomic_inc(&qp->refcount); + spin_lock_irqsave(&dev->qpt_lock, flags); + + if (qp->ibqp.qp_num == 0) + rcu_assign_pointer(ibp->qp0, qp); + else if (qp->ibqp.qp_num == 1) + rcu_assign_pointer(ibp->qp1, qp); + else { + qp->next = dev->qp_table[n]; + rcu_assign_pointer(dev->qp_table[n], qp); + } + + spin_unlock_irqrestore(&dev->qpt_lock, flags); +} + +/* + * Remove the QP from the table so it can't be found asynchronously by + * the receive interrupt routine. + */ +static void remove_qp(struct qib_ibdev *dev, struct qib_qp *qp) +{ + struct qib_ibport *ibp = to_iport(qp->ibqp.device, qp->port_num); + unsigned n = qpn_hash(dev, qp->ibqp.qp_num); + unsigned long flags; + int removed = 1; + + spin_lock_irqsave(&dev->qpt_lock, flags); + + if (rcu_dereference_protected(ibp->qp0, + lockdep_is_held(&dev->qpt_lock)) == qp) { + RCU_INIT_POINTER(ibp->qp0, NULL); + } else if (rcu_dereference_protected(ibp->qp1, + lockdep_is_held(&dev->qpt_lock)) == qp) { + RCU_INIT_POINTER(ibp->qp1, NULL); + } else { + struct qib_qp *q; + struct qib_qp __rcu **qpp; + + removed = 0; + qpp = &dev->qp_table[n]; + for (; (q = rcu_dereference_protected(*qpp, + lockdep_is_held(&dev->qpt_lock))) != NULL; + qpp = &q->next) + if (q == qp) { + RCU_INIT_POINTER(*qpp, + rcu_dereference_protected(qp->next, + lockdep_is_held(&dev->qpt_lock))); + removed = 1; + break; + } + } + + spin_unlock_irqrestore(&dev->qpt_lock, flags); + if (removed) { + synchronize_rcu(); + atomic_dec(&qp->refcount); + } +} + +/** + * qib_free_all_qps - check for QPs still in use + * @qpt: the QP table to empty + * + * There should not be any QPs still in use. + * Free memory for table. + */ +unsigned qib_free_all_qps(struct qib_devdata *dd) +{ + struct qib_ibdev *dev = &dd->verbs_dev; + unsigned long flags; + struct qib_qp *qp; + unsigned n, qp_inuse = 0; + + for (n = 0; n < dd->num_pports; n++) { + struct qib_ibport *ibp = &dd->pport[n].ibport_data; + + if (!qib_mcast_tree_empty(ibp)) + qp_inuse++; + rcu_read_lock(); + if (rcu_dereference(ibp->qp0)) + qp_inuse++; + if (rcu_dereference(ibp->qp1)) + qp_inuse++; + rcu_read_unlock(); + } + + spin_lock_irqsave(&dev->qpt_lock, flags); + for (n = 0; n < dev->qp_table_size; n++) { + qp = rcu_dereference_protected(dev->qp_table[n], + lockdep_is_held(&dev->qpt_lock)); + RCU_INIT_POINTER(dev->qp_table[n], NULL); + + for (; qp; qp = rcu_dereference_protected(qp->next, + lockdep_is_held(&dev->qpt_lock))) + qp_inuse++; + } + spin_unlock_irqrestore(&dev->qpt_lock, flags); + synchronize_rcu(); + + return qp_inuse; +} + +/** + * qib_lookup_qpn - return the QP with the given QPN + * @qpt: the QP table + * @qpn: the QP number to look up + * + * The caller is responsible for decrementing the QP reference count + * when done. 
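+ *
+ * Illustrative caller pattern (sketch only, based on how the refcount
+ * and qp->wait are used elsewhere in this file):
+ *
+ *   qp = qib_lookup_qpn(ibp, qpn);
+ *   if (qp) {
+ *           ... use qp ...
+ *           if (atomic_dec_and_test(&qp->refcount))
+ *                   wake_up(&qp->wait);
+ *   }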
+ */ +struct qib_qp *qib_lookup_qpn(struct qib_ibport *ibp, u32 qpn) +{ + struct qib_qp *qp = NULL; + + rcu_read_lock(); + if (unlikely(qpn <= 1)) { + if (qpn == 0) + qp = rcu_dereference(ibp->qp0); + else + qp = rcu_dereference(ibp->qp1); + if (qp) + atomic_inc(&qp->refcount); + } else { + struct qib_ibdev *dev = &ppd_from_ibp(ibp)->dd->verbs_dev; + unsigned n = qpn_hash(dev, qpn); + + for (qp = rcu_dereference(dev->qp_table[n]); qp; + qp = rcu_dereference(qp->next)) + if (qp->ibqp.qp_num == qpn) { + atomic_inc(&qp->refcount); + break; + } + } + rcu_read_unlock(); + return qp; +} + +/** + * qib_reset_qp - initialize the QP state to the reset state + * @qp: the QP to reset + * @type: the QP type + */ +static void qib_reset_qp(struct qib_qp *qp, enum ib_qp_type type) +{ + qp->remote_qpn = 0; + qp->qkey = 0; + qp->qp_access_flags = 0; + atomic_set(&qp->s_dma_busy, 0); + qp->s_flags &= QIB_S_SIGNAL_REQ_WR; + qp->s_hdrwords = 0; + qp->s_wqe = NULL; + qp->s_draining = 0; + qp->s_next_psn = 0; + qp->s_last_psn = 0; + qp->s_sending_psn = 0; + qp->s_sending_hpsn = 0; + qp->s_psn = 0; + qp->r_psn = 0; + qp->r_msn = 0; + if (type == IB_QPT_RC) { + qp->s_state = IB_OPCODE_RC_SEND_LAST; + qp->r_state = IB_OPCODE_RC_SEND_LAST; + } else { + qp->s_state = IB_OPCODE_UC_SEND_LAST; + qp->r_state = IB_OPCODE_UC_SEND_LAST; + } + qp->s_ack_state = IB_OPCODE_RC_ACKNOWLEDGE; + qp->r_nak_state = 0; + qp->r_aflags = 0; + qp->r_flags = 0; + qp->s_head = 0; + qp->s_tail = 0; + qp->s_cur = 0; + qp->s_acked = 0; + qp->s_last = 0; + qp->s_ssn = 1; + qp->s_lsn = 0; + qp->s_mig_state = IB_MIG_MIGRATED; + memset(qp->s_ack_queue, 0, sizeof(qp->s_ack_queue)); + qp->r_head_ack_queue = 0; + qp->s_tail_ack_queue = 0; + qp->s_num_rd_atomic = 0; + if (qp->r_rq.wq) { + qp->r_rq.wq->head = 0; + qp->r_rq.wq->tail = 0; + } + qp->r_sge.num_sge = 0; +} + +static void clear_mr_refs(struct qib_qp *qp, int clr_sends) +{ + unsigned n; + + if (test_and_clear_bit(QIB_R_REWIND_SGE, &qp->r_aflags)) + qib_put_ss(&qp->s_rdma_read_sge); + + qib_put_ss(&qp->r_sge); + + if (clr_sends) { + while (qp->s_last != qp->s_head) { + struct qib_swqe *wqe = get_swqe_ptr(qp, qp->s_last); + unsigned i; + + for (i = 0; i < wqe->wr.num_sge; i++) { + struct qib_sge *sge = &wqe->sg_list[i]; + + qib_put_mr(sge->mr); + } + if (qp->ibqp.qp_type == IB_QPT_UD || + qp->ibqp.qp_type == IB_QPT_SMI || + qp->ibqp.qp_type == IB_QPT_GSI) + atomic_dec(&to_iah(wqe->wr.wr.ud.ah)->refcount); + if (++qp->s_last >= qp->s_size) + qp->s_last = 0; + } + if (qp->s_rdma_mr) { + qib_put_mr(qp->s_rdma_mr); + qp->s_rdma_mr = NULL; + } + } + + if (qp->ibqp.qp_type != IB_QPT_RC) + return; + + for (n = 0; n < ARRAY_SIZE(qp->s_ack_queue); n++) { + struct qib_ack_entry *e = &qp->s_ack_queue[n]; + + if (e->opcode == IB_OPCODE_RC_RDMA_READ_REQUEST && + e->rdma_sge.mr) { + qib_put_mr(e->rdma_sge.mr); + e->rdma_sge.mr = NULL; + } + } +} + +/** + * qib_error_qp - put a QP into the error state + * @qp: the QP to put into the error state + * @err: the receive completion error to signal if a RWQE is active + * + * Flushes both send and receive work queues. + * Returns true if last WQE event should be generated. + * The QP r_lock and s_lock should be held and interrupts disabled. + * If we are already in error state, just return. 
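+ * The caller (see the IB_QPS_ERR case in qib_modify_qp() below) is
+ * responsible for generating the IB_EVENT_QP_LAST_WQE_REACHED event
+ * when this returns true.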
+ */ +int qib_error_qp(struct qib_qp *qp, enum ib_wc_status err) +{ + struct qib_ibdev *dev = to_idev(qp->ibqp.device); + struct ib_wc wc; + int ret = 0; + + if (qp->state == IB_QPS_ERR || qp->state == IB_QPS_RESET) + goto bail; + + qp->state = IB_QPS_ERR; + + if (qp->s_flags & (QIB_S_TIMER | QIB_S_WAIT_RNR)) { + qp->s_flags &= ~(QIB_S_TIMER | QIB_S_WAIT_RNR); + del_timer(&qp->s_timer); + } + + if (qp->s_flags & QIB_S_ANY_WAIT_SEND) + qp->s_flags &= ~QIB_S_ANY_WAIT_SEND; + + spin_lock(&dev->pending_lock); + if (!list_empty(&qp->iowait) && !(qp->s_flags & QIB_S_BUSY)) { + qp->s_flags &= ~QIB_S_ANY_WAIT_IO; + list_del_init(&qp->iowait); + } + spin_unlock(&dev->pending_lock); + + if (!(qp->s_flags & QIB_S_BUSY)) { + qp->s_hdrwords = 0; + if (qp->s_rdma_mr) { + qib_put_mr(qp->s_rdma_mr); + qp->s_rdma_mr = NULL; + } + if (qp->s_tx) { + qib_put_txreq(qp->s_tx); + qp->s_tx = NULL; + } + } + + /* Schedule the sending tasklet to drain the send work queue. */ + if (qp->s_last != qp->s_head) + qib_schedule_send(qp); + + clear_mr_refs(qp, 0); + + memset(&wc, 0, sizeof(wc)); + wc.qp = &qp->ibqp; + wc.opcode = IB_WC_RECV; + + if (test_and_clear_bit(QIB_R_WRID_VALID, &qp->r_aflags)) { + wc.wr_id = qp->r_wr_id; + wc.status = err; + qib_cq_enter(to_icq(qp->ibqp.recv_cq), &wc, 1); + } + wc.status = IB_WC_WR_FLUSH_ERR; + + if (qp->r_rq.wq) { + struct qib_rwq *wq; + u32 head; + u32 tail; + + spin_lock(&qp->r_rq.lock); + + /* sanity check pointers before trusting them */ + wq = qp->r_rq.wq; + head = wq->head; + if (head >= qp->r_rq.size) + head = 0; + tail = wq->tail; + if (tail >= qp->r_rq.size) + tail = 0; + while (tail != head) { + wc.wr_id = get_rwqe_ptr(&qp->r_rq, tail)->wr_id; + if (++tail >= qp->r_rq.size) + tail = 0; + qib_cq_enter(to_icq(qp->ibqp.recv_cq), &wc, 1); + } + wq->tail = tail; + + spin_unlock(&qp->r_rq.lock); + } else if (qp->ibqp.event_handler) + ret = 1; + +bail: + return ret; +} + +/** + * qib_modify_qp - modify the attributes of a queue pair + * @ibqp: the queue pair who's attributes we're modifying + * @attr: the new attributes + * @attr_mask: the mask of attributes to modify + * @udata: user data for libibverbs.so + * + * Returns 0 on success, otherwise returns an errno. + */ +int qib_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, + int attr_mask, struct ib_udata *udata) +{ + struct qib_ibdev *dev = to_idev(ibqp->device); + struct qib_qp *qp = to_iqp(ibqp); + enum ib_qp_state cur_state, new_state; + struct ib_event ev; + int lastwqe = 0; + int mig = 0; + int ret; + u32 pmtu = 0; /* for gcc warning only */ + + spin_lock_irq(&qp->r_lock); + spin_lock(&qp->s_lock); + + cur_state = attr_mask & IB_QP_CUR_STATE ? + attr->cur_qp_state : qp->state; + new_state = attr_mask & IB_QP_STATE ? 
attr->qp_state : cur_state; + + if (!ib_modify_qp_is_ok(cur_state, new_state, ibqp->qp_type, + attr_mask, IB_LINK_LAYER_UNSPECIFIED)) + goto inval; + + if (attr_mask & IB_QP_AV) { + if (attr->ah_attr.dlid >= QIB_MULTICAST_LID_BASE) + goto inval; + if (qib_check_ah(qp->ibqp.device, &attr->ah_attr)) + goto inval; + } + + if (attr_mask & IB_QP_ALT_PATH) { + if (attr->alt_ah_attr.dlid >= QIB_MULTICAST_LID_BASE) + goto inval; + if (qib_check_ah(qp->ibqp.device, &attr->alt_ah_attr)) + goto inval; + if (attr->alt_pkey_index >= qib_get_npkeys(dd_from_dev(dev))) + goto inval; + } + + if (attr_mask & IB_QP_PKEY_INDEX) + if (attr->pkey_index >= qib_get_npkeys(dd_from_dev(dev))) + goto inval; + + if (attr_mask & IB_QP_MIN_RNR_TIMER) + if (attr->min_rnr_timer > 31) + goto inval; + + if (attr_mask & IB_QP_PORT) + if (qp->ibqp.qp_type == IB_QPT_SMI || + qp->ibqp.qp_type == IB_QPT_GSI || + attr->port_num == 0 || + attr->port_num > ibqp->device->phys_port_cnt) + goto inval; + + if (attr_mask & IB_QP_DEST_QPN) + if (attr->dest_qp_num > QIB_QPN_MASK) + goto inval; + + if (attr_mask & IB_QP_RETRY_CNT) + if (attr->retry_cnt > 7) + goto inval; + + if (attr_mask & IB_QP_RNR_RETRY) + if (attr->rnr_retry > 7) + goto inval; + + /* + * Don't allow invalid path_mtu values. OK to set greater + * than the active mtu (or even the max_cap, if we have tuned + * that to a small mtu. We'll set qp->path_mtu + * to the lesser of requested attribute mtu and active, + * for packetizing messages. + * Note that the QP port has to be set in INIT and MTU in RTR. + */ + if (attr_mask & IB_QP_PATH_MTU) { + struct qib_devdata *dd = dd_from_dev(dev); + int mtu, pidx = qp->port_num - 1; + + mtu = ib_mtu_enum_to_int(attr->path_mtu); + if (mtu == -1) + goto inval; + if (mtu > dd->pport[pidx].ibmtu) { + switch (dd->pport[pidx].ibmtu) { + case 4096: + pmtu = IB_MTU_4096; + break; + case 2048: + pmtu = IB_MTU_2048; + break; + case 1024: + pmtu = IB_MTU_1024; + break; + case 512: + pmtu = IB_MTU_512; + break; + case 256: + pmtu = IB_MTU_256; + break; + default: + pmtu = IB_MTU_2048; + } + } else + pmtu = attr->path_mtu; + } + + if (attr_mask & IB_QP_PATH_MIG_STATE) { + if (attr->path_mig_state == IB_MIG_REARM) { + if (qp->s_mig_state == IB_MIG_ARMED) + goto inval; + if (new_state != IB_QPS_RTS) + goto inval; + } else if (attr->path_mig_state == IB_MIG_MIGRATED) { + if (qp->s_mig_state == IB_MIG_REARM) + goto inval; + if (new_state != IB_QPS_RTS && new_state != IB_QPS_SQD) + goto inval; + if (qp->s_mig_state == IB_MIG_ARMED) + mig = 1; + } else + goto inval; + } + + if (attr_mask & IB_QP_MAX_DEST_RD_ATOMIC) + if (attr->max_dest_rd_atomic > QIB_MAX_RDMA_ATOMIC) + goto inval; + + switch (new_state) { + case IB_QPS_RESET: + if (qp->state != IB_QPS_RESET) { + qp->state = IB_QPS_RESET; + spin_lock(&dev->pending_lock); + if (!list_empty(&qp->iowait)) + list_del_init(&qp->iowait); + spin_unlock(&dev->pending_lock); + qp->s_flags &= ~(QIB_S_TIMER | QIB_S_ANY_WAIT); + spin_unlock(&qp->s_lock); + spin_unlock_irq(&qp->r_lock); + /* Stop the sending work queue and retry timer */ + cancel_work_sync(&qp->s_work); + del_timer_sync(&qp->s_timer); + wait_event(qp->wait_dma, !atomic_read(&qp->s_dma_busy)); + if (qp->s_tx) { + qib_put_txreq(qp->s_tx); + qp->s_tx = NULL; + } + remove_qp(dev, qp); + wait_event(qp->wait, !atomic_read(&qp->refcount)); + spin_lock_irq(&qp->r_lock); + spin_lock(&qp->s_lock); + clear_mr_refs(qp, 1); + qib_reset_qp(qp, ibqp->qp_type); + } + break; + + case IB_QPS_RTR: + /* Allow event to retrigger if QP set to RTR more than once */ + 
qp->r_flags &= ~QIB_R_COMM_EST; + qp->state = new_state; + break; + + case IB_QPS_SQD: + qp->s_draining = qp->s_last != qp->s_cur; + qp->state = new_state; + break; + + case IB_QPS_SQE: + if (qp->ibqp.qp_type == IB_QPT_RC) + goto inval; + qp->state = new_state; + break; + + case IB_QPS_ERR: + lastwqe = qib_error_qp(qp, IB_WC_WR_FLUSH_ERR); + break; + + default: + qp->state = new_state; + break; + } + + if (attr_mask & IB_QP_PKEY_INDEX) + qp->s_pkey_index = attr->pkey_index; + + if (attr_mask & IB_QP_PORT) + qp->port_num = attr->port_num; + + if (attr_mask & IB_QP_DEST_QPN) + qp->remote_qpn = attr->dest_qp_num; + + if (attr_mask & IB_QP_SQ_PSN) { + qp->s_next_psn = attr->sq_psn & QIB_PSN_MASK; + qp->s_psn = qp->s_next_psn; + qp->s_sending_psn = qp->s_next_psn; + qp->s_last_psn = qp->s_next_psn - 1; + qp->s_sending_hpsn = qp->s_last_psn; + } + + if (attr_mask & IB_QP_RQ_PSN) + qp->r_psn = attr->rq_psn & QIB_PSN_MASK; + + if (attr_mask & IB_QP_ACCESS_FLAGS) + qp->qp_access_flags = attr->qp_access_flags; + + if (attr_mask & IB_QP_AV) { + qp->remote_ah_attr = attr->ah_attr; + qp->s_srate = attr->ah_attr.static_rate; + } + + if (attr_mask & IB_QP_ALT_PATH) { + qp->alt_ah_attr = attr->alt_ah_attr; + qp->s_alt_pkey_index = attr->alt_pkey_index; + } + + if (attr_mask & IB_QP_PATH_MIG_STATE) { + qp->s_mig_state = attr->path_mig_state; + if (mig) { + qp->remote_ah_attr = qp->alt_ah_attr; + qp->port_num = qp->alt_ah_attr.port_num; + qp->s_pkey_index = qp->s_alt_pkey_index; + } + } + + if (attr_mask & IB_QP_PATH_MTU) { + qp->path_mtu = pmtu; + qp->pmtu = ib_mtu_enum_to_int(pmtu); + } + + if (attr_mask & IB_QP_RETRY_CNT) { + qp->s_retry_cnt = attr->retry_cnt; + qp->s_retry = attr->retry_cnt; + } + + if (attr_mask & IB_QP_RNR_RETRY) { + qp->s_rnr_retry_cnt = attr->rnr_retry; + qp->s_rnr_retry = attr->rnr_retry; + } + + if (attr_mask & IB_QP_MIN_RNR_TIMER) + qp->r_min_rnr_timer = attr->min_rnr_timer; + + if (attr_mask & IB_QP_TIMEOUT) { + qp->timeout = attr->timeout; + qp->timeout_jiffies = + usecs_to_jiffies((4096UL * (1UL << qp->timeout)) / + 1000UL); + } + + if (attr_mask & IB_QP_QKEY) + qp->qkey = attr->qkey; + + if (attr_mask & IB_QP_MAX_DEST_RD_ATOMIC) + qp->r_max_rd_atomic = attr->max_dest_rd_atomic; + + if (attr_mask & IB_QP_MAX_QP_RD_ATOMIC) + qp->s_max_rd_atomic = attr->max_rd_atomic; + + spin_unlock(&qp->s_lock); + spin_unlock_irq(&qp->r_lock); + + if (cur_state == IB_QPS_RESET && new_state == IB_QPS_INIT) + insert_qp(dev, qp); + + if (lastwqe) { + ev.device = qp->ibqp.device; + ev.element.qp = &qp->ibqp; + ev.event = IB_EVENT_QP_LAST_WQE_REACHED; + qp->ibqp.event_handler(&ev, qp->ibqp.qp_context); + } + if (mig) { + ev.device = qp->ibqp.device; + ev.element.qp = &qp->ibqp; + ev.event = IB_EVENT_PATH_MIG; + qp->ibqp.event_handler(&ev, qp->ibqp.qp_context); + } + ret = 0; + goto bail; + +inval: + spin_unlock(&qp->s_lock); + spin_unlock_irq(&qp->r_lock); + ret = -EINVAL; + +bail: + return ret; +} + +int qib_query_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, + int attr_mask, struct ib_qp_init_attr *init_attr) +{ + struct qib_qp *qp = to_iqp(ibqp); + + attr->qp_state = qp->state; + attr->cur_qp_state = attr->qp_state; + attr->path_mtu = qp->path_mtu; + attr->path_mig_state = qp->s_mig_state; + attr->qkey = qp->qkey; + attr->rq_psn = qp->r_psn & QIB_PSN_MASK; + attr->sq_psn = qp->s_next_psn & QIB_PSN_MASK; + attr->dest_qp_num = qp->remote_qpn; + attr->qp_access_flags = qp->qp_access_flags; + attr->cap.max_send_wr = qp->s_size - 1; + attr->cap.max_recv_wr = qp->ibqp.srq ? 
0 : qp->r_rq.size - 1; + attr->cap.max_send_sge = qp->s_max_sge; + attr->cap.max_recv_sge = qp->r_rq.max_sge; + attr->cap.max_inline_data = 0; + attr->ah_attr = qp->remote_ah_attr; + attr->alt_ah_attr = qp->alt_ah_attr; + attr->pkey_index = qp->s_pkey_index; + attr->alt_pkey_index = qp->s_alt_pkey_index; + attr->en_sqd_async_notify = 0; + attr->sq_draining = qp->s_draining; + attr->max_rd_atomic = qp->s_max_rd_atomic; + attr->max_dest_rd_atomic = qp->r_max_rd_atomic; + attr->min_rnr_timer = qp->r_min_rnr_timer; + attr->port_num = qp->port_num; + attr->timeout = qp->timeout; + attr->retry_cnt = qp->s_retry_cnt; + attr->rnr_retry = qp->s_rnr_retry_cnt; + attr->alt_port_num = qp->alt_ah_attr.port_num; + attr->alt_timeout = qp->alt_timeout; + + init_attr->event_handler = qp->ibqp.event_handler; + init_attr->qp_context = qp->ibqp.qp_context; + init_attr->send_cq = qp->ibqp.send_cq; + init_attr->recv_cq = qp->ibqp.recv_cq; + init_attr->srq = qp->ibqp.srq; + init_attr->cap = attr->cap; + if (qp->s_flags & QIB_S_SIGNAL_REQ_WR) + init_attr->sq_sig_type = IB_SIGNAL_REQ_WR; + else + init_attr->sq_sig_type = IB_SIGNAL_ALL_WR; + init_attr->qp_type = qp->ibqp.qp_type; + init_attr->port_num = qp->port_num; + return 0; +} + +/** + * qib_compute_aeth - compute the AETH (syndrome + MSN) + * @qp: the queue pair to compute the AETH for + * + * Returns the AETH. + */ +__be32 qib_compute_aeth(struct qib_qp *qp) +{ + u32 aeth = qp->r_msn & QIB_MSN_MASK; + + if (qp->ibqp.srq) { + /* + * Shared receive queues don't generate credits. + * Set the credit field to the invalid value. + */ + aeth |= QIB_AETH_CREDIT_INVAL << QIB_AETH_CREDIT_SHIFT; + } else { + u32 min, max, x; + u32 credits; + struct qib_rwq *wq = qp->r_rq.wq; + u32 head; + u32 tail; + + /* sanity check pointers before trusting them */ + head = wq->head; + if (head >= qp->r_rq.size) + head = 0; + tail = wq->tail; + if (tail >= qp->r_rq.size) + tail = 0; + /* + * Compute the number of credits available (RWQEs). + * XXX Not holding the r_rq.lock here so there is a small + * chance that the pair of reads are not atomic. + */ + credits = head - tail; + if ((int)credits < 0) + credits += qp->r_rq.size; + /* + * Binary search the credit table to find the code to + * use. + */ + min = 0; + max = 31; + for (;;) { + x = (min + max) / 2; + if (credit_table[x] == credits) + break; + if (credit_table[x] > credits) + max = x; + else if (min == x) + break; + else + min = x; + } + aeth |= x << QIB_AETH_CREDIT_SHIFT; + } + return cpu_to_be32(aeth); +} + +/** + * qib_create_qp - create a queue pair for a device + * @ibpd: the protection domain who's device we create the queue pair for + * @init_attr: the attributes of the queue pair + * @udata: user data for libibverbs.so + * + * Returns the queue pair on success, otherwise returns an errno. + * + * Called by the ib_create_qp() core verbs function. + */ +struct ib_qp *qib_create_qp(struct ib_pd *ibpd, + struct ib_qp_init_attr *init_attr, + struct ib_udata *udata) +{ + struct qib_qp *qp; + int err; + struct qib_swqe *swq = NULL; + struct qib_ibdev *dev; + struct qib_devdata *dd; + size_t sz; + size_t sg_list_sz; + struct ib_qp *ret; + + if (init_attr->cap.max_send_sge > ib_qib_max_sges || + init_attr->cap.max_send_wr > ib_qib_max_qp_wrs || + init_attr->create_flags) { + ret = ERR_PTR(-EINVAL); + goto bail; + } + + /* Check receive queue parameters if no SRQ is specified. 
*/ + if (!init_attr->srq) { + if (init_attr->cap.max_recv_sge > ib_qib_max_sges || + init_attr->cap.max_recv_wr > ib_qib_max_qp_wrs) { + ret = ERR_PTR(-EINVAL); + goto bail; + } + if (init_attr->cap.max_send_sge + + init_attr->cap.max_send_wr + + init_attr->cap.max_recv_sge + + init_attr->cap.max_recv_wr == 0) { + ret = ERR_PTR(-EINVAL); + goto bail; + } + } + + switch (init_attr->qp_type) { + case IB_QPT_SMI: + case IB_QPT_GSI: + if (init_attr->port_num == 0 || + init_attr->port_num > ibpd->device->phys_port_cnt) { + ret = ERR_PTR(-EINVAL); + goto bail; + } + case IB_QPT_UC: + case IB_QPT_RC: + case IB_QPT_UD: + sz = sizeof(struct qib_sge) * + init_attr->cap.max_send_sge + + sizeof(struct qib_swqe); + swq = vmalloc((init_attr->cap.max_send_wr + 1) * sz); + if (swq == NULL) { + ret = ERR_PTR(-ENOMEM); + goto bail; + } + sz = sizeof(*qp); + sg_list_sz = 0; + if (init_attr->srq) { + struct qib_srq *srq = to_isrq(init_attr->srq); + + if (srq->rq.max_sge > 1) + sg_list_sz = sizeof(*qp->r_sg_list) * + (srq->rq.max_sge - 1); + } else if (init_attr->cap.max_recv_sge > 1) + sg_list_sz = sizeof(*qp->r_sg_list) * + (init_attr->cap.max_recv_sge - 1); + qp = kzalloc(sz + sg_list_sz, GFP_KERNEL); + if (!qp) { + ret = ERR_PTR(-ENOMEM); + goto bail_swq; + } + RCU_INIT_POINTER(qp->next, NULL); + qp->s_hdr = kzalloc(sizeof(*qp->s_hdr), GFP_KERNEL); + if (!qp->s_hdr) { + ret = ERR_PTR(-ENOMEM); + goto bail_qp; + } + qp->timeout_jiffies = + usecs_to_jiffies((4096UL * (1UL << qp->timeout)) / + 1000UL); + if (init_attr->srq) + sz = 0; + else { + qp->r_rq.size = init_attr->cap.max_recv_wr + 1; + qp->r_rq.max_sge = init_attr->cap.max_recv_sge; + sz = (sizeof(struct ib_sge) * qp->r_rq.max_sge) + + sizeof(struct qib_rwqe); + qp->r_rq.wq = vmalloc_user(sizeof(struct qib_rwq) + + qp->r_rq.size * sz); + if (!qp->r_rq.wq) { + ret = ERR_PTR(-ENOMEM); + goto bail_qp; + } + } + + /* + * ib_create_qp() will initialize qp->ibqp + * except for qp->ibqp.qp_num. + */ + spin_lock_init(&qp->r_lock); + spin_lock_init(&qp->s_lock); + spin_lock_init(&qp->r_rq.lock); + atomic_set(&qp->refcount, 0); + init_waitqueue_head(&qp->wait); + init_waitqueue_head(&qp->wait_dma); + init_timer(&qp->s_timer); + qp->s_timer.data = (unsigned long)qp; + INIT_WORK(&qp->s_work, qib_do_send); + INIT_LIST_HEAD(&qp->iowait); + INIT_LIST_HEAD(&qp->rspwait); + qp->state = IB_QPS_RESET; + qp->s_wq = swq; + qp->s_size = init_attr->cap.max_send_wr + 1; + qp->s_max_sge = init_attr->cap.max_send_sge; + if (init_attr->sq_sig_type == IB_SIGNAL_REQ_WR) + qp->s_flags = QIB_S_SIGNAL_REQ_WR; + dev = to_idev(ibpd->device); + dd = dd_from_dev(dev); + err = alloc_qpn(dd, &dev->qpn_table, init_attr->qp_type, + init_attr->port_num); + if (err < 0) { + ret = ERR_PTR(err); + vfree(qp->r_rq.wq); + goto bail_qp; + } + qp->ibqp.qp_num = err; + qp->port_num = init_attr->port_num; + qib_reset_qp(qp, init_attr->qp_type); + break; + + default: + /* Don't support raw QPs */ + ret = ERR_PTR(-ENOSYS); + goto bail; + } + + init_attr->cap.max_inline_data = 0; + + /* + * Return the address of the RWQ as the offset to mmap. + * See qib_mmap() for details. 
+ */ + if (udata && udata->outlen >= sizeof(__u64)) { + if (!qp->r_rq.wq) { + __u64 offset = 0; + + err = ib_copy_to_udata(udata, &offset, + sizeof(offset)); + if (err) { + ret = ERR_PTR(err); + goto bail_ip; + } + } else { + u32 s = sizeof(struct qib_rwq) + qp->r_rq.size * sz; + + qp->ip = qib_create_mmap_info(dev, s, + ibpd->uobject->context, + qp->r_rq.wq); + if (!qp->ip) { + ret = ERR_PTR(-ENOMEM); + goto bail_ip; + } + + err = ib_copy_to_udata(udata, &(qp->ip->offset), + sizeof(qp->ip->offset)); + if (err) { + ret = ERR_PTR(err); + goto bail_ip; + } + } + } + + spin_lock(&dev->n_qps_lock); + if (dev->n_qps_allocated == ib_qib_max_qps) { + spin_unlock(&dev->n_qps_lock); + ret = ERR_PTR(-ENOMEM); + goto bail_ip; + } + + dev->n_qps_allocated++; + spin_unlock(&dev->n_qps_lock); + + if (qp->ip) { + spin_lock_irq(&dev->pending_lock); + list_add(&qp->ip->pending_mmaps, &dev->pending_mmaps); + spin_unlock_irq(&dev->pending_lock); + } + + ret = &qp->ibqp; + goto bail; + +bail_ip: + if (qp->ip) + kref_put(&qp->ip->ref, qib_release_mmap_info); + else + vfree(qp->r_rq.wq); + free_qpn(&dev->qpn_table, qp->ibqp.qp_num); +bail_qp: + kfree(qp->s_hdr); + kfree(qp); +bail_swq: + vfree(swq); +bail: + return ret; +} + +/** + * qib_destroy_qp - destroy a queue pair + * @ibqp: the queue pair to destroy + * + * Returns 0 on success. + * + * Note that this can be called while the QP is actively sending or + * receiving! + */ +int qib_destroy_qp(struct ib_qp *ibqp) +{ + struct qib_qp *qp = to_iqp(ibqp); + struct qib_ibdev *dev = to_idev(ibqp->device); + + /* Make sure HW and driver activity is stopped. */ + spin_lock_irq(&qp->s_lock); + if (qp->state != IB_QPS_RESET) { + qp->state = IB_QPS_RESET; + spin_lock(&dev->pending_lock); + if (!list_empty(&qp->iowait)) + list_del_init(&qp->iowait); + spin_unlock(&dev->pending_lock); + qp->s_flags &= ~(QIB_S_TIMER | QIB_S_ANY_WAIT); + spin_unlock_irq(&qp->s_lock); + cancel_work_sync(&qp->s_work); + del_timer_sync(&qp->s_timer); + wait_event(qp->wait_dma, !atomic_read(&qp->s_dma_busy)); + if (qp->s_tx) { + qib_put_txreq(qp->s_tx); + qp->s_tx = NULL; + } + remove_qp(dev, qp); + wait_event(qp->wait, !atomic_read(&qp->refcount)); + clear_mr_refs(qp, 1); + } else + spin_unlock_irq(&qp->s_lock); + + /* all user's cleaned up, mark it available */ + free_qpn(&dev->qpn_table, qp->ibqp.qp_num); + spin_lock(&dev->n_qps_lock); + dev->n_qps_allocated--; + spin_unlock(&dev->n_qps_lock); + + if (qp->ip) + kref_put(&qp->ip->ref, qib_release_mmap_info); + else + vfree(qp->r_rq.wq); + vfree(qp->s_wq); + kfree(qp->s_hdr); + kfree(qp); + return 0; +} + +/** + * qib_init_qpn_table - initialize the QP number table for a device + * @qpt: the QPN table + */ +void qib_init_qpn_table(struct qib_devdata *dd, struct qib_qpn_table *qpt) +{ + spin_lock_init(&qpt->lock); + qpt->last = 1; /* start with QPN 2 */ + qpt->nmaps = 1; + qpt->mask = dd->qpn_mask; +} + +/** + * qib_free_qpn_table - free the QP number table for a device + * @qpt: the QPN table + */ +void qib_free_qpn_table(struct qib_qpn_table *qpt) +{ + int i; + + for (i = 0; i < ARRAY_SIZE(qpt->map); i++) + if (qpt->map[i].page) + free_page((unsigned long) qpt->map[i].page); +} + +/** + * qib_get_credit - flush the send work queue of a QP + * @qp: the qp who's send work queue to flush + * @aeth: the Acknowledge Extended Transport Header + * + * The QP s_lock should be held. 
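A standalone sketch, not part of the imported qib sources: qib_compute_aeth() above encodes the number of free receive WQEs into the 5-bit AETH credit field by binary-searching a 32-entry credit table, and qib_get_credit() just below reverses that mapping through the same table. The snippet mirrors the search with a made-up table; the table values and the name credit_code() are illustrative assumptions, not the driver's real data.

#include <stdint.h>

/* Made-up monotonic credit table; the driver's real table differs. */
static const uint32_t demo_credit_table[32] = {
	0, 1, 2, 3, 4, 6, 8, 12, 16, 24, 32, 48, 64, 96, 128, 192,
	256, 384, 512, 768, 1024, 1536, 2048, 3072, 4096, 6144, 8192,
	12288, 16384, 24576, 32768, 49152
};

/*
 * Same loop shape as qib_compute_aeth(): returns a code x whose table
 * entry does not exceed 'credits', so the responder never advertises
 * more receive WQEs than it actually has.
 */
static uint32_t credit_code(uint32_t credits)
{
	uint32_t min = 0, max = 31, x;

	for (;;) {
		x = (min + max) / 2;
		if (demo_credit_table[x] == credits)
			break;
		if (demo_credit_table[x] > credits)
			max = x;
		else if (min == x)	/* interval can no longer shrink */
			break;
		else
			min = x;
	}
	return x;
}

For example, with this table credit_code(5) lands on the code for 4 RWQEs rather than 6, which is the conservative choice the driver wants.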
+ */ +void qib_get_credit(struct qib_qp *qp, u32 aeth) +{ + u32 credit = (aeth >> QIB_AETH_CREDIT_SHIFT) & QIB_AETH_CREDIT_MASK; + + /* + * If the credit is invalid, we can send + * as many packets as we like. Otherwise, we have to + * honor the credit field. + */ + if (credit == QIB_AETH_CREDIT_INVAL) { + if (!(qp->s_flags & QIB_S_UNLIMITED_CREDIT)) { + qp->s_flags |= QIB_S_UNLIMITED_CREDIT; + if (qp->s_flags & QIB_S_WAIT_SSN_CREDIT) { + qp->s_flags &= ~QIB_S_WAIT_SSN_CREDIT; + qib_schedule_send(qp); + } + } + } else if (!(qp->s_flags & QIB_S_UNLIMITED_CREDIT)) { + /* Compute new LSN (i.e., MSN + credit) */ + credit = (aeth + credit_table[credit]) & QIB_MSN_MASK; + if (qib_cmp24(credit, qp->s_lsn) > 0) { + qp->s_lsn = credit; + if (qp->s_flags & QIB_S_WAIT_SSN_CREDIT) { + qp->s_flags &= ~QIB_S_WAIT_SSN_CREDIT; + qib_schedule_send(qp); + } + } + } +} + +#ifdef CONFIG_DEBUG_FS + +struct qib_qp_iter { + struct qib_ibdev *dev; + struct qib_qp *qp; + int n; +}; + +struct qib_qp_iter *qib_qp_iter_init(struct qib_ibdev *dev) +{ + struct qib_qp_iter *iter; + + iter = kzalloc(sizeof(*iter), GFP_KERNEL); + if (!iter) + return NULL; + + iter->dev = dev; + if (qib_qp_iter_next(iter)) { + kfree(iter); + return NULL; + } + + return iter; +} + +int qib_qp_iter_next(struct qib_qp_iter *iter) +{ + struct qib_ibdev *dev = iter->dev; + int n = iter->n; + int ret = 1; + struct qib_qp *pqp = iter->qp; + struct qib_qp *qp; + + for (; n < dev->qp_table_size; n++) { + if (pqp) + qp = rcu_dereference(pqp->next); + else + qp = rcu_dereference(dev->qp_table[n]); + pqp = qp; + if (qp) { + iter->qp = qp; + iter->n = n; + return 0; + } + } + return ret; +} + +static const char * const qp_type_str[] = { + "SMI", "GSI", "RC", "UC", "UD", +}; + +void qib_qp_iter_print(struct seq_file *s, struct qib_qp_iter *iter) +{ + struct qib_swqe *wqe; + struct qib_qp *qp = iter->qp; + + wqe = get_swqe_ptr(qp, qp->s_last); + seq_printf(s, + "N %d QP%u %s %u %u %u f=%x %u %u %u %u %u PSN %x %x %x %x %x (%u %u %u %u %u %u) QP%u LID %x\n", + iter->n, + qp->ibqp.qp_num, + qp_type_str[qp->ibqp.qp_type], + qp->state, + wqe->wr.opcode, + qp->s_hdrwords, + qp->s_flags, + atomic_read(&qp->s_dma_busy), + !list_empty(&qp->iowait), + qp->timeout, + wqe->ssn, + qp->s_lsn, + qp->s_last_psn, + qp->s_psn, qp->s_next_psn, + qp->s_sending_psn, qp->s_sending_hpsn, + qp->s_last, qp->s_acked, qp->s_cur, + qp->s_tail, qp->s_head, qp->s_size, + qp->remote_qpn, + qp->remote_ah_attr.dlid); +} + +#endif diff --git a/kernel/drivers/infiniband/hw/qib/qib_qsfp.c b/kernel/drivers/infiniband/hw/qib/qib_qsfp.c new file mode 100644 index 000000000..5e27f7680 --- /dev/null +++ b/kernel/drivers/infiniband/hw/qib/qib_qsfp.c @@ -0,0 +1,559 @@ +/* + * Copyright (c) 2006, 2007, 2008, 2009 QLogic Corporation. All rights reserved. + * Copyright (c) 2003, 2004, 2005, 2006 PathScale, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. 
+ * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include <linux/delay.h> +#include <linux/pci.h> +#include <linux/vmalloc.h> + +#include "qib.h" +#include "qib_qsfp.h" + +/* + * QSFP support for ib_qib driver, using "Two Wire Serial Interface" driver + * in qib_twsi.c + */ +#define QSFP_MAX_RETRY 4 + +static int qsfp_read(struct qib_pportdata *ppd, int addr, void *bp, int len) +{ + struct qib_devdata *dd = ppd->dd; + u32 out, mask; + int ret, cnt, pass = 0; + int stuck = 0; + u8 *buff = bp; + + ret = mutex_lock_interruptible(&dd->eep_lock); + if (ret) + goto no_unlock; + + if (dd->twsi_eeprom_dev == QIB_TWSI_NO_DEV) { + ret = -ENXIO; + goto bail; + } + + /* + * We presume, if we are called at all, that this board has + * QSFP. This is on the same i2c chain as the legacy parts, + * but only responds if the module is selected via GPIO pins. + * Further, there are very long setup and hold requirements + * on MODSEL. + */ + mask = QSFP_GPIO_MOD_SEL_N | QSFP_GPIO_MOD_RST_N | QSFP_GPIO_LP_MODE; + out = QSFP_GPIO_MOD_RST_N | QSFP_GPIO_LP_MODE; + if (ppd->hw_pidx) { + mask <<= QSFP_GPIO_PORT2_SHIFT; + out <<= QSFP_GPIO_PORT2_SHIFT; + } + + dd->f_gpio_mod(dd, out, mask, mask); + + /* + * Module could take up to 2 Msec to respond to MOD_SEL, and there + * is no way to tell if it is ready, so we must wait. + */ + msleep(20); + + /* Make sure TWSI bus is in sane state. */ + ret = qib_twsi_reset(dd); + if (ret) { + qib_dev_porterr(dd, ppd->port, + "QSFP interface Reset for read failed\n"); + ret = -EIO; + stuck = 1; + goto deselect; + } + + /* All QSFP modules are at A0 */ + + cnt = 0; + while (cnt < len) { + unsigned in_page; + int wlen = len - cnt; + + in_page = addr % QSFP_PAGESIZE; + if ((in_page + wlen) > QSFP_PAGESIZE) + wlen = QSFP_PAGESIZE - in_page; + ret = qib_twsi_blk_rd(dd, QSFP_DEV, addr, buff + cnt, wlen); + /* Some QSFP's fail first try. Retry as experiment */ + if (ret && cnt == 0 && ++pass < QSFP_MAX_RETRY) + continue; + if (ret) { + /* qib_twsi_blk_rd() 1 for error, else 0 */ + ret = -EIO; + goto deselect; + } + addr += wlen; + cnt += wlen; + } + ret = cnt; + +deselect: + /* + * Module could take up to 10 uSec after transfer before + * ready to respond to MOD_SEL negation, and there is no way + * to tell if it is ready, so we must wait. + */ + udelay(10); + /* set QSFP MODSEL, RST. LP all high */ + dd->f_gpio_mod(dd, mask, mask, mask); + + /* + * Module could take up to 2 Msec to respond to MOD_SEL + * going away, and there is no way to tell if it is ready. + * so we must wait.
+ */ + if (stuck) + qib_dev_err(dd, "QSFP interface bus stuck non-idle\n"); + + if (pass >= QSFP_MAX_RETRY && ret) + qib_dev_porterr(dd, ppd->port, "QSFP failed even retrying\n"); + else if (pass) + qib_dev_porterr(dd, ppd->port, "QSFP retries: %d\n", pass); + + msleep(20); + +bail: + mutex_unlock(&dd->eep_lock); + +no_unlock: + return ret; +} + +/* + * qsfp_write + * We do not ordinarily write the QSFP, but this is needed to select + * the page on non-flat QSFPs, and possibly later unusual cases + */ +static int qib_qsfp_write(struct qib_pportdata *ppd, int addr, void *bp, + int len) +{ + struct qib_devdata *dd = ppd->dd; + u32 out, mask; + int ret, cnt; + u8 *buff = bp; + + ret = mutex_lock_interruptible(&dd->eep_lock); + if (ret) + goto no_unlock; + + if (dd->twsi_eeprom_dev == QIB_TWSI_NO_DEV) { + ret = -ENXIO; + goto bail; + } + + /* + * We presume, if we are called at all, that this board has + * QSFP. This is on the same i2c chain as the legacy parts, + * but only responds if the module is selected via GPIO pins. + * Further, there are very long setup and hold requirements + * on MODSEL. + */ + mask = QSFP_GPIO_MOD_SEL_N | QSFP_GPIO_MOD_RST_N | QSFP_GPIO_LP_MODE; + out = QSFP_GPIO_MOD_RST_N | QSFP_GPIO_LP_MODE; + if (ppd->hw_pidx) { + mask <<= QSFP_GPIO_PORT2_SHIFT; + out <<= QSFP_GPIO_PORT2_SHIFT; + } + dd->f_gpio_mod(dd, out, mask, mask); + + /* + * Module could take up to 2 Msec to respond to MOD_SEL, + * and there is no way to tell if it is ready, so we must wait. + */ + msleep(20); + + /* Make sure TWSI bus is in sane state. */ + ret = qib_twsi_reset(dd); + if (ret) { + qib_dev_porterr(dd, ppd->port, + "QSFP interface Reset for write failed\n"); + ret = -EIO; + goto deselect; + } + + /* All QSFP modules are at A0 */ + + cnt = 0; + while (cnt < len) { + unsigned in_page; + int wlen = len - cnt; + + in_page = addr % QSFP_PAGESIZE; + if ((in_page + wlen) > QSFP_PAGESIZE) + wlen = QSFP_PAGESIZE - in_page; + ret = qib_twsi_blk_wr(dd, QSFP_DEV, addr, buff + cnt, wlen); + if (ret) { + /* qib_twsi_blk_wr() 1 for error, else 0 */ + ret = -EIO; + goto deselect; + } + addr += wlen; + cnt += wlen; + } + ret = cnt; + +deselect: + /* + * Module could take up to 10 uSec after transfer before + * ready to respond to MOD_SEL negation, and there is no way + * to tell if it is ready, so we must wait. + */ + udelay(10); + /* set QSFP MODSEL, RST, LP high */ + dd->f_gpio_mod(dd, mask, mask, mask); + /* + * Module could take up to 2 Msec to respond to MOD_SEL + * going away, and there is no way to tell if it is ready. + * so we must wait. + */ + msleep(20); + +bail: + mutex_unlock(&dd->eep_lock); + +no_unlock: + return ret; +} + +/* + * For validation, we want to check the checksums, even of the + * fields we do not otherwise use. 
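Illustration only, not from the imported sources: the validation that follows reduces to two 8-bit checksums over the QSFP EEPROM, one over bytes 128..190 compared against byte 191 and one over bytes 192..222 compared against byte 223 (see QSFP_CC_OFFS and QSFP_CC_EXT_OFFS in qib_qsfp.h further down in this patch). The helper name qsfp_region_cks_ok() is an assumption made for this sketch.

#include <stddef.h>
#include <stdint.h>

/*
 * Return nonzero if the low 8 bits of the byte-wise sum over
 * eeprom[first .. cks_offs - 1] match the checksum byte stored at
 * eeprom[cks_offs].
 */
static int qsfp_region_cks_ok(const uint8_t *eeprom, size_t first,
			      size_t cks_offs)
{
	unsigned int sum = 0;
	size_t i;

	for (i = first; i < cks_offs; i++)
		sum += eeprom[i];
	return (sum & 0xFF) == eeprom[cks_offs];
}

Given a full copy of the module's lower page, the two checks would be qsfp_region_cks_ok(page, 128, 191) and qsfp_region_cks_ok(page, 192, 223); the driver performs the same sums incrementally as it reads each field.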
This function reads the bytes from + * to and returns the 8lsbs of the sum, or <0 for errors + */ +static int qsfp_cks(struct qib_pportdata *ppd, int first, int next) +{ + int ret; + u16 cks; + u8 bval; + + cks = 0; + while (first < next) { + ret = qsfp_read(ppd, first, &bval, 1); + if (ret < 0) + goto bail; + cks += bval; + ++first; + } + ret = cks & 0xFF; +bail: + return ret; + +} + +int qib_refresh_qsfp_cache(struct qib_pportdata *ppd, struct qib_qsfp_cache *cp) +{ + int ret; + int idx; + u16 cks; + u8 peek[4]; + + /* ensure sane contents on invalid reads, for cable swaps */ + memset(cp, 0, sizeof(*cp)); + + if (!qib_qsfp_mod_present(ppd)) { + ret = -ENODEV; + goto bail; + } + + ret = qsfp_read(ppd, 0, peek, 3); + if (ret < 0) + goto bail; + if ((peek[0] & 0xFE) != 0x0C) + qib_dev_porterr(ppd->dd, ppd->port, + "QSFP byte0 is 0x%02X, S/B 0x0C/D\n", peek[0]); + + if ((peek[2] & 2) == 0) { + /* + * If cable is paged, rather than "flat memory", we need to + * set the page to zero, Even if it already appears to be zero. + */ + u8 poke = 0; + + ret = qib_qsfp_write(ppd, 127, &poke, 1); + udelay(50); + if (ret != 1) { + qib_dev_porterr(ppd->dd, ppd->port, + "Failed QSFP Page set\n"); + goto bail; + } + } + + ret = qsfp_read(ppd, QSFP_MOD_ID_OFFS, &cp->id, 1); + if (ret < 0) + goto bail; + if ((cp->id & 0xFE) != 0x0C) + qib_dev_porterr(ppd->dd, ppd->port, + "QSFP ID byte is 0x%02X, S/B 0x0C/D\n", cp->id); + cks = cp->id; + + ret = qsfp_read(ppd, QSFP_MOD_PWR_OFFS, &cp->pwr, 1); + if (ret < 0) + goto bail; + cks += cp->pwr; + + ret = qsfp_cks(ppd, QSFP_MOD_PWR_OFFS + 1, QSFP_MOD_LEN_OFFS); + if (ret < 0) + goto bail; + cks += ret; + + ret = qsfp_read(ppd, QSFP_MOD_LEN_OFFS, &cp->len, 1); + if (ret < 0) + goto bail; + cks += cp->len; + + ret = qsfp_read(ppd, QSFP_MOD_TECH_OFFS, &cp->tech, 1); + if (ret < 0) + goto bail; + cks += cp->tech; + + ret = qsfp_read(ppd, QSFP_VEND_OFFS, &cp->vendor, QSFP_VEND_LEN); + if (ret < 0) + goto bail; + for (idx = 0; idx < QSFP_VEND_LEN; ++idx) + cks += cp->vendor[idx]; + + ret = qsfp_read(ppd, QSFP_IBXCV_OFFS, &cp->xt_xcv, 1); + if (ret < 0) + goto bail; + cks += cp->xt_xcv; + + ret = qsfp_read(ppd, QSFP_VOUI_OFFS, &cp->oui, QSFP_VOUI_LEN); + if (ret < 0) + goto bail; + for (idx = 0; idx < QSFP_VOUI_LEN; ++idx) + cks += cp->oui[idx]; + + ret = qsfp_read(ppd, QSFP_PN_OFFS, &cp->partnum, QSFP_PN_LEN); + if (ret < 0) + goto bail; + for (idx = 0; idx < QSFP_PN_LEN; ++idx) + cks += cp->partnum[idx]; + + ret = qsfp_read(ppd, QSFP_REV_OFFS, &cp->rev, QSFP_REV_LEN); + if (ret < 0) + goto bail; + for (idx = 0; idx < QSFP_REV_LEN; ++idx) + cks += cp->rev[idx]; + + ret = qsfp_read(ppd, QSFP_ATTEN_OFFS, &cp->atten, QSFP_ATTEN_LEN); + if (ret < 0) + goto bail; + for (idx = 0; idx < QSFP_ATTEN_LEN; ++idx) + cks += cp->atten[idx]; + + ret = qsfp_cks(ppd, QSFP_ATTEN_OFFS + QSFP_ATTEN_LEN, QSFP_CC_OFFS); + if (ret < 0) + goto bail; + cks += ret; + + cks &= 0xFF; + ret = qsfp_read(ppd, QSFP_CC_OFFS, &cp->cks1, 1); + if (ret < 0) + goto bail; + if (cks != cp->cks1) + qib_dev_porterr(ppd->dd, ppd->port, + "QSFP cks1 is %02X, computed %02X\n", cp->cks1, + cks); + + /* Second checksum covers 192 to (serial, date, lot) */ + ret = qsfp_cks(ppd, QSFP_CC_OFFS + 1, QSFP_SN_OFFS); + if (ret < 0) + goto bail; + cks = ret; + + ret = qsfp_read(ppd, QSFP_SN_OFFS, &cp->serial, QSFP_SN_LEN); + if (ret < 0) + goto bail; + for (idx = 0; idx < QSFP_SN_LEN; ++idx) + cks += cp->serial[idx]; + + ret = qsfp_read(ppd, QSFP_DATE_OFFS, &cp->date, QSFP_DATE_LEN); + if (ret < 0) + goto bail; + for (idx = 0; 
idx < QSFP_DATE_LEN; ++idx) + cks += cp->date[idx]; + + ret = qsfp_read(ppd, QSFP_LOT_OFFS, &cp->lot, QSFP_LOT_LEN); + if (ret < 0) + goto bail; + for (idx = 0; idx < QSFP_LOT_LEN; ++idx) + cks += cp->lot[idx]; + + ret = qsfp_cks(ppd, QSFP_LOT_OFFS + QSFP_LOT_LEN, QSFP_CC_EXT_OFFS); + if (ret < 0) + goto bail; + cks += ret; + + ret = qsfp_read(ppd, QSFP_CC_EXT_OFFS, &cp->cks2, 1); + if (ret < 0) + goto bail; + cks &= 0xFF; + if (cks != cp->cks2) + qib_dev_porterr(ppd->dd, ppd->port, + "QSFP cks2 is %02X, computed %02X\n", cp->cks2, + cks); + return 0; + +bail: + cp->id = 0; + return ret; +} + +const char * const qib_qsfp_devtech[16] = { + "850nm VCSEL", "1310nm VCSEL", "1550nm VCSEL", "1310nm FP", + "1310nm DFB", "1550nm DFB", "1310nm EML", "1550nm EML", + "Cu Misc", "1490nm DFB", "Cu NoEq", "Cu Eq", + "Undef", "Cu Active BothEq", "Cu FarEq", "Cu NearEq" +}; + +#define QSFP_DUMP_CHUNK 16 /* Holds longest string */ +#define QSFP_DEFAULT_HDR_CNT 224 + +static const char *pwr_codes = "1.5W2.0W2.5W3.5W"; + +int qib_qsfp_mod_present(struct qib_pportdata *ppd) +{ + u32 mask; + int ret; + + mask = QSFP_GPIO_MOD_PRS_N << + (ppd->hw_pidx * QSFP_GPIO_PORT2_SHIFT); + ret = ppd->dd->f_gpio_mod(ppd->dd, 0, 0, 0); + + return !((ret & mask) >> + ((ppd->hw_pidx * QSFP_GPIO_PORT2_SHIFT) + 3)); +} + +/* + * Initialize structures that control access to QSFP. Called once per port + * on cards that support QSFP. + */ +void qib_qsfp_init(struct qib_qsfp_data *qd, + void (*fevent)(struct work_struct *)) +{ + u32 mask, highs; + + struct qib_devdata *dd = qd->ppd->dd; + + /* Initialize work struct for later QSFP events */ + INIT_WORK(&qd->work, fevent); + + /* + * Later, we may want more validation. For now, just set up pins and + * blip reset. If module is present, call qib_refresh_qsfp_cache(), + * to do further init. + */ + mask = QSFP_GPIO_MOD_SEL_N | QSFP_GPIO_MOD_RST_N | QSFP_GPIO_LP_MODE; + highs = mask - QSFP_GPIO_MOD_RST_N; + if (qd->ppd->hw_pidx) { + mask <<= QSFP_GPIO_PORT2_SHIFT; + highs <<= QSFP_GPIO_PORT2_SHIFT; + } + dd->f_gpio_mod(dd, highs, mask, mask); + udelay(20); /* Generous RST dwell */ + + dd->f_gpio_mod(dd, mask, mask, mask); +} + +void qib_qsfp_deinit(struct qib_qsfp_data *qd) +{ + /* + * There is nothing to do here for now. our work is scheduled + * with queue_work(), and flush_workqueue() from remove_one + * will block until all work setup with queue_work() + * completes. 
+ */ +} + +int qib_qsfp_dump(struct qib_pportdata *ppd, char *buf, int len) +{ + struct qib_qsfp_cache cd; + u8 bin_buff[QSFP_DUMP_CHUNK]; + char lenstr[6]; + int sofar, ret; + int bidx = 0; + + sofar = 0; + ret = qib_refresh_qsfp_cache(ppd, &cd); + if (ret < 0) + goto bail; + + lenstr[0] = ' '; + lenstr[1] = '\0'; + if (QSFP_IS_CU(cd.tech)) + sprintf(lenstr, "%dM ", cd.len); + + sofar += scnprintf(buf + sofar, len - sofar, "PWR:%.3sW\n", pwr_codes + + (QSFP_PWR(cd.pwr) * 4)); + + sofar += scnprintf(buf + sofar, len - sofar, "TECH:%s%s\n", lenstr, + qib_qsfp_devtech[cd.tech >> 4]); + + sofar += scnprintf(buf + sofar, len - sofar, "Vendor:%.*s\n", + QSFP_VEND_LEN, cd.vendor); + + sofar += scnprintf(buf + sofar, len - sofar, "OUI:%06X\n", + QSFP_OUI(cd.oui)); + + sofar += scnprintf(buf + sofar, len - sofar, "Part#:%.*s\n", + QSFP_PN_LEN, cd.partnum); + sofar += scnprintf(buf + sofar, len - sofar, "Rev:%.*s\n", + QSFP_REV_LEN, cd.rev); + if (QSFP_IS_CU(cd.tech)) + sofar += scnprintf(buf + sofar, len - sofar, "Atten:%d, %d\n", + QSFP_ATTEN_SDR(cd.atten), + QSFP_ATTEN_DDR(cd.atten)); + sofar += scnprintf(buf + sofar, len - sofar, "Serial:%.*s\n", + QSFP_SN_LEN, cd.serial); + sofar += scnprintf(buf + sofar, len - sofar, "Date:%.*s\n", + QSFP_DATE_LEN, cd.date); + sofar += scnprintf(buf + sofar, len - sofar, "Lot:%.*s\n", + QSFP_LOT_LEN, cd.date); + + while (bidx < QSFP_DEFAULT_HDR_CNT) { + int iidx; + + ret = qsfp_read(ppd, bidx, bin_buff, QSFP_DUMP_CHUNK); + if (ret < 0) + goto bail; + for (iidx = 0; iidx < ret; ++iidx) { + sofar += scnprintf(buf + sofar, len-sofar, " %02X", + bin_buff[iidx]); + } + sofar += scnprintf(buf + sofar, len - sofar, "\n"); + bidx += QSFP_DUMP_CHUNK; + } + ret = sofar; +bail: + return ret; +} diff --git a/kernel/drivers/infiniband/hw/qib/qib_qsfp.h b/kernel/drivers/infiniband/hw/qib/qib_qsfp.h new file mode 100644 index 000000000..91908f533 --- /dev/null +++ b/kernel/drivers/infiniband/hw/qib/qib_qsfp.h @@ -0,0 +1,189 @@ +/* + * Copyright (c) 2006, 2007, 2008, 2009 QLogic Corporation. All rights reserved. + * Copyright (c) 2003, 2004, 2005, 2006 PathScale, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +/* QSFP support common definitions, for ib_qib driver */ + +#define QSFP_DEV 0xA0 +#define QSFP_PWR_LAG_MSEC 2000 +#define QSFP_MODPRS_LAG_MSEC 20 + +/* + * Below are masks for various QSFP signals, for Port 1. + * Port2 equivalents are shifted by QSFP_GPIO_PORT2_SHIFT. + * _N means asserted low + */ +#define QSFP_GPIO_MOD_SEL_N (4) +#define QSFP_GPIO_MOD_PRS_N (8) +#define QSFP_GPIO_INT_N (0x10) +#define QSFP_GPIO_MOD_RST_N (0x20) +#define QSFP_GPIO_LP_MODE (0x40) +#define QSFP_GPIO_PORT2_SHIFT 5 + +#define QSFP_PAGESIZE 128 +/* Defined fields that QLogic requires of qualified cables */ +/* Byte 0 is Identifier, not checked */ +/* Byte 1 is reserved "status MSB" */ +/* Byte 2 is "status LSB" We only care that D2 "Flat Mem" is set. */ +/* + * Rest of first 128 not used, although 127 is reserved for page select + * if module is not "Flat memory". + */ +/* Byte 128 is Identifier: must be 0x0c for QSFP, or 0x0d for QSFP+ */ +#define QSFP_MOD_ID_OFFS 128 +/* + * Byte 129 is "Extended Identifier". We only care about D7,D6: Power class + * 0:1.5W, 1:2.0W, 2:2.5W, 3:3.5W + */ +#define QSFP_MOD_PWR_OFFS 129 +/* Byte 130 is Connector type. Not QLogic req'd */ +/* Bytes 131..138 are Transceiver types, bit maps for various tech, none IB */ +/* Byte 139 is encoding. code 0x01 is 8b10b. Not QLogic req'd */ +/* byte 140 is nominal bit-rate, in units of 100Mbits/sec Not QLogic req'd */ +/* Byte 141 is Extended Rate Select. Not QLogic req'd */ +/* Bytes 142..145 are lengths for various fiber types. Not QLogic req'd */ +/* Byte 146 is length for Copper. Units of 1 meter */ +#define QSFP_MOD_LEN_OFFS 146 +/* + * Byte 147 is Device technology. D0..3 not Qlogc req'd + * D4..7 select from 15 choices, translated by table: + */ +#define QSFP_MOD_TECH_OFFS 147 +extern const char *const qib_qsfp_devtech[16]; +/* Active Equalization includes fiber, copper full EQ, and copper near Eq */ +#define QSFP_IS_ACTIVE(tech) ((0xA2FF >> ((tech) >> 4)) & 1) +/* Active Equalization includes fiber, copper full EQ, and copper far Eq */ +#define QSFP_IS_ACTIVE_FAR(tech) ((0x32FF >> ((tech) >> 4)) & 1) +/* Attenuation should be valid for copper other than full/near Eq */ +#define QSFP_HAS_ATTEN(tech) ((0x4D00 >> ((tech) >> 4)) & 1) +/* Length is only valid if technology is "copper" */ +#define QSFP_IS_CU(tech) ((0xED00 >> ((tech) >> 4)) & 1) +#define QSFP_TECH_1490 9 + +#define QSFP_OUI(oui) (((unsigned)oui[0] << 16) | ((unsigned)oui[1] << 8) | \ + oui[2]) +#define QSFP_OUI_AMPHENOL 0x415048 +#define QSFP_OUI_FINISAR 0x009065 +#define QSFP_OUI_GORE 0x002177 + +/* Bytes 148..163 are Vendor Name, Left-justified Blank-filled */ +#define QSFP_VEND_OFFS 148 +#define QSFP_VEND_LEN 16 +/* Byte 164 is IB Extended tranceiver codes Bits D0..3 are SDR,DDR,QDR,EDR */ +#define QSFP_IBXCV_OFFS 164 +/* Bytes 165..167 are Vendor OUI number */ +#define QSFP_VOUI_OFFS 165 +#define QSFP_VOUI_LEN 3 +/* Bytes 168..183 are Vendor Part Number, string */ +#define QSFP_PN_OFFS 168 +#define QSFP_PN_LEN 16 +/* Bytes 184,185 are Vendor Rev. Left Justified, Blank-filled */ +#define QSFP_REV_OFFS 184 +#define QSFP_REV_LEN 2 +/* + * Bytes 186,187 are Wavelength, if Optical. Not Qlogic req'd + * If copper, they are attenuation in dB: + * Byte 186 is at 2.5Gb/sec (SDR), Byte 187 at 5.0Gb/sec (DDR) + */ +#define QSFP_ATTEN_OFFS 186 +#define QSFP_ATTEN_LEN 2 +/* Bytes 188,189 are Wavelength tolerance, not QLogic req'd */ +/* Byte 190 is Max Case Temp. Not QLogic req'd */ +/* Byte 191 is LSB of sum of bytes 128..190. 
Not QLogic req'd */ +#define QSFP_CC_OFFS 191 +/* Bytes 192..195 are Options implemented in qsfp. Not Qlogic req'd */ +/* Bytes 196..211 are Serial Number, String */ +#define QSFP_SN_OFFS 196 +#define QSFP_SN_LEN 16 +/* Bytes 212..219 are date-code YYMMDD (MM==1 for Jan) */ +#define QSFP_DATE_OFFS 212 +#define QSFP_DATE_LEN 6 +/* Bytes 218,219 are optional lot-code, string */ +#define QSFP_LOT_OFFS 218 +#define QSFP_LOT_LEN 2 +/* Bytes 220, 221 indicate monitoring options, Not QLogic req'd */ +/* Byte 223 is LSB of sum of bytes 192..222 */ +#define QSFP_CC_EXT_OFFS 223 + +/* + * struct qib_qsfp_data encapsulates state of QSFP device for one port. + * it will be part of port-chip-specific data if a board supports QSFP. + * + * Since multiple board-types use QSFP, and their pport_data structs + * differ (in the chip-specific section), we need a pointer to its head. + * + * Avoiding premature optimization, we will have one work_struct per port, + * and let the (increasingly inaccurately named) eep_lock arbitrate + * access to common resources. + * + */ + +/* + * Hold the parts of the onboard EEPROM that we care about, so we aren't + * coonstantly bit-boffing + */ +struct qib_qsfp_cache { + u8 id; /* must be 0x0C or 0x0D; 0 indicates invalid EEPROM read */ + u8 pwr; /* in D6,7 */ + u8 len; /* in meters, Cu only */ + u8 tech; + char vendor[QSFP_VEND_LEN]; + u8 xt_xcv; /* Ext. tranceiver codes, 4 lsbs are IB speed supported */ + u8 oui[QSFP_VOUI_LEN]; + u8 partnum[QSFP_PN_LEN]; + u8 rev[QSFP_REV_LEN]; + u8 atten[QSFP_ATTEN_LEN]; + u8 cks1; /* Checksum of bytes 128..190 */ + u8 serial[QSFP_SN_LEN]; + u8 date[QSFP_DATE_LEN]; + u8 lot[QSFP_LOT_LEN]; + u8 cks2; /* Checsum of bytes 192..222 */ +}; + +#define QSFP_PWR(pbyte) (((pbyte) >> 6) & 3) +#define QSFP_ATTEN_SDR(attenarray) (attenarray[0]) +#define QSFP_ATTEN_DDR(attenarray) (attenarray[1]) + +struct qib_qsfp_data { + /* Helps to find our way */ + struct qib_pportdata *ppd; + struct work_struct work; + struct qib_qsfp_cache cache; + unsigned long t_insert; + u8 modpresent; +}; + +extern int qib_refresh_qsfp_cache(struct qib_pportdata *ppd, + struct qib_qsfp_cache *cp); +extern int qib_qsfp_mod_present(struct qib_pportdata *ppd); +extern void qib_qsfp_init(struct qib_qsfp_data *qd, + void (*fevent)(struct work_struct *)); +extern void qib_qsfp_deinit(struct qib_qsfp_data *qd); diff --git a/kernel/drivers/infiniband/hw/qib/qib_rc.c b/kernel/drivers/infiniband/hw/qib/qib_rc.c new file mode 100644 index 000000000..4544d6f88 --- /dev/null +++ b/kernel/drivers/infiniband/hw/qib/qib_rc.c @@ -0,0 +1,2290 @@ +/* + * Copyright (c) 2006, 2007, 2008, 2009 QLogic Corporation. All rights reserved. + * Copyright (c) 2005, 2006 PathScale, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. 
+ * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include <linux/io.h> + +#include "qib.h" + +/* cut down ridiculously long IB macro names */ +#define OP(x) IB_OPCODE_RC_##x + +static void rc_timeout(unsigned long arg); + +static u32 restart_sge(struct qib_sge_state *ss, struct qib_swqe *wqe, + u32 psn, u32 pmtu) +{ + u32 len; + + len = ((psn - wqe->psn) & QIB_PSN_MASK) * pmtu; + ss->sge = wqe->sg_list[0]; + ss->sg_list = wqe->sg_list + 1; + ss->num_sge = wqe->wr.num_sge; + ss->total_len = wqe->length; + qib_skip_sge(ss, len, 0); + return wqe->length - len; +} + +static void start_timer(struct qib_qp *qp) +{ + qp->s_flags |= QIB_S_TIMER; + qp->s_timer.function = rc_timeout; + /* 4.096 usec. * (1 << qp->timeout) */ + qp->s_timer.expires = jiffies + qp->timeout_jiffies; + add_timer(&qp->s_timer); +} + +/** + * qib_make_rc_ack - construct a response packet (ACK, NAK, or RDMA read) + * @dev: the device for this QP + * @qp: a pointer to the QP + * @ohdr: a pointer to the IB header being constructed + * @pmtu: the path MTU + * + * Return 1 if constructed; otherwise, return 0. + * Note that we are in the responder's side of the QP context. + * Note the QP s_lock must be held. + */ +static int qib_make_rc_ack(struct qib_ibdev *dev, struct qib_qp *qp, + struct qib_other_headers *ohdr, u32 pmtu) +{ + struct qib_ack_entry *e; + u32 hwords; + u32 len; + u32 bth0; + u32 bth2; + + /* Don't send an ACK if we aren't supposed to. */ + if (!(ib_qib_state_ops[qp->state] & QIB_PROCESS_RECV_OK)) + goto bail; + + /* header size in 32-bit words LRH+BTH = (8+12)/4. */ + hwords = 5; + + switch (qp->s_ack_state) { + case OP(RDMA_READ_RESPONSE_LAST): + case OP(RDMA_READ_RESPONSE_ONLY): + e = &qp->s_ack_queue[qp->s_tail_ack_queue]; + if (e->rdma_sge.mr) { + qib_put_mr(e->rdma_sge.mr); + e->rdma_sge.mr = NULL; + } + /* FALLTHROUGH */ + case OP(ATOMIC_ACKNOWLEDGE): + /* + * We can increment the tail pointer now that the last + * response has been sent instead of only being + * constructed. + */ + if (++qp->s_tail_ack_queue > QIB_MAX_RDMA_ATOMIC) + qp->s_tail_ack_queue = 0; + /* FALLTHROUGH */ + case OP(SEND_ONLY): + case OP(ACKNOWLEDGE): + /* Check for no next entry in the queue. */ + if (qp->r_head_ack_queue == qp->s_tail_ack_queue) { + if (qp->s_flags & QIB_S_ACK_PENDING) + goto normal; + goto bail; + } + + e = &qp->s_ack_queue[qp->s_tail_ack_queue]; + if (e->opcode == OP(RDMA_READ_REQUEST)) { + /* + * If a RDMA read response is being resent and + * we haven't seen the duplicate request yet, + * then stop sending the remaining responses the + * responder has seen until the requester resends it.
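A standalone sketch, not from the imported sources: the "4.096 usec. * (1 << qp->timeout)" comment in start_timer() above and the timeout_jiffies computation back in qib_qp.c are both the InfiniBand local ACK timeout, 4.096 us times 2 to the power of the 5-bit timeout code. The helper name ack_timeout_usec() is an assumption for this sketch, and the arithmetic assumes a 64-bit unsigned long for the larger codes.

#include <stdio.h>

/* 4096 * 2^code is the timeout in nanoseconds; / 1000 gives microseconds. */
static unsigned long ack_timeout_usec(unsigned int code)
{
	return (4096UL * (1UL << code)) / 1000UL;
}

int main(void)
{
	/* code 14 -> roughly 67 ms, code 19 -> roughly 2.1 s */
	printf("%lu usec, %lu usec\n",
	       ack_timeout_usec(14), ack_timeout_usec(19));
	return 0;
}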
+ */ + len = e->rdma_sge.sge_length; + if (len && !e->rdma_sge.mr) { + qp->s_tail_ack_queue = qp->r_head_ack_queue; + goto bail; + } + /* Copy SGE state in case we need to resend */ + qp->s_rdma_mr = e->rdma_sge.mr; + if (qp->s_rdma_mr) + qib_get_mr(qp->s_rdma_mr); + qp->s_ack_rdma_sge.sge = e->rdma_sge; + qp->s_ack_rdma_sge.num_sge = 1; + qp->s_cur_sge = &qp->s_ack_rdma_sge; + if (len > pmtu) { + len = pmtu; + qp->s_ack_state = OP(RDMA_READ_RESPONSE_FIRST); + } else { + qp->s_ack_state = OP(RDMA_READ_RESPONSE_ONLY); + e->sent = 1; + } + ohdr->u.aeth = qib_compute_aeth(qp); + hwords++; + qp->s_ack_rdma_psn = e->psn; + bth2 = qp->s_ack_rdma_psn++ & QIB_PSN_MASK; + } else { + /* COMPARE_SWAP or FETCH_ADD */ + qp->s_cur_sge = NULL; + len = 0; + qp->s_ack_state = OP(ATOMIC_ACKNOWLEDGE); + ohdr->u.at.aeth = qib_compute_aeth(qp); + ohdr->u.at.atomic_ack_eth[0] = + cpu_to_be32(e->atomic_data >> 32); + ohdr->u.at.atomic_ack_eth[1] = + cpu_to_be32(e->atomic_data); + hwords += sizeof(ohdr->u.at) / sizeof(u32); + bth2 = e->psn & QIB_PSN_MASK; + e->sent = 1; + } + bth0 = qp->s_ack_state << 24; + break; + + case OP(RDMA_READ_RESPONSE_FIRST): + qp->s_ack_state = OP(RDMA_READ_RESPONSE_MIDDLE); + /* FALLTHROUGH */ + case OP(RDMA_READ_RESPONSE_MIDDLE): + qp->s_cur_sge = &qp->s_ack_rdma_sge; + qp->s_rdma_mr = qp->s_ack_rdma_sge.sge.mr; + if (qp->s_rdma_mr) + qib_get_mr(qp->s_rdma_mr); + len = qp->s_ack_rdma_sge.sge.sge_length; + if (len > pmtu) + len = pmtu; + else { + ohdr->u.aeth = qib_compute_aeth(qp); + hwords++; + qp->s_ack_state = OP(RDMA_READ_RESPONSE_LAST); + e = &qp->s_ack_queue[qp->s_tail_ack_queue]; + e->sent = 1; + } + bth0 = qp->s_ack_state << 24; + bth2 = qp->s_ack_rdma_psn++ & QIB_PSN_MASK; + break; + + default: +normal: + /* + * Send a regular ACK. + * Set the s_ack_state so we wait until after sending + * the ACK before setting s_ack_state to ACKNOWLEDGE + * (see above). + */ + qp->s_ack_state = OP(SEND_ONLY); + qp->s_flags &= ~QIB_S_ACK_PENDING; + qp->s_cur_sge = NULL; + if (qp->s_nak_state) + ohdr->u.aeth = + cpu_to_be32((qp->r_msn & QIB_MSN_MASK) | + (qp->s_nak_state << + QIB_AETH_CREDIT_SHIFT)); + else + ohdr->u.aeth = qib_compute_aeth(qp); + hwords++; + len = 0; + bth0 = OP(ACKNOWLEDGE) << 24; + bth2 = qp->s_ack_psn & QIB_PSN_MASK; + } + qp->s_rdma_ack_cnt++; + qp->s_hdrwords = hwords; + qp->s_cur_size = len; + qib_make_ruc_header(qp, ohdr, bth0, bth2); + return 1; + +bail: + qp->s_ack_state = OP(ACKNOWLEDGE); + qp->s_flags &= ~(QIB_S_RESP_PENDING | QIB_S_ACK_PENDING); + return 0; +} + +/** + * qib_make_rc_req - construct a request packet (SEND, RDMA r/w, ATOMIC) + * @qp: a pointer to the QP + * + * Return 1 if constructed; otherwise, return 0. + */ +int qib_make_rc_req(struct qib_qp *qp) +{ + struct qib_ibdev *dev = to_idev(qp->ibqp.device); + struct qib_other_headers *ohdr; + struct qib_sge_state *ss; + struct qib_swqe *wqe; + u32 hwords; + u32 len; + u32 bth0; + u32 bth2; + u32 pmtu = qp->pmtu; + char newreq; + unsigned long flags; + int ret = 0; + int delta; + + ohdr = &qp->s_hdr->u.oth; + if (qp->remote_ah_attr.ah_flags & IB_AH_GRH) + ohdr = &qp->s_hdr->u.l.oth; + + /* + * The lock is needed to synchronize between the sending tasklet, + * the receive interrupt handler, and timeout resends. + */ + spin_lock_irqsave(&qp->s_lock, flags); + + /* Sending responses has higher priority over sending requests. 
*/ + if ((qp->s_flags & QIB_S_RESP_PENDING) && + qib_make_rc_ack(dev, qp, ohdr, pmtu)) + goto done; + + if (!(ib_qib_state_ops[qp->state] & QIB_PROCESS_SEND_OK)) { + if (!(ib_qib_state_ops[qp->state] & QIB_FLUSH_SEND)) + goto bail; + /* We are in the error state, flush the work request. */ + if (qp->s_last == qp->s_head) + goto bail; + /* If DMAs are in progress, we can't flush immediately. */ + if (atomic_read(&qp->s_dma_busy)) { + qp->s_flags |= QIB_S_WAIT_DMA; + goto bail; + } + wqe = get_swqe_ptr(qp, qp->s_last); + qib_send_complete(qp, wqe, qp->s_last != qp->s_acked ? + IB_WC_SUCCESS : IB_WC_WR_FLUSH_ERR); + /* will get called again */ + goto done; + } + + if (qp->s_flags & (QIB_S_WAIT_RNR | QIB_S_WAIT_ACK)) + goto bail; + + if (qib_cmp24(qp->s_psn, qp->s_sending_hpsn) <= 0) { + if (qib_cmp24(qp->s_sending_psn, qp->s_sending_hpsn) <= 0) { + qp->s_flags |= QIB_S_WAIT_PSN; + goto bail; + } + qp->s_sending_psn = qp->s_psn; + qp->s_sending_hpsn = qp->s_psn - 1; + } + + /* header size in 32-bit words LRH+BTH = (8+12)/4. */ + hwords = 5; + bth0 = 0; + + /* Send a request. */ + wqe = get_swqe_ptr(qp, qp->s_cur); + switch (qp->s_state) { + default: + if (!(ib_qib_state_ops[qp->state] & QIB_PROCESS_NEXT_SEND_OK)) + goto bail; + /* + * Resend an old request or start a new one. + * + * We keep track of the current SWQE so that + * we don't reset the "furthest progress" state + * if we need to back up. + */ + newreq = 0; + if (qp->s_cur == qp->s_tail) { + /* Check if send work queue is empty. */ + if (qp->s_tail == qp->s_head) + goto bail; + /* + * If a fence is requested, wait for previous + * RDMA read and atomic operations to finish. + */ + if ((wqe->wr.send_flags & IB_SEND_FENCE) && + qp->s_num_rd_atomic) { + qp->s_flags |= QIB_S_WAIT_FENCE; + goto bail; + } + wqe->psn = qp->s_next_psn; + newreq = 1; + } + /* + * Note that we have to be careful not to modify the + * original work request since we may need to resend + * it. + */ + len = wqe->length; + ss = &qp->s_sge; + bth2 = qp->s_psn & QIB_PSN_MASK; + switch (wqe->wr.opcode) { + case IB_WR_SEND: + case IB_WR_SEND_WITH_IMM: + /* If no credit, return. */ + if (!(qp->s_flags & QIB_S_UNLIMITED_CREDIT) && + qib_cmp24(wqe->ssn, qp->s_lsn + 1) > 0) { + qp->s_flags |= QIB_S_WAIT_SSN_CREDIT; + goto bail; + } + wqe->lpsn = wqe->psn; + if (len > pmtu) { + wqe->lpsn += (len - 1) / pmtu; + qp->s_state = OP(SEND_FIRST); + len = pmtu; + break; + } + if (wqe->wr.opcode == IB_WR_SEND) + qp->s_state = OP(SEND_ONLY); + else { + qp->s_state = OP(SEND_ONLY_WITH_IMMEDIATE); + /* Immediate data comes after the BTH */ + ohdr->u.imm_data = wqe->wr.ex.imm_data; + hwords += 1; + } + if (wqe->wr.send_flags & IB_SEND_SOLICITED) + bth0 |= IB_BTH_SOLICITED; + bth2 |= IB_BTH_REQ_ACK; + if (++qp->s_cur == qp->s_size) + qp->s_cur = 0; + break; + + case IB_WR_RDMA_WRITE: + if (newreq && !(qp->s_flags & QIB_S_UNLIMITED_CREDIT)) + qp->s_lsn++; + /* FALLTHROUGH */ + case IB_WR_RDMA_WRITE_WITH_IMM: + /* If no credit, return. 
*/ + if (!(qp->s_flags & QIB_S_UNLIMITED_CREDIT) && + qib_cmp24(wqe->ssn, qp->s_lsn + 1) > 0) { + qp->s_flags |= QIB_S_WAIT_SSN_CREDIT; + goto bail; + } + ohdr->u.rc.reth.vaddr = + cpu_to_be64(wqe->wr.wr.rdma.remote_addr); + ohdr->u.rc.reth.rkey = + cpu_to_be32(wqe->wr.wr.rdma.rkey); + ohdr->u.rc.reth.length = cpu_to_be32(len); + hwords += sizeof(struct ib_reth) / sizeof(u32); + wqe->lpsn = wqe->psn; + if (len > pmtu) { + wqe->lpsn += (len - 1) / pmtu; + qp->s_state = OP(RDMA_WRITE_FIRST); + len = pmtu; + break; + } + if (wqe->wr.opcode == IB_WR_RDMA_WRITE) + qp->s_state = OP(RDMA_WRITE_ONLY); + else { + qp->s_state = + OP(RDMA_WRITE_ONLY_WITH_IMMEDIATE); + /* Immediate data comes after RETH */ + ohdr->u.rc.imm_data = wqe->wr.ex.imm_data; + hwords += 1; + if (wqe->wr.send_flags & IB_SEND_SOLICITED) + bth0 |= IB_BTH_SOLICITED; + } + bth2 |= IB_BTH_REQ_ACK; + if (++qp->s_cur == qp->s_size) + qp->s_cur = 0; + break; + + case IB_WR_RDMA_READ: + /* + * Don't allow more operations to be started + * than the QP limits allow. + */ + if (newreq) { + if (qp->s_num_rd_atomic >= + qp->s_max_rd_atomic) { + qp->s_flags |= QIB_S_WAIT_RDMAR; + goto bail; + } + qp->s_num_rd_atomic++; + if (!(qp->s_flags & QIB_S_UNLIMITED_CREDIT)) + qp->s_lsn++; + /* + * Adjust s_next_psn to count the + * expected number of responses. + */ + if (len > pmtu) + qp->s_next_psn += (len - 1) / pmtu; + wqe->lpsn = qp->s_next_psn++; + } + ohdr->u.rc.reth.vaddr = + cpu_to_be64(wqe->wr.wr.rdma.remote_addr); + ohdr->u.rc.reth.rkey = + cpu_to_be32(wqe->wr.wr.rdma.rkey); + ohdr->u.rc.reth.length = cpu_to_be32(len); + qp->s_state = OP(RDMA_READ_REQUEST); + hwords += sizeof(ohdr->u.rc.reth) / sizeof(u32); + ss = NULL; + len = 0; + bth2 |= IB_BTH_REQ_ACK; + if (++qp->s_cur == qp->s_size) + qp->s_cur = 0; + break; + + case IB_WR_ATOMIC_CMP_AND_SWP: + case IB_WR_ATOMIC_FETCH_AND_ADD: + /* + * Don't allow more operations to be started + * than the QP limits allow. 
+ */ + if (newreq) { + if (qp->s_num_rd_atomic >= + qp->s_max_rd_atomic) { + qp->s_flags |= QIB_S_WAIT_RDMAR; + goto bail; + } + qp->s_num_rd_atomic++; + if (!(qp->s_flags & QIB_S_UNLIMITED_CREDIT)) + qp->s_lsn++; + wqe->lpsn = wqe->psn; + } + if (wqe->wr.opcode == IB_WR_ATOMIC_CMP_AND_SWP) { + qp->s_state = OP(COMPARE_SWAP); + ohdr->u.atomic_eth.swap_data = cpu_to_be64( + wqe->wr.wr.atomic.swap); + ohdr->u.atomic_eth.compare_data = cpu_to_be64( + wqe->wr.wr.atomic.compare_add); + } else { + qp->s_state = OP(FETCH_ADD); + ohdr->u.atomic_eth.swap_data = cpu_to_be64( + wqe->wr.wr.atomic.compare_add); + ohdr->u.atomic_eth.compare_data = 0; + } + ohdr->u.atomic_eth.vaddr[0] = cpu_to_be32( + wqe->wr.wr.atomic.remote_addr >> 32); + ohdr->u.atomic_eth.vaddr[1] = cpu_to_be32( + wqe->wr.wr.atomic.remote_addr); + ohdr->u.atomic_eth.rkey = cpu_to_be32( + wqe->wr.wr.atomic.rkey); + hwords += sizeof(struct ib_atomic_eth) / sizeof(u32); + ss = NULL; + len = 0; + bth2 |= IB_BTH_REQ_ACK; + if (++qp->s_cur == qp->s_size) + qp->s_cur = 0; + break; + + default: + goto bail; + } + qp->s_sge.sge = wqe->sg_list[0]; + qp->s_sge.sg_list = wqe->sg_list + 1; + qp->s_sge.num_sge = wqe->wr.num_sge; + qp->s_sge.total_len = wqe->length; + qp->s_len = wqe->length; + if (newreq) { + qp->s_tail++; + if (qp->s_tail >= qp->s_size) + qp->s_tail = 0; + } + if (wqe->wr.opcode == IB_WR_RDMA_READ) + qp->s_psn = wqe->lpsn + 1; + else { + qp->s_psn++; + if (qib_cmp24(qp->s_psn, qp->s_next_psn) > 0) + qp->s_next_psn = qp->s_psn; + } + break; + + case OP(RDMA_READ_RESPONSE_FIRST): + /* + * qp->s_state is normally set to the opcode of the + * last packet constructed for new requests and therefore + * is never set to RDMA read response. + * RDMA_READ_RESPONSE_FIRST is used by the ACK processing + * thread to indicate a SEND needs to be restarted from an + * earlier PSN without interferring with the sending thread. + * See qib_restart_rc(). + */ + qp->s_len = restart_sge(&qp->s_sge, wqe, qp->s_psn, pmtu); + /* FALLTHROUGH */ + case OP(SEND_FIRST): + qp->s_state = OP(SEND_MIDDLE); + /* FALLTHROUGH */ + case OP(SEND_MIDDLE): + bth2 = qp->s_psn++ & QIB_PSN_MASK; + if (qib_cmp24(qp->s_psn, qp->s_next_psn) > 0) + qp->s_next_psn = qp->s_psn; + ss = &qp->s_sge; + len = qp->s_len; + if (len > pmtu) { + len = pmtu; + break; + } + if (wqe->wr.opcode == IB_WR_SEND) + qp->s_state = OP(SEND_LAST); + else { + qp->s_state = OP(SEND_LAST_WITH_IMMEDIATE); + /* Immediate data comes after the BTH */ + ohdr->u.imm_data = wqe->wr.ex.imm_data; + hwords += 1; + } + if (wqe->wr.send_flags & IB_SEND_SOLICITED) + bth0 |= IB_BTH_SOLICITED; + bth2 |= IB_BTH_REQ_ACK; + qp->s_cur++; + if (qp->s_cur >= qp->s_size) + qp->s_cur = 0; + break; + + case OP(RDMA_READ_RESPONSE_LAST): + /* + * qp->s_state is normally set to the opcode of the + * last packet constructed for new requests and therefore + * is never set to RDMA read response. + * RDMA_READ_RESPONSE_LAST is used by the ACK processing + * thread to indicate a RDMA write needs to be restarted from + * an earlier PSN without interferring with the sending thread. + * See qib_restart_rc(). 
+ */ + qp->s_len = restart_sge(&qp->s_sge, wqe, qp->s_psn, pmtu); + /* FALLTHROUGH */ + case OP(RDMA_WRITE_FIRST): + qp->s_state = OP(RDMA_WRITE_MIDDLE); + /* FALLTHROUGH */ + case OP(RDMA_WRITE_MIDDLE): + bth2 = qp->s_psn++ & QIB_PSN_MASK; + if (qib_cmp24(qp->s_psn, qp->s_next_psn) > 0) + qp->s_next_psn = qp->s_psn; + ss = &qp->s_sge; + len = qp->s_len; + if (len > pmtu) { + len = pmtu; + break; + } + if (wqe->wr.opcode == IB_WR_RDMA_WRITE) + qp->s_state = OP(RDMA_WRITE_LAST); + else { + qp->s_state = OP(RDMA_WRITE_LAST_WITH_IMMEDIATE); + /* Immediate data comes after the BTH */ + ohdr->u.imm_data = wqe->wr.ex.imm_data; + hwords += 1; + if (wqe->wr.send_flags & IB_SEND_SOLICITED) + bth0 |= IB_BTH_SOLICITED; + } + bth2 |= IB_BTH_REQ_ACK; + qp->s_cur++; + if (qp->s_cur >= qp->s_size) + qp->s_cur = 0; + break; + + case OP(RDMA_READ_RESPONSE_MIDDLE): + /* + * qp->s_state is normally set to the opcode of the + * last packet constructed for new requests and therefore + * is never set to RDMA read response. + * RDMA_READ_RESPONSE_MIDDLE is used by the ACK processing + * thread to indicate a RDMA read needs to be restarted from + * an earlier PSN without interferring with the sending thread. + * See qib_restart_rc(). + */ + len = ((qp->s_psn - wqe->psn) & QIB_PSN_MASK) * pmtu; + ohdr->u.rc.reth.vaddr = + cpu_to_be64(wqe->wr.wr.rdma.remote_addr + len); + ohdr->u.rc.reth.rkey = + cpu_to_be32(wqe->wr.wr.rdma.rkey); + ohdr->u.rc.reth.length = cpu_to_be32(wqe->length - len); + qp->s_state = OP(RDMA_READ_REQUEST); + hwords += sizeof(ohdr->u.rc.reth) / sizeof(u32); + bth2 = (qp->s_psn & QIB_PSN_MASK) | IB_BTH_REQ_ACK; + qp->s_psn = wqe->lpsn + 1; + ss = NULL; + len = 0; + qp->s_cur++; + if (qp->s_cur == qp->s_size) + qp->s_cur = 0; + break; + } + qp->s_sending_hpsn = bth2; + delta = (((int) bth2 - (int) wqe->psn) << 8) >> 8; + if (delta && delta % QIB_PSN_CREDIT == 0) + bth2 |= IB_BTH_REQ_ACK; + if (qp->s_flags & QIB_S_SEND_ONE) { + qp->s_flags &= ~QIB_S_SEND_ONE; + qp->s_flags |= QIB_S_WAIT_ACK; + bth2 |= IB_BTH_REQ_ACK; + } + qp->s_len -= len; + qp->s_hdrwords = hwords; + qp->s_cur_sge = ss; + qp->s_cur_size = len; + qib_make_ruc_header(qp, ohdr, bth0 | (qp->s_state << 24), bth2); +done: + ret = 1; + goto unlock; + +bail: + qp->s_flags &= ~QIB_S_BUSY; +unlock: + spin_unlock_irqrestore(&qp->s_lock, flags); + return ret; +} + +/** + * qib_send_rc_ack - Construct an ACK packet and send it + * @qp: a pointer to the QP + * + * This is called from qib_rc_rcv() and qib_kreceive(). + * Note that RDMA reads and atomics are handled in the + * send side QP state and tasklet. + */ +void qib_send_rc_ack(struct qib_qp *qp) +{ + struct qib_devdata *dd = dd_from_ibdev(qp->ibqp.device); + struct qib_ibport *ibp = to_iport(qp->ibqp.device, qp->port_num); + struct qib_pportdata *ppd = ppd_from_ibp(ibp); + u64 pbc; + u16 lrh0; + u32 bth0; + u32 hwords; + u32 pbufn; + u32 __iomem *piobuf; + struct qib_ib_header hdr; + struct qib_other_headers *ohdr; + u32 control; + unsigned long flags; + + spin_lock_irqsave(&qp->s_lock, flags); + + if (!(ib_qib_state_ops[qp->state] & QIB_PROCESS_RECV_OK)) + goto unlock; + + /* Don't send ACK or NAK if a RDMA read or atomic is pending. */ + if ((qp->s_flags & QIB_S_RESP_PENDING) || qp->s_rdma_ack_cnt) + goto queue_ack; + + /* Construct the header with s_lock held so APM doesn't change it. */ + ohdr = &hdr.u.oth; + lrh0 = QIB_LRH_BTH; + /* header size in 32-bit words LRH+BTH+AETH = (8+12+4)/4. 
*/ + hwords = 6; + if (unlikely(qp->remote_ah_attr.ah_flags & IB_AH_GRH)) { + hwords += qib_make_grh(ibp, &hdr.u.l.grh, + &qp->remote_ah_attr.grh, hwords, 0); + ohdr = &hdr.u.l.oth; + lrh0 = QIB_LRH_GRH; + } + /* read pkey_index w/o lock (its atomic) */ + bth0 = qib_get_pkey(ibp, qp->s_pkey_index) | (OP(ACKNOWLEDGE) << 24); + if (qp->s_mig_state == IB_MIG_MIGRATED) + bth0 |= IB_BTH_MIG_REQ; + if (qp->r_nak_state) + ohdr->u.aeth = cpu_to_be32((qp->r_msn & QIB_MSN_MASK) | + (qp->r_nak_state << + QIB_AETH_CREDIT_SHIFT)); + else + ohdr->u.aeth = qib_compute_aeth(qp); + lrh0 |= ibp->sl_to_vl[qp->remote_ah_attr.sl] << 12 | + qp->remote_ah_attr.sl << 4; + hdr.lrh[0] = cpu_to_be16(lrh0); + hdr.lrh[1] = cpu_to_be16(qp->remote_ah_attr.dlid); + hdr.lrh[2] = cpu_to_be16(hwords + SIZE_OF_CRC); + hdr.lrh[3] = cpu_to_be16(ppd->lid | qp->remote_ah_attr.src_path_bits); + ohdr->bth[0] = cpu_to_be32(bth0); + ohdr->bth[1] = cpu_to_be32(qp->remote_qpn); + ohdr->bth[2] = cpu_to_be32(qp->r_ack_psn & QIB_PSN_MASK); + + spin_unlock_irqrestore(&qp->s_lock, flags); + + /* Don't try to send ACKs if the link isn't ACTIVE */ + if (!(ppd->lflags & QIBL_LINKACTIVE)) + goto done; + + control = dd->f_setpbc_control(ppd, hwords + SIZE_OF_CRC, + qp->s_srate, lrh0 >> 12); + /* length is + 1 for the control dword */ + pbc = ((u64) control << 32) | (hwords + 1); + + piobuf = dd->f_getsendbuf(ppd, pbc, &pbufn); + if (!piobuf) { + /* + * We are out of PIO buffers at the moment. + * Pass responsibility for sending the ACK to the + * send tasklet so that when a PIO buffer becomes + * available, the ACK is sent ahead of other outgoing + * packets. + */ + spin_lock_irqsave(&qp->s_lock, flags); + goto queue_ack; + } + + /* + * Write the pbc. + * We have to flush after the PBC for correctness + * on some cpus or WC buffer can be written out of order. + */ + writeq(pbc, piobuf); + + if (dd->flags & QIB_PIO_FLUSH_WC) { + u32 *hdrp = (u32 *) &hdr; + + qib_flush_wc(); + qib_pio_copy(piobuf + 2, hdrp, hwords - 1); + qib_flush_wc(); + __raw_writel(hdrp[hwords - 1], piobuf + hwords + 1); + } else + qib_pio_copy(piobuf + 2, (u32 *) &hdr, hwords); + + if (dd->flags & QIB_USE_SPCL_TRIG) { + u32 spcl_off = (pbufn >= dd->piobcnt2k) ? 2047 : 1023; + + qib_flush_wc(); + __raw_writel(0xaebecede, piobuf + spcl_off); + } + + qib_flush_wc(); + qib_sendbuf_done(dd, pbufn); + + this_cpu_inc(ibp->pmastats->n_unicast_xmit); + goto done; + +queue_ack: + if (ib_qib_state_ops[qp->state] & QIB_PROCESS_RECV_OK) { + ibp->n_rc_qacks++; + qp->s_flags |= QIB_S_ACK_PENDING | QIB_S_RESP_PENDING; + qp->s_nak_state = qp->r_nak_state; + qp->s_ack_psn = qp->r_ack_psn; + + /* Schedule the send tasklet. */ + qib_schedule_send(qp); + } +unlock: + spin_unlock_irqrestore(&qp->s_lock, flags); +done: + return; +} + +/** + * reset_psn - reset the QP state to send starting from PSN + * @qp: the QP + * @psn: the packet sequence number to restart at + * + * This is called from qib_rc_rcv() to process an incoming RC ACK + * for the given QP. + * Called at interrupt level with the QP s_lock held. + */ +static void reset_psn(struct qib_qp *qp, u32 psn) +{ + u32 n = qp->s_acked; + struct qib_swqe *wqe = get_swqe_ptr(qp, n); + u32 opcode; + + qp->s_cur = n; + + /* + * If we are starting the request from the beginning, + * let the normal send code handle initialization. + */ + if (qib_cmp24(psn, wqe->psn) <= 0) { + qp->s_state = OP(SEND_LAST); + goto done; + } + + /* Find the work request opcode corresponding to the given PSN. 
*/ + opcode = wqe->wr.opcode; + for (;;) { + int diff; + + if (++n == qp->s_size) + n = 0; + if (n == qp->s_tail) + break; + wqe = get_swqe_ptr(qp, n); + diff = qib_cmp24(psn, wqe->psn); + if (diff < 0) + break; + qp->s_cur = n; + /* + * If we are starting the request from the beginning, + * let the normal send code handle initialization. + */ + if (diff == 0) { + qp->s_state = OP(SEND_LAST); + goto done; + } + opcode = wqe->wr.opcode; + } + + /* + * Set the state to restart in the middle of a request. + * Don't change the s_sge, s_cur_sge, or s_cur_size. + * See qib_make_rc_req(). + */ + switch (opcode) { + case IB_WR_SEND: + case IB_WR_SEND_WITH_IMM: + qp->s_state = OP(RDMA_READ_RESPONSE_FIRST); + break; + + case IB_WR_RDMA_WRITE: + case IB_WR_RDMA_WRITE_WITH_IMM: + qp->s_state = OP(RDMA_READ_RESPONSE_LAST); + break; + + case IB_WR_RDMA_READ: + qp->s_state = OP(RDMA_READ_RESPONSE_MIDDLE); + break; + + default: + /* + * This case shouldn't happen since its only + * one PSN per req. + */ + qp->s_state = OP(SEND_LAST); + } +done: + qp->s_psn = psn; + /* + * Set QIB_S_WAIT_PSN as qib_rc_complete() may start the timer + * asynchronously before the send tasklet can get scheduled. + * Doing it in qib_make_rc_req() is too late. + */ + if ((qib_cmp24(qp->s_psn, qp->s_sending_hpsn) <= 0) && + (qib_cmp24(qp->s_sending_psn, qp->s_sending_hpsn) <= 0)) + qp->s_flags |= QIB_S_WAIT_PSN; +} + +/* + * Back up requester to resend the last un-ACKed request. + * The QP r_lock and s_lock should be held and interrupts disabled. + */ +static void qib_restart_rc(struct qib_qp *qp, u32 psn, int wait) +{ + struct qib_swqe *wqe = get_swqe_ptr(qp, qp->s_acked); + struct qib_ibport *ibp; + + if (qp->s_retry == 0) { + if (qp->s_mig_state == IB_MIG_ARMED) { + qib_migrate_qp(qp); + qp->s_retry = qp->s_retry_cnt; + } else if (qp->s_last == qp->s_acked) { + qib_send_complete(qp, wqe, IB_WC_RETRY_EXC_ERR); + qib_error_qp(qp, IB_WC_WR_FLUSH_ERR); + return; + } else /* XXX need to handle delayed completion */ + return; + } else + qp->s_retry--; + + ibp = to_iport(qp->ibqp.device, qp->port_num); + if (wqe->wr.opcode == IB_WR_RDMA_READ) + ibp->n_rc_resends++; + else + ibp->n_rc_resends += (qp->s_psn - psn) & QIB_PSN_MASK; + + qp->s_flags &= ~(QIB_S_WAIT_FENCE | QIB_S_WAIT_RDMAR | + QIB_S_WAIT_SSN_CREDIT | QIB_S_WAIT_PSN | + QIB_S_WAIT_ACK); + if (wait) + qp->s_flags |= QIB_S_SEND_ONE; + reset_psn(qp, psn); +} + +/* + * This is called from s_timer for missing responses. + */ +static void rc_timeout(unsigned long arg) +{ + struct qib_qp *qp = (struct qib_qp *)arg; + struct qib_ibport *ibp; + unsigned long flags; + + spin_lock_irqsave(&qp->r_lock, flags); + spin_lock(&qp->s_lock); + if (qp->s_flags & QIB_S_TIMER) { + ibp = to_iport(qp->ibqp.device, qp->port_num); + ibp->n_rc_timeouts++; + qp->s_flags &= ~QIB_S_TIMER; + del_timer(&qp->s_timer); + qib_restart_rc(qp, qp->s_last_psn + 1, 1); + qib_schedule_send(qp); + } + spin_unlock(&qp->s_lock); + spin_unlock_irqrestore(&qp->r_lock, flags); +} + +/* + * This is called from s_timer for RNR timeouts. + */ +void qib_rc_rnr_retry(unsigned long arg) +{ + struct qib_qp *qp = (struct qib_qp *)arg; + unsigned long flags; + + spin_lock_irqsave(&qp->s_lock, flags); + if (qp->s_flags & QIB_S_WAIT_RNR) { + qp->s_flags &= ~QIB_S_WAIT_RNR; + del_timer(&qp->s_timer); + qib_schedule_send(qp); + } + spin_unlock_irqrestore(&qp->s_lock, flags); +} + +/* + * Set qp->s_sending_psn to the next PSN after the given one. + * This would be psn+1 except when RDMA reads are present. 
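Illustration only, not part of the imported driver code: PSN arithmetic throughout this file is modulo 2^24, which is why ordering tests go through qib_cmp24() and idioms like the "(... << 8) >> 8" delta computation in qib_make_rc_req() rather than a plain "<". A minimal version of that comparison, assuming 32-bit int and an arithmetic right shift (as the kernel build does), with the name cmp24 chosen here:

#include <assert.h>
#include <stdint.h>

/*
 * Compare two 24-bit PSNs with wraparound: the unsigned difference is
 * shifted into the top 24 bits and back so that bit 23 acts as the
 * sign bit. Result is <0, 0 or >0, memcmp()-style.
 */
static int cmp24(uint32_t a, uint32_t b)
{
	return (int)((a - b) << 8) >> 8;
}

int main(void)
{
	assert(cmp24(0x000001, 0xFFFFFE) > 0); /* wrapped: 1 follows 0xFFFFFE */
	assert(cmp24(0x7FFFFF, 0x000000) > 0);
	assert(cmp24(0x800000, 0x000000) < 0); /* half the space away reads as earlier */
	assert(cmp24(0x123456, 0x123456) == 0);
	return 0;
}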
+ */ +static void reset_sending_psn(struct qib_qp *qp, u32 psn) +{ + struct qib_swqe *wqe; + u32 n = qp->s_last; + + /* Find the work request corresponding to the given PSN. */ + for (;;) { + wqe = get_swqe_ptr(qp, n); + if (qib_cmp24(psn, wqe->lpsn) <= 0) { + if (wqe->wr.opcode == IB_WR_RDMA_READ) + qp->s_sending_psn = wqe->lpsn + 1; + else + qp->s_sending_psn = psn + 1; + break; + } + if (++n == qp->s_size) + n = 0; + if (n == qp->s_tail) + break; + } +} + +/* + * This should be called with the QP s_lock held and interrupts disabled. + */ +void qib_rc_send_complete(struct qib_qp *qp, struct qib_ib_header *hdr) +{ + struct qib_other_headers *ohdr; + struct qib_swqe *wqe; + struct ib_wc wc; + unsigned i; + u32 opcode; + u32 psn; + + if (!(ib_qib_state_ops[qp->state] & QIB_PROCESS_OR_FLUSH_SEND)) + return; + + /* Find out where the BTH is */ + if ((be16_to_cpu(hdr->lrh[0]) & 3) == QIB_LRH_BTH) + ohdr = &hdr->u.oth; + else + ohdr = &hdr->u.l.oth; + + opcode = be32_to_cpu(ohdr->bth[0]) >> 24; + if (opcode >= OP(RDMA_READ_RESPONSE_FIRST) && + opcode <= OP(ATOMIC_ACKNOWLEDGE)) { + WARN_ON(!qp->s_rdma_ack_cnt); + qp->s_rdma_ack_cnt--; + return; + } + + psn = be32_to_cpu(ohdr->bth[2]); + reset_sending_psn(qp, psn); + + /* + * Start timer after a packet requesting an ACK has been sent and + * there are still requests that haven't been acked. + */ + if ((psn & IB_BTH_REQ_ACK) && qp->s_acked != qp->s_tail && + !(qp->s_flags & (QIB_S_TIMER | QIB_S_WAIT_RNR | QIB_S_WAIT_PSN)) && + (ib_qib_state_ops[qp->state] & QIB_PROCESS_RECV_OK)) + start_timer(qp); + + while (qp->s_last != qp->s_acked) { + wqe = get_swqe_ptr(qp, qp->s_last); + if (qib_cmp24(wqe->lpsn, qp->s_sending_psn) >= 0 && + qib_cmp24(qp->s_sending_psn, qp->s_sending_hpsn) <= 0) + break; + for (i = 0; i < wqe->wr.num_sge; i++) { + struct qib_sge *sge = &wqe->sg_list[i]; + + qib_put_mr(sge->mr); + } + /* Post a send completion queue entry if requested. */ + if (!(qp->s_flags & QIB_S_SIGNAL_REQ_WR) || + (wqe->wr.send_flags & IB_SEND_SIGNALED)) { + memset(&wc, 0, sizeof(wc)); + wc.wr_id = wqe->wr.wr_id; + wc.status = IB_WC_SUCCESS; + wc.opcode = ib_qib_wc_opcode[wqe->wr.opcode]; + wc.byte_len = wqe->length; + wc.qp = &qp->ibqp; + qib_cq_enter(to_icq(qp->ibqp.send_cq), &wc, 0); + } + if (++qp->s_last >= qp->s_size) + qp->s_last = 0; + } + /* + * If we were waiting for sends to complete before resending, + * and they are now complete, restart sending. + */ + if (qp->s_flags & QIB_S_WAIT_PSN && + qib_cmp24(qp->s_sending_psn, qp->s_sending_hpsn) > 0) { + qp->s_flags &= ~QIB_S_WAIT_PSN; + qp->s_sending_psn = qp->s_psn; + qp->s_sending_hpsn = qp->s_psn - 1; + qib_schedule_send(qp); + } +} + +static inline void update_last_psn(struct qib_qp *qp, u32 psn) +{ + qp->s_last_psn = psn; +} + +/* + * Generate a SWQE completion. + * This is similar to qib_send_complete but has to check to be sure + * that the SGEs are not being referenced if the SWQE is being resent. + */ +static struct qib_swqe *do_rc_completion(struct qib_qp *qp, + struct qib_swqe *wqe, + struct qib_ibport *ibp) +{ + struct ib_wc wc; + unsigned i; + + /* + * Don't decrement refcount and don't generate a + * completion if the SWQE is being resent until the send + * is finished. + */ + if (qib_cmp24(wqe->lpsn, qp->s_sending_psn) < 0 || + qib_cmp24(qp->s_sending_psn, qp->s_sending_hpsn) > 0) { + for (i = 0; i < wqe->wr.num_sge; i++) { + struct qib_sge *sge = &wqe->sg_list[i]; + + qib_put_mr(sge->mr); + } + /* Post a send completion queue entry if requested. 
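+ * ("Requested" means either the QP signals every send, i.e.
+ * QIB_S_SIGNAL_REQ_WR is clear, or the WQE itself set
+ * IB_SEND_SIGNALED.)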
*/ + if (!(qp->s_flags & QIB_S_SIGNAL_REQ_WR) || + (wqe->wr.send_flags & IB_SEND_SIGNALED)) { + memset(&wc, 0, sizeof(wc)); + wc.wr_id = wqe->wr.wr_id; + wc.status = IB_WC_SUCCESS; + wc.opcode = ib_qib_wc_opcode[wqe->wr.opcode]; + wc.byte_len = wqe->length; + wc.qp = &qp->ibqp; + qib_cq_enter(to_icq(qp->ibqp.send_cq), &wc, 0); + } + if (++qp->s_last >= qp->s_size) + qp->s_last = 0; + } else + ibp->n_rc_delayed_comp++; + + qp->s_retry = qp->s_retry_cnt; + update_last_psn(qp, wqe->lpsn); + + /* + * If we are completing a request which is in the process of + * being resent, we can stop resending it since we know the + * responder has already seen it. + */ + if (qp->s_acked == qp->s_cur) { + if (++qp->s_cur >= qp->s_size) + qp->s_cur = 0; + qp->s_acked = qp->s_cur; + wqe = get_swqe_ptr(qp, qp->s_cur); + if (qp->s_acked != qp->s_tail) { + qp->s_state = OP(SEND_LAST); + qp->s_psn = wqe->psn; + } + } else { + if (++qp->s_acked >= qp->s_size) + qp->s_acked = 0; + if (qp->state == IB_QPS_SQD && qp->s_acked == qp->s_cur) + qp->s_draining = 0; + wqe = get_swqe_ptr(qp, qp->s_acked); + } + return wqe; +} + +/** + * do_rc_ack - process an incoming RC ACK + * @qp: the QP the ACK came in on + * @psn: the packet sequence number of the ACK + * @opcode: the opcode of the request that resulted in the ACK + * + * This is called from qib_rc_rcv_resp() to process an incoming RC ACK + * for the given QP. + * Called at interrupt level with the QP s_lock held. + * Returns 1 if OK, 0 if current operation should be aborted (NAK). + */ +static int do_rc_ack(struct qib_qp *qp, u32 aeth, u32 psn, int opcode, + u64 val, struct qib_ctxtdata *rcd) +{ + struct qib_ibport *ibp; + enum ib_wc_status status; + struct qib_swqe *wqe; + int ret = 0; + u32 ack_psn; + int diff; + + /* Remove QP from retry timer */ + if (qp->s_flags & (QIB_S_TIMER | QIB_S_WAIT_RNR)) { + qp->s_flags &= ~(QIB_S_TIMER | QIB_S_WAIT_RNR); + del_timer(&qp->s_timer); + } + + /* + * Note that NAKs implicitly ACK outstanding SEND and RDMA write + * requests and implicitly NAK RDMA read and atomic requests issued + * before the NAK'ed request. The MSN won't include the NAK'ed + * request but will include an ACK'ed request(s). + */ + ack_psn = psn; + if (aeth >> 29) + ack_psn--; + wqe = get_swqe_ptr(qp, qp->s_acked); + ibp = to_iport(qp->ibqp.device, qp->port_num); + + /* + * The MSN might be for a later WQE than the PSN indicates so + * only complete WQEs that the PSN finishes. + */ + while ((diff = qib_cmp24(ack_psn, wqe->lpsn)) >= 0) { + /* + * RDMA_READ_RESPONSE_ONLY is a special case since + * we want to generate completion events for everything + * before the RDMA read, copy the data, then generate + * the completion for the read. + */ + if (wqe->wr.opcode == IB_WR_RDMA_READ && + opcode == OP(RDMA_READ_RESPONSE_ONLY) && + diff == 0) { + ret = 1; + goto bail; + } + /* + * If this request is a RDMA read or atomic, and the ACK is + * for a later operation, this ACK NAKs the RDMA read or + * atomic. In other words, only a RDMA_READ_LAST or ONLY + * can ACK a RDMA read and likewise for atomic ops. Note + * that the NAK case can only happen if relaxed ordering is + * used and requests are sent after an RDMA read or atomic + * is sent but before the response is received. + */ + if ((wqe->wr.opcode == IB_WR_RDMA_READ && + (opcode != OP(RDMA_READ_RESPONSE_LAST) || diff != 0)) || + ((wqe->wr.opcode == IB_WR_ATOMIC_CMP_AND_SWP || + wqe->wr.opcode == IB_WR_ATOMIC_FETCH_AND_ADD) && + (opcode != OP(ATOMIC_ACKNOWLEDGE) || diff != 0))) { + /* Retry this request. 
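+ * QIB_R_RDMAR_SEQ prevents restarting more than once for the same
+ * out-of-sequence response; queueing the QP on rcd->qp_wait_list
+ * with QIB_R_RSP_SEND defers rescheduling of the send engine until
+ * receive processing is done with this packet.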
*/ + if (!(qp->r_flags & QIB_R_RDMAR_SEQ)) { + qp->r_flags |= QIB_R_RDMAR_SEQ; + qib_restart_rc(qp, qp->s_last_psn + 1, 0); + if (list_empty(&qp->rspwait)) { + qp->r_flags |= QIB_R_RSP_SEND; + atomic_inc(&qp->refcount); + list_add_tail(&qp->rspwait, + &rcd->qp_wait_list); + } + } + /* + * No need to process the ACK/NAK since we are + * restarting an earlier request. + */ + goto bail; + } + if (wqe->wr.opcode == IB_WR_ATOMIC_CMP_AND_SWP || + wqe->wr.opcode == IB_WR_ATOMIC_FETCH_AND_ADD) { + u64 *vaddr = wqe->sg_list[0].vaddr; + *vaddr = val; + } + if (qp->s_num_rd_atomic && + (wqe->wr.opcode == IB_WR_RDMA_READ || + wqe->wr.opcode == IB_WR_ATOMIC_CMP_AND_SWP || + wqe->wr.opcode == IB_WR_ATOMIC_FETCH_AND_ADD)) { + qp->s_num_rd_atomic--; + /* Restart sending task if fence is complete */ + if ((qp->s_flags & QIB_S_WAIT_FENCE) && + !qp->s_num_rd_atomic) { + qp->s_flags &= ~(QIB_S_WAIT_FENCE | + QIB_S_WAIT_ACK); + qib_schedule_send(qp); + } else if (qp->s_flags & QIB_S_WAIT_RDMAR) { + qp->s_flags &= ~(QIB_S_WAIT_RDMAR | + QIB_S_WAIT_ACK); + qib_schedule_send(qp); + } + } + wqe = do_rc_completion(qp, wqe, ibp); + if (qp->s_acked == qp->s_tail) + break; + } + + switch (aeth >> 29) { + case 0: /* ACK */ + ibp->n_rc_acks++; + if (qp->s_acked != qp->s_tail) { + /* + * We are expecting more ACKs so + * reset the retransmit timer. + */ + start_timer(qp); + /* + * We can stop resending the earlier packets and + * continue with the next packet the receiver wants. + */ + if (qib_cmp24(qp->s_psn, psn) <= 0) + reset_psn(qp, psn + 1); + } else if (qib_cmp24(qp->s_psn, psn) <= 0) { + qp->s_state = OP(SEND_LAST); + qp->s_psn = psn + 1; + } + if (qp->s_flags & QIB_S_WAIT_ACK) { + qp->s_flags &= ~QIB_S_WAIT_ACK; + qib_schedule_send(qp); + } + qib_get_credit(qp, aeth); + qp->s_rnr_retry = qp->s_rnr_retry_cnt; + qp->s_retry = qp->s_retry_cnt; + update_last_psn(qp, psn); + ret = 1; + goto bail; + + case 1: /* RNR NAK */ + ibp->n_rnr_naks++; + if (qp->s_acked == qp->s_tail) + goto bail; + if (qp->s_flags & QIB_S_WAIT_RNR) + goto bail; + if (qp->s_rnr_retry == 0) { + status = IB_WC_RNR_RETRY_EXC_ERR; + goto class_b; + } + if (qp->s_rnr_retry_cnt < 7) + qp->s_rnr_retry--; + + /* The last valid PSN is the previous PSN. */ + update_last_psn(qp, psn - 1); + + ibp->n_rc_resends += (qp->s_psn - psn) & QIB_PSN_MASK; + + reset_psn(qp, psn); + + qp->s_flags &= ~(QIB_S_WAIT_SSN_CREDIT | QIB_S_WAIT_ACK); + qp->s_flags |= QIB_S_WAIT_RNR; + qp->s_timer.function = qib_rc_rnr_retry; + qp->s_timer.expires = jiffies + usecs_to_jiffies( + ib_qib_rnr_table[(aeth >> QIB_AETH_CREDIT_SHIFT) & + QIB_AETH_CREDIT_MASK]); + add_timer(&qp->s_timer); + goto bail; + + case 3: /* NAK */ + if (qp->s_acked == qp->s_tail) + goto bail; + /* The last valid PSN is the previous PSN. */ + update_last_psn(qp, psn - 1); + switch ((aeth >> QIB_AETH_CREDIT_SHIFT) & + QIB_AETH_CREDIT_MASK) { + case 0: /* PSN sequence error */ + ibp->n_seq_naks++; + /* + * Back up to the responder's expected PSN. + * Note that we might get a NAK in the middle of an + * RDMA READ response which terminates the RDMA + * READ. 
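+ * The restart therefore uses the PSN carried in the NAK (the
+ * responder's expected PSN) rather than s_last_psn + 1.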
+ */ + qib_restart_rc(qp, psn, 0); + qib_schedule_send(qp); + break; + + case 1: /* Invalid Request */ + status = IB_WC_REM_INV_REQ_ERR; + ibp->n_other_naks++; + goto class_b; + + case 2: /* Remote Access Error */ + status = IB_WC_REM_ACCESS_ERR; + ibp->n_other_naks++; + goto class_b; + + case 3: /* Remote Operation Error */ + status = IB_WC_REM_OP_ERR; + ibp->n_other_naks++; +class_b: + if (qp->s_last == qp->s_acked) { + qib_send_complete(qp, wqe, status); + qib_error_qp(qp, IB_WC_WR_FLUSH_ERR); + } + break; + + default: + /* Ignore other reserved NAK error codes */ + goto reserved; + } + qp->s_retry = qp->s_retry_cnt; + qp->s_rnr_retry = qp->s_rnr_retry_cnt; + goto bail; + + default: /* 2: reserved */ +reserved: + /* Ignore reserved NAK codes. */ + goto bail; + } + +bail: + return ret; +} + +/* + * We have seen an out of sequence RDMA read middle or last packet. + * This ACKs SENDs and RDMA writes up to the first RDMA read or atomic SWQE. + */ +static void rdma_seq_err(struct qib_qp *qp, struct qib_ibport *ibp, u32 psn, + struct qib_ctxtdata *rcd) +{ + struct qib_swqe *wqe; + + /* Remove QP from retry timer */ + if (qp->s_flags & (QIB_S_TIMER | QIB_S_WAIT_RNR)) { + qp->s_flags &= ~(QIB_S_TIMER | QIB_S_WAIT_RNR); + del_timer(&qp->s_timer); + } + + wqe = get_swqe_ptr(qp, qp->s_acked); + + while (qib_cmp24(psn, wqe->lpsn) > 0) { + if (wqe->wr.opcode == IB_WR_RDMA_READ || + wqe->wr.opcode == IB_WR_ATOMIC_CMP_AND_SWP || + wqe->wr.opcode == IB_WR_ATOMIC_FETCH_AND_ADD) + break; + wqe = do_rc_completion(qp, wqe, ibp); + } + + ibp->n_rdma_seq++; + qp->r_flags |= QIB_R_RDMAR_SEQ; + qib_restart_rc(qp, qp->s_last_psn + 1, 0); + if (list_empty(&qp->rspwait)) { + qp->r_flags |= QIB_R_RSP_SEND; + atomic_inc(&qp->refcount); + list_add_tail(&qp->rspwait, &rcd->qp_wait_list); + } +} + +/** + * qib_rc_rcv_resp - process an incoming RC response packet + * @ibp: the port this packet came in on + * @ohdr: the other headers for this packet + * @data: the packet data + * @tlen: the packet length + * @qp: the QP for this packet + * @opcode: the opcode for this packet + * @psn: the packet sequence number for this packet + * @hdrsize: the header length + * @pmtu: the path MTU + * + * This is called from qib_rc_rcv() to process an incoming RC response + * packet for the given QP. + * Called at interrupt level. + */ +static void qib_rc_rcv_resp(struct qib_ibport *ibp, + struct qib_other_headers *ohdr, + void *data, u32 tlen, + struct qib_qp *qp, + u32 opcode, + u32 psn, u32 hdrsize, u32 pmtu, + struct qib_ctxtdata *rcd) +{ + struct qib_swqe *wqe; + struct qib_pportdata *ppd = ppd_from_ibp(ibp); + enum ib_wc_status status; + unsigned long flags; + int diff; + u32 pad; + u32 aeth; + u64 val; + + if (opcode != OP(RDMA_READ_RESPONSE_MIDDLE)) { + /* + * If ACK'd PSN on SDMA busy list try to make progress to + * reclaim SDMA credits. + */ + if ((qib_cmp24(psn, qp->s_sending_psn) >= 0) && + (qib_cmp24(qp->s_sending_psn, qp->s_sending_hpsn) <= 0)) { + + /* + * If send tasklet not running attempt to progress + * SDMA queue. + */ + if (!(qp->s_flags & QIB_S_BUSY)) { + /* Acquire SDMA Lock */ + spin_lock_irqsave(&ppd->sdma_lock, flags); + /* Invoke sdma make progress */ + qib_sdma_make_progress(ppd); + /* Release SDMA Lock */ + spin_unlock_irqrestore(&ppd->sdma_lock, flags); + } + } + } + + spin_lock_irqsave(&qp->s_lock, flags); + if (!(ib_qib_state_ops[qp->state] & QIB_PROCESS_RECV_OK)) + goto ack_done; + + /* Ignore invalid responses. 
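+ * (A response PSN at or beyond s_next_psn would acknowledge a
+ * request that has not been sent yet.)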
*/ + if (qib_cmp24(psn, qp->s_next_psn) >= 0) + goto ack_done; + + /* Ignore duplicate responses. */ + diff = qib_cmp24(psn, qp->s_last_psn); + if (unlikely(diff <= 0)) { + /* Update credits for "ghost" ACKs */ + if (diff == 0 && opcode == OP(ACKNOWLEDGE)) { + aeth = be32_to_cpu(ohdr->u.aeth); + if ((aeth >> 29) == 0) + qib_get_credit(qp, aeth); + } + goto ack_done; + } + + /* + * Skip everything other than the PSN we expect, if we are waiting + * for a reply to a restarted RDMA read or atomic op. + */ + if (qp->r_flags & QIB_R_RDMAR_SEQ) { + if (qib_cmp24(psn, qp->s_last_psn + 1) != 0) + goto ack_done; + qp->r_flags &= ~QIB_R_RDMAR_SEQ; + } + + if (unlikely(qp->s_acked == qp->s_tail)) + goto ack_done; + wqe = get_swqe_ptr(qp, qp->s_acked); + status = IB_WC_SUCCESS; + + switch (opcode) { + case OP(ACKNOWLEDGE): + case OP(ATOMIC_ACKNOWLEDGE): + case OP(RDMA_READ_RESPONSE_FIRST): + aeth = be32_to_cpu(ohdr->u.aeth); + if (opcode == OP(ATOMIC_ACKNOWLEDGE)) { + __be32 *p = ohdr->u.at.atomic_ack_eth; + + val = ((u64) be32_to_cpu(p[0]) << 32) | + be32_to_cpu(p[1]); + } else + val = 0; + if (!do_rc_ack(qp, aeth, psn, opcode, val, rcd) || + opcode != OP(RDMA_READ_RESPONSE_FIRST)) + goto ack_done; + hdrsize += 4; + wqe = get_swqe_ptr(qp, qp->s_acked); + if (unlikely(wqe->wr.opcode != IB_WR_RDMA_READ)) + goto ack_op_err; + /* + * If this is a response to a resent RDMA read, we + * have to be careful to copy the data to the right + * location. + */ + qp->s_rdma_read_len = restart_sge(&qp->s_rdma_read_sge, + wqe, psn, pmtu); + goto read_middle; + + case OP(RDMA_READ_RESPONSE_MIDDLE): + /* no AETH, no ACK */ + if (unlikely(qib_cmp24(psn, qp->s_last_psn + 1))) + goto ack_seq_err; + if (unlikely(wqe->wr.opcode != IB_WR_RDMA_READ)) + goto ack_op_err; +read_middle: + if (unlikely(tlen != (hdrsize + pmtu + 4))) + goto ack_len_err; + if (unlikely(pmtu >= qp->s_rdma_read_len)) + goto ack_len_err; + + /* + * We got a response so update the timeout. + * 4.096 usec. * (1 << qp->timeout) + */ + qp->s_flags |= QIB_S_TIMER; + mod_timer(&qp->s_timer, jiffies + qp->timeout_jiffies); + if (qp->s_flags & QIB_S_WAIT_ACK) { + qp->s_flags &= ~QIB_S_WAIT_ACK; + qib_schedule_send(qp); + } + + if (opcode == OP(RDMA_READ_RESPONSE_MIDDLE)) + qp->s_retry = qp->s_retry_cnt; + + /* + * Update the RDMA receive state but do the copy w/o + * holding the locks and blocking interrupts. + */ + qp->s_rdma_read_len -= pmtu; + update_last_psn(qp, psn); + spin_unlock_irqrestore(&qp->s_lock, flags); + qib_copy_sge(&qp->s_rdma_read_sge, data, pmtu, 0); + goto bail; + + case OP(RDMA_READ_RESPONSE_ONLY): + aeth = be32_to_cpu(ohdr->u.aeth); + if (!do_rc_ack(qp, aeth, psn, opcode, 0, rcd)) + goto ack_done; + /* Get the number of bytes the message was padded by. */ + pad = (be32_to_cpu(ohdr->bth[0]) >> 20) & 3; + /* + * Check that the data size is >= 0 && <= pmtu. + * Remember to account for the AETH header (4) and + * ICRC (4). + */ + if (unlikely(tlen < (hdrsize + pad + 8))) + goto ack_len_err; + /* + * If this is a response to a resent RDMA read, we + * have to be careful to copy the data to the right + * location. + */ + wqe = get_swqe_ptr(qp, qp->s_acked); + qp->s_rdma_read_len = restart_sge(&qp->s_rdma_read_sge, + wqe, psn, pmtu); + goto read_last; + + case OP(RDMA_READ_RESPONSE_LAST): + /* ACKs READ req. */ + if (unlikely(qib_cmp24(psn, qp->s_last_psn + 1))) + goto ack_seq_err; + if (unlikely(wqe->wr.opcode != IB_WR_RDMA_READ)) + goto ack_op_err; + /* Get the number of bytes the message was padded by. 
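+ * The pad count sits in bits 21:20 of BTH dword 0; payloads are
+ * padded up to a 4-byte boundary, so e.g. a 5-byte final read
+ * response payload arrives with pad == 3.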
*/ + pad = (be32_to_cpu(ohdr->bth[0]) >> 20) & 3; + /* + * Check that the data size is >= 1 && <= pmtu. + * Remember to account for the AETH header (4) and + * ICRC (4). + */ + if (unlikely(tlen <= (hdrsize + pad + 8))) + goto ack_len_err; +read_last: + tlen -= hdrsize + pad + 8; + if (unlikely(tlen != qp->s_rdma_read_len)) + goto ack_len_err; + aeth = be32_to_cpu(ohdr->u.aeth); + qib_copy_sge(&qp->s_rdma_read_sge, data, tlen, 0); + WARN_ON(qp->s_rdma_read_sge.num_sge); + (void) do_rc_ack(qp, aeth, psn, + OP(RDMA_READ_RESPONSE_LAST), 0, rcd); + goto ack_done; + } + +ack_op_err: + status = IB_WC_LOC_QP_OP_ERR; + goto ack_err; + +ack_seq_err: + rdma_seq_err(qp, ibp, psn, rcd); + goto ack_done; + +ack_len_err: + status = IB_WC_LOC_LEN_ERR; +ack_err: + if (qp->s_last == qp->s_acked) { + qib_send_complete(qp, wqe, status); + qib_error_qp(qp, IB_WC_WR_FLUSH_ERR); + } +ack_done: + spin_unlock_irqrestore(&qp->s_lock, flags); +bail: + return; +} + +/** + * qib_rc_rcv_error - process an incoming duplicate or error RC packet + * @ohdr: the other headers for this packet + * @data: the packet data + * @qp: the QP for this packet + * @opcode: the opcode for this packet + * @psn: the packet sequence number for this packet + * @diff: the difference between the PSN and the expected PSN + * + * This is called from qib_rc_rcv() to process an unexpected + * incoming RC packet for the given QP. + * Called at interrupt level. + * Return 1 if no more processing is needed; otherwise return 0 to + * schedule a response to be sent. + */ +static int qib_rc_rcv_error(struct qib_other_headers *ohdr, + void *data, + struct qib_qp *qp, + u32 opcode, + u32 psn, + int diff, + struct qib_ctxtdata *rcd) +{ + struct qib_ibport *ibp = to_iport(qp->ibqp.device, qp->port_num); + struct qib_ack_entry *e; + unsigned long flags; + u8 i, prev; + int old_req; + + if (diff > 0) { + /* + * Packet sequence error. + * A NAK will ACK earlier sends and RDMA writes. + * Don't queue the NAK if we already sent one. + */ + if (!qp->r_nak_state) { + ibp->n_rc_seqnak++; + qp->r_nak_state = IB_NAK_PSN_ERROR; + /* Use the expected PSN. */ + qp->r_ack_psn = qp->r_psn; + /* + * Wait to send the sequence NAK until all packets + * in the receive queue have been processed. + * Otherwise, we end up propagating congestion. + */ + if (list_empty(&qp->rspwait)) { + qp->r_flags |= QIB_R_RSP_NAK; + atomic_inc(&qp->refcount); + list_add_tail(&qp->rspwait, &rcd->qp_wait_list); + } + } + goto done; + } + + /* + * Handle a duplicate request. Don't re-execute SEND, RDMA + * write or atomic op. Don't NAK errors, just silently drop + * the duplicate request. Note that r_sge, r_len, and + * r_rcv_len may be in use so don't modify them. + * + * We are supposed to ACK the earliest duplicate PSN but we + * can coalesce an outstanding duplicate ACK. We have to + * send the earliest so that RDMA reads can be restarted at + * the requester's expected PSN. + * + * First, find where this duplicate PSN falls within the + * ACKs previously sent. + * old_req is true if there is an older response that is scheduled + * to be sent before sending this one. 
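+ *
+ * The loop below searches s_ack_queue backwards from
+ * r_head_ack_queue (a ring of QIB_MAX_RDMA_ATOMIC + 1 entries) for
+ * the queued response whose PSN range covers the duplicate.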
+ */ + e = NULL; + old_req = 1; + ibp->n_rc_dupreq++; + + spin_lock_irqsave(&qp->s_lock, flags); + + for (i = qp->r_head_ack_queue; ; i = prev) { + if (i == qp->s_tail_ack_queue) + old_req = 0; + if (i) + prev = i - 1; + else + prev = QIB_MAX_RDMA_ATOMIC; + if (prev == qp->r_head_ack_queue) { + e = NULL; + break; + } + e = &qp->s_ack_queue[prev]; + if (!e->opcode) { + e = NULL; + break; + } + if (qib_cmp24(psn, e->psn) >= 0) { + if (prev == qp->s_tail_ack_queue && + qib_cmp24(psn, e->lpsn) <= 0) + old_req = 0; + break; + } + } + switch (opcode) { + case OP(RDMA_READ_REQUEST): { + struct ib_reth *reth; + u32 offset; + u32 len; + + /* + * If we didn't find the RDMA read request in the ack queue, + * we can ignore this request. + */ + if (!e || e->opcode != OP(RDMA_READ_REQUEST)) + goto unlock_done; + /* RETH comes after BTH */ + reth = &ohdr->u.rc.reth; + /* + * Address range must be a subset of the original + * request and start on pmtu boundaries. + * We reuse the old ack_queue slot since the requester + * should not back up and request an earlier PSN for the + * same request. + */ + offset = ((psn - e->psn) & QIB_PSN_MASK) * + qp->pmtu; + len = be32_to_cpu(reth->length); + if (unlikely(offset + len != e->rdma_sge.sge_length)) + goto unlock_done; + if (e->rdma_sge.mr) { + qib_put_mr(e->rdma_sge.mr); + e->rdma_sge.mr = NULL; + } + if (len != 0) { + u32 rkey = be32_to_cpu(reth->rkey); + u64 vaddr = be64_to_cpu(reth->vaddr); + int ok; + + ok = qib_rkey_ok(qp, &e->rdma_sge, len, vaddr, rkey, + IB_ACCESS_REMOTE_READ); + if (unlikely(!ok)) + goto unlock_done; + } else { + e->rdma_sge.vaddr = NULL; + e->rdma_sge.length = 0; + e->rdma_sge.sge_length = 0; + } + e->psn = psn; + if (old_req) + goto unlock_done; + qp->s_tail_ack_queue = prev; + break; + } + + case OP(COMPARE_SWAP): + case OP(FETCH_ADD): { + /* + * If we didn't find the atomic request in the ack queue + * or the send tasklet is already backed up to send an + * earlier entry, we can ignore this request. + */ + if (!e || e->opcode != (u8) opcode || old_req) + goto unlock_done; + qp->s_tail_ack_queue = prev; + break; + } + + default: + /* + * Ignore this operation if it doesn't request an ACK + * or an earlier RDMA read or atomic is going to be resent. + */ + if (!(psn & IB_BTH_REQ_ACK) || old_req) + goto unlock_done; + /* + * Resend the most recent ACK if this request is + * after all the previous RDMA reads and atomics. + */ + if (i == qp->r_head_ack_queue) { + spin_unlock_irqrestore(&qp->s_lock, flags); + qp->r_nak_state = 0; + qp->r_ack_psn = qp->r_psn - 1; + goto send_ack; + } + /* + * Try to send a simple ACK to work around a Mellanox bug + * which doesn't accept a RDMA read response or atomic + * response as an ACK for earlier SENDs or RDMA writes. + */ + if (!(qp->s_flags & QIB_S_RESP_PENDING)) { + spin_unlock_irqrestore(&qp->s_lock, flags); + qp->r_nak_state = 0; + qp->r_ack_psn = qp->s_ack_queue[i].psn - 1; + goto send_ack; + } + /* + * Resend the RDMA read or atomic op which + * ACKs this duplicate request. 
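+ * Pointing s_tail_ack_queue at entry i makes the send engine replay
+ * the queued response for it.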
+ */ + qp->s_tail_ack_queue = i; + break; + } + qp->s_ack_state = OP(ACKNOWLEDGE); + qp->s_flags |= QIB_S_RESP_PENDING; + qp->r_nak_state = 0; + qib_schedule_send(qp); + +unlock_done: + spin_unlock_irqrestore(&qp->s_lock, flags); +done: + return 1; + +send_ack: + return 0; +} + +void qib_rc_error(struct qib_qp *qp, enum ib_wc_status err) +{ + unsigned long flags; + int lastwqe; + + spin_lock_irqsave(&qp->s_lock, flags); + lastwqe = qib_error_qp(qp, err); + spin_unlock_irqrestore(&qp->s_lock, flags); + + if (lastwqe) { + struct ib_event ev; + + ev.device = qp->ibqp.device; + ev.element.qp = &qp->ibqp; + ev.event = IB_EVENT_QP_LAST_WQE_REACHED; + qp->ibqp.event_handler(&ev, qp->ibqp.qp_context); + } +} + +static inline void qib_update_ack_queue(struct qib_qp *qp, unsigned n) +{ + unsigned next; + + next = n + 1; + if (next > QIB_MAX_RDMA_ATOMIC) + next = 0; + qp->s_tail_ack_queue = next; + qp->s_ack_state = OP(ACKNOWLEDGE); +} + +/** + * qib_rc_rcv - process an incoming RC packet + * @rcd: the context pointer + * @hdr: the header of this packet + * @has_grh: true if the header has a GRH + * @data: the packet data + * @tlen: the packet length + * @qp: the QP for this packet + * + * This is called from qib_qp_rcv() to process an incoming RC packet + * for the given QP. + * Called at interrupt level. + */ +void qib_rc_rcv(struct qib_ctxtdata *rcd, struct qib_ib_header *hdr, + int has_grh, void *data, u32 tlen, struct qib_qp *qp) +{ + struct qib_ibport *ibp = &rcd->ppd->ibport_data; + struct qib_other_headers *ohdr; + u32 opcode; + u32 hdrsize; + u32 psn; + u32 pad; + struct ib_wc wc; + u32 pmtu = qp->pmtu; + int diff; + struct ib_reth *reth; + unsigned long flags; + int ret; + + /* Check for GRH */ + if (!has_grh) { + ohdr = &hdr->u.oth; + hdrsize = 8 + 12; /* LRH + BTH */ + } else { + ohdr = &hdr->u.l.oth; + hdrsize = 8 + 40 + 12; /* LRH + GRH + BTH */ + } + + opcode = be32_to_cpu(ohdr->bth[0]); + if (qib_ruc_check_hdr(ibp, hdr, has_grh, qp, opcode)) + return; + + psn = be32_to_cpu(ohdr->bth[2]); + opcode >>= 24; + + /* + * Process responses (ACKs) before anything else. Note that the + * packet sequence number will be for something in the send work + * queue rather than the expected receive packet sequence number. + * In other words, this QP is the requester. + */ + if (opcode >= OP(RDMA_READ_RESPONSE_FIRST) && + opcode <= OP(ATOMIC_ACKNOWLEDGE)) { + qib_rc_rcv_resp(ibp, ohdr, data, tlen, qp, opcode, psn, + hdrsize, pmtu, rcd); + return; + } + + /* Compute 24 bits worth of difference. */ + diff = qib_cmp24(psn, qp->r_psn); + if (unlikely(diff)) { + if (qib_rc_rcv_error(ohdr, data, qp, opcode, psn, diff, rcd)) + return; + goto send_ack; + } + + /* Check for opcode sequence errors. */ + switch (qp->r_state) { + case OP(SEND_FIRST): + case OP(SEND_MIDDLE): + if (opcode == OP(SEND_MIDDLE) || + opcode == OP(SEND_LAST) || + opcode == OP(SEND_LAST_WITH_IMMEDIATE)) + break; + goto nack_inv; + + case OP(RDMA_WRITE_FIRST): + case OP(RDMA_WRITE_MIDDLE): + if (opcode == OP(RDMA_WRITE_MIDDLE) || + opcode == OP(RDMA_WRITE_LAST) || + opcode == OP(RDMA_WRITE_LAST_WITH_IMMEDIATE)) + break; + goto nack_inv; + + default: + if (opcode == OP(SEND_MIDDLE) || + opcode == OP(SEND_LAST) || + opcode == OP(SEND_LAST_WITH_IMMEDIATE) || + opcode == OP(RDMA_WRITE_MIDDLE) || + opcode == OP(RDMA_WRITE_LAST) || + opcode == OP(RDMA_WRITE_LAST_WITH_IMMEDIATE)) + goto nack_inv; + /* + * Note that it is up to the requester to not send a new + * RDMA read or atomic operation before receiving an ACK + * for the previous operation. 
+ */ + break; + } + + if (qp->state == IB_QPS_RTR && !(qp->r_flags & QIB_R_COMM_EST)) { + qp->r_flags |= QIB_R_COMM_EST; + if (qp->ibqp.event_handler) { + struct ib_event ev; + + ev.device = qp->ibqp.device; + ev.element.qp = &qp->ibqp; + ev.event = IB_EVENT_COMM_EST; + qp->ibqp.event_handler(&ev, qp->ibqp.qp_context); + } + } + + /* OK, process the packet. */ + switch (opcode) { + case OP(SEND_FIRST): + ret = qib_get_rwqe(qp, 0); + if (ret < 0) + goto nack_op_err; + if (!ret) + goto rnr_nak; + qp->r_rcv_len = 0; + /* FALLTHROUGH */ + case OP(SEND_MIDDLE): + case OP(RDMA_WRITE_MIDDLE): +send_middle: + /* Check for invalid length PMTU or posted rwqe len. */ + if (unlikely(tlen != (hdrsize + pmtu + 4))) + goto nack_inv; + qp->r_rcv_len += pmtu; + if (unlikely(qp->r_rcv_len > qp->r_len)) + goto nack_inv; + qib_copy_sge(&qp->r_sge, data, pmtu, 1); + break; + + case OP(RDMA_WRITE_LAST_WITH_IMMEDIATE): + /* consume RWQE */ + ret = qib_get_rwqe(qp, 1); + if (ret < 0) + goto nack_op_err; + if (!ret) + goto rnr_nak; + goto send_last_imm; + + case OP(SEND_ONLY): + case OP(SEND_ONLY_WITH_IMMEDIATE): + ret = qib_get_rwqe(qp, 0); + if (ret < 0) + goto nack_op_err; + if (!ret) + goto rnr_nak; + qp->r_rcv_len = 0; + if (opcode == OP(SEND_ONLY)) + goto no_immediate_data; + /* FALLTHROUGH for SEND_ONLY_WITH_IMMEDIATE */ + case OP(SEND_LAST_WITH_IMMEDIATE): +send_last_imm: + wc.ex.imm_data = ohdr->u.imm_data; + hdrsize += 4; + wc.wc_flags = IB_WC_WITH_IMM; + goto send_last; + case OP(SEND_LAST): + case OP(RDMA_WRITE_LAST): +no_immediate_data: + wc.wc_flags = 0; + wc.ex.imm_data = 0; +send_last: + /* Get the number of bytes the message was padded by. */ + pad = (be32_to_cpu(ohdr->bth[0]) >> 20) & 3; + /* Check for invalid length. */ + /* XXX LAST len should be >= 1 */ + if (unlikely(tlen < (hdrsize + pad + 4))) + goto nack_inv; + /* Don't count the CRC. */ + tlen -= (hdrsize + pad + 4); + wc.byte_len = tlen + qp->r_rcv_len; + if (unlikely(wc.byte_len > qp->r_len)) + goto nack_inv; + qib_copy_sge(&qp->r_sge, data, tlen, 1); + qib_put_ss(&qp->r_sge); + qp->r_msn++; + if (!test_and_clear_bit(QIB_R_WRID_VALID, &qp->r_aflags)) + break; + wc.wr_id = qp->r_wr_id; + wc.status = IB_WC_SUCCESS; + if (opcode == OP(RDMA_WRITE_LAST_WITH_IMMEDIATE) || + opcode == OP(RDMA_WRITE_ONLY_WITH_IMMEDIATE)) + wc.opcode = IB_WC_RECV_RDMA_WITH_IMM; + else + wc.opcode = IB_WC_RECV; + wc.qp = &qp->ibqp; + wc.src_qp = qp->remote_qpn; + wc.slid = qp->remote_ah_attr.dlid; + wc.sl = qp->remote_ah_attr.sl; + /* zero fields that are N/A */ + wc.vendor_err = 0; + wc.pkey_index = 0; + wc.dlid_path_bits = 0; + wc.port_num = 0; + /* Signal completion event if the solicited bit is set. 
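+ * (The last argument to qib_cq_enter() below is taken straight from
+ * the BTH solicited-event bit, IB_BTH_SOLICITED.)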
*/ + qib_cq_enter(to_icq(qp->ibqp.recv_cq), &wc, + (ohdr->bth[0] & + cpu_to_be32(IB_BTH_SOLICITED)) != 0); + break; + + case OP(RDMA_WRITE_FIRST): + case OP(RDMA_WRITE_ONLY): + case OP(RDMA_WRITE_ONLY_WITH_IMMEDIATE): + if (unlikely(!(qp->qp_access_flags & IB_ACCESS_REMOTE_WRITE))) + goto nack_inv; + /* consume RWQE */ + reth = &ohdr->u.rc.reth; + hdrsize += sizeof(*reth); + qp->r_len = be32_to_cpu(reth->length); + qp->r_rcv_len = 0; + qp->r_sge.sg_list = NULL; + if (qp->r_len != 0) { + u32 rkey = be32_to_cpu(reth->rkey); + u64 vaddr = be64_to_cpu(reth->vaddr); + int ok; + + /* Check rkey & NAK */ + ok = qib_rkey_ok(qp, &qp->r_sge.sge, qp->r_len, vaddr, + rkey, IB_ACCESS_REMOTE_WRITE); + if (unlikely(!ok)) + goto nack_acc; + qp->r_sge.num_sge = 1; + } else { + qp->r_sge.num_sge = 0; + qp->r_sge.sge.mr = NULL; + qp->r_sge.sge.vaddr = NULL; + qp->r_sge.sge.length = 0; + qp->r_sge.sge.sge_length = 0; + } + if (opcode == OP(RDMA_WRITE_FIRST)) + goto send_middle; + else if (opcode == OP(RDMA_WRITE_ONLY)) + goto no_immediate_data; + ret = qib_get_rwqe(qp, 1); + if (ret < 0) + goto nack_op_err; + if (!ret) + goto rnr_nak; + wc.ex.imm_data = ohdr->u.rc.imm_data; + hdrsize += 4; + wc.wc_flags = IB_WC_WITH_IMM; + goto send_last; + + case OP(RDMA_READ_REQUEST): { + struct qib_ack_entry *e; + u32 len; + u8 next; + + if (unlikely(!(qp->qp_access_flags & IB_ACCESS_REMOTE_READ))) + goto nack_inv; + next = qp->r_head_ack_queue + 1; + /* s_ack_queue is size QIB_MAX_RDMA_ATOMIC+1 so use > not >= */ + if (next > QIB_MAX_RDMA_ATOMIC) + next = 0; + spin_lock_irqsave(&qp->s_lock, flags); + if (unlikely(next == qp->s_tail_ack_queue)) { + if (!qp->s_ack_queue[next].sent) + goto nack_inv_unlck; + qib_update_ack_queue(qp, next); + } + e = &qp->s_ack_queue[qp->r_head_ack_queue]; + if (e->opcode == OP(RDMA_READ_REQUEST) && e->rdma_sge.mr) { + qib_put_mr(e->rdma_sge.mr); + e->rdma_sge.mr = NULL; + } + reth = &ohdr->u.rc.reth; + len = be32_to_cpu(reth->length); + if (len) { + u32 rkey = be32_to_cpu(reth->rkey); + u64 vaddr = be64_to_cpu(reth->vaddr); + int ok; + + /* Check rkey & NAK */ + ok = qib_rkey_ok(qp, &e->rdma_sge, len, vaddr, + rkey, IB_ACCESS_REMOTE_READ); + if (unlikely(!ok)) + goto nack_acc_unlck; + /* + * Update the next expected PSN. We add 1 later + * below, so only add the remainder here. + */ + if (len > pmtu) + qp->r_psn += (len - 1) / pmtu; + } else { + e->rdma_sge.mr = NULL; + e->rdma_sge.vaddr = NULL; + e->rdma_sge.length = 0; + e->rdma_sge.sge_length = 0; + } + e->opcode = opcode; + e->sent = 0; + e->psn = psn; + e->lpsn = qp->r_psn; + /* + * We need to increment the MSN here instead of when we + * finish sending the result since a duplicate request would + * increment it more than once. + */ + qp->r_msn++; + qp->r_psn++; + qp->r_state = opcode; + qp->r_nak_state = 0; + qp->r_head_ack_queue = next; + + /* Schedule the send tasklet. 
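+ * QIB_S_RESP_PENDING makes the send engine generate the read
+ * response from s_ack_queue before it resumes its own requests.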
*/ + qp->s_flags |= QIB_S_RESP_PENDING; + qib_schedule_send(qp); + + goto sunlock; + } + + case OP(COMPARE_SWAP): + case OP(FETCH_ADD): { + struct ib_atomic_eth *ateth; + struct qib_ack_entry *e; + u64 vaddr; + atomic64_t *maddr; + u64 sdata; + u32 rkey; + u8 next; + + if (unlikely(!(qp->qp_access_flags & IB_ACCESS_REMOTE_ATOMIC))) + goto nack_inv; + next = qp->r_head_ack_queue + 1; + if (next > QIB_MAX_RDMA_ATOMIC) + next = 0; + spin_lock_irqsave(&qp->s_lock, flags); + if (unlikely(next == qp->s_tail_ack_queue)) { + if (!qp->s_ack_queue[next].sent) + goto nack_inv_unlck; + qib_update_ack_queue(qp, next); + } + e = &qp->s_ack_queue[qp->r_head_ack_queue]; + if (e->opcode == OP(RDMA_READ_REQUEST) && e->rdma_sge.mr) { + qib_put_mr(e->rdma_sge.mr); + e->rdma_sge.mr = NULL; + } + ateth = &ohdr->u.atomic_eth; + vaddr = ((u64) be32_to_cpu(ateth->vaddr[0]) << 32) | + be32_to_cpu(ateth->vaddr[1]); + if (unlikely(vaddr & (sizeof(u64) - 1))) + goto nack_inv_unlck; + rkey = be32_to_cpu(ateth->rkey); + /* Check rkey & NAK */ + if (unlikely(!qib_rkey_ok(qp, &qp->r_sge.sge, sizeof(u64), + vaddr, rkey, + IB_ACCESS_REMOTE_ATOMIC))) + goto nack_acc_unlck; + /* Perform atomic OP and save result. */ + maddr = (atomic64_t *) qp->r_sge.sge.vaddr; + sdata = be64_to_cpu(ateth->swap_data); + e->atomic_data = (opcode == OP(FETCH_ADD)) ? + (u64) atomic64_add_return(sdata, maddr) - sdata : + (u64) cmpxchg((u64 *) qp->r_sge.sge.vaddr, + be64_to_cpu(ateth->compare_data), + sdata); + qib_put_mr(qp->r_sge.sge.mr); + qp->r_sge.num_sge = 0; + e->opcode = opcode; + e->sent = 0; + e->psn = psn; + e->lpsn = psn; + qp->r_msn++; + qp->r_psn++; + qp->r_state = opcode; + qp->r_nak_state = 0; + qp->r_head_ack_queue = next; + + /* Schedule the send tasklet. */ + qp->s_flags |= QIB_S_RESP_PENDING; + qib_schedule_send(qp); + + goto sunlock; + } + + default: + /* NAK unknown opcodes. */ + goto nack_inv; + } + qp->r_psn++; + qp->r_state = opcode; + qp->r_ack_psn = psn; + qp->r_nak_state = 0; + /* Send an ACK if requested or required. 
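+ * Bit 31 of the PSN word is the BTH AckReq bit (tested elsewhere in
+ * this file as IB_BTH_REQ_ACK).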
*/ + if (psn & (1 << 31)) + goto send_ack; + return; + +rnr_nak: + qp->r_nak_state = IB_RNR_NAK | qp->r_min_rnr_timer; + qp->r_ack_psn = qp->r_psn; + /* Queue RNR NAK for later */ + if (list_empty(&qp->rspwait)) { + qp->r_flags |= QIB_R_RSP_NAK; + atomic_inc(&qp->refcount); + list_add_tail(&qp->rspwait, &rcd->qp_wait_list); + } + return; + +nack_op_err: + qib_rc_error(qp, IB_WC_LOC_QP_OP_ERR); + qp->r_nak_state = IB_NAK_REMOTE_OPERATIONAL_ERROR; + qp->r_ack_psn = qp->r_psn; + /* Queue NAK for later */ + if (list_empty(&qp->rspwait)) { + qp->r_flags |= QIB_R_RSP_NAK; + atomic_inc(&qp->refcount); + list_add_tail(&qp->rspwait, &rcd->qp_wait_list); + } + return; + +nack_inv_unlck: + spin_unlock_irqrestore(&qp->s_lock, flags); +nack_inv: + qib_rc_error(qp, IB_WC_LOC_QP_OP_ERR); + qp->r_nak_state = IB_NAK_INVALID_REQUEST; + qp->r_ack_psn = qp->r_psn; + /* Queue NAK for later */ + if (list_empty(&qp->rspwait)) { + qp->r_flags |= QIB_R_RSP_NAK; + atomic_inc(&qp->refcount); + list_add_tail(&qp->rspwait, &rcd->qp_wait_list); + } + return; + +nack_acc_unlck: + spin_unlock_irqrestore(&qp->s_lock, flags); +nack_acc: + qib_rc_error(qp, IB_WC_LOC_PROT_ERR); + qp->r_nak_state = IB_NAK_REMOTE_ACCESS_ERROR; + qp->r_ack_psn = qp->r_psn; +send_ack: + qib_send_rc_ack(qp); + return; + +sunlock: + spin_unlock_irqrestore(&qp->s_lock, flags); +} diff --git a/kernel/drivers/infiniband/hw/qib/qib_ruc.c b/kernel/drivers/infiniband/hw/qib/qib_ruc.c new file mode 100644 index 000000000..f42bd0f47 --- /dev/null +++ b/kernel/drivers/infiniband/hw/qib/qib_ruc.c @@ -0,0 +1,819 @@ +/* + * Copyright (c) 2006, 2007, 2008, 2009 QLogic Corporation. All rights reserved. + * Copyright (c) 2005, 2006 PathScale, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include + +#include "qib.h" +#include "qib_mad.h" + +/* + * Convert the AETH RNR timeout code into the number of microseconds. 
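+ * Indexed by the 5-bit RNR timer code; do_rc_ack() arms the RNR
+ * retry timer with usecs_to_jiffies(ib_qib_rnr_table[(aeth >>
+ * QIB_AETH_CREDIT_SHIFT) & QIB_AETH_CREDIT_MASK]).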
+ */ +const u32 ib_qib_rnr_table[32] = { + 655360, /* 00: 655.36 */ + 10, /* 01: .01 */ + 20, /* 02 .02 */ + 30, /* 03: .03 */ + 40, /* 04: .04 */ + 60, /* 05: .06 */ + 80, /* 06: .08 */ + 120, /* 07: .12 */ + 160, /* 08: .16 */ + 240, /* 09: .24 */ + 320, /* 0A: .32 */ + 480, /* 0B: .48 */ + 640, /* 0C: .64 */ + 960, /* 0D: .96 */ + 1280, /* 0E: 1.28 */ + 1920, /* 0F: 1.92 */ + 2560, /* 10: 2.56 */ + 3840, /* 11: 3.84 */ + 5120, /* 12: 5.12 */ + 7680, /* 13: 7.68 */ + 10240, /* 14: 10.24 */ + 15360, /* 15: 15.36 */ + 20480, /* 16: 20.48 */ + 30720, /* 17: 30.72 */ + 40960, /* 18: 40.96 */ + 61440, /* 19: 61.44 */ + 81920, /* 1A: 81.92 */ + 122880, /* 1B: 122.88 */ + 163840, /* 1C: 163.84 */ + 245760, /* 1D: 245.76 */ + 327680, /* 1E: 327.68 */ + 491520 /* 1F: 491.52 */ +}; + +/* + * Validate a RWQE and fill in the SGE state. + * Return 1 if OK. + */ +static int qib_init_sge(struct qib_qp *qp, struct qib_rwqe *wqe) +{ + int i, j, ret; + struct ib_wc wc; + struct qib_lkey_table *rkt; + struct qib_pd *pd; + struct qib_sge_state *ss; + + rkt = &to_idev(qp->ibqp.device)->lk_table; + pd = to_ipd(qp->ibqp.srq ? qp->ibqp.srq->pd : qp->ibqp.pd); + ss = &qp->r_sge; + ss->sg_list = qp->r_sg_list; + qp->r_len = 0; + for (i = j = 0; i < wqe->num_sge; i++) { + if (wqe->sg_list[i].length == 0) + continue; + /* Check LKEY */ + if (!qib_lkey_ok(rkt, pd, j ? &ss->sg_list[j - 1] : &ss->sge, + &wqe->sg_list[i], IB_ACCESS_LOCAL_WRITE)) + goto bad_lkey; + qp->r_len += wqe->sg_list[i].length; + j++; + } + ss->num_sge = j; + ss->total_len = qp->r_len; + ret = 1; + goto bail; + +bad_lkey: + while (j) { + struct qib_sge *sge = --j ? &ss->sg_list[j - 1] : &ss->sge; + + qib_put_mr(sge->mr); + } + ss->num_sge = 0; + memset(&wc, 0, sizeof(wc)); + wc.wr_id = wqe->wr_id; + wc.status = IB_WC_LOC_PROT_ERR; + wc.opcode = IB_WC_RECV; + wc.qp = &qp->ibqp; + /* Signal solicited completion event. */ + qib_cq_enter(to_icq(qp->ibqp.recv_cq), &wc, 1); + ret = 0; +bail: + return ret; +} + +/** + * qib_get_rwqe - copy the next RWQE into the QP's RWQE + * @qp: the QP + * @wr_id_only: update qp->r_wr_id only, not qp->r_sge + * + * Return -1 if there is a local error, 0 if no RWQE is available, + * otherwise return 1. + * + * Can be called from interrupt level. + */ +int qib_get_rwqe(struct qib_qp *qp, int wr_id_only) +{ + unsigned long flags; + struct qib_rq *rq; + struct qib_rwq *wq; + struct qib_srq *srq; + struct qib_rwqe *wqe; + void (*handler)(struct ib_event *, void *); + u32 tail; + int ret; + + if (qp->ibqp.srq) { + srq = to_isrq(qp->ibqp.srq); + handler = srq->ibsrq.event_handler; + rq = &srq->rq; + } else { + srq = NULL; + handler = NULL; + rq = &qp->r_rq; + } + + spin_lock_irqsave(&rq->lock, flags); + if (!(ib_qib_state_ops[qp->state] & QIB_PROCESS_RECV_OK)) { + ret = 0; + goto unlock; + } + + wq = rq->wq; + tail = wq->tail; + /* Validate tail before using it since it is user writable. */ + if (tail >= rq->size) + tail = 0; + if (unlikely(tail == wq->head)) { + ret = 0; + goto unlock; + } + /* Make sure entry is read after head index is read. */ + smp_rmb(); + wqe = get_rwqe_ptr(rq, tail); + /* + * Even though we update the tail index in memory, the verbs + * consumer is not supposed to post more entries until a + * completion is generated. 
+ */ + if (++tail >= rq->size) + tail = 0; + wq->tail = tail; + if (!wr_id_only && !qib_init_sge(qp, wqe)) { + ret = -1; + goto unlock; + } + qp->r_wr_id = wqe->wr_id; + + ret = 1; + set_bit(QIB_R_WRID_VALID, &qp->r_aflags); + if (handler) { + u32 n; + + /* + * Validate head pointer value and compute + * the number of remaining WQEs. + */ + n = wq->head; + if (n >= rq->size) + n = 0; + if (n < tail) + n += rq->size - tail; + else + n -= tail; + if (n < srq->limit) { + struct ib_event ev; + + srq->limit = 0; + spin_unlock_irqrestore(&rq->lock, flags); + ev.device = qp->ibqp.device; + ev.element.srq = qp->ibqp.srq; + ev.event = IB_EVENT_SRQ_LIMIT_REACHED; + handler(&ev, srq->ibsrq.srq_context); + goto bail; + } + } +unlock: + spin_unlock_irqrestore(&rq->lock, flags); +bail: + return ret; +} + +/* + * Switch to alternate path. + * The QP s_lock should be held and interrupts disabled. + */ +void qib_migrate_qp(struct qib_qp *qp) +{ + struct ib_event ev; + + qp->s_mig_state = IB_MIG_MIGRATED; + qp->remote_ah_attr = qp->alt_ah_attr; + qp->port_num = qp->alt_ah_attr.port_num; + qp->s_pkey_index = qp->s_alt_pkey_index; + + ev.device = qp->ibqp.device; + ev.element.qp = &qp->ibqp; + ev.event = IB_EVENT_PATH_MIG; + qp->ibqp.event_handler(&ev, qp->ibqp.qp_context); +} + +static __be64 get_sguid(struct qib_ibport *ibp, unsigned index) +{ + if (!index) { + struct qib_pportdata *ppd = ppd_from_ibp(ibp); + + return ppd->guid; + } + return ibp->guids[index - 1]; +} + +static int gid_ok(union ib_gid *gid, __be64 gid_prefix, __be64 id) +{ + return (gid->global.interface_id == id && + (gid->global.subnet_prefix == gid_prefix || + gid->global.subnet_prefix == IB_DEFAULT_GID_PREFIX)); +} + +/* + * + * This should be called with the QP r_lock held. + * + * The s_lock will be acquired around the qib_migrate_qp() call. + */ +int qib_ruc_check_hdr(struct qib_ibport *ibp, struct qib_ib_header *hdr, + int has_grh, struct qib_qp *qp, u32 bth0) +{ + __be64 guid; + unsigned long flags; + + if (qp->s_mig_state == IB_MIG_ARMED && (bth0 & IB_BTH_MIG_REQ)) { + if (!has_grh) { + if (qp->alt_ah_attr.ah_flags & IB_AH_GRH) + goto err; + } else { + if (!(qp->alt_ah_attr.ah_flags & IB_AH_GRH)) + goto err; + guid = get_sguid(ibp, qp->alt_ah_attr.grh.sgid_index); + if (!gid_ok(&hdr->u.l.grh.dgid, ibp->gid_prefix, guid)) + goto err; + if (!gid_ok(&hdr->u.l.grh.sgid, + qp->alt_ah_attr.grh.dgid.global.subnet_prefix, + qp->alt_ah_attr.grh.dgid.global.interface_id)) + goto err; + } + if (!qib_pkey_ok((u16)bth0, + qib_get_pkey(ibp, qp->s_alt_pkey_index))) { + qib_bad_pqkey(ibp, IB_NOTICE_TRAP_BAD_PKEY, + (u16)bth0, + (be16_to_cpu(hdr->lrh[0]) >> 4) & 0xF, + 0, qp->ibqp.qp_num, + hdr->lrh[3], hdr->lrh[1]); + goto err; + } + /* Validate the SLID. See Ch. 
9.6.1.5 and 17.2.8 */ + if (be16_to_cpu(hdr->lrh[3]) != qp->alt_ah_attr.dlid || + ppd_from_ibp(ibp)->port != qp->alt_ah_attr.port_num) + goto err; + spin_lock_irqsave(&qp->s_lock, flags); + qib_migrate_qp(qp); + spin_unlock_irqrestore(&qp->s_lock, flags); + } else { + if (!has_grh) { + if (qp->remote_ah_attr.ah_flags & IB_AH_GRH) + goto err; + } else { + if (!(qp->remote_ah_attr.ah_flags & IB_AH_GRH)) + goto err; + guid = get_sguid(ibp, + qp->remote_ah_attr.grh.sgid_index); + if (!gid_ok(&hdr->u.l.grh.dgid, ibp->gid_prefix, guid)) + goto err; + if (!gid_ok(&hdr->u.l.grh.sgid, + qp->remote_ah_attr.grh.dgid.global.subnet_prefix, + qp->remote_ah_attr.grh.dgid.global.interface_id)) + goto err; + } + if (!qib_pkey_ok((u16)bth0, + qib_get_pkey(ibp, qp->s_pkey_index))) { + qib_bad_pqkey(ibp, IB_NOTICE_TRAP_BAD_PKEY, + (u16)bth0, + (be16_to_cpu(hdr->lrh[0]) >> 4) & 0xF, + 0, qp->ibqp.qp_num, + hdr->lrh[3], hdr->lrh[1]); + goto err; + } + /* Validate the SLID. See Ch. 9.6.1.5 */ + if (be16_to_cpu(hdr->lrh[3]) != qp->remote_ah_attr.dlid || + ppd_from_ibp(ibp)->port != qp->port_num) + goto err; + if (qp->s_mig_state == IB_MIG_REARM && + !(bth0 & IB_BTH_MIG_REQ)) + qp->s_mig_state = IB_MIG_ARMED; + } + + return 0; + +err: + return 1; +} + +/** + * qib_ruc_loopback - handle UC and RC lookback requests + * @sqp: the sending QP + * + * This is called from qib_do_send() to + * forward a WQE addressed to the same HCA. + * Note that although we are single threaded due to the tasklet, we still + * have to protect against post_send(). We don't have to worry about + * receive interrupts since this is a connected protocol and all packets + * will pass through here. + */ +static void qib_ruc_loopback(struct qib_qp *sqp) +{ + struct qib_ibport *ibp = to_iport(sqp->ibqp.device, sqp->port_num); + struct qib_qp *qp; + struct qib_swqe *wqe; + struct qib_sge *sge; + unsigned long flags; + struct ib_wc wc; + u64 sdata; + atomic64_t *maddr; + enum ib_wc_status send_status; + int release; + int ret; + + /* + * Note that we check the responder QP state after + * checking the requester's state. + */ + qp = qib_lookup_qpn(ibp, sqp->remote_qpn); + + spin_lock_irqsave(&sqp->s_lock, flags); + + /* Return if we are already busy processing a work request. */ + if ((sqp->s_flags & (QIB_S_BUSY | QIB_S_ANY_WAIT)) || + !(ib_qib_state_ops[sqp->state] & QIB_PROCESS_OR_FLUSH_SEND)) + goto unlock; + + sqp->s_flags |= QIB_S_BUSY; + +again: + if (sqp->s_last == sqp->s_head) + goto clr_busy; + wqe = get_swqe_ptr(sqp, sqp->s_last); + + /* Return if it is not OK to start a new work reqeust. */ + if (!(ib_qib_state_ops[sqp->state] & QIB_PROCESS_NEXT_SEND_OK)) { + if (!(ib_qib_state_ops[sqp->state] & QIB_FLUSH_SEND)) + goto clr_busy; + /* We are in the error state, flush the work request. */ + send_status = IB_WC_WR_FLUSH_ERR; + goto flush_send; + } + + /* + * We can rely on the entry not changing without the s_lock + * being held until we update s_last. + * We increment s_cur to indicate s_last is in progress. + */ + if (sqp->s_last == sqp->s_cur) { + if (++sqp->s_cur >= sqp->s_size) + sqp->s_cur = 0; + } + spin_unlock_irqrestore(&sqp->s_lock, flags); + + if (!qp || !(ib_qib_state_ops[qp->state] & QIB_PROCESS_RECV_OK) || + qp->ibqp.qp_type != sqp->ibqp.qp_type) { + ibp->n_pkt_drops++; + /* + * For RC, the requester would timeout and retry so + * shortcut the timeouts and just signal too many retries. 
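+ * For UC there is no retry protocol, so the dropped packet is
+ * reported as a successful send, just as it would be on the wire.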
+ */ + if (sqp->ibqp.qp_type == IB_QPT_RC) + send_status = IB_WC_RETRY_EXC_ERR; + else + send_status = IB_WC_SUCCESS; + goto serr; + } + + memset(&wc, 0, sizeof(wc)); + send_status = IB_WC_SUCCESS; + + release = 1; + sqp->s_sge.sge = wqe->sg_list[0]; + sqp->s_sge.sg_list = wqe->sg_list + 1; + sqp->s_sge.num_sge = wqe->wr.num_sge; + sqp->s_len = wqe->length; + switch (wqe->wr.opcode) { + case IB_WR_SEND_WITH_IMM: + wc.wc_flags = IB_WC_WITH_IMM; + wc.ex.imm_data = wqe->wr.ex.imm_data; + /* FALLTHROUGH */ + case IB_WR_SEND: + ret = qib_get_rwqe(qp, 0); + if (ret < 0) + goto op_err; + if (!ret) + goto rnr_nak; + break; + + case IB_WR_RDMA_WRITE_WITH_IMM: + if (unlikely(!(qp->qp_access_flags & IB_ACCESS_REMOTE_WRITE))) + goto inv_err; + wc.wc_flags = IB_WC_WITH_IMM; + wc.ex.imm_data = wqe->wr.ex.imm_data; + ret = qib_get_rwqe(qp, 1); + if (ret < 0) + goto op_err; + if (!ret) + goto rnr_nak; + /* FALLTHROUGH */ + case IB_WR_RDMA_WRITE: + if (unlikely(!(qp->qp_access_flags & IB_ACCESS_REMOTE_WRITE))) + goto inv_err; + if (wqe->length == 0) + break; + if (unlikely(!qib_rkey_ok(qp, &qp->r_sge.sge, wqe->length, + wqe->wr.wr.rdma.remote_addr, + wqe->wr.wr.rdma.rkey, + IB_ACCESS_REMOTE_WRITE))) + goto acc_err; + qp->r_sge.sg_list = NULL; + qp->r_sge.num_sge = 1; + qp->r_sge.total_len = wqe->length; + break; + + case IB_WR_RDMA_READ: + if (unlikely(!(qp->qp_access_flags & IB_ACCESS_REMOTE_READ))) + goto inv_err; + if (unlikely(!qib_rkey_ok(qp, &sqp->s_sge.sge, wqe->length, + wqe->wr.wr.rdma.remote_addr, + wqe->wr.wr.rdma.rkey, + IB_ACCESS_REMOTE_READ))) + goto acc_err; + release = 0; + sqp->s_sge.sg_list = NULL; + sqp->s_sge.num_sge = 1; + qp->r_sge.sge = wqe->sg_list[0]; + qp->r_sge.sg_list = wqe->sg_list + 1; + qp->r_sge.num_sge = wqe->wr.num_sge; + qp->r_sge.total_len = wqe->length; + break; + + case IB_WR_ATOMIC_CMP_AND_SWP: + case IB_WR_ATOMIC_FETCH_AND_ADD: + if (unlikely(!(qp->qp_access_flags & IB_ACCESS_REMOTE_ATOMIC))) + goto inv_err; + if (unlikely(!qib_rkey_ok(qp, &qp->r_sge.sge, sizeof(u64), + wqe->wr.wr.atomic.remote_addr, + wqe->wr.wr.atomic.rkey, + IB_ACCESS_REMOTE_ATOMIC))) + goto acc_err; + /* Perform atomic OP and save result. */ + maddr = (atomic64_t *) qp->r_sge.sge.vaddr; + sdata = wqe->wr.wr.atomic.compare_add; + *(u64 *) sqp->s_sge.sge.vaddr = + (wqe->wr.opcode == IB_WR_ATOMIC_FETCH_AND_ADD) ? 
+ (u64) atomic64_add_return(sdata, maddr) - sdata : + (u64) cmpxchg((u64 *) qp->r_sge.sge.vaddr, + sdata, wqe->wr.wr.atomic.swap); + qib_put_mr(qp->r_sge.sge.mr); + qp->r_sge.num_sge = 0; + goto send_comp; + + default: + send_status = IB_WC_LOC_QP_OP_ERR; + goto serr; + } + + sge = &sqp->s_sge.sge; + while (sqp->s_len) { + u32 len = sqp->s_len; + + if (len > sge->length) + len = sge->length; + if (len > sge->sge_length) + len = sge->sge_length; + BUG_ON(len == 0); + qib_copy_sge(&qp->r_sge, sge->vaddr, len, release); + sge->vaddr += len; + sge->length -= len; + sge->sge_length -= len; + if (sge->sge_length == 0) { + if (!release) + qib_put_mr(sge->mr); + if (--sqp->s_sge.num_sge) + *sge = *sqp->s_sge.sg_list++; + } else if (sge->length == 0 && sge->mr->lkey) { + if (++sge->n >= QIB_SEGSZ) { + if (++sge->m >= sge->mr->mapsz) + break; + sge->n = 0; + } + sge->vaddr = + sge->mr->map[sge->m]->segs[sge->n].vaddr; + sge->length = + sge->mr->map[sge->m]->segs[sge->n].length; + } + sqp->s_len -= len; + } + if (release) + qib_put_ss(&qp->r_sge); + + if (!test_and_clear_bit(QIB_R_WRID_VALID, &qp->r_aflags)) + goto send_comp; + + if (wqe->wr.opcode == IB_WR_RDMA_WRITE_WITH_IMM) + wc.opcode = IB_WC_RECV_RDMA_WITH_IMM; + else + wc.opcode = IB_WC_RECV; + wc.wr_id = qp->r_wr_id; + wc.status = IB_WC_SUCCESS; + wc.byte_len = wqe->length; + wc.qp = &qp->ibqp; + wc.src_qp = qp->remote_qpn; + wc.slid = qp->remote_ah_attr.dlid; + wc.sl = qp->remote_ah_attr.sl; + wc.port_num = 1; + /* Signal completion event if the solicited bit is set. */ + qib_cq_enter(to_icq(qp->ibqp.recv_cq), &wc, + wqe->wr.send_flags & IB_SEND_SOLICITED); + +send_comp: + spin_lock_irqsave(&sqp->s_lock, flags); + ibp->n_loop_pkts++; +flush_send: + sqp->s_rnr_retry = sqp->s_rnr_retry_cnt; + qib_send_complete(sqp, wqe, send_status); + goto again; + +rnr_nak: + /* Handle RNR NAK */ + if (qp->ibqp.qp_type == IB_QPT_UC) + goto send_comp; + ibp->n_rnr_naks++; + /* + * Note: we don't need the s_lock held since the BUSY flag + * makes this single threaded. 
+ */ + if (sqp->s_rnr_retry == 0) { + send_status = IB_WC_RNR_RETRY_EXC_ERR; + goto serr; + } + if (sqp->s_rnr_retry_cnt < 7) + sqp->s_rnr_retry--; + spin_lock_irqsave(&sqp->s_lock, flags); + if (!(ib_qib_state_ops[sqp->state] & QIB_PROCESS_RECV_OK)) + goto clr_busy; + sqp->s_flags |= QIB_S_WAIT_RNR; + sqp->s_timer.function = qib_rc_rnr_retry; + sqp->s_timer.expires = jiffies + + usecs_to_jiffies(ib_qib_rnr_table[qp->r_min_rnr_timer]); + add_timer(&sqp->s_timer); + goto clr_busy; + +op_err: + send_status = IB_WC_REM_OP_ERR; + wc.status = IB_WC_LOC_QP_OP_ERR; + goto err; + +inv_err: + send_status = IB_WC_REM_INV_REQ_ERR; + wc.status = IB_WC_LOC_QP_OP_ERR; + goto err; + +acc_err: + send_status = IB_WC_REM_ACCESS_ERR; + wc.status = IB_WC_LOC_PROT_ERR; +err: + /* responder goes to error state */ + qib_rc_error(qp, wc.status); + +serr: + spin_lock_irqsave(&sqp->s_lock, flags); + qib_send_complete(sqp, wqe, send_status); + if (sqp->ibqp.qp_type == IB_QPT_RC) { + int lastwqe = qib_error_qp(sqp, IB_WC_WR_FLUSH_ERR); + + sqp->s_flags &= ~QIB_S_BUSY; + spin_unlock_irqrestore(&sqp->s_lock, flags); + if (lastwqe) { + struct ib_event ev; + + ev.device = sqp->ibqp.device; + ev.element.qp = &sqp->ibqp; + ev.event = IB_EVENT_QP_LAST_WQE_REACHED; + sqp->ibqp.event_handler(&ev, sqp->ibqp.qp_context); + } + goto done; + } +clr_busy: + sqp->s_flags &= ~QIB_S_BUSY; +unlock: + spin_unlock_irqrestore(&sqp->s_lock, flags); +done: + if (qp && atomic_dec_and_test(&qp->refcount)) + wake_up(&qp->wait); +} + +/** + * qib_make_grh - construct a GRH header + * @ibp: a pointer to the IB port + * @hdr: a pointer to the GRH header being constructed + * @grh: the global route address to send to + * @hwords: the number of 32 bit words of header being sent + * @nwords: the number of 32 bit words of data being sent + * + * Return the size of the header in 32 bit words. + */ +u32 qib_make_grh(struct qib_ibport *ibp, struct ib_grh *hdr, + struct ib_global_route *grh, u32 hwords, u32 nwords) +{ + hdr->version_tclass_flow = + cpu_to_be32((IB_GRH_VERSION << IB_GRH_VERSION_SHIFT) | + (grh->traffic_class << IB_GRH_TCLASS_SHIFT) | + (grh->flow_label << IB_GRH_FLOW_SHIFT)); + hdr->paylen = cpu_to_be16((hwords - 2 + nwords + SIZE_OF_CRC) << 2); + /* next_hdr is defined by C8-7 in ch. 8.4.1 */ + hdr->next_hdr = IB_GRH_NEXT_HDR; + hdr->hop_limit = grh->hop_limit; + /* The SGID is 32-bit aligned. */ + hdr->sgid.global.subnet_prefix = ibp->gid_prefix; + hdr->sgid.global.interface_id = grh->sgid_index ? + ibp->guids[grh->sgid_index - 1] : ppd_from_ibp(ibp)->guid; + hdr->dgid = grh->dgid; + + /* GRH header size in 32-bit words. */ + return sizeof(struct ib_grh) / sizeof(u32); +} + +void qib_make_ruc_header(struct qib_qp *qp, struct qib_other_headers *ohdr, + u32 bth0, u32 bth2) +{ + struct qib_ibport *ibp = to_iport(qp->ibqp.device, qp->port_num); + u16 lrh0; + u32 nwords; + u32 extra_bytes; + + /* Construct the header. 
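+ * extra_bytes rounds the payload up to a 4-byte boundary (e.g.
+ * s_cur_size == 13 gives extra_bytes == 3 and nwords == 4); it is
+ * also advertised in the BTH pad count via extra_bytes << 20 below.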
*/ + extra_bytes = -qp->s_cur_size & 3; + nwords = (qp->s_cur_size + extra_bytes) >> 2; + lrh0 = QIB_LRH_BTH; + if (unlikely(qp->remote_ah_attr.ah_flags & IB_AH_GRH)) { + qp->s_hdrwords += qib_make_grh(ibp, &qp->s_hdr->u.l.grh, + &qp->remote_ah_attr.grh, + qp->s_hdrwords, nwords); + lrh0 = QIB_LRH_GRH; + } + lrh0 |= ibp->sl_to_vl[qp->remote_ah_attr.sl] << 12 | + qp->remote_ah_attr.sl << 4; + qp->s_hdr->lrh[0] = cpu_to_be16(lrh0); + qp->s_hdr->lrh[1] = cpu_to_be16(qp->remote_ah_attr.dlid); + qp->s_hdr->lrh[2] = cpu_to_be16(qp->s_hdrwords + nwords + SIZE_OF_CRC); + qp->s_hdr->lrh[3] = cpu_to_be16(ppd_from_ibp(ibp)->lid | + qp->remote_ah_attr.src_path_bits); + bth0 |= qib_get_pkey(ibp, qp->s_pkey_index); + bth0 |= extra_bytes << 20; + if (qp->s_mig_state == IB_MIG_MIGRATED) + bth0 |= IB_BTH_MIG_REQ; + ohdr->bth[0] = cpu_to_be32(bth0); + ohdr->bth[1] = cpu_to_be32(qp->remote_qpn); + ohdr->bth[2] = cpu_to_be32(bth2); + this_cpu_inc(ibp->pmastats->n_unicast_xmit); +} + +/** + * qib_do_send - perform a send on a QP + * @work: contains a pointer to the QP + * + * Process entries in the send work queue until credit or queue is + * exhausted. Only allow one CPU to send a packet per QP (tasklet). + * Otherwise, two threads could send packets out of order. + */ +void qib_do_send(struct work_struct *work) +{ + struct qib_qp *qp = container_of(work, struct qib_qp, s_work); + struct qib_ibport *ibp = to_iport(qp->ibqp.device, qp->port_num); + struct qib_pportdata *ppd = ppd_from_ibp(ibp); + int (*make_req)(struct qib_qp *qp); + unsigned long flags; + + if ((qp->ibqp.qp_type == IB_QPT_RC || + qp->ibqp.qp_type == IB_QPT_UC) && + (qp->remote_ah_attr.dlid & ~((1 << ppd->lmc) - 1)) == ppd->lid) { + qib_ruc_loopback(qp); + return; + } + + if (qp->ibqp.qp_type == IB_QPT_RC) + make_req = qib_make_rc_req; + else if (qp->ibqp.qp_type == IB_QPT_UC) + make_req = qib_make_uc_req; + else + make_req = qib_make_ud_req; + + spin_lock_irqsave(&qp->s_lock, flags); + + /* Return if we are already busy processing a work request. */ + if (!qib_send_ok(qp)) { + spin_unlock_irqrestore(&qp->s_lock, flags); + return; + } + + qp->s_flags |= QIB_S_BUSY; + + spin_unlock_irqrestore(&qp->s_lock, flags); + + do { + /* Check for a constructed packet to be sent. */ + if (qp->s_hdrwords != 0) { + /* + * If the packet cannot be sent now, return and + * the send tasklet will be woken up later. + */ + if (qib_verbs_send(qp, qp->s_hdr, qp->s_hdrwords, + qp->s_cur_sge, qp->s_cur_size)) + break; + /* Record that s_hdr is empty. */ + qp->s_hdrwords = 0; + } + } while (make_req(qp)); +} + +/* + * This should be called with s_lock held. + */ +void qib_send_complete(struct qib_qp *qp, struct qib_swqe *wqe, + enum ib_wc_status status) +{ + u32 old_last, last; + unsigned i; + + if (!(ib_qib_state_ops[qp->state] & QIB_PROCESS_OR_FLUSH_SEND)) + return; + + for (i = 0; i < wqe->wr.num_sge; i++) { + struct qib_sge *sge = &wqe->sg_list[i]; + + qib_put_mr(sge->mr); + } + if (qp->ibqp.qp_type == IB_QPT_UD || + qp->ibqp.qp_type == IB_QPT_SMI || + qp->ibqp.qp_type == IB_QPT_GSI) + atomic_dec(&to_iah(wqe->wr.wr.ud.ah)->refcount); + + /* See ch. 
11.2.4.1 and 10.7.3.1 */ + if (!(qp->s_flags & QIB_S_SIGNAL_REQ_WR) || + (wqe->wr.send_flags & IB_SEND_SIGNALED) || + status != IB_WC_SUCCESS) { + struct ib_wc wc; + + memset(&wc, 0, sizeof(wc)); + wc.wr_id = wqe->wr.wr_id; + wc.status = status; + wc.opcode = ib_qib_wc_opcode[wqe->wr.opcode]; + wc.qp = &qp->ibqp; + if (status == IB_WC_SUCCESS) + wc.byte_len = wqe->length; + qib_cq_enter(to_icq(qp->ibqp.send_cq), &wc, + status != IB_WC_SUCCESS); + } + + last = qp->s_last; + old_last = last; + if (++last >= qp->s_size) + last = 0; + qp->s_last = last; + if (qp->s_acked == old_last) + qp->s_acked = last; + if (qp->s_cur == old_last) + qp->s_cur = last; + if (qp->s_tail == old_last) + qp->s_tail = last; + if (qp->state == IB_QPS_SQD && last == qp->s_cur) + qp->s_draining = 0; +} diff --git a/kernel/drivers/infiniband/hw/qib/qib_sd7220.c b/kernel/drivers/infiniband/hw/qib/qib_sd7220.c new file mode 100644 index 000000000..c72775f27 --- /dev/null +++ b/kernel/drivers/infiniband/hw/qib/qib_sd7220.c @@ -0,0 +1,1454 @@ +/* + * Copyright (c) 2013 Intel Corporation. All rights reserved. + * Copyright (c) 2006 - 2012 QLogic Corporation. All rights reserved. + * Copyright (c) 2003, 2004, 2005, 2006 PathScale, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +/* + * This file contains all of the code that is specific to the SerDes + * on the QLogic_IB 7220 chip. + */ + +#include +#include +#include +#include + +#include "qib.h" +#include "qib_7220.h" + +#define SD7220_FW_NAME "qlogic/sd7220.fw" +MODULE_FIRMWARE(SD7220_FW_NAME); + +/* + * Same as in qib_iba7220.c, but just the registers needed here. + * Could move whole set to qib_7220.h, but decided better to keep + * local. 
+ */ +#define KREG_IDX(regname) (QIB_7220_##regname##_OFFS / sizeof(u64)) +#define kr_hwerrclear KREG_IDX(HwErrClear) +#define kr_hwerrmask KREG_IDX(HwErrMask) +#define kr_hwerrstatus KREG_IDX(HwErrStatus) +#define kr_ibcstatus KREG_IDX(IBCStatus) +#define kr_ibserdesctrl KREG_IDX(IBSerDesCtrl) +#define kr_scratch KREG_IDX(Scratch) +#define kr_xgxs_cfg KREG_IDX(XGXSCfg) +/* these are used only here, not in qib_iba7220.c */ +#define kr_ibsd_epb_access_ctrl KREG_IDX(ibsd_epb_access_ctrl) +#define kr_ibsd_epb_transaction_reg KREG_IDX(ibsd_epb_transaction_reg) +#define kr_pciesd_epb_transaction_reg KREG_IDX(pciesd_epb_transaction_reg) +#define kr_pciesd_epb_access_ctrl KREG_IDX(pciesd_epb_access_ctrl) +#define kr_serdes_ddsrxeq0 KREG_IDX(SerDes_DDSRXEQ0) + +/* + * The IBSerDesMappTable is a memory that holds values to be stored in + * various SerDes registers by IBC. + */ +#define kr_serdes_maptable KREG_IDX(IBSerDesMappTable) + +/* + * Below used for sdnum parameter, selecting one of the two sections + * used for PCIe, or the single SerDes used for IB. + */ +#define PCIE_SERDES0 0 +#define PCIE_SERDES1 1 + +/* + * The EPB requires addressing in a particular form. EPB_LOC() is intended + * to make #definitions a little more readable. + */ +#define EPB_ADDR_SHF 8 +#define EPB_LOC(chn, elt, reg) \ + (((elt & 0xf) | ((chn & 7) << 4) | ((reg & 0x3f) << 9)) << \ + EPB_ADDR_SHF) +#define EPB_IB_QUAD0_CS_SHF (25) +#define EPB_IB_QUAD0_CS (1U << EPB_IB_QUAD0_CS_SHF) +#define EPB_IB_UC_CS_SHF (26) +#define EPB_PCIE_UC_CS_SHF (27) +#define EPB_GLOBAL_WR (1U << (EPB_ADDR_SHF + 8)) + +/* Forward declarations. */ +static int qib_sd7220_reg_mod(struct qib_devdata *dd, int sdnum, u32 loc, + u32 data, u32 mask); +static int ibsd_mod_allchnls(struct qib_devdata *dd, int loc, int val, + int mask); +static int qib_sd_trimdone_poll(struct qib_devdata *dd); +static void qib_sd_trimdone_monitor(struct qib_devdata *dd, const char *where); +static int qib_sd_setvals(struct qib_devdata *dd); +static int qib_sd_early(struct qib_devdata *dd); +static int qib_sd_dactrim(struct qib_devdata *dd); +static int qib_internal_presets(struct qib_devdata *dd); +/* Tweak the register (CMUCTRL5) that contains the TRIMSELF controls */ +static int qib_sd_trimself(struct qib_devdata *dd, int val); +static int epb_access(struct qib_devdata *dd, int sdnum, int claim); +static int qib_sd7220_ib_load(struct qib_devdata *dd, + const struct firmware *fw); +static int qib_sd7220_ib_vfy(struct qib_devdata *dd, + const struct firmware *fw); + +/* + * Below keeps track of whether the "once per power-on" initialization has + * been done, because uC code Version 1.32.17 or higher allows the uC to + * be reset at will, and Automatic Equalization may require it. So the + * state of the reset "pin", is no longer valid. Instead, we check for the + * actual uC code having been loaded. + */ +static int qib_ibsd_ucode_loaded(struct qib_pportdata *ppd, + const struct firmware *fw) +{ + struct qib_devdata *dd = ppd->dd; + + if (!dd->cspec->serdes_first_init_done && + qib_sd7220_ib_vfy(dd, fw) > 0) + dd->cspec->serdes_first_init_done = 1; + return dd->cspec->serdes_first_init_done; +} + +/* repeat #define for local use. 
"Real" #define is in qib_iba7220.c */ +#define QLOGIC_IB_HWE_IB_UC_MEMORYPARITYERR 0x0000004000000000ULL +#define IB_MPREG5 (EPB_LOC(6, 0, 0xE) | (1L << EPB_IB_UC_CS_SHF)) +#define IB_MPREG6 (EPB_LOC(6, 0, 0xF) | (1U << EPB_IB_UC_CS_SHF)) +#define UC_PAR_CLR_D 8 +#define UC_PAR_CLR_M 0xC +#define IB_CTRL2(chn) (EPB_LOC(chn, 7, 3) | EPB_IB_QUAD0_CS) +#define START_EQ1(chan) EPB_LOC(chan, 7, 0x27) + +void qib_sd7220_clr_ibpar(struct qib_devdata *dd) +{ + int ret; + + /* clear, then re-enable parity errs */ + ret = qib_sd7220_reg_mod(dd, IB_7220_SERDES, IB_MPREG6, + UC_PAR_CLR_D, UC_PAR_CLR_M); + if (ret < 0) { + qib_dev_err(dd, "Failed clearing IBSerDes Parity err\n"); + goto bail; + } + ret = qib_sd7220_reg_mod(dd, IB_7220_SERDES, IB_MPREG6, 0, + UC_PAR_CLR_M); + + qib_read_kreg32(dd, kr_scratch); + udelay(4); + qib_write_kreg(dd, kr_hwerrclear, + QLOGIC_IB_HWE_IB_UC_MEMORYPARITYERR); + qib_read_kreg32(dd, kr_scratch); +bail: + return; +} + +/* + * After a reset or other unusual event, the epb interface may need + * to be re-synchronized, between the host and the uC. + * returns <0 for failure to resync within IBSD_RESYNC_TRIES (not expected) + */ +#define IBSD_RESYNC_TRIES 3 +#define IB_PGUDP(chn) (EPB_LOC((chn), 2, 1) | EPB_IB_QUAD0_CS) +#define IB_CMUDONE(chn) (EPB_LOC((chn), 7, 0xF) | EPB_IB_QUAD0_CS) + +static int qib_resync_ibepb(struct qib_devdata *dd) +{ + int ret, pat, tries, chn; + u32 loc; + + ret = -1; + chn = 0; + for (tries = 0; tries < (4 * IBSD_RESYNC_TRIES); ++tries) { + loc = IB_PGUDP(chn); + ret = qib_sd7220_reg_mod(dd, IB_7220_SERDES, loc, 0, 0); + if (ret < 0) { + qib_dev_err(dd, "Failed read in resync\n"); + continue; + } + if (ret != 0xF0 && ret != 0x55 && tries == 0) + qib_dev_err(dd, "unexpected pattern in resync\n"); + pat = ret ^ 0xA5; /* alternate F0 and 55 */ + ret = qib_sd7220_reg_mod(dd, IB_7220_SERDES, loc, pat, 0xFF); + if (ret < 0) { + qib_dev_err(dd, "Failed write in resync\n"); + continue; + } + ret = qib_sd7220_reg_mod(dd, IB_7220_SERDES, loc, 0, 0); + if (ret < 0) { + qib_dev_err(dd, "Failed re-read in resync\n"); + continue; + } + if (ret != pat) { + qib_dev_err(dd, "Failed compare1 in resync\n"); + continue; + } + loc = IB_CMUDONE(chn); + ret = qib_sd7220_reg_mod(dd, IB_7220_SERDES, loc, 0, 0); + if (ret < 0) { + qib_dev_err(dd, "Failed CMUDONE rd in resync\n"); + continue; + } + if ((ret & 0x70) != ((chn << 4) | 0x40)) { + qib_dev_err(dd, "Bad CMUDONE value %02X, chn %d\n", + ret, chn); + continue; + } + if (++chn == 4) + break; /* Success */ + } + return (ret > 0) ? 0 : ret; +} + +/* + * Localize the stuff that should be done to change IB uC reset + * returns <0 for errors. + */ +static int qib_ibsd_reset(struct qib_devdata *dd, int assert_rst) +{ + u64 rst_val; + int ret = 0; + unsigned long flags; + + rst_val = qib_read_kreg64(dd, kr_ibserdesctrl); + if (assert_rst) { + /* + * Vendor recommends "interrupting" uC before reset, to + * minimize possible glitches. 
+ */ + spin_lock_irqsave(&dd->cspec->sdepb_lock, flags); + epb_access(dd, IB_7220_SERDES, 1); + rst_val |= 1ULL; + /* Squelch possible parity error from _asserting_ reset */ + qib_write_kreg(dd, kr_hwerrmask, + dd->cspec->hwerrmask & + ~QLOGIC_IB_HWE_IB_UC_MEMORYPARITYERR); + qib_write_kreg(dd, kr_ibserdesctrl, rst_val); + /* flush write, delay to ensure it took effect */ + qib_read_kreg32(dd, kr_scratch); + udelay(2); + /* once it's reset, can remove interrupt */ + epb_access(dd, IB_7220_SERDES, -1); + spin_unlock_irqrestore(&dd->cspec->sdepb_lock, flags); + } else { + /* + * Before we de-assert reset, we need to deal with + * possible glitch on the Parity-error line. + * Suppress it around the reset, both in chip-level + * hwerrmask and in IB uC control reg. uC will allow + * it again during startup. + */ + u64 val; + + rst_val &= ~(1ULL); + qib_write_kreg(dd, kr_hwerrmask, + dd->cspec->hwerrmask & + ~QLOGIC_IB_HWE_IB_UC_MEMORYPARITYERR); + + ret = qib_resync_ibepb(dd); + if (ret < 0) + qib_dev_err(dd, "unable to re-sync IB EPB\n"); + + /* set uC control regs to suppress parity errs */ + ret = qib_sd7220_reg_mod(dd, IB_7220_SERDES, IB_MPREG5, 1, 1); + if (ret < 0) + goto bail; + /* IB uC code past Version 1.32.17 allow suppression of wdog */ + ret = qib_sd7220_reg_mod(dd, IB_7220_SERDES, IB_MPREG6, 0x80, + 0x80); + if (ret < 0) { + qib_dev_err(dd, "Failed to set WDOG disable\n"); + goto bail; + } + qib_write_kreg(dd, kr_ibserdesctrl, rst_val); + /* flush write, delay for startup */ + qib_read_kreg32(dd, kr_scratch); + udelay(1); + /* clear, then re-enable parity errs */ + qib_sd7220_clr_ibpar(dd); + val = qib_read_kreg64(dd, kr_hwerrstatus); + if (val & QLOGIC_IB_HWE_IB_UC_MEMORYPARITYERR) { + qib_dev_err(dd, "IBUC Parity still set after RST\n"); + dd->cspec->hwerrmask &= + ~QLOGIC_IB_HWE_IB_UC_MEMORYPARITYERR; + } + qib_write_kreg(dd, kr_hwerrmask, + dd->cspec->hwerrmask); + } + +bail: + return ret; +} + +static void qib_sd_trimdone_monitor(struct qib_devdata *dd, + const char *where) +{ + int ret, chn, baduns; + u64 val; + + if (!where) + where = "?"; + + /* give time for reset to settle out in EPB */ + udelay(2); + + ret = qib_resync_ibepb(dd); + if (ret < 0) + qib_dev_err(dd, "not able to re-sync IB EPB (%s)\n", where); + + /* Do "sacrificial read" to get EPB in sane state after reset */ + ret = qib_sd7220_reg_mod(dd, IB_7220_SERDES, IB_CTRL2(0), 0, 0); + if (ret < 0) + qib_dev_err(dd, "Failed TRIMDONE 1st read, (%s)\n", where); + + /* Check/show "summary" Trim-done bit in IBCStatus */ + val = qib_read_kreg64(dd, kr_ibcstatus); + if (!(val & (1ULL << 11))) + qib_dev_err(dd, "IBCS TRIMDONE clear (%s)\n", where); + /* + * Do "dummy read/mod/wr" to get EPB in sane state after reset + * The default value for MPREG6 is 0. + */ + udelay(2); + + ret = qib_sd7220_reg_mod(dd, IB_7220_SERDES, IB_MPREG6, 0x80, 0x80); + if (ret < 0) + qib_dev_err(dd, "Failed Dummy RMW, (%s)\n", where); + udelay(10); + + baduns = 0; + + for (chn = 3; chn >= 0; --chn) { + /* Read CTRL reg for each channel to check TRIMDONE */ + ret = qib_sd7220_reg_mod(dd, IB_7220_SERDES, + IB_CTRL2(chn), 0, 0); + if (ret < 0) + qib_dev_err(dd, + "Failed checking TRIMDONE, chn %d (%s)\n", + chn, where); + + if (!(ret & 0x10)) { + int probe; + + baduns |= (1 << chn); + qib_dev_err(dd, + "TRIMDONE cleared on chn %d (%02X). 
(%s)\n", + chn, ret, where); + probe = qib_sd7220_reg_mod(dd, IB_7220_SERDES, + IB_PGUDP(0), 0, 0); + qib_dev_err(dd, "probe is %d (%02X)\n", + probe, probe); + probe = qib_sd7220_reg_mod(dd, IB_7220_SERDES, + IB_CTRL2(chn), 0, 0); + qib_dev_err(dd, "re-read: %d (%02X)\n", + probe, probe); + ret = qib_sd7220_reg_mod(dd, IB_7220_SERDES, + IB_CTRL2(chn), 0x10, 0x10); + if (ret < 0) + qib_dev_err(dd, + "Err on TRIMDONE rewrite1\n"); + } + } + for (chn = 3; chn >= 0; --chn) { + /* Read CTRL reg for each channel to check TRIMDONE */ + if (baduns & (1 << chn)) { + qib_dev_err(dd, + "Resetting TRIMDONE on chn %d (%s)\n", + chn, where); + ret = qib_sd7220_reg_mod(dd, IB_7220_SERDES, + IB_CTRL2(chn), 0x10, 0x10); + if (ret < 0) + qib_dev_err(dd, + "Failed re-setting TRIMDONE, chn %d (%s)\n", + chn, where); + } + } +} + +/* + * Below is portion of IBA7220-specific bringup_serdes() that actually + * deals with registers and memory within the SerDes itself. + * Post IB uC code version 1.32.17, was_reset being 1 is not really + * informative, so we double-check. + */ +int qib_sd7220_init(struct qib_devdata *dd) +{ + const struct firmware *fw; + int ret = 1; /* default to failure */ + int first_reset, was_reset; + + /* SERDES MPU reset recorded in D0 */ + was_reset = (qib_read_kreg64(dd, kr_ibserdesctrl) & 1); + if (!was_reset) { + /* entered with reset not asserted, we need to do it */ + qib_ibsd_reset(dd, 1); + qib_sd_trimdone_monitor(dd, "Driver-reload"); + } + + ret = request_firmware(&fw, SD7220_FW_NAME, &dd->pcidev->dev); + if (ret) { + qib_dev_err(dd, "Failed to load IB SERDES image\n"); + goto done; + } + + /* Substitute our deduced value for was_reset */ + ret = qib_ibsd_ucode_loaded(dd->pport, fw); + if (ret < 0) + goto bail; + + first_reset = !ret; /* First reset if IBSD uCode not yet loaded */ + /* + * Alter some regs per vendor latest doc, reset-defaults + * are not right for IB. + */ + ret = qib_sd_early(dd); + if (ret < 0) { + qib_dev_err(dd, "Failed to set IB SERDES early defaults\n"); + goto bail; + } + /* + * Set DAC manual trim IB. + * We only do this once after chip has been reset (usually + * same as once per system boot). + */ + if (first_reset) { + ret = qib_sd_dactrim(dd); + if (ret < 0) { + qib_dev_err(dd, "Failed IB SERDES DAC trim\n"); + goto bail; + } + } + /* + * Set various registers (DDS and RXEQ) that will be + * controlled by IBC (in 1.2 mode) to reasonable preset values + * Calling the "internal" version avoids the "check for needed" + * and "trimdone monitor" that might be counter-productive. + */ + ret = qib_internal_presets(dd); + if (ret < 0) { + qib_dev_err(dd, "Failed to set IB SERDES presets\n"); + goto bail; + } + ret = qib_sd_trimself(dd, 0x80); + if (ret < 0) { + qib_dev_err(dd, "Failed to set IB SERDES TRIMSELF\n"); + goto bail; + } + + /* Load image, then try to verify */ + ret = 0; /* Assume success */ + if (first_reset) { + int vfy; + int trim_done; + + ret = qib_sd7220_ib_load(dd, fw); + if (ret < 0) { + qib_dev_err(dd, "Failed to load IB SERDES image\n"); + goto bail; + } else { + /* Loaded image, try to verify */ + vfy = qib_sd7220_ib_vfy(dd, fw); + if (vfy != ret) { + qib_dev_err(dd, "SERDES PRAM VFY failed\n"); + goto bail; + } /* end if verified */ + } /* end if loaded */ + + /* + * Loaded and verified. Almost good... + * hold "success" in ret + */ + ret = 0; + /* + * Prev steps all worked, continue bringup + * De-assert RESET to uC, only in first reset, to allow + * trimming. 
+ * + * Since our default setup sets START_EQ1 to + * PRESET, we need to clear that for this very first run. + */ + ret = ibsd_mod_allchnls(dd, START_EQ1(0), 0, 0x38); + if (ret < 0) { + qib_dev_err(dd, "Failed clearing START_EQ1\n"); + goto bail; + } + + qib_ibsd_reset(dd, 0); + /* + * If this is not the first reset, trimdone should be set + * already. We may need to check about this. + */ + trim_done = qib_sd_trimdone_poll(dd); + /* + * Whether or not trimdone succeeded, we need to put the + * uC back into reset to avoid a possible fight with the + * IBC state-machine. + */ + qib_ibsd_reset(dd, 1); + + if (!trim_done) { + qib_dev_err(dd, "No TRIMDONE seen\n"); + goto bail; + } + /* + * DEBUG: check each time we reset if trimdone bits have + * gotten cleared, and re-set them. + */ + qib_sd_trimdone_monitor(dd, "First-reset"); + /* Remember so we do not re-do the load, dactrim, etc. */ + dd->cspec->serdes_first_init_done = 1; + } + /* + * setup for channel training and load values for + * RxEq and DDS in tables used by IBC in IB1.2 mode + */ + ret = 0; + if (qib_sd_setvals(dd) >= 0) + goto done; +bail: + ret = 1; +done: + /* start relock timer regardless, but start at 1 second */ + set_7220_relock_poll(dd, -1); + + release_firmware(fw); + return ret; +} + +#define EPB_ACC_REQ 1 +#define EPB_ACC_GNT 0x100 +#define EPB_DATA_MASK 0xFF +#define EPB_RD (1ULL << 24) +#define EPB_TRANS_RDY (1ULL << 31) +#define EPB_TRANS_ERR (1ULL << 30) +#define EPB_TRANS_TRIES 5 + +/* + * query, claim, release ownership of the EPB (External Parallel Bus) + * for a specified SERDES. + * the "claim" parameter is >0 to claim, <0 to release, 0 to query. + * Returns <0 for errors, >0 if we had ownership, else 0. + */ +static int epb_access(struct qib_devdata *dd, int sdnum, int claim) +{ + u16 acc; + u64 accval; + int owned = 0; + u64 oct_sel = 0; + + switch (sdnum) { + case IB_7220_SERDES: + /* + * The IB SERDES "ownership" is fairly simple. A single each + * request/grant. + */ + acc = kr_ibsd_epb_access_ctrl; + break; + + case PCIE_SERDES0: + case PCIE_SERDES1: + /* PCIe SERDES has two "octants", need to select which */ + acc = kr_pciesd_epb_access_ctrl; + oct_sel = (2 << (sdnum - PCIE_SERDES0)); + break; + + default: + return 0; + } + + /* Make sure any outstanding transaction was seen */ + qib_read_kreg32(dd, kr_scratch); + udelay(15); + + accval = qib_read_kreg32(dd, acc); + + owned = !!(accval & EPB_ACC_GNT); + if (claim < 0) { + /* Need to release */ + u64 pollval; + /* + * The only writeable bits are the request and CS. 
+ * Both should be clear + */ + u64 newval = 0; + + qib_write_kreg(dd, acc, newval); + /* First read after write is not trustworthy */ + pollval = qib_read_kreg32(dd, acc); + udelay(5); + pollval = qib_read_kreg32(dd, acc); + if (pollval & EPB_ACC_GNT) + owned = -1; + } else if (claim > 0) { + /* Need to claim */ + u64 pollval; + u64 newval = EPB_ACC_REQ | oct_sel; + + qib_write_kreg(dd, acc, newval); + /* First read after write is not trustworthy */ + pollval = qib_read_kreg32(dd, acc); + udelay(5); + pollval = qib_read_kreg32(dd, acc); + if (!(pollval & EPB_ACC_GNT)) + owned = -1; + } + return owned; +} + +/* + * Lemma to deal with race condition of write..read to epb regs + */ +static int epb_trans(struct qib_devdata *dd, u16 reg, u64 i_val, u64 *o_vp) +{ + int tries; + u64 transval; + + qib_write_kreg(dd, reg, i_val); + /* Throw away first read, as RDY bit may be stale */ + transval = qib_read_kreg64(dd, reg); + + for (tries = EPB_TRANS_TRIES; tries; --tries) { + transval = qib_read_kreg32(dd, reg); + if (transval & EPB_TRANS_RDY) + break; + udelay(5); + } + if (transval & EPB_TRANS_ERR) + return -1; + if (tries > 0 && o_vp) + *o_vp = transval; + return tries; +} + +/** + * qib_sd7220_reg_mod - modify SERDES register + * @dd: the qlogic_ib device + * @sdnum: which SERDES to access + * @loc: location - channel, element, register, as packed by EPB_LOC() macro. + * @wd: Write Data - value to set in register + * @mask: ones where data should be spliced into reg. + * + * Basic register read/modify/write, with un-needed acesses elided. That is, + * a mask of zero will prevent write, while a mask of 0xFF will prevent read. + * returns current (presumed, if a write was done) contents of selected + * register, or <0 if errors. + */ +static int qib_sd7220_reg_mod(struct qib_devdata *dd, int sdnum, u32 loc, + u32 wd, u32 mask) +{ + u16 trans; + u64 transval; + int owned; + int tries, ret; + unsigned long flags; + + switch (sdnum) { + case IB_7220_SERDES: + trans = kr_ibsd_epb_transaction_reg; + break; + + case PCIE_SERDES0: + case PCIE_SERDES1: + trans = kr_pciesd_epb_transaction_reg; + break; + + default: + return -1; + } + + /* + * All access is locked in software (vs other host threads) and + * hardware (vs uC access). + */ + spin_lock_irqsave(&dd->cspec->sdepb_lock, flags); + + owned = epb_access(dd, sdnum, 1); + if (owned < 0) { + spin_unlock_irqrestore(&dd->cspec->sdepb_lock, flags); + return -1; + } + ret = 0; + for (tries = EPB_TRANS_TRIES; tries; --tries) { + transval = qib_read_kreg32(dd, trans); + if (transval & EPB_TRANS_RDY) + break; + udelay(5); + } + + if (tries > 0) { + tries = 1; /* to make read-skip work */ + if (mask != 0xFF) { + /* + * Not a pure write, so need to read. + * loc encodes chip-select as well as address + */ + transval = loc | EPB_RD; + tries = epb_trans(dd, trans, transval, &transval); + } + if (tries > 0 && mask != 0) { + /* + * Not a pure read, so need to write. + */ + wd = (wd & mask) | (transval & ~mask); + transval = loc | (wd & EPB_DATA_MASK); + tries = epb_trans(dd, trans, transval, &transval); + } + } + /* else, failed to see ready, what error-handling? */ + + /* + * Release bus. Failure is an error. + */ + if (epb_access(dd, sdnum, -1) < 0) + ret = -1; + else + ret = transval & EPB_DATA_MASK; + + spin_unlock_irqrestore(&dd->cspec->sdepb_lock, flags); + if (tries <= 0) + ret = -1; + return ret; +} + +#define EPB_ROM_R (2) +#define EPB_ROM_W (1) +/* + * Below, all uC-related, use appropriate UC_CS, depending + * on which SerDes is used. 
+ */ +#define EPB_UC_CTL EPB_LOC(6, 0, 0) +#define EPB_MADDRL EPB_LOC(6, 0, 2) +#define EPB_MADDRH EPB_LOC(6, 0, 3) +#define EPB_ROMDATA EPB_LOC(6, 0, 4) +#define EPB_RAMDATA EPB_LOC(6, 0, 5) + +/* Transfer date to/from uC Program RAM of IB or PCIe SerDes */ +static int qib_sd7220_ram_xfer(struct qib_devdata *dd, int sdnum, u32 loc, + u8 *buf, int cnt, int rd_notwr) +{ + u16 trans; + u64 transval; + u64 csbit; + int owned; + int tries; + int sofar; + int addr; + int ret; + unsigned long flags; + const char *op; + + /* Pick appropriate transaction reg and "Chip select" for this serdes */ + switch (sdnum) { + case IB_7220_SERDES: + csbit = 1ULL << EPB_IB_UC_CS_SHF; + trans = kr_ibsd_epb_transaction_reg; + break; + + case PCIE_SERDES0: + case PCIE_SERDES1: + /* PCIe SERDES has uC "chip select" in different bit, too */ + csbit = 1ULL << EPB_PCIE_UC_CS_SHF; + trans = kr_pciesd_epb_transaction_reg; + break; + + default: + return -1; + } + + op = rd_notwr ? "Rd" : "Wr"; + spin_lock_irqsave(&dd->cspec->sdepb_lock, flags); + + owned = epb_access(dd, sdnum, 1); + if (owned < 0) { + spin_unlock_irqrestore(&dd->cspec->sdepb_lock, flags); + return -1; + } + + /* + * In future code, we may need to distinguish several address ranges, + * and select various memories based on this. For now, just trim + * "loc" (location including address and memory select) to + * "addr" (address within memory). we will only support PRAM + * The memory is 8KB. + */ + addr = loc & 0x1FFF; + for (tries = EPB_TRANS_TRIES; tries; --tries) { + transval = qib_read_kreg32(dd, trans); + if (transval & EPB_TRANS_RDY) + break; + udelay(5); + } + + sofar = 0; + if (tries > 0) { + /* + * Every "memory" access is doubly-indirect. + * We set two bytes of address, then read/write + * one or mores bytes of data. + */ + + /* First, we set control to "Read" or "Write" */ + transval = csbit | EPB_UC_CTL | + (rd_notwr ? EPB_ROM_R : EPB_ROM_W); + tries = epb_trans(dd, trans, transval, &transval); + while (tries > 0 && sofar < cnt) { + if (!sofar) { + /* Only set address at start of chunk */ + int addrbyte = (addr + sofar) >> 8; + + transval = csbit | EPB_MADDRH | addrbyte; + tries = epb_trans(dd, trans, transval, + &transval); + if (tries <= 0) + break; + addrbyte = (addr + sofar) & 0xFF; + transval = csbit | EPB_MADDRL | addrbyte; + tries = epb_trans(dd, trans, transval, + &transval); + if (tries <= 0) + break; + } + + if (rd_notwr) + transval = csbit | EPB_ROMDATA | EPB_RD; + else + transval = csbit | EPB_ROMDATA | buf[sofar]; + tries = epb_trans(dd, trans, transval, &transval); + if (tries <= 0) + break; + if (rd_notwr) + buf[sofar] = transval & EPB_DATA_MASK; + ++sofar; + } + /* Finally, clear control-bit for Read or Write */ + transval = csbit | EPB_UC_CTL; + tries = epb_trans(dd, trans, transval, &transval); + } + + ret = sofar; + /* Release bus. 
Failure is an error */ + if (epb_access(dd, sdnum, -1) < 0) + ret = -1; + + spin_unlock_irqrestore(&dd->cspec->sdepb_lock, flags); + if (tries <= 0) + ret = -1; + return ret; +} + +#define PROG_CHUNK 64 + +static int qib_sd7220_prog_ld(struct qib_devdata *dd, int sdnum, + const u8 *img, int len, int offset) +{ + int cnt, sofar, req; + + sofar = 0; + while (sofar < len) { + req = len - sofar; + if (req > PROG_CHUNK) + req = PROG_CHUNK; + cnt = qib_sd7220_ram_xfer(dd, sdnum, offset + sofar, + (u8 *)img + sofar, req, 0); + if (cnt < req) { + sofar = -1; + break; + } + sofar += req; + } + return sofar; +} + +#define VFY_CHUNK 64 +#define SD_PRAM_ERROR_LIMIT 42 + +static int qib_sd7220_prog_vfy(struct qib_devdata *dd, int sdnum, + const u8 *img, int len, int offset) +{ + int cnt, sofar, req, idx, errors; + unsigned char readback[VFY_CHUNK]; + + errors = 0; + sofar = 0; + while (sofar < len) { + req = len - sofar; + if (req > VFY_CHUNK) + req = VFY_CHUNK; + cnt = qib_sd7220_ram_xfer(dd, sdnum, sofar + offset, + readback, req, 1); + if (cnt < req) { + /* failed in read itself */ + sofar = -1; + break; + } + for (idx = 0; idx < cnt; ++idx) { + if (readback[idx] != img[idx+sofar]) + ++errors; + } + sofar += cnt; + } + return errors ? -errors : sofar; +} + +static int +qib_sd7220_ib_load(struct qib_devdata *dd, const struct firmware *fw) +{ + return qib_sd7220_prog_ld(dd, IB_7220_SERDES, fw->data, fw->size, 0); +} + +static int +qib_sd7220_ib_vfy(struct qib_devdata *dd, const struct firmware *fw) +{ + return qib_sd7220_prog_vfy(dd, IB_7220_SERDES, fw->data, fw->size, 0); +} + +/* + * IRQ not set up at this point in init, so we poll. + */ +#define IB_SERDES_TRIM_DONE (1ULL << 11) +#define TRIM_TMO (15) + +static int qib_sd_trimdone_poll(struct qib_devdata *dd) +{ + int trim_tmo, ret; + uint64_t val; + + /* + * Default to failure, so IBC will not start + * without IB_SERDES_TRIM_DONE. + */ + ret = 0; + for (trim_tmo = 0; trim_tmo < TRIM_TMO; ++trim_tmo) { + val = qib_read_kreg64(dd, kr_ibcstatus); + if (val & IB_SERDES_TRIM_DONE) { + ret = 1; + break; + } + msleep(20); + } + if (trim_tmo >= TRIM_TMO) { + qib_dev_err(dd, "No TRIMDONE in %d tries\n", trim_tmo); + ret = 0; + } + return ret; +} + +#define TX_FAST_ELT (9) + +/* + * Set the "negotiation" values for SERDES. These are used by the IB1.2 + * link negotiation. Macros below are attempt to keep the values a + * little more human-editable. + * First, values related to Drive De-emphasis Settings. 
+ */ + +#define NUM_DDS_REGS 6 +#define DDS_REG_MAP 0x76A910 /* LSB-first list of regs (in elt 9) to mod */ + +#define DDS_VAL(amp_d, main_d, ipst_d, ipre_d, amp_s, main_s, ipst_s, ipre_s) \ + { { ((amp_d & 0x1F) << 1) | 1, ((amp_s & 0x1F) << 1) | 1, \ + (main_d << 3) | 4 | (ipre_d >> 2), \ + (main_s << 3) | 4 | (ipre_s >> 2), \ + ((ipst_d & 0xF) << 1) | ((ipre_d & 3) << 6) | 0x21, \ + ((ipst_s & 0xF) << 1) | ((ipre_s & 3) << 6) | 0x21 } } + +static struct dds_init { + uint8_t reg_vals[NUM_DDS_REGS]; +} dds_init_vals[] = { + /* DDR(FDR) SDR(HDR) */ + /* Vendor recommends below for 3m cable */ +#define DDS_3M 0 + DDS_VAL(31, 19, 12, 0, 29, 22, 9, 0), + DDS_VAL(31, 12, 15, 4, 31, 15, 15, 1), + DDS_VAL(31, 13, 15, 3, 31, 16, 15, 0), + DDS_VAL(31, 14, 15, 2, 31, 17, 14, 0), + DDS_VAL(31, 15, 15, 1, 31, 18, 13, 0), + DDS_VAL(31, 16, 15, 0, 31, 19, 12, 0), + DDS_VAL(31, 17, 14, 0, 31, 20, 11, 0), + DDS_VAL(31, 18, 13, 0, 30, 21, 10, 0), + DDS_VAL(31, 20, 11, 0, 28, 23, 8, 0), + DDS_VAL(31, 21, 10, 0, 27, 24, 7, 0), + DDS_VAL(31, 22, 9, 0, 26, 25, 6, 0), + DDS_VAL(30, 23, 8, 0, 25, 26, 5, 0), + DDS_VAL(29, 24, 7, 0, 23, 27, 4, 0), + /* Vendor recommends below for 1m cable */ +#define DDS_1M 13 + DDS_VAL(28, 25, 6, 0, 21, 28, 3, 0), + DDS_VAL(27, 26, 5, 0, 19, 29, 2, 0), + DDS_VAL(25, 27, 4, 0, 17, 30, 1, 0) +}; + +/* + * Now the RXEQ section of the table. + */ +/* Hardware packs an element number and register address thus: */ +#define RXEQ_INIT_RDESC(elt, addr) (((elt) & 0xF) | ((addr) << 4)) +#define RXEQ_VAL(elt, adr, val0, val1, val2, val3) \ + {RXEQ_INIT_RDESC((elt), (adr)), {(val0), (val1), (val2), (val3)} } + +#define RXEQ_VAL_ALL(elt, adr, val) \ + {RXEQ_INIT_RDESC((elt), (adr)), {(val), (val), (val), (val)} } + +#define RXEQ_SDR_DFELTH 0 +#define RXEQ_SDR_TLTH 0 +#define RXEQ_SDR_G1CNT_Z1CNT 0x11 +#define RXEQ_SDR_ZCNT 23 + +static struct rxeq_init { + u16 rdesc; /* in form used in SerDesDDSRXEQ */ + u8 rdata[4]; +} rxeq_init_vals[] = { + /* Set Rcv Eq. to Preset node */ + RXEQ_VAL_ALL(7, 0x27, 0x10), + /* Set DFELTHFDR/HDR thresholds */ + RXEQ_VAL(7, 8, 0, 0, 0, 0), /* FDR, was 0, 1, 2, 3 */ + RXEQ_VAL(7, 0x21, 0, 0, 0, 0), /* HDR */ + /* Set TLTHFDR/HDR theshold */ + RXEQ_VAL(7, 9, 2, 2, 2, 2), /* FDR, was 0, 2, 4, 6 */ + RXEQ_VAL(7, 0x23, 2, 2, 2, 2), /* HDR, was 0, 1, 2, 3 */ + /* Set Preamp setting 2 (ZFR/ZCNT) */ + RXEQ_VAL(7, 0x1B, 12, 12, 12, 12), /* FDR, was 12, 16, 20, 24 */ + RXEQ_VAL(7, 0x1C, 12, 12, 12, 12), /* HDR, was 12, 16, 20, 24 */ + /* Set Preamp DC gain and Setting 1 (GFR/GHR) */ + RXEQ_VAL(7, 0x1E, 16, 16, 16, 16), /* FDR, was 16, 17, 18, 20 */ + RXEQ_VAL(7, 0x1F, 16, 16, 16, 16), /* HDR, was 16, 17, 18, 20 */ + /* Toggle RELOCK (in VCDL_CTRL0) to lock to data */ + RXEQ_VAL_ALL(6, 6, 0x20), /* Set D5 High */ + RXEQ_VAL_ALL(6, 6, 0), /* Set D5 Low */ +}; + +/* There are 17 values from vendor, but IBC only accesses the first 16 */ +#define DDS_ROWS (16) +#define RXEQ_ROWS ARRAY_SIZE(rxeq_init_vals) + +static int qib_sd_setvals(struct qib_devdata *dd) +{ + int idx, midx; + int min_idx; /* Minimum index for this portion of table */ + uint32_t dds_reg_map; + u64 __iomem *taddr, *iaddr; + uint64_t data; + uint64_t sdctl; + + taddr = dd->kregbase + kr_serdes_maptable; + iaddr = dd->kregbase + kr_serdes_ddsrxeq0; + + /* + * Init the DDS section of the table. + * Each "row" of the table provokes NUM_DDS_REG writes, to the + * registers indicated in DDS_REG_MAP. 
+ */ + sdctl = qib_read_kreg64(dd, kr_ibserdesctrl); + sdctl = (sdctl & ~(0x1f << 8)) | (NUM_DDS_REGS << 8); + sdctl = (sdctl & ~(0x1f << 13)) | (RXEQ_ROWS << 13); + qib_write_kreg(dd, kr_ibserdesctrl, sdctl); + + /* + * Iterate down table within loop for each register to store. + */ + dds_reg_map = DDS_REG_MAP; + for (idx = 0; idx < NUM_DDS_REGS; ++idx) { + data = ((dds_reg_map & 0xF) << 4) | TX_FAST_ELT; + writeq(data, iaddr + idx); + mmiowb(); + qib_read_kreg32(dd, kr_scratch); + dds_reg_map >>= 4; + for (midx = 0; midx < DDS_ROWS; ++midx) { + u64 __iomem *daddr = taddr + ((midx << 4) + idx); + + data = dds_init_vals[midx].reg_vals[idx]; + writeq(data, daddr); + mmiowb(); + qib_read_kreg32(dd, kr_scratch); + } /* End inner for (vals for this reg, each row) */ + } /* end outer for (regs to be stored) */ + + /* + * Init the RXEQ section of the table. + * This runs in a different order, as the pattern of + * register references is more complex, but there are only + * four "data" values per register. + */ + min_idx = idx; /* RXEQ indices pick up where DDS left off */ + taddr += 0x100; /* RXEQ data is in second half of table */ + /* Iterate through RXEQ register addresses */ + for (idx = 0; idx < RXEQ_ROWS; ++idx) { + int didx; /* "destination" */ + int vidx; + + /* didx is offset by min_idx to address RXEQ range of regs */ + didx = idx + min_idx; + /* Store the next RXEQ register address */ + writeq(rxeq_init_vals[idx].rdesc, iaddr + didx); + mmiowb(); + qib_read_kreg32(dd, kr_scratch); + /* Iterate through RXEQ values */ + for (vidx = 0; vidx < 4; vidx++) { + data = rxeq_init_vals[idx].rdata[vidx]; + writeq(data, taddr + (vidx << 6) + idx); + mmiowb(); + qib_read_kreg32(dd, kr_scratch); + } + } /* end outer for (Reg-writes for RXEQ) */ + return 0; +} + +#define CMUCTRL5 EPB_LOC(7, 0, 0x15) +#define RXHSCTRL0(chan) EPB_LOC(chan, 6, 0) +#define VCDL_DAC2(chan) EPB_LOC(chan, 6, 5) +#define VCDL_CTRL0(chan) EPB_LOC(chan, 6, 6) +#define VCDL_CTRL2(chan) EPB_LOC(chan, 6, 8) +#define START_EQ2(chan) EPB_LOC(chan, 7, 0x28) + +/* + * Repeat a "store" across all channels of the IB SerDes. + * Although nominally it inherits the "read value" of the last + * channel it modified, the only really useful return is <0 for + * failure, >= 0 for success. The parameter 'loc' is assumed to + * be the location in some channel of the register to be modified + * The caller can specify use of the "gang write" option of EPB, + * in which case we use the specified channel data for any fields + * not explicitely written. + */ +static int ibsd_mod_allchnls(struct qib_devdata *dd, int loc, int val, + int mask) +{ + int ret = -1; + int chnl; + + if (loc & EPB_GLOBAL_WR) { + /* + * Our caller has assured us that we can set all four + * channels at once. Trust that. If mask is not 0xFF, + * we will read the _specified_ channel for our starting + * value. 
+ */ + loc |= (1U << EPB_IB_QUAD0_CS_SHF); + chnl = (loc >> (4 + EPB_ADDR_SHF)) & 7; + if (mask != 0xFF) { + ret = qib_sd7220_reg_mod(dd, IB_7220_SERDES, + loc & ~EPB_GLOBAL_WR, 0, 0); + if (ret < 0) { + int sloc = loc >> EPB_ADDR_SHF; + + qib_dev_err(dd, + "pre-read failed: elt %d, addr 0x%X, chnl %d\n", + (sloc & 0xF), + (sloc >> 9) & 0x3f, chnl); + return ret; + } + val = (ret & ~mask) | (val & mask); + } + loc &= ~(7 << (4+EPB_ADDR_SHF)); + ret = qib_sd7220_reg_mod(dd, IB_7220_SERDES, loc, val, 0xFF); + if (ret < 0) { + int sloc = loc >> EPB_ADDR_SHF; + + qib_dev_err(dd, + "Global WR failed: elt %d, addr 0x%X, val %02X\n", + (sloc & 0xF), (sloc >> 9) & 0x3f, val); + } + return ret; + } + /* Clear "channel" and set CS so we can simply iterate */ + loc &= ~(7 << (4+EPB_ADDR_SHF)); + loc |= (1U << EPB_IB_QUAD0_CS_SHF); + for (chnl = 0; chnl < 4; ++chnl) { + int cloc = loc | (chnl << (4+EPB_ADDR_SHF)); + + ret = qib_sd7220_reg_mod(dd, IB_7220_SERDES, cloc, val, mask); + if (ret < 0) { + int sloc = loc >> EPB_ADDR_SHF; + + qib_dev_err(dd, + "Write failed: elt %d, addr 0x%X, chnl %d, val 0x%02X, mask 0x%02X\n", + (sloc & 0xF), (sloc >> 9) & 0x3f, chnl, + val & 0xFF, mask & 0xFF); + break; + } + } + return ret; +} + +/* + * Set the Tx values normally modified by IBC in IB1.2 mode to default + * values, as gotten from first row of init table. + */ +static int set_dds_vals(struct qib_devdata *dd, struct dds_init *ddi) +{ + int ret; + int idx, reg, data; + uint32_t regmap; + + regmap = DDS_REG_MAP; + for (idx = 0; idx < NUM_DDS_REGS; ++idx) { + reg = (regmap & 0xF); + regmap >>= 4; + data = ddi->reg_vals[idx]; + /* Vendor says RMW not needed for these regs, use 0xFF mask */ + ret = ibsd_mod_allchnls(dd, EPB_LOC(0, 9, reg), data, 0xFF); + if (ret < 0) + break; + } + return ret; +} + +/* + * Set the Rx values normally modified by IBC in IB1.2 mode to default + * values, as gotten from selected column of init table. + */ +static int set_rxeq_vals(struct qib_devdata *dd, int vsel) +{ + int ret; + int ridx; + int cnt = ARRAY_SIZE(rxeq_init_vals); + + for (ridx = 0; ridx < cnt; ++ridx) { + int elt, reg, val, loc; + + elt = rxeq_init_vals[ridx].rdesc & 0xF; + reg = rxeq_init_vals[ridx].rdesc >> 4; + loc = EPB_LOC(0, elt, reg); + val = rxeq_init_vals[ridx].rdata[vsel]; + /* mask of 0xFF, because hardware does full-byte store. */ + ret = ibsd_mod_allchnls(dd, loc, val, 0xFF); + if (ret < 0) + break; + } + return ret; +} + +/* + * Set the default values (row 0) for DDR Driver Demphasis. + * we do this initially and whenever we turn off IB-1.2 + * + * The "default" values for Rx equalization are also stored to + * SerDes registers. Formerly (and still default), we used set 2. + * For experimenting with cables and link-partners, we allow changing + * that via a module parameter. 
+ */ +static unsigned qib_rxeq_set = 2; +module_param_named(rxeq_default_set, qib_rxeq_set, uint, + S_IWUSR | S_IRUGO); +MODULE_PARM_DESC(rxeq_default_set, + "Which set [0..3] of Rx Equalization values is default"); + +static int qib_internal_presets(struct qib_devdata *dd) +{ + int ret = 0; + + ret = set_dds_vals(dd, dds_init_vals + DDS_3M); + + if (ret < 0) + qib_dev_err(dd, "Failed to set default DDS values\n"); + ret = set_rxeq_vals(dd, qib_rxeq_set & 3); + if (ret < 0) + qib_dev_err(dd, "Failed to set default RXEQ values\n"); + return ret; +} + +int qib_sd7220_presets(struct qib_devdata *dd) +{ + int ret = 0; + + if (!dd->cspec->presets_needed) + return ret; + dd->cspec->presets_needed = 0; + /* Assert uC reset, so we don't clash with it. */ + qib_ibsd_reset(dd, 1); + udelay(2); + qib_sd_trimdone_monitor(dd, "link-down"); + + ret = qib_internal_presets(dd); + return ret; +} + +static int qib_sd_trimself(struct qib_devdata *dd, int val) +{ + int loc = CMUCTRL5 | (1U << EPB_IB_QUAD0_CS_SHF); + + return qib_sd7220_reg_mod(dd, IB_7220_SERDES, loc, val, 0xFF); +} + +static int qib_sd_early(struct qib_devdata *dd) +{ + int ret; + + ret = ibsd_mod_allchnls(dd, RXHSCTRL0(0) | EPB_GLOBAL_WR, 0xD4, 0xFF); + if (ret < 0) + goto bail; + ret = ibsd_mod_allchnls(dd, START_EQ1(0) | EPB_GLOBAL_WR, 0x10, 0xFF); + if (ret < 0) + goto bail; + ret = ibsd_mod_allchnls(dd, START_EQ2(0) | EPB_GLOBAL_WR, 0x30, 0xFF); +bail: + return ret; +} + +#define BACTRL(chnl) EPB_LOC(chnl, 6, 0x0E) +#define LDOUTCTRL1(chnl) EPB_LOC(chnl, 7, 6) +#define RXHSSTATUS(chnl) EPB_LOC(chnl, 6, 0xF) + +static int qib_sd_dactrim(struct qib_devdata *dd) +{ + int ret; + + ret = ibsd_mod_allchnls(dd, VCDL_DAC2(0) | EPB_GLOBAL_WR, 0x2D, 0xFF); + if (ret < 0) + goto bail; + + /* more fine-tuning of what will be default */ + ret = ibsd_mod_allchnls(dd, VCDL_CTRL2(0), 3, 0xF); + if (ret < 0) + goto bail; + + ret = ibsd_mod_allchnls(dd, BACTRL(0) | EPB_GLOBAL_WR, 0x40, 0xFF); + if (ret < 0) + goto bail; + + ret = ibsd_mod_allchnls(dd, LDOUTCTRL1(0) | EPB_GLOBAL_WR, 0x04, 0xFF); + if (ret < 0) + goto bail; + + ret = ibsd_mod_allchnls(dd, RXHSSTATUS(0) | EPB_GLOBAL_WR, 0x04, 0xFF); + if (ret < 0) + goto bail; + + /* + * Delay for max possible number of steps, with slop. + * Each step is about 4usec. 
+ */ + udelay(415); + + ret = ibsd_mod_allchnls(dd, LDOUTCTRL1(0) | EPB_GLOBAL_WR, 0x00, 0xFF); + +bail: + return ret; +} + +#define RELOCK_FIRST_MS 3 +#define RXLSPPM(chan) EPB_LOC(chan, 0, 2) +void toggle_7220_rclkrls(struct qib_devdata *dd) +{ + int loc = RXLSPPM(0) | EPB_GLOBAL_WR; + int ret; + + ret = ibsd_mod_allchnls(dd, loc, 0, 0x80); + if (ret < 0) + qib_dev_err(dd, "RCLKRLS failed to clear D7\n"); + else { + udelay(1); + ibsd_mod_allchnls(dd, loc, 0x80, 0x80); + } + /* And again for good measure */ + udelay(1); + ret = ibsd_mod_allchnls(dd, loc, 0, 0x80); + if (ret < 0) + qib_dev_err(dd, "RCLKRLS failed to clear D7\n"); + else { + udelay(1); + ibsd_mod_allchnls(dd, loc, 0x80, 0x80); + } + /* Now reset xgxs and IBC to complete the recovery */ + dd->f_xgxs_reset(dd->pport); +} + +/* + * Shut down the timer that polls for relock occasions, if needed + * this is "hooked" from qib_7220_quiet_serdes(), which is called + * just before qib_shutdown_device() in qib_driver.c shuts down all + * the other timers + */ +void shutdown_7220_relock_poll(struct qib_devdata *dd) +{ + if (dd->cspec->relock_timer_active) + del_timer_sync(&dd->cspec->relock_timer); +} + +static unsigned qib_relock_by_timer = 1; +module_param_named(relock_by_timer, qib_relock_by_timer, uint, + S_IWUSR | S_IRUGO); +MODULE_PARM_DESC(relock_by_timer, "Allow relock attempt if link not up"); + +static void qib_run_relock(unsigned long opaque) +{ + struct qib_devdata *dd = (struct qib_devdata *)opaque; + struct qib_pportdata *ppd = dd->pport; + struct qib_chip_specific *cs = dd->cspec; + int timeoff; + + /* + * Check link-training state for "stuck" state, when down. + * if found, try relock and schedule another try at + * exponentially growing delay, maxed at one second. + * if not stuck, our work is done. + */ + if ((dd->flags & QIB_INITTED) && !(ppd->lflags & + (QIBL_IB_AUTONEG_INPROG | QIBL_LINKINIT | QIBL_LINKARMED | + QIBL_LINKACTIVE))) { + if (qib_relock_by_timer) { + if (!(ppd->lflags & QIBL_IB_LINK_DISABLED)) + toggle_7220_rclkrls(dd); + } + /* re-set timer for next check */ + timeoff = cs->relock_interval << 1; + if (timeoff > HZ) + timeoff = HZ; + cs->relock_interval = timeoff; + } else + timeoff = HZ; + mod_timer(&cs->relock_timer, jiffies + timeoff); +} + +void set_7220_relock_poll(struct qib_devdata *dd, int ibup) +{ + struct qib_chip_specific *cs = dd->cspec; + + if (ibup) { + /* We are now up, relax timer to 1 second interval */ + if (cs->relock_timer_active) { + cs->relock_interval = HZ; + mod_timer(&cs->relock_timer, jiffies + HZ); + } + } else { + /* Transition to down, (re-)set timer to short interval. */ + unsigned int timeout; + + timeout = msecs_to_jiffies(RELOCK_FIRST_MS); + if (timeout == 0) + timeout = 1; + /* If timer has not yet been started, do so. */ + if (!cs->relock_timer_active) { + cs->relock_timer_active = 1; + init_timer(&cs->relock_timer); + cs->relock_timer.function = qib_run_relock; + cs->relock_timer.data = (unsigned long) dd; + cs->relock_interval = timeout; + cs->relock_timer.expires = jiffies + timeout; + add_timer(&cs->relock_timer); + } else { + cs->relock_interval = timeout; + mod_timer(&cs->relock_timer, jiffies + timeout); + } + } +} diff --git a/kernel/drivers/infiniband/hw/qib/qib_sdma.c b/kernel/drivers/infiniband/hw/qib/qib_sdma.c new file mode 100644 index 000000000..c6d6a54d2 --- /dev/null +++ b/kernel/drivers/infiniband/hw/qib/qib_sdma.c @@ -0,0 +1,1039 @@ +/* + * Copyright (c) 2012 Intel Corporation. All rights reserved. + * Copyright (c) 2007 - 2012 QLogic Corporation. 
All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include +#include +#include + +#include "qib.h" +#include "qib_common.h" + +/* default pio off, sdma on */ +static ushort sdma_descq_cnt = 256; +module_param_named(sdma_descq_cnt, sdma_descq_cnt, ushort, S_IRUGO); +MODULE_PARM_DESC(sdma_descq_cnt, "Number of SDMA descq entries"); + +/* + * Bits defined in the send DMA descriptor. + */ +#define SDMA_DESC_LAST (1ULL << 11) +#define SDMA_DESC_FIRST (1ULL << 12) +#define SDMA_DESC_DMA_HEAD (1ULL << 13) +#define SDMA_DESC_USE_LARGE_BUF (1ULL << 14) +#define SDMA_DESC_INTR (1ULL << 15) +#define SDMA_DESC_COUNT_LSB 16 +#define SDMA_DESC_GEN_LSB 30 + +char *qib_sdma_state_names[] = { + [qib_sdma_state_s00_hw_down] = "s00_HwDown", + [qib_sdma_state_s10_hw_start_up_wait] = "s10_HwStartUpWait", + [qib_sdma_state_s20_idle] = "s20_Idle", + [qib_sdma_state_s30_sw_clean_up_wait] = "s30_SwCleanUpWait", + [qib_sdma_state_s40_hw_clean_up_wait] = "s40_HwCleanUpWait", + [qib_sdma_state_s50_hw_halt_wait] = "s50_HwHaltWait", + [qib_sdma_state_s99_running] = "s99_Running", +}; + +char *qib_sdma_event_names[] = { + [qib_sdma_event_e00_go_hw_down] = "e00_GoHwDown", + [qib_sdma_event_e10_go_hw_start] = "e10_GoHwStart", + [qib_sdma_event_e20_hw_started] = "e20_HwStarted", + [qib_sdma_event_e30_go_running] = "e30_GoRunning", + [qib_sdma_event_e40_sw_cleaned] = "e40_SwCleaned", + [qib_sdma_event_e50_hw_cleaned] = "e50_HwCleaned", + [qib_sdma_event_e60_hw_halted] = "e60_HwHalted", + [qib_sdma_event_e70_go_idle] = "e70_GoIdle", + [qib_sdma_event_e7220_err_halted] = "e7220_ErrHalted", + [qib_sdma_event_e7322_err_halted] = "e7322_ErrHalted", + [qib_sdma_event_e90_timer_tick] = "e90_TimerTick", +}; + +/* declare all statics here rather than keep sorting */ +static int alloc_sdma(struct qib_pportdata *); +static void sdma_complete(struct kref *); +static void sdma_finalput(struct qib_sdma_state *); +static void sdma_get(struct qib_sdma_state *); +static void sdma_put(struct qib_sdma_state *); +static void sdma_set_state(struct qib_pportdata *, enum qib_sdma_states); +static void sdma_start_sw_clean_up(struct qib_pportdata *); +static void sdma_sw_clean_up_task(unsigned long); +static void unmap_desc(struct qib_pportdata *, unsigned); + 
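+/*
+ * Illustrative sketch (editorial addition, not part of the upstream
+ * driver): shows how the low qword of a send DMA descriptor is packed
+ * from the SDMA_DESC_* constants above.  It mirrors the field packing
+ * done by make_sdma_desc() below; the FIRST/LAST/INTR flag bits are
+ * OR'ed in separately by qib_sdma_verbs_send().  The helper name is
+ * hypothetical and nothing in the driver calls it.
+ */
+static inline u64 __maybe_unused sdma_desc_qw0_sketch(u64 gen, u64 dwlen,
+						       u64 dwoffset)
+{
+	return ((gen & 3ULL) << SDMA_DESC_GEN_LSB) |	      /* SDmaGeneration[1:0] */
+	       ((dwlen & 0x7ffULL) << SDMA_DESC_COUNT_LSB) |  /* SDmaDwordCount[10:0] */
+	       (dwoffset & 0x7ffULL);			      /* SDmaBufOffset[12:2] */
+}
+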
+static void sdma_get(struct qib_sdma_state *ss) +{ + kref_get(&ss->kref); +} + +static void sdma_complete(struct kref *kref) +{ + struct qib_sdma_state *ss = + container_of(kref, struct qib_sdma_state, kref); + + complete(&ss->comp); +} + +static void sdma_put(struct qib_sdma_state *ss) +{ + kref_put(&ss->kref, sdma_complete); +} + +static void sdma_finalput(struct qib_sdma_state *ss) +{ + sdma_put(ss); + wait_for_completion(&ss->comp); +} + +/* + * Complete all the sdma requests on the active list, in the correct + * order, and with appropriate processing. Called when cleaning up + * after sdma shutdown, and when new sdma requests are submitted for + * a link that is down. This matches what is done for requests + * that complete normally, it's just the full list. + * + * Must be called with sdma_lock held + */ +static void clear_sdma_activelist(struct qib_pportdata *ppd) +{ + struct qib_sdma_txreq *txp, *txp_next; + + list_for_each_entry_safe(txp, txp_next, &ppd->sdma_activelist, list) { + list_del_init(&txp->list); + if (txp->flags & QIB_SDMA_TXREQ_F_FREEDESC) { + unsigned idx; + + idx = txp->start_idx; + while (idx != txp->next_descq_idx) { + unmap_desc(ppd, idx); + if (++idx == ppd->sdma_descq_cnt) + idx = 0; + } + } + if (txp->callback) + (*txp->callback)(txp, QIB_SDMA_TXREQ_S_ABORTED); + } +} + +static void sdma_sw_clean_up_task(unsigned long opaque) +{ + struct qib_pportdata *ppd = (struct qib_pportdata *) opaque; + unsigned long flags; + + spin_lock_irqsave(&ppd->sdma_lock, flags); + + /* + * At this point, the following should always be true: + * - We are halted, so no more descriptors are getting retired. + * - We are not running, so no one is submitting new work. + * - Only we can send the e40_sw_cleaned, so we can't start + * running again until we say so. So, the active list and + * descq are ours to play with. + */ + + /* Process all retired requests. */ + qib_sdma_make_progress(ppd); + + clear_sdma_activelist(ppd); + + /* + * Resync count of added and removed. It is VERY important that + * sdma_descq_removed NEVER decrement - user_sdma depends on it. + */ + ppd->sdma_descq_removed = ppd->sdma_descq_added; + + /* + * Reset our notion of head and tail. + * Note that the HW registers will be reset when switching states + * due to calling __qib_sdma_process_event() below. + */ + ppd->sdma_descq_tail = 0; + ppd->sdma_descq_head = 0; + ppd->sdma_head_dma[0] = 0; + ppd->sdma_generation = 0; + + __qib_sdma_process_event(ppd, qib_sdma_event_e40_sw_cleaned); + + spin_unlock_irqrestore(&ppd->sdma_lock, flags); +} + +/* + * This is called when changing to state qib_sdma_state_s10_hw_start_up_wait + * as a result of send buffer errors or send DMA descriptor errors. + * We want to disarm the buffers in these cases. + */ +static void sdma_hw_start_up(struct qib_pportdata *ppd) +{ + struct qib_sdma_state *ss = &ppd->sdma_state; + unsigned bufno; + + for (bufno = ss->first_sendbuf; bufno < ss->last_sendbuf; ++bufno) + ppd->dd->f_sendctrl(ppd, QIB_SENDCTRL_DISARM_BUF(bufno)); + + ppd->dd->f_sdma_hw_start_up(ppd); +} + +static void sdma_sw_tear_down(struct qib_pportdata *ppd) +{ + struct qib_sdma_state *ss = &ppd->sdma_state; + + /* Releasing this reference means the state machine has stopped. 
*/ + sdma_put(ss); +} + +static void sdma_start_sw_clean_up(struct qib_pportdata *ppd) +{ + tasklet_hi_schedule(&ppd->sdma_sw_clean_up_task); +} + +static void sdma_set_state(struct qib_pportdata *ppd, + enum qib_sdma_states next_state) +{ + struct qib_sdma_state *ss = &ppd->sdma_state; + struct sdma_set_state_action *action = ss->set_state_action; + unsigned op = 0; + + /* debugging bookkeeping */ + ss->previous_state = ss->current_state; + ss->previous_op = ss->current_op; + + ss->current_state = next_state; + + if (action[next_state].op_enable) + op |= QIB_SDMA_SENDCTRL_OP_ENABLE; + + if (action[next_state].op_intenable) + op |= QIB_SDMA_SENDCTRL_OP_INTENABLE; + + if (action[next_state].op_halt) + op |= QIB_SDMA_SENDCTRL_OP_HALT; + + if (action[next_state].op_drain) + op |= QIB_SDMA_SENDCTRL_OP_DRAIN; + + if (action[next_state].go_s99_running_tofalse) + ss->go_s99_running = 0; + + if (action[next_state].go_s99_running_totrue) + ss->go_s99_running = 1; + + ss->current_op = op; + + ppd->dd->f_sdma_sendctrl(ppd, ss->current_op); +} + +static void unmap_desc(struct qib_pportdata *ppd, unsigned head) +{ + __le64 *descqp = &ppd->sdma_descq[head].qw[0]; + u64 desc[2]; + dma_addr_t addr; + size_t len; + + desc[0] = le64_to_cpu(descqp[0]); + desc[1] = le64_to_cpu(descqp[1]); + + addr = (desc[1] << 32) | (desc[0] >> 32); + len = (desc[0] >> 14) & (0x7ffULL << 2); + dma_unmap_single(&ppd->dd->pcidev->dev, addr, len, DMA_TO_DEVICE); +} + +static int alloc_sdma(struct qib_pportdata *ppd) +{ + ppd->sdma_descq_cnt = sdma_descq_cnt; + if (!ppd->sdma_descq_cnt) + ppd->sdma_descq_cnt = 256; + + /* Allocate memory for SendDMA descriptor FIFO */ + ppd->sdma_descq = dma_alloc_coherent(&ppd->dd->pcidev->dev, + ppd->sdma_descq_cnt * sizeof(u64[2]), &ppd->sdma_descq_phys, + GFP_KERNEL); + + if (!ppd->sdma_descq) { + qib_dev_err(ppd->dd, + "failed to allocate SendDMA descriptor FIFO memory\n"); + goto bail; + } + + /* Allocate memory for DMA of head register to memory */ + ppd->sdma_head_dma = dma_alloc_coherent(&ppd->dd->pcidev->dev, + PAGE_SIZE, &ppd->sdma_head_phys, GFP_KERNEL); + if (!ppd->sdma_head_dma) { + qib_dev_err(ppd->dd, + "failed to allocate SendDMA head memory\n"); + goto cleanup_descq; + } + ppd->sdma_head_dma[0] = 0; + return 0; + +cleanup_descq: + dma_free_coherent(&ppd->dd->pcidev->dev, + ppd->sdma_descq_cnt * sizeof(u64[2]), (void *)ppd->sdma_descq, + ppd->sdma_descq_phys); + ppd->sdma_descq = NULL; + ppd->sdma_descq_phys = 0; +bail: + ppd->sdma_descq_cnt = 0; + return -ENOMEM; +} + +static void free_sdma(struct qib_pportdata *ppd) +{ + struct qib_devdata *dd = ppd->dd; + + if (ppd->sdma_head_dma) { + dma_free_coherent(&dd->pcidev->dev, PAGE_SIZE, + (void *)ppd->sdma_head_dma, + ppd->sdma_head_phys); + ppd->sdma_head_dma = NULL; + ppd->sdma_head_phys = 0; + } + + if (ppd->sdma_descq) { + dma_free_coherent(&dd->pcidev->dev, + ppd->sdma_descq_cnt * sizeof(u64[2]), + ppd->sdma_descq, ppd->sdma_descq_phys); + ppd->sdma_descq = NULL; + ppd->sdma_descq_phys = 0; + } +} + +static inline void make_sdma_desc(struct qib_pportdata *ppd, + u64 *sdmadesc, u64 addr, u64 dwlen, + u64 dwoffset) +{ + + WARN_ON(addr & 3); + /* SDmaPhyAddr[47:32] */ + sdmadesc[1] = addr >> 32; + /* SDmaPhyAddr[31:0] */ + sdmadesc[0] = (addr & 0xfffffffcULL) << 32; + /* SDmaGeneration[1:0] */ + sdmadesc[0] |= (ppd->sdma_generation & 3ULL) << + SDMA_DESC_GEN_LSB; + /* SDmaDwordCount[10:0] */ + sdmadesc[0] |= (dwlen & 0x7ffULL) << SDMA_DESC_COUNT_LSB; + /* SDmaBufOffset[12:2] */ + sdmadesc[0] |= dwoffset & 0x7ffULL; +} + +/* 
sdma_lock must be held */ +int qib_sdma_make_progress(struct qib_pportdata *ppd) +{ + struct list_head *lp = NULL; + struct qib_sdma_txreq *txp = NULL; + struct qib_devdata *dd = ppd->dd; + int progress = 0; + u16 hwhead; + u16 idx = 0; + + hwhead = dd->f_sdma_gethead(ppd); + + /* The reason for some of the complexity of this code is that + * not all descriptors have corresponding txps. So, we have to + * be able to skip over descs until we wander into the range of + * the next txp on the list. + */ + + if (!list_empty(&ppd->sdma_activelist)) { + lp = ppd->sdma_activelist.next; + txp = list_entry(lp, struct qib_sdma_txreq, list); + idx = txp->start_idx; + } + + while (ppd->sdma_descq_head != hwhead) { + /* if desc is part of this txp, unmap if needed */ + if (txp && (txp->flags & QIB_SDMA_TXREQ_F_FREEDESC) && + (idx == ppd->sdma_descq_head)) { + unmap_desc(ppd, ppd->sdma_descq_head); + if (++idx == ppd->sdma_descq_cnt) + idx = 0; + } + + /* increment dequed desc count */ + ppd->sdma_descq_removed++; + + /* advance head, wrap if needed */ + if (++ppd->sdma_descq_head == ppd->sdma_descq_cnt) + ppd->sdma_descq_head = 0; + + /* if now past this txp's descs, do the callback */ + if (txp && txp->next_descq_idx == ppd->sdma_descq_head) { + /* remove from active list */ + list_del_init(&txp->list); + if (txp->callback) + (*txp->callback)(txp, QIB_SDMA_TXREQ_S_OK); + /* see if there is another txp */ + if (list_empty(&ppd->sdma_activelist)) + txp = NULL; + else { + lp = ppd->sdma_activelist.next; + txp = list_entry(lp, struct qib_sdma_txreq, + list); + idx = txp->start_idx; + } + } + progress = 1; + } + if (progress) + qib_verbs_sdma_desc_avail(ppd, qib_sdma_descq_freecnt(ppd)); + return progress; +} + +/* + * This is called from interrupt context. + */ +void qib_sdma_intr(struct qib_pportdata *ppd) +{ + unsigned long flags; + + spin_lock_irqsave(&ppd->sdma_lock, flags); + + __qib_sdma_intr(ppd); + + spin_unlock_irqrestore(&ppd->sdma_lock, flags); +} + +void __qib_sdma_intr(struct qib_pportdata *ppd) +{ + if (__qib_sdma_running(ppd)) { + qib_sdma_make_progress(ppd); + if (!list_empty(&ppd->sdma_userpending)) + qib_user_sdma_send_desc(ppd, &ppd->sdma_userpending); + } +} + +int qib_setup_sdma(struct qib_pportdata *ppd) +{ + struct qib_devdata *dd = ppd->dd; + unsigned long flags; + int ret = 0; + + ret = alloc_sdma(ppd); + if (ret) + goto bail; + + /* set consistent sdma state */ + ppd->dd->f_sdma_init_early(ppd); + spin_lock_irqsave(&ppd->sdma_lock, flags); + sdma_set_state(ppd, qib_sdma_state_s00_hw_down); + spin_unlock_irqrestore(&ppd->sdma_lock, flags); + + /* set up reference counting */ + kref_init(&ppd->sdma_state.kref); + init_completion(&ppd->sdma_state.comp); + + ppd->sdma_generation = 0; + ppd->sdma_descq_head = 0; + ppd->sdma_descq_removed = 0; + ppd->sdma_descq_added = 0; + + ppd->sdma_intrequest = 0; + INIT_LIST_HEAD(&ppd->sdma_userpending); + + INIT_LIST_HEAD(&ppd->sdma_activelist); + + tasklet_init(&ppd->sdma_sw_clean_up_task, sdma_sw_clean_up_task, + (unsigned long)ppd); + + ret = dd->f_init_sdma_regs(ppd); + if (ret) + goto bail_alloc; + + qib_sdma_process_event(ppd, qib_sdma_event_e10_go_hw_start); + + return 0; + +bail_alloc: + qib_teardown_sdma(ppd); +bail: + return ret; +} + +void qib_teardown_sdma(struct qib_pportdata *ppd) +{ + qib_sdma_process_event(ppd, qib_sdma_event_e00_go_hw_down); + + /* + * This waits for the state machine to exit so it is not + * necessary to kill the sdma_sw_clean_up_task to make sure + * it is not running. 
+ */ + sdma_finalput(&ppd->sdma_state); + + free_sdma(ppd); +} + +int qib_sdma_running(struct qib_pportdata *ppd) +{ + unsigned long flags; + int ret; + + spin_lock_irqsave(&ppd->sdma_lock, flags); + ret = __qib_sdma_running(ppd); + spin_unlock_irqrestore(&ppd->sdma_lock, flags); + + return ret; +} + +/* + * Complete a request when sdma not running; likely only request + * but to simplify the code, always queue it, then process the full + * activelist. We process the entire list to ensure that this particular + * request does get it's callback, but in the correct order. + * Must be called with sdma_lock held + */ +static void complete_sdma_err_req(struct qib_pportdata *ppd, + struct qib_verbs_txreq *tx) +{ + atomic_inc(&tx->qp->s_dma_busy); + /* no sdma descriptors, so no unmap_desc */ + tx->txreq.start_idx = 0; + tx->txreq.next_descq_idx = 0; + list_add_tail(&tx->txreq.list, &ppd->sdma_activelist); + clear_sdma_activelist(ppd); +} + +/* + * This function queues one IB packet onto the send DMA queue per call. + * The caller is responsible for checking: + * 1) The number of send DMA descriptor entries is less than the size of + * the descriptor queue. + * 2) The IB SGE addresses and lengths are 32-bit aligned + * (except possibly the last SGE's length) + * 3) The SGE addresses are suitable for passing to dma_map_single(). + */ +int qib_sdma_verbs_send(struct qib_pportdata *ppd, + struct qib_sge_state *ss, u32 dwords, + struct qib_verbs_txreq *tx) +{ + unsigned long flags; + struct qib_sge *sge; + struct qib_qp *qp; + int ret = 0; + u16 tail; + __le64 *descqp; + u64 sdmadesc[2]; + u32 dwoffset; + dma_addr_t addr; + + spin_lock_irqsave(&ppd->sdma_lock, flags); + +retry: + if (unlikely(!__qib_sdma_running(ppd))) { + complete_sdma_err_req(ppd, tx); + goto unlock; + } + + if (tx->txreq.sg_count > qib_sdma_descq_freecnt(ppd)) { + if (qib_sdma_make_progress(ppd)) + goto retry; + if (ppd->dd->flags & QIB_HAS_SDMA_TIMEOUT) + ppd->dd->f_sdma_set_desc_cnt(ppd, + ppd->sdma_descq_cnt / 2); + goto busy; + } + + dwoffset = tx->hdr_dwords; + make_sdma_desc(ppd, sdmadesc, (u64) tx->txreq.addr, dwoffset, 0); + + sdmadesc[0] |= SDMA_DESC_FIRST; + if (tx->txreq.flags & QIB_SDMA_TXREQ_F_USELARGEBUF) + sdmadesc[0] |= SDMA_DESC_USE_LARGE_BUF; + + /* write to the descq */ + tail = ppd->sdma_descq_tail; + descqp = &ppd->sdma_descq[tail].qw[0]; + *descqp++ = cpu_to_le64(sdmadesc[0]); + *descqp++ = cpu_to_le64(sdmadesc[1]); + + /* increment the tail */ + if (++tail == ppd->sdma_descq_cnt) { + tail = 0; + descqp = &ppd->sdma_descq[0].qw[0]; + ++ppd->sdma_generation; + } + + tx->txreq.start_idx = tail; + + sge = &ss->sge; + while (dwords) { + u32 dw; + u32 len; + + len = dwords << 2; + if (len > sge->length) + len = sge->length; + if (len > sge->sge_length) + len = sge->sge_length; + BUG_ON(len == 0); + dw = (len + 3) >> 2; + addr = dma_map_single(&ppd->dd->pcidev->dev, sge->vaddr, + dw << 2, DMA_TO_DEVICE); + if (dma_mapping_error(&ppd->dd->pcidev->dev, addr)) + goto unmap; + sdmadesc[0] = 0; + make_sdma_desc(ppd, sdmadesc, (u64) addr, dw, dwoffset); + /* SDmaUseLargeBuf has to be set in every descriptor */ + if (tx->txreq.flags & QIB_SDMA_TXREQ_F_USELARGEBUF) + sdmadesc[0] |= SDMA_DESC_USE_LARGE_BUF; + /* write to the descq */ + *descqp++ = cpu_to_le64(sdmadesc[0]); + *descqp++ = cpu_to_le64(sdmadesc[1]); + + /* increment the tail */ + if (++tail == ppd->sdma_descq_cnt) { + tail = 0; + descqp = &ppd->sdma_descq[0].qw[0]; + ++ppd->sdma_generation; + } + sge->vaddr += len; + sge->length -= len; + sge->sge_length -= 
len; + if (sge->sge_length == 0) { + if (--ss->num_sge) + *sge = *ss->sg_list++; + } else if (sge->length == 0 && sge->mr->lkey) { + if (++sge->n >= QIB_SEGSZ) { + if (++sge->m >= sge->mr->mapsz) + break; + sge->n = 0; + } + sge->vaddr = + sge->mr->map[sge->m]->segs[sge->n].vaddr; + sge->length = + sge->mr->map[sge->m]->segs[sge->n].length; + } + + dwoffset += dw; + dwords -= dw; + } + + if (!tail) + descqp = &ppd->sdma_descq[ppd->sdma_descq_cnt].qw[0]; + descqp -= 2; + descqp[0] |= cpu_to_le64(SDMA_DESC_LAST); + if (tx->txreq.flags & QIB_SDMA_TXREQ_F_HEADTOHOST) + descqp[0] |= cpu_to_le64(SDMA_DESC_DMA_HEAD); + if (tx->txreq.flags & QIB_SDMA_TXREQ_F_INTREQ) + descqp[0] |= cpu_to_le64(SDMA_DESC_INTR); + + atomic_inc(&tx->qp->s_dma_busy); + tx->txreq.next_descq_idx = tail; + ppd->dd->f_sdma_update_tail(ppd, tail); + ppd->sdma_descq_added += tx->txreq.sg_count; + list_add_tail(&tx->txreq.list, &ppd->sdma_activelist); + goto unlock; + +unmap: + for (;;) { + if (!tail) + tail = ppd->sdma_descq_cnt - 1; + else + tail--; + if (tail == ppd->sdma_descq_tail) + break; + unmap_desc(ppd, tail); + } + qp = tx->qp; + qib_put_txreq(tx); + spin_lock(&qp->r_lock); + spin_lock(&qp->s_lock); + if (qp->ibqp.qp_type == IB_QPT_RC) { + /* XXX what about error sending RDMA read responses? */ + if (ib_qib_state_ops[qp->state] & QIB_PROCESS_RECV_OK) + qib_error_qp(qp, IB_WC_GENERAL_ERR); + } else if (qp->s_wqe) + qib_send_complete(qp, qp->s_wqe, IB_WC_GENERAL_ERR); + spin_unlock(&qp->s_lock); + spin_unlock(&qp->r_lock); + /* return zero to process the next send work request */ + goto unlock; + +busy: + qp = tx->qp; + spin_lock(&qp->s_lock); + if (ib_qib_state_ops[qp->state] & QIB_PROCESS_RECV_OK) { + struct qib_ibdev *dev; + + /* + * If we couldn't queue the DMA request, save the info + * and try again later rather than destroying the + * buffer and undoing the side effects of the copy. + */ + tx->ss = ss; + tx->dwords = dwords; + qp->s_tx = tx; + dev = &ppd->dd->verbs_dev; + spin_lock(&dev->pending_lock); + if (list_empty(&qp->iowait)) { + struct qib_ibport *ibp; + + ibp = &ppd->ibport_data; + ibp->n_dmawait++; + qp->s_flags |= QIB_S_WAIT_DMA_DESC; + list_add_tail(&qp->iowait, &dev->dmawait); + } + spin_unlock(&dev->pending_lock); + qp->s_flags &= ~QIB_S_BUSY; + spin_unlock(&qp->s_lock); + ret = -EBUSY; + } else { + spin_unlock(&qp->s_lock); + qib_put_txreq(tx); + } +unlock: + spin_unlock_irqrestore(&ppd->sdma_lock, flags); + return ret; +} + +/* + * sdma_lock should be acquired before calling this routine + */ +void dump_sdma_state(struct qib_pportdata *ppd) +{ + struct qib_sdma_desc *descq; + struct qib_sdma_txreq *txp, *txpnext; + __le64 *descqp; + u64 desc[2]; + u64 addr; + u16 gen, dwlen, dwoffset; + u16 head, tail, cnt; + + head = ppd->sdma_descq_head; + tail = ppd->sdma_descq_tail; + cnt = qib_sdma_descq_freecnt(ppd); + descq = ppd->sdma_descq; + + qib_dev_porterr(ppd->dd, ppd->port, + "SDMA ppd->sdma_descq_head: %u\n", head); + qib_dev_porterr(ppd->dd, ppd->port, + "SDMA ppd->sdma_descq_tail: %u\n", tail); + qib_dev_porterr(ppd->dd, ppd->port, + "SDMA sdma_descq_freecnt: %u\n", cnt); + + /* print info for each entry in the descriptor queue */ + while (head != tail) { + char flags[6] = { 'x', 'x', 'x', 'x', 'x', 0 }; + + descqp = &descq[head].qw[0]; + desc[0] = le64_to_cpu(descqp[0]); + desc[1] = le64_to_cpu(descqp[1]); + flags[0] = (desc[0] & 1<<15) ? 'I' : '-'; + flags[1] = (desc[0] & 1<<14) ? 'L' : 'S'; + flags[2] = (desc[0] & 1<<13) ? 'H' : '-'; + flags[3] = (desc[0] & 1<<12) ? 
'F' : '-'; + flags[4] = (desc[0] & 1<<11) ? 'L' : '-'; + addr = (desc[1] << 32) | ((desc[0] >> 32) & 0xfffffffcULL); + gen = (desc[0] >> 30) & 3ULL; + dwlen = (desc[0] >> 14) & (0x7ffULL << 2); + dwoffset = (desc[0] & 0x7ffULL) << 2; + qib_dev_porterr(ppd->dd, ppd->port, + "SDMA sdmadesc[%u]: flags:%s addr:0x%016llx gen:%u len:%u bytes offset:%u bytes\n", + head, flags, addr, gen, dwlen, dwoffset); + if (++head == ppd->sdma_descq_cnt) + head = 0; + } + + /* print dma descriptor indices from the TX requests */ + list_for_each_entry_safe(txp, txpnext, &ppd->sdma_activelist, + list) + qib_dev_porterr(ppd->dd, ppd->port, + "SDMA txp->start_idx: %u txp->next_descq_idx: %u\n", + txp->start_idx, txp->next_descq_idx); +} + +void qib_sdma_process_event(struct qib_pportdata *ppd, + enum qib_sdma_events event) +{ + unsigned long flags; + + spin_lock_irqsave(&ppd->sdma_lock, flags); + + __qib_sdma_process_event(ppd, event); + + if (ppd->sdma_state.current_state == qib_sdma_state_s99_running) + qib_verbs_sdma_desc_avail(ppd, qib_sdma_descq_freecnt(ppd)); + + spin_unlock_irqrestore(&ppd->sdma_lock, flags); +} + +void __qib_sdma_process_event(struct qib_pportdata *ppd, + enum qib_sdma_events event) +{ + struct qib_sdma_state *ss = &ppd->sdma_state; + + switch (ss->current_state) { + case qib_sdma_state_s00_hw_down: + switch (event) { + case qib_sdma_event_e00_go_hw_down: + break; + case qib_sdma_event_e30_go_running: + /* + * If down, but running requested (usually result + * of link up, then we need to start up. + * This can happen when hw down is requested while + * bringing the link up with traffic active on + * 7220, e.g. */ + ss->go_s99_running = 1; + /* fall through and start dma engine */ + case qib_sdma_event_e10_go_hw_start: + /* This reference means the state machine is started */ + sdma_get(&ppd->sdma_state); + sdma_set_state(ppd, + qib_sdma_state_s10_hw_start_up_wait); + break; + case qib_sdma_event_e20_hw_started: + break; + case qib_sdma_event_e40_sw_cleaned: + sdma_sw_tear_down(ppd); + break; + case qib_sdma_event_e50_hw_cleaned: + break; + case qib_sdma_event_e60_hw_halted: + break; + case qib_sdma_event_e70_go_idle: + break; + case qib_sdma_event_e7220_err_halted: + break; + case qib_sdma_event_e7322_err_halted: + break; + case qib_sdma_event_e90_timer_tick: + break; + } + break; + + case qib_sdma_state_s10_hw_start_up_wait: + switch (event) { + case qib_sdma_event_e00_go_hw_down: + sdma_set_state(ppd, qib_sdma_state_s00_hw_down); + sdma_sw_tear_down(ppd); + break; + case qib_sdma_event_e10_go_hw_start: + break; + case qib_sdma_event_e20_hw_started: + sdma_set_state(ppd, ss->go_s99_running ? 
+ qib_sdma_state_s99_running : + qib_sdma_state_s20_idle); + break; + case qib_sdma_event_e30_go_running: + ss->go_s99_running = 1; + break; + case qib_sdma_event_e40_sw_cleaned: + break; + case qib_sdma_event_e50_hw_cleaned: + break; + case qib_sdma_event_e60_hw_halted: + break; + case qib_sdma_event_e70_go_idle: + ss->go_s99_running = 0; + break; + case qib_sdma_event_e7220_err_halted: + break; + case qib_sdma_event_e7322_err_halted: + break; + case qib_sdma_event_e90_timer_tick: + break; + } + break; + + case qib_sdma_state_s20_idle: + switch (event) { + case qib_sdma_event_e00_go_hw_down: + sdma_set_state(ppd, qib_sdma_state_s00_hw_down); + sdma_sw_tear_down(ppd); + break; + case qib_sdma_event_e10_go_hw_start: + break; + case qib_sdma_event_e20_hw_started: + break; + case qib_sdma_event_e30_go_running: + sdma_set_state(ppd, qib_sdma_state_s99_running); + ss->go_s99_running = 1; + break; + case qib_sdma_event_e40_sw_cleaned: + break; + case qib_sdma_event_e50_hw_cleaned: + break; + case qib_sdma_event_e60_hw_halted: + break; + case qib_sdma_event_e70_go_idle: + break; + case qib_sdma_event_e7220_err_halted: + break; + case qib_sdma_event_e7322_err_halted: + break; + case qib_sdma_event_e90_timer_tick: + break; + } + break; + + case qib_sdma_state_s30_sw_clean_up_wait: + switch (event) { + case qib_sdma_event_e00_go_hw_down: + sdma_set_state(ppd, qib_sdma_state_s00_hw_down); + break; + case qib_sdma_event_e10_go_hw_start: + break; + case qib_sdma_event_e20_hw_started: + break; + case qib_sdma_event_e30_go_running: + ss->go_s99_running = 1; + break; + case qib_sdma_event_e40_sw_cleaned: + sdma_set_state(ppd, + qib_sdma_state_s10_hw_start_up_wait); + sdma_hw_start_up(ppd); + break; + case qib_sdma_event_e50_hw_cleaned: + break; + case qib_sdma_event_e60_hw_halted: + break; + case qib_sdma_event_e70_go_idle: + ss->go_s99_running = 0; + break; + case qib_sdma_event_e7220_err_halted: + break; + case qib_sdma_event_e7322_err_halted: + break; + case qib_sdma_event_e90_timer_tick: + break; + } + break; + + case qib_sdma_state_s40_hw_clean_up_wait: + switch (event) { + case qib_sdma_event_e00_go_hw_down: + sdma_set_state(ppd, qib_sdma_state_s00_hw_down); + sdma_start_sw_clean_up(ppd); + break; + case qib_sdma_event_e10_go_hw_start: + break; + case qib_sdma_event_e20_hw_started: + break; + case qib_sdma_event_e30_go_running: + ss->go_s99_running = 1; + break; + case qib_sdma_event_e40_sw_cleaned: + break; + case qib_sdma_event_e50_hw_cleaned: + sdma_set_state(ppd, + qib_sdma_state_s30_sw_clean_up_wait); + sdma_start_sw_clean_up(ppd); + break; + case qib_sdma_event_e60_hw_halted: + break; + case qib_sdma_event_e70_go_idle: + ss->go_s99_running = 0; + break; + case qib_sdma_event_e7220_err_halted: + break; + case qib_sdma_event_e7322_err_halted: + break; + case qib_sdma_event_e90_timer_tick: + break; + } + break; + + case qib_sdma_state_s50_hw_halt_wait: + switch (event) { + case qib_sdma_event_e00_go_hw_down: + sdma_set_state(ppd, qib_sdma_state_s00_hw_down); + sdma_start_sw_clean_up(ppd); + break; + case qib_sdma_event_e10_go_hw_start: + break; + case qib_sdma_event_e20_hw_started: + break; + case qib_sdma_event_e30_go_running: + ss->go_s99_running = 1; + break; + case qib_sdma_event_e40_sw_cleaned: + break; + case qib_sdma_event_e50_hw_cleaned: + break; + case qib_sdma_event_e60_hw_halted: + sdma_set_state(ppd, + qib_sdma_state_s40_hw_clean_up_wait); + ppd->dd->f_sdma_hw_clean_up(ppd); + break; + case qib_sdma_event_e70_go_idle: + ss->go_s99_running = 0; + break; + case 
qib_sdma_event_e7220_err_halted: + break; + case qib_sdma_event_e7322_err_halted: + break; + case qib_sdma_event_e90_timer_tick: + break; + } + break; + + case qib_sdma_state_s99_running: + switch (event) { + case qib_sdma_event_e00_go_hw_down: + sdma_set_state(ppd, qib_sdma_state_s00_hw_down); + sdma_start_sw_clean_up(ppd); + break; + case qib_sdma_event_e10_go_hw_start: + break; + case qib_sdma_event_e20_hw_started: + break; + case qib_sdma_event_e30_go_running: + break; + case qib_sdma_event_e40_sw_cleaned: + break; + case qib_sdma_event_e50_hw_cleaned: + break; + case qib_sdma_event_e60_hw_halted: + sdma_set_state(ppd, + qib_sdma_state_s30_sw_clean_up_wait); + sdma_start_sw_clean_up(ppd); + break; + case qib_sdma_event_e70_go_idle: + sdma_set_state(ppd, qib_sdma_state_s50_hw_halt_wait); + ss->go_s99_running = 0; + break; + case qib_sdma_event_e7220_err_halted: + sdma_set_state(ppd, + qib_sdma_state_s30_sw_clean_up_wait); + sdma_start_sw_clean_up(ppd); + break; + case qib_sdma_event_e7322_err_halted: + sdma_set_state(ppd, qib_sdma_state_s50_hw_halt_wait); + break; + case qib_sdma_event_e90_timer_tick: + break; + } + break; + } + + ss->last_event = event; +} diff --git a/kernel/drivers/infiniband/hw/qib/qib_srq.c b/kernel/drivers/infiniband/hw/qib/qib_srq.c new file mode 100644 index 000000000..d6235931a --- /dev/null +++ b/kernel/drivers/infiniband/hw/qib/qib_srq.c @@ -0,0 +1,380 @@ +/* + * Copyright (c) 2006, 2007, 2008, 2009 QLogic Corporation. All rights reserved. + * Copyright (c) 2005, 2006 PathScale, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include +#include +#include + +#include "qib_verbs.h" + +/** + * qib_post_srq_receive - post a receive on a shared receive queue + * @ibsrq: the SRQ to post the receive on + * @wr: the list of work requests to post + * @bad_wr: A pointer to the first WR to cause a problem is put here + * + * This may be called from interrupt context. 
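+ *
+ * The ring is treated as full when head + 1 (modulo rq.size) equals
+ * tail, so one slot always stays unused; the usable capacity is
+ * rq.size - 1, which matches the max_wr value reported by
+ * qib_query_srq().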
+ */ +int qib_post_srq_receive(struct ib_srq *ibsrq, struct ib_recv_wr *wr, + struct ib_recv_wr **bad_wr) +{ + struct qib_srq *srq = to_isrq(ibsrq); + struct qib_rwq *wq; + unsigned long flags; + int ret; + + for (; wr; wr = wr->next) { + struct qib_rwqe *wqe; + u32 next; + int i; + + if ((unsigned) wr->num_sge > srq->rq.max_sge) { + *bad_wr = wr; + ret = -EINVAL; + goto bail; + } + + spin_lock_irqsave(&srq->rq.lock, flags); + wq = srq->rq.wq; + next = wq->head + 1; + if (next >= srq->rq.size) + next = 0; + if (next == wq->tail) { + spin_unlock_irqrestore(&srq->rq.lock, flags); + *bad_wr = wr; + ret = -ENOMEM; + goto bail; + } + + wqe = get_rwqe_ptr(&srq->rq, wq->head); + wqe->wr_id = wr->wr_id; + wqe->num_sge = wr->num_sge; + for (i = 0; i < wr->num_sge; i++) + wqe->sg_list[i] = wr->sg_list[i]; + /* Make sure queue entry is written before the head index. */ + smp_wmb(); + wq->head = next; + spin_unlock_irqrestore(&srq->rq.lock, flags); + } + ret = 0; + +bail: + return ret; +} + +/** + * qib_create_srq - create a shared receive queue + * @ibpd: the protection domain of the SRQ to create + * @srq_init_attr: the attributes of the SRQ + * @udata: data from libibverbs when creating a user SRQ + */ +struct ib_srq *qib_create_srq(struct ib_pd *ibpd, + struct ib_srq_init_attr *srq_init_attr, + struct ib_udata *udata) +{ + struct qib_ibdev *dev = to_idev(ibpd->device); + struct qib_srq *srq; + u32 sz; + struct ib_srq *ret; + + if (srq_init_attr->srq_type != IB_SRQT_BASIC) { + ret = ERR_PTR(-ENOSYS); + goto done; + } + + if (srq_init_attr->attr.max_sge == 0 || + srq_init_attr->attr.max_sge > ib_qib_max_srq_sges || + srq_init_attr->attr.max_wr == 0 || + srq_init_attr->attr.max_wr > ib_qib_max_srq_wrs) { + ret = ERR_PTR(-EINVAL); + goto done; + } + + srq = kmalloc(sizeof(*srq), GFP_KERNEL); + if (!srq) { + ret = ERR_PTR(-ENOMEM); + goto done; + } + + /* + * Need to use vmalloc() if we want to support large #s of entries. + */ + srq->rq.size = srq_init_attr->attr.max_wr + 1; + srq->rq.max_sge = srq_init_attr->attr.max_sge; + sz = sizeof(struct ib_sge) * srq->rq.max_sge + + sizeof(struct qib_rwqe); + srq->rq.wq = vmalloc_user(sizeof(struct qib_rwq) + srq->rq.size * sz); + if (!srq->rq.wq) { + ret = ERR_PTR(-ENOMEM); + goto bail_srq; + } + + /* + * Return the address of the RWQ as the offset to mmap. + * See qib_mmap() for details. + */ + if (udata && udata->outlen >= sizeof(__u64)) { + int err; + u32 s = sizeof(struct qib_rwq) + srq->rq.size * sz; + + srq->ip = + qib_create_mmap_info(dev, s, ibpd->uobject->context, + srq->rq.wq); + if (!srq->ip) { + ret = ERR_PTR(-ENOMEM); + goto bail_wq; + } + + err = ib_copy_to_udata(udata, &srq->ip->offset, + sizeof(srq->ip->offset)); + if (err) { + ret = ERR_PTR(err); + goto bail_ip; + } + } else + srq->ip = NULL; + + /* + * ib_create_srq() will initialize srq->ibsrq. 
+ */ + spin_lock_init(&srq->rq.lock); + srq->rq.wq->head = 0; + srq->rq.wq->tail = 0; + srq->limit = srq_init_attr->attr.srq_limit; + + spin_lock(&dev->n_srqs_lock); + if (dev->n_srqs_allocated == ib_qib_max_srqs) { + spin_unlock(&dev->n_srqs_lock); + ret = ERR_PTR(-ENOMEM); + goto bail_ip; + } + + dev->n_srqs_allocated++; + spin_unlock(&dev->n_srqs_lock); + + if (srq->ip) { + spin_lock_irq(&dev->pending_lock); + list_add(&srq->ip->pending_mmaps, &dev->pending_mmaps); + spin_unlock_irq(&dev->pending_lock); + } + + ret = &srq->ibsrq; + goto done; + +bail_ip: + kfree(srq->ip); +bail_wq: + vfree(srq->rq.wq); +bail_srq: + kfree(srq); +done: + return ret; +} + +/** + * qib_modify_srq - modify a shared receive queue + * @ibsrq: the SRQ to modify + * @attr: the new attributes of the SRQ + * @attr_mask: indicates which attributes to modify + * @udata: user data for libibverbs.so + */ +int qib_modify_srq(struct ib_srq *ibsrq, struct ib_srq_attr *attr, + enum ib_srq_attr_mask attr_mask, + struct ib_udata *udata) +{ + struct qib_srq *srq = to_isrq(ibsrq); + struct qib_rwq *wq; + int ret = 0; + + if (attr_mask & IB_SRQ_MAX_WR) { + struct qib_rwq *owq; + struct qib_rwqe *p; + u32 sz, size, n, head, tail; + + /* Check that the requested sizes are below the limits. */ + if ((attr->max_wr > ib_qib_max_srq_wrs) || + ((attr_mask & IB_SRQ_LIMIT) ? + attr->srq_limit : srq->limit) > attr->max_wr) { + ret = -EINVAL; + goto bail; + } + + sz = sizeof(struct qib_rwqe) + + srq->rq.max_sge * sizeof(struct ib_sge); + size = attr->max_wr + 1; + wq = vmalloc_user(sizeof(struct qib_rwq) + size * sz); + if (!wq) { + ret = -ENOMEM; + goto bail; + } + + /* Check that we can write the offset to mmap. */ + if (udata && udata->inlen >= sizeof(__u64)) { + __u64 offset_addr; + __u64 offset = 0; + + ret = ib_copy_from_udata(&offset_addr, udata, + sizeof(offset_addr)); + if (ret) + goto bail_free; + udata->outbuf = + (void __user *) (unsigned long) offset_addr; + ret = ib_copy_to_udata(udata, &offset, + sizeof(offset)); + if (ret) + goto bail_free; + } + + spin_lock_irq(&srq->rq.lock); + /* + * validate head and tail pointer values and compute + * the number of remaining WQEs. + */ + owq = srq->rq.wq; + head = owq->head; + tail = owq->tail; + if (head >= srq->rq.size || tail >= srq->rq.size) { + ret = -EINVAL; + goto bail_unlock; + } + n = head; + if (n < tail) + n += srq->rq.size - tail; + else + n -= tail; + if (size <= n) { + ret = -EINVAL; + goto bail_unlock; + } + n = 0; + p = wq->wq; + while (tail != head) { + struct qib_rwqe *wqe; + int i; + + wqe = get_rwqe_ptr(&srq->rq, tail); + p->wr_id = wqe->wr_id; + p->num_sge = wqe->num_sge; + for (i = 0; i < wqe->num_sge; i++) + p->sg_list[i] = wqe->sg_list[i]; + n++; + p = (struct qib_rwqe *)((char *) p + sz); + if (++tail >= srq->rq.size) + tail = 0; + } + srq->rq.wq = wq; + srq->rq.size = size; + wq->head = n; + wq->tail = 0; + if (attr_mask & IB_SRQ_LIMIT) + srq->limit = attr->srq_limit; + spin_unlock_irq(&srq->rq.lock); + + vfree(owq); + + if (srq->ip) { + struct qib_mmap_info *ip = srq->ip; + struct qib_ibdev *dev = to_idev(srq->ibsrq.device); + u32 s = sizeof(struct qib_rwq) + size * sz; + + qib_update_mmap_info(dev, ip, s, wq); + + /* + * Return the offset to mmap. + * See qib_mmap() for details. + */ + if (udata && udata->inlen >= sizeof(__u64)) { + ret = ib_copy_to_udata(udata, &ip->offset, + sizeof(ip->offset)); + if (ret) + goto bail; + } + + /* + * Put user mapping info onto the pending list + * unless it already is on the list. 
+ */ + spin_lock_irq(&dev->pending_lock); + if (list_empty(&ip->pending_mmaps)) + list_add(&ip->pending_mmaps, + &dev->pending_mmaps); + spin_unlock_irq(&dev->pending_lock); + } + } else if (attr_mask & IB_SRQ_LIMIT) { + spin_lock_irq(&srq->rq.lock); + if (attr->srq_limit >= srq->rq.size) + ret = -EINVAL; + else + srq->limit = attr->srq_limit; + spin_unlock_irq(&srq->rq.lock); + } + goto bail; + +bail_unlock: + spin_unlock_irq(&srq->rq.lock); +bail_free: + vfree(wq); +bail: + return ret; +} + +int qib_query_srq(struct ib_srq *ibsrq, struct ib_srq_attr *attr) +{ + struct qib_srq *srq = to_isrq(ibsrq); + + attr->max_wr = srq->rq.size - 1; + attr->max_sge = srq->rq.max_sge; + attr->srq_limit = srq->limit; + return 0; +} + +/** + * qib_destroy_srq - destroy a shared receive queue + * @ibsrq: the SRQ to destroy + */ +int qib_destroy_srq(struct ib_srq *ibsrq) +{ + struct qib_srq *srq = to_isrq(ibsrq); + struct qib_ibdev *dev = to_idev(ibsrq->device); + + spin_lock(&dev->n_srqs_lock); + dev->n_srqs_allocated--; + spin_unlock(&dev->n_srqs_lock); + if (srq->ip) + kref_put(&srq->ip->ref, qib_release_mmap_info); + else + vfree(srq->rq.wq); + kfree(srq); + + return 0; +} diff --git a/kernel/drivers/infiniband/hw/qib/qib_sysfs.c b/kernel/drivers/infiniband/hw/qib/qib_sysfs.c new file mode 100644 index 000000000..81f56cdff --- /dev/null +++ b/kernel/drivers/infiniband/hw/qib/qib_sysfs.c @@ -0,0 +1,818 @@ +/* + * Copyright (c) 2012 Intel Corporation. All rights reserved. + * Copyright (c) 2006 - 2012 QLogic Corporation. All rights reserved. + * Copyright (c) 2006 PathScale, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include + +#include "qib.h" +#include "qib_mad.h" + +/* start of per-port functions */ +/* + * Get/Set heartbeat enable. 
OR of 1=enabled, 2=auto + */ +static ssize_t show_hrtbt_enb(struct qib_pportdata *ppd, char *buf) +{ + struct qib_devdata *dd = ppd->dd; + int ret; + + ret = dd->f_get_ib_cfg(ppd, QIB_IB_CFG_HRTBT); + ret = scnprintf(buf, PAGE_SIZE, "%d\n", ret); + return ret; +} + +static ssize_t store_hrtbt_enb(struct qib_pportdata *ppd, const char *buf, + size_t count) +{ + struct qib_devdata *dd = ppd->dd; + int ret; + u16 val; + + ret = kstrtou16(buf, 0, &val); + if (ret) { + qib_dev_err(dd, "attempt to set invalid Heartbeat enable\n"); + return ret; + } + + /* + * Set the "intentional" heartbeat enable per either of + * "Enable" and "Auto", as these are normally set together. + * This bit is consulted when leaving loopback mode, + * because entering loopback mode overrides it and automatically + * disables heartbeat. + */ + ret = dd->f_set_ib_cfg(ppd, QIB_IB_CFG_HRTBT, val); + return ret < 0 ? ret : count; +} + +static ssize_t store_loopback(struct qib_pportdata *ppd, const char *buf, + size_t count) +{ + struct qib_devdata *dd = ppd->dd; + int ret = count, r; + + r = dd->f_set_ib_loopback(ppd, buf); + if (r < 0) + ret = r; + + return ret; +} + +static ssize_t store_led_override(struct qib_pportdata *ppd, const char *buf, + size_t count) +{ + struct qib_devdata *dd = ppd->dd; + int ret; + u16 val; + + ret = kstrtou16(buf, 0, &val); + if (ret) { + qib_dev_err(dd, "attempt to set invalid LED override\n"); + return ret; + } + + qib_set_led_override(ppd, val); + return count; +} + +static ssize_t show_status(struct qib_pportdata *ppd, char *buf) +{ + ssize_t ret; + + if (!ppd->statusp) + ret = -EINVAL; + else + ret = scnprintf(buf, PAGE_SIZE, "0x%llx\n", + (unsigned long long) *(ppd->statusp)); + return ret; +} + +/* + * For userland compatibility, these offsets must remain fixed. + * They are strings for QIB_STATUS_* + */ +static const char * const qib_status_str[] = { + "Initted", + "", + "", + "", + "", + "Present", + "IB_link_up", + "IB_configured", + "", + "Fatal_Hardware_Error", + NULL, +}; + +static ssize_t show_status_str(struct qib_pportdata *ppd, char *buf) +{ + int i, any; + u64 s; + ssize_t ret; + + if (!ppd->statusp) { + ret = -EINVAL; + goto bail; + } + + s = *(ppd->statusp); + *buf = '\0'; + for (any = i = 0; s && qib_status_str[i]; i++) { + if (s & 1) { + /* if overflow */ + if (any && strlcat(buf, " ", PAGE_SIZE) >= PAGE_SIZE) + break; + if (strlcat(buf, qib_status_str[i], PAGE_SIZE) >= + PAGE_SIZE) + break; + any = 1; + } + s >>= 1; + } + if (any) + strlcat(buf, "\n", PAGE_SIZE); + + ret = strlen(buf); + +bail: + return ret; +} + +/* end of per-port functions */ + +/* + * Start of per-port file structures and support code + * Because we are fitting into other infrastructure, we have to supply the + * full set of kobject/sysfs_ops structures and routines. 
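+ *
+ * For example, QIB_PORT_ATTR(loopback, S_IWUSR, NULL, store_loopback)
+ * below roughly expands (via __ATTR()) to:
+ *
+ *	static struct qib_port_attr qib_port_attr_loopback = {
+ *		.attr  = { .name = "loopback", .mode = S_IWUSR },
+ *		.show  = NULL,
+ *		.store = store_loopback,
+ *	};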
+ */ +#define QIB_PORT_ATTR(name, mode, show, store) \ + static struct qib_port_attr qib_port_attr_##name = \ + __ATTR(name, mode, show, store) + +struct qib_port_attr { + struct attribute attr; + ssize_t (*show)(struct qib_pportdata *, char *); + ssize_t (*store)(struct qib_pportdata *, const char *, size_t); +}; + +QIB_PORT_ATTR(loopback, S_IWUSR, NULL, store_loopback); +QIB_PORT_ATTR(led_override, S_IWUSR, NULL, store_led_override); +QIB_PORT_ATTR(hrtbt_enable, S_IWUSR | S_IRUGO, show_hrtbt_enb, + store_hrtbt_enb); +QIB_PORT_ATTR(status, S_IRUGO, show_status, NULL); +QIB_PORT_ATTR(status_str, S_IRUGO, show_status_str, NULL); + +static struct attribute *port_default_attributes[] = { + &qib_port_attr_loopback.attr, + &qib_port_attr_led_override.attr, + &qib_port_attr_hrtbt_enable.attr, + &qib_port_attr_status.attr, + &qib_port_attr_status_str.attr, + NULL +}; + +/* + * Start of per-port congestion control structures and support code + */ + +/* + * Congestion control table size followed by table entries + */ +static ssize_t read_cc_table_bin(struct file *filp, struct kobject *kobj, + struct bin_attribute *bin_attr, + char *buf, loff_t pos, size_t count) +{ + int ret; + struct qib_pportdata *ppd = + container_of(kobj, struct qib_pportdata, pport_cc_kobj); + + if (!qib_cc_table_size || !ppd->ccti_entries_shadow) + return -EINVAL; + + ret = ppd->total_cct_entry * sizeof(struct ib_cc_table_entry_shadow) + + sizeof(__be16); + + if (pos > ret) + return -EINVAL; + + if (count > ret - pos) + count = ret - pos; + + if (!count) + return count; + + spin_lock(&ppd->cc_shadow_lock); + memcpy(buf, ppd->ccti_entries_shadow, count); + spin_unlock(&ppd->cc_shadow_lock); + + return count; +} + +static void qib_port_release(struct kobject *kobj) +{ + /* nothing to do since memory is freed by qib_free_devdata() */ +} + +static struct kobj_type qib_port_cc_ktype = { + .release = qib_port_release, +}; + +static struct bin_attribute cc_table_bin_attr = { + .attr = {.name = "cc_table_bin", .mode = 0444}, + .read = read_cc_table_bin, + .size = PAGE_SIZE, +}; + +/* + * Congestion settings: port control, control map and an array of 16 + * entries for the congestion entries - increase, timer, event log + * trigger threshold and the minimum injection rate delay. 
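+ *
+ * The read handler below clamps pos and count against the size of
+ * the shadow structure and copies from congestion_entries_shadow
+ * under cc_shadow_lock, so a partial sysfs read still sees a
+ * consistent snapshot.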
+ */ +static ssize_t read_cc_setting_bin(struct file *filp, struct kobject *kobj, + struct bin_attribute *bin_attr, + char *buf, loff_t pos, size_t count) +{ + int ret; + struct qib_pportdata *ppd = + container_of(kobj, struct qib_pportdata, pport_cc_kobj); + + if (!qib_cc_table_size || !ppd->congestion_entries_shadow) + return -EINVAL; + + ret = sizeof(struct ib_cc_congestion_setting_attr_shadow); + + if (pos > ret) + return -EINVAL; + if (count > ret - pos) + count = ret - pos; + + if (!count) + return count; + + spin_lock(&ppd->cc_shadow_lock); + memcpy(buf, ppd->congestion_entries_shadow, count); + spin_unlock(&ppd->cc_shadow_lock); + + return count; +} + +static struct bin_attribute cc_setting_bin_attr = { + .attr = {.name = "cc_settings_bin", .mode = 0444}, + .read = read_cc_setting_bin, + .size = PAGE_SIZE, +}; + + +static ssize_t qib_portattr_show(struct kobject *kobj, + struct attribute *attr, char *buf) +{ + struct qib_port_attr *pattr = + container_of(attr, struct qib_port_attr, attr); + struct qib_pportdata *ppd = + container_of(kobj, struct qib_pportdata, pport_kobj); + + return pattr->show(ppd, buf); +} + +static ssize_t qib_portattr_store(struct kobject *kobj, + struct attribute *attr, const char *buf, size_t len) +{ + struct qib_port_attr *pattr = + container_of(attr, struct qib_port_attr, attr); + struct qib_pportdata *ppd = + container_of(kobj, struct qib_pportdata, pport_kobj); + + return pattr->store(ppd, buf, len); +} + + +static const struct sysfs_ops qib_port_ops = { + .show = qib_portattr_show, + .store = qib_portattr_store, +}; + +static struct kobj_type qib_port_ktype = { + .release = qib_port_release, + .sysfs_ops = &qib_port_ops, + .default_attrs = port_default_attributes +}; + +/* Start sl2vl */ + +#define QIB_SL2VL_ATTR(N) \ + static struct qib_sl2vl_attr qib_sl2vl_attr_##N = { \ + .attr = { .name = __stringify(N), .mode = 0444 }, \ + .sl = N \ + } + +struct qib_sl2vl_attr { + struct attribute attr; + int sl; +}; + +QIB_SL2VL_ATTR(0); +QIB_SL2VL_ATTR(1); +QIB_SL2VL_ATTR(2); +QIB_SL2VL_ATTR(3); +QIB_SL2VL_ATTR(4); +QIB_SL2VL_ATTR(5); +QIB_SL2VL_ATTR(6); +QIB_SL2VL_ATTR(7); +QIB_SL2VL_ATTR(8); +QIB_SL2VL_ATTR(9); +QIB_SL2VL_ATTR(10); +QIB_SL2VL_ATTR(11); +QIB_SL2VL_ATTR(12); +QIB_SL2VL_ATTR(13); +QIB_SL2VL_ATTR(14); +QIB_SL2VL_ATTR(15); + +static struct attribute *sl2vl_default_attributes[] = { + &qib_sl2vl_attr_0.attr, + &qib_sl2vl_attr_1.attr, + &qib_sl2vl_attr_2.attr, + &qib_sl2vl_attr_3.attr, + &qib_sl2vl_attr_4.attr, + &qib_sl2vl_attr_5.attr, + &qib_sl2vl_attr_6.attr, + &qib_sl2vl_attr_7.attr, + &qib_sl2vl_attr_8.attr, + &qib_sl2vl_attr_9.attr, + &qib_sl2vl_attr_10.attr, + &qib_sl2vl_attr_11.attr, + &qib_sl2vl_attr_12.attr, + &qib_sl2vl_attr_13.attr, + &qib_sl2vl_attr_14.attr, + &qib_sl2vl_attr_15.attr, + NULL +}; + +static ssize_t sl2vl_attr_show(struct kobject *kobj, struct attribute *attr, + char *buf) +{ + struct qib_sl2vl_attr *sattr = + container_of(attr, struct qib_sl2vl_attr, attr); + struct qib_pportdata *ppd = + container_of(kobj, struct qib_pportdata, sl2vl_kobj); + struct qib_ibport *qibp = &ppd->ibport_data; + + return sprintf(buf, "%u\n", qibp->sl_to_vl[sattr->sl]); +} + +static const struct sysfs_ops qib_sl2vl_ops = { + .show = sl2vl_attr_show, +}; + +static struct kobj_type qib_sl2vl_ktype = { + .release = qib_port_release, + .sysfs_ops = &qib_sl2vl_ops, + .default_attrs = sl2vl_default_attributes +}; + +/* End sl2vl */ + +/* Start diag_counters */ + +#define QIB_DIAGC_ATTR(N) \ + static struct qib_diagc_attr qib_diagc_attr_##N = { \ + .attr 
= { .name = __stringify(N), .mode = 0664 }, \ + .counter = offsetof(struct qib_ibport, n_##N) \ + } + +struct qib_diagc_attr { + struct attribute attr; + size_t counter; +}; + +QIB_DIAGC_ATTR(rc_resends); +QIB_DIAGC_ATTR(rc_acks); +QIB_DIAGC_ATTR(rc_qacks); +QIB_DIAGC_ATTR(rc_delayed_comp); +QIB_DIAGC_ATTR(seq_naks); +QIB_DIAGC_ATTR(rdma_seq); +QIB_DIAGC_ATTR(rnr_naks); +QIB_DIAGC_ATTR(other_naks); +QIB_DIAGC_ATTR(rc_timeouts); +QIB_DIAGC_ATTR(loop_pkts); +QIB_DIAGC_ATTR(pkt_drops); +QIB_DIAGC_ATTR(dmawait); +QIB_DIAGC_ATTR(unaligned); +QIB_DIAGC_ATTR(rc_dupreq); +QIB_DIAGC_ATTR(rc_seqnak); + +static struct attribute *diagc_default_attributes[] = { + &qib_diagc_attr_rc_resends.attr, + &qib_diagc_attr_rc_acks.attr, + &qib_diagc_attr_rc_qacks.attr, + &qib_diagc_attr_rc_delayed_comp.attr, + &qib_diagc_attr_seq_naks.attr, + &qib_diagc_attr_rdma_seq.attr, + &qib_diagc_attr_rnr_naks.attr, + &qib_diagc_attr_other_naks.attr, + &qib_diagc_attr_rc_timeouts.attr, + &qib_diagc_attr_loop_pkts.attr, + &qib_diagc_attr_pkt_drops.attr, + &qib_diagc_attr_dmawait.attr, + &qib_diagc_attr_unaligned.attr, + &qib_diagc_attr_rc_dupreq.attr, + &qib_diagc_attr_rc_seqnak.attr, + NULL +}; + +static ssize_t diagc_attr_show(struct kobject *kobj, struct attribute *attr, + char *buf) +{ + struct qib_diagc_attr *dattr = + container_of(attr, struct qib_diagc_attr, attr); + struct qib_pportdata *ppd = + container_of(kobj, struct qib_pportdata, diagc_kobj); + struct qib_ibport *qibp = &ppd->ibport_data; + + return sprintf(buf, "%u\n", *(u32 *)((char *)qibp + dattr->counter)); +} + +static ssize_t diagc_attr_store(struct kobject *kobj, struct attribute *attr, + const char *buf, size_t size) +{ + struct qib_diagc_attr *dattr = + container_of(attr, struct qib_diagc_attr, attr); + struct qib_pportdata *ppd = + container_of(kobj, struct qib_pportdata, diagc_kobj); + struct qib_ibport *qibp = &ppd->ibport_data; + u32 val; + int ret; + + ret = kstrtou32(buf, 0, &val); + if (ret) + return ret; + *(u32 *)((char *) qibp + dattr->counter) = val; + return size; +} + +static const struct sysfs_ops qib_diagc_ops = { + .show = diagc_attr_show, + .store = diagc_attr_store, +}; + +static struct kobj_type qib_diagc_ktype = { + .release = qib_port_release, + .sysfs_ops = &qib_diagc_ops, + .default_attrs = diagc_default_attributes +}; + +/* End diag_counters */ + +/* end of per-port file structures and support code */ + +/* + * Start of per-unit (or driver, in some cases, but replicated + * per unit) functions (these get a device *) + */ +static ssize_t show_rev(struct device *device, struct device_attribute *attr, + char *buf) +{ + struct qib_ibdev *dev = + container_of(device, struct qib_ibdev, ibdev.dev); + + return sprintf(buf, "%x\n", dd_from_dev(dev)->minrev); +} + +static ssize_t show_hca(struct device *device, struct device_attribute *attr, + char *buf) +{ + struct qib_ibdev *dev = + container_of(device, struct qib_ibdev, ibdev.dev); + struct qib_devdata *dd = dd_from_dev(dev); + int ret; + + if (!dd->boardname) + ret = -EINVAL; + else + ret = scnprintf(buf, PAGE_SIZE, "%s\n", dd->boardname); + return ret; +} + +static ssize_t show_version(struct device *device, + struct device_attribute *attr, char *buf) +{ + /* The string printed here is already newline-terminated. 
*/ + return scnprintf(buf, PAGE_SIZE, "%s", (char *)ib_qib_version); +} + +static ssize_t show_boardversion(struct device *device, + struct device_attribute *attr, char *buf) +{ + struct qib_ibdev *dev = + container_of(device, struct qib_ibdev, ibdev.dev); + struct qib_devdata *dd = dd_from_dev(dev); + + /* The string printed here is already newline-terminated. */ + return scnprintf(buf, PAGE_SIZE, "%s", dd->boardversion); +} + + +static ssize_t show_localbus_info(struct device *device, + struct device_attribute *attr, char *buf) +{ + struct qib_ibdev *dev = + container_of(device, struct qib_ibdev, ibdev.dev); + struct qib_devdata *dd = dd_from_dev(dev); + + /* The string printed here is already newline-terminated. */ + return scnprintf(buf, PAGE_SIZE, "%s", dd->lbus_info); +} + + +static ssize_t show_nctxts(struct device *device, + struct device_attribute *attr, char *buf) +{ + struct qib_ibdev *dev = + container_of(device, struct qib_ibdev, ibdev.dev); + struct qib_devdata *dd = dd_from_dev(dev); + + /* Return the number of user ports (contexts) available. */ + /* The calculation below deals with a special case where + * cfgctxts is set to 1 on a single-port board. */ + return scnprintf(buf, PAGE_SIZE, "%u\n", + (dd->first_user_ctxt > dd->cfgctxts) ? 0 : + (dd->cfgctxts - dd->first_user_ctxt)); +} + +static ssize_t show_nfreectxts(struct device *device, + struct device_attribute *attr, char *buf) +{ + struct qib_ibdev *dev = + container_of(device, struct qib_ibdev, ibdev.dev); + struct qib_devdata *dd = dd_from_dev(dev); + + /* Return the number of free user ports (contexts) available. */ + return scnprintf(buf, PAGE_SIZE, "%u\n", dd->freectxts); +} + +static ssize_t show_serial(struct device *device, + struct device_attribute *attr, char *buf) +{ + struct qib_ibdev *dev = + container_of(device, struct qib_ibdev, ibdev.dev); + struct qib_devdata *dd = dd_from_dev(dev); + + buf[sizeof(dd->serial)] = '\0'; + memcpy(buf, dd->serial, sizeof(dd->serial)); + strcat(buf, "\n"); + return strlen(buf); +} + +static ssize_t store_chip_reset(struct device *device, + struct device_attribute *attr, const char *buf, + size_t count) +{ + struct qib_ibdev *dev = + container_of(device, struct qib_ibdev, ibdev.dev); + struct qib_devdata *dd = dd_from_dev(dev); + int ret; + + if (count < 5 || memcmp(buf, "reset", 5) || !dd->diag_client) { + ret = -EINVAL; + goto bail; + } + + ret = qib_reset_device(dd->unit); +bail: + return ret < 0 ? ret : count; +} + +/* + * Dump tempsense regs. in decimal, to ease shell-scripts. 
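+ *
+ * The handler below reads indices 0-7 (skipping 6) through
+ * f_tempsense_rd(); indices 0, 1, 5 and 7 are printed as signed
+ * decimal, 2 and 3 as hex, and index 4 is read but not printed.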
+ */ +static ssize_t show_tempsense(struct device *device, + struct device_attribute *attr, char *buf) +{ + struct qib_ibdev *dev = + container_of(device, struct qib_ibdev, ibdev.dev); + struct qib_devdata *dd = dd_from_dev(dev); + int ret; + int idx; + u8 regvals[8]; + + ret = -ENXIO; + for (idx = 0; idx < 8; ++idx) { + if (idx == 6) + continue; + ret = dd->f_tempsense_rd(dd, idx); + if (ret < 0) + break; + regvals[idx] = ret; + } + if (idx == 8) + ret = scnprintf(buf, PAGE_SIZE, "%d %d %02X %02X %d %d\n", + *(signed char *)(regvals), + *(signed char *)(regvals + 1), + regvals[2], regvals[3], + *(signed char *)(regvals + 5), + *(signed char *)(regvals + 7)); + return ret; +} + +/* + * end of per-unit (or driver, in some cases, but replicated + * per unit) functions + */ + +/* start of per-unit file structures and support code */ +static DEVICE_ATTR(hw_rev, S_IRUGO, show_rev, NULL); +static DEVICE_ATTR(hca_type, S_IRUGO, show_hca, NULL); +static DEVICE_ATTR(board_id, S_IRUGO, show_hca, NULL); +static DEVICE_ATTR(version, S_IRUGO, show_version, NULL); +static DEVICE_ATTR(nctxts, S_IRUGO, show_nctxts, NULL); +static DEVICE_ATTR(nfreectxts, S_IRUGO, show_nfreectxts, NULL); +static DEVICE_ATTR(serial, S_IRUGO, show_serial, NULL); +static DEVICE_ATTR(boardversion, S_IRUGO, show_boardversion, NULL); +static DEVICE_ATTR(tempsense, S_IRUGO, show_tempsense, NULL); +static DEVICE_ATTR(localbus_info, S_IRUGO, show_localbus_info, NULL); +static DEVICE_ATTR(chip_reset, S_IWUSR, NULL, store_chip_reset); + +static struct device_attribute *qib_attributes[] = { + &dev_attr_hw_rev, + &dev_attr_hca_type, + &dev_attr_board_id, + &dev_attr_version, + &dev_attr_nctxts, + &dev_attr_nfreectxts, + &dev_attr_serial, + &dev_attr_boardversion, + &dev_attr_tempsense, + &dev_attr_localbus_info, + &dev_attr_chip_reset, +}; + +int qib_create_port_files(struct ib_device *ibdev, u8 port_num, + struct kobject *kobj) +{ + struct qib_pportdata *ppd; + struct qib_devdata *dd = dd_from_ibdev(ibdev); + int ret; + + if (!port_num || port_num > dd->num_pports) { + qib_dev_err(dd, + "Skipping infiniband class with invalid port %u\n", + port_num); + ret = -ENODEV; + goto bail; + } + ppd = &dd->pport[port_num - 1]; + + ret = kobject_init_and_add(&ppd->pport_kobj, &qib_port_ktype, kobj, + "linkcontrol"); + if (ret) { + qib_dev_err(dd, + "Skipping linkcontrol sysfs info, (err %d) port %u\n", + ret, port_num); + goto bail; + } + kobject_uevent(&ppd->pport_kobj, KOBJ_ADD); + + ret = kobject_init_and_add(&ppd->sl2vl_kobj, &qib_sl2vl_ktype, kobj, + "sl2vl"); + if (ret) { + qib_dev_err(dd, + "Skipping sl2vl sysfs info, (err %d) port %u\n", + ret, port_num); + goto bail_link; + } + kobject_uevent(&ppd->sl2vl_kobj, KOBJ_ADD); + + ret = kobject_init_and_add(&ppd->diagc_kobj, &qib_diagc_ktype, kobj, + "diag_counters"); + if (ret) { + qib_dev_err(dd, + "Skipping diag_counters sysfs info, (err %d) port %u\n", + ret, port_num); + goto bail_sl; + } + kobject_uevent(&ppd->diagc_kobj, KOBJ_ADD); + + if (!qib_cc_table_size || !ppd->congestion_entries_shadow) + return 0; + + ret = kobject_init_and_add(&ppd->pport_cc_kobj, &qib_port_cc_ktype, + kobj, "CCMgtA"); + if (ret) { + qib_dev_err(dd, + "Skipping Congestion Control sysfs info, (err %d) port %u\n", + ret, port_num); + goto bail_diagc; + } + + kobject_uevent(&ppd->pport_cc_kobj, KOBJ_ADD); + + ret = sysfs_create_bin_file(&ppd->pport_cc_kobj, + &cc_setting_bin_attr); + if (ret) { + qib_dev_err(dd, + "Skipping Congestion Control setting sysfs info, (err %d) port %u\n", + ret, port_num); + goto 
bail_cc; + } + + ret = sysfs_create_bin_file(&ppd->pport_cc_kobj, + &cc_table_bin_attr); + if (ret) { + qib_dev_err(dd, + "Skipping Congestion Control table sysfs info, (err %d) port %u\n", + ret, port_num); + goto bail_cc_entry_bin; + } + + qib_devinfo(dd->pcidev, + "IB%u: Congestion Control Agent enabled for port %d\n", + dd->unit, port_num); + + return 0; + +bail_cc_entry_bin: + sysfs_remove_bin_file(&ppd->pport_cc_kobj, &cc_setting_bin_attr); +bail_cc: + kobject_put(&ppd->pport_cc_kobj); +bail_diagc: + kobject_put(&ppd->diagc_kobj); +bail_sl: + kobject_put(&ppd->sl2vl_kobj); +bail_link: + kobject_put(&ppd->pport_kobj); +bail: + return ret; +} + +/* + * Register and create our files in /sys/class/infiniband. + */ +int qib_verbs_register_sysfs(struct qib_devdata *dd) +{ + struct ib_device *dev = &dd->verbs_dev.ibdev; + int i, ret; + + for (i = 0; i < ARRAY_SIZE(qib_attributes); ++i) { + ret = device_create_file(&dev->dev, qib_attributes[i]); + if (ret) + goto bail; + } + + return 0; +bail: + for (i = 0; i < ARRAY_SIZE(qib_attributes); ++i) + device_remove_file(&dev->dev, qib_attributes[i]); + return ret; +} + +/* + * Unregister and remove our files in /sys/class/infiniband. + */ +void qib_verbs_unregister_sysfs(struct qib_devdata *dd) +{ + struct qib_pportdata *ppd; + int i; + + for (i = 0; i < dd->num_pports; i++) { + ppd = &dd->pport[i]; + if (qib_cc_table_size && + ppd->congestion_entries_shadow) { + sysfs_remove_bin_file(&ppd->pport_cc_kobj, + &cc_setting_bin_attr); + sysfs_remove_bin_file(&ppd->pport_cc_kobj, + &cc_table_bin_attr); + kobject_put(&ppd->pport_cc_kobj); + } + kobject_put(&ppd->sl2vl_kobj); + kobject_put(&ppd->pport_kobj); + } +} diff --git a/kernel/drivers/infiniband/hw/qib/qib_twsi.c b/kernel/drivers/infiniband/hw/qib/qib_twsi.c new file mode 100644 index 000000000..f56986644 --- /dev/null +++ b/kernel/drivers/infiniband/hw/qib/qib_twsi.c @@ -0,0 +1,501 @@ +/* + * Copyright (c) 2012 Intel Corporation. All rights reserved. + * Copyright (c) 2006 - 2012 QLogic Corporation. All rights reserved. + * Copyright (c) 2003, 2004, 2005, 2006 PathScale, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include +#include +#include + +#include "qib.h" + +/* + * QLogic_IB "Two Wire Serial Interface" driver. + * Originally written for a not-quite-i2c serial eeprom, which is + * still used on some supported boards. Later boards have added a + * variety of other uses, most board-specific, so the bit-boffing + * part has been split off to this file, while the other parts + * have been moved to chip-specific files. + * + * We have also dropped all pretense of fully generic (e.g. pretend + * we don't know whether '1' is the higher voltage) interface, as + * the restrictions of the generic i2c interface (e.g. no access from + * driver itself) make it unsuitable for this use. + */ + +#define READ_CMD 1 +#define WRITE_CMD 0 + +/** + * i2c_wait_for_writes - wait for a write + * @dd: the qlogic_ib device + * + * We use this instead of udelay directly, so we can make sure + * that previous register writes have been flushed all the way + * to the chip. Since we are delaying anyway, the cost doesn't + * hurt, and makes the bit twiddling more regular + */ +static void i2c_wait_for_writes(struct qib_devdata *dd) +{ + /* + * implicit read of EXTStatus is as good as explicit + * read of scratch, if all we want to do is flush + * writes. + */ + dd->f_gpio_mod(dd, 0, 0, 0); + rmb(); /* inlined, so prevent compiler reordering */ +} + +/* + * QSFP modules are allowed to hold SCL low for 500uSec. Allow twice that + * for "almost compliant" modules + */ +#define SCL_WAIT_USEC 1000 + +/* BUF_WAIT is time bus must be free between STOP or ACK and to next START. + * Should be 20, but some chips need more. + */ +#define TWSI_BUF_WAIT_USEC 60 + +static void scl_out(struct qib_devdata *dd, u8 bit) +{ + u32 mask; + + udelay(1); + + mask = 1UL << dd->gpio_scl_num; + + /* SCL is meant to be bare-drain, so never set "OUT", just DIR */ + dd->f_gpio_mod(dd, 0, bit ? 0 : mask, mask); + + /* + * Allow for slow slaves by simple + * delay for falling edge, sampling on rise. + */ + if (!bit) + udelay(2); + else { + int rise_usec; + + for (rise_usec = SCL_WAIT_USEC; rise_usec > 0; rise_usec -= 2) { + if (mask & dd->f_gpio_mod(dd, 0, 0, 0)) + break; + udelay(2); + } + if (rise_usec <= 0) + qib_dev_err(dd, "SCL interface stuck low > %d uSec\n", + SCL_WAIT_USEC); + } + i2c_wait_for_writes(dd); +} + +static void sda_out(struct qib_devdata *dd, u8 bit) +{ + u32 mask; + + mask = 1UL << dd->gpio_sda_num; + + /* SDA is meant to be bare-drain, so never set "OUT", just DIR */ + dd->f_gpio_mod(dd, 0, bit ? 
0 : mask, mask); + + i2c_wait_for_writes(dd); + udelay(2); +} + +static u8 sda_in(struct qib_devdata *dd, int wait) +{ + int bnum; + u32 read_val, mask; + + bnum = dd->gpio_sda_num; + mask = (1UL << bnum); + /* SDA is meant to be bare-drain, so never set "OUT", just DIR */ + dd->f_gpio_mod(dd, 0, 0, mask); + read_val = dd->f_gpio_mod(dd, 0, 0, 0); + if (wait) + i2c_wait_for_writes(dd); + return (read_val & mask) >> bnum; +} + +/** + * i2c_ackrcv - see if ack following write is true + * @dd: the qlogic_ib device + */ +static int i2c_ackrcv(struct qib_devdata *dd) +{ + u8 ack_received; + + /* AT ENTRY SCL = LOW */ + /* change direction, ignore data */ + ack_received = sda_in(dd, 1); + scl_out(dd, 1); + ack_received = sda_in(dd, 1) == 0; + scl_out(dd, 0); + return ack_received; +} + +static void stop_cmd(struct qib_devdata *dd); + +/** + * rd_byte - read a byte, sending STOP on last, else ACK + * @dd: the qlogic_ib device + * + * Returns byte shifted out of device + */ +static int rd_byte(struct qib_devdata *dd, int last) +{ + int bit_cntr, data; + + data = 0; + + for (bit_cntr = 7; bit_cntr >= 0; --bit_cntr) { + data <<= 1; + scl_out(dd, 1); + data |= sda_in(dd, 0); + scl_out(dd, 0); + } + if (last) { + scl_out(dd, 1); + stop_cmd(dd); + } else { + sda_out(dd, 0); + scl_out(dd, 1); + scl_out(dd, 0); + sda_out(dd, 1); + } + return data; +} + +/** + * wr_byte - write a byte, one bit at a time + * @dd: the qlogic_ib device + * @data: the byte to write + * + * Returns 0 if we got the following ack, otherwise 1 + */ +static int wr_byte(struct qib_devdata *dd, u8 data) +{ + int bit_cntr; + u8 bit; + + for (bit_cntr = 7; bit_cntr >= 0; bit_cntr--) { + bit = (data >> bit_cntr) & 1; + sda_out(dd, bit); + scl_out(dd, 1); + scl_out(dd, 0); + } + return (!i2c_ackrcv(dd)) ? 1 : 0; +} + +/* + * issue TWSI start sequence: + * (both clock/data high, clock high, data low while clock is high) + */ +static void start_seq(struct qib_devdata *dd) +{ + sda_out(dd, 1); + scl_out(dd, 1); + sda_out(dd, 0); + udelay(1); + scl_out(dd, 0); +} + +/** + * stop_seq - transmit the stop sequence + * @dd: the qlogic_ib device + * + * (both clock/data low, clock high, data high while clock is high) + */ +static void stop_seq(struct qib_devdata *dd) +{ + scl_out(dd, 0); + sda_out(dd, 0); + scl_out(dd, 1); + sda_out(dd, 1); +} + +/** + * stop_cmd - transmit the stop condition + * @dd: the qlogic_ib device + * + * (both clock/data low, clock high, data high while clock is high) + */ +static void stop_cmd(struct qib_devdata *dd) +{ + stop_seq(dd); + udelay(TWSI_BUF_WAIT_USEC); +} + +/** + * qib_twsi_reset - reset I2C communication + * @dd: the qlogic_ib device + */ + +int qib_twsi_reset(struct qib_devdata *dd) +{ + int clock_cycles_left = 9; + int was_high = 0; + u32 pins, mask; + + /* Both SCL and SDA should be high. If not, there + * is something wrong. + */ + mask = (1UL << dd->gpio_scl_num) | (1UL << dd->gpio_sda_num); + + /* + * Force pins to desired innocuous state. + * This is the default power-on state with out=0 and dir=0, + * So tri-stated and should be floating high (barring HW problems) + */ + dd->f_gpio_mod(dd, 0, 0, mask); + + /* + * Clock nine times to get all listeners into a sane state. + * If SDA does not go high at any point, we are wedged. + * One vendor recommends then issuing START followed by STOP. 
+ * we cannot use our "normal" functions to do that, because + * if SCL drops between them, another vendor's part will + * wedge, dropping SDA and keeping it low forever, at the end of + * the next transaction (even if it was not the device addressed). + * So our START and STOP take place with SCL held high. + */ + while (clock_cycles_left--) { + scl_out(dd, 0); + scl_out(dd, 1); + /* Note if SDA is high, but keep clocking to sync slave */ + was_high |= sda_in(dd, 0); + } + + if (was_high) { + /* + * We saw a high, which we hope means the slave is sync'd. + * Issue START, STOP, pause for T_BUF. + */ + + pins = dd->f_gpio_mod(dd, 0, 0, 0); + if ((pins & mask) != mask) + qib_dev_err(dd, "GPIO pins not at rest: %d\n", + pins & mask); + /* Drop SDA to issue START */ + udelay(1); /* Guarantee .6 uSec setup */ + sda_out(dd, 0); + udelay(1); /* Guarantee .6 uSec hold */ + /* At this point, SCL is high, SDA low. Raise SDA for STOP */ + sda_out(dd, 1); + udelay(TWSI_BUF_WAIT_USEC); + } + + return !was_high; +} + +#define QIB_TWSI_START 0x100 +#define QIB_TWSI_STOP 0x200 + +/* Write byte to TWSI, optionally prefixed with START or suffixed with + * STOP. + * returns 0 if OK (ACK received), else != 0 + */ +static int qib_twsi_wr(struct qib_devdata *dd, int data, int flags) +{ + int ret = 1; + + if (flags & QIB_TWSI_START) + start_seq(dd); + + ret = wr_byte(dd, data); /* Leaves SCL low (from i2c_ackrcv()) */ + + if (flags & QIB_TWSI_STOP) + stop_cmd(dd); + return ret; +} + +/* Added functionality for IBA7220-based cards */ +#define QIB_TEMP_DEV 0x98 + +/* + * qib_twsi_blk_rd + * Formerly called qib_eeprom_internal_read, and only used for eeprom, + * but now the general interface for data transfer from twsi devices. + * One vestige of its former role is that it recognizes a device + * QIB_TWSI_NO_DEV and does the correct operation for the legacy part, + * which responded to all TWSI device codes, interpreting them as + * address within device. On all other devices found on board handled by + * this driver, the device is followed by a one-byte "address" which selects + * the "register" or "offset" within the device from which data should + * be read. + */ +int qib_twsi_blk_rd(struct qib_devdata *dd, int dev, int addr, + void *buffer, int len) +{ + int ret; + u8 *bp = buffer; + + ret = 1; + + if (dev == QIB_TWSI_NO_DEV) { + /* legacy not-really-I2C */ + addr = (addr << 1) | READ_CMD; + ret = qib_twsi_wr(dd, addr, QIB_TWSI_START); + } else { + /* Actual I2C */ + ret = qib_twsi_wr(dd, dev | WRITE_CMD, QIB_TWSI_START); + if (ret) { + stop_cmd(dd); + ret = 1; + goto bail; + } + /* + * SFF spec claims we do _not_ stop after the addr + * but simply issue a start with the "read" dev-addr. + * Since we are implicitely waiting for ACK here, + * we need t_buf (nominally 20uSec) before that start, + * and cannot rely on the delay built in to the STOP + */ + ret = qib_twsi_wr(dd, addr, 0); + udelay(TWSI_BUF_WAIT_USEC); + + if (ret) { + qib_dev_err(dd, + "Failed to write interface read addr %02X\n", + addr); + ret = 1; + goto bail; + } + ret = qib_twsi_wr(dd, dev | READ_CMD, QIB_TWSI_START); + } + if (ret) { + stop_cmd(dd); + ret = 1; + goto bail; + } + + /* + * block devices keeps clocking data out as long as we ack, + * automatically incrementing the address. Some have "pages" + * whose boundaries will not be crossed, but the handling + * of these is left to the caller, who is in a better + * position to know. 
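+ *
+ * The loop below passes !len to rd_byte(), so every byte except the
+ * last is ACKed (keeping the device clocking data out) and the final
+ * byte is followed by a STOP instead.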
+ */ + while (len-- > 0) { + /* + * Get and store data, sending ACK if length remaining, + * else STOP + */ + *bp++ = rd_byte(dd, !len); + } + + ret = 0; + +bail: + return ret; +} + +/* + * qib_twsi_blk_wr + * Formerly called qib_eeprom_internal_write, and only used for eeprom, + * but now the general interface for data transfer to twsi devices. + * One vestige of its former role is that it recognizes a device + * QIB_TWSI_NO_DEV and does the correct operation for the legacy part, + * which responded to all TWSI device codes, interpreting them as + * address within device. On all other devices found on board handled by + * this driver, the device is followed by a one-byte "address" which selects + * the "register" or "offset" within the device to which data should + * be written. + */ +int qib_twsi_blk_wr(struct qib_devdata *dd, int dev, int addr, + const void *buffer, int len) +{ + int sub_len; + const u8 *bp = buffer; + int max_wait_time, i; + int ret = 1; + + while (len > 0) { + if (dev == QIB_TWSI_NO_DEV) { + if (qib_twsi_wr(dd, (addr << 1) | WRITE_CMD, + QIB_TWSI_START)) { + goto failed_write; + } + } else { + /* Real I2C */ + if (qib_twsi_wr(dd, dev | WRITE_CMD, QIB_TWSI_START)) + goto failed_write; + ret = qib_twsi_wr(dd, addr, 0); + if (ret) { + qib_dev_err(dd, + "Failed to write interface write addr %02X\n", + addr); + goto failed_write; + } + } + + sub_len = min(len, 4); + addr += sub_len; + len -= sub_len; + + for (i = 0; i < sub_len; i++) + if (qib_twsi_wr(dd, *bp++, 0)) + goto failed_write; + + stop_cmd(dd); + + /* + * Wait for write complete by waiting for a successful + * read (the chip replies with a zero after the write + * cmd completes, and before it writes to the eeprom. + * The startcmd for the read will fail the ack until + * the writes have completed. We do this inline to avoid + * the debug prints that are in the real read routine + * if the startcmd fails. + * We also use the proper device address, so it doesn't matter + * whether we have real eeprom_dev. Legacy likes any address. + */ + max_wait_time = 100; + while (qib_twsi_wr(dd, dev | READ_CMD, QIB_TWSI_START)) { + stop_cmd(dd); + if (!--max_wait_time) + goto failed_write; + } + /* now read (and ignore) the resulting byte */ + rd_byte(dd, 1); + } + + ret = 0; + goto bail; + +failed_write: + stop_cmd(dd); + ret = 1; + +bail: + return ret; +} diff --git a/kernel/drivers/infiniband/hw/qib/qib_tx.c b/kernel/drivers/infiniband/hw/qib/qib_tx.c new file mode 100644 index 000000000..eface3b3d --- /dev/null +++ b/kernel/drivers/infiniband/hw/qib/qib_tx.c @@ -0,0 +1,572 @@ +/* + * Copyright (c) 2008, 2009, 2010 QLogic Corporation. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include +#include +#include +#include +#include +#include +#include + +#include "qib.h" + +static unsigned qib_hol_timeout_ms = 3000; +module_param_named(hol_timeout_ms, qib_hol_timeout_ms, uint, S_IRUGO); +MODULE_PARM_DESC(hol_timeout_ms, + "duration of user app suspension after link failure"); + +unsigned qib_sdma_fetch_arb = 1; +module_param_named(fetch_arb, qib_sdma_fetch_arb, uint, S_IRUGO); +MODULE_PARM_DESC(fetch_arb, "IBA7220: change SDMA descriptor arbitration"); + +/** + * qib_disarm_piobufs - cancel a range of PIO buffers + * @dd: the qlogic_ib device + * @first: the first PIO buffer to cancel + * @cnt: the number of PIO buffers to cancel + * + * Cancel a range of PIO buffers. Used at user process close, + * in case it died while writing to a PIO buffer. + */ +void qib_disarm_piobufs(struct qib_devdata *dd, unsigned first, unsigned cnt) +{ + unsigned long flags; + unsigned i; + unsigned last; + + last = first + cnt; + spin_lock_irqsave(&dd->pioavail_lock, flags); + for (i = first; i < last; i++) { + __clear_bit(i, dd->pio_need_disarm); + dd->f_sendctrl(dd->pport, QIB_SENDCTRL_DISARM_BUF(i)); + } + spin_unlock_irqrestore(&dd->pioavail_lock, flags); +} + +/* + * This is called by a user process when it sees the DISARM_BUFS event + * bit is set. + */ +int qib_disarm_piobufs_ifneeded(struct qib_ctxtdata *rcd) +{ + struct qib_devdata *dd = rcd->dd; + unsigned i; + unsigned last; + unsigned n = 0; + + last = rcd->pio_base + rcd->piocnt; + /* + * Don't need uctxt_lock here, since user has called in to us. + * Clear at start in case more interrupts set bits while we + * are disarming + */ + if (rcd->user_event_mask) { + /* + * subctxt_cnt is 0 if not shared, so do base + * separately, first, then remaining subctxt, if any + */ + clear_bit(_QIB_EVENT_DISARM_BUFS_BIT, &rcd->user_event_mask[0]); + for (i = 1; i < rcd->subctxt_cnt; i++) + clear_bit(_QIB_EVENT_DISARM_BUFS_BIT, + &rcd->user_event_mask[i]); + } + spin_lock_irq(&dd->pioavail_lock); + for (i = rcd->pio_base; i < last; i++) { + if (__test_and_clear_bit(i, dd->pio_need_disarm)) { + n++; + dd->f_sendctrl(rcd->ppd, QIB_SENDCTRL_DISARM_BUF(i)); + } + } + spin_unlock_irq(&dd->pioavail_lock); + return 0; +} + +static struct qib_pportdata *is_sdma_buf(struct qib_devdata *dd, unsigned i) +{ + struct qib_pportdata *ppd; + unsigned pidx; + + for (pidx = 0; pidx < dd->num_pports; pidx++) { + ppd = dd->pport + pidx; + if (i >= ppd->sdma_state.first_sendbuf && + i < ppd->sdma_state.last_sendbuf) + return ppd; + } + return NULL; +} + +/* + * Return true if send buffer is being used by a user context. 
+ * Sets _QIB_EVENT_DISARM_BUFS_BIT in user_event_mask as a side effect + */ +static int find_ctxt(struct qib_devdata *dd, unsigned bufn) +{ + struct qib_ctxtdata *rcd; + unsigned ctxt; + int ret = 0; + + spin_lock(&dd->uctxt_lock); + for (ctxt = dd->first_user_ctxt; ctxt < dd->cfgctxts; ctxt++) { + rcd = dd->rcd[ctxt]; + if (!rcd || bufn < rcd->pio_base || + bufn >= rcd->pio_base + rcd->piocnt) + continue; + if (rcd->user_event_mask) { + int i; + /* + * subctxt_cnt is 0 if not shared, so do base + * separately, first, then remaining subctxt, if any + */ + set_bit(_QIB_EVENT_DISARM_BUFS_BIT, + &rcd->user_event_mask[0]); + for (i = 1; i < rcd->subctxt_cnt; i++) + set_bit(_QIB_EVENT_DISARM_BUFS_BIT, + &rcd->user_event_mask[i]); + } + ret = 1; + break; + } + spin_unlock(&dd->uctxt_lock); + + return ret; +} + +/* + * Disarm a set of send buffers. If the buffer might be actively being + * written to, mark the buffer to be disarmed later when it is not being + * written to. + * + * This should only be called from the IRQ error handler. + */ +void qib_disarm_piobufs_set(struct qib_devdata *dd, unsigned long *mask, + unsigned cnt) +{ + struct qib_pportdata *ppd, *pppd[QIB_MAX_IB_PORTS]; + unsigned i; + unsigned long flags; + + for (i = 0; i < dd->num_pports; i++) + pppd[i] = NULL; + + for (i = 0; i < cnt; i++) { + int which; + + if (!test_bit(i, mask)) + continue; + /* + * If the buffer is owned by the DMA hardware, + * reset the DMA engine. + */ + ppd = is_sdma_buf(dd, i); + if (ppd) { + pppd[ppd->port] = ppd; + continue; + } + /* + * If the kernel is writing the buffer or the buffer is + * owned by a user process, we can't clear it yet. + */ + spin_lock_irqsave(&dd->pioavail_lock, flags); + if (test_bit(i, dd->pio_writing) || + (!test_bit(i << 1, dd->pioavailkernel) && + find_ctxt(dd, i))) { + __set_bit(i, dd->pio_need_disarm); + which = 0; + } else { + which = 1; + dd->f_sendctrl(dd->pport, QIB_SENDCTRL_DISARM_BUF(i)); + } + spin_unlock_irqrestore(&dd->pioavail_lock, flags); + } + + /* do cancel_sends once per port that had sdma piobufs in error */ + for (i = 0; i < dd->num_pports; i++) + if (pppd[i]) + qib_cancel_sends(pppd[i]); +} + +/** + * update_send_bufs - update shadow copy of the PIO availability map + * @dd: the qlogic_ib device + * + * called whenever our local copy indicates we have run out of send buffers + */ +static void update_send_bufs(struct qib_devdata *dd) +{ + unsigned long flags; + unsigned i; + const unsigned piobregs = dd->pioavregs; + + /* + * If the generation (check) bits have changed, then we update the + * busy bit for the corresponding PIO buffer. This algorithm will + * modify positions to the value they already have in some cases + * (i.e., no change), but it's faster than changing only the bits + * that have changed. + * + * We would like to do this atomicly, to avoid spinlocks in the + * critical send path, but that's not really possible, given the + * type of changes, and that this routine could be called on + * multiple cpu's simultaneously, so we lock in this routine only, + * to avoid conflicting updates; all we change is the shadow, and + * it's a single 64 bit memory location, so by definition the update + * is atomic in terms of what other cpu's can see in testing the + * bits. The spin_lock overhead isn't too bad, since it only + * happens when all buffers are in use, so only cpu overhead, not + * latency or bandwidth is affected. 
+ */ + if (!dd->pioavailregs_dma) + return; + spin_lock_irqsave(&dd->pioavail_lock, flags); + for (i = 0; i < piobregs; i++) { + u64 pchbusy, pchg, piov, pnew; + + piov = le64_to_cpu(dd->pioavailregs_dma[i]); + pchg = dd->pioavailkernel[i] & + ~(dd->pioavailshadow[i] ^ piov); + pchbusy = pchg << QLOGIC_IB_SENDPIOAVAIL_BUSY_SHIFT; + if (pchg && (pchbusy & dd->pioavailshadow[i])) { + pnew = dd->pioavailshadow[i] & ~pchbusy; + pnew |= piov & pchbusy; + dd->pioavailshadow[i] = pnew; + } + } + spin_unlock_irqrestore(&dd->pioavail_lock, flags); +} + +/* + * Debugging code and stats updates if no pio buffers available. + */ +static noinline void no_send_bufs(struct qib_devdata *dd) +{ + dd->upd_pio_shadow = 1; + + /* not atomic, but if we lose a stat count in a while, that's OK */ + qib_stats.sps_nopiobufs++; +} + +/* + * Common code for normal driver send buffer allocation, and reserved + * allocation. + * + * Do appropriate marking as busy, etc. + * Returns buffer pointer if one is found, otherwise NULL. + */ +u32 __iomem *qib_getsendbuf_range(struct qib_devdata *dd, u32 *pbufnum, + u32 first, u32 last) +{ + unsigned i, j, updated = 0; + unsigned nbufs; + unsigned long flags; + unsigned long *shadow = dd->pioavailshadow; + u32 __iomem *buf; + + if (!(dd->flags & QIB_PRESENT)) + return NULL; + + nbufs = last - first + 1; /* number in range to check */ + if (dd->upd_pio_shadow) { +update_shadow: + /* + * Minor optimization. If we had no buffers on last call, + * start out by doing the update; continue and do scan even + * if no buffers were updated, to be paranoid. + */ + update_send_bufs(dd); + updated++; + } + i = first; + /* + * While test_and_set_bit() is atomic, we do that and then the + * change_bit(), and the pair is not. See if this is the cause + * of the remaining armlaunch errors. + */ + spin_lock_irqsave(&dd->pioavail_lock, flags); + if (dd->last_pio >= first && dd->last_pio <= last) + i = dd->last_pio + 1; + if (!first) + /* adjust to min possible */ + nbufs = last - dd->min_kernel_pio + 1; + for (j = 0; j < nbufs; j++, i++) { + if (i > last) + i = !first ? dd->min_kernel_pio : first; + if (__test_and_set_bit((2 * i) + 1, shadow)) + continue; + /* flip generation bit */ + __change_bit(2 * i, shadow); + /* remember that the buffer can be written to now */ + __set_bit(i, dd->pio_writing); + if (!first && first != last) /* first == last on VL15, avoid */ + dd->last_pio = i; + break; + } + spin_unlock_irqrestore(&dd->pioavail_lock, flags); + + if (j == nbufs) { + if (!updated) + /* + * First time through; shadow exhausted, but may be + * buffers available, try an update and then rescan. + */ + goto update_shadow; + no_send_bufs(dd); + buf = NULL; + } else { + if (i < dd->piobcnt2k) + buf = (u32 __iomem *)(dd->pio2kbase + + i * dd->palign); + else if (i < dd->piobcnt2k + dd->piobcnt4k || !dd->piovl15base) + buf = (u32 __iomem *)(dd->pio4kbase + + (i - dd->piobcnt2k) * dd->align4k); + else + buf = (u32 __iomem *)(dd->piovl15base + + (i - (dd->piobcnt2k + dd->piobcnt4k)) * + dd->align4k); + if (pbufnum) + *pbufnum = i; + dd->upd_pio_shadow = 0; + } + + return buf; +} + +/* + * Record that the caller is finished writing to the buffer so we don't + * disarm it while it is being written and disarm it now if needed. 
+ */ +void qib_sendbuf_done(struct qib_devdata *dd, unsigned n) +{ + unsigned long flags; + + spin_lock_irqsave(&dd->pioavail_lock, flags); + __clear_bit(n, dd->pio_writing); + if (__test_and_clear_bit(n, dd->pio_need_disarm)) + dd->f_sendctrl(dd->pport, QIB_SENDCTRL_DISARM_BUF(n)); + spin_unlock_irqrestore(&dd->pioavail_lock, flags); +} + +/** + * qib_chg_pioavailkernel - change which send buffers are available for kernel + * @dd: the qlogic_ib device + * @start: the starting send buffer number + * @len: the number of send buffers + * @avail: true if the buffers are available for kernel use, false otherwise + */ +void qib_chg_pioavailkernel(struct qib_devdata *dd, unsigned start, + unsigned len, u32 avail, struct qib_ctxtdata *rcd) +{ + unsigned long flags; + unsigned end; + unsigned ostart = start; + + /* There are two bits per send buffer (busy and generation) */ + start *= 2; + end = start + len * 2; + + spin_lock_irqsave(&dd->pioavail_lock, flags); + /* Set or clear the busy bit in the shadow. */ + while (start < end) { + if (avail) { + unsigned long dma; + int i; + + /* + * The BUSY bit will never be set, because we disarm + * the user buffers before we hand them back to the + * kernel. We do have to make sure the generation + * bit is set correctly in shadow, since it could + * have changed many times while allocated to user. + * We can't use the bitmap functions on the full + * dma array because it is always little-endian, so + * we have to flip to host-order first. + * BITS_PER_LONG is slightly wrong, since it's + * always 64 bits per register in chip... + * We only work on 64 bit kernels, so that's OK. + */ + i = start / BITS_PER_LONG; + __clear_bit(QLOGIC_IB_SENDPIOAVAIL_BUSY_SHIFT + start, + dd->pioavailshadow); + dma = (unsigned long) + le64_to_cpu(dd->pioavailregs_dma[i]); + if (test_bit((QLOGIC_IB_SENDPIOAVAIL_CHECK_SHIFT + + start) % BITS_PER_LONG, &dma)) + __set_bit(QLOGIC_IB_SENDPIOAVAIL_CHECK_SHIFT + + start, dd->pioavailshadow); + else + __clear_bit(QLOGIC_IB_SENDPIOAVAIL_CHECK_SHIFT + + start, dd->pioavailshadow); + __set_bit(start, dd->pioavailkernel); + if ((start >> 1) < dd->min_kernel_pio) + dd->min_kernel_pio = start >> 1; + } else { + __set_bit(start + QLOGIC_IB_SENDPIOAVAIL_BUSY_SHIFT, + dd->pioavailshadow); + __clear_bit(start, dd->pioavailkernel); + if ((start >> 1) > dd->min_kernel_pio) + dd->min_kernel_pio = start >> 1; + } + start += 2; + } + + if (dd->min_kernel_pio > 0 && dd->last_pio < dd->min_kernel_pio - 1) + dd->last_pio = dd->min_kernel_pio - 1; + spin_unlock_irqrestore(&dd->pioavail_lock, flags); + + dd->f_txchk_change(dd, ostart, len, avail, rcd); +} + +/* + * Flush all sends that might be in the ready to send state, as well as any + * that are in the process of being sent. Used whenever we need to be + * sure the send side is idle. Cleans up all buffer state by canceling + * all pio buffers, and issuing an abort, which cleans up anything in the + * launch fifo. The cancel is superfluous on some chip versions, but + * it's safer to always do it. + * PIOAvail bits are updated by the chip as if a normal send had happened. + */ +void qib_cancel_sends(struct qib_pportdata *ppd) +{ + struct qib_devdata *dd = ppd->dd; + struct qib_ctxtdata *rcd; + unsigned long flags; + unsigned ctxt; + unsigned i; + unsigned last; + + /* + * Tell PSM to disarm buffers again before trying to reuse them. + * We need to be sure the rcd doesn't change out from under us + * while we do so. We hold the two locks sequentially. 
We might + * needlessly set some need_disarm bits as a result, if the + * context is closed after we release the uctxt_lock, but that's + * fairly benign, and safer than nesting the locks. + */ + for (ctxt = dd->first_user_ctxt; ctxt < dd->cfgctxts; ctxt++) { + spin_lock_irqsave(&dd->uctxt_lock, flags); + rcd = dd->rcd[ctxt]; + if (rcd && rcd->ppd == ppd) { + last = rcd->pio_base + rcd->piocnt; + if (rcd->user_event_mask) { + /* + * subctxt_cnt is 0 if not shared, so do base + * separately, first, then remaining subctxt, + * if any + */ + set_bit(_QIB_EVENT_DISARM_BUFS_BIT, + &rcd->user_event_mask[0]); + for (i = 1; i < rcd->subctxt_cnt; i++) + set_bit(_QIB_EVENT_DISARM_BUFS_BIT, + &rcd->user_event_mask[i]); + } + i = rcd->pio_base; + spin_unlock_irqrestore(&dd->uctxt_lock, flags); + spin_lock_irqsave(&dd->pioavail_lock, flags); + for (; i < last; i++) + __set_bit(i, dd->pio_need_disarm); + spin_unlock_irqrestore(&dd->pioavail_lock, flags); + } else + spin_unlock_irqrestore(&dd->uctxt_lock, flags); + } + + if (!(dd->flags & QIB_HAS_SEND_DMA)) + dd->f_sendctrl(ppd, QIB_SENDCTRL_DISARM_ALL | + QIB_SENDCTRL_FLUSH); +} + +/* + * Force an update of in-memory copy of the pioavail registers, when + * needed for any of a variety of reasons. + * If already off, this routine is a nop, on the assumption that the + * caller (or set of callers) will "do the right thing". + * This is a per-device operation, so just the first port. + */ +void qib_force_pio_avail_update(struct qib_devdata *dd) +{ + dd->f_sendctrl(dd->pport, QIB_SENDCTRL_AVAIL_BLIP); +} + +void qib_hol_down(struct qib_pportdata *ppd) +{ + /* + * Cancel sends when the link goes DOWN so that we aren't doing it + * at INIT when we might be trying to send SMI packets. + */ + if (!(ppd->lflags & QIBL_IB_AUTONEG_INPROG)) + qib_cancel_sends(ppd); +} + +/* + * Link is at INIT. + * We start the HoL timer so we can detect stuck packets blocking SMP replies. + * Timer may already be running, so use mod_timer, not add_timer. + */ +void qib_hol_init(struct qib_pportdata *ppd) +{ + if (ppd->hol_state != QIB_HOL_INIT) { + ppd->hol_state = QIB_HOL_INIT; + mod_timer(&ppd->hol_timer, + jiffies + msecs_to_jiffies(qib_hol_timeout_ms)); + } +} + +/* + * Link is up, continue any user processes, and ensure timer + * is a nop, if running. Let timer keep running, if set; it + * will nop when it sees the link is up. + */ +void qib_hol_up(struct qib_pportdata *ppd) +{ + ppd->hol_state = QIB_HOL_UP; +} + +/* + * This is only called via the timer. + */ +void qib_hol_event(unsigned long opaque) +{ + struct qib_pportdata *ppd = (struct qib_pportdata *)opaque; + + /* If hardware error, etc, skip. */ + if (!(ppd->dd->flags & QIB_INITTED)) + return; + + if (ppd->hol_state != QIB_HOL_UP) { + /* + * Try to flush sends in case a stuck packet is blocking + * SMP replies. + */ + qib_hol_down(ppd); + mod_timer(&ppd->hol_timer, + jiffies + msecs_to_jiffies(qib_hol_timeout_ms)); + } +} diff --git a/kernel/drivers/infiniband/hw/qib/qib_uc.c b/kernel/drivers/infiniband/hw/qib/qib_uc.c new file mode 100644 index 000000000..aa3a8035b --- /dev/null +++ b/kernel/drivers/infiniband/hw/qib/qib_uc.c @@ -0,0 +1,536 @@ +/* + * Copyright (c) 2006, 2007, 2008, 2009, 2010 QLogic Corporation. + * All rights reserved. + * Copyright (c) 2005, 2006 PathScale, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. 
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "qib.h" + +/* cut down ridiculously long IB macro names */ +#define OP(x) IB_OPCODE_UC_##x + +/** + * qib_make_uc_req - construct a request packet (SEND, RDMA write) + * @qp: a pointer to the QP + * + * Return 1 if constructed; otherwise, return 0. + */ +int qib_make_uc_req(struct qib_qp *qp) +{ + struct qib_other_headers *ohdr; + struct qib_swqe *wqe; + unsigned long flags; + u32 hwords; + u32 bth0; + u32 len; + u32 pmtu = qp->pmtu; + int ret = 0; + + spin_lock_irqsave(&qp->s_lock, flags); + + if (!(ib_qib_state_ops[qp->state] & QIB_PROCESS_SEND_OK)) { + if (!(ib_qib_state_ops[qp->state] & QIB_FLUSH_SEND)) + goto bail; + /* We are in the error state, flush the work request. */ + if (qp->s_last == qp->s_head) + goto bail; + /* If DMAs are in progress, we can't flush immediately. */ + if (atomic_read(&qp->s_dma_busy)) { + qp->s_flags |= QIB_S_WAIT_DMA; + goto bail; + } + wqe = get_swqe_ptr(qp, qp->s_last); + qib_send_complete(qp, wqe, IB_WC_WR_FLUSH_ERR); + goto done; + } + + ohdr = &qp->s_hdr->u.oth; + if (qp->remote_ah_attr.ah_flags & IB_AH_GRH) + ohdr = &qp->s_hdr->u.l.oth; + + /* header size in 32-bit words LRH+BTH = (8+12)/4. */ + hwords = 5; + bth0 = 0; + + /* Get the next send request. */ + wqe = get_swqe_ptr(qp, qp->s_cur); + qp->s_wqe = NULL; + switch (qp->s_state) { + default: + if (!(ib_qib_state_ops[qp->state] & + QIB_PROCESS_NEXT_SEND_OK)) + goto bail; + /* Check if send work queue is empty. */ + if (qp->s_cur == qp->s_head) + goto bail; + /* + * Start a new request. 
+ */ + wqe->psn = qp->s_next_psn; + qp->s_psn = qp->s_next_psn; + qp->s_sge.sge = wqe->sg_list[0]; + qp->s_sge.sg_list = wqe->sg_list + 1; + qp->s_sge.num_sge = wqe->wr.num_sge; + qp->s_sge.total_len = wqe->length; + len = wqe->length; + qp->s_len = len; + switch (wqe->wr.opcode) { + case IB_WR_SEND: + case IB_WR_SEND_WITH_IMM: + if (len > pmtu) { + qp->s_state = OP(SEND_FIRST); + len = pmtu; + break; + } + if (wqe->wr.opcode == IB_WR_SEND) + qp->s_state = OP(SEND_ONLY); + else { + qp->s_state = + OP(SEND_ONLY_WITH_IMMEDIATE); + /* Immediate data comes after the BTH */ + ohdr->u.imm_data = wqe->wr.ex.imm_data; + hwords += 1; + } + if (wqe->wr.send_flags & IB_SEND_SOLICITED) + bth0 |= IB_BTH_SOLICITED; + qp->s_wqe = wqe; + if (++qp->s_cur >= qp->s_size) + qp->s_cur = 0; + break; + + case IB_WR_RDMA_WRITE: + case IB_WR_RDMA_WRITE_WITH_IMM: + ohdr->u.rc.reth.vaddr = + cpu_to_be64(wqe->wr.wr.rdma.remote_addr); + ohdr->u.rc.reth.rkey = + cpu_to_be32(wqe->wr.wr.rdma.rkey); + ohdr->u.rc.reth.length = cpu_to_be32(len); + hwords += sizeof(struct ib_reth) / 4; + if (len > pmtu) { + qp->s_state = OP(RDMA_WRITE_FIRST); + len = pmtu; + break; + } + if (wqe->wr.opcode == IB_WR_RDMA_WRITE) + qp->s_state = OP(RDMA_WRITE_ONLY); + else { + qp->s_state = + OP(RDMA_WRITE_ONLY_WITH_IMMEDIATE); + /* Immediate data comes after the RETH */ + ohdr->u.rc.imm_data = wqe->wr.ex.imm_data; + hwords += 1; + if (wqe->wr.send_flags & IB_SEND_SOLICITED) + bth0 |= IB_BTH_SOLICITED; + } + qp->s_wqe = wqe; + if (++qp->s_cur >= qp->s_size) + qp->s_cur = 0; + break; + + default: + goto bail; + } + break; + + case OP(SEND_FIRST): + qp->s_state = OP(SEND_MIDDLE); + /* FALLTHROUGH */ + case OP(SEND_MIDDLE): + len = qp->s_len; + if (len > pmtu) { + len = pmtu; + break; + } + if (wqe->wr.opcode == IB_WR_SEND) + qp->s_state = OP(SEND_LAST); + else { + qp->s_state = OP(SEND_LAST_WITH_IMMEDIATE); + /* Immediate data comes after the BTH */ + ohdr->u.imm_data = wqe->wr.ex.imm_data; + hwords += 1; + } + if (wqe->wr.send_flags & IB_SEND_SOLICITED) + bth0 |= IB_BTH_SOLICITED; + qp->s_wqe = wqe; + if (++qp->s_cur >= qp->s_size) + qp->s_cur = 0; + break; + + case OP(RDMA_WRITE_FIRST): + qp->s_state = OP(RDMA_WRITE_MIDDLE); + /* FALLTHROUGH */ + case OP(RDMA_WRITE_MIDDLE): + len = qp->s_len; + if (len > pmtu) { + len = pmtu; + break; + } + if (wqe->wr.opcode == IB_WR_RDMA_WRITE) + qp->s_state = OP(RDMA_WRITE_LAST); + else { + qp->s_state = + OP(RDMA_WRITE_LAST_WITH_IMMEDIATE); + /* Immediate data comes after the BTH */ + ohdr->u.imm_data = wqe->wr.ex.imm_data; + hwords += 1; + if (wqe->wr.send_flags & IB_SEND_SOLICITED) + bth0 |= IB_BTH_SOLICITED; + } + qp->s_wqe = wqe; + if (++qp->s_cur >= qp->s_size) + qp->s_cur = 0; + break; + } + qp->s_len -= len; + qp->s_hdrwords = hwords; + qp->s_cur_sge = &qp->s_sge; + qp->s_cur_size = len; + qib_make_ruc_header(qp, ohdr, bth0 | (qp->s_state << 24), + qp->s_next_psn++ & QIB_PSN_MASK); +done: + ret = 1; + goto unlock; + +bail: + qp->s_flags &= ~QIB_S_BUSY; +unlock: + spin_unlock_irqrestore(&qp->s_lock, flags); + return ret; +} + +/** + * qib_uc_rcv - handle an incoming UC packet + * @ibp: the port the packet came in on + * @hdr: the header of the packet + * @has_grh: true if the packet has a GRH + * @data: the packet data + * @tlen: the length of the packet + * @qp: the QP for this packet. + * + * This is called from qib_qp_rcv() to process an incoming UC packet + * for the given QP. + * Called at interrupt level. 
+ */ +void qib_uc_rcv(struct qib_ibport *ibp, struct qib_ib_header *hdr, + int has_grh, void *data, u32 tlen, struct qib_qp *qp) +{ + struct qib_other_headers *ohdr; + u32 opcode; + u32 hdrsize; + u32 psn; + u32 pad; + struct ib_wc wc; + u32 pmtu = qp->pmtu; + struct ib_reth *reth; + int ret; + + /* Check for GRH */ + if (!has_grh) { + ohdr = &hdr->u.oth; + hdrsize = 8 + 12; /* LRH + BTH */ + } else { + ohdr = &hdr->u.l.oth; + hdrsize = 8 + 40 + 12; /* LRH + GRH + BTH */ + } + + opcode = be32_to_cpu(ohdr->bth[0]); + if (qib_ruc_check_hdr(ibp, hdr, has_grh, qp, opcode)) + return; + + psn = be32_to_cpu(ohdr->bth[2]); + opcode >>= 24; + + /* Compare the PSN verses the expected PSN. */ + if (unlikely(qib_cmp24(psn, qp->r_psn) != 0)) { + /* + * Handle a sequence error. + * Silently drop any current message. + */ + qp->r_psn = psn; +inv: + if (qp->r_state == OP(SEND_FIRST) || + qp->r_state == OP(SEND_MIDDLE)) { + set_bit(QIB_R_REWIND_SGE, &qp->r_aflags); + qp->r_sge.num_sge = 0; + } else + qib_put_ss(&qp->r_sge); + qp->r_state = OP(SEND_LAST); + switch (opcode) { + case OP(SEND_FIRST): + case OP(SEND_ONLY): + case OP(SEND_ONLY_WITH_IMMEDIATE): + goto send_first; + + case OP(RDMA_WRITE_FIRST): + case OP(RDMA_WRITE_ONLY): + case OP(RDMA_WRITE_ONLY_WITH_IMMEDIATE): + goto rdma_first; + + default: + goto drop; + } + } + + /* Check for opcode sequence errors. */ + switch (qp->r_state) { + case OP(SEND_FIRST): + case OP(SEND_MIDDLE): + if (opcode == OP(SEND_MIDDLE) || + opcode == OP(SEND_LAST) || + opcode == OP(SEND_LAST_WITH_IMMEDIATE)) + break; + goto inv; + + case OP(RDMA_WRITE_FIRST): + case OP(RDMA_WRITE_MIDDLE): + if (opcode == OP(RDMA_WRITE_MIDDLE) || + opcode == OP(RDMA_WRITE_LAST) || + opcode == OP(RDMA_WRITE_LAST_WITH_IMMEDIATE)) + break; + goto inv; + + default: + if (opcode == OP(SEND_FIRST) || + opcode == OP(SEND_ONLY) || + opcode == OP(SEND_ONLY_WITH_IMMEDIATE) || + opcode == OP(RDMA_WRITE_FIRST) || + opcode == OP(RDMA_WRITE_ONLY) || + opcode == OP(RDMA_WRITE_ONLY_WITH_IMMEDIATE)) + break; + goto inv; + } + + if (qp->state == IB_QPS_RTR && !(qp->r_flags & QIB_R_COMM_EST)) { + qp->r_flags |= QIB_R_COMM_EST; + if (qp->ibqp.event_handler) { + struct ib_event ev; + + ev.device = qp->ibqp.device; + ev.element.qp = &qp->ibqp; + ev.event = IB_EVENT_COMM_EST; + qp->ibqp.event_handler(&ev, qp->ibqp.qp_context); + } + } + + /* OK, process the packet. */ + switch (opcode) { + case OP(SEND_FIRST): + case OP(SEND_ONLY): + case OP(SEND_ONLY_WITH_IMMEDIATE): +send_first: + if (test_and_clear_bit(QIB_R_REWIND_SGE, &qp->r_aflags)) + qp->r_sge = qp->s_rdma_read_sge; + else { + ret = qib_get_rwqe(qp, 0); + if (ret < 0) + goto op_err; + if (!ret) + goto drop; + /* + * qp->s_rdma_read_sge will be the owner + * of the mr references. + */ + qp->s_rdma_read_sge = qp->r_sge; + } + qp->r_rcv_len = 0; + if (opcode == OP(SEND_ONLY)) + goto no_immediate_data; + else if (opcode == OP(SEND_ONLY_WITH_IMMEDIATE)) + goto send_last_imm; + /* FALLTHROUGH */ + case OP(SEND_MIDDLE): + /* Check for invalid length PMTU or posted rwqe len. 
*/ + if (unlikely(tlen != (hdrsize + pmtu + 4))) + goto rewind; + qp->r_rcv_len += pmtu; + if (unlikely(qp->r_rcv_len > qp->r_len)) + goto rewind; + qib_copy_sge(&qp->r_sge, data, pmtu, 0); + break; + + case OP(SEND_LAST_WITH_IMMEDIATE): +send_last_imm: + wc.ex.imm_data = ohdr->u.imm_data; + hdrsize += 4; + wc.wc_flags = IB_WC_WITH_IMM; + goto send_last; + case OP(SEND_LAST): +no_immediate_data: + wc.ex.imm_data = 0; + wc.wc_flags = 0; +send_last: + /* Get the number of bytes the message was padded by. */ + pad = (be32_to_cpu(ohdr->bth[0]) >> 20) & 3; + /* Check for invalid length. */ + /* XXX LAST len should be >= 1 */ + if (unlikely(tlen < (hdrsize + pad + 4))) + goto rewind; + /* Don't count the CRC. */ + tlen -= (hdrsize + pad + 4); + wc.byte_len = tlen + qp->r_rcv_len; + if (unlikely(wc.byte_len > qp->r_len)) + goto rewind; + wc.opcode = IB_WC_RECV; + qib_copy_sge(&qp->r_sge, data, tlen, 0); + qib_put_ss(&qp->s_rdma_read_sge); +last_imm: + wc.wr_id = qp->r_wr_id; + wc.status = IB_WC_SUCCESS; + wc.qp = &qp->ibqp; + wc.src_qp = qp->remote_qpn; + wc.slid = qp->remote_ah_attr.dlid; + wc.sl = qp->remote_ah_attr.sl; + /* zero fields that are N/A */ + wc.vendor_err = 0; + wc.pkey_index = 0; + wc.dlid_path_bits = 0; + wc.port_num = 0; + /* Signal completion event if the solicited bit is set. */ + qib_cq_enter(to_icq(qp->ibqp.recv_cq), &wc, + (ohdr->bth[0] & + cpu_to_be32(IB_BTH_SOLICITED)) != 0); + break; + + case OP(RDMA_WRITE_FIRST): + case OP(RDMA_WRITE_ONLY): + case OP(RDMA_WRITE_ONLY_WITH_IMMEDIATE): /* consume RWQE */ +rdma_first: + if (unlikely(!(qp->qp_access_flags & + IB_ACCESS_REMOTE_WRITE))) { + goto drop; + } + reth = &ohdr->u.rc.reth; + hdrsize += sizeof(*reth); + qp->r_len = be32_to_cpu(reth->length); + qp->r_rcv_len = 0; + qp->r_sge.sg_list = NULL; + if (qp->r_len != 0) { + u32 rkey = be32_to_cpu(reth->rkey); + u64 vaddr = be64_to_cpu(reth->vaddr); + int ok; + + /* Check rkey */ + ok = qib_rkey_ok(qp, &qp->r_sge.sge, qp->r_len, + vaddr, rkey, IB_ACCESS_REMOTE_WRITE); + if (unlikely(!ok)) + goto drop; + qp->r_sge.num_sge = 1; + } else { + qp->r_sge.num_sge = 0; + qp->r_sge.sge.mr = NULL; + qp->r_sge.sge.vaddr = NULL; + qp->r_sge.sge.length = 0; + qp->r_sge.sge.sge_length = 0; + } + if (opcode == OP(RDMA_WRITE_ONLY)) + goto rdma_last; + else if (opcode == OP(RDMA_WRITE_ONLY_WITH_IMMEDIATE)) { + wc.ex.imm_data = ohdr->u.rc.imm_data; + goto rdma_last_imm; + } + /* FALLTHROUGH */ + case OP(RDMA_WRITE_MIDDLE): + /* Check for invalid length PMTU or posted rwqe len. */ + if (unlikely(tlen != (hdrsize + pmtu + 4))) + goto drop; + qp->r_rcv_len += pmtu; + if (unlikely(qp->r_rcv_len > qp->r_len)) + goto drop; + qib_copy_sge(&qp->r_sge, data, pmtu, 1); + break; + + case OP(RDMA_WRITE_LAST_WITH_IMMEDIATE): + wc.ex.imm_data = ohdr->u.imm_data; +rdma_last_imm: + hdrsize += 4; + wc.wc_flags = IB_WC_WITH_IMM; + + /* Get the number of bytes the message was padded by. */ + pad = (be32_to_cpu(ohdr->bth[0]) >> 20) & 3; + /* Check for invalid length. */ + /* XXX LAST len should be >= 1 */ + if (unlikely(tlen < (hdrsize + pad + 4))) + goto drop; + /* Don't count the CRC. 
*/ + tlen -= (hdrsize + pad + 4); + if (unlikely(tlen + qp->r_rcv_len != qp->r_len)) + goto drop; + if (test_and_clear_bit(QIB_R_REWIND_SGE, &qp->r_aflags)) + qib_put_ss(&qp->s_rdma_read_sge); + else { + ret = qib_get_rwqe(qp, 1); + if (ret < 0) + goto op_err; + if (!ret) + goto drop; + } + wc.byte_len = qp->r_len; + wc.opcode = IB_WC_RECV_RDMA_WITH_IMM; + qib_copy_sge(&qp->r_sge, data, tlen, 1); + qib_put_ss(&qp->r_sge); + goto last_imm; + + case OP(RDMA_WRITE_LAST): +rdma_last: + /* Get the number of bytes the message was padded by. */ + pad = (be32_to_cpu(ohdr->bth[0]) >> 20) & 3; + /* Check for invalid length. */ + /* XXX LAST len should be >= 1 */ + if (unlikely(tlen < (hdrsize + pad + 4))) + goto drop; + /* Don't count the CRC. */ + tlen -= (hdrsize + pad + 4); + if (unlikely(tlen + qp->r_rcv_len != qp->r_len)) + goto drop; + qib_copy_sge(&qp->r_sge, data, tlen, 1); + qib_put_ss(&qp->r_sge); + break; + + default: + /* Drop packet for unknown opcodes. */ + goto drop; + } + qp->r_psn++; + qp->r_state = opcode; + return; + +rewind: + set_bit(QIB_R_REWIND_SGE, &qp->r_aflags); + qp->r_sge.num_sge = 0; +drop: + ibp->n_pkt_drops++; + return; + +op_err: + qib_rc_error(qp, IB_WC_LOC_QP_OP_ERR); + return; + +} diff --git a/kernel/drivers/infiniband/hw/qib/qib_ud.c b/kernel/drivers/infiniband/hw/qib/qib_ud.c new file mode 100644 index 000000000..26243b722 --- /dev/null +++ b/kernel/drivers/infiniband/hw/qib/qib_ud.c @@ -0,0 +1,590 @@ +/* + * Copyright (c) 2006, 2007, 2008, 2009 QLogic Corporation. All rights reserved. + * Copyright (c) 2005, 2006 PathScale, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include + +#include "qib.h" +#include "qib_mad.h" + +/** + * qib_ud_loopback - handle send on loopback QPs + * @sqp: the sending QP + * @swqe: the send work request + * + * This is called from qib_make_ud_req() to forward a WQE addressed + * to the same HCA. + * Note that the receive interrupt handler may be calling qib_ud_rcv() + * while this is being called. 
+ */ +static void qib_ud_loopback(struct qib_qp *sqp, struct qib_swqe *swqe) +{ + struct qib_ibport *ibp = to_iport(sqp->ibqp.device, sqp->port_num); + struct qib_pportdata *ppd; + struct qib_qp *qp; + struct ib_ah_attr *ah_attr; + unsigned long flags; + struct qib_sge_state ssge; + struct qib_sge *sge; + struct ib_wc wc; + u32 length; + enum ib_qp_type sqptype, dqptype; + + qp = qib_lookup_qpn(ibp, swqe->wr.wr.ud.remote_qpn); + if (!qp) { + ibp->n_pkt_drops++; + return; + } + + sqptype = sqp->ibqp.qp_type == IB_QPT_GSI ? + IB_QPT_UD : sqp->ibqp.qp_type; + dqptype = qp->ibqp.qp_type == IB_QPT_GSI ? + IB_QPT_UD : qp->ibqp.qp_type; + + if (dqptype != sqptype || + !(ib_qib_state_ops[qp->state] & QIB_PROCESS_RECV_OK)) { + ibp->n_pkt_drops++; + goto drop; + } + + ah_attr = &to_iah(swqe->wr.wr.ud.ah)->attr; + ppd = ppd_from_ibp(ibp); + + if (qp->ibqp.qp_num > 1) { + u16 pkey1; + u16 pkey2; + u16 lid; + + pkey1 = qib_get_pkey(ibp, sqp->s_pkey_index); + pkey2 = qib_get_pkey(ibp, qp->s_pkey_index); + if (unlikely(!qib_pkey_ok(pkey1, pkey2))) { + lid = ppd->lid | (ah_attr->src_path_bits & + ((1 << ppd->lmc) - 1)); + qib_bad_pqkey(ibp, IB_NOTICE_TRAP_BAD_PKEY, pkey1, + ah_attr->sl, + sqp->ibqp.qp_num, qp->ibqp.qp_num, + cpu_to_be16(lid), + cpu_to_be16(ah_attr->dlid)); + goto drop; + } + } + + /* + * Check that the qkey matches (except for QP0, see 9.6.1.4.1). + * Qkeys with the high order bit set mean use the + * qkey from the QP context instead of the WR (see 10.2.5). + */ + if (qp->ibqp.qp_num) { + u32 qkey; + + qkey = (int)swqe->wr.wr.ud.remote_qkey < 0 ? + sqp->qkey : swqe->wr.wr.ud.remote_qkey; + if (unlikely(qkey != qp->qkey)) { + u16 lid; + + lid = ppd->lid | (ah_attr->src_path_bits & + ((1 << ppd->lmc) - 1)); + qib_bad_pqkey(ibp, IB_NOTICE_TRAP_BAD_QKEY, qkey, + ah_attr->sl, + sqp->ibqp.qp_num, qp->ibqp.qp_num, + cpu_to_be16(lid), + cpu_to_be16(ah_attr->dlid)); + goto drop; + } + } + + /* + * A GRH is expected to precede the data even if not + * present on the wire. + */ + length = swqe->length; + memset(&wc, 0, sizeof(wc)); + wc.byte_len = length + sizeof(struct ib_grh); + + if (swqe->wr.opcode == IB_WR_SEND_WITH_IMM) { + wc.wc_flags = IB_WC_WITH_IMM; + wc.ex.imm_data = swqe->wr.ex.imm_data; + } + + spin_lock_irqsave(&qp->r_lock, flags); + + /* + * Get the next work request entry to find where to put the data. + */ + if (qp->r_flags & QIB_R_REUSE_SGE) + qp->r_flags &= ~QIB_R_REUSE_SGE; + else { + int ret; + + ret = qib_get_rwqe(qp, 0); + if (ret < 0) { + qib_rc_error(qp, IB_WC_LOC_QP_OP_ERR); + goto bail_unlock; + } + if (!ret) { + if (qp->ibqp.qp_num == 0) + ibp->n_vl15_dropped++; + goto bail_unlock; + } + } + /* Silently drop packets which are too big. 
*/ + if (unlikely(wc.byte_len > qp->r_len)) { + qp->r_flags |= QIB_R_REUSE_SGE; + ibp->n_pkt_drops++; + goto bail_unlock; + } + + if (ah_attr->ah_flags & IB_AH_GRH) { + qib_copy_sge(&qp->r_sge, &ah_attr->grh, + sizeof(struct ib_grh), 1); + wc.wc_flags |= IB_WC_GRH; + } else + qib_skip_sge(&qp->r_sge, sizeof(struct ib_grh), 1); + ssge.sg_list = swqe->sg_list + 1; + ssge.sge = *swqe->sg_list; + ssge.num_sge = swqe->wr.num_sge; + sge = &ssge.sge; + while (length) { + u32 len = sge->length; + + if (len > length) + len = length; + if (len > sge->sge_length) + len = sge->sge_length; + BUG_ON(len == 0); + qib_copy_sge(&qp->r_sge, sge->vaddr, len, 1); + sge->vaddr += len; + sge->length -= len; + sge->sge_length -= len; + if (sge->sge_length == 0) { + if (--ssge.num_sge) + *sge = *ssge.sg_list++; + } else if (sge->length == 0 && sge->mr->lkey) { + if (++sge->n >= QIB_SEGSZ) { + if (++sge->m >= sge->mr->mapsz) + break; + sge->n = 0; + } + sge->vaddr = + sge->mr->map[sge->m]->segs[sge->n].vaddr; + sge->length = + sge->mr->map[sge->m]->segs[sge->n].length; + } + length -= len; + } + qib_put_ss(&qp->r_sge); + if (!test_and_clear_bit(QIB_R_WRID_VALID, &qp->r_aflags)) + goto bail_unlock; + wc.wr_id = qp->r_wr_id; + wc.status = IB_WC_SUCCESS; + wc.opcode = IB_WC_RECV; + wc.qp = &qp->ibqp; + wc.src_qp = sqp->ibqp.qp_num; + wc.pkey_index = qp->ibqp.qp_type == IB_QPT_GSI ? + swqe->wr.wr.ud.pkey_index : 0; + wc.slid = ppd->lid | (ah_attr->src_path_bits & ((1 << ppd->lmc) - 1)); + wc.sl = ah_attr->sl; + wc.dlid_path_bits = ah_attr->dlid & ((1 << ppd->lmc) - 1); + wc.port_num = qp->port_num; + /* Signal completion event if the solicited bit is set. */ + qib_cq_enter(to_icq(qp->ibqp.recv_cq), &wc, + swqe->wr.send_flags & IB_SEND_SOLICITED); + ibp->n_loop_pkts++; +bail_unlock: + spin_unlock_irqrestore(&qp->r_lock, flags); +drop: + if (atomic_dec_and_test(&qp->refcount)) + wake_up(&qp->wait); +} + +/** + * qib_make_ud_req - construct a UD request packet + * @qp: the QP + * + * Return 1 if constructed; otherwise, return 0. + */ +int qib_make_ud_req(struct qib_qp *qp) +{ + struct qib_other_headers *ohdr; + struct ib_ah_attr *ah_attr; + struct qib_pportdata *ppd; + struct qib_ibport *ibp; + struct qib_swqe *wqe; + unsigned long flags; + u32 nwords; + u32 extra_bytes; + u32 bth0; + u16 lrh0; + u16 lid; + int ret = 0; + int next_cur; + + spin_lock_irqsave(&qp->s_lock, flags); + + if (!(ib_qib_state_ops[qp->state] & QIB_PROCESS_NEXT_SEND_OK)) { + if (!(ib_qib_state_ops[qp->state] & QIB_FLUSH_SEND)) + goto bail; + /* We are in the error state, flush the work request. */ + if (qp->s_last == qp->s_head) + goto bail; + /* If DMAs are in progress, we can't flush immediately. */ + if (atomic_read(&qp->s_dma_busy)) { + qp->s_flags |= QIB_S_WAIT_DMA; + goto bail; + } + wqe = get_swqe_ptr(qp, qp->s_last); + qib_send_complete(qp, wqe, IB_WC_WR_FLUSH_ERR); + goto done; + } + + if (qp->s_cur == qp->s_head) + goto bail; + + wqe = get_swqe_ptr(qp, qp->s_cur); + next_cur = qp->s_cur + 1; + if (next_cur >= qp->s_size) + next_cur = 0; + + /* Construct the header. 
*/ + ibp = to_iport(qp->ibqp.device, qp->port_num); + ppd = ppd_from_ibp(ibp); + ah_attr = &to_iah(wqe->wr.wr.ud.ah)->attr; + if (ah_attr->dlid >= QIB_MULTICAST_LID_BASE) { + if (ah_attr->dlid != QIB_PERMISSIVE_LID) + this_cpu_inc(ibp->pmastats->n_multicast_xmit); + else + this_cpu_inc(ibp->pmastats->n_unicast_xmit); + } else { + this_cpu_inc(ibp->pmastats->n_unicast_xmit); + lid = ah_attr->dlid & ~((1 << ppd->lmc) - 1); + if (unlikely(lid == ppd->lid)) { + /* + * If DMAs are in progress, we can't generate + * a completion for the loopback packet since + * it would be out of order. + * XXX Instead of waiting, we could queue a + * zero length descriptor so we get a callback. + */ + if (atomic_read(&qp->s_dma_busy)) { + qp->s_flags |= QIB_S_WAIT_DMA; + goto bail; + } + qp->s_cur = next_cur; + spin_unlock_irqrestore(&qp->s_lock, flags); + qib_ud_loopback(qp, wqe); + spin_lock_irqsave(&qp->s_lock, flags); + qib_send_complete(qp, wqe, IB_WC_SUCCESS); + goto done; + } + } + + qp->s_cur = next_cur; + extra_bytes = -wqe->length & 3; + nwords = (wqe->length + extra_bytes) >> 2; + + /* header size in 32-bit words LRH+BTH+DETH = (8+12+8)/4. */ + qp->s_hdrwords = 7; + qp->s_cur_size = wqe->length; + qp->s_cur_sge = &qp->s_sge; + qp->s_srate = ah_attr->static_rate; + qp->s_wqe = wqe; + qp->s_sge.sge = wqe->sg_list[0]; + qp->s_sge.sg_list = wqe->sg_list + 1; + qp->s_sge.num_sge = wqe->wr.num_sge; + qp->s_sge.total_len = wqe->length; + + if (ah_attr->ah_flags & IB_AH_GRH) { + /* Header size in 32-bit words. */ + qp->s_hdrwords += qib_make_grh(ibp, &qp->s_hdr->u.l.grh, + &ah_attr->grh, + qp->s_hdrwords, nwords); + lrh0 = QIB_LRH_GRH; + ohdr = &qp->s_hdr->u.l.oth; + /* + * Don't worry about sending to locally attached multicast + * QPs. It is unspecified by the spec. what happens. + */ + } else { + /* Header size in 32-bit words. */ + lrh0 = QIB_LRH_BTH; + ohdr = &qp->s_hdr->u.oth; + } + if (wqe->wr.opcode == IB_WR_SEND_WITH_IMM) { + qp->s_hdrwords++; + ohdr->u.ud.imm_data = wqe->wr.ex.imm_data; + bth0 = IB_OPCODE_UD_SEND_ONLY_WITH_IMMEDIATE << 24; + } else + bth0 = IB_OPCODE_UD_SEND_ONLY << 24; + lrh0 |= ah_attr->sl << 4; + if (qp->ibqp.qp_type == IB_QPT_SMI) + lrh0 |= 0xF000; /* Set VL (see ch. 13.5.3.1) */ + else + lrh0 |= ibp->sl_to_vl[ah_attr->sl] << 12; + qp->s_hdr->lrh[0] = cpu_to_be16(lrh0); + qp->s_hdr->lrh[1] = cpu_to_be16(ah_attr->dlid); /* DEST LID */ + qp->s_hdr->lrh[2] = cpu_to_be16(qp->s_hdrwords + nwords + SIZE_OF_CRC); + lid = ppd->lid; + if (lid) { + lid |= ah_attr->src_path_bits & ((1 << ppd->lmc) - 1); + qp->s_hdr->lrh[3] = cpu_to_be16(lid); + } else + qp->s_hdr->lrh[3] = IB_LID_PERMISSIVE; + if (wqe->wr.send_flags & IB_SEND_SOLICITED) + bth0 |= IB_BTH_SOLICITED; + bth0 |= extra_bytes << 20; + bth0 |= qp->ibqp.qp_type == IB_QPT_SMI ? QIB_DEFAULT_P_KEY : + qib_get_pkey(ibp, qp->ibqp.qp_type == IB_QPT_GSI ? + wqe->wr.wr.ud.pkey_index : qp->s_pkey_index); + ohdr->bth[0] = cpu_to_be32(bth0); + /* + * Use the multicast QP if the destination LID is a multicast LID. + */ + ohdr->bth[1] = ah_attr->dlid >= QIB_MULTICAST_LID_BASE && + ah_attr->dlid != QIB_PERMISSIVE_LID ? + cpu_to_be32(QIB_MULTICAST_QPN) : + cpu_to_be32(wqe->wr.wr.ud.remote_qpn); + ohdr->bth[2] = cpu_to_be32(qp->s_next_psn++ & QIB_PSN_MASK); + /* + * Qkeys with the high order bit set mean use the + * qkey from the QP context instead of the WR (see 10.2.5). + */ + ohdr->u.ud.deth[0] = cpu_to_be32((int)wqe->wr.wr.ud.remote_qkey < 0 ? 
+ qp->qkey : wqe->wr.wr.ud.remote_qkey); + ohdr->u.ud.deth[1] = cpu_to_be32(qp->ibqp.qp_num); + +done: + ret = 1; + goto unlock; + +bail: + qp->s_flags &= ~QIB_S_BUSY; +unlock: + spin_unlock_irqrestore(&qp->s_lock, flags); + return ret; +} + +static unsigned qib_lookup_pkey(struct qib_ibport *ibp, u16 pkey) +{ + struct qib_pportdata *ppd = ppd_from_ibp(ibp); + struct qib_devdata *dd = ppd->dd; + unsigned ctxt = ppd->hw_pidx; + unsigned i; + + pkey &= 0x7fff; /* remove limited/full membership bit */ + + for (i = 0; i < ARRAY_SIZE(dd->rcd[ctxt]->pkeys); ++i) + if ((dd->rcd[ctxt]->pkeys[i] & 0x7fff) == pkey) + return i; + + /* + * Should not get here, this means hardware failed to validate pkeys. + * Punt and return index 0. + */ + return 0; +} + +/** + * qib_ud_rcv - receive an incoming UD packet + * @ibp: the port the packet came in on + * @hdr: the packet header + * @has_grh: true if the packet has a GRH + * @data: the packet data + * @tlen: the packet length + * @qp: the QP the packet came on + * + * This is called from qib_qp_rcv() to process an incoming UD packet + * for the given QP. + * Called at interrupt level. + */ +void qib_ud_rcv(struct qib_ibport *ibp, struct qib_ib_header *hdr, + int has_grh, void *data, u32 tlen, struct qib_qp *qp) +{ + struct qib_other_headers *ohdr; + int opcode; + u32 hdrsize; + u32 pad; + struct ib_wc wc; + u32 qkey; + u32 src_qp; + u16 dlid; + + /* Check for GRH */ + if (!has_grh) { + ohdr = &hdr->u.oth; + hdrsize = 8 + 12 + 8; /* LRH + BTH + DETH */ + } else { + ohdr = &hdr->u.l.oth; + hdrsize = 8 + 40 + 12 + 8; /* LRH + GRH + BTH + DETH */ + } + qkey = be32_to_cpu(ohdr->u.ud.deth[0]); + src_qp = be32_to_cpu(ohdr->u.ud.deth[1]) & QIB_QPN_MASK; + + /* + * Get the number of bytes the message was padded by + * and drop incomplete packets. + */ + pad = (be32_to_cpu(ohdr->bth[0]) >> 20) & 3; + if (unlikely(tlen < (hdrsize + pad + 4))) + goto drop; + + tlen -= hdrsize + pad + 4; + + /* + * Check that the permissive LID is only used on QP0 + * and the QKEY matches (see 9.6.1.4.1 and 9.6.1.5.1). + */ + if (qp->ibqp.qp_num) { + if (unlikely(hdr->lrh[1] == IB_LID_PERMISSIVE || + hdr->lrh[3] == IB_LID_PERMISSIVE)) + goto drop; + if (qp->ibqp.qp_num > 1) { + u16 pkey1, pkey2; + + pkey1 = be32_to_cpu(ohdr->bth[0]); + pkey2 = qib_get_pkey(ibp, qp->s_pkey_index); + if (unlikely(!qib_pkey_ok(pkey1, pkey2))) { + qib_bad_pqkey(ibp, IB_NOTICE_TRAP_BAD_PKEY, + pkey1, + (be16_to_cpu(hdr->lrh[0]) >> 4) & + 0xF, + src_qp, qp->ibqp.qp_num, + hdr->lrh[3], hdr->lrh[1]); + return; + } + } + if (unlikely(qkey != qp->qkey)) { + qib_bad_pqkey(ibp, IB_NOTICE_TRAP_BAD_QKEY, qkey, + (be16_to_cpu(hdr->lrh[0]) >> 4) & 0xF, + src_qp, qp->ibqp.qp_num, + hdr->lrh[3], hdr->lrh[1]); + return; + } + /* Drop invalid MAD packets (see 13.5.3.1). */ + if (unlikely(qp->ibqp.qp_num == 1 && + (tlen != 256 || + (be16_to_cpu(hdr->lrh[0]) >> 12) == 15))) + goto drop; + } else { + struct ib_smp *smp; + + /* Drop invalid MAD packets (see 13.5.3.1). */ + if (tlen != 256 || (be16_to_cpu(hdr->lrh[0]) >> 12) != 15) + goto drop; + smp = (struct ib_smp *) data; + if ((hdr->lrh[1] == IB_LID_PERMISSIVE || + hdr->lrh[3] == IB_LID_PERMISSIVE) && + smp->mgmt_class != IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE) + goto drop; + } + + /* + * The opcode is in the low byte when its in network order + * (top byte when in host order). 
+ */ + opcode = be32_to_cpu(ohdr->bth[0]) >> 24; + if (qp->ibqp.qp_num > 1 && + opcode == IB_OPCODE_UD_SEND_ONLY_WITH_IMMEDIATE) { + wc.ex.imm_data = ohdr->u.ud.imm_data; + wc.wc_flags = IB_WC_WITH_IMM; + tlen -= sizeof(u32); + } else if (opcode == IB_OPCODE_UD_SEND_ONLY) { + wc.ex.imm_data = 0; + wc.wc_flags = 0; + } else + goto drop; + + /* + * A GRH is expected to precede the data even if not + * present on the wire. + */ + wc.byte_len = tlen + sizeof(struct ib_grh); + + /* + * Get the next work request entry to find where to put the data. + */ + if (qp->r_flags & QIB_R_REUSE_SGE) + qp->r_flags &= ~QIB_R_REUSE_SGE; + else { + int ret; + + ret = qib_get_rwqe(qp, 0); + if (ret < 0) { + qib_rc_error(qp, IB_WC_LOC_QP_OP_ERR); + return; + } + if (!ret) { + if (qp->ibqp.qp_num == 0) + ibp->n_vl15_dropped++; + return; + } + } + /* Silently drop packets which are too big. */ + if (unlikely(wc.byte_len > qp->r_len)) { + qp->r_flags |= QIB_R_REUSE_SGE; + goto drop; + } + if (has_grh) { + qib_copy_sge(&qp->r_sge, &hdr->u.l.grh, + sizeof(struct ib_grh), 1); + wc.wc_flags |= IB_WC_GRH; + } else + qib_skip_sge(&qp->r_sge, sizeof(struct ib_grh), 1); + qib_copy_sge(&qp->r_sge, data, wc.byte_len - sizeof(struct ib_grh), 1); + qib_put_ss(&qp->r_sge); + if (!test_and_clear_bit(QIB_R_WRID_VALID, &qp->r_aflags)) + return; + wc.wr_id = qp->r_wr_id; + wc.status = IB_WC_SUCCESS; + wc.opcode = IB_WC_RECV; + wc.vendor_err = 0; + wc.qp = &qp->ibqp; + wc.src_qp = src_qp; + wc.pkey_index = qp->ibqp.qp_type == IB_QPT_GSI ? + qib_lookup_pkey(ibp, be32_to_cpu(ohdr->bth[0])) : 0; + wc.slid = be16_to_cpu(hdr->lrh[3]); + wc.sl = (be16_to_cpu(hdr->lrh[0]) >> 4) & 0xF; + dlid = be16_to_cpu(hdr->lrh[1]); + /* + * Save the LMC lower bits if the destination LID is a unicast LID. + */ + wc.dlid_path_bits = dlid >= QIB_MULTICAST_LID_BASE ? 0 : + dlid & ((1 << ppd_from_ibp(ibp)->lmc) - 1); + wc.port_num = qp->port_num; + /* Signal completion event if the solicited bit is set. */ + qib_cq_enter(to_icq(qp->ibqp.recv_cq), &wc, + (ohdr->bth[0] & + cpu_to_be32(IB_BTH_SOLICITED)) != 0); + return; + +drop: + ibp->n_pkt_drops++; +} diff --git a/kernel/drivers/infiniband/hw/qib/qib_user_pages.c b/kernel/drivers/infiniband/hw/qib/qib_user_pages.c new file mode 100644 index 000000000..74f90b261 --- /dev/null +++ b/kernel/drivers/infiniband/hw/qib/qib_user_pages.c @@ -0,0 +1,157 @@ +/* + * Copyright (c) 2006, 2007, 2008, 2009 QLogic Corporation. All rights reserved. + * Copyright (c) 2003, 2004, 2005, 2006 PathScale, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include +#include + +#include "qib.h" + +static void __qib_release_user_pages(struct page **p, size_t num_pages, + int dirty) +{ + size_t i; + + for (i = 0; i < num_pages; i++) { + if (dirty) + set_page_dirty_lock(p[i]); + put_page(p[i]); + } +} + +/* + * Call with current->mm->mmap_sem held. + */ +static int __qib_get_user_pages(unsigned long start_page, size_t num_pages, + struct page **p) +{ + unsigned long lock_limit; + size_t got; + int ret; + + lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT; + + if (num_pages > lock_limit && !capable(CAP_IPC_LOCK)) { + ret = -ENOMEM; + goto bail; + } + + for (got = 0; got < num_pages; got += ret) { + ret = get_user_pages(current, current->mm, + start_page + got * PAGE_SIZE, + num_pages - got, 1, 1, + p + got, NULL); + if (ret < 0) + goto bail_release; + } + + current->mm->pinned_vm += num_pages; + + ret = 0; + goto bail; + +bail_release: + __qib_release_user_pages(p, got, 0); +bail: + return ret; +} + +/** + * qib_map_page - a safety wrapper around pci_map_page() + * + * A dma_addr of all 0's is interpreted by the chip as "disabled". + * Unfortunately, it can also be a valid dma_addr returned on some + * architectures. + * + * The powerpc iommu assigns dma_addrs in ascending order, so we don't + * have to bother with retries or mapping a dummy page to insure we + * don't just get the same mapping again. + * + * I'm sure we won't be so lucky with other iommu's, so FIXME. + */ +dma_addr_t qib_map_page(struct pci_dev *hwdev, struct page *page, + unsigned long offset, size_t size, int direction) +{ + dma_addr_t phys; + + phys = pci_map_page(hwdev, page, offset, size, direction); + + if (phys == 0) { + pci_unmap_page(hwdev, phys, size, direction); + phys = pci_map_page(hwdev, page, offset, size, direction); + /* + * FIXME: If we get 0 again, we should keep this page, + * map another, then free the 0 page. + */ + } + + return phys; +} + +/** + * qib_get_user_pages - lock user pages into memory + * @start_page: the start page + * @num_pages: the number of pages + * @p: the output page structures + * + * This function takes a given start page (page aligned user virtual + * address) and pins it and the following specified number of pages. For + * now, num_pages is always 1, but that will probably change at some point + * (because caller is doing expected sends on a single virtually contiguous + * buffer, so we can do all pages at once). 
+ */ +int qib_get_user_pages(unsigned long start_page, size_t num_pages, + struct page **p) +{ + int ret; + + down_write(¤t->mm->mmap_sem); + + ret = __qib_get_user_pages(start_page, num_pages, p); + + up_write(¤t->mm->mmap_sem); + + return ret; +} + +void qib_release_user_pages(struct page **p, size_t num_pages) +{ + if (current->mm) /* during close after signal, mm can be NULL */ + down_write(¤t->mm->mmap_sem); + + __qib_release_user_pages(p, num_pages, 1); + + if (current->mm) { + current->mm->pinned_vm -= num_pages; + up_write(¤t->mm->mmap_sem); + } +} diff --git a/kernel/drivers/infiniband/hw/qib/qib_user_sdma.c b/kernel/drivers/infiniband/hw/qib/qib_user_sdma.c new file mode 100644 index 000000000..3e0677c51 --- /dev/null +++ b/kernel/drivers/infiniband/hw/qib/qib_user_sdma.c @@ -0,0 +1,1465 @@ +/* + * Copyright (c) 2007, 2008, 2009 QLogic Corporation. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "qib.h" +#include "qib_user_sdma.h" + +/* minimum size of header */ +#define QIB_USER_SDMA_MIN_HEADER_LENGTH 64 +/* expected size of headers (for dma_pool) */ +#define QIB_USER_SDMA_EXP_HEADER_LENGTH 64 +/* attempt to drain the queue for 5secs */ +#define QIB_USER_SDMA_DRAIN_TIMEOUT 250 + +/* + * track how many times a process open this driver. + */ +static struct rb_root qib_user_sdma_rb_root = RB_ROOT; + +struct qib_user_sdma_rb_node { + struct rb_node node; + int refcount; + pid_t pid; +}; + +struct qib_user_sdma_pkt { + struct list_head list; /* list element */ + + u8 tiddma; /* if this is NEW tid-sdma */ + u8 largepkt; /* this is large pkt from kmalloc */ + u16 frag_size; /* frag size used by PSM */ + u16 index; /* last header index or push index */ + u16 naddr; /* dimension of addr (1..3) ... 
*/ + u16 addrlimit; /* addr array size */ + u16 tidsmidx; /* current tidsm index */ + u16 tidsmcount; /* tidsm array item count */ + u16 payload_size; /* payload size so far for header */ + u32 bytes_togo; /* bytes for processing */ + u32 counter; /* sdma pkts queued counter for this entry */ + struct qib_tid_session_member *tidsm; /* tid session member array */ + struct qib_user_sdma_queue *pq; /* which pq this pkt belongs to */ + u64 added; /* global descq number of entries */ + + struct { + u16 offset; /* offset for kvaddr, addr */ + u16 length; /* length in page */ + u16 first_desc; /* first desc */ + u16 last_desc; /* last desc */ + u16 put_page; /* should we put_page? */ + u16 dma_mapped; /* is page dma_mapped? */ + u16 dma_length; /* for dma_unmap_page() */ + u16 padding; + struct page *page; /* may be NULL (coherent mem) */ + void *kvaddr; /* FIXME: only for pio hack */ + dma_addr_t addr; + } addr[4]; /* max pages, any more and we coalesce */ +}; + +struct qib_user_sdma_queue { + /* + * pkts sent to dma engine are queued on this + * list head. the type of the elements of this + * list are struct qib_user_sdma_pkt... + */ + struct list_head sent; + + /* + * Because above list will be accessed by both process and + * signal handler, we need a spinlock for it. + */ + spinlock_t sent_lock ____cacheline_aligned_in_smp; + + /* headers with expected length are allocated from here... */ + char header_cache_name[64]; + struct dma_pool *header_cache; + + /* packets are allocated from the slab cache... */ + char pkt_slab_name[64]; + struct kmem_cache *pkt_slab; + + /* as packets go on the queued queue, they are counted... */ + u32 counter; + u32 sent_counter; + /* pending packets, not sending yet */ + u32 num_pending; + /* sending packets, not complete yet */ + u32 num_sending; + /* global descq number of entry of last sending packet */ + u64 added; + + /* dma page table */ + struct rb_root dma_pages_root; + + struct qib_user_sdma_rb_node *sdma_rb_node; + + /* protect everything above... 
*/ + struct mutex lock; +}; + +static struct qib_user_sdma_rb_node * +qib_user_sdma_rb_search(struct rb_root *root, pid_t pid) +{ + struct qib_user_sdma_rb_node *sdma_rb_node; + struct rb_node *node = root->rb_node; + + while (node) { + sdma_rb_node = container_of(node, + struct qib_user_sdma_rb_node, node); + if (pid < sdma_rb_node->pid) + node = node->rb_left; + else if (pid > sdma_rb_node->pid) + node = node->rb_right; + else + return sdma_rb_node; + } + return NULL; +} + +static int +qib_user_sdma_rb_insert(struct rb_root *root, struct qib_user_sdma_rb_node *new) +{ + struct rb_node **node = &(root->rb_node); + struct rb_node *parent = NULL; + struct qib_user_sdma_rb_node *got; + + while (*node) { + got = container_of(*node, struct qib_user_sdma_rb_node, node); + parent = *node; + if (new->pid < got->pid) + node = &((*node)->rb_left); + else if (new->pid > got->pid) + node = &((*node)->rb_right); + else + return 0; + } + + rb_link_node(&new->node, parent, node); + rb_insert_color(&new->node, root); + return 1; +} + +struct qib_user_sdma_queue * +qib_user_sdma_queue_create(struct device *dev, int unit, int ctxt, int sctxt) +{ + struct qib_user_sdma_queue *pq = + kmalloc(sizeof(struct qib_user_sdma_queue), GFP_KERNEL); + struct qib_user_sdma_rb_node *sdma_rb_node; + + if (!pq) + goto done; + + pq->counter = 0; + pq->sent_counter = 0; + pq->num_pending = 0; + pq->num_sending = 0; + pq->added = 0; + pq->sdma_rb_node = NULL; + + INIT_LIST_HEAD(&pq->sent); + spin_lock_init(&pq->sent_lock); + mutex_init(&pq->lock); + + snprintf(pq->pkt_slab_name, sizeof(pq->pkt_slab_name), + "qib-user-sdma-pkts-%u-%02u.%02u", unit, ctxt, sctxt); + pq->pkt_slab = kmem_cache_create(pq->pkt_slab_name, + sizeof(struct qib_user_sdma_pkt), + 0, 0, NULL); + + if (!pq->pkt_slab) + goto err_kfree; + + snprintf(pq->header_cache_name, sizeof(pq->header_cache_name), + "qib-user-sdma-headers-%u-%02u.%02u", unit, ctxt, sctxt); + pq->header_cache = dma_pool_create(pq->header_cache_name, + dev, + QIB_USER_SDMA_EXP_HEADER_LENGTH, + 4, 0); + if (!pq->header_cache) + goto err_slab; + + pq->dma_pages_root = RB_ROOT; + + sdma_rb_node = qib_user_sdma_rb_search(&qib_user_sdma_rb_root, + current->pid); + if (sdma_rb_node) { + sdma_rb_node->refcount++; + } else { + int ret; + + sdma_rb_node = kmalloc(sizeof( + struct qib_user_sdma_rb_node), GFP_KERNEL); + if (!sdma_rb_node) + goto err_rb; + + sdma_rb_node->refcount = 1; + sdma_rb_node->pid = current->pid; + + ret = qib_user_sdma_rb_insert(&qib_user_sdma_rb_root, + sdma_rb_node); + BUG_ON(ret == 0); + } + pq->sdma_rb_node = sdma_rb_node; + + goto done; + +err_rb: + dma_pool_destroy(pq->header_cache); +err_slab: + kmem_cache_destroy(pq->pkt_slab); +err_kfree: + kfree(pq); + pq = NULL; + +done: + return pq; +} + +static void qib_user_sdma_init_frag(struct qib_user_sdma_pkt *pkt, + int i, u16 offset, u16 len, + u16 first_desc, u16 last_desc, + u16 put_page, u16 dma_mapped, + struct page *page, void *kvaddr, + dma_addr_t dma_addr, u16 dma_length) +{ + pkt->addr[i].offset = offset; + pkt->addr[i].length = len; + pkt->addr[i].first_desc = first_desc; + pkt->addr[i].last_desc = last_desc; + pkt->addr[i].put_page = put_page; + pkt->addr[i].dma_mapped = dma_mapped; + pkt->addr[i].page = page; + pkt->addr[i].kvaddr = kvaddr; + pkt->addr[i].addr = dma_addr; + pkt->addr[i].dma_length = dma_length; +} + +static void *qib_user_sdma_alloc_header(struct qib_user_sdma_queue *pq, + size_t len, dma_addr_t *dma_addr) +{ + void *hdr; + + if (len == QIB_USER_SDMA_EXP_HEADER_LENGTH) + hdr = 
dma_pool_alloc(pq->header_cache, GFP_KERNEL, + dma_addr); + else + hdr = NULL; + + if (!hdr) { + hdr = kmalloc(len, GFP_KERNEL); + if (!hdr) + return NULL; + + *dma_addr = 0; + } + + return hdr; +} + +static int qib_user_sdma_page_to_frags(const struct qib_devdata *dd, + struct qib_user_sdma_queue *pq, + struct qib_user_sdma_pkt *pkt, + struct page *page, u16 put, + u16 offset, u16 len, void *kvaddr) +{ + __le16 *pbc16; + void *pbcvaddr; + struct qib_message_header *hdr; + u16 newlen, pbclen, lastdesc, dma_mapped; + u32 vcto; + union qib_seqnum seqnum; + dma_addr_t pbcdaddr; + dma_addr_t dma_addr = + dma_map_page(&dd->pcidev->dev, + page, offset, len, DMA_TO_DEVICE); + int ret = 0; + + if (dma_mapping_error(&dd->pcidev->dev, dma_addr)) { + /* + * dma mapping error, pkt has not managed + * this page yet, return the page here so + * the caller can ignore this page. + */ + if (put) { + put_page(page); + } else { + /* coalesce case */ + kunmap(page); + __free_page(page); + } + ret = -ENOMEM; + goto done; + } + offset = 0; + dma_mapped = 1; + + +next_fragment: + + /* + * In tid-sdma, the transfer length is restricted by + * receiver side current tid page length. + */ + if (pkt->tiddma && len > pkt->tidsm[pkt->tidsmidx].length) + newlen = pkt->tidsm[pkt->tidsmidx].length; + else + newlen = len; + + /* + * Then the transfer length is restricted by MTU. + * the last descriptor flag is determined by: + * 1. the current packet is at frag size length. + * 2. the current tid page is done if tid-sdma. + * 3. there is no more byte togo if sdma. + */ + lastdesc = 0; + if ((pkt->payload_size + newlen) >= pkt->frag_size) { + newlen = pkt->frag_size - pkt->payload_size; + lastdesc = 1; + } else if (pkt->tiddma) { + if (newlen == pkt->tidsm[pkt->tidsmidx].length) + lastdesc = 1; + } else { + if (newlen == pkt->bytes_togo) + lastdesc = 1; + } + + /* fill the next fragment in this page */ + qib_user_sdma_init_frag(pkt, pkt->naddr, /* index */ + offset, newlen, /* offset, len */ + 0, lastdesc, /* first last desc */ + put, dma_mapped, /* put page, dma mapped */ + page, kvaddr, /* struct page, virt addr */ + dma_addr, len); /* dma addr, dma length */ + pkt->bytes_togo -= newlen; + pkt->payload_size += newlen; + pkt->naddr++; + if (pkt->naddr == pkt->addrlimit) { + ret = -EFAULT; + goto done; + } + + /* If there is no more byte togo. (lastdesc==1) */ + if (pkt->bytes_togo == 0) { + /* The packet is done, header is not dma mapped yet. + * it should be from kmalloc */ + if (!pkt->addr[pkt->index].addr) { + pkt->addr[pkt->index].addr = + dma_map_single(&dd->pcidev->dev, + pkt->addr[pkt->index].kvaddr, + pkt->addr[pkt->index].dma_length, + DMA_TO_DEVICE); + if (dma_mapping_error(&dd->pcidev->dev, + pkt->addr[pkt->index].addr)) { + ret = -ENOMEM; + goto done; + } + pkt->addr[pkt->index].dma_mapped = 1; + } + + goto done; + } + + /* If tid-sdma, advance tid info. */ + if (pkt->tiddma) { + pkt->tidsm[pkt->tidsmidx].length -= newlen; + if (pkt->tidsm[pkt->tidsmidx].length) { + pkt->tidsm[pkt->tidsmidx].offset += newlen; + } else { + pkt->tidsmidx++; + if (pkt->tidsmidx == pkt->tidsmcount) { + ret = -EFAULT; + goto done; + } + } + } + + /* + * If this is NOT the last descriptor. (newlen==len) + * the current packet is not done yet, but the current + * send side page is done. + */ + if (lastdesc == 0) + goto done; + + /* + * If running this driver under PSM with message size + * fitting into one transfer unit, it is not possible + * to pass this line. otherwise, it is a buggggg. 
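+ * Reaching this point means lastdesc was set while bytes_togo is
+ * still non-zero: either payload_size hit frag_size, or (for
+ * tid-sdma) the current tid entry was used up. The leftover bytes
+ * therefore need a follow-on packet with its own PBC/header,
+ * which is built below.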
+ */ + + /* + * Since the current packet is done, and there are more + * bytes togo, we need to create a new sdma header, copying + * from previous sdma header and modify both. + */ + pbclen = pkt->addr[pkt->index].length; + pbcvaddr = qib_user_sdma_alloc_header(pq, pbclen, &pbcdaddr); + if (!pbcvaddr) { + ret = -ENOMEM; + goto done; + } + /* Copy the previous sdma header to new sdma header */ + pbc16 = (__le16 *)pkt->addr[pkt->index].kvaddr; + memcpy(pbcvaddr, pbc16, pbclen); + + /* Modify the previous sdma header */ + hdr = (struct qib_message_header *)&pbc16[4]; + + /* New pbc length */ + pbc16[0] = cpu_to_le16(le16_to_cpu(pbc16[0])-(pkt->bytes_togo>>2)); + + /* New packet length */ + hdr->lrh[2] = cpu_to_be16(le16_to_cpu(pbc16[0])); + + if (pkt->tiddma) { + /* turn on the header suppression */ + hdr->iph.pkt_flags = + cpu_to_le16(le16_to_cpu(hdr->iph.pkt_flags)|0x2); + /* turn off ACK_REQ: 0x04 and EXPECTED_DONE: 0x20 */ + hdr->flags &= ~(0x04|0x20); + } else { + /* turn off extra bytes: 20-21 bits */ + hdr->bth[0] = cpu_to_be32(be32_to_cpu(hdr->bth[0])&0xFFCFFFFF); + /* turn off ACK_REQ: 0x04 */ + hdr->flags &= ~(0x04); + } + + /* New kdeth checksum */ + vcto = le32_to_cpu(hdr->iph.ver_ctxt_tid_offset); + hdr->iph.chksum = cpu_to_le16(QIB_LRH_BTH + + be16_to_cpu(hdr->lrh[2]) - + ((vcto>>16)&0xFFFF) - (vcto&0xFFFF) - + le16_to_cpu(hdr->iph.pkt_flags)); + + /* The packet is done, header is not dma mapped yet. + * it should be from kmalloc */ + if (!pkt->addr[pkt->index].addr) { + pkt->addr[pkt->index].addr = + dma_map_single(&dd->pcidev->dev, + pkt->addr[pkt->index].kvaddr, + pkt->addr[pkt->index].dma_length, + DMA_TO_DEVICE); + if (dma_mapping_error(&dd->pcidev->dev, + pkt->addr[pkt->index].addr)) { + ret = -ENOMEM; + goto done; + } + pkt->addr[pkt->index].dma_mapped = 1; + } + + /* Modify the new sdma header */ + pbc16 = (__le16 *)pbcvaddr; + hdr = (struct qib_message_header *)&pbc16[4]; + + /* New pbc length */ + pbc16[0] = cpu_to_le16(le16_to_cpu(pbc16[0])-(pkt->payload_size>>2)); + + /* New packet length */ + hdr->lrh[2] = cpu_to_be16(le16_to_cpu(pbc16[0])); + + if (pkt->tiddma) { + /* Set new tid and offset for new sdma header */ + hdr->iph.ver_ctxt_tid_offset = cpu_to_le32( + (le32_to_cpu(hdr->iph.ver_ctxt_tid_offset)&0xFF000000) + + (pkt->tidsm[pkt->tidsmidx].tid<<QLOGIC_IB_I_TID_SHIFT) + + (pkt->tidsm[pkt->tidsmidx].offset>>2)); + } else { + /* Middle protocol new packet offset */ + hdr->uwords[2] += pkt->payload_size; + } + + /* New kdeth checksum */ + vcto = le32_to_cpu(hdr->iph.ver_ctxt_tid_offset); + hdr->iph.chksum = cpu_to_le16(QIB_LRH_BTH + + be16_to_cpu(hdr->lrh[2]) - + ((vcto>>16)&0xFFFF) - (vcto&0xFFFF) - + le16_to_cpu(hdr->iph.pkt_flags)); + + /* Next sequence number in new sdma header */ + seqnum.val = be32_to_cpu(hdr->bth[2]); + if (pkt->tiddma) + seqnum.seq++; + else + seqnum.pkt++; + hdr->bth[2] = cpu_to_be32(seqnum.val); + + /* Init new sdma header.
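+ * The new PBC/header is queued as its own frag with first_desc
+ * set, pkt->index is advanced to it so later fixups (length,
+ * checksum, dma mapping) target this header, and payload_size
+ * restarts at zero for the new packet.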
*/ + qib_user_sdma_init_frag(pkt, pkt->naddr, /* index */ + 0, pbclen, /* offset, len */ + 1, 0, /* first last desc */ + 0, 0, /* put page, dma mapped */ + NULL, pbcvaddr, /* struct page, virt addr */ + pbcdaddr, pbclen); /* dma addr, dma length */ + pkt->index = pkt->naddr; + pkt->payload_size = 0; + pkt->naddr++; + if (pkt->naddr == pkt->addrlimit) { + ret = -EFAULT; + goto done; + } + + /* Prepare for next fragment in this page */ + if (newlen != len) { + if (dma_mapped) { + put = 0; + dma_mapped = 0; + page = NULL; + kvaddr = NULL; + } + len -= newlen; + offset += newlen; + + goto next_fragment; + } + +done: + return ret; +} + +/* we've too many pages in the iovec, coalesce to a single page */ +static int qib_user_sdma_coalesce(const struct qib_devdata *dd, + struct qib_user_sdma_queue *pq, + struct qib_user_sdma_pkt *pkt, + const struct iovec *iov, + unsigned long niov) +{ + int ret = 0; + struct page *page = alloc_page(GFP_KERNEL); + void *mpage_save; + char *mpage; + int i; + int len = 0; + + if (!page) { + ret = -ENOMEM; + goto done; + } + + mpage = kmap(page); + mpage_save = mpage; + for (i = 0; i < niov; i++) { + int cfur; + + cfur = copy_from_user(mpage, + iov[i].iov_base, iov[i].iov_len); + if (cfur) { + ret = -EFAULT; + goto free_unmap; + } + + mpage += iov[i].iov_len; + len += iov[i].iov_len; + } + + ret = qib_user_sdma_page_to_frags(dd, pq, pkt, + page, 0, 0, len, mpage_save); + goto done; + +free_unmap: + kunmap(page); + __free_page(page); +done: + return ret; +} + +/* + * How many pages in this iovec element? + */ +static int qib_user_sdma_num_pages(const struct iovec *iov) +{ + const unsigned long addr = (unsigned long) iov->iov_base; + const unsigned long len = iov->iov_len; + const unsigned long spage = addr & PAGE_MASK; + const unsigned long epage = (addr + len - 1) & PAGE_MASK; + + return 1 + ((epage - spage) >> PAGE_SHIFT); +} + +static void qib_user_sdma_free_pkt_frag(struct device *dev, + struct qib_user_sdma_queue *pq, + struct qib_user_sdma_pkt *pkt, + int frag) +{ + const int i = frag; + + if (pkt->addr[i].page) { + /* only user data has page */ + if (pkt->addr[i].dma_mapped) + dma_unmap_page(dev, + pkt->addr[i].addr, + pkt->addr[i].dma_length, + DMA_TO_DEVICE); + + if (pkt->addr[i].kvaddr) + kunmap(pkt->addr[i].page); + + if (pkt->addr[i].put_page) + put_page(pkt->addr[i].page); + else + __free_page(pkt->addr[i].page); + } else if (pkt->addr[i].kvaddr) { + /* for headers */ + if (pkt->addr[i].dma_mapped) { + /* from kmalloc & dma mapped */ + dma_unmap_single(dev, + pkt->addr[i].addr, + pkt->addr[i].dma_length, + DMA_TO_DEVICE); + kfree(pkt->addr[i].kvaddr); + } else if (pkt->addr[i].addr) { + /* free coherent mem from cache... */ + dma_pool_free(pq->header_cache, + pkt->addr[i].kvaddr, pkt->addr[i].addr); + } else { + /* from kmalloc but not dma mapped */ + kfree(pkt->addr[i].kvaddr); + } + } +} + +/* return number of pages pinned... */ +static int qib_user_sdma_pin_pages(const struct qib_devdata *dd, + struct qib_user_sdma_queue *pq, + struct qib_user_sdma_pkt *pkt, + unsigned long addr, int tlen, int npages) +{ + struct page *pages[8]; + int i, j; + int ret = 0; + + while (npages) { + if (npages > 8) + j = 8; + else + j = npages; + + ret = get_user_pages_fast(addr, j, 0, pages); + if (ret != j) { + i = 0; + j = ret; + ret = -ENOMEM; + goto free_pages; + } + + for (i = 0; i < j; i++) { + /* map the pages... */ + unsigned long fofs = addr & ~PAGE_MASK; + int flen = ((fofs + tlen) > PAGE_SIZE) ? 
+ (PAGE_SIZE - fofs) : tlen; + + ret = qib_user_sdma_page_to_frags(dd, pq, pkt, + pages[i], 1, fofs, flen, NULL); + if (ret < 0) { + /* current page has beed taken + * care of inside above call. + */ + i++; + goto free_pages; + } + + addr += flen; + tlen -= flen; + } + + npages -= j; + } + + goto done; + + /* if error, return all pages not managed by pkt */ +free_pages: + while (i < j) + put_page(pages[i++]); + +done: + return ret; +} + +static int qib_user_sdma_pin_pkt(const struct qib_devdata *dd, + struct qib_user_sdma_queue *pq, + struct qib_user_sdma_pkt *pkt, + const struct iovec *iov, + unsigned long niov) +{ + int ret = 0; + unsigned long idx; + + for (idx = 0; idx < niov; idx++) { + const int npages = qib_user_sdma_num_pages(iov + idx); + const unsigned long addr = (unsigned long) iov[idx].iov_base; + + ret = qib_user_sdma_pin_pages(dd, pq, pkt, addr, + iov[idx].iov_len, npages); + if (ret < 0) + goto free_pkt; + } + + goto done; + +free_pkt: + /* we need to ignore the first entry here */ + for (idx = 1; idx < pkt->naddr; idx++) + qib_user_sdma_free_pkt_frag(&dd->pcidev->dev, pq, pkt, idx); + + /* need to dma unmap the first entry, this is to restore to + * the original state so that caller can free the memory in + * error condition. Caller does not know if dma mapped or not*/ + if (pkt->addr[0].dma_mapped) { + dma_unmap_single(&dd->pcidev->dev, + pkt->addr[0].addr, + pkt->addr[0].dma_length, + DMA_TO_DEVICE); + pkt->addr[0].addr = 0; + pkt->addr[0].dma_mapped = 0; + } + +done: + return ret; +} + +static int qib_user_sdma_init_payload(const struct qib_devdata *dd, + struct qib_user_sdma_queue *pq, + struct qib_user_sdma_pkt *pkt, + const struct iovec *iov, + unsigned long niov, int npages) +{ + int ret = 0; + + if (pkt->frag_size == pkt->bytes_togo && + npages >= ARRAY_SIZE(pkt->addr)) + ret = qib_user_sdma_coalesce(dd, pq, pkt, iov, niov); + else + ret = qib_user_sdma_pin_pkt(dd, pq, pkt, iov, niov); + + return ret; +} + +/* free a packet list -- return counter value of last packet */ +static void qib_user_sdma_free_pkt_list(struct device *dev, + struct qib_user_sdma_queue *pq, + struct list_head *list) +{ + struct qib_user_sdma_pkt *pkt, *pkt_next; + + list_for_each_entry_safe(pkt, pkt_next, list, list) { + int i; + + for (i = 0; i < pkt->naddr; i++) + qib_user_sdma_free_pkt_frag(dev, pq, pkt, i); + + if (pkt->largepkt) + kfree(pkt); + else + kmem_cache_free(pq->pkt_slab, pkt); + } + INIT_LIST_HEAD(list); +} + +/* + * copy headers, coalesce etc -- pq->lock must be held + * + * we queue all the packets to list, returning the + * number of bytes total. list must be empty initially, + * as, if there is an error we clean it... 
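+ * More precisely, on success the return value is the number of
+ * iovec entries consumed; *maxpkts is updated to the number of
+ * packets actually queued and *ndesc to the descriptors they need.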
+ */ +static int qib_user_sdma_queue_pkts(const struct qib_devdata *dd, + struct qib_pportdata *ppd, + struct qib_user_sdma_queue *pq, + const struct iovec *iov, + unsigned long niov, + struct list_head *list, + int *maxpkts, int *ndesc) +{ + unsigned long idx = 0; + int ret = 0; + int npkts = 0; + __le32 *pbc; + dma_addr_t dma_addr; + struct qib_user_sdma_pkt *pkt = NULL; + size_t len; + size_t nw; + u32 counter = pq->counter; + u16 frag_size; + + while (idx < niov && npkts < *maxpkts) { + const unsigned long addr = (unsigned long) iov[idx].iov_base; + const unsigned long idx_save = idx; + unsigned pktnw; + unsigned pktnwc; + int nfrags = 0; + int npages = 0; + int bytes_togo = 0; + int tiddma = 0; + int cfur; + + len = iov[idx].iov_len; + nw = len >> 2; + + if (len < QIB_USER_SDMA_MIN_HEADER_LENGTH || + len > PAGE_SIZE || len & 3 || addr & 3) { + ret = -EINVAL; + goto free_list; + } + + pbc = qib_user_sdma_alloc_header(pq, len, &dma_addr); + if (!pbc) { + ret = -ENOMEM; + goto free_list; + } + + cfur = copy_from_user(pbc, iov[idx].iov_base, len); + if (cfur) { + ret = -EFAULT; + goto free_pbc; + } + + /* + * This assignment is a bit strange. it's because the + * the pbc counts the number of 32 bit words in the full + * packet _except_ the first word of the pbc itself... + */ + pktnwc = nw - 1; + + /* + * pktnw computation yields the number of 32 bit words + * that the caller has indicated in the PBC. note that + * this is one less than the total number of words that + * goes to the send DMA engine as the first 32 bit word + * of the PBC itself is not counted. Armed with this count, + * we can verify that the packet is consistent with the + * iovec lengths. + */ + pktnw = le32_to_cpu(*pbc) & 0xFFFF; + if (pktnw < pktnwc) { + ret = -EINVAL; + goto free_pbc; + } + + idx++; + while (pktnwc < pktnw && idx < niov) { + const size_t slen = iov[idx].iov_len; + const unsigned long faddr = + (unsigned long) iov[idx].iov_base; + + if (slen & 3 || faddr & 3 || !slen) { + ret = -EINVAL; + goto free_pbc; + } + + npages += qib_user_sdma_num_pages(&iov[idx]); + + bytes_togo += slen; + pktnwc += slen >> 2; + idx++; + nfrags++; + } + + if (pktnwc != pktnw) { + ret = -EINVAL; + goto free_pbc; + } + + frag_size = ((le32_to_cpu(*pbc))>>16) & 0xFFFF; + if (((frag_size ? frag_size : bytes_togo) + len) > + ppd->ibmaxlen) { + ret = -EINVAL; + goto free_pbc; + } + + if (frag_size) { + int pktsize, tidsmsize, n; + + n = npages*((2*PAGE_SIZE/frag_size)+1); + pktsize = sizeof(*pkt) + sizeof(pkt->addr[0])*n; + + /* + * Determine if this is tid-sdma or just sdma. + */ + tiddma = (((le32_to_cpu(pbc[7])>> + QLOGIC_IB_I_TID_SHIFT)& + QLOGIC_IB_I_TID_MASK) != + QLOGIC_IB_I_TID_MASK); + + if (tiddma) + tidsmsize = iov[idx].iov_len; + else + tidsmsize = 0; + + pkt = kmalloc(pktsize+tidsmsize, GFP_KERNEL); + if (!pkt) { + ret = -ENOMEM; + goto free_pbc; + } + pkt->largepkt = 1; + pkt->frag_size = frag_size; + pkt->addrlimit = n + ARRAY_SIZE(pkt->addr); + + if (tiddma) { + char *tidsm = (char *)pkt + pktsize; + + cfur = copy_from_user(tidsm, + iov[idx].iov_base, tidsmsize); + if (cfur) { + ret = -EFAULT; + goto free_pkt; + } + pkt->tidsm = + (struct qib_tid_session_member *)tidsm; + pkt->tidsmcount = tidsmsize/ + sizeof(struct qib_tid_session_member); + pkt->tidsmidx = 0; + idx++; + } + + /* + * pbc 'fill1' field is borrowed to pass frag size, + * we need to clear it after picking frag size, the + * hardware requires this field to be zero. 
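+ * frag_size is carried in PBC bits 31:16 (hence the >>16 above)
+ * and is cleared by masking the PBC with 0x0000FFFF below, leaving
+ * only the low 16-bit dword count the hardware expects.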
+ */ + *pbc = cpu_to_le32(le32_to_cpu(*pbc) & 0x0000FFFF); + } else { + pkt = kmem_cache_alloc(pq->pkt_slab, GFP_KERNEL); + if (!pkt) { + ret = -ENOMEM; + goto free_pbc; + } + pkt->largepkt = 0; + pkt->frag_size = bytes_togo; + pkt->addrlimit = ARRAY_SIZE(pkt->addr); + } + pkt->bytes_togo = bytes_togo; + pkt->payload_size = 0; + pkt->counter = counter; + pkt->tiddma = tiddma; + + /* setup the first header */ + qib_user_sdma_init_frag(pkt, 0, /* index */ + 0, len, /* offset, len */ + 1, 0, /* first last desc */ + 0, 0, /* put page, dma mapped */ + NULL, pbc, /* struct page, virt addr */ + dma_addr, len); /* dma addr, dma length */ + pkt->index = 0; + pkt->naddr = 1; + + if (nfrags) { + ret = qib_user_sdma_init_payload(dd, pq, pkt, + iov + idx_save + 1, + nfrags, npages); + if (ret < 0) + goto free_pkt; + } else { + /* since there is no payload, mark the + * header as the last desc. */ + pkt->addr[0].last_desc = 1; + + if (dma_addr == 0) { + /* + * the header is not dma mapped yet. + * it should be from kmalloc. + */ + dma_addr = dma_map_single(&dd->pcidev->dev, + pbc, len, DMA_TO_DEVICE); + if (dma_mapping_error(&dd->pcidev->dev, + dma_addr)) { + ret = -ENOMEM; + goto free_pkt; + } + pkt->addr[0].addr = dma_addr; + pkt->addr[0].dma_mapped = 1; + } + } + + counter++; + npkts++; + pkt->pq = pq; + pkt->index = 0; /* reset index for push on hw */ + *ndesc += pkt->naddr; + + list_add_tail(&pkt->list, list); + } + + *maxpkts = npkts; + ret = idx; + goto done; + +free_pkt: + if (pkt->largepkt) + kfree(pkt); + else + kmem_cache_free(pq->pkt_slab, pkt); +free_pbc: + if (dma_addr) + dma_pool_free(pq->header_cache, pbc, dma_addr); + else + kfree(pbc); +free_list: + qib_user_sdma_free_pkt_list(&dd->pcidev->dev, pq, list); +done: + return ret; +} + +static void qib_user_sdma_set_complete_counter(struct qib_user_sdma_queue *pq, + u32 c) +{ + pq->sent_counter = c; +} + +/* try to clean out queue -- needs pq->lock */ +static int qib_user_sdma_queue_clean(struct qib_pportdata *ppd, + struct qib_user_sdma_queue *pq) +{ + struct qib_devdata *dd = ppd->dd; + struct list_head free_list; + struct qib_user_sdma_pkt *pkt; + struct qib_user_sdma_pkt *pkt_prev; + unsigned long flags; + int ret = 0; + + if (!pq->num_sending) + return 0; + + INIT_LIST_HEAD(&free_list); + + /* + * We need this spin lock here because interrupt handler + * might modify this list in qib_user_sdma_send_desc(), also + * we can not get interrupted, otherwise it is a deadlock. 
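+ * A packet on pq->sent is considered complete once the hardware
+ * head has passed its last descriptor, i.e. once
+ * ppd->sdma_descq_removed - pkt->added is no longer negative.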
+ */ + spin_lock_irqsave(&pq->sent_lock, flags); + list_for_each_entry_safe(pkt, pkt_prev, &pq->sent, list) { + s64 descd = ppd->sdma_descq_removed - pkt->added; + + if (descd < 0) + break; + + list_move_tail(&pkt->list, &free_list); + + /* one more packet cleaned */ + ret++; + pq->num_sending--; + } + spin_unlock_irqrestore(&pq->sent_lock, flags); + + if (!list_empty(&free_list)) { + u32 counter; + + pkt = list_entry(free_list.prev, + struct qib_user_sdma_pkt, list); + counter = pkt->counter; + + qib_user_sdma_free_pkt_list(&dd->pcidev->dev, pq, &free_list); + qib_user_sdma_set_complete_counter(pq, counter); + } + + return ret; +} + +void qib_user_sdma_queue_destroy(struct qib_user_sdma_queue *pq) +{ + if (!pq) + return; + + pq->sdma_rb_node->refcount--; + if (pq->sdma_rb_node->refcount == 0) { + rb_erase(&pq->sdma_rb_node->node, &qib_user_sdma_rb_root); + kfree(pq->sdma_rb_node); + } + dma_pool_destroy(pq->header_cache); + kmem_cache_destroy(pq->pkt_slab); + kfree(pq); +} + +/* clean descriptor queue, returns > 0 if some elements cleaned */ +static int qib_user_sdma_hwqueue_clean(struct qib_pportdata *ppd) +{ + int ret; + unsigned long flags; + + spin_lock_irqsave(&ppd->sdma_lock, flags); + ret = qib_sdma_make_progress(ppd); + spin_unlock_irqrestore(&ppd->sdma_lock, flags); + + return ret; +} + +/* we're in close, drain packets so that we can cleanup successfully... */ +void qib_user_sdma_queue_drain(struct qib_pportdata *ppd, + struct qib_user_sdma_queue *pq) +{ + struct qib_devdata *dd = ppd->dd; + unsigned long flags; + int i; + + if (!pq) + return; + + for (i = 0; i < QIB_USER_SDMA_DRAIN_TIMEOUT; i++) { + mutex_lock(&pq->lock); + if (!pq->num_pending && !pq->num_sending) { + mutex_unlock(&pq->lock); + break; + } + qib_user_sdma_hwqueue_clean(ppd); + qib_user_sdma_queue_clean(ppd, pq); + mutex_unlock(&pq->lock); + msleep(20); + } + + if (pq->num_pending || pq->num_sending) { + struct qib_user_sdma_pkt *pkt; + struct qib_user_sdma_pkt *pkt_prev; + struct list_head free_list; + + mutex_lock(&pq->lock); + spin_lock_irqsave(&ppd->sdma_lock, flags); + /* + * Since we hold sdma_lock, it is safe without sent_lock. 
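+ * Any packet still sitting on ppd->sdma_userpending that belongs
+ * to this pq is pulled back onto pq->sent first, so the forced
+ * free below releases everything this queue still owns.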
+ */ + if (pq->num_pending) { + list_for_each_entry_safe(pkt, pkt_prev, + &ppd->sdma_userpending, list) { + if (pkt->pq == pq) { + list_move_tail(&pkt->list, &pq->sent); + pq->num_pending--; + pq->num_sending++; + } + } + } + spin_unlock_irqrestore(&ppd->sdma_lock, flags); + + qib_dev_err(dd, "user sdma lists not empty: forcing!\n"); + INIT_LIST_HEAD(&free_list); + list_splice_init(&pq->sent, &free_list); + pq->num_sending = 0; + qib_user_sdma_free_pkt_list(&dd->pcidev->dev, pq, &free_list); + mutex_unlock(&pq->lock); + } +} + +static inline __le64 qib_sdma_make_desc0(u8 gen, + u64 addr, u64 dwlen, u64 dwoffset) +{ + return cpu_to_le64(/* SDmaPhyAddr[31:0] */ + ((addr & 0xfffffffcULL) << 32) | + /* SDmaGeneration[1:0] */ + ((gen & 3ULL) << 30) | + /* SDmaDwordCount[10:0] */ + ((dwlen & 0x7ffULL) << 16) | + /* SDmaBufOffset[12:2] */ + (dwoffset & 0x7ffULL)); +} + +static inline __le64 qib_sdma_make_first_desc0(__le64 descq) +{ + return descq | cpu_to_le64(1ULL << 12); +} + +static inline __le64 qib_sdma_make_last_desc0(__le64 descq) +{ + /* last */ /* dma head */ + return descq | cpu_to_le64(1ULL << 11 | 1ULL << 13); +} + +static inline __le64 qib_sdma_make_desc1(u64 addr) +{ + /* SDmaPhyAddr[47:32] */ + return cpu_to_le64(addr >> 32); +} + +static void qib_user_sdma_send_frag(struct qib_pportdata *ppd, + struct qib_user_sdma_pkt *pkt, int idx, + unsigned ofs, u16 tail, u8 gen) +{ + const u64 addr = (u64) pkt->addr[idx].addr + + (u64) pkt->addr[idx].offset; + const u64 dwlen = (u64) pkt->addr[idx].length / 4; + __le64 *descqp; + __le64 descq0; + + descqp = &ppd->sdma_descq[tail].qw[0]; + + descq0 = qib_sdma_make_desc0(gen, addr, dwlen, ofs); + if (pkt->addr[idx].first_desc) + descq0 = qib_sdma_make_first_desc0(descq0); + if (pkt->addr[idx].last_desc) { + descq0 = qib_sdma_make_last_desc0(descq0); + if (ppd->sdma_intrequest) { + descq0 |= cpu_to_le64(1ULL << 15); + ppd->sdma_intrequest = 0; + } + } + + descqp[0] = descq0; + descqp[1] = qib_sdma_make_desc1(addr); +} + +void qib_user_sdma_send_desc(struct qib_pportdata *ppd, + struct list_head *pktlist) +{ + struct qib_devdata *dd = ppd->dd; + u16 nfree, nsent; + u16 tail, tail_c; + u8 gen, gen_c; + + nfree = qib_sdma_descq_freecnt(ppd); + if (!nfree) + return; + +retry: + nsent = 0; + tail_c = tail = ppd->sdma_descq_tail; + gen_c = gen = ppd->sdma_generation; + while (!list_empty(pktlist)) { + struct qib_user_sdma_pkt *pkt = + list_entry(pktlist->next, struct qib_user_sdma_pkt, + list); + int i, j, c = 0; + unsigned ofs = 0; + u16 dtail = tail; + + for (i = pkt->index; i < pkt->naddr && nfree; i++) { + qib_user_sdma_send_frag(ppd, pkt, i, ofs, tail, gen); + ofs += pkt->addr[i].length >> 2; + + if (++tail == ppd->sdma_descq_cnt) { + tail = 0; + ++gen; + ppd->sdma_intrequest = 1; + } else if (tail == (ppd->sdma_descq_cnt>>1)) { + ppd->sdma_intrequest = 1; + } + nfree--; + if (pkt->addr[i].last_desc == 0) + continue; + + /* + * If the packet is >= 2KB mtu equivalent, we + * have to use the large buffers, and have to + * mark each descriptor as part of a large + * buffer packet. 
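+ * The marking is retroactive: once the packet's last descriptor
+ * shows the accumulated dword count exceeds piosize2kmax_dwords,
+ * bit 14 is OR'ed into desc0 of every descriptor just written
+ * for this packet.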
+ */ + if (ofs > dd->piosize2kmax_dwords) { + for (j = pkt->index; j <= i; j++) { + ppd->sdma_descq[dtail].qw[0] |= + cpu_to_le64(1ULL << 14); + if (++dtail == ppd->sdma_descq_cnt) + dtail = 0; + } + } + c += i + 1 - pkt->index; + pkt->index = i + 1; /* index for next first */ + tail_c = dtail = tail; + gen_c = gen; + ofs = 0; /* reset for next packet */ + } + + ppd->sdma_descq_added += c; + nsent += c; + if (pkt->index == pkt->naddr) { + pkt->added = ppd->sdma_descq_added; + pkt->pq->added = pkt->added; + pkt->pq->num_pending--; + spin_lock(&pkt->pq->sent_lock); + pkt->pq->num_sending++; + list_move_tail(&pkt->list, &pkt->pq->sent); + spin_unlock(&pkt->pq->sent_lock); + } + if (!nfree || (nsent<<2) > ppd->sdma_descq_cnt) + break; + } + + /* advance the tail on the chip if necessary */ + if (ppd->sdma_descq_tail != tail_c) { + ppd->sdma_generation = gen_c; + dd->f_sdma_update_tail(ppd, tail_c); + } + + if (nfree && !list_empty(pktlist)) + goto retry; +} + +/* pq->lock must be held, get packets on the wire... */ +static int qib_user_sdma_push_pkts(struct qib_pportdata *ppd, + struct qib_user_sdma_queue *pq, + struct list_head *pktlist, int count) +{ + unsigned long flags; + + if (unlikely(!(ppd->lflags & QIBL_LINKACTIVE))) + return -ECOMM; + + /* non-blocking mode */ + if (pq->sdma_rb_node->refcount > 1) { + spin_lock_irqsave(&ppd->sdma_lock, flags); + if (unlikely(!__qib_sdma_running(ppd))) { + spin_unlock_irqrestore(&ppd->sdma_lock, flags); + return -ECOMM; + } + pq->num_pending += count; + list_splice_tail_init(pktlist, &ppd->sdma_userpending); + qib_user_sdma_send_desc(ppd, &ppd->sdma_userpending); + spin_unlock_irqrestore(&ppd->sdma_lock, flags); + return 0; + } + + /* In this case, descriptors from this process are not + * linked to ppd pending queue, interrupt handler + * won't update this process, it is OK to directly + * modify without sdma lock. + */ + + + pq->num_pending += count; + /* + * Blocking mode for single rail process, we must + * release/regain sdma_lock to give other process + * chance to make progress. This is important for + * performance. + */ + do { + spin_lock_irqsave(&ppd->sdma_lock, flags); + if (unlikely(!__qib_sdma_running(ppd))) { + spin_unlock_irqrestore(&ppd->sdma_lock, flags); + return -ECOMM; + } + qib_user_sdma_send_desc(ppd, pktlist); + if (!list_empty(pktlist)) + qib_sdma_make_progress(ppd); + spin_unlock_irqrestore(&ppd->sdma_lock, flags); + } while (!list_empty(pktlist)); + + return 0; +} + +int qib_user_sdma_writev(struct qib_ctxtdata *rcd, + struct qib_user_sdma_queue *pq, + const struct iovec *iov, + unsigned long dim) +{ + struct qib_devdata *dd = rcd->dd; + struct qib_pportdata *ppd = rcd->ppd; + int ret = 0; + struct list_head list; + int npkts = 0; + + INIT_LIST_HEAD(&list); + + mutex_lock(&pq->lock); + + /* why not -ECOMM like qib_user_sdma_push_pkts() below? */ + if (!qib_sdma_running(ppd)) + goto done_unlock; + + /* if I have packets not complete yet */ + if (pq->added > ppd->sdma_descq_removed) + qib_user_sdma_hwqueue_clean(ppd); + /* if I have complete packets to be freed */ + if (pq->num_sending) + qib_user_sdma_queue_clean(ppd, pq); + + while (dim) { + int mxp = 1; + int ndesc = 0; + + ret = qib_user_sdma_queue_pkts(dd, ppd, pq, + iov, dim, &list, &mxp, &ndesc); + if (ret < 0) + goto done_unlock; + else { + dim -= ret; + iov += ret; + } + + /* force packets onto the sdma hw queue... */ + if (!list_empty(&list)) { + /* + * Lazily clean hw queue. 
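+ * Completed descriptors are only reaped when the free count might
+ * not cover the ndesc this batch needs; otherwise the extra
+ * locking is skipped and a later pass cleans up.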
+ */ + if (qib_sdma_descq_freecnt(ppd) < ndesc) { + qib_user_sdma_hwqueue_clean(ppd); + if (pq->num_sending) + qib_user_sdma_queue_clean(ppd, pq); + } + + ret = qib_user_sdma_push_pkts(ppd, pq, &list, mxp); + if (ret < 0) + goto done_unlock; + else { + npkts += mxp; + pq->counter += mxp; + } + } + } + +done_unlock: + if (!list_empty(&list)) + qib_user_sdma_free_pkt_list(&dd->pcidev->dev, pq, &list); + mutex_unlock(&pq->lock); + + return (ret < 0) ? ret : npkts; +} + +int qib_user_sdma_make_progress(struct qib_pportdata *ppd, + struct qib_user_sdma_queue *pq) +{ + int ret = 0; + + mutex_lock(&pq->lock); + qib_user_sdma_hwqueue_clean(ppd); + ret = qib_user_sdma_queue_clean(ppd, pq); + mutex_unlock(&pq->lock); + + return ret; +} + +u32 qib_user_sdma_complete_counter(const struct qib_user_sdma_queue *pq) +{ + return pq ? pq->sent_counter : 0; +} + +u32 qib_user_sdma_inflight_counter(struct qib_user_sdma_queue *pq) +{ + return pq ? pq->counter : 0; +} diff --git a/kernel/drivers/infiniband/hw/qib/qib_user_sdma.h b/kernel/drivers/infiniband/hw/qib/qib_user_sdma.h new file mode 100644 index 000000000..ce8cbaf6a --- /dev/null +++ b/kernel/drivers/infiniband/hw/qib/qib_user_sdma.h @@ -0,0 +1,52 @@ +/* + * Copyright (c) 2007, 2008 QLogic Corporation. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include + +struct qib_user_sdma_queue; + +struct qib_user_sdma_queue * +qib_user_sdma_queue_create(struct device *dev, int unit, int port, int sport); +void qib_user_sdma_queue_destroy(struct qib_user_sdma_queue *pq); + +int qib_user_sdma_writev(struct qib_ctxtdata *pd, + struct qib_user_sdma_queue *pq, + const struct iovec *iov, + unsigned long dim); + +int qib_user_sdma_make_progress(struct qib_pportdata *ppd, + struct qib_user_sdma_queue *pq); + +void qib_user_sdma_queue_drain(struct qib_pportdata *ppd, + struct qib_user_sdma_queue *pq); + +u32 qib_user_sdma_complete_counter(const struct qib_user_sdma_queue *pq); +u32 qib_user_sdma_inflight_counter(struct qib_user_sdma_queue *pq); diff --git a/kernel/drivers/infiniband/hw/qib/qib_verbs.c b/kernel/drivers/infiniband/hw/qib/qib_verbs.c new file mode 100644 index 000000000..4a3599890 --- /dev/null +++ b/kernel/drivers/infiniband/hw/qib/qib_verbs.c @@ -0,0 +1,2339 @@ +/* + * Copyright (c) 2012, 2013 Intel Corporation. All rights reserved. + * Copyright (c) 2006 - 2012 QLogic Corporation. All rights reserved. + * Copyright (c) 2005, 2006 PathScale, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "qib.h" +#include "qib_common.h" + +static unsigned int ib_qib_qp_table_size = 256; +module_param_named(qp_table_size, ib_qib_qp_table_size, uint, S_IRUGO); +MODULE_PARM_DESC(qp_table_size, "QP table size"); + +unsigned int ib_qib_lkey_table_size = 16; +module_param_named(lkey_table_size, ib_qib_lkey_table_size, uint, + S_IRUGO); +MODULE_PARM_DESC(lkey_table_size, + "LKEY table size in bits (2^n, 1 <= n <= 23)"); + +static unsigned int ib_qib_max_pds = 0xFFFF; +module_param_named(max_pds, ib_qib_max_pds, uint, S_IRUGO); +MODULE_PARM_DESC(max_pds, + "Maximum number of protection domains to support"); + +static unsigned int ib_qib_max_ahs = 0xFFFF; +module_param_named(max_ahs, ib_qib_max_ahs, uint, S_IRUGO); +MODULE_PARM_DESC(max_ahs, "Maximum number of address handles to support"); + +unsigned int ib_qib_max_cqes = 0x2FFFF; +module_param_named(max_cqes, ib_qib_max_cqes, uint, S_IRUGO); +MODULE_PARM_DESC(max_cqes, + "Maximum number of completion queue entries to support"); + +unsigned int ib_qib_max_cqs = 0x1FFFF; +module_param_named(max_cqs, ib_qib_max_cqs, uint, S_IRUGO); +MODULE_PARM_DESC(max_cqs, "Maximum number of completion queues to support"); + +unsigned int ib_qib_max_qp_wrs = 0x3FFF; +module_param_named(max_qp_wrs, ib_qib_max_qp_wrs, uint, S_IRUGO); +MODULE_PARM_DESC(max_qp_wrs, "Maximum number of QP WRs to support"); + +unsigned int ib_qib_max_qps = 16384; +module_param_named(max_qps, ib_qib_max_qps, uint, S_IRUGO); +MODULE_PARM_DESC(max_qps, "Maximum number of QPs to support"); + +unsigned int ib_qib_max_sges = 0x60; +module_param_named(max_sges, ib_qib_max_sges, uint, S_IRUGO); +MODULE_PARM_DESC(max_sges, "Maximum number of SGEs to support"); + +unsigned int ib_qib_max_mcast_grps = 16384; +module_param_named(max_mcast_grps, ib_qib_max_mcast_grps, uint, S_IRUGO); +MODULE_PARM_DESC(max_mcast_grps, + "Maximum number of multicast groups to support"); + +unsigned int ib_qib_max_mcast_qp_attached = 16; +module_param_named(max_mcast_qp_attached, ib_qib_max_mcast_qp_attached, + uint, S_IRUGO); +MODULE_PARM_DESC(max_mcast_qp_attached, + "Maximum number of attached QPs to support"); + +unsigned int ib_qib_max_srqs = 1024; +module_param_named(max_srqs, ib_qib_max_srqs, uint, S_IRUGO); +MODULE_PARM_DESC(max_srqs, "Maximum number of SRQs to support"); + +unsigned int ib_qib_max_srq_sges = 128; +module_param_named(max_srq_sges, ib_qib_max_srq_sges, uint, S_IRUGO); +MODULE_PARM_DESC(max_srq_sges, "Maximum number of SRQ SGEs to support"); + +unsigned int ib_qib_max_srq_wrs = 0x1FFFF; +module_param_named(max_srq_wrs, ib_qib_max_srq_wrs, uint, S_IRUGO); +MODULE_PARM_DESC(max_srq_wrs, "Maximum number of SRQ WRs support"); + +static unsigned int ib_qib_disable_sma; +module_param_named(disable_sma, ib_qib_disable_sma, uint, S_IWUSR | S_IRUGO); +MODULE_PARM_DESC(disable_sma, "Disable the SMA"); + +/* + * Note that it is OK to post send work requests in the SQE and ERR + * states; qib_do_send() will process them and generate error + * completions as per IB 1.2 C10-96. 
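+ * The table below encodes, per QP state, which post/process
+ * operations are allowed; SQE and ERR keep QIB_POST_SEND_OK set
+ * but pair it with QIB_FLUSH_SEND so such sends are flushed.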
+ */ +const int ib_qib_state_ops[IB_QPS_ERR + 1] = { + [IB_QPS_RESET] = 0, + [IB_QPS_INIT] = QIB_POST_RECV_OK, + [IB_QPS_RTR] = QIB_POST_RECV_OK | QIB_PROCESS_RECV_OK, + [IB_QPS_RTS] = QIB_POST_RECV_OK | QIB_PROCESS_RECV_OK | + QIB_POST_SEND_OK | QIB_PROCESS_SEND_OK | + QIB_PROCESS_NEXT_SEND_OK, + [IB_QPS_SQD] = QIB_POST_RECV_OK | QIB_PROCESS_RECV_OK | + QIB_POST_SEND_OK | QIB_PROCESS_SEND_OK, + [IB_QPS_SQE] = QIB_POST_RECV_OK | QIB_PROCESS_RECV_OK | + QIB_POST_SEND_OK | QIB_FLUSH_SEND, + [IB_QPS_ERR] = QIB_POST_RECV_OK | QIB_FLUSH_RECV | + QIB_POST_SEND_OK | QIB_FLUSH_SEND, +}; + +struct qib_ucontext { + struct ib_ucontext ibucontext; +}; + +static inline struct qib_ucontext *to_iucontext(struct ib_ucontext + *ibucontext) +{ + return container_of(ibucontext, struct qib_ucontext, ibucontext); +} + +/* + * Translate ib_wr_opcode into ib_wc_opcode. + */ +const enum ib_wc_opcode ib_qib_wc_opcode[] = { + [IB_WR_RDMA_WRITE] = IB_WC_RDMA_WRITE, + [IB_WR_RDMA_WRITE_WITH_IMM] = IB_WC_RDMA_WRITE, + [IB_WR_SEND] = IB_WC_SEND, + [IB_WR_SEND_WITH_IMM] = IB_WC_SEND, + [IB_WR_RDMA_READ] = IB_WC_RDMA_READ, + [IB_WR_ATOMIC_CMP_AND_SWP] = IB_WC_COMP_SWAP, + [IB_WR_ATOMIC_FETCH_AND_ADD] = IB_WC_FETCH_ADD +}; + +/* + * System image GUID. + */ +__be64 ib_qib_sys_image_guid; + +/** + * qib_copy_sge - copy data to SGE memory + * @ss: the SGE state + * @data: the data to copy + * @length: the length of the data + */ +void qib_copy_sge(struct qib_sge_state *ss, void *data, u32 length, int release) +{ + struct qib_sge *sge = &ss->sge; + + while (length) { + u32 len = sge->length; + + if (len > length) + len = length; + if (len > sge->sge_length) + len = sge->sge_length; + BUG_ON(len == 0); + memcpy(sge->vaddr, data, len); + sge->vaddr += len; + sge->length -= len; + sge->sge_length -= len; + if (sge->sge_length == 0) { + if (release) + qib_put_mr(sge->mr); + if (--ss->num_sge) + *sge = *ss->sg_list++; + } else if (sge->length == 0 && sge->mr->lkey) { + if (++sge->n >= QIB_SEGSZ) { + if (++sge->m >= sge->mr->mapsz) + break; + sge->n = 0; + } + sge->vaddr = + sge->mr->map[sge->m]->segs[sge->n].vaddr; + sge->length = + sge->mr->map[sge->m]->segs[sge->n].length; + } + data += len; + length -= len; + } +} + +/** + * qib_skip_sge - skip over SGE memory - XXX almost dup of prev func + * @ss: the SGE state + * @length: the number of bytes to skip + */ +void qib_skip_sge(struct qib_sge_state *ss, u32 length, int release) +{ + struct qib_sge *sge = &ss->sge; + + while (length) { + u32 len = sge->length; + + if (len > length) + len = length; + if (len > sge->sge_length) + len = sge->sge_length; + BUG_ON(len == 0); + sge->vaddr += len; + sge->length -= len; + sge->sge_length -= len; + if (sge->sge_length == 0) { + if (release) + qib_put_mr(sge->mr); + if (--ss->num_sge) + *sge = *ss->sg_list++; + } else if (sge->length == 0 && sge->mr->lkey) { + if (++sge->n >= QIB_SEGSZ) { + if (++sge->m >= sge->mr->mapsz) + break; + sge->n = 0; + } + sge->vaddr = + sge->mr->map[sge->m]->segs[sge->n].vaddr; + sge->length = + sge->mr->map[sge->m]->segs[sge->n].length; + } + length -= len; + } +} + +/* + * Count the number of DMA descriptors needed to send length bytes of data. + * Don't modify the qib_sge_state to get the count. + * Return zero if any of the segments is not aligned. 
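+ * "Aligned" here means each chunk starts on a 4-byte boundary and
+ * only the final chunk may have a length that is not a multiple
+ * of 4; the count starts at 1 to cover the header descriptor.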
+ */ +static u32 qib_count_sge(struct qib_sge_state *ss, u32 length) +{ + struct qib_sge *sg_list = ss->sg_list; + struct qib_sge sge = ss->sge; + u8 num_sge = ss->num_sge; + u32 ndesc = 1; /* count the header */ + + while (length) { + u32 len = sge.length; + + if (len > length) + len = length; + if (len > sge.sge_length) + len = sge.sge_length; + BUG_ON(len == 0); + if (((long) sge.vaddr & (sizeof(u32) - 1)) || + (len != length && (len & (sizeof(u32) - 1)))) { + ndesc = 0; + break; + } + ndesc++; + sge.vaddr += len; + sge.length -= len; + sge.sge_length -= len; + if (sge.sge_length == 0) { + if (--num_sge) + sge = *sg_list++; + } else if (sge.length == 0 && sge.mr->lkey) { + if (++sge.n >= QIB_SEGSZ) { + if (++sge.m >= sge.mr->mapsz) + break; + sge.n = 0; + } + sge.vaddr = + sge.mr->map[sge.m]->segs[sge.n].vaddr; + sge.length = + sge.mr->map[sge.m]->segs[sge.n].length; + } + length -= len; + } + return ndesc; +} + +/* + * Copy from the SGEs to the data buffer. + */ +static void qib_copy_from_sge(void *data, struct qib_sge_state *ss, u32 length) +{ + struct qib_sge *sge = &ss->sge; + + while (length) { + u32 len = sge->length; + + if (len > length) + len = length; + if (len > sge->sge_length) + len = sge->sge_length; + BUG_ON(len == 0); + memcpy(data, sge->vaddr, len); + sge->vaddr += len; + sge->length -= len; + sge->sge_length -= len; + if (sge->sge_length == 0) { + if (--ss->num_sge) + *sge = *ss->sg_list++; + } else if (sge->length == 0 && sge->mr->lkey) { + if (++sge->n >= QIB_SEGSZ) { + if (++sge->m >= sge->mr->mapsz) + break; + sge->n = 0; + } + sge->vaddr = + sge->mr->map[sge->m]->segs[sge->n].vaddr; + sge->length = + sge->mr->map[sge->m]->segs[sge->n].length; + } + data += len; + length -= len; + } +} + +/** + * qib_post_one_send - post one RC, UC, or UD send work request + * @qp: the QP to post on + * @wr: the work request to send + */ +static int qib_post_one_send(struct qib_qp *qp, struct ib_send_wr *wr, + int *scheduled) +{ + struct qib_swqe *wqe; + u32 next; + int i; + int j; + int acc; + int ret; + unsigned long flags; + struct qib_lkey_table *rkt; + struct qib_pd *pd; + + spin_lock_irqsave(&qp->s_lock, flags); + + /* Check that state is OK to post send. */ + if (unlikely(!(ib_qib_state_ops[qp->state] & QIB_POST_SEND_OK))) + goto bail_inval; + + /* IB spec says that num_sge == 0 is OK. */ + if (wr->num_sge > qp->s_max_sge) + goto bail_inval; + + /* + * Don't allow RDMA reads or atomic operations on UC or + * undefined operations. + * Make sure buffer is large enough to hold the result for atomics. 
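+ * Concretely, atomic opcodes require at least one SGE whose first
+ * element is 8-byte aligned and at least sizeof(u64) long, since
+ * that buffer receives the atomic result.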
+ */ + if (wr->opcode == IB_WR_FAST_REG_MR) { + if (qib_fast_reg_mr(qp, wr)) + goto bail_inval; + } else if (qp->ibqp.qp_type == IB_QPT_UC) { + if ((unsigned) wr->opcode >= IB_WR_RDMA_READ) + goto bail_inval; + } else if (qp->ibqp.qp_type != IB_QPT_RC) { + /* Check IB_QPT_SMI, IB_QPT_GSI, IB_QPT_UD opcode */ + if (wr->opcode != IB_WR_SEND && + wr->opcode != IB_WR_SEND_WITH_IMM) + goto bail_inval; + /* Check UD destination address PD */ + if (qp->ibqp.pd != wr->wr.ud.ah->pd) + goto bail_inval; + } else if ((unsigned) wr->opcode > IB_WR_ATOMIC_FETCH_AND_ADD) + goto bail_inval; + else if (wr->opcode >= IB_WR_ATOMIC_CMP_AND_SWP && + (wr->num_sge == 0 || + wr->sg_list[0].length < sizeof(u64) || + wr->sg_list[0].addr & (sizeof(u64) - 1))) + goto bail_inval; + else if (wr->opcode >= IB_WR_RDMA_READ && !qp->s_max_rd_atomic) + goto bail_inval; + + next = qp->s_head + 1; + if (next >= qp->s_size) + next = 0; + if (next == qp->s_last) { + ret = -ENOMEM; + goto bail; + } + + rkt = &to_idev(qp->ibqp.device)->lk_table; + pd = to_ipd(qp->ibqp.pd); + wqe = get_swqe_ptr(qp, qp->s_head); + wqe->wr = *wr; + wqe->length = 0; + j = 0; + if (wr->num_sge) { + acc = wr->opcode >= IB_WR_RDMA_READ ? + IB_ACCESS_LOCAL_WRITE : 0; + for (i = 0; i < wr->num_sge; i++) { + u32 length = wr->sg_list[i].length; + int ok; + + if (length == 0) + continue; + ok = qib_lkey_ok(rkt, pd, &wqe->sg_list[j], + &wr->sg_list[i], acc); + if (!ok) + goto bail_inval_free; + wqe->length += length; + j++; + } + wqe->wr.num_sge = j; + } + if (qp->ibqp.qp_type == IB_QPT_UC || + qp->ibqp.qp_type == IB_QPT_RC) { + if (wqe->length > 0x80000000U) + goto bail_inval_free; + } else if (wqe->length > (dd_from_ibdev(qp->ibqp.device)->pport + + qp->port_num - 1)->ibmtu) + goto bail_inval_free; + else + atomic_inc(&to_iah(wr->wr.ud.ah)->refcount); + wqe->ssn = qp->s_ssn++; + qp->s_head = next; + + ret = 0; + goto bail; + +bail_inval_free: + while (j) { + struct qib_sge *sge = &wqe->sg_list[--j]; + + qib_put_mr(sge->mr); + } +bail_inval: + ret = -EINVAL; +bail: + if (!ret && !wr->next && + !qib_sdma_empty( + dd_from_ibdev(qp->ibqp.device)->pport + qp->port_num - 1)) { + qib_schedule_send(qp); + *scheduled = 1; + } + spin_unlock_irqrestore(&qp->s_lock, flags); + return ret; +} + +/** + * qib_post_send - post a send on a QP + * @ibqp: the QP to post the send on + * @wr: the list of work requests to post + * @bad_wr: the first bad WR is put here + * + * This may be called from interrupt context. + */ +static int qib_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr, + struct ib_send_wr **bad_wr) +{ + struct qib_qp *qp = to_iqp(ibqp); + int err = 0; + int scheduled = 0; + + for (; wr; wr = wr->next) { + err = qib_post_one_send(qp, wr, &scheduled); + if (err) { + *bad_wr = wr; + goto bail; + } + } + + /* Try to do the send work in the caller's context. */ + if (!scheduled) + qib_do_send(&qp->s_work); + +bail: + return err; +} + +/** + * qib_post_receive - post a receive on a QP + * @ibqp: the QP to post the receive on + * @wr: the WR to post + * @bad_wr: the first bad WR is put here + * + * This may be called from interrupt context. + */ +static int qib_post_receive(struct ib_qp *ibqp, struct ib_recv_wr *wr, + struct ib_recv_wr **bad_wr) +{ + struct qib_qp *qp = to_iqp(ibqp); + struct qib_rwq *wq = qp->r_rq.wq; + unsigned long flags; + int ret; + + /* Check that state is OK to post receive. 
*/ + if (!(ib_qib_state_ops[qp->state] & QIB_POST_RECV_OK) || !wq) { + *bad_wr = wr; + ret = -EINVAL; + goto bail; + } + + for (; wr; wr = wr->next) { + struct qib_rwqe *wqe; + u32 next; + int i; + + if ((unsigned) wr->num_sge > qp->r_rq.max_sge) { + *bad_wr = wr; + ret = -EINVAL; + goto bail; + } + + spin_lock_irqsave(&qp->r_rq.lock, flags); + next = wq->head + 1; + if (next >= qp->r_rq.size) + next = 0; + if (next == wq->tail) { + spin_unlock_irqrestore(&qp->r_rq.lock, flags); + *bad_wr = wr; + ret = -ENOMEM; + goto bail; + } + + wqe = get_rwqe_ptr(&qp->r_rq, wq->head); + wqe->wr_id = wr->wr_id; + wqe->num_sge = wr->num_sge; + for (i = 0; i < wr->num_sge; i++) + wqe->sg_list[i] = wr->sg_list[i]; + /* Make sure queue entry is written before the head index. */ + smp_wmb(); + wq->head = next; + spin_unlock_irqrestore(&qp->r_rq.lock, flags); + } + ret = 0; + +bail: + return ret; +} + +/** + * qib_qp_rcv - processing an incoming packet on a QP + * @rcd: the context pointer + * @hdr: the packet header + * @has_grh: true if the packet has a GRH + * @data: the packet data + * @tlen: the packet length + * @qp: the QP the packet came on + * + * This is called from qib_ib_rcv() to process an incoming packet + * for the given QP. + * Called at interrupt level. + */ +static void qib_qp_rcv(struct qib_ctxtdata *rcd, struct qib_ib_header *hdr, + int has_grh, void *data, u32 tlen, struct qib_qp *qp) +{ + struct qib_ibport *ibp = &rcd->ppd->ibport_data; + + spin_lock(&qp->r_lock); + + /* Check for valid receive state. */ + if (!(ib_qib_state_ops[qp->state] & QIB_PROCESS_RECV_OK)) { + ibp->n_pkt_drops++; + goto unlock; + } + + switch (qp->ibqp.qp_type) { + case IB_QPT_SMI: + case IB_QPT_GSI: + if (ib_qib_disable_sma) + break; + /* FALLTHROUGH */ + case IB_QPT_UD: + qib_ud_rcv(ibp, hdr, has_grh, data, tlen, qp); + break; + + case IB_QPT_RC: + qib_rc_rcv(rcd, hdr, has_grh, data, tlen, qp); + break; + + case IB_QPT_UC: + qib_uc_rcv(ibp, hdr, has_grh, data, tlen, qp); + break; + + default: + break; + } + +unlock: + spin_unlock(&qp->r_lock); +} + +/** + * qib_ib_rcv - process an incoming packet + * @rcd: the context pointer + * @rhdr: the header of the packet + * @data: the packet payload + * @tlen: the packet length + * + * This is called from qib_kreceive() to process an incoming packet at + * interrupt level. Tlen is the length of the header + data + CRC in bytes. + */ +void qib_ib_rcv(struct qib_ctxtdata *rcd, void *rhdr, void *data, u32 tlen) +{ + struct qib_pportdata *ppd = rcd->ppd; + struct qib_ibport *ibp = &ppd->ibport_data; + struct qib_ib_header *hdr = rhdr; + struct qib_other_headers *ohdr; + struct qib_qp *qp; + u32 qp_num; + int lnh; + u8 opcode; + u16 lid; + + /* 24 == LRH+BTH+CRC */ + if (unlikely(tlen < 24)) + goto drop; + + /* Check for a valid destination LID (see ch. 7.11.1). 
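+ * Unicast LIDs below the multicast base must match the port LID
+ * once the low LMC bits are masked off; anything else is dropped.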
*/ + lid = be16_to_cpu(hdr->lrh[1]); + if (lid < QIB_MULTICAST_LID_BASE) { + lid &= ~((1 << ppd->lmc) - 1); + if (unlikely(lid != ppd->lid)) + goto drop; + } + + /* Check for GRH */ + lnh = be16_to_cpu(hdr->lrh[0]) & 3; + if (lnh == QIB_LRH_BTH) + ohdr = &hdr->u.oth; + else if (lnh == QIB_LRH_GRH) { + u32 vtf; + + ohdr = &hdr->u.l.oth; + if (hdr->u.l.grh.next_hdr != IB_GRH_NEXT_HDR) + goto drop; + vtf = be32_to_cpu(hdr->u.l.grh.version_tclass_flow); + if ((vtf >> IB_GRH_VERSION_SHIFT) != IB_GRH_VERSION) + goto drop; + } else + goto drop; + + opcode = (be32_to_cpu(ohdr->bth[0]) >> 24) & 0x7f; +#ifdef CONFIG_DEBUG_FS + rcd->opstats->stats[opcode].n_bytes += tlen; + rcd->opstats->stats[opcode].n_packets++; +#endif + + /* Get the destination QP number. */ + qp_num = be32_to_cpu(ohdr->bth[1]) & QIB_QPN_MASK; + if (qp_num == QIB_MULTICAST_QPN) { + struct qib_mcast *mcast; + struct qib_mcast_qp *p; + + if (lnh != QIB_LRH_GRH) + goto drop; + mcast = qib_mcast_find(ibp, &hdr->u.l.grh.dgid); + if (mcast == NULL) + goto drop; + this_cpu_inc(ibp->pmastats->n_multicast_rcv); + list_for_each_entry_rcu(p, &mcast->qp_list, list) + qib_qp_rcv(rcd, hdr, 1, data, tlen, p->qp); + /* + * Notify qib_multicast_detach() if it is waiting for us + * to finish. + */ + if (atomic_dec_return(&mcast->refcount) <= 1) + wake_up(&mcast->wait); + } else { + if (rcd->lookaside_qp) { + if (rcd->lookaside_qpn != qp_num) { + if (atomic_dec_and_test( + &rcd->lookaside_qp->refcount)) + wake_up( + &rcd->lookaside_qp->wait); + rcd->lookaside_qp = NULL; + } + } + if (!rcd->lookaside_qp) { + qp = qib_lookup_qpn(ibp, qp_num); + if (!qp) + goto drop; + rcd->lookaside_qp = qp; + rcd->lookaside_qpn = qp_num; + } else + qp = rcd->lookaside_qp; + this_cpu_inc(ibp->pmastats->n_unicast_rcv); + qib_qp_rcv(rcd, hdr, lnh == QIB_LRH_GRH, data, tlen, qp); + } + return; + +drop: + ibp->n_pkt_drops++; +} + +/* + * This is called from a timer to check for QPs + * which need kernel memory in order to send a packet. 
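+ * Only the QP at the head of dev->memwait is woken per tick; if
+ * more QPs remain, the timer is re-armed one jiffy out so they
+ * drain one at a time.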
+ */ +static void mem_timer(unsigned long data) +{ + struct qib_ibdev *dev = (struct qib_ibdev *) data; + struct list_head *list = &dev->memwait; + struct qib_qp *qp = NULL; + unsigned long flags; + + spin_lock_irqsave(&dev->pending_lock, flags); + if (!list_empty(list)) { + qp = list_entry(list->next, struct qib_qp, iowait); + list_del_init(&qp->iowait); + atomic_inc(&qp->refcount); + if (!list_empty(list)) + mod_timer(&dev->mem_timer, jiffies + 1); + } + spin_unlock_irqrestore(&dev->pending_lock, flags); + + if (qp) { + spin_lock_irqsave(&qp->s_lock, flags); + if (qp->s_flags & QIB_S_WAIT_KMEM) { + qp->s_flags &= ~QIB_S_WAIT_KMEM; + qib_schedule_send(qp); + } + spin_unlock_irqrestore(&qp->s_lock, flags); + if (atomic_dec_and_test(&qp->refcount)) + wake_up(&qp->wait); + } +} + +static void update_sge(struct qib_sge_state *ss, u32 length) +{ + struct qib_sge *sge = &ss->sge; + + sge->vaddr += length; + sge->length -= length; + sge->sge_length -= length; + if (sge->sge_length == 0) { + if (--ss->num_sge) + *sge = *ss->sg_list++; + } else if (sge->length == 0 && sge->mr->lkey) { + if (++sge->n >= QIB_SEGSZ) { + if (++sge->m >= sge->mr->mapsz) + return; + sge->n = 0; + } + sge->vaddr = sge->mr->map[sge->m]->segs[sge->n].vaddr; + sge->length = sge->mr->map[sge->m]->segs[sge->n].length; + } +} + +#ifdef __LITTLE_ENDIAN +static inline u32 get_upper_bits(u32 data, u32 shift) +{ + return data >> shift; +} + +static inline u32 set_upper_bits(u32 data, u32 shift) +{ + return data << shift; +} + +static inline u32 clear_upper_bytes(u32 data, u32 n, u32 off) +{ + data <<= ((sizeof(u32) - n) * BITS_PER_BYTE); + data >>= ((sizeof(u32) - n - off) * BITS_PER_BYTE); + return data; +} +#else +static inline u32 get_upper_bits(u32 data, u32 shift) +{ + return data << shift; +} + +static inline u32 set_upper_bits(u32 data, u32 shift) +{ + return data >> shift; +} + +static inline u32 clear_upper_bytes(u32 data, u32 n, u32 off) +{ + data >>= ((sizeof(u32) - n) * BITS_PER_BYTE); + data <<= ((sizeof(u32) - n - off) * BITS_PER_BYTE); + return data; +} +#endif + +static void copy_io(u32 __iomem *piobuf, struct qib_sge_state *ss, + u32 length, unsigned flush_wc) +{ + u32 extra = 0; + u32 data = 0; + u32 last; + + while (1) { + u32 len = ss->sge.length; + u32 off; + + if (len > length) + len = length; + if (len > ss->sge.sge_length) + len = ss->sge.sge_length; + BUG_ON(len == 0); + /* If the source address is not aligned, try to align it. */ + off = (unsigned long)ss->sge.vaddr & (sizeof(u32) - 1); + if (off) { + u32 *addr = (u32 *)((unsigned long)ss->sge.vaddr & + ~(sizeof(u32) - 1)); + u32 v = get_upper_bits(*addr, off * BITS_PER_BYTE); + u32 y; + + y = sizeof(u32) - off; + if (len > y) + len = y; + if (len + extra >= sizeof(u32)) { + data |= set_upper_bits(v, extra * + BITS_PER_BYTE); + len = sizeof(u32) - extra; + if (len == length) { + last = data; + break; + } + __raw_writel(data, piobuf); + piobuf++; + extra = 0; + data = 0; + } else { + /* Clear unused upper bytes */ + data |= clear_upper_bytes(v, len, extra); + if (len == length) { + last = data; + break; + } + extra += len; + } + } else if (extra) { + /* Source address is aligned. */ + u32 *addr = (u32 *) ss->sge.vaddr; + int shift = extra * BITS_PER_BYTE; + int ushift = 32 - shift; + u32 l = len; + + while (l >= sizeof(u32)) { + u32 v = *addr; + + data |= set_upper_bits(v, shift); + __raw_writel(data, piobuf); + data = get_upper_bits(v, ushift); + piobuf++; + addr++; + l -= sizeof(u32); + } + /* + * We still have 'extra' number of bytes leftover. 
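+ * l is now the sub-dword tail of this segment; it either tops up
+ * the partially assembled 'data' word (possibly finishing the
+ * copy) or is accumulated into 'extra' for the next segment.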
+ */ + if (l) { + u32 v = *addr; + + if (l + extra >= sizeof(u32)) { + data |= set_upper_bits(v, shift); + len -= l + extra - sizeof(u32); + if (len == length) { + last = data; + break; + } + __raw_writel(data, piobuf); + piobuf++; + extra = 0; + data = 0; + } else { + /* Clear unused upper bytes */ + data |= clear_upper_bytes(v, l, extra); + if (len == length) { + last = data; + break; + } + extra += l; + } + } else if (len == length) { + last = data; + break; + } + } else if (len == length) { + u32 w; + + /* + * Need to round up for the last dword in the + * packet. + */ + w = (len + 3) >> 2; + qib_pio_copy(piobuf, ss->sge.vaddr, w - 1); + piobuf += w - 1; + last = ((u32 *) ss->sge.vaddr)[w - 1]; + break; + } else { + u32 w = len >> 2; + + qib_pio_copy(piobuf, ss->sge.vaddr, w); + piobuf += w; + + extra = len & (sizeof(u32) - 1); + if (extra) { + u32 v = ((u32 *) ss->sge.vaddr)[w]; + + /* Clear unused upper bytes */ + data = clear_upper_bytes(v, extra, 0); + } + } + update_sge(ss, len); + length -= len; + } + /* Update address before sending packet. */ + update_sge(ss, length); + if (flush_wc) { + /* must flush early everything before trigger word */ + qib_flush_wc(); + __raw_writel(last, piobuf); + /* be sure trigger word is written */ + qib_flush_wc(); + } else + __raw_writel(last, piobuf); +} + +static noinline struct qib_verbs_txreq *__get_txreq(struct qib_ibdev *dev, + struct qib_qp *qp) +{ + struct qib_verbs_txreq *tx; + unsigned long flags; + + spin_lock_irqsave(&qp->s_lock, flags); + spin_lock(&dev->pending_lock); + + if (!list_empty(&dev->txreq_free)) { + struct list_head *l = dev->txreq_free.next; + + list_del(l); + spin_unlock(&dev->pending_lock); + spin_unlock_irqrestore(&qp->s_lock, flags); + tx = list_entry(l, struct qib_verbs_txreq, txreq.list); + } else { + if (ib_qib_state_ops[qp->state] & QIB_PROCESS_RECV_OK && + list_empty(&qp->iowait)) { + dev->n_txwait++; + qp->s_flags |= QIB_S_WAIT_TX; + list_add_tail(&qp->iowait, &dev->txwait); + } + qp->s_flags &= ~QIB_S_BUSY; + spin_unlock(&dev->pending_lock); + spin_unlock_irqrestore(&qp->s_lock, flags); + tx = ERR_PTR(-EBUSY); + } + return tx; +} + +static inline struct qib_verbs_txreq *get_txreq(struct qib_ibdev *dev, + struct qib_qp *qp) +{ + struct qib_verbs_txreq *tx; + unsigned long flags; + + spin_lock_irqsave(&dev->pending_lock, flags); + /* assume the list non empty */ + if (likely(!list_empty(&dev->txreq_free))) { + struct list_head *l = dev->txreq_free.next; + + list_del(l); + spin_unlock_irqrestore(&dev->pending_lock, flags); + tx = list_entry(l, struct qib_verbs_txreq, txreq.list); + } else { + /* call slow path to get the extra lock */ + spin_unlock_irqrestore(&dev->pending_lock, flags); + tx = __get_txreq(dev, qp); + } + return tx; +} + +void qib_put_txreq(struct qib_verbs_txreq *tx) +{ + struct qib_ibdev *dev; + struct qib_qp *qp; + unsigned long flags; + + qp = tx->qp; + dev = to_idev(qp->ibqp.device); + + if (atomic_dec_and_test(&qp->refcount)) + wake_up(&qp->wait); + if (tx->mr) { + qib_put_mr(tx->mr); + tx->mr = NULL; + } + if (tx->txreq.flags & QIB_SDMA_TXREQ_F_FREEBUF) { + tx->txreq.flags &= ~QIB_SDMA_TXREQ_F_FREEBUF; + dma_unmap_single(&dd_from_dev(dev)->pcidev->dev, + tx->txreq.addr, tx->hdr_dwords << 2, + DMA_TO_DEVICE); + kfree(tx->align_buf); + } + + spin_lock_irqsave(&dev->pending_lock, flags); + + /* Put struct back on free list */ + list_add(&tx->txreq.list, &dev->txreq_free); + + if (!list_empty(&dev->txwait)) { + /* Wake up first QP wanting a free struct */ + qp = list_entry(dev->txwait.next, 
struct qib_qp, iowait); + list_del_init(&qp->iowait); + atomic_inc(&qp->refcount); + spin_unlock_irqrestore(&dev->pending_lock, flags); + + spin_lock_irqsave(&qp->s_lock, flags); + if (qp->s_flags & QIB_S_WAIT_TX) { + qp->s_flags &= ~QIB_S_WAIT_TX; + qib_schedule_send(qp); + } + spin_unlock_irqrestore(&qp->s_lock, flags); + + if (atomic_dec_and_test(&qp->refcount)) + wake_up(&qp->wait); + } else + spin_unlock_irqrestore(&dev->pending_lock, flags); +} + +/* + * This is called when there are send DMA descriptors that might be + * available. + * + * This is called with ppd->sdma_lock held. + */ +void qib_verbs_sdma_desc_avail(struct qib_pportdata *ppd, unsigned avail) +{ + struct qib_qp *qp, *nqp; + struct qib_qp *qps[20]; + struct qib_ibdev *dev; + unsigned i, n; + + n = 0; + dev = &ppd->dd->verbs_dev; + spin_lock(&dev->pending_lock); + + /* Search wait list for first QP wanting DMA descriptors. */ + list_for_each_entry_safe(qp, nqp, &dev->dmawait, iowait) { + if (qp->port_num != ppd->port) + continue; + if (n == ARRAY_SIZE(qps)) + break; + if (qp->s_tx->txreq.sg_count > avail) + break; + avail -= qp->s_tx->txreq.sg_count; + list_del_init(&qp->iowait); + atomic_inc(&qp->refcount); + qps[n++] = qp; + } + + spin_unlock(&dev->pending_lock); + + for (i = 0; i < n; i++) { + qp = qps[i]; + spin_lock(&qp->s_lock); + if (qp->s_flags & QIB_S_WAIT_DMA_DESC) { + qp->s_flags &= ~QIB_S_WAIT_DMA_DESC; + qib_schedule_send(qp); + } + spin_unlock(&qp->s_lock); + if (atomic_dec_and_test(&qp->refcount)) + wake_up(&qp->wait); + } +} + +/* + * This is called with ppd->sdma_lock held. + */ +static void sdma_complete(struct qib_sdma_txreq *cookie, int status) +{ + struct qib_verbs_txreq *tx = + container_of(cookie, struct qib_verbs_txreq, txreq); + struct qib_qp *qp = tx->qp; + + spin_lock(&qp->s_lock); + if (tx->wqe) + qib_send_complete(qp, tx->wqe, IB_WC_SUCCESS); + else if (qp->ibqp.qp_type == IB_QPT_RC) { + struct qib_ib_header *hdr; + + if (tx->txreq.flags & QIB_SDMA_TXREQ_F_FREEBUF) + hdr = &tx->align_buf->hdr; + else { + struct qib_ibdev *dev = to_idev(qp->ibqp.device); + + hdr = &dev->pio_hdrs[tx->hdr_inx].hdr; + } + qib_rc_send_complete(qp, hdr); + } + if (atomic_dec_and_test(&qp->s_dma_busy)) { + if (qp->state == IB_QPS_RESET) + wake_up(&qp->wait_dma); + else if (qp->s_flags & QIB_S_WAIT_DMA) { + qp->s_flags &= ~QIB_S_WAIT_DMA; + qib_schedule_send(qp); + } + } + spin_unlock(&qp->s_lock); + + qib_put_txreq(tx); +} + +static int wait_kmem(struct qib_ibdev *dev, struct qib_qp *qp) +{ + unsigned long flags; + int ret = 0; + + spin_lock_irqsave(&qp->s_lock, flags); + if (ib_qib_state_ops[qp->state] & QIB_PROCESS_RECV_OK) { + spin_lock(&dev->pending_lock); + if (list_empty(&qp->iowait)) { + if (list_empty(&dev->memwait)) + mod_timer(&dev->mem_timer, jiffies + 1); + qp->s_flags |= QIB_S_WAIT_KMEM; + list_add_tail(&qp->iowait, &dev->memwait); + } + spin_unlock(&dev->pending_lock); + qp->s_flags &= ~QIB_S_BUSY; + ret = -EBUSY; + } + spin_unlock_irqrestore(&qp->s_lock, flags); + + return ret; +} + +static int qib_verbs_send_dma(struct qib_qp *qp, struct qib_ib_header *hdr, + u32 hdrwords, struct qib_sge_state *ss, u32 len, + u32 plen, u32 dwords) +{ + struct qib_ibdev *dev = to_idev(qp->ibqp.device); + struct qib_devdata *dd = dd_from_dev(dev); + struct qib_ibport *ibp = to_iport(qp->ibqp.device, qp->port_num); + struct qib_pportdata *ppd = ppd_from_ibp(ibp); + struct qib_verbs_txreq *tx; + struct qib_pio_header *phdr; + u32 control; + u32 ndesc; + int ret; + + tx = qp->s_tx; + if (tx) { + qp->s_tx = NULL; + /* 
resend previously constructed packet */ + ret = qib_sdma_verbs_send(ppd, tx->ss, tx->dwords, tx); + goto bail; + } + + tx = get_txreq(dev, qp); + if (IS_ERR(tx)) + goto bail_tx; + + control = dd->f_setpbc_control(ppd, plen, qp->s_srate, + be16_to_cpu(hdr->lrh[0]) >> 12); + tx->qp = qp; + atomic_inc(&qp->refcount); + tx->wqe = qp->s_wqe; + tx->mr = qp->s_rdma_mr; + if (qp->s_rdma_mr) + qp->s_rdma_mr = NULL; + tx->txreq.callback = sdma_complete; + if (dd->flags & QIB_HAS_SDMA_TIMEOUT) + tx->txreq.flags = QIB_SDMA_TXREQ_F_HEADTOHOST; + else + tx->txreq.flags = QIB_SDMA_TXREQ_F_INTREQ; + if (plen + 1 > dd->piosize2kmax_dwords) + tx->txreq.flags |= QIB_SDMA_TXREQ_F_USELARGEBUF; + + if (len) { + /* + * Don't try to DMA if it takes more descriptors than + * the queue holds. + */ + ndesc = qib_count_sge(ss, len); + if (ndesc >= ppd->sdma_descq_cnt) + ndesc = 0; + } else + ndesc = 1; + if (ndesc) { + phdr = &dev->pio_hdrs[tx->hdr_inx]; + phdr->pbc[0] = cpu_to_le32(plen); + phdr->pbc[1] = cpu_to_le32(control); + memcpy(&phdr->hdr, hdr, hdrwords << 2); + tx->txreq.flags |= QIB_SDMA_TXREQ_F_FREEDESC; + tx->txreq.sg_count = ndesc; + tx->txreq.addr = dev->pio_hdrs_phys + + tx->hdr_inx * sizeof(struct qib_pio_header); + tx->hdr_dwords = hdrwords + 2; /* add PBC length */ + ret = qib_sdma_verbs_send(ppd, ss, dwords, tx); + goto bail; + } + + /* Allocate a buffer and copy the header and payload to it. */ + tx->hdr_dwords = plen + 1; + phdr = kmalloc(tx->hdr_dwords << 2, GFP_ATOMIC); + if (!phdr) + goto err_tx; + phdr->pbc[0] = cpu_to_le32(plen); + phdr->pbc[1] = cpu_to_le32(control); + memcpy(&phdr->hdr, hdr, hdrwords << 2); + qib_copy_from_sge((u32 *) &phdr->hdr + hdrwords, ss, len); + + tx->txreq.addr = dma_map_single(&dd->pcidev->dev, phdr, + tx->hdr_dwords << 2, DMA_TO_DEVICE); + if (dma_mapping_error(&dd->pcidev->dev, tx->txreq.addr)) + goto map_err; + tx->align_buf = phdr; + tx->txreq.flags |= QIB_SDMA_TXREQ_F_FREEBUF; + tx->txreq.sg_count = 1; + ret = qib_sdma_verbs_send(ppd, NULL, 0, tx); + goto unaligned; + +map_err: + kfree(phdr); +err_tx: + qib_put_txreq(tx); + ret = wait_kmem(dev, qp); +unaligned: + ibp->n_unaligned++; +bail: + return ret; +bail_tx: + ret = PTR_ERR(tx); + goto bail; +} + +/* + * If we are now in the error state, return zero to flush the + * send work request. + */ +static int no_bufs_available(struct qib_qp *qp) +{ + struct qib_ibdev *dev = to_idev(qp->ibqp.device); + struct qib_devdata *dd; + unsigned long flags; + int ret = 0; + + /* + * Note that as soon as want_buffer() is called and + * possibly before it returns, qib_ib_piobufavail() + * could be called. Therefore, put QP on the I/O wait list before + * enabling the PIO avail interrupt. 
+ */ + spin_lock_irqsave(&qp->s_lock, flags); + if (ib_qib_state_ops[qp->state] & QIB_PROCESS_RECV_OK) { + spin_lock(&dev->pending_lock); + if (list_empty(&qp->iowait)) { + dev->n_piowait++; + qp->s_flags |= QIB_S_WAIT_PIO; + list_add_tail(&qp->iowait, &dev->piowait); + dd = dd_from_dev(dev); + dd->f_wantpiobuf_intr(dd, 1); + } + spin_unlock(&dev->pending_lock); + qp->s_flags &= ~QIB_S_BUSY; + ret = -EBUSY; + } + spin_unlock_irqrestore(&qp->s_lock, flags); + return ret; +} + +static int qib_verbs_send_pio(struct qib_qp *qp, struct qib_ib_header *ibhdr, + u32 hdrwords, struct qib_sge_state *ss, u32 len, + u32 plen, u32 dwords) +{ + struct qib_devdata *dd = dd_from_ibdev(qp->ibqp.device); + struct qib_pportdata *ppd = dd->pport + qp->port_num - 1; + u32 *hdr = (u32 *) ibhdr; + u32 __iomem *piobuf_orig; + u32 __iomem *piobuf; + u64 pbc; + unsigned long flags; + unsigned flush_wc; + u32 control; + u32 pbufn; + + control = dd->f_setpbc_control(ppd, plen, qp->s_srate, + be16_to_cpu(ibhdr->lrh[0]) >> 12); + pbc = ((u64) control << 32) | plen; + piobuf = dd->f_getsendbuf(ppd, pbc, &pbufn); + if (unlikely(piobuf == NULL)) + return no_bufs_available(qp); + + /* + * Write the pbc. + * We have to flush after the PBC for correctness on some cpus + * or WC buffer can be written out of order. + */ + writeq(pbc, piobuf); + piobuf_orig = piobuf; + piobuf += 2; + + flush_wc = dd->flags & QIB_PIO_FLUSH_WC; + if (len == 0) { + /* + * If there is just the header portion, must flush before + * writing last word of header for correctness, and after + * the last header word (trigger word). + */ + if (flush_wc) { + qib_flush_wc(); + qib_pio_copy(piobuf, hdr, hdrwords - 1); + qib_flush_wc(); + __raw_writel(hdr[hdrwords - 1], piobuf + hdrwords - 1); + qib_flush_wc(); + } else + qib_pio_copy(piobuf, hdr, hdrwords); + goto done; + } + + if (flush_wc) + qib_flush_wc(); + qib_pio_copy(piobuf, hdr, hdrwords); + piobuf += hdrwords; + + /* The common case is aligned and contained in one segment. */ + if (likely(ss->num_sge == 1 && len <= ss->sge.length && + !((unsigned long)ss->sge.vaddr & (sizeof(u32) - 1)))) { + u32 *addr = (u32 *) ss->sge.vaddr; + + /* Update address before sending packet. */ + update_sge(ss, len); + if (flush_wc) { + qib_pio_copy(piobuf, addr, dwords - 1); + /* must flush early everything before trigger word */ + qib_flush_wc(); + __raw_writel(addr[dwords - 1], piobuf + dwords - 1); + /* be sure trigger word is written */ + qib_flush_wc(); + } else + qib_pio_copy(piobuf, addr, dwords); + goto done; + } + copy_io(piobuf, ss, len, flush_wc); +done: + if (dd->flags & QIB_USE_SPCL_TRIG) { + u32 spcl_off = (pbufn >= dd->piobcnt2k) ? 2047 : 1023; + + qib_flush_wc(); + __raw_writel(0xaebecede, piobuf_orig + spcl_off); + } + qib_sendbuf_done(dd, pbufn); + if (qp->s_rdma_mr) { + qib_put_mr(qp->s_rdma_mr); + qp->s_rdma_mr = NULL; + } + if (qp->s_wqe) { + spin_lock_irqsave(&qp->s_lock, flags); + qib_send_complete(qp, qp->s_wqe, IB_WC_SUCCESS); + spin_unlock_irqrestore(&qp->s_lock, flags); + } else if (qp->ibqp.qp_type == IB_QPT_RC) { + spin_lock_irqsave(&qp->s_lock, flags); + qib_rc_send_complete(qp, ibhdr); + spin_unlock_irqrestore(&qp->s_lock, flags); + } + return 0; +} + +/** + * qib_verbs_send - send a packet + * @qp: the QP to send on + * @hdr: the packet header + * @hdrwords: the number of 32-bit words in the header + * @ss: the SGE to send + * @len: the length of the packet in bytes + * + * Return zero if packet is sent or queued OK. + * Return non-zero and clear qp->s_flags QIB_S_BUSY otherwise. 
+ */ +int qib_verbs_send(struct qib_qp *qp, struct qib_ib_header *hdr, + u32 hdrwords, struct qib_sge_state *ss, u32 len) +{ + struct qib_devdata *dd = dd_from_ibdev(qp->ibqp.device); + u32 plen; + int ret; + u32 dwords = (len + 3) >> 2; + + /* + * Calculate the send buffer trigger address. + * The +1 counts for the pbc control dword following the pbc length. + */ + plen = hdrwords + dwords + 1; + + /* + * VL15 packets (IB_QPT_SMI) will always use PIO, so we + * can defer SDMA restart until link goes ACTIVE without + * worrying about just how we got there. + */ + if (qp->ibqp.qp_type == IB_QPT_SMI || + !(dd->flags & QIB_HAS_SEND_DMA)) + ret = qib_verbs_send_pio(qp, hdr, hdrwords, ss, len, + plen, dwords); + else + ret = qib_verbs_send_dma(qp, hdr, hdrwords, ss, len, + plen, dwords); + + return ret; +} + +int qib_snapshot_counters(struct qib_pportdata *ppd, u64 *swords, + u64 *rwords, u64 *spkts, u64 *rpkts, + u64 *xmit_wait) +{ + int ret; + struct qib_devdata *dd = ppd->dd; + + if (!(dd->flags & QIB_PRESENT)) { + /* no hardware, freeze, etc. */ + ret = -EINVAL; + goto bail; + } + *swords = dd->f_portcntr(ppd, QIBPORTCNTR_WORDSEND); + *rwords = dd->f_portcntr(ppd, QIBPORTCNTR_WORDRCV); + *spkts = dd->f_portcntr(ppd, QIBPORTCNTR_PKTSEND); + *rpkts = dd->f_portcntr(ppd, QIBPORTCNTR_PKTRCV); + *xmit_wait = dd->f_portcntr(ppd, QIBPORTCNTR_SENDSTALL); + + ret = 0; + +bail: + return ret; +} + +/** + * qib_get_counters - get various chip counters + * @dd: the qlogic_ib device + * @cntrs: counters are placed here + * + * Return the counters needed by recv_pma_get_portcounters(). + */ +int qib_get_counters(struct qib_pportdata *ppd, + struct qib_verbs_counters *cntrs) +{ + int ret; + + if (!(ppd->dd->flags & QIB_PRESENT)) { + /* no hardware, freeze, etc. */ + ret = -EINVAL; + goto bail; + } + cntrs->symbol_error_counter = + ppd->dd->f_portcntr(ppd, QIBPORTCNTR_IBSYMBOLERR); + cntrs->link_error_recovery_counter = + ppd->dd->f_portcntr(ppd, QIBPORTCNTR_IBLINKERRRECOV); + /* + * The link downed counter counts when the other side downs the + * connection. We add in the number of times we downed the link + * due to local link integrity errors to compensate. 
+ */ + cntrs->link_downed_counter = + ppd->dd->f_portcntr(ppd, QIBPORTCNTR_IBLINKDOWN); + cntrs->port_rcv_errors = + ppd->dd->f_portcntr(ppd, QIBPORTCNTR_RXDROPPKT) + + ppd->dd->f_portcntr(ppd, QIBPORTCNTR_RCVOVFL) + + ppd->dd->f_portcntr(ppd, QIBPORTCNTR_ERR_RLEN) + + ppd->dd->f_portcntr(ppd, QIBPORTCNTR_INVALIDRLEN) + + ppd->dd->f_portcntr(ppd, QIBPORTCNTR_ERRLINK) + + ppd->dd->f_portcntr(ppd, QIBPORTCNTR_ERRICRC) + + ppd->dd->f_portcntr(ppd, QIBPORTCNTR_ERRVCRC) + + ppd->dd->f_portcntr(ppd, QIBPORTCNTR_ERRLPCRC) + + ppd->dd->f_portcntr(ppd, QIBPORTCNTR_BADFORMAT); + cntrs->port_rcv_errors += + ppd->dd->f_portcntr(ppd, QIBPORTCNTR_RXLOCALPHYERR); + cntrs->port_rcv_errors += + ppd->dd->f_portcntr(ppd, QIBPORTCNTR_RXVLERR); + cntrs->port_rcv_remphys_errors = + ppd->dd->f_portcntr(ppd, QIBPORTCNTR_RCVEBP); + cntrs->port_xmit_discards = + ppd->dd->f_portcntr(ppd, QIBPORTCNTR_UNSUPVL); + cntrs->port_xmit_data = ppd->dd->f_portcntr(ppd, + QIBPORTCNTR_WORDSEND); + cntrs->port_rcv_data = ppd->dd->f_portcntr(ppd, + QIBPORTCNTR_WORDRCV); + cntrs->port_xmit_packets = ppd->dd->f_portcntr(ppd, + QIBPORTCNTR_PKTSEND); + cntrs->port_rcv_packets = ppd->dd->f_portcntr(ppd, + QIBPORTCNTR_PKTRCV); + cntrs->local_link_integrity_errors = + ppd->dd->f_portcntr(ppd, QIBPORTCNTR_LLI); + cntrs->excessive_buffer_overrun_errors = + ppd->dd->f_portcntr(ppd, QIBPORTCNTR_EXCESSBUFOVFL); + cntrs->vl15_dropped = + ppd->dd->f_portcntr(ppd, QIBPORTCNTR_VL15PKTDROP); + + ret = 0; + +bail: + return ret; +} + +/** + * qib_ib_piobufavail - callback when a PIO buffer is available + * @dd: the device pointer + * + * This is called from qib_intr() at interrupt level when a PIO buffer is + * available after qib_verbs_send() returned an error that no buffers were + * available. Disable the interrupt if there are no more QPs waiting. + */ +void qib_ib_piobufavail(struct qib_devdata *dd) +{ + struct qib_ibdev *dev = &dd->verbs_dev; + struct list_head *list; + struct qib_qp *qps[5]; + struct qib_qp *qp; + unsigned long flags; + unsigned i, n; + + list = &dev->piowait; + n = 0; + + /* + * Note: checking that the piowait list is empty and clearing + * the buffer available interrupt needs to be atomic or we + * could end up with QPs on the wait list with the interrupt + * disabled. + */ + spin_lock_irqsave(&dev->pending_lock, flags); + while (!list_empty(list)) { + if (n == ARRAY_SIZE(qps)) + goto full; + qp = list_entry(list->next, struct qib_qp, iowait); + list_del_init(&qp->iowait); + atomic_inc(&qp->refcount); + qps[n++] = qp; + } + dd->f_wantpiobuf_intr(dd, 0); +full: + spin_unlock_irqrestore(&dev->pending_lock, flags); + + for (i = 0; i < n; i++) { + qp = qps[i]; + + spin_lock_irqsave(&qp->s_lock, flags); + if (qp->s_flags & QIB_S_WAIT_PIO) { + qp->s_flags &= ~QIB_S_WAIT_PIO; + qib_schedule_send(qp); + } + spin_unlock_irqrestore(&qp->s_lock, flags); + + /* Notify qib_destroy_qp() if it is waiting. 
*/ + if (atomic_dec_and_test(&qp->refcount)) + wake_up(&qp->wait); + } +} + +static int qib_query_device(struct ib_device *ibdev, + struct ib_device_attr *props) +{ + struct qib_devdata *dd = dd_from_ibdev(ibdev); + struct qib_ibdev *dev = to_idev(ibdev); + + memset(props, 0, sizeof(*props)); + + props->device_cap_flags = IB_DEVICE_BAD_PKEY_CNTR | + IB_DEVICE_BAD_QKEY_CNTR | IB_DEVICE_SHUTDOWN_PORT | + IB_DEVICE_SYS_IMAGE_GUID | IB_DEVICE_RC_RNR_NAK_GEN | + IB_DEVICE_PORT_ACTIVE_EVENT | IB_DEVICE_SRQ_RESIZE; + props->page_size_cap = PAGE_SIZE; + props->vendor_id = + QIB_SRC_OUI_1 << 16 | QIB_SRC_OUI_2 << 8 | QIB_SRC_OUI_3; + props->vendor_part_id = dd->deviceid; + props->hw_ver = dd->minrev; + props->sys_image_guid = ib_qib_sys_image_guid; + props->max_mr_size = ~0ULL; + props->max_qp = ib_qib_max_qps; + props->max_qp_wr = ib_qib_max_qp_wrs; + props->max_sge = ib_qib_max_sges; + props->max_cq = ib_qib_max_cqs; + props->max_ah = ib_qib_max_ahs; + props->max_cqe = ib_qib_max_cqes; + props->max_mr = dev->lk_table.max; + props->max_fmr = dev->lk_table.max; + props->max_map_per_fmr = 32767; + props->max_pd = ib_qib_max_pds; + props->max_qp_rd_atom = QIB_MAX_RDMA_ATOMIC; + props->max_qp_init_rd_atom = 255; + /* props->max_res_rd_atom */ + props->max_srq = ib_qib_max_srqs; + props->max_srq_wr = ib_qib_max_srq_wrs; + props->max_srq_sge = ib_qib_max_srq_sges; + /* props->local_ca_ack_delay */ + props->atomic_cap = IB_ATOMIC_GLOB; + props->max_pkeys = qib_get_npkeys(dd); + props->max_mcast_grp = ib_qib_max_mcast_grps; + props->max_mcast_qp_attach = ib_qib_max_mcast_qp_attached; + props->max_total_mcast_qp_attach = props->max_mcast_qp_attach * + props->max_mcast_grp; + + return 0; +} + +static int qib_query_port(struct ib_device *ibdev, u8 port, + struct ib_port_attr *props) +{ + struct qib_devdata *dd = dd_from_ibdev(ibdev); + struct qib_ibport *ibp = to_iport(ibdev, port); + struct qib_pportdata *ppd = ppd_from_ibp(ibp); + enum ib_mtu mtu; + u16 lid = ppd->lid; + + memset(props, 0, sizeof(*props)); + props->lid = lid ? lid : be16_to_cpu(IB_LID_PERMISSIVE); + props->lmc = ppd->lmc; + props->sm_lid = ibp->sm_lid; + props->sm_sl = ibp->sm_sl; + props->state = dd->f_iblink_state(ppd->lastibcstat); + props->phys_state = dd->f_ibphys_portstate(ppd->lastibcstat); + props->port_cap_flags = ibp->port_cap_flags; + props->gid_tbl_len = QIB_GUIDS_PER_PORT; + props->max_msg_sz = 0x80000000; + props->pkey_tbl_len = qib_get_npkeys(dd); + props->bad_pkey_cntr = ibp->pkey_violations; + props->qkey_viol_cntr = ibp->qkey_violations; + props->active_width = ppd->link_width_active; + /* See rate_show() */ + props->active_speed = ppd->link_speed_active; + props->max_vl_num = qib_num_vls(ppd->vls_supported); + props->init_type_reply = 0; + + props->max_mtu = qib_ibmtu ? 
qib_ibmtu : IB_MTU_4096; + switch (ppd->ibmtu) { + case 4096: + mtu = IB_MTU_4096; + break; + case 2048: + mtu = IB_MTU_2048; + break; + case 1024: + mtu = IB_MTU_1024; + break; + case 512: + mtu = IB_MTU_512; + break; + case 256: + mtu = IB_MTU_256; + break; + default: + mtu = IB_MTU_2048; + } + props->active_mtu = mtu; + props->subnet_timeout = ibp->subnet_timeout; + + return 0; +} + +static int qib_modify_device(struct ib_device *device, + int device_modify_mask, + struct ib_device_modify *device_modify) +{ + struct qib_devdata *dd = dd_from_ibdev(device); + unsigned i; + int ret; + + if (device_modify_mask & ~(IB_DEVICE_MODIFY_SYS_IMAGE_GUID | + IB_DEVICE_MODIFY_NODE_DESC)) { + ret = -EOPNOTSUPP; + goto bail; + } + + if (device_modify_mask & IB_DEVICE_MODIFY_NODE_DESC) { + memcpy(device->node_desc, device_modify->node_desc, 64); + for (i = 0; i < dd->num_pports; i++) { + struct qib_ibport *ibp = &dd->pport[i].ibport_data; + + qib_node_desc_chg(ibp); + } + } + + if (device_modify_mask & IB_DEVICE_MODIFY_SYS_IMAGE_GUID) { + ib_qib_sys_image_guid = + cpu_to_be64(device_modify->sys_image_guid); + for (i = 0; i < dd->num_pports; i++) { + struct qib_ibport *ibp = &dd->pport[i].ibport_data; + + qib_sys_guid_chg(ibp); + } + } + + ret = 0; + +bail: + return ret; +} + +static int qib_modify_port(struct ib_device *ibdev, u8 port, + int port_modify_mask, struct ib_port_modify *props) +{ + struct qib_ibport *ibp = to_iport(ibdev, port); + struct qib_pportdata *ppd = ppd_from_ibp(ibp); + + ibp->port_cap_flags |= props->set_port_cap_mask; + ibp->port_cap_flags &= ~props->clr_port_cap_mask; + if (props->set_port_cap_mask || props->clr_port_cap_mask) + qib_cap_mask_chg(ibp); + if (port_modify_mask & IB_PORT_SHUTDOWN) + qib_set_linkstate(ppd, QIB_IB_LINKDOWN); + if (port_modify_mask & IB_PORT_RESET_QKEY_CNTR) + ibp->qkey_violations = 0; + return 0; +} + +static int qib_query_gid(struct ib_device *ibdev, u8 port, + int index, union ib_gid *gid) +{ + struct qib_devdata *dd = dd_from_ibdev(ibdev); + int ret = 0; + + if (!port || port > dd->num_pports) + ret = -EINVAL; + else { + struct qib_ibport *ibp = to_iport(ibdev, port); + struct qib_pportdata *ppd = ppd_from_ibp(ibp); + + gid->global.subnet_prefix = ibp->gid_prefix; + if (index == 0) + gid->global.interface_id = ppd->guid; + else if (index < QIB_GUIDS_PER_PORT) + gid->global.interface_id = ibp->guids[index - 1]; + else + ret = -EINVAL; + } + + return ret; +} + +static struct ib_pd *qib_alloc_pd(struct ib_device *ibdev, + struct ib_ucontext *context, + struct ib_udata *udata) +{ + struct qib_ibdev *dev = to_idev(ibdev); + struct qib_pd *pd; + struct ib_pd *ret; + + /* + * This is actually totally arbitrary. Some correctness tests + * assume there's a maximum number of PDs that can be allocated. + * We don't actually have this limit, but we fail the test if + * we allow allocations of more than we report for this value. + */ + + pd = kmalloc(sizeof(*pd), GFP_KERNEL); + if (!pd) { + ret = ERR_PTR(-ENOMEM); + goto bail; + } + + spin_lock(&dev->n_pds_lock); + if (dev->n_pds_allocated == ib_qib_max_pds) { + spin_unlock(&dev->n_pds_lock); + kfree(pd); + ret = ERR_PTR(-ENOMEM); + goto bail; + } + + dev->n_pds_allocated++; + spin_unlock(&dev->n_pds_lock); + + /* ib_alloc_pd() will initialize pd->ibpd. 
*/ + pd->user = udata != NULL; + + ret = &pd->ibpd; + +bail: + return ret; +} + +static int qib_dealloc_pd(struct ib_pd *ibpd) +{ + struct qib_pd *pd = to_ipd(ibpd); + struct qib_ibdev *dev = to_idev(ibpd->device); + + spin_lock(&dev->n_pds_lock); + dev->n_pds_allocated--; + spin_unlock(&dev->n_pds_lock); + + kfree(pd); + + return 0; +} + +int qib_check_ah(struct ib_device *ibdev, struct ib_ah_attr *ah_attr) +{ + /* A multicast address requires a GRH (see ch. 8.4.1). */ + if (ah_attr->dlid >= QIB_MULTICAST_LID_BASE && + ah_attr->dlid != QIB_PERMISSIVE_LID && + !(ah_attr->ah_flags & IB_AH_GRH)) + goto bail; + if ((ah_attr->ah_flags & IB_AH_GRH) && + ah_attr->grh.sgid_index >= QIB_GUIDS_PER_PORT) + goto bail; + if (ah_attr->dlid == 0) + goto bail; + if (ah_attr->port_num < 1 || + ah_attr->port_num > ibdev->phys_port_cnt) + goto bail; + if (ah_attr->static_rate != IB_RATE_PORT_CURRENT && + ib_rate_to_mult(ah_attr->static_rate) < 0) + goto bail; + if (ah_attr->sl > 15) + goto bail; + return 0; +bail: + return -EINVAL; +} + +/** + * qib_create_ah - create an address handle + * @pd: the protection domain + * @ah_attr: the attributes of the AH + * + * This may be called from interrupt context. + */ +static struct ib_ah *qib_create_ah(struct ib_pd *pd, + struct ib_ah_attr *ah_attr) +{ + struct qib_ah *ah; + struct ib_ah *ret; + struct qib_ibdev *dev = to_idev(pd->device); + unsigned long flags; + + if (qib_check_ah(pd->device, ah_attr)) { + ret = ERR_PTR(-EINVAL); + goto bail; + } + + ah = kmalloc(sizeof(*ah), GFP_ATOMIC); + if (!ah) { + ret = ERR_PTR(-ENOMEM); + goto bail; + } + + spin_lock_irqsave(&dev->n_ahs_lock, flags); + if (dev->n_ahs_allocated == ib_qib_max_ahs) { + spin_unlock_irqrestore(&dev->n_ahs_lock, flags); + kfree(ah); + ret = ERR_PTR(-ENOMEM); + goto bail; + } + + dev->n_ahs_allocated++; + spin_unlock_irqrestore(&dev->n_ahs_lock, flags); + + /* ib_create_ah() will initialize ah->ibah. */ + ah->attr = *ah_attr; + atomic_set(&ah->refcount, 0); + + ret = &ah->ibah; + +bail: + return ret; +} + +struct ib_ah *qib_create_qp0_ah(struct qib_ibport *ibp, u16 dlid) +{ + struct ib_ah_attr attr; + struct ib_ah *ah = ERR_PTR(-EINVAL); + struct qib_qp *qp0; + + memset(&attr, 0, sizeof(attr)); + attr.dlid = dlid; + attr.port_num = ppd_from_ibp(ibp)->port; + rcu_read_lock(); + qp0 = rcu_dereference(ibp->qp0); + if (qp0) + ah = ib_create_ah(qp0->ibqp.pd, &attr); + rcu_read_unlock(); + return ah; +} + +/** + * qib_destroy_ah - destroy an address handle + * @ibah: the AH to destroy + * + * This may be called from interrupt context. 
+ */ +static int qib_destroy_ah(struct ib_ah *ibah) +{ + struct qib_ibdev *dev = to_idev(ibah->device); + struct qib_ah *ah = to_iah(ibah); + unsigned long flags; + + if (atomic_read(&ah->refcount) != 0) + return -EBUSY; + + spin_lock_irqsave(&dev->n_ahs_lock, flags); + dev->n_ahs_allocated--; + spin_unlock_irqrestore(&dev->n_ahs_lock, flags); + + kfree(ah); + + return 0; +} + +static int qib_modify_ah(struct ib_ah *ibah, struct ib_ah_attr *ah_attr) +{ + struct qib_ah *ah = to_iah(ibah); + + if (qib_check_ah(ibah->device, ah_attr)) + return -EINVAL; + + ah->attr = *ah_attr; + + return 0; +} + +static int qib_query_ah(struct ib_ah *ibah, struct ib_ah_attr *ah_attr) +{ + struct qib_ah *ah = to_iah(ibah); + + *ah_attr = ah->attr; + + return 0; +} + +/** + * qib_get_npkeys - return the size of the PKEY table for context 0 + * @dd: the qlogic_ib device + */ +unsigned qib_get_npkeys(struct qib_devdata *dd) +{ + return ARRAY_SIZE(dd->rcd[0]->pkeys); +} + +/* + * Return the indexed PKEY from the port PKEY table. + * No need to validate rcd[ctxt]; the port is setup if we are here. + */ +unsigned qib_get_pkey(struct qib_ibport *ibp, unsigned index) +{ + struct qib_pportdata *ppd = ppd_from_ibp(ibp); + struct qib_devdata *dd = ppd->dd; + unsigned ctxt = ppd->hw_pidx; + unsigned ret; + + /* dd->rcd null if mini_init or some init failures */ + if (!dd->rcd || index >= ARRAY_SIZE(dd->rcd[ctxt]->pkeys)) + ret = 0; + else + ret = dd->rcd[ctxt]->pkeys[index]; + + return ret; +} + +static int qib_query_pkey(struct ib_device *ibdev, u8 port, u16 index, + u16 *pkey) +{ + struct qib_devdata *dd = dd_from_ibdev(ibdev); + int ret; + + if (index >= qib_get_npkeys(dd)) { + ret = -EINVAL; + goto bail; + } + + *pkey = qib_get_pkey(to_iport(ibdev, port), index); + ret = 0; + +bail: + return ret; +} + +/** + * qib_alloc_ucontext - allocate a ucontest + * @ibdev: the infiniband device + * @udata: not used by the QLogic_IB driver + */ + +static struct ib_ucontext *qib_alloc_ucontext(struct ib_device *ibdev, + struct ib_udata *udata) +{ + struct qib_ucontext *context; + struct ib_ucontext *ret; + + context = kmalloc(sizeof(*context), GFP_KERNEL); + if (!context) { + ret = ERR_PTR(-ENOMEM); + goto bail; + } + + ret = &context->ibucontext; + +bail: + return ret; +} + +static int qib_dealloc_ucontext(struct ib_ucontext *context) +{ + kfree(to_iucontext(context)); + return 0; +} + +static void init_ibport(struct qib_pportdata *ppd) +{ + struct qib_verbs_counters cntrs; + struct qib_ibport *ibp = &ppd->ibport_data; + + spin_lock_init(&ibp->lock); + /* Set the prefix to the default value (see ch. 4.1.1) */ + ibp->gid_prefix = IB_DEFAULT_GID_PREFIX; + ibp->sm_lid = be16_to_cpu(IB_LID_PERMISSIVE); + ibp->port_cap_flags = IB_PORT_SYS_IMAGE_GUID_SUP | + IB_PORT_CLIENT_REG_SUP | IB_PORT_SL_MAP_SUP | + IB_PORT_TRAP_SUP | IB_PORT_AUTO_MIGR_SUP | + IB_PORT_DR_NOTICE_SUP | IB_PORT_CAP_MASK_NOTICE_SUP | + IB_PORT_OTHER_LOCAL_CHANGES_SUP; + if (ppd->dd->flags & QIB_HAS_LINK_LATENCY) + ibp->port_cap_flags |= IB_PORT_LINK_LATENCY_SUP; + ibp->pma_counter_select[0] = IB_PMA_PORT_XMIT_DATA; + ibp->pma_counter_select[1] = IB_PMA_PORT_RCV_DATA; + ibp->pma_counter_select[2] = IB_PMA_PORT_XMIT_PKTS; + ibp->pma_counter_select[3] = IB_PMA_PORT_RCV_PKTS; + ibp->pma_counter_select[4] = IB_PMA_PORT_XMIT_WAIT; + + /* Snapshot current HW counters to "clear" them. 
*/ + qib_get_counters(ppd, &cntrs); + ibp->z_symbol_error_counter = cntrs.symbol_error_counter; + ibp->z_link_error_recovery_counter = + cntrs.link_error_recovery_counter; + ibp->z_link_downed_counter = cntrs.link_downed_counter; + ibp->z_port_rcv_errors = cntrs.port_rcv_errors; + ibp->z_port_rcv_remphys_errors = cntrs.port_rcv_remphys_errors; + ibp->z_port_xmit_discards = cntrs.port_xmit_discards; + ibp->z_port_xmit_data = cntrs.port_xmit_data; + ibp->z_port_rcv_data = cntrs.port_rcv_data; + ibp->z_port_xmit_packets = cntrs.port_xmit_packets; + ibp->z_port_rcv_packets = cntrs.port_rcv_packets; + ibp->z_local_link_integrity_errors = + cntrs.local_link_integrity_errors; + ibp->z_excessive_buffer_overrun_errors = + cntrs.excessive_buffer_overrun_errors; + ibp->z_vl15_dropped = cntrs.vl15_dropped; + RCU_INIT_POINTER(ibp->qp0, NULL); + RCU_INIT_POINTER(ibp->qp1, NULL); +} + +/** + * qib_register_ib_device - register our device with the infiniband core + * @dd: the device data structure + * Return the allocated qib_ibdev pointer or NULL on error. + */ +int qib_register_ib_device(struct qib_devdata *dd) +{ + struct qib_ibdev *dev = &dd->verbs_dev; + struct ib_device *ibdev = &dev->ibdev; + struct qib_pportdata *ppd = dd->pport; + unsigned i, lk_tab_size; + int ret; + + dev->qp_table_size = ib_qib_qp_table_size; + get_random_bytes(&dev->qp_rnd, sizeof(dev->qp_rnd)); + dev->qp_table = kmalloc_array( + dev->qp_table_size, + sizeof(*dev->qp_table), + GFP_KERNEL); + if (!dev->qp_table) { + ret = -ENOMEM; + goto err_qpt; + } + for (i = 0; i < dev->qp_table_size; i++) + RCU_INIT_POINTER(dev->qp_table[i], NULL); + + for (i = 0; i < dd->num_pports; i++) + init_ibport(ppd + i); + + /* Only need to initialize non-zero fields. */ + spin_lock_init(&dev->qpt_lock); + spin_lock_init(&dev->n_pds_lock); + spin_lock_init(&dev->n_ahs_lock); + spin_lock_init(&dev->n_cqs_lock); + spin_lock_init(&dev->n_qps_lock); + spin_lock_init(&dev->n_srqs_lock); + spin_lock_init(&dev->n_mcast_grps_lock); + init_timer(&dev->mem_timer); + dev->mem_timer.function = mem_timer; + dev->mem_timer.data = (unsigned long) dev; + + qib_init_qpn_table(dd, &dev->qpn_table); + + /* + * The top ib_qib_lkey_table_size bits are used to index the + * table. The lower 8 bits can be owned by the user (copied from + * the LKEY). The remaining bits act as a generation number or tag. 
+ */ + spin_lock_init(&dev->lk_table.lock); + dev->lk_table.max = 1 << ib_qib_lkey_table_size; + lk_tab_size = dev->lk_table.max * sizeof(*dev->lk_table.table); + dev->lk_table.table = (struct qib_mregion __rcu **) + __get_free_pages(GFP_KERNEL, get_order(lk_tab_size)); + if (dev->lk_table.table == NULL) { + ret = -ENOMEM; + goto err_lk; + } + RCU_INIT_POINTER(dev->dma_mr, NULL); + for (i = 0; i < dev->lk_table.max; i++) + RCU_INIT_POINTER(dev->lk_table.table[i], NULL); + INIT_LIST_HEAD(&dev->pending_mmaps); + spin_lock_init(&dev->pending_lock); + dev->mmap_offset = PAGE_SIZE; + spin_lock_init(&dev->mmap_offset_lock); + INIT_LIST_HEAD(&dev->piowait); + INIT_LIST_HEAD(&dev->dmawait); + INIT_LIST_HEAD(&dev->txwait); + INIT_LIST_HEAD(&dev->memwait); + INIT_LIST_HEAD(&dev->txreq_free); + + if (ppd->sdma_descq_cnt) { + dev->pio_hdrs = dma_alloc_coherent(&dd->pcidev->dev, + ppd->sdma_descq_cnt * + sizeof(struct qib_pio_header), + &dev->pio_hdrs_phys, + GFP_KERNEL); + if (!dev->pio_hdrs) { + ret = -ENOMEM; + goto err_hdrs; + } + } + + for (i = 0; i < ppd->sdma_descq_cnt; i++) { + struct qib_verbs_txreq *tx; + + tx = kzalloc(sizeof(*tx), GFP_KERNEL); + if (!tx) { + ret = -ENOMEM; + goto err_tx; + } + tx->hdr_inx = i; + list_add(&tx->txreq.list, &dev->txreq_free); + } + + /* + * The system image GUID is supposed to be the same for all + * IB HCAs in a single system but since there can be other + * device types in the system, we can't be sure this is unique. + */ + if (!ib_qib_sys_image_guid) + ib_qib_sys_image_guid = ppd->guid; + + strlcpy(ibdev->name, "qib%d", IB_DEVICE_NAME_MAX); + ibdev->owner = THIS_MODULE; + ibdev->node_guid = ppd->guid; + ibdev->uverbs_abi_ver = QIB_UVERBS_ABI_VERSION; + ibdev->uverbs_cmd_mask = + (1ull << IB_USER_VERBS_CMD_GET_CONTEXT) | + (1ull << IB_USER_VERBS_CMD_QUERY_DEVICE) | + (1ull << IB_USER_VERBS_CMD_QUERY_PORT) | + (1ull << IB_USER_VERBS_CMD_ALLOC_PD) | + (1ull << IB_USER_VERBS_CMD_DEALLOC_PD) | + (1ull << IB_USER_VERBS_CMD_CREATE_AH) | + (1ull << IB_USER_VERBS_CMD_MODIFY_AH) | + (1ull << IB_USER_VERBS_CMD_QUERY_AH) | + (1ull << IB_USER_VERBS_CMD_DESTROY_AH) | + (1ull << IB_USER_VERBS_CMD_REG_MR) | + (1ull << IB_USER_VERBS_CMD_DEREG_MR) | + (1ull << IB_USER_VERBS_CMD_CREATE_COMP_CHANNEL) | + (1ull << IB_USER_VERBS_CMD_CREATE_CQ) | + (1ull << IB_USER_VERBS_CMD_RESIZE_CQ) | + (1ull << IB_USER_VERBS_CMD_DESTROY_CQ) | + (1ull << IB_USER_VERBS_CMD_POLL_CQ) | + (1ull << IB_USER_VERBS_CMD_REQ_NOTIFY_CQ) | + (1ull << IB_USER_VERBS_CMD_CREATE_QP) | + (1ull << IB_USER_VERBS_CMD_QUERY_QP) | + (1ull << IB_USER_VERBS_CMD_MODIFY_QP) | + (1ull << IB_USER_VERBS_CMD_DESTROY_QP) | + (1ull << IB_USER_VERBS_CMD_POST_SEND) | + (1ull << IB_USER_VERBS_CMD_POST_RECV) | + (1ull << IB_USER_VERBS_CMD_ATTACH_MCAST) | + (1ull << IB_USER_VERBS_CMD_DETACH_MCAST) | + (1ull << IB_USER_VERBS_CMD_CREATE_SRQ) | + (1ull << IB_USER_VERBS_CMD_MODIFY_SRQ) | + (1ull << IB_USER_VERBS_CMD_QUERY_SRQ) | + (1ull << IB_USER_VERBS_CMD_DESTROY_SRQ) | + (1ull << IB_USER_VERBS_CMD_POST_SRQ_RECV); + ibdev->node_type = RDMA_NODE_IB_CA; + ibdev->phys_port_cnt = dd->num_pports; + ibdev->num_comp_vectors = 1; + ibdev->dma_device = &dd->pcidev->dev; + ibdev->query_device = qib_query_device; + ibdev->modify_device = qib_modify_device; + ibdev->query_port = qib_query_port; + ibdev->modify_port = qib_modify_port; + ibdev->query_pkey = qib_query_pkey; + ibdev->query_gid = qib_query_gid; + ibdev->alloc_ucontext = qib_alloc_ucontext; + ibdev->dealloc_ucontext = qib_dealloc_ucontext; + ibdev->alloc_pd = qib_alloc_pd; + 
ibdev->dealloc_pd = qib_dealloc_pd; + ibdev->create_ah = qib_create_ah; + ibdev->destroy_ah = qib_destroy_ah; + ibdev->modify_ah = qib_modify_ah; + ibdev->query_ah = qib_query_ah; + ibdev->create_srq = qib_create_srq; + ibdev->modify_srq = qib_modify_srq; + ibdev->query_srq = qib_query_srq; + ibdev->destroy_srq = qib_destroy_srq; + ibdev->create_qp = qib_create_qp; + ibdev->modify_qp = qib_modify_qp; + ibdev->query_qp = qib_query_qp; + ibdev->destroy_qp = qib_destroy_qp; + ibdev->post_send = qib_post_send; + ibdev->post_recv = qib_post_receive; + ibdev->post_srq_recv = qib_post_srq_receive; + ibdev->create_cq = qib_create_cq; + ibdev->destroy_cq = qib_destroy_cq; + ibdev->resize_cq = qib_resize_cq; + ibdev->poll_cq = qib_poll_cq; + ibdev->req_notify_cq = qib_req_notify_cq; + ibdev->get_dma_mr = qib_get_dma_mr; + ibdev->reg_phys_mr = qib_reg_phys_mr; + ibdev->reg_user_mr = qib_reg_user_mr; + ibdev->dereg_mr = qib_dereg_mr; + ibdev->alloc_fast_reg_mr = qib_alloc_fast_reg_mr; + ibdev->alloc_fast_reg_page_list = qib_alloc_fast_reg_page_list; + ibdev->free_fast_reg_page_list = qib_free_fast_reg_page_list; + ibdev->alloc_fmr = qib_alloc_fmr; + ibdev->map_phys_fmr = qib_map_phys_fmr; + ibdev->unmap_fmr = qib_unmap_fmr; + ibdev->dealloc_fmr = qib_dealloc_fmr; + ibdev->attach_mcast = qib_multicast_attach; + ibdev->detach_mcast = qib_multicast_detach; + ibdev->process_mad = qib_process_mad; + ibdev->mmap = qib_mmap; + ibdev->dma_ops = &qib_dma_mapping_ops; + + snprintf(ibdev->node_desc, sizeof(ibdev->node_desc), + "Intel Infiniband HCA %s", init_utsname()->nodename); + + ret = ib_register_device(ibdev, qib_create_port_files); + if (ret) + goto err_reg; + + ret = qib_create_agents(dev); + if (ret) + goto err_agents; + + ret = qib_verbs_register_sysfs(dd); + if (ret) + goto err_class; + + goto bail; + +err_class: + qib_free_agents(dev); +err_agents: + ib_unregister_device(ibdev); +err_reg: +err_tx: + while (!list_empty(&dev->txreq_free)) { + struct list_head *l = dev->txreq_free.next; + struct qib_verbs_txreq *tx; + + list_del(l); + tx = list_entry(l, struct qib_verbs_txreq, txreq.list); + kfree(tx); + } + if (ppd->sdma_descq_cnt) + dma_free_coherent(&dd->pcidev->dev, + ppd->sdma_descq_cnt * + sizeof(struct qib_pio_header), + dev->pio_hdrs, dev->pio_hdrs_phys); +err_hdrs: + free_pages((unsigned long) dev->lk_table.table, get_order(lk_tab_size)); +err_lk: + kfree(dev->qp_table); +err_qpt: + qib_dev_err(dd, "cannot register verbs: %d!\n", -ret); +bail: + return ret; +} + +void qib_unregister_ib_device(struct qib_devdata *dd) +{ + struct qib_ibdev *dev = &dd->verbs_dev; + struct ib_device *ibdev = &dev->ibdev; + u32 qps_inuse; + unsigned lk_tab_size; + + qib_verbs_unregister_sysfs(dd); + + qib_free_agents(dev); + + ib_unregister_device(ibdev); + + if (!list_empty(&dev->piowait)) + qib_dev_err(dd, "piowait list not empty!\n"); + if (!list_empty(&dev->dmawait)) + qib_dev_err(dd, "dmawait list not empty!\n"); + if (!list_empty(&dev->txwait)) + qib_dev_err(dd, "txwait list not empty!\n"); + if (!list_empty(&dev->memwait)) + qib_dev_err(dd, "memwait list not empty!\n"); + if (dev->dma_mr) + qib_dev_err(dd, "DMA MR not NULL!\n"); + + qps_inuse = qib_free_all_qps(dd); + if (qps_inuse) + qib_dev_err(dd, "QP memory leak! 
%u still in use\n",
+			    qps_inuse);
+
+	del_timer_sync(&dev->mem_timer);
+	qib_free_qpn_table(&dev->qpn_table);
+	while (!list_empty(&dev->txreq_free)) {
+		struct list_head *l = dev->txreq_free.next;
+		struct qib_verbs_txreq *tx;
+
+		list_del(l);
+		tx = list_entry(l, struct qib_verbs_txreq, txreq.list);
+		kfree(tx);
+	}
+	if (dd->pport->sdma_descq_cnt)
+		dma_free_coherent(&dd->pcidev->dev,
+				  dd->pport->sdma_descq_cnt *
+					sizeof(struct qib_pio_header),
+				  dev->pio_hdrs, dev->pio_hdrs_phys);
+	lk_tab_size = dev->lk_table.max * sizeof(*dev->lk_table.table);
+	free_pages((unsigned long) dev->lk_table.table,
+		   get_order(lk_tab_size));
+	kfree(dev->qp_table);
+}
+
+/*
+ * This must be called with s_lock held.
+ */
+void qib_schedule_send(struct qib_qp *qp)
+{
+	if (qib_send_ok(qp)) {
+		struct qib_ibport *ibp =
+			to_iport(qp->ibqp.device, qp->port_num);
+		struct qib_pportdata *ppd = ppd_from_ibp(ibp);
+
+		queue_work(ppd->qib_wq, &qp->s_work);
+	}
+}
diff --git a/kernel/drivers/infiniband/hw/qib/qib_verbs.h b/kernel/drivers/infiniband/hw/qib/qib_verbs.h
new file mode 100644
index 000000000..bfc8948fd
--- /dev/null
+++ b/kernel/drivers/infiniband/hw/qib/qib_verbs.h
@@ -0,0 +1,1173 @@
+/*
+ * Copyright (c) 2012, 2013 Intel Corporation. All rights reserved.
+ * Copyright (c) 2006 - 2012 QLogic Corporation. All rights reserved.
+ * Copyright (c) 2005, 2006 PathScale, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ *   copyright notice, this list of conditions and the following
+ *   disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ *   copyright notice, this list of conditions and the following
+ *   disclaimer in the documentation and/or other materials
+ *   provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef QIB_VERBS_H
+#define QIB_VERBS_H
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+struct qib_ctxtdata;
+struct qib_pportdata;
+struct qib_devdata;
+struct qib_verbs_txreq;
+
+#define QIB_MAX_RDMA_ATOMIC 16
+#define QIB_GUIDS_PER_PORT 5
+
+#define QPN_MAX (1 << 24)
+#define QPNMAP_ENTRIES (QPN_MAX / PAGE_SIZE / BITS_PER_BYTE)
+
+/*
+ * Increment this value if any changes that break userspace ABI
+ * compatibility are made.
+ */
+#define QIB_UVERBS_ABI_VERSION 2
+
+/*
+ * Define an ib_cq_notify value that is not valid so we know when CQ
+ * notifications are armed.
+ */
+#define IB_CQ_NONE (IB_CQ_NEXT_COMP + 1)
+
+#define IB_SEQ_NAK (3 << 29)
+
+/* AETH NAK opcode values */
+#define IB_RNR_NAK 0x20
+#define IB_NAK_PSN_ERROR 0x60
+#define IB_NAK_INVALID_REQUEST 0x61
+#define IB_NAK_REMOTE_ACCESS_ERROR 0x62
+#define IB_NAK_REMOTE_OPERATIONAL_ERROR 0x63
+#define IB_NAK_INVALID_RD_REQUEST 0x64
+
+/* Flags for checking QP state (see ib_qib_state_ops[]) */
+#define QIB_POST_SEND_OK 0x01
+#define QIB_POST_RECV_OK 0x02
+#define QIB_PROCESS_RECV_OK 0x04
+#define QIB_PROCESS_SEND_OK 0x08
+#define QIB_PROCESS_NEXT_SEND_OK 0x10
+#define QIB_FLUSH_SEND 0x20
+#define QIB_FLUSH_RECV 0x40
+#define QIB_PROCESS_OR_FLUSH_SEND \
+	(QIB_PROCESS_SEND_OK | QIB_FLUSH_SEND)
+
+/* IB Performance Manager status values */
+#define IB_PMA_SAMPLE_STATUS_DONE 0x00
+#define IB_PMA_SAMPLE_STATUS_STARTED 0x01
+#define IB_PMA_SAMPLE_STATUS_RUNNING 0x02
+
+/* Mandatory IB performance counter select values. */
+#define IB_PMA_PORT_XMIT_DATA cpu_to_be16(0x0001)
+#define IB_PMA_PORT_RCV_DATA cpu_to_be16(0x0002)
+#define IB_PMA_PORT_XMIT_PKTS cpu_to_be16(0x0003)
+#define IB_PMA_PORT_RCV_PKTS cpu_to_be16(0x0004)
+#define IB_PMA_PORT_XMIT_WAIT cpu_to_be16(0x0005)
+
+#define QIB_VENDOR_IPG cpu_to_be16(0xFFA0)
+
+#define IB_BTH_REQ_ACK (1 << 31)
+#define IB_BTH_SOLICITED (1 << 23)
+#define IB_BTH_MIG_REQ (1 << 22)
+
+/* XXX Should be defined in ib_verbs.h enum ib_port_cap_flags */
+#define IB_PORT_OTHER_LOCAL_CHANGES_SUP (1 << 26)
+
+#define IB_GRH_VERSION 6
+#define IB_GRH_VERSION_MASK 0xF
+#define IB_GRH_VERSION_SHIFT 28
+#define IB_GRH_TCLASS_MASK 0xFF
+#define IB_GRH_TCLASS_SHIFT 20
+#define IB_GRH_FLOW_MASK 0xFFFFF
+#define IB_GRH_FLOW_SHIFT 0
+#define IB_GRH_NEXT_HDR 0x1B
+
+#define IB_DEFAULT_GID_PREFIX cpu_to_be64(0xfe80000000000000ULL)
+
+/* Values for set/get portinfo VLCap OperationalVLs */
+#define IB_VL_VL0 1
+#define IB_VL_VL0_1 2
+#define IB_VL_VL0_3 3
+#define IB_VL_VL0_7 4
+#define IB_VL_VL0_14 5
+
+static inline int qib_num_vls(int vls)
+{
+	switch (vls) {
+	default:
+	case IB_VL_VL0:
+		return 1;
+	case IB_VL_VL0_1:
+		return 2;
+	case IB_VL_VL0_3:
+		return 4;
+	case IB_VL_VL0_7:
+		return 8;
+	case IB_VL_VL0_14:
+		return 15;
+	}
+}
+
+struct ib_reth {
+	__be64 vaddr;
+	__be32 rkey;
+	__be32 length;
+} __packed;
+
+struct ib_atomic_eth {
+	__be32 vaddr[2]; /* unaligned so access as 2 32-bit words */
+	__be32 rkey;
+	__be64 swap_data;
+	__be64 compare_data;
+} __packed;
+
+struct qib_other_headers {
+	__be32 bth[3];
+	union {
+		struct {
+			__be32 deth[2];
+			__be32 imm_data;
+		} ud;
+		struct {
+			struct ib_reth reth;
+			__be32 imm_data;
+		} rc;
+		struct {
+			__be32 aeth;
+			__be32 atomic_ack_eth[2];
+		} at;
+		__be32 imm_data;
+		__be32 aeth;
+		struct ib_atomic_eth atomic_eth;
+	} u;
+} __packed;
+
+/*
+ * Note that UD packets with a GRH header are 8+40+12+8 = 68 bytes
+ * long (72 w/ imm_data). Only the first 56 bytes of the IB header
+ * will be in the eager header buffer. The remaining 12 or 16 bytes
+ * are in the data buffer.
+ */
+struct qib_ib_header {
+	__be16 lrh[4];
+	union {
+		struct {
+			struct ib_grh grh;
+			struct qib_other_headers oth;
+		} l;
+		struct qib_other_headers oth;
+	} u;
+} __packed;
+
+struct qib_pio_header {
+	__le32 pbc[2];
+	struct qib_ib_header hdr;
+} __packed;
+
+/*
+ * There is one struct qib_mcast for each multicast GID.
+ * All attached QPs are then stored as a list of
+ * struct qib_mcast_qp.
+ */ +struct qib_mcast_qp { + struct list_head list; + struct qib_qp *qp; +}; + +struct qib_mcast { + struct rb_node rb_node; + union ib_gid mgid; + struct list_head qp_list; + wait_queue_head_t wait; + atomic_t refcount; + int n_attached; +}; + +/* Protection domain */ +struct qib_pd { + struct ib_pd ibpd; + int user; /* non-zero if created from user space */ +}; + +/* Address Handle */ +struct qib_ah { + struct ib_ah ibah; + struct ib_ah_attr attr; + atomic_t refcount; +}; + +/* + * This structure is used by qib_mmap() to validate an offset + * when an mmap() request is made. The vm_area_struct then uses + * this as its vm_private_data. + */ +struct qib_mmap_info { + struct list_head pending_mmaps; + struct ib_ucontext *context; + void *obj; + __u64 offset; + struct kref ref; + unsigned size; +}; + +/* + * This structure is used to contain the head pointer, tail pointer, + * and completion queue entries as a single memory allocation so + * it can be mmap'ed into user space. + */ +struct qib_cq_wc { + u32 head; /* index of next entry to fill */ + u32 tail; /* index of next ib_poll_cq() entry */ + union { + /* these are actually size ibcq.cqe + 1 */ + struct ib_uverbs_wc uqueue[0]; + struct ib_wc kqueue[0]; + }; +}; + +/* + * The completion queue structure. + */ +struct qib_cq { + struct ib_cq ibcq; + struct kthread_work comptask; + struct qib_devdata *dd; + spinlock_t lock; /* protect changes in this struct */ + u8 notify; + u8 triggered; + struct qib_cq_wc *queue; + struct qib_mmap_info *ip; +}; + +/* + * A segment is a linear region of low physical memory. + * XXX Maybe we should use phys addr here and kmap()/kunmap(). + * Used by the verbs layer. + */ +struct qib_seg { + void *vaddr; + size_t length; +}; + +/* The number of qib_segs that fit in a page. */ +#define QIB_SEGSZ (PAGE_SIZE / sizeof(struct qib_seg)) + +struct qib_segarray { + struct qib_seg segs[QIB_SEGSZ]; +}; + +struct qib_mregion { + struct ib_pd *pd; /* shares refcnt of ibmr.pd */ + u64 user_base; /* User's address for this region */ + u64 iova; /* IB start address of this region */ + size_t length; + u32 lkey; + u32 offset; /* offset (bytes) to start of region */ + int access_flags; + u32 max_segs; /* number of qib_segs in all the arrays */ + u32 mapsz; /* size of the map array */ + u8 page_shift; /* 0 - non unform/non powerof2 sizes */ + u8 lkey_published; /* in global table */ + struct completion comp; /* complete when refcount goes to zero */ + struct rcu_head list; + atomic_t refcount; + struct qib_segarray *map[0]; /* the segments */ +}; + +/* + * These keep track of the copy progress within a memory region. + * Used by the verbs layer. + */ +struct qib_sge { + struct qib_mregion *mr; + void *vaddr; /* kernel virtual address of segment */ + u32 sge_length; /* length of the SGE */ + u32 length; /* remaining length of the segment */ + u16 m; /* current index: mr->map[m] */ + u16 n; /* current index: mr->map[m]->segs[n] */ +}; + +/* Memory region */ +struct qib_mr { + struct ib_mr ibmr; + struct ib_umem *umem; + struct qib_mregion mr; /* must be last */ +}; + +/* + * Send work request queue entry. + * The size of the sg_list is determined when the QP is created and stored + * in qp->s_max_sge. 
+ */ +struct qib_swqe { + struct ib_send_wr wr; /* don't use wr.sg_list */ + u32 psn; /* first packet sequence number */ + u32 lpsn; /* last packet sequence number */ + u32 ssn; /* send sequence number */ + u32 length; /* total length of data in sg_list */ + struct qib_sge sg_list[0]; +}; + +/* + * Receive work request queue entry. + * The size of the sg_list is determined when the QP (or SRQ) is created + * and stored in qp->r_rq.max_sge (or srq->rq.max_sge). + */ +struct qib_rwqe { + u64 wr_id; + u8 num_sge; + struct ib_sge sg_list[0]; +}; + +/* + * This structure is used to contain the head pointer, tail pointer, + * and receive work queue entries as a single memory allocation so + * it can be mmap'ed into user space. + * Note that the wq array elements are variable size so you can't + * just index into the array to get the N'th element; + * use get_rwqe_ptr() instead. + */ +struct qib_rwq { + u32 head; /* new work requests posted to the head */ + u32 tail; /* receives pull requests from here. */ + struct qib_rwqe wq[0]; +}; + +struct qib_rq { + struct qib_rwq *wq; + u32 size; /* size of RWQE array */ + u8 max_sge; + spinlock_t lock /* protect changes in this struct */ + ____cacheline_aligned_in_smp; +}; + +struct qib_srq { + struct ib_srq ibsrq; + struct qib_rq rq; + struct qib_mmap_info *ip; + /* send signal when number of RWQEs < limit */ + u32 limit; +}; + +struct qib_sge_state { + struct qib_sge *sg_list; /* next SGE to be used if any */ + struct qib_sge sge; /* progress state for the current SGE */ + u32 total_len; + u8 num_sge; +}; + +/* + * This structure holds the information that the send tasklet needs + * to send a RDMA read response or atomic operation. + */ +struct qib_ack_entry { + u8 opcode; + u8 sent; + u32 psn; + u32 lpsn; + union { + struct qib_sge rdma_sge; + u64 atomic_data; + }; +}; + +/* + * Variables prefixed with s_ are for the requester (sender). + * Variables prefixed with r_ are for the responder (receiver). + * Variables prefixed with ack_ are for responder replies. + * + * Common variables are protected by both r_rq.lock and s_lock in that order + * which only happens in modify_qp() or changing the QP 'state'. 
+ */ +struct qib_qp { + struct ib_qp ibqp; + /* read mostly fields above and below */ + struct ib_ah_attr remote_ah_attr; + struct ib_ah_attr alt_ah_attr; + struct qib_qp __rcu *next; /* link list for QPN hash table */ + struct qib_swqe *s_wq; /* send work queue */ + struct qib_mmap_info *ip; + struct qib_ib_header *s_hdr; /* next packet header to send */ + unsigned long timeout_jiffies; /* computed from timeout */ + + enum ib_mtu path_mtu; + u32 remote_qpn; + u32 pmtu; /* decoded from path_mtu */ + u32 qkey; /* QKEY for this QP (for UD or RD) */ + u32 s_size; /* send work queue size */ + u32 s_rnr_timeout; /* number of milliseconds for RNR timeout */ + + u8 state; /* QP state */ + u8 qp_access_flags; + u8 alt_timeout; /* Alternate path timeout for this QP */ + u8 timeout; /* Timeout for this QP */ + u8 s_srate; + u8 s_mig_state; + u8 port_num; + u8 s_pkey_index; /* PKEY index to use */ + u8 s_alt_pkey_index; /* Alternate path PKEY index to use */ + u8 r_max_rd_atomic; /* max number of RDMA read/atomic to receive */ + u8 s_max_rd_atomic; /* max number of RDMA read/atomic to send */ + u8 s_retry_cnt; /* number of times to retry */ + u8 s_rnr_retry_cnt; + u8 r_min_rnr_timer; /* retry timeout value for RNR NAKs */ + u8 s_max_sge; /* size of s_wq->sg_list */ + u8 s_draining; + + /* start of read/write fields */ + + atomic_t refcount ____cacheline_aligned_in_smp; + wait_queue_head_t wait; + + + struct qib_ack_entry s_ack_queue[QIB_MAX_RDMA_ATOMIC + 1] + ____cacheline_aligned_in_smp; + struct qib_sge_state s_rdma_read_sge; + + spinlock_t r_lock ____cacheline_aligned_in_smp; /* used for APM */ + unsigned long r_aflags; + u64 r_wr_id; /* ID for current receive WQE */ + u32 r_ack_psn; /* PSN for next ACK or atomic ACK */ + u32 r_len; /* total length of r_sge */ + u32 r_rcv_len; /* receive data len processed */ + u32 r_psn; /* expected rcv packet sequence number */ + u32 r_msn; /* message sequence number */ + + u8 r_state; /* opcode of last packet received */ + u8 r_flags; + u8 r_head_ack_queue; /* index into s_ack_queue[] */ + + struct list_head rspwait; /* link for waititing to respond */ + + struct qib_sge_state r_sge; /* current receive data */ + struct qib_rq r_rq; /* receive work queue */ + + spinlock_t s_lock ____cacheline_aligned_in_smp; + struct qib_sge_state *s_cur_sge; + u32 s_flags; + struct qib_verbs_txreq *s_tx; + struct qib_swqe *s_wqe; + struct qib_sge_state s_sge; /* current send request data */ + struct qib_mregion *s_rdma_mr; + atomic_t s_dma_busy; + u32 s_cur_size; /* size of send packet in bytes */ + u32 s_len; /* total length of s_sge */ + u32 s_rdma_read_len; /* total length of s_rdma_read_sge */ + u32 s_next_psn; /* PSN for next request */ + u32 s_last_psn; /* last response PSN processed */ + u32 s_sending_psn; /* lowest PSN that is being sent */ + u32 s_sending_hpsn; /* highest PSN that is being sent */ + u32 s_psn; /* current packet sequence number */ + u32 s_ack_rdma_psn; /* PSN for sending RDMA read responses */ + u32 s_ack_psn; /* PSN for acking sends and RDMA writes */ + u32 s_head; /* new entries added here */ + u32 s_tail; /* next entry to process */ + u32 s_cur; /* current work queue entry */ + u32 s_acked; /* last un-ACK'ed entry */ + u32 s_last; /* last completed entry */ + u32 s_ssn; /* SSN of tail entry */ + u32 s_lsn; /* limit sequence number (credit) */ + u16 s_hdrwords; /* size of s_hdr in 32 bit words */ + u16 s_rdma_ack_cnt; + u8 s_state; /* opcode of last packet sent */ + u8 s_ack_state; /* opcode of packet to ACK */ + u8 s_nak_state; /* non-zero if NAK is 
pending */ + u8 r_nak_state; /* non-zero if NAK is pending */ + u8 s_retry; /* requester retry counter */ + u8 s_rnr_retry; /* requester RNR retry counter */ + u8 s_num_rd_atomic; /* number of RDMA read/atomic pending */ + u8 s_tail_ack_queue; /* index into s_ack_queue[] */ + + struct qib_sge_state s_ack_rdma_sge; + struct timer_list s_timer; + struct list_head iowait; /* link for wait PIO buf */ + + struct work_struct s_work; + + wait_queue_head_t wait_dma; + + struct qib_sge r_sg_list[0] /* verified SGEs */ + ____cacheline_aligned_in_smp; +}; + +/* + * Atomic bit definitions for r_aflags. + */ +#define QIB_R_WRID_VALID 0 +#define QIB_R_REWIND_SGE 1 + +/* + * Bit definitions for r_flags. + */ +#define QIB_R_REUSE_SGE 0x01 +#define QIB_R_RDMAR_SEQ 0x02 +#define QIB_R_RSP_NAK 0x04 +#define QIB_R_RSP_SEND 0x08 +#define QIB_R_COMM_EST 0x10 + +/* + * Bit definitions for s_flags. + * + * QIB_S_SIGNAL_REQ_WR - set if QP send WRs contain completion signaled + * QIB_S_BUSY - send tasklet is processing the QP + * QIB_S_TIMER - the RC retry timer is active + * QIB_S_ACK_PENDING - an ACK is waiting to be sent after RDMA read/atomics + * QIB_S_WAIT_FENCE - waiting for all prior RDMA read or atomic SWQEs + * before processing the next SWQE + * QIB_S_WAIT_RDMAR - waiting for a RDMA read or atomic SWQE to complete + * before processing the next SWQE + * QIB_S_WAIT_RNR - waiting for RNR timeout + * QIB_S_WAIT_SSN_CREDIT - waiting for RC credits to process next SWQE + * QIB_S_WAIT_DMA - waiting for send DMA queue to drain before generating + * next send completion entry not via send DMA + * QIB_S_WAIT_PIO - waiting for a send buffer to be available + * QIB_S_WAIT_TX - waiting for a struct qib_verbs_txreq to be available + * QIB_S_WAIT_DMA_DESC - waiting for DMA descriptors to be available + * QIB_S_WAIT_KMEM - waiting for kernel memory to be available + * QIB_S_WAIT_PSN - waiting for a packet to exit the send DMA queue + * QIB_S_WAIT_ACK - waiting for an ACK packet before sending more requests + * QIB_S_SEND_ONE - send one packet, request ACK, then wait for ACK + */ +#define QIB_S_SIGNAL_REQ_WR 0x0001 +#define QIB_S_BUSY 0x0002 +#define QIB_S_TIMER 0x0004 +#define QIB_S_RESP_PENDING 0x0008 +#define QIB_S_ACK_PENDING 0x0010 +#define QIB_S_WAIT_FENCE 0x0020 +#define QIB_S_WAIT_RDMAR 0x0040 +#define QIB_S_WAIT_RNR 0x0080 +#define QIB_S_WAIT_SSN_CREDIT 0x0100 +#define QIB_S_WAIT_DMA 0x0200 +#define QIB_S_WAIT_PIO 0x0400 +#define QIB_S_WAIT_TX 0x0800 +#define QIB_S_WAIT_DMA_DESC 0x1000 +#define QIB_S_WAIT_KMEM 0x2000 +#define QIB_S_WAIT_PSN 0x4000 +#define QIB_S_WAIT_ACK 0x8000 +#define QIB_S_SEND_ONE 0x10000 +#define QIB_S_UNLIMITED_CREDIT 0x20000 + +/* + * Wait flags that would prevent any packet type from being sent. + */ +#define QIB_S_ANY_WAIT_IO (QIB_S_WAIT_PIO | QIB_S_WAIT_TX | \ + QIB_S_WAIT_DMA_DESC | QIB_S_WAIT_KMEM) + +/* + * Wait flags that would prevent send work requests from making progress. + */ +#define QIB_S_ANY_WAIT_SEND (QIB_S_WAIT_FENCE | QIB_S_WAIT_RDMAR | \ + QIB_S_WAIT_RNR | QIB_S_WAIT_SSN_CREDIT | QIB_S_WAIT_DMA | \ + QIB_S_WAIT_PSN | QIB_S_WAIT_ACK) + +#define QIB_S_ANY_WAIT (QIB_S_ANY_WAIT_IO | QIB_S_ANY_WAIT_SEND) + +#define QIB_PSN_CREDIT 16 + +/* + * Since struct qib_swqe is not a fixed size, we can't simply index into + * struct qib_qp.s_wq. This function does the array index computation. 
+ */ +static inline struct qib_swqe *get_swqe_ptr(struct qib_qp *qp, + unsigned n) +{ + return (struct qib_swqe *)((char *)qp->s_wq + + (sizeof(struct qib_swqe) + + qp->s_max_sge * + sizeof(struct qib_sge)) * n); +} + +/* + * Since struct qib_rwqe is not a fixed size, we can't simply index into + * struct qib_rwq.wq. This function does the array index computation. + */ +static inline struct qib_rwqe *get_rwqe_ptr(struct qib_rq *rq, unsigned n) +{ + return (struct qib_rwqe *) + ((char *) rq->wq->wq + + (sizeof(struct qib_rwqe) + + rq->max_sge * sizeof(struct ib_sge)) * n); +} + +/* + * QPN-map pages start out as NULL, they get allocated upon + * first use and are never deallocated. This way, + * large bitmaps are not allocated unless large numbers of QPs are used. + */ +struct qpn_map { + void *page; +}; + +struct qib_qpn_table { + spinlock_t lock; /* protect changes in this struct */ + unsigned flags; /* flags for QP0/1 allocated for each port */ + u32 last; /* last QP number allocated */ + u32 nmaps; /* size of the map table */ + u16 limit; + u16 mask; + /* bit map of free QP numbers other than 0/1 */ + struct qpn_map map[QPNMAP_ENTRIES]; +}; + +struct qib_lkey_table { + spinlock_t lock; /* protect changes in this struct */ + u32 next; /* next unused index (speeds search) */ + u32 gen; /* generation count */ + u32 max; /* size of the table */ + struct qib_mregion __rcu **table; +}; + +struct qib_opcode_stats { + u64 n_packets; /* number of packets */ + u64 n_bytes; /* total number of bytes */ +}; + +struct qib_opcode_stats_perctx { + struct qib_opcode_stats stats[128]; +}; + +struct qib_pma_counters { + u64 n_unicast_xmit; /* total unicast packets sent */ + u64 n_unicast_rcv; /* total unicast packets received */ + u64 n_multicast_xmit; /* total multicast packets sent */ + u64 n_multicast_rcv; /* total multicast packets received */ +}; + +struct qib_ibport { + struct qib_qp __rcu *qp0; + struct qib_qp __rcu *qp1; + struct ib_mad_agent *send_agent; /* agent for SMI (traps) */ + struct qib_ah *sm_ah; + struct qib_ah *smi_ah; + struct rb_root mcast_tree; + spinlock_t lock; /* protect changes in this struct */ + + /* non-zero when timer is set */ + unsigned long mkey_lease_timeout; + unsigned long trap_timeout; + __be64 gid_prefix; /* in network order */ + __be64 mkey; + __be64 guids[QIB_GUIDS_PER_PORT - 1]; /* writable GUIDs */ + u64 tid; /* TID for traps */ + struct qib_pma_counters __percpu *pmastats; + u64 z_unicast_xmit; /* starting count for PMA */ + u64 z_unicast_rcv; /* starting count for PMA */ + u64 z_multicast_xmit; /* starting count for PMA */ + u64 z_multicast_rcv; /* starting count for PMA */ + u64 z_symbol_error_counter; /* starting count for PMA */ + u64 z_link_error_recovery_counter; /* starting count for PMA */ + u64 z_link_downed_counter; /* starting count for PMA */ + u64 z_port_rcv_errors; /* starting count for PMA */ + u64 z_port_rcv_remphys_errors; /* starting count for PMA */ + u64 z_port_xmit_discards; /* starting count for PMA */ + u64 z_port_xmit_data; /* starting count for PMA */ + u64 z_port_rcv_data; /* starting count for PMA */ + u64 z_port_xmit_packets; /* starting count for PMA */ + u64 z_port_rcv_packets; /* starting count for PMA */ + u32 z_local_link_integrity_errors; /* starting count for PMA */ + u32 z_excessive_buffer_overrun_errors; /* starting count for PMA */ + u32 z_vl15_dropped; /* starting count for PMA */ + u32 n_rc_resends; + u32 n_rc_acks; + u32 n_rc_qacks; + u32 n_rc_delayed_comp; + u32 n_seq_naks; + u32 n_rdma_seq; + u32 n_rnr_naks; + u32 
n_other_naks; + u32 n_loop_pkts; + u32 n_pkt_drops; + u32 n_vl15_dropped; + u32 n_rc_timeouts; + u32 n_dmawait; + u32 n_unaligned; + u32 n_rc_dupreq; + u32 n_rc_seqnak; + u32 port_cap_flags; + u32 pma_sample_start; + u32 pma_sample_interval; + __be16 pma_counter_select[5]; + u16 pma_tag; + u16 pkey_violations; + u16 qkey_violations; + u16 mkey_violations; + u16 mkey_lease_period; + u16 sm_lid; + u16 repress_traps; + u8 sm_sl; + u8 mkeyprot; + u8 subnet_timeout; + u8 vl_high_limit; + u8 sl_to_vl[16]; + +}; + + +struct qib_ibdev { + struct ib_device ibdev; + struct list_head pending_mmaps; + spinlock_t mmap_offset_lock; /* protect mmap_offset */ + u32 mmap_offset; + struct qib_mregion __rcu *dma_mr; + + /* QP numbers are shared by all IB ports */ + struct qib_qpn_table qpn_table; + struct qib_lkey_table lk_table; + struct list_head piowait; /* list for wait PIO buf */ + struct list_head dmawait; /* list for wait DMA */ + struct list_head txwait; /* list for wait qib_verbs_txreq */ + struct list_head memwait; /* list for wait kernel memory */ + struct list_head txreq_free; + struct timer_list mem_timer; + struct qib_qp __rcu **qp_table; + struct qib_pio_header *pio_hdrs; + dma_addr_t pio_hdrs_phys; + /* list of QPs waiting for RNR timer */ + spinlock_t pending_lock; /* protect wait lists, PMA counters, etc. */ + u32 qp_table_size; /* size of the hash table */ + u32 qp_rnd; /* random bytes for hash */ + spinlock_t qpt_lock; + + u32 n_piowait; + u32 n_txwait; + + u32 n_pds_allocated; /* number of PDs allocated for device */ + spinlock_t n_pds_lock; + u32 n_ahs_allocated; /* number of AHs allocated for device */ + spinlock_t n_ahs_lock; + u32 n_cqs_allocated; /* number of CQs allocated for device */ + spinlock_t n_cqs_lock; + u32 n_qps_allocated; /* number of QPs allocated for device */ + spinlock_t n_qps_lock; + u32 n_srqs_allocated; /* number of SRQs allocated for device */ + spinlock_t n_srqs_lock; + u32 n_mcast_grps_allocated; /* number of mcast groups allocated */ + spinlock_t n_mcast_grps_lock; +#ifdef CONFIG_DEBUG_FS + /* per HCA debugfs */ + struct dentry *qib_ibdev_dbg; +#endif +}; + +struct qib_verbs_counters { + u64 symbol_error_counter; + u64 link_error_recovery_counter; + u64 link_downed_counter; + u64 port_rcv_errors; + u64 port_rcv_remphys_errors; + u64 port_xmit_discards; + u64 port_xmit_data; + u64 port_rcv_data; + u64 port_xmit_packets; + u64 port_rcv_packets; + u32 local_link_integrity_errors; + u32 excessive_buffer_overrun_errors; + u32 vl15_dropped; +}; + +static inline struct qib_mr *to_imr(struct ib_mr *ibmr) +{ + return container_of(ibmr, struct qib_mr, ibmr); +} + +static inline struct qib_pd *to_ipd(struct ib_pd *ibpd) +{ + return container_of(ibpd, struct qib_pd, ibpd); +} + +static inline struct qib_ah *to_iah(struct ib_ah *ibah) +{ + return container_of(ibah, struct qib_ah, ibah); +} + +static inline struct qib_cq *to_icq(struct ib_cq *ibcq) +{ + return container_of(ibcq, struct qib_cq, ibcq); +} + +static inline struct qib_srq *to_isrq(struct ib_srq *ibsrq) +{ + return container_of(ibsrq, struct qib_srq, ibsrq); +} + +static inline struct qib_qp *to_iqp(struct ib_qp *ibqp) +{ + return container_of(ibqp, struct qib_qp, ibqp); +} + +static inline struct qib_ibdev *to_idev(struct ib_device *ibdev) +{ + return container_of(ibdev, struct qib_ibdev, ibdev); +} + +/* + * Send if not busy or waiting for I/O and either + * a RC response is pending or we can process send work requests. 
+ */ +static inline int qib_send_ok(struct qib_qp *qp) +{ + return !(qp->s_flags & (QIB_S_BUSY | QIB_S_ANY_WAIT_IO)) && + (qp->s_hdrwords || (qp->s_flags & QIB_S_RESP_PENDING) || + !(qp->s_flags & QIB_S_ANY_WAIT_SEND)); +} + +/* + * This must be called with s_lock held. + */ +void qib_schedule_send(struct qib_qp *qp); + +static inline int qib_pkey_ok(u16 pkey1, u16 pkey2) +{ + u16 p1 = pkey1 & 0x7FFF; + u16 p2 = pkey2 & 0x7FFF; + + /* + * Low 15 bits must be non-zero and match, and + * one of the two must be a full member. + */ + return p1 && p1 == p2 && ((__s16)pkey1 < 0 || (__s16)pkey2 < 0); +} + +void qib_bad_pqkey(struct qib_ibport *ibp, __be16 trap_num, u32 key, u32 sl, + u32 qp1, u32 qp2, __be16 lid1, __be16 lid2); +void qib_cap_mask_chg(struct qib_ibport *ibp); +void qib_sys_guid_chg(struct qib_ibport *ibp); +void qib_node_desc_chg(struct qib_ibport *ibp); +int qib_process_mad(struct ib_device *ibdev, int mad_flags, u8 port_num, + struct ib_wc *in_wc, struct ib_grh *in_grh, + struct ib_mad *in_mad, struct ib_mad *out_mad); +int qib_create_agents(struct qib_ibdev *dev); +void qib_free_agents(struct qib_ibdev *dev); + +/* + * Compare the lower 24 bits of the two values. + * Returns an integer <, ==, or > than zero. + */ +static inline int qib_cmp24(u32 a, u32 b) +{ + return (((int) a) - ((int) b)) << 8; +} + +struct qib_mcast *qib_mcast_find(struct qib_ibport *ibp, union ib_gid *mgid); + +int qib_snapshot_counters(struct qib_pportdata *ppd, u64 *swords, + u64 *rwords, u64 *spkts, u64 *rpkts, + u64 *xmit_wait); + +int qib_get_counters(struct qib_pportdata *ppd, + struct qib_verbs_counters *cntrs); + +int qib_multicast_attach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid); + +int qib_multicast_detach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid); + +int qib_mcast_tree_empty(struct qib_ibport *ibp); + +__be32 qib_compute_aeth(struct qib_qp *qp); + +struct qib_qp *qib_lookup_qpn(struct qib_ibport *ibp, u32 qpn); + +struct ib_qp *qib_create_qp(struct ib_pd *ibpd, + struct ib_qp_init_attr *init_attr, + struct ib_udata *udata); + +int qib_destroy_qp(struct ib_qp *ibqp); + +int qib_error_qp(struct qib_qp *qp, enum ib_wc_status err); + +int qib_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, + int attr_mask, struct ib_udata *udata); + +int qib_query_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, + int attr_mask, struct ib_qp_init_attr *init_attr); + +unsigned qib_free_all_qps(struct qib_devdata *dd); + +void qib_init_qpn_table(struct qib_devdata *dd, struct qib_qpn_table *qpt); + +void qib_free_qpn_table(struct qib_qpn_table *qpt); + +#ifdef CONFIG_DEBUG_FS + +struct qib_qp_iter; + +struct qib_qp_iter *qib_qp_iter_init(struct qib_ibdev *dev); + +int qib_qp_iter_next(struct qib_qp_iter *iter); + +void qib_qp_iter_print(struct seq_file *s, struct qib_qp_iter *iter); + +#endif + +void qib_get_credit(struct qib_qp *qp, u32 aeth); + +unsigned qib_pkt_delay(u32 plen, u8 snd_mult, u8 rcv_mult); + +void qib_verbs_sdma_desc_avail(struct qib_pportdata *ppd, unsigned avail); + +void qib_put_txreq(struct qib_verbs_txreq *tx); + +int qib_verbs_send(struct qib_qp *qp, struct qib_ib_header *hdr, + u32 hdrwords, struct qib_sge_state *ss, u32 len); + +void qib_copy_sge(struct qib_sge_state *ss, void *data, u32 length, + int release); + +void qib_skip_sge(struct qib_sge_state *ss, u32 length, int release); + +void qib_uc_rcv(struct qib_ibport *ibp, struct qib_ib_header *hdr, + int has_grh, void *data, u32 tlen, struct qib_qp *qp); + +void qib_rc_rcv(struct qib_ctxtdata *rcd, struct qib_ib_header 
*hdr, + int has_grh, void *data, u32 tlen, struct qib_qp *qp); + +int qib_check_ah(struct ib_device *ibdev, struct ib_ah_attr *ah_attr); + +struct ib_ah *qib_create_qp0_ah(struct qib_ibport *ibp, u16 dlid); + +void qib_rc_rnr_retry(unsigned long arg); + +void qib_rc_send_complete(struct qib_qp *qp, struct qib_ib_header *hdr); + +void qib_rc_error(struct qib_qp *qp, enum ib_wc_status err); + +int qib_post_ud_send(struct qib_qp *qp, struct ib_send_wr *wr); + +void qib_ud_rcv(struct qib_ibport *ibp, struct qib_ib_header *hdr, + int has_grh, void *data, u32 tlen, struct qib_qp *qp); + +int qib_alloc_lkey(struct qib_mregion *mr, int dma_region); + +void qib_free_lkey(struct qib_mregion *mr); + +int qib_lkey_ok(struct qib_lkey_table *rkt, struct qib_pd *pd, + struct qib_sge *isge, struct ib_sge *sge, int acc); + +int qib_rkey_ok(struct qib_qp *qp, struct qib_sge *sge, + u32 len, u64 vaddr, u32 rkey, int acc); + +int qib_post_srq_receive(struct ib_srq *ibsrq, struct ib_recv_wr *wr, + struct ib_recv_wr **bad_wr); + +struct ib_srq *qib_create_srq(struct ib_pd *ibpd, + struct ib_srq_init_attr *srq_init_attr, + struct ib_udata *udata); + +int qib_modify_srq(struct ib_srq *ibsrq, struct ib_srq_attr *attr, + enum ib_srq_attr_mask attr_mask, + struct ib_udata *udata); + +int qib_query_srq(struct ib_srq *ibsrq, struct ib_srq_attr *attr); + +int qib_destroy_srq(struct ib_srq *ibsrq); + +int qib_cq_init(struct qib_devdata *dd); + +void qib_cq_exit(struct qib_devdata *dd); + +void qib_cq_enter(struct qib_cq *cq, struct ib_wc *entry, int sig); + +int qib_poll_cq(struct ib_cq *ibcq, int num_entries, struct ib_wc *entry); + +struct ib_cq *qib_create_cq(struct ib_device *ibdev, int entries, + int comp_vector, struct ib_ucontext *context, + struct ib_udata *udata); + +int qib_destroy_cq(struct ib_cq *ibcq); + +int qib_req_notify_cq(struct ib_cq *ibcq, enum ib_cq_notify_flags notify_flags); + +int qib_resize_cq(struct ib_cq *ibcq, int cqe, struct ib_udata *udata); + +struct ib_mr *qib_get_dma_mr(struct ib_pd *pd, int acc); + +struct ib_mr *qib_reg_phys_mr(struct ib_pd *pd, + struct ib_phys_buf *buffer_list, + int num_phys_buf, int acc, u64 *iova_start); + +struct ib_mr *qib_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, + u64 virt_addr, int mr_access_flags, + struct ib_udata *udata); + +int qib_dereg_mr(struct ib_mr *ibmr); + +struct ib_mr *qib_alloc_fast_reg_mr(struct ib_pd *pd, int max_page_list_len); + +struct ib_fast_reg_page_list *qib_alloc_fast_reg_page_list( + struct ib_device *ibdev, int page_list_len); + +void qib_free_fast_reg_page_list(struct ib_fast_reg_page_list *pl); + +int qib_fast_reg_mr(struct qib_qp *qp, struct ib_send_wr *wr); + +struct ib_fmr *qib_alloc_fmr(struct ib_pd *pd, int mr_access_flags, + struct ib_fmr_attr *fmr_attr); + +int qib_map_phys_fmr(struct ib_fmr *ibfmr, u64 *page_list, + int list_len, u64 iova); + +int qib_unmap_fmr(struct list_head *fmr_list); + +int qib_dealloc_fmr(struct ib_fmr *ibfmr); + +static inline void qib_get_mr(struct qib_mregion *mr) +{ + atomic_inc(&mr->refcount); +} + +void mr_rcu_callback(struct rcu_head *list); + +static inline void qib_put_mr(struct qib_mregion *mr) +{ + if (unlikely(atomic_dec_and_test(&mr->refcount))) + call_rcu(&mr->list, mr_rcu_callback); +} + +static inline void qib_put_ss(struct qib_sge_state *ss) +{ + while (ss->num_sge) { + qib_put_mr(ss->sge.mr); + if (--ss->num_sge) + ss->sge = *ss->sg_list++; + } +} + + +void qib_release_mmap_info(struct kref *ref); + +struct qib_mmap_info *qib_create_mmap_info(struct qib_ibdev *dev, u32 
size, + struct ib_ucontext *context, + void *obj); + +void qib_update_mmap_info(struct qib_ibdev *dev, struct qib_mmap_info *ip, + u32 size, void *obj); + +int qib_mmap(struct ib_ucontext *context, struct vm_area_struct *vma); + +int qib_get_rwqe(struct qib_qp *qp, int wr_id_only); + +void qib_migrate_qp(struct qib_qp *qp); + +int qib_ruc_check_hdr(struct qib_ibport *ibp, struct qib_ib_header *hdr, + int has_grh, struct qib_qp *qp, u32 bth0); + +u32 qib_make_grh(struct qib_ibport *ibp, struct ib_grh *hdr, + struct ib_global_route *grh, u32 hwords, u32 nwords); + +void qib_make_ruc_header(struct qib_qp *qp, struct qib_other_headers *ohdr, + u32 bth0, u32 bth2); + +void qib_do_send(struct work_struct *work); + +void qib_send_complete(struct qib_qp *qp, struct qib_swqe *wqe, + enum ib_wc_status status); + +void qib_send_rc_ack(struct qib_qp *qp); + +int qib_make_rc_req(struct qib_qp *qp); + +int qib_make_uc_req(struct qib_qp *qp); + +int qib_make_ud_req(struct qib_qp *qp); + +int qib_register_ib_device(struct qib_devdata *); + +void qib_unregister_ib_device(struct qib_devdata *); + +void qib_ib_rcv(struct qib_ctxtdata *, void *, void *, u32); + +void qib_ib_piobufavail(struct qib_devdata *); + +unsigned qib_get_npkeys(struct qib_devdata *); + +unsigned qib_get_pkey(struct qib_ibport *, unsigned); + +extern const enum ib_wc_opcode ib_qib_wc_opcode[]; + +/* + * Below HCA-independent IB PhysPortState values, returned + * by the f_ibphys_portstate() routine. + */ +#define IB_PHYSPORTSTATE_SLEEP 1 +#define IB_PHYSPORTSTATE_POLL 2 +#define IB_PHYSPORTSTATE_DISABLED 3 +#define IB_PHYSPORTSTATE_CFG_TRAIN 4 +#define IB_PHYSPORTSTATE_LINKUP 5 +#define IB_PHYSPORTSTATE_LINK_ERR_RECOVER 6 +#define IB_PHYSPORTSTATE_CFG_DEBOUNCE 8 +#define IB_PHYSPORTSTATE_CFG_IDLE 0xB +#define IB_PHYSPORTSTATE_RECOVERY_RETRAIN 0xC +#define IB_PHYSPORTSTATE_RECOVERY_WAITRMT 0xE +#define IB_PHYSPORTSTATE_RECOVERY_IDLE 0xF +#define IB_PHYSPORTSTATE_CFG_ENH 0x10 +#define IB_PHYSPORTSTATE_CFG_WAIT_ENH 0x13 + +extern const int ib_qib_state_ops[]; + +extern __be64 ib_qib_sys_image_guid; /* in network order */ + +extern unsigned int ib_qib_lkey_table_size; + +extern unsigned int ib_qib_max_cqes; + +extern unsigned int ib_qib_max_cqs; + +extern unsigned int ib_qib_max_qp_wrs; + +extern unsigned int ib_qib_max_qps; + +extern unsigned int ib_qib_max_sges; + +extern unsigned int ib_qib_max_mcast_grps; + +extern unsigned int ib_qib_max_mcast_qp_attached; + +extern unsigned int ib_qib_max_srqs; + +extern unsigned int ib_qib_max_srq_sges; + +extern unsigned int ib_qib_max_srq_wrs; + +extern const u32 ib_qib_rnr_table[]; + +extern struct ib_dma_mapping_ops qib_dma_mapping_ops; + +#endif /* QIB_VERBS_H */ diff --git a/kernel/drivers/infiniband/hw/qib/qib_verbs_mcast.c b/kernel/drivers/infiniband/hw/qib/qib_verbs_mcast.c new file mode 100644 index 000000000..f8ea069a3 --- /dev/null +++ b/kernel/drivers/infiniband/hw/qib/qib_verbs_mcast.c @@ -0,0 +1,368 @@ +/* + * Copyright (c) 2006, 2007, 2008, 2009 QLogic Corporation. All rights reserved. + * Copyright (c) 2005, 2006 PathScale, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. 
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include + +#include "qib.h" + +/** + * qib_mcast_qp_alloc - alloc a struct to link a QP to mcast GID struct + * @qp: the QP to link + */ +static struct qib_mcast_qp *qib_mcast_qp_alloc(struct qib_qp *qp) +{ + struct qib_mcast_qp *mqp; + + mqp = kmalloc(sizeof(*mqp), GFP_KERNEL); + if (!mqp) + goto bail; + + mqp->qp = qp; + atomic_inc(&qp->refcount); + +bail: + return mqp; +} + +static void qib_mcast_qp_free(struct qib_mcast_qp *mqp) +{ + struct qib_qp *qp = mqp->qp; + + /* Notify qib_destroy_qp() if it is waiting. */ + if (atomic_dec_and_test(&qp->refcount)) + wake_up(&qp->wait); + + kfree(mqp); +} + +/** + * qib_mcast_alloc - allocate the multicast GID structure + * @mgid: the multicast GID + * + * A list of QPs will be attached to this structure. + */ +static struct qib_mcast *qib_mcast_alloc(union ib_gid *mgid) +{ + struct qib_mcast *mcast; + + mcast = kmalloc(sizeof(*mcast), GFP_KERNEL); + if (!mcast) + goto bail; + + mcast->mgid = *mgid; + INIT_LIST_HEAD(&mcast->qp_list); + init_waitqueue_head(&mcast->wait); + atomic_set(&mcast->refcount, 0); + mcast->n_attached = 0; + +bail: + return mcast; +} + +static void qib_mcast_free(struct qib_mcast *mcast) +{ + struct qib_mcast_qp *p, *tmp; + + list_for_each_entry_safe(p, tmp, &mcast->qp_list, list) + qib_mcast_qp_free(p); + + kfree(mcast); +} + +/** + * qib_mcast_find - search the global table for the given multicast GID + * @ibp: the IB port structure + * @mgid: the multicast GID to search for + * + * Returns NULL if not found. + * + * The caller is responsible for decrementing the reference count if found. 
+ */ +struct qib_mcast *qib_mcast_find(struct qib_ibport *ibp, union ib_gid *mgid) +{ + struct rb_node *n; + unsigned long flags; + struct qib_mcast *mcast; + + spin_lock_irqsave(&ibp->lock, flags); + n = ibp->mcast_tree.rb_node; + while (n) { + int ret; + + mcast = rb_entry(n, struct qib_mcast, rb_node); + + ret = memcmp(mgid->raw, mcast->mgid.raw, + sizeof(union ib_gid)); + if (ret < 0) + n = n->rb_left; + else if (ret > 0) + n = n->rb_right; + else { + atomic_inc(&mcast->refcount); + spin_unlock_irqrestore(&ibp->lock, flags); + goto bail; + } + } + spin_unlock_irqrestore(&ibp->lock, flags); + + mcast = NULL; + +bail: + return mcast; +} + +/** + * qib_mcast_add - insert mcast GID into table and attach QP struct + * @mcast: the mcast GID table + * @mqp: the QP to attach + * + * Return zero if both were added. Return EEXIST if the GID was already in + * the table but the QP was added. Return ESRCH if the QP was already + * attached and neither structure was added. + */ +static int qib_mcast_add(struct qib_ibdev *dev, struct qib_ibport *ibp, + struct qib_mcast *mcast, struct qib_mcast_qp *mqp) +{ + struct rb_node **n = &ibp->mcast_tree.rb_node; + struct rb_node *pn = NULL; + int ret; + + spin_lock_irq(&ibp->lock); + + while (*n) { + struct qib_mcast *tmcast; + struct qib_mcast_qp *p; + + pn = *n; + tmcast = rb_entry(pn, struct qib_mcast, rb_node); + + ret = memcmp(mcast->mgid.raw, tmcast->mgid.raw, + sizeof(union ib_gid)); + if (ret < 0) { + n = &pn->rb_left; + continue; + } + if (ret > 0) { + n = &pn->rb_right; + continue; + } + + /* Search the QP list to see if this is already there. */ + list_for_each_entry_rcu(p, &tmcast->qp_list, list) { + if (p->qp == mqp->qp) { + ret = ESRCH; + goto bail; + } + } + if (tmcast->n_attached == ib_qib_max_mcast_qp_attached) { + ret = ENOMEM; + goto bail; + } + + tmcast->n_attached++; + + list_add_tail_rcu(&mqp->list, &tmcast->qp_list); + ret = EEXIST; + goto bail; + } + + spin_lock(&dev->n_mcast_grps_lock); + if (dev->n_mcast_grps_allocated == ib_qib_max_mcast_grps) { + spin_unlock(&dev->n_mcast_grps_lock); + ret = ENOMEM; + goto bail; + } + + dev->n_mcast_grps_allocated++; + spin_unlock(&dev->n_mcast_grps_lock); + + mcast->n_attached++; + + list_add_tail_rcu(&mqp->list, &mcast->qp_list); + + atomic_inc(&mcast->refcount); + rb_link_node(&mcast->rb_node, pn, n); + rb_insert_color(&mcast->rb_node, &ibp->mcast_tree); + + ret = 0; + +bail: + spin_unlock_irq(&ibp->lock); + + return ret; +} + +int qib_multicast_attach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid) +{ + struct qib_qp *qp = to_iqp(ibqp); + struct qib_ibdev *dev = to_idev(ibqp->device); + struct qib_ibport *ibp; + struct qib_mcast *mcast; + struct qib_mcast_qp *mqp; + int ret; + + if (ibqp->qp_num <= 1 || qp->state == IB_QPS_RESET) { + ret = -EINVAL; + goto bail; + } + + /* + * Allocate data structures since its better to do this outside of + * spin locks and it will most likely be needed. + */ + mcast = qib_mcast_alloc(gid); + if (mcast == NULL) { + ret = -ENOMEM; + goto bail; + } + mqp = qib_mcast_qp_alloc(qp); + if (mqp == NULL) { + qib_mcast_free(mcast); + ret = -ENOMEM; + goto bail; + } + ibp = to_iport(ibqp->device, qp->port_num); + switch (qib_mcast_add(dev, ibp, mcast, mqp)) { + case ESRCH: + /* Neither was used: OK to attach the same QP twice. */ + qib_mcast_qp_free(mqp); + qib_mcast_free(mcast); + break; + + case EEXIST: /* The mcast wasn't used */ + qib_mcast_free(mcast); + break; + + case ENOMEM: + /* Exceeded the maximum number of mcast groups. 
*/ + qib_mcast_qp_free(mqp); + qib_mcast_free(mcast); + ret = -ENOMEM; + goto bail; + + default: + break; + } + + ret = 0; + +bail: + return ret; +} + +int qib_multicast_detach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid) +{ + struct qib_qp *qp = to_iqp(ibqp); + struct qib_ibdev *dev = to_idev(ibqp->device); + struct qib_ibport *ibp = to_iport(ibqp->device, qp->port_num); + struct qib_mcast *mcast = NULL; + struct qib_mcast_qp *p, *tmp; + struct rb_node *n; + int last = 0; + int ret; + + if (ibqp->qp_num <= 1 || qp->state == IB_QPS_RESET) { + ret = -EINVAL; + goto bail; + } + + spin_lock_irq(&ibp->lock); + + /* Find the GID in the mcast table. */ + n = ibp->mcast_tree.rb_node; + while (1) { + if (n == NULL) { + spin_unlock_irq(&ibp->lock); + ret = -EINVAL; + goto bail; + } + + mcast = rb_entry(n, struct qib_mcast, rb_node); + ret = memcmp(gid->raw, mcast->mgid.raw, + sizeof(union ib_gid)); + if (ret < 0) + n = n->rb_left; + else if (ret > 0) + n = n->rb_right; + else + break; + } + + /* Search the QP list. */ + list_for_each_entry_safe(p, tmp, &mcast->qp_list, list) { + if (p->qp != qp) + continue; + /* + * We found it, so remove it, but don't poison the forward + * link until we are sure there are no list walkers. + */ + list_del_rcu(&p->list); + mcast->n_attached--; + + /* If this was the last attached QP, remove the GID too. */ + if (list_empty(&mcast->qp_list)) { + rb_erase(&mcast->rb_node, &ibp->mcast_tree); + last = 1; + } + break; + } + + spin_unlock_irq(&ibp->lock); + + if (p) { + /* + * Wait for any list walkers to finish before freeing the + * list element. + */ + wait_event(mcast->wait, atomic_read(&mcast->refcount) <= 1); + qib_mcast_qp_free(p); + } + if (last) { + atomic_dec(&mcast->refcount); + wait_event(mcast->wait, !atomic_read(&mcast->refcount)); + qib_mcast_free(mcast); + spin_lock_irq(&dev->n_mcast_grps_lock); + dev->n_mcast_grps_allocated--; + spin_unlock_irq(&dev->n_mcast_grps_lock); + } + + ret = 0; + +bail: + return ret; +} + +int qib_mcast_tree_empty(struct qib_ibport *ibp) +{ + return ibp->mcast_tree.rb_node == NULL; +} diff --git a/kernel/drivers/infiniband/hw/qib/qib_wc_ppc64.c b/kernel/drivers/infiniband/hw/qib/qib_wc_ppc64.c new file mode 100644 index 000000000..673cf4c22 --- /dev/null +++ b/kernel/drivers/infiniband/hw/qib/qib_wc_ppc64.c @@ -0,0 +1,62 @@ +/* + * Copyright (c) 2006, 2007, 2008 QLogic Corporation. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +/* + * This file is conditionally built on PowerPC only. Otherwise weak symbol + * versions of the functions exported from here are used. + */ + +#include "qib.h" + +/** + * qib_enable_wc - enable write combining for MMIO writes to the device + * @dd: qlogic_ib device + * + * Nothing to do on PowerPC, so just return without error. + */ +int qib_enable_wc(struct qib_devdata *dd) +{ + return 0; +} + +/** + * qib_unordered_wc - indicate whether write combining is unordered + * + * Because our performance depends on our ability to do write + * combining mmio writes in the most efficient way, we need to + * know if we are on a processor that may reorder stores when + * write combining. + */ +int qib_unordered_wc(void) +{ + return 1; +} diff --git a/kernel/drivers/infiniband/hw/qib/qib_wc_x86_64.c b/kernel/drivers/infiniband/hw/qib/qib_wc_x86_64.c new file mode 100644 index 000000000..edd0ddbd4 --- /dev/null +++ b/kernel/drivers/infiniband/hw/qib/qib_wc_x86_64.c @@ -0,0 +1,150 @@ +/* + * Copyright (c) 2012 Intel Corporation. All rights reserved. + * Copyright (c) 2006 - 2012 QLogic Corporation. All rights reserved. + * Copyright (c) 2003, 2004, 2005, 2006 PathScale, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +/* + * This file is conditionally built on x86_64 only. Otherwise weak symbol + * versions of the functions exported from here are used. + */ + +#include +#include +#include + +#include "qib.h" + +/** + * qib_enable_wc - enable write combining for MMIO writes to the device + * @dd: qlogic_ib device + * + * This routine is x86_64-specific; it twiddles the CPU's MTRRs to enable + * write combining. + */ +int qib_enable_wc(struct qib_devdata *dd) +{ + int ret = 0; + u64 pioaddr, piolen; + unsigned bits; + const unsigned long addr = pci_resource_start(dd->pcidev, 0); + const size_t len = pci_resource_len(dd->pcidev, 0); + + /* + * Set the PIO buffers to be WCCOMB, so we get HT bursts to the + * chip. 
Linux (possibly the hardware) requires it to be on a power + * of 2 address matching the length (which has to be a power of 2). + * For rev1, that means the base address, for rev2, it will be just + * the PIO buffers themselves. + * For chips with two sets of buffers, the calculations are + * somewhat more complicated; we need to sum, and the piobufbase + * register has both offsets, 2K in low 32 bits, 4K in high 32 bits. + * The buffers are still packed, so a single range covers both. + */ + if (dd->piobcnt2k && dd->piobcnt4k) { + /* 2 sizes for chip */ + unsigned long pio2kbase, pio4kbase; + + pio2kbase = dd->piobufbase & 0xffffffffUL; + pio4kbase = (dd->piobufbase >> 32) & 0xffffffffUL; + if (pio2kbase < pio4kbase) { + /* all current chips */ + pioaddr = addr + pio2kbase; + piolen = pio4kbase - pio2kbase + + dd->piobcnt4k * dd->align4k; + } else { + pioaddr = addr + pio4kbase; + piolen = pio2kbase - pio4kbase + + dd->piobcnt2k * dd->palign; + } + } else { /* single buffer size (2K, currently) */ + pioaddr = addr + dd->piobufbase; + piolen = dd->piobcnt2k * dd->palign + + dd->piobcnt4k * dd->align4k; + } + + for (bits = 0; !(piolen & (1ULL << bits)); bits++) + ; /* do nothing */ + + if (piolen != (1ULL << bits)) { + piolen >>= bits; + while (piolen >>= 1) + bits++; + piolen = 1ULL << (bits + 1); + } + if (pioaddr & (piolen - 1)) { + u64 atmp = pioaddr & ~(piolen - 1); + + if (atmp < addr || (atmp + piolen) > (addr + len)) { + qib_dev_err(dd, + "No way to align address/size (%llx/%llx), no WC mtrr\n", + (unsigned long long) atmp, + (unsigned long long) piolen << 1); + ret = -ENODEV; + } else { + pioaddr = atmp; + piolen <<= 1; + } + } + + if (!ret) { + dd->wc_cookie = arch_phys_wc_add(pioaddr, piolen); + if (dd->wc_cookie < 0) + /* use error from routine */ + ret = dd->wc_cookie; + } + + return ret; +} + +/** + * qib_disable_wc - disable write combining for MMIO writes to the device + * @dd: qlogic_ib device + */ +void qib_disable_wc(struct qib_devdata *dd) +{ + arch_phys_wc_del(dd->wc_cookie); +} + +/** + * qib_unordered_wc - indicate whether write combining is ordered + * + * Because our performance depends on our ability to do write combining mmio + * writes in the most efficient way, we need to know if we are on an Intel + * or AMD x86_64 processor. AMD x86_64 processors flush WC buffers out in + * the order completed, and so no special flushing is required to get + * correct ordering. Intel processors, however, will flush write buffers + * out in "random" orders, and so explicit ordering is needed at times. + */ +int qib_unordered_wc(void) +{ + return boot_cpu_data.x86_vendor != X86_VENDOR_AMD; +} diff --git a/kernel/drivers/infiniband/hw/usnic/Kconfig b/kernel/drivers/infiniband/hw/usnic/Kconfig new file mode 100644 index 000000000..29ab11c34 --- /dev/null +++ b/kernel/drivers/infiniband/hw/usnic/Kconfig @@ -0,0 +1,10 @@ +config INFINIBAND_USNIC + tristate "Verbs support for Cisco VIC" + depends on NETDEVICES && ETHERNET && INET && PCI && INTEL_IOMMU + select ENIC + select NET_VENDOR_CISCO + select PCI_IOV + select INFINIBAND_USER_ACCESS + ---help--- + This is a low-level driver for Cisco's Virtual Interface + Cards (VICs), including the VIC 1240 and 1280 cards. 
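The qib_enable_wc() routine above rounds the PIO buffer length up to a power of two and aligns the base address down to that length, because a write-combining range must be a naturally aligned power-of-two region; when the base has to be realigned, the length is doubled so the range still covers the buffers, and the driver then checks that the enlarged range stays inside the BAR before calling arch_phys_wc_add(). A minimal user-space sketch of that sizing step follows, using hypothetical pioaddr/piolen values purely for illustration; it is not part of this patch and omits the BAR bounds check.

/* Illustration only: the power-of-two sizing/alignment used by qib_enable_wc(). */
#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint64_t pioaddr = 0x90001000ULL;	/* hypothetical PIO base inside the BAR */
	uint64_t piolen  = 0x00024000ULL;	/* hypothetical combined 2K+4K buffer length */
	unsigned bits;

	/* Find the lowest set bit of the length. */
	for (bits = 0; !(piolen & (1ULL << bits)); bits++)
		;

	/* If the length is not already a power of two, round it up to the next one. */
	if (piolen != (1ULL << bits)) {
		piolen >>= bits;
		while (piolen >>= 1)
			bits++;
		piolen = 1ULL << (bits + 1);
	}

	/* Align the base down to the length; double the length so the buffers stay covered. */
	if (pioaddr & (piolen - 1)) {
		pioaddr &= ~(piolen - 1);
		piolen <<= 1;
	}

	printf("WC range: base 0x%llx len 0x%llx\n",
	       (unsigned long long)pioaddr, (unsigned long long)piolen);
	return 0;
}

With the sample values above this prints a base of 0x90000000 and a length of 0x80000, i.e. the 0x24000-byte region is first rounded up to 0x40000, then doubled once more because the base was not aligned to that size.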
diff --git a/kernel/drivers/infiniband/hw/usnic/Makefile b/kernel/drivers/infiniband/hw/usnic/Makefile new file mode 100644 index 000000000..99fb2db47 --- /dev/null +++ b/kernel/drivers/infiniband/hw/usnic/Makefile @@ -0,0 +1,15 @@ +ccflags-y := -Idrivers/net/ethernet/cisco/enic + +obj-$(CONFIG_INFINIBAND_USNIC)+= usnic_verbs.o + +usnic_verbs-y=\ +usnic_fwd.o \ +usnic_transport.o \ +usnic_uiom.o \ +usnic_uiom_interval_tree.o \ +usnic_vnic.o \ +usnic_ib_main.o \ +usnic_ib_qp_grp.o \ +usnic_ib_sysfs.o \ +usnic_ib_verbs.o \ +usnic_debugfs.o \ diff --git a/kernel/drivers/infiniband/hw/usnic/usnic.h b/kernel/drivers/infiniband/hw/usnic/usnic.h new file mode 100644 index 000000000..5be13d899 --- /dev/null +++ b/kernel/drivers/infiniband/hw/usnic/usnic.h @@ -0,0 +1,29 @@ +/* + * Copyright (c) 2013, Cisco Systems, Inc. All rights reserved. + * + * This program is free software; you may redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; version 2 of the License. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ + +#ifndef USNIC_H_ +#define USNIC_H_ + +#define DRV_NAME "usnic_verbs" + +#define PCI_DEVICE_ID_CISCO_VIC_USPACE_NIC 0x00cf /* User space NIC */ + +#define DRV_VERSION "1.0.3" +#define DRV_RELDATE "December 19, 2013" + +#endif /* USNIC_H_ */ diff --git a/kernel/drivers/infiniband/hw/usnic/usnic_abi.h b/kernel/drivers/infiniband/hw/usnic/usnic_abi.h new file mode 100644 index 000000000..04a662295 --- /dev/null +++ b/kernel/drivers/infiniband/hw/usnic/usnic_abi.h @@ -0,0 +1,73 @@ +/* + * Copyright (c) 2013, Cisco Systems, Inc. All rights reserved. + * + * This program is free software; you may redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; version 2 of the License. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + */ + + +#ifndef USNIC_ABI_H +#define USNIC_ABI_H + +/* ABI between userspace and kernel */ +#define USNIC_UVERBS_ABI_VERSION 4 + +#define USNIC_QP_GRP_MAX_WQS 8 +#define USNIC_QP_GRP_MAX_RQS 8 +#define USNIC_QP_GRP_MAX_CQS 16 + +enum usnic_transport_type { + USNIC_TRANSPORT_UNKNOWN = 0, + USNIC_TRANSPORT_ROCE_CUSTOM = 1, + USNIC_TRANSPORT_IPV4_UDP = 2, + USNIC_TRANSPORT_MAX = 3, +}; + +struct usnic_transport_spec { + enum usnic_transport_type trans_type; + union { + struct { + uint16_t port_num; + } usnic_roce; + struct { + uint32_t sock_fd; + } udp; + }; +}; + +struct usnic_ib_create_qp_cmd { + struct usnic_transport_spec spec; +}; + +/*TODO: Future - usnic_modify_qp needs to pass in generic filters */ +struct usnic_ib_create_qp_resp { + u32 vfid; + u32 qp_grp_id; + u64 bar_bus_addr; + u32 bar_len; +/* + * WQ, RQ, CQ are explicity specified bc exposing a generic resources inteface + * expands the scope of ABI to many files. + */ + u32 wq_cnt; + u32 rq_cnt; + u32 cq_cnt; + u32 wq_idx[USNIC_QP_GRP_MAX_WQS]; + u32 rq_idx[USNIC_QP_GRP_MAX_RQS]; + u32 cq_idx[USNIC_QP_GRP_MAX_CQS]; + u32 transport; + u32 reserved[9]; +}; + +#endif /* USNIC_ABI_H */ diff --git a/kernel/drivers/infiniband/hw/usnic/usnic_common_pkt_hdr.h b/kernel/drivers/infiniband/hw/usnic/usnic_common_pkt_hdr.h new file mode 100644 index 000000000..393567266 --- /dev/null +++ b/kernel/drivers/infiniband/hw/usnic/usnic_common_pkt_hdr.h @@ -0,0 +1,27 @@ +/* + * Copyright (c) 2013, Cisco Systems, Inc. All rights reserved. + * + * This program is free software; you may redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; version 2 of the License. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ + +#ifndef USNIC_CMN_PKT_HDR_H +#define USNIC_CMN_PKT_HDR_H + +#define USNIC_ROCE_ETHERTYPE (0x8915) +#define USNIC_ROCE_GRH_VER (8) +#define USNIC_PROTO_VER (1) +#define USNIC_ROCE_GRH_VER_SHIFT (4) + +#endif /* USNIC_COMMON_PKT_HDR_H */ diff --git a/kernel/drivers/infiniband/hw/usnic/usnic_common_util.h b/kernel/drivers/infiniband/hw/usnic/usnic_common_util.h new file mode 100644 index 000000000..9d737ed5e --- /dev/null +++ b/kernel/drivers/infiniband/hw/usnic/usnic_common_util.h @@ -0,0 +1,68 @@ +/* + * Copyright (c) 2013, Cisco Systems, Inc. All rights reserved. + * + * This program is free software; you may redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; version 2 of the License. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + */ + +#ifndef USNIC_CMN_UTIL_H +#define USNIC_CMN_UTIL_H + +static inline void +usnic_mac_to_gid(const char *const mac, char *raw_gid) +{ + raw_gid[0] = 0xfe; + raw_gid[1] = 0x80; + memset(&raw_gid[2], 0, 6); + raw_gid[8] = mac[0]^2; + raw_gid[9] = mac[1]; + raw_gid[10] = mac[2]; + raw_gid[11] = 0xff; + raw_gid[12] = 0xfe; + raw_gid[13] = mac[3]; + raw_gid[14] = mac[4]; + raw_gid[15] = mac[5]; +} + +static inline void +usnic_mac_ip_to_gid(const char *const mac, const __be32 inaddr, char *raw_gid) +{ + raw_gid[0] = 0xfe; + raw_gid[1] = 0x80; + memset(&raw_gid[2], 0, 2); + memcpy(&raw_gid[4], &inaddr, 4); + raw_gid[8] = mac[0]^2; + raw_gid[9] = mac[1]; + raw_gid[10] = mac[2]; + raw_gid[11] = 0xff; + raw_gid[12] = 0xfe; + raw_gid[13] = mac[3]; + raw_gid[14] = mac[4]; + raw_gid[15] = mac[5]; +} + +static inline void +usnic_write_gid_if_id_from_mac(char *mac, char *raw_gid) +{ + raw_gid[8] = mac[0]^2; + raw_gid[9] = mac[1]; + raw_gid[10] = mac[2]; + raw_gid[11] = 0xff; + raw_gid[12] = 0xfe; + raw_gid[13] = mac[3]; + raw_gid[14] = mac[4]; + raw_gid[15] = mac[5]; +} + +#endif /* USNIC_COMMON_UTIL_H */ diff --git a/kernel/drivers/infiniband/hw/usnic/usnic_debugfs.c b/kernel/drivers/infiniband/hw/usnic/usnic_debugfs.c new file mode 100644 index 000000000..5d1386016 --- /dev/null +++ b/kernel/drivers/infiniband/hw/usnic/usnic_debugfs.c @@ -0,0 +1,154 @@ +/* + * Copyright (c) 2013, Cisco Systems, Inc. All rights reserved. + * + * This program is free software; you may redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; version 2 of the License. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + */ + +#include +#include + +#include "usnic.h" +#include "usnic_log.h" +#include "usnic_debugfs.h" +#include "usnic_ib_qp_grp.h" +#include "usnic_transport.h" + +static struct dentry *debugfs_root; +static struct dentry *flows_dentry; + +static ssize_t usnic_debugfs_buildinfo_read(struct file *f, char __user *data, + size_t count, loff_t *ppos) +{ + char buf[500]; + int res; + + if (*ppos > 0) + return 0; + + res = scnprintf(buf, sizeof(buf), + "version: %s\n" + "build date: %s\n", + DRV_VERSION, DRV_RELDATE); + + return simple_read_from_buffer(data, count, ppos, buf, res); +} + +static const struct file_operations usnic_debugfs_buildinfo_ops = { + .owner = THIS_MODULE, + .open = simple_open, + .read = usnic_debugfs_buildinfo_read +}; + +static ssize_t flowinfo_read(struct file *f, char __user *data, + size_t count, loff_t *ppos) +{ + struct usnic_ib_qp_grp_flow *qp_flow; + int n; + int left; + char *ptr; + char buf[512]; + + qp_flow = f->private_data; + ptr = buf; + left = count; + + if (*ppos > 0) + return 0; + + spin_lock(&qp_flow->qp_grp->lock); + n = scnprintf(ptr, left, + "QP Grp ID: %d Transport: %s ", + qp_flow->qp_grp->grp_id, + usnic_transport_to_str(qp_flow->trans_type)); + UPDATE_PTR_LEFT(n, ptr, left); + if (qp_flow->trans_type == USNIC_TRANSPORT_ROCE_CUSTOM) { + n = scnprintf(ptr, left, "Port_Num:%hu\n", + qp_flow->usnic_roce.port_num); + UPDATE_PTR_LEFT(n, ptr, left); + } else if (qp_flow->trans_type == USNIC_TRANSPORT_IPV4_UDP) { + n = usnic_transport_sock_to_str(ptr, left, + qp_flow->udp.sock); + UPDATE_PTR_LEFT(n, ptr, left); + n = scnprintf(ptr, left, "\n"); + UPDATE_PTR_LEFT(n, ptr, left); + } + spin_unlock(&qp_flow->qp_grp->lock); + + return simple_read_from_buffer(data, count, ppos, buf, ptr - buf); +} + +static const struct file_operations flowinfo_ops = { + .owner = THIS_MODULE, + .open = simple_open, + .read = flowinfo_read, +}; + +void usnic_debugfs_init(void) +{ + debugfs_root = debugfs_create_dir(DRV_NAME, NULL); + if (IS_ERR(debugfs_root)) { + usnic_err("Failed to create debugfs root dir, check if debugfs is enabled in kernel configuration\n"); + goto out_clear_root; + } + + flows_dentry = debugfs_create_dir("flows", debugfs_root); + if (IS_ERR_OR_NULL(flows_dentry)) { + usnic_err("Failed to create debugfs flow dir with err %ld\n", + PTR_ERR(flows_dentry)); + goto out_free_root; + } + + debugfs_create_file("build-info", S_IRUGO, debugfs_root, + NULL, &usnic_debugfs_buildinfo_ops); + return; + +out_free_root: + debugfs_remove_recursive(debugfs_root); +out_clear_root: + debugfs_root = NULL; +} + +void usnic_debugfs_exit(void) +{ + if (!debugfs_root) + return; + + debugfs_remove_recursive(debugfs_root); + debugfs_root = NULL; +} + +void usnic_debugfs_flow_add(struct usnic_ib_qp_grp_flow *qp_flow) +{ + if (IS_ERR_OR_NULL(flows_dentry)) + return; + + scnprintf(qp_flow->dentry_name, sizeof(qp_flow->dentry_name), + "%u", qp_flow->flow->flow_id); + qp_flow->dbgfs_dentry = debugfs_create_file(qp_flow->dentry_name, + S_IRUGO, + flows_dentry, + qp_flow, + &flowinfo_ops); + if (IS_ERR_OR_NULL(qp_flow->dbgfs_dentry)) { + usnic_err("Failed to create dbg fs entry for flow %u\n", + qp_flow->flow->flow_id); + } +} + +void usnic_debugfs_flow_remove(struct usnic_ib_qp_grp_flow *qp_flow) +{ + if (!IS_ERR_OR_NULL(qp_flow->dbgfs_dentry)) + debugfs_remove(qp_flow->dbgfs_dentry); +} diff --git a/kernel/drivers/infiniband/hw/usnic/usnic_debugfs.h b/kernel/drivers/infiniband/hw/usnic/usnic_debugfs.h new file mode 100644 index 000000000..4087d24a8 --- /dev/null +++ 
b/kernel/drivers/infiniband/hw/usnic/usnic_debugfs.h @@ -0,0 +1,29 @@ +/* + * Copyright (c) 2013, Cisco Systems, Inc. All rights reserved. + * + * This program is free software; you may redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; version 2 of the License. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ +#ifndef USNIC_DEBUGFS_H_ +#define USNIC_DEBUGFS_H_ + +#include "usnic_ib_qp_grp.h" + +void usnic_debugfs_init(void); + +void usnic_debugfs_exit(void); +void usnic_debugfs_flow_add(struct usnic_ib_qp_grp_flow *qp_flow); +void usnic_debugfs_flow_remove(struct usnic_ib_qp_grp_flow *qp_flow); + +#endif /*!USNIC_DEBUGFS_H_ */ diff --git a/kernel/drivers/infiniband/hw/usnic/usnic_fwd.c b/kernel/drivers/infiniband/hw/usnic/usnic_fwd.c new file mode 100644 index 000000000..e3c9bd9d3 --- /dev/null +++ b/kernel/drivers/infiniband/hw/usnic/usnic_fwd.c @@ -0,0 +1,350 @@ +/* + * Copyright (c) 2013, Cisco Systems, Inc. All rights reserved. + * + * This program is free software; you may redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; version 2 of the License. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + */ +#include +#include + +#include "enic_api.h" +#include "usnic_common_pkt_hdr.h" +#include "usnic_fwd.h" +#include "usnic_log.h" + +static int usnic_fwd_devcmd_locked(struct usnic_fwd_dev *ufdev, int vnic_idx, + enum vnic_devcmd_cmd cmd, u64 *a0, + u64 *a1) +{ + int status; + struct net_device *netdev = ufdev->netdev; + + lockdep_assert_held(&ufdev->lock); + + status = enic_api_devcmd_proxy_by_index(netdev, + vnic_idx, + cmd, + a0, a1, + 1000); + if (status) { + if (status == ERR_EINVAL && cmd == CMD_DEL_FILTER) { + usnic_dbg("Dev %s vnic idx %u cmd %u already deleted", + ufdev->name, vnic_idx, cmd); + } else { + usnic_err("Dev %s vnic idx %u cmd %u failed with status %d\n", + ufdev->name, vnic_idx, cmd, + status); + } + } else { + usnic_dbg("Dev %s vnic idx %u cmd %u success", + ufdev->name, vnic_idx, cmd); + } + + return status; +} + +static int usnic_fwd_devcmd(struct usnic_fwd_dev *ufdev, int vnic_idx, + enum vnic_devcmd_cmd cmd, u64 *a0, u64 *a1) +{ + int status; + + spin_lock(&ufdev->lock); + status = usnic_fwd_devcmd_locked(ufdev, vnic_idx, cmd, a0, a1); + spin_unlock(&ufdev->lock); + + return status; +} + +struct usnic_fwd_dev *usnic_fwd_dev_alloc(struct pci_dev *pdev) +{ + struct usnic_fwd_dev *ufdev; + + ufdev = kzalloc(sizeof(*ufdev), GFP_KERNEL); + if (!ufdev) + return NULL; + + ufdev->pdev = pdev; + ufdev->netdev = pci_get_drvdata(pdev); + spin_lock_init(&ufdev->lock); + strncpy(ufdev->name, netdev_name(ufdev->netdev), + sizeof(ufdev->name) - 1); + + return ufdev; +} + +void usnic_fwd_dev_free(struct usnic_fwd_dev *ufdev) +{ + kfree(ufdev); +} + +void usnic_fwd_set_mac(struct usnic_fwd_dev *ufdev, char mac[ETH_ALEN]) +{ + spin_lock(&ufdev->lock); + memcpy(&ufdev->mac, mac, sizeof(ufdev->mac)); + spin_unlock(&ufdev->lock); +} + +int usnic_fwd_add_ipaddr(struct usnic_fwd_dev *ufdev, __be32 inaddr) +{ + int status; + + spin_lock(&ufdev->lock); + if (ufdev->inaddr == 0) { + ufdev->inaddr = inaddr; + status = 0; + } else { + status = -EFAULT; + } + spin_unlock(&ufdev->lock); + + return status; +} + +void usnic_fwd_del_ipaddr(struct usnic_fwd_dev *ufdev) +{ + spin_lock(&ufdev->lock); + ufdev->inaddr = 0; + spin_unlock(&ufdev->lock); +} + +void usnic_fwd_carrier_up(struct usnic_fwd_dev *ufdev) +{ + spin_lock(&ufdev->lock); + ufdev->link_up = 1; + spin_unlock(&ufdev->lock); +} + +void usnic_fwd_carrier_down(struct usnic_fwd_dev *ufdev) +{ + spin_lock(&ufdev->lock); + ufdev->link_up = 0; + spin_unlock(&ufdev->lock); +} + +void usnic_fwd_set_mtu(struct usnic_fwd_dev *ufdev, unsigned int mtu) +{ + spin_lock(&ufdev->lock); + ufdev->mtu = mtu; + spin_unlock(&ufdev->lock); +} + +static int usnic_fwd_dev_ready_locked(struct usnic_fwd_dev *ufdev) +{ + lockdep_assert_held(&ufdev->lock); + + if (!ufdev->link_up) + return -EPERM; + + return 0; +} + +static int validate_filter_locked(struct usnic_fwd_dev *ufdev, + struct filter *filter) +{ + + lockdep_assert_held(&ufdev->lock); + + if (filter->type == FILTER_IPV4_5TUPLE) { + if (!(filter->u.ipv4.flags & FILTER_FIELD_5TUP_DST_AD)) + return -EACCES; + if (!(filter->u.ipv4.flags & FILTER_FIELD_5TUP_DST_PT)) + return -EBUSY; + else if (ufdev->inaddr == 0) + return -EINVAL; + else if (filter->u.ipv4.dst_port == 0) + return -ERANGE; + else if (ntohl(ufdev->inaddr) != filter->u.ipv4.dst_addr) + return -EFAULT; + else + return 0; + } + + return 0; +} + +static void fill_tlv(struct filter_tlv *tlv, struct filter *filter, + struct filter_action *action) +{ + tlv->type = CLSF_TLV_FILTER; + tlv->length = sizeof(struct filter); + *((struct filter 
*)&tlv->val) = *filter; + + tlv = (struct filter_tlv *)((char *)tlv + sizeof(struct filter_tlv) + + sizeof(struct filter)); + tlv->type = CLSF_TLV_ACTION; + tlv->length = sizeof(struct filter_action); + *((struct filter_action *)&tlv->val) = *action; +} + +struct usnic_fwd_flow* +usnic_fwd_alloc_flow(struct usnic_fwd_dev *ufdev, struct filter *filter, + struct usnic_filter_action *uaction) +{ + struct filter_tlv *tlv; + struct pci_dev *pdev; + struct usnic_fwd_flow *flow; + uint64_t a0, a1; + uint64_t tlv_size; + dma_addr_t tlv_pa; + int status; + + pdev = ufdev->pdev; + tlv_size = (2*sizeof(struct filter_tlv) + sizeof(struct filter) + + sizeof(struct filter_action)); + + flow = kzalloc(sizeof(*flow), GFP_ATOMIC); + if (!flow) + return ERR_PTR(-ENOMEM); + + tlv = pci_alloc_consistent(pdev, tlv_size, &tlv_pa); + if (!tlv) { + usnic_err("Failed to allocate memory\n"); + status = -ENOMEM; + goto out_free_flow; + } + + fill_tlv(tlv, filter, &uaction->action); + + spin_lock(&ufdev->lock); + status = usnic_fwd_dev_ready_locked(ufdev); + if (status) { + usnic_err("Forwarding dev %s not ready with status %d\n", + ufdev->name, status); + goto out_free_tlv; + } + + status = validate_filter_locked(ufdev, filter); + if (status) { + usnic_err("Failed to validate filter with status %d\n", + status); + goto out_free_tlv; + } + + /* Issue Devcmd */ + a0 = tlv_pa; + a1 = tlv_size; + status = usnic_fwd_devcmd_locked(ufdev, uaction->vnic_idx, + CMD_ADD_FILTER, &a0, &a1); + if (status) { + usnic_err("VF %s Filter add failed with status:%d", + ufdev->name, status); + status = -EFAULT; + goto out_free_tlv; + } else { + usnic_dbg("VF %s FILTER ID:%llu", ufdev->name, a0); + } + + flow->flow_id = (uint32_t) a0; + flow->vnic_idx = uaction->vnic_idx; + flow->ufdev = ufdev; + +out_free_tlv: + spin_unlock(&ufdev->lock); + pci_free_consistent(pdev, tlv_size, tlv, tlv_pa); + if (!status) + return flow; +out_free_flow: + kfree(flow); + return ERR_PTR(status); +} + +int usnic_fwd_dealloc_flow(struct usnic_fwd_flow *flow) +{ + int status; + u64 a0, a1; + + a0 = flow->flow_id; + + status = usnic_fwd_devcmd(flow->ufdev, flow->vnic_idx, + CMD_DEL_FILTER, &a0, &a1); + if (status) { + if (status == ERR_EINVAL) { + usnic_dbg("Filter %u already deleted for VF Idx %u pf: %s status: %d", + flow->flow_id, flow->vnic_idx, + flow->ufdev->name, status); + } else { + usnic_err("PF %s VF Idx %u Filter: %u FILTER DELETE failed with status %d", + flow->ufdev->name, flow->vnic_idx, + flow->flow_id, status); + } + status = 0; + /* + * Log the error and fake success to the caller because if + * a flow fails to be deleted in the firmware, it is an + * unrecoverable error. 
+ */ + } else { + usnic_dbg("PF %s VF Idx %u Filter: %u FILTER DELETED", + flow->ufdev->name, flow->vnic_idx, + flow->flow_id); + } + + kfree(flow); + return status; +} + +int usnic_fwd_enable_qp(struct usnic_fwd_dev *ufdev, int vnic_idx, int qp_idx) +{ + int status; + struct net_device *pf_netdev; + u64 a0, a1; + + pf_netdev = ufdev->netdev; + a0 = qp_idx; + a1 = CMD_QP_RQWQ; + + status = usnic_fwd_devcmd(ufdev, vnic_idx, CMD_QP_ENABLE, + &a0, &a1); + if (status) { + usnic_err("PF %s VNIC Index %u RQ Index: %u ENABLE Failed with status %d", + netdev_name(pf_netdev), + vnic_idx, + qp_idx, + status); + } else { + usnic_dbg("PF %s VNIC Index %u RQ Index: %u ENABLED", + netdev_name(pf_netdev), + vnic_idx, qp_idx); + } + + return status; +} + +int usnic_fwd_disable_qp(struct usnic_fwd_dev *ufdev, int vnic_idx, int qp_idx) +{ + int status; + u64 a0, a1; + struct net_device *pf_netdev; + + pf_netdev = ufdev->netdev; + a0 = qp_idx; + a1 = CMD_QP_RQWQ; + + status = usnic_fwd_devcmd(ufdev, vnic_idx, CMD_QP_DISABLE, + &a0, &a1); + if (status) { + usnic_err("PF %s VNIC Index %u RQ Index: %u DISABLE Failed with status %d", + netdev_name(pf_netdev), + vnic_idx, + qp_idx, + status); + } else { + usnic_dbg("PF %s VNIC Index %u RQ Index: %u DISABLED", + netdev_name(pf_netdev), + vnic_idx, + qp_idx); + } + + return status; +} diff --git a/kernel/drivers/infiniband/hw/usnic/usnic_fwd.h b/kernel/drivers/infiniband/hw/usnic/usnic_fwd.h new file mode 100644 index 000000000..93713a223 --- /dev/null +++ b/kernel/drivers/infiniband/hw/usnic/usnic_fwd.h @@ -0,0 +1,113 @@ +/* + * Copyright (c) 2013, Cisco Systems, Inc. All rights reserved. + * + * This program is free software; you may redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; version 2 of the License. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ + +#ifndef USNIC_FWD_H_ +#define USNIC_FWD_H_ + +#include +#include +#include +#include + +#include "usnic_abi.h" +#include "usnic_common_pkt_hdr.h" +#include "vnic_devcmd.h" + +struct usnic_fwd_dev { + struct pci_dev *pdev; + struct net_device *netdev; + spinlock_t lock; + /* + * The following fields can be read directly off the device. + * However, they should be set by a accessor function, except name, + * which cannot be changed. 
+ */ + bool link_up; + char mac[ETH_ALEN]; + unsigned int mtu; + __be32 inaddr; + char name[IFNAMSIZ+1]; +}; + +struct usnic_fwd_flow { + uint32_t flow_id; + struct usnic_fwd_dev *ufdev; + unsigned int vnic_idx; +}; + +struct usnic_filter_action { + int vnic_idx; + struct filter_action action; +}; + +struct usnic_fwd_dev *usnic_fwd_dev_alloc(struct pci_dev *pdev); +void usnic_fwd_dev_free(struct usnic_fwd_dev *ufdev); + +void usnic_fwd_set_mac(struct usnic_fwd_dev *ufdev, char mac[ETH_ALEN]); +int usnic_fwd_add_ipaddr(struct usnic_fwd_dev *ufdev, __be32 inaddr); +void usnic_fwd_del_ipaddr(struct usnic_fwd_dev *ufdev); +void usnic_fwd_carrier_up(struct usnic_fwd_dev *ufdev); +void usnic_fwd_carrier_down(struct usnic_fwd_dev *ufdev); +void usnic_fwd_set_mtu(struct usnic_fwd_dev *ufdev, unsigned int mtu); + +/* + * Allocate a flow on this forwarding device. Whoever calls this function, + * must monitor netdev events on ufdev's netdevice. If NETDEV_REBOOT or + * NETDEV_DOWN is seen, flow will no longer function and must be + * immediately freed by calling usnic_dealloc_flow. + */ +struct usnic_fwd_flow* +usnic_fwd_alloc_flow(struct usnic_fwd_dev *ufdev, struct filter *filter, + struct usnic_filter_action *action); +int usnic_fwd_dealloc_flow(struct usnic_fwd_flow *flow); +int usnic_fwd_enable_qp(struct usnic_fwd_dev *ufdev, int vnic_idx, int qp_idx); +int usnic_fwd_disable_qp(struct usnic_fwd_dev *ufdev, int vnic_idx, int qp_idx); + +static inline void usnic_fwd_init_usnic_filter(struct filter *filter, + uint32_t usnic_id) +{ + filter->type = FILTER_USNIC_ID; + filter->u.usnic.ethtype = USNIC_ROCE_ETHERTYPE; + filter->u.usnic.flags = FILTER_FIELD_USNIC_ETHTYPE | + FILTER_FIELD_USNIC_ID | + FILTER_FIELD_USNIC_PROTO; + filter->u.usnic.proto_version = (USNIC_ROCE_GRH_VER << + USNIC_ROCE_GRH_VER_SHIFT) | + USNIC_PROTO_VER; + filter->u.usnic.usnic_id = usnic_id; +} + +static inline void usnic_fwd_init_udp_filter(struct filter *filter, + uint32_t daddr, uint16_t dport) +{ + filter->type = FILTER_IPV4_5TUPLE; + filter->u.ipv4.flags = FILTER_FIELD_5TUP_PROTO; + filter->u.ipv4.protocol = PROTO_UDP; + + if (daddr) { + filter->u.ipv4.flags |= FILTER_FIELD_5TUP_DST_AD; + filter->u.ipv4.dst_addr = daddr; + } + + if (dport) { + filter->u.ipv4.flags |= FILTER_FIELD_5TUP_DST_PT; + filter->u.ipv4.dst_port = dport; + } +} + +#endif /* !USNIC_FWD_H_ */ diff --git a/kernel/drivers/infiniband/hw/usnic/usnic_ib.h b/kernel/drivers/infiniband/hw/usnic/usnic_ib.h new file mode 100644 index 000000000..e5a9297dd --- /dev/null +++ b/kernel/drivers/infiniband/hw/usnic/usnic_ib.h @@ -0,0 +1,118 @@ +/* + * Copyright (c) 2013, Cisco Systems, Inc. All rights reserved. + * + * This program is free software; you may redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; version 2 of the License. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + */ + +#ifndef USNIC_IB_H_ +#define USNIC_IB_H_ + +#include +#include + +#include + + +#include "usnic.h" +#include "usnic_abi.h" +#include "usnic_vnic.h" + +#define USNIC_IB_PORT_CNT 1 +#define USNIC_IB_NUM_COMP_VECTORS 1 + +extern unsigned int usnic_ib_share_vf; + +struct usnic_ib_ucontext { + struct ib_ucontext ibucontext; + /* Protected by usnic_ib_dev->usdev_lock */ + struct list_head qp_grp_list; + struct list_head link; +}; + +struct usnic_ib_pd { + struct ib_pd ibpd; + struct usnic_uiom_pd *umem_pd; +}; + +struct usnic_ib_mr { + struct ib_mr ibmr; + struct usnic_uiom_reg *umem; +}; + +struct usnic_ib_dev { + struct ib_device ib_dev; + struct pci_dev *pdev; + struct net_device *netdev; + struct usnic_fwd_dev *ufdev; + struct list_head ib_dev_link; + struct list_head vf_dev_list; + struct list_head ctx_list; + struct mutex usdev_lock; + + /* provisioning information */ + struct kref vf_cnt; + unsigned int vf_res_cnt[USNIC_VNIC_RES_TYPE_MAX]; + + /* sysfs vars for QPN reporting */ + struct kobject *qpn_kobj; +}; + +struct usnic_ib_vf { + struct usnic_ib_dev *pf; + spinlock_t lock; + struct usnic_vnic *vnic; + unsigned int qp_grp_ref_cnt; + struct usnic_ib_pd *pd; + struct list_head link; +}; + +static inline +struct usnic_ib_dev *to_usdev(struct ib_device *ibdev) +{ + return container_of(ibdev, struct usnic_ib_dev, ib_dev); +} + +static inline +struct usnic_ib_ucontext *to_ucontext(struct ib_ucontext *ibucontext) +{ + return container_of(ibucontext, struct usnic_ib_ucontext, ibucontext); +} + +static inline +struct usnic_ib_pd *to_upd(struct ib_pd *ibpd) +{ + return container_of(ibpd, struct usnic_ib_pd, ibpd); +} + +static inline +struct usnic_ib_ucontext *to_uucontext(struct ib_ucontext *ibucontext) +{ + return container_of(ibucontext, struct usnic_ib_ucontext, ibucontext); +} + +static inline +struct usnic_ib_mr *to_umr(struct ib_mr *ibmr) +{ + return container_of(ibmr, struct usnic_ib_mr, ibmr); +} +void usnic_ib_log_vf(struct usnic_ib_vf *vf); + +#define UPDATE_PTR_LEFT(N, P, L) \ +do { \ + L -= (N); \ + P += (N); \ +} while (0) + +#endif /* USNIC_IB_H_ */ diff --git a/kernel/drivers/infiniband/hw/usnic/usnic_ib_main.c b/kernel/drivers/infiniband/hw/usnic/usnic_ib_main.c new file mode 100644 index 000000000..0d0f98695 --- /dev/null +++ b/kernel/drivers/infiniband/hw/usnic/usnic_ib_main.c @@ -0,0 +1,682 @@ +/* + * Copyright (c) 2013, Cisco Systems, Inc. All rights reserved. + * + * This program is free software; you may redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; version 2 of the License. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + * Author: Upinder Malhi + * Author: Anant Deepak + * Author: Cesare Cantu' + * Author: Jeff Squyres + * Author: Kiran Thirumalai + * Author: Xuyang Wang + * Author: Reese Faucette + * + */ + +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include "usnic_abi.h" +#include "usnic_common_util.h" +#include "usnic_ib.h" +#include "usnic_ib_qp_grp.h" +#include "usnic_log.h" +#include "usnic_fwd.h" +#include "usnic_debugfs.h" +#include "usnic_ib_verbs.h" +#include "usnic_transport.h" +#include "usnic_uiom.h" +#include "usnic_ib_sysfs.h" + +unsigned int usnic_log_lvl = USNIC_LOG_LVL_ERR; +unsigned int usnic_ib_share_vf = 1; + +static const char usnic_version[] = + DRV_NAME ": Cisco VIC (USNIC) Verbs Driver v" + DRV_VERSION " (" DRV_RELDATE ")\n"; + +static DEFINE_MUTEX(usnic_ib_ibdev_list_lock); +static LIST_HEAD(usnic_ib_ibdev_list); + +/* Callback dump funcs */ +static int usnic_ib_dump_vf_hdr(void *obj, char *buf, int buf_sz) +{ + struct usnic_ib_vf *vf = obj; + return scnprintf(buf, buf_sz, "PF: %s ", vf->pf->ib_dev.name); +} +/* End callback dump funcs */ + +static void usnic_ib_dump_vf(struct usnic_ib_vf *vf, char *buf, int buf_sz) +{ + usnic_vnic_dump(vf->vnic, buf, buf_sz, vf, + usnic_ib_dump_vf_hdr, + usnic_ib_qp_grp_dump_hdr, usnic_ib_qp_grp_dump_rows); +} + +void usnic_ib_log_vf(struct usnic_ib_vf *vf) +{ + char buf[1000]; + usnic_ib_dump_vf(vf, buf, sizeof(buf)); + usnic_dbg("%s\n", buf); +} + +/* Start of netdev section */ +static inline const char *usnic_ib_netdev_event_to_string(unsigned long event) +{ + const char *event2str[] = {"NETDEV_NONE", "NETDEV_UP", "NETDEV_DOWN", + "NETDEV_REBOOT", "NETDEV_CHANGE", + "NETDEV_REGISTER", "NETDEV_UNREGISTER", "NETDEV_CHANGEMTU", + "NETDEV_CHANGEADDR", "NETDEV_GOING_DOWN", "NETDEV_FEAT_CHANGE", + "NETDEV_BONDING_FAILOVER", "NETDEV_PRE_UP", + "NETDEV_PRE_TYPE_CHANGE", "NETDEV_POST_TYPE_CHANGE", + "NETDEV_POST_INT", "NETDEV_UNREGISTER_FINAL", "NETDEV_RELEASE", + "NETDEV_NOTIFY_PEERS", "NETDEV_JOIN" + }; + + if (event >= ARRAY_SIZE(event2str)) + return "UNKNOWN_NETDEV_EVENT"; + else + return event2str[event]; +} + +static void usnic_ib_qp_grp_modify_active_to_err(struct usnic_ib_dev *us_ibdev) +{ + struct usnic_ib_ucontext *ctx; + struct usnic_ib_qp_grp *qp_grp; + enum ib_qp_state cur_state; + int status; + + BUG_ON(!mutex_is_locked(&us_ibdev->usdev_lock)); + + list_for_each_entry(ctx, &us_ibdev->ctx_list, link) { + list_for_each_entry(qp_grp, &ctx->qp_grp_list, link) { + cur_state = qp_grp->state; + if (cur_state == IB_QPS_INIT || + cur_state == IB_QPS_RTR || + cur_state == IB_QPS_RTS) { + status = usnic_ib_qp_grp_modify(qp_grp, + IB_QPS_ERR, + NULL); + if (status) { + usnic_err("Failed to transistion qp grp %u from %s to %s\n", + qp_grp->grp_id, + usnic_ib_qp_grp_state_to_string + (cur_state), + usnic_ib_qp_grp_state_to_string + (IB_QPS_ERR)); + } + } + } + } +} + +static void usnic_ib_handle_usdev_event(struct usnic_ib_dev *us_ibdev, + unsigned long event) +{ + struct net_device *netdev; + struct ib_event ib_event; + + memset(&ib_event, 0, sizeof(ib_event)); + + mutex_lock(&us_ibdev->usdev_lock); + netdev = us_ibdev->netdev; + switch (event) { + case NETDEV_REBOOT: + usnic_info("PF Reset on %s\n", us_ibdev->ib_dev.name); + usnic_ib_qp_grp_modify_active_to_err(us_ibdev); + ib_event.event = IB_EVENT_PORT_ERR; + ib_event.device = &us_ibdev->ib_dev; + ib_event.element.port_num = 1; + ib_dispatch_event(&ib_event); + break; + case NETDEV_UP: + case NETDEV_DOWN: + case NETDEV_CHANGE: + if 
(!us_ibdev->ufdev->link_up && + netif_carrier_ok(netdev)) { + usnic_fwd_carrier_up(us_ibdev->ufdev); + usnic_info("Link UP on %s\n", us_ibdev->ib_dev.name); + ib_event.event = IB_EVENT_PORT_ACTIVE; + ib_event.device = &us_ibdev->ib_dev; + ib_event.element.port_num = 1; + ib_dispatch_event(&ib_event); + } else if (us_ibdev->ufdev->link_up && + !netif_carrier_ok(netdev)) { + usnic_fwd_carrier_down(us_ibdev->ufdev); + usnic_info("Link DOWN on %s\n", us_ibdev->ib_dev.name); + usnic_ib_qp_grp_modify_active_to_err(us_ibdev); + ib_event.event = IB_EVENT_PORT_ERR; + ib_event.device = &us_ibdev->ib_dev; + ib_event.element.port_num = 1; + ib_dispatch_event(&ib_event); + } else { + usnic_dbg("Ignoring %s on %s\n", + usnic_ib_netdev_event_to_string(event), + us_ibdev->ib_dev.name); + } + break; + case NETDEV_CHANGEADDR: + if (!memcmp(us_ibdev->ufdev->mac, netdev->dev_addr, + sizeof(us_ibdev->ufdev->mac))) { + usnic_dbg("Ignoring addr change on %s\n", + us_ibdev->ib_dev.name); + } else { + usnic_info(" %s old mac: %pM new mac: %pM\n", + us_ibdev->ib_dev.name, + us_ibdev->ufdev->mac, + netdev->dev_addr); + usnic_fwd_set_mac(us_ibdev->ufdev, netdev->dev_addr); + usnic_ib_qp_grp_modify_active_to_err(us_ibdev); + ib_event.event = IB_EVENT_GID_CHANGE; + ib_event.device = &us_ibdev->ib_dev; + ib_event.element.port_num = 1; + ib_dispatch_event(&ib_event); + } + + break; + case NETDEV_CHANGEMTU: + if (us_ibdev->ufdev->mtu != netdev->mtu) { + usnic_info("MTU Change on %s old: %u new: %u\n", + us_ibdev->ib_dev.name, + us_ibdev->ufdev->mtu, netdev->mtu); + usnic_fwd_set_mtu(us_ibdev->ufdev, netdev->mtu); + usnic_ib_qp_grp_modify_active_to_err(us_ibdev); + } else { + usnic_dbg("Ignoring MTU change on %s\n", + us_ibdev->ib_dev.name); + } + break; + default: + usnic_dbg("Ignoring event %s on %s", + usnic_ib_netdev_event_to_string(event), + us_ibdev->ib_dev.name); + } + mutex_unlock(&us_ibdev->usdev_lock); +} + +static int usnic_ib_netdevice_event(struct notifier_block *notifier, + unsigned long event, void *ptr) +{ + struct usnic_ib_dev *us_ibdev; + + struct net_device *netdev = netdev_notifier_info_to_dev(ptr); + + mutex_lock(&usnic_ib_ibdev_list_lock); + list_for_each_entry(us_ibdev, &usnic_ib_ibdev_list, ib_dev_link) { + if (us_ibdev->netdev == netdev) { + usnic_ib_handle_usdev_event(us_ibdev, event); + break; + } + } + mutex_unlock(&usnic_ib_ibdev_list_lock); + + return NOTIFY_DONE; +} + +static struct notifier_block usnic_ib_netdevice_notifier = { + .notifier_call = usnic_ib_netdevice_event +}; +/* End of netdev section */ + +/* Start of inet section */ +static int usnic_ib_handle_inet_event(struct usnic_ib_dev *us_ibdev, + unsigned long event, void *ptr) +{ + struct in_ifaddr *ifa = ptr; + struct ib_event ib_event; + + mutex_lock(&us_ibdev->usdev_lock); + + switch (event) { + case NETDEV_DOWN: + usnic_info("%s via ip notifiers", + usnic_ib_netdev_event_to_string(event)); + usnic_fwd_del_ipaddr(us_ibdev->ufdev); + usnic_ib_qp_grp_modify_active_to_err(us_ibdev); + ib_event.event = IB_EVENT_GID_CHANGE; + ib_event.device = &us_ibdev->ib_dev; + ib_event.element.port_num = 1; + ib_dispatch_event(&ib_event); + break; + case NETDEV_UP: + usnic_fwd_add_ipaddr(us_ibdev->ufdev, ifa->ifa_address); + usnic_info("%s via ip notifiers: ip %pI4", + usnic_ib_netdev_event_to_string(event), + &us_ibdev->ufdev->inaddr); + ib_event.event = IB_EVENT_GID_CHANGE; + ib_event.device = &us_ibdev->ib_dev; + ib_event.element.port_num = 1; + ib_dispatch_event(&ib_event); + break; + default: + usnic_info("Ignoring event %s on %s", + 
usnic_ib_netdev_event_to_string(event), + us_ibdev->ib_dev.name); + } + mutex_unlock(&us_ibdev->usdev_lock); + + return NOTIFY_DONE; +} + +static int usnic_ib_inetaddr_event(struct notifier_block *notifier, + unsigned long event, void *ptr) +{ + struct usnic_ib_dev *us_ibdev; + struct in_ifaddr *ifa = ptr; + struct net_device *netdev = ifa->ifa_dev->dev; + + mutex_lock(&usnic_ib_ibdev_list_lock); + list_for_each_entry(us_ibdev, &usnic_ib_ibdev_list, ib_dev_link) { + if (us_ibdev->netdev == netdev) { + usnic_ib_handle_inet_event(us_ibdev, event, ptr); + break; + } + } + mutex_unlock(&usnic_ib_ibdev_list_lock); + + return NOTIFY_DONE; +} +static struct notifier_block usnic_ib_inetaddr_notifier = { + .notifier_call = usnic_ib_inetaddr_event +}; +/* End of inet section*/ + +/* Start of PF discovery section */ +static void *usnic_ib_device_add(struct pci_dev *dev) +{ + struct usnic_ib_dev *us_ibdev; + union ib_gid gid; + struct in_ifaddr *in; + struct net_device *netdev; + + usnic_dbg("\n"); + netdev = pci_get_drvdata(dev); + + us_ibdev = (struct usnic_ib_dev *)ib_alloc_device(sizeof(*us_ibdev)); + if (IS_ERR_OR_NULL(us_ibdev)) { + usnic_err("Device %s context alloc failed\n", + netdev_name(pci_get_drvdata(dev))); + return ERR_PTR(us_ibdev ? PTR_ERR(us_ibdev) : -EFAULT); + } + + us_ibdev->ufdev = usnic_fwd_dev_alloc(dev); + if (IS_ERR_OR_NULL(us_ibdev->ufdev)) { + usnic_err("Failed to alloc ufdev for %s with err %ld\n", + pci_name(dev), PTR_ERR(us_ibdev->ufdev)); + goto err_dealloc; + } + + mutex_init(&us_ibdev->usdev_lock); + INIT_LIST_HEAD(&us_ibdev->vf_dev_list); + INIT_LIST_HEAD(&us_ibdev->ctx_list); + + us_ibdev->pdev = dev; + us_ibdev->netdev = pci_get_drvdata(dev); + us_ibdev->ib_dev.owner = THIS_MODULE; + us_ibdev->ib_dev.node_type = RDMA_NODE_USNIC_UDP; + us_ibdev->ib_dev.phys_port_cnt = USNIC_IB_PORT_CNT; + us_ibdev->ib_dev.num_comp_vectors = USNIC_IB_NUM_COMP_VECTORS; + us_ibdev->ib_dev.dma_device = &dev->dev; + us_ibdev->ib_dev.uverbs_abi_ver = USNIC_UVERBS_ABI_VERSION; + strlcpy(us_ibdev->ib_dev.name, "usnic_%d", IB_DEVICE_NAME_MAX); + + us_ibdev->ib_dev.uverbs_cmd_mask = + (1ull << IB_USER_VERBS_CMD_GET_CONTEXT) | + (1ull << IB_USER_VERBS_CMD_QUERY_DEVICE) | + (1ull << IB_USER_VERBS_CMD_QUERY_PORT) | + (1ull << IB_USER_VERBS_CMD_ALLOC_PD) | + (1ull << IB_USER_VERBS_CMD_DEALLOC_PD) | + (1ull << IB_USER_VERBS_CMD_REG_MR) | + (1ull << IB_USER_VERBS_CMD_DEREG_MR) | + (1ull << IB_USER_VERBS_CMD_CREATE_COMP_CHANNEL) | + (1ull << IB_USER_VERBS_CMD_CREATE_CQ) | + (1ull << IB_USER_VERBS_CMD_DESTROY_CQ) | + (1ull << IB_USER_VERBS_CMD_CREATE_QP) | + (1ull << IB_USER_VERBS_CMD_MODIFY_QP) | + (1ull << IB_USER_VERBS_CMD_QUERY_QP) | + (1ull << IB_USER_VERBS_CMD_DESTROY_QP) | + (1ull << IB_USER_VERBS_CMD_ATTACH_MCAST) | + (1ull << IB_USER_VERBS_CMD_DETACH_MCAST) | + (1ull << IB_USER_VERBS_CMD_OPEN_QP); + + us_ibdev->ib_dev.query_device = usnic_ib_query_device; + us_ibdev->ib_dev.query_port = usnic_ib_query_port; + us_ibdev->ib_dev.query_pkey = usnic_ib_query_pkey; + us_ibdev->ib_dev.query_gid = usnic_ib_query_gid; + us_ibdev->ib_dev.get_link_layer = usnic_ib_port_link_layer; + us_ibdev->ib_dev.alloc_pd = usnic_ib_alloc_pd; + us_ibdev->ib_dev.dealloc_pd = usnic_ib_dealloc_pd; + us_ibdev->ib_dev.create_qp = usnic_ib_create_qp; + us_ibdev->ib_dev.modify_qp = usnic_ib_modify_qp; + us_ibdev->ib_dev.query_qp = usnic_ib_query_qp; + us_ibdev->ib_dev.destroy_qp = usnic_ib_destroy_qp; + us_ibdev->ib_dev.create_cq = usnic_ib_create_cq; + us_ibdev->ib_dev.destroy_cq = usnic_ib_destroy_cq; + 
us_ibdev->ib_dev.reg_user_mr = usnic_ib_reg_mr; + us_ibdev->ib_dev.dereg_mr = usnic_ib_dereg_mr; + us_ibdev->ib_dev.alloc_ucontext = usnic_ib_alloc_ucontext; + us_ibdev->ib_dev.dealloc_ucontext = usnic_ib_dealloc_ucontext; + us_ibdev->ib_dev.mmap = usnic_ib_mmap; + us_ibdev->ib_dev.create_ah = usnic_ib_create_ah; + us_ibdev->ib_dev.destroy_ah = usnic_ib_destroy_ah; + us_ibdev->ib_dev.post_send = usnic_ib_post_send; + us_ibdev->ib_dev.post_recv = usnic_ib_post_recv; + us_ibdev->ib_dev.poll_cq = usnic_ib_poll_cq; + us_ibdev->ib_dev.req_notify_cq = usnic_ib_req_notify_cq; + us_ibdev->ib_dev.get_dma_mr = usnic_ib_get_dma_mr; + + + if (ib_register_device(&us_ibdev->ib_dev, NULL)) + goto err_fwd_dealloc; + + usnic_fwd_set_mtu(us_ibdev->ufdev, us_ibdev->netdev->mtu); + usnic_fwd_set_mac(us_ibdev->ufdev, us_ibdev->netdev->dev_addr); + if (netif_carrier_ok(us_ibdev->netdev)) + usnic_fwd_carrier_up(us_ibdev->ufdev); + + in = ((struct in_device *)(netdev->ip_ptr))->ifa_list; + if (in != NULL) + usnic_fwd_add_ipaddr(us_ibdev->ufdev, in->ifa_address); + + usnic_mac_ip_to_gid(us_ibdev->netdev->perm_addr, + us_ibdev->ufdev->inaddr, &gid.raw[0]); + memcpy(&us_ibdev->ib_dev.node_guid, &gid.global.interface_id, + sizeof(gid.global.interface_id)); + kref_init(&us_ibdev->vf_cnt); + + usnic_info("Added ibdev: %s netdev: %s with mac %pM Link: %u MTU: %u\n", + us_ibdev->ib_dev.name, netdev_name(us_ibdev->netdev), + us_ibdev->ufdev->mac, us_ibdev->ufdev->link_up, + us_ibdev->ufdev->mtu); + return us_ibdev; + +err_fwd_dealloc: + usnic_fwd_dev_free(us_ibdev->ufdev); +err_dealloc: + usnic_err("failed -- deallocing device\n"); + ib_dealloc_device(&us_ibdev->ib_dev); + return NULL; +} + +static void usnic_ib_device_remove(struct usnic_ib_dev *us_ibdev) +{ + usnic_info("Unregistering %s\n", us_ibdev->ib_dev.name); + usnic_ib_sysfs_unregister_usdev(us_ibdev); + usnic_fwd_dev_free(us_ibdev->ufdev); + ib_unregister_device(&us_ibdev->ib_dev); + ib_dealloc_device(&us_ibdev->ib_dev); +} + +static void usnic_ib_undiscover_pf(struct kref *kref) +{ + struct usnic_ib_dev *us_ibdev, *tmp; + struct pci_dev *dev; + bool found = false; + + dev = container_of(kref, struct usnic_ib_dev, vf_cnt)->pdev; + mutex_lock(&usnic_ib_ibdev_list_lock); + list_for_each_entry_safe(us_ibdev, tmp, + &usnic_ib_ibdev_list, ib_dev_link) { + if (us_ibdev->pdev == dev) { + list_del(&us_ibdev->ib_dev_link); + usnic_ib_device_remove(us_ibdev); + found = true; + break; + } + } + + WARN(!found, "Failed to remove PF %s\n", pci_name(dev)); + + mutex_unlock(&usnic_ib_ibdev_list_lock); +} + +static struct usnic_ib_dev *usnic_ib_discover_pf(struct usnic_vnic *vnic) +{ + struct usnic_ib_dev *us_ibdev; + struct pci_dev *parent_pci, *vf_pci; + int err; + + vf_pci = usnic_vnic_get_pdev(vnic); + parent_pci = pci_physfn(vf_pci); + + BUG_ON(!parent_pci); + + mutex_lock(&usnic_ib_ibdev_list_lock); + list_for_each_entry(us_ibdev, &usnic_ib_ibdev_list, ib_dev_link) { + if (us_ibdev->pdev == parent_pci) { + kref_get(&us_ibdev->vf_cnt); + goto out; + } + } + + us_ibdev = usnic_ib_device_add(parent_pci); + if (IS_ERR_OR_NULL(us_ibdev)) { + us_ibdev = us_ibdev ? 
us_ibdev : ERR_PTR(-EFAULT); + goto out; + } + + err = usnic_ib_sysfs_register_usdev(us_ibdev); + if (err) { + usnic_ib_device_remove(us_ibdev); + us_ibdev = ERR_PTR(err); + goto out; + } + + list_add(&us_ibdev->ib_dev_link, &usnic_ib_ibdev_list); +out: + mutex_unlock(&usnic_ib_ibdev_list_lock); + return us_ibdev; +} +/* End of PF discovery section */ + +/* Start of PCI section */ + +static const struct pci_device_id usnic_ib_pci_ids[] = { + {PCI_DEVICE(PCI_VENDOR_ID_CISCO, PCI_DEVICE_ID_CISCO_VIC_USPACE_NIC)}, + {0,} +}; + +static int usnic_ib_pci_probe(struct pci_dev *pdev, + const struct pci_device_id *id) +{ + int err; + struct usnic_ib_dev *pf; + struct usnic_ib_vf *vf; + enum usnic_vnic_res_type res_type; + + vf = kzalloc(sizeof(*vf), GFP_KERNEL); + if (!vf) + return -ENOMEM; + + err = pci_enable_device(pdev); + if (err) { + usnic_err("Failed to enable %s with err %d\n", + pci_name(pdev), err); + goto out_clean_vf; + } + + err = pci_request_regions(pdev, DRV_NAME); + if (err) { + usnic_err("Failed to request region for %s with err %d\n", + pci_name(pdev), err); + goto out_disable_device; + } + + pci_set_master(pdev); + pci_set_drvdata(pdev, vf); + + vf->vnic = usnic_vnic_alloc(pdev); + if (IS_ERR_OR_NULL(vf->vnic)) { + err = vf->vnic ? PTR_ERR(vf->vnic) : -ENOMEM; + usnic_err("Failed to alloc vnic for %s with err %d\n", + pci_name(pdev), err); + goto out_release_regions; + } + + pf = usnic_ib_discover_pf(vf->vnic); + if (IS_ERR_OR_NULL(pf)) { + usnic_err("Failed to discover pf of vnic %s with err%ld\n", + pci_name(pdev), PTR_ERR(pf)); + err = pf ? PTR_ERR(pf) : -EFAULT; + goto out_clean_vnic; + } + + vf->pf = pf; + spin_lock_init(&vf->lock); + mutex_lock(&pf->usdev_lock); + list_add_tail(&vf->link, &pf->vf_dev_list); + /* + * Save max settings (will be same for each VF, easier to re-write than + * to say "if (!set) { set_values(); set=1; } + */ + for (res_type = USNIC_VNIC_RES_TYPE_EOL+1; + res_type < USNIC_VNIC_RES_TYPE_MAX; + res_type++) { + pf->vf_res_cnt[res_type] = usnic_vnic_res_cnt(vf->vnic, + res_type); + } + + mutex_unlock(&pf->usdev_lock); + + usnic_info("Registering usnic VF %s into PF %s\n", pci_name(pdev), + pf->ib_dev.name); + usnic_ib_log_vf(vf); + return 0; + +out_clean_vnic: + usnic_vnic_free(vf->vnic); +out_release_regions: + pci_set_drvdata(pdev, NULL); + pci_clear_master(pdev); + pci_release_regions(pdev); +out_disable_device: + pci_disable_device(pdev); +out_clean_vf: + kfree(vf); + return err; +} + +static void usnic_ib_pci_remove(struct pci_dev *pdev) +{ + struct usnic_ib_vf *vf = pci_get_drvdata(pdev); + struct usnic_ib_dev *pf = vf->pf; + + mutex_lock(&pf->usdev_lock); + list_del(&vf->link); + mutex_unlock(&pf->usdev_lock); + + kref_put(&pf->vf_cnt, usnic_ib_undiscover_pf); + usnic_vnic_free(vf->vnic); + pci_set_drvdata(pdev, NULL); + pci_clear_master(pdev); + pci_release_regions(pdev); + pci_disable_device(pdev); + kfree(vf); + + usnic_info("Removed VF %s\n", pci_name(pdev)); +} + +/* PCI driver entry points */ +static struct pci_driver usnic_ib_pci_driver = { + .name = DRV_NAME, + .id_table = usnic_ib_pci_ids, + .probe = usnic_ib_pci_probe, + .remove = usnic_ib_pci_remove, +}; +/* End of PCI section */ + +/* Start of module section */ +static int __init usnic_ib_init(void) +{ + int err; + + printk_once(KERN_INFO "%s", usnic_version); + + err = usnic_uiom_init(DRV_NAME); + if (err) { + usnic_err("Unable to initalize umem with err %d\n", err); + return err; + } + + if (pci_register_driver(&usnic_ib_pci_driver)) { + usnic_err("Unable to register with PCI\n"); 
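+ /* PCI registration failed: fall through to out_umem_fini to undo usnic_uiom_init() */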
+ goto out_umem_fini; + } + + err = register_netdevice_notifier(&usnic_ib_netdevice_notifier); + if (err) { + usnic_err("Failed to register netdev notifier\n"); + goto out_pci_unreg; + } + + err = register_inetaddr_notifier(&usnic_ib_inetaddr_notifier); + if (err) { + usnic_err("Failed to register inet addr notifier\n"); + goto out_unreg_netdev_notifier; + } + + err = usnic_transport_init(); + if (err) { + usnic_err("Failed to initialize transport\n"); + goto out_unreg_inetaddr_notifier; + } + + usnic_debugfs_init(); + + return 0; + +out_unreg_inetaddr_notifier: + unregister_inetaddr_notifier(&usnic_ib_inetaddr_notifier); +out_unreg_netdev_notifier: + unregister_netdevice_notifier(&usnic_ib_netdevice_notifier); +out_pci_unreg: + pci_unregister_driver(&usnic_ib_pci_driver); +out_umem_fini: + usnic_uiom_fini(); + + return err; +} + +static void __exit usnic_ib_destroy(void) +{ + usnic_dbg("\n"); + usnic_debugfs_exit(); + usnic_transport_fini(); + unregister_inetaddr_notifier(&usnic_ib_inetaddr_notifier); + unregister_netdevice_notifier(&usnic_ib_netdevice_notifier); + pci_unregister_driver(&usnic_ib_pci_driver); + usnic_uiom_fini(); +} + +MODULE_DESCRIPTION("Cisco VIC (usNIC) Verbs Driver"); +MODULE_AUTHOR("Upinder Malhi "); +MODULE_LICENSE("Dual BSD/GPL"); +MODULE_VERSION(DRV_VERSION); +module_param(usnic_log_lvl, uint, S_IRUGO | S_IWUSR); +module_param(usnic_ib_share_vf, uint, S_IRUGO | S_IWUSR); +MODULE_PARM_DESC(usnic_log_lvl, " Off=0, Err=1, Info=2, Debug=3"); +MODULE_PARM_DESC(usnic_ib_share_vf, "Off=0, On=1 VF sharing amongst QPs"); +MODULE_DEVICE_TABLE(pci, usnic_ib_pci_ids); + +module_init(usnic_ib_init); +module_exit(usnic_ib_destroy); +/* End of module section */ diff --git a/kernel/drivers/infiniband/hw/usnic/usnic_ib_qp_grp.c b/kernel/drivers/infiniband/hw/usnic/usnic_ib_qp_grp.c new file mode 100644 index 000000000..db3588df3 --- /dev/null +++ b/kernel/drivers/infiniband/hw/usnic/usnic_ib_qp_grp.c @@ -0,0 +1,761 @@ +/* + * Copyright (c) 2013, Cisco Systems, Inc. All rights reserved. + * + * This program is free software; you may redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; version 2 of the License. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + */ +#include +#include +#include +#include + +#include "usnic_log.h" +#include "usnic_vnic.h" +#include "usnic_fwd.h" +#include "usnic_uiom.h" +#include "usnic_debugfs.h" +#include "usnic_ib_qp_grp.h" +#include "usnic_ib_sysfs.h" +#include "usnic_transport.h" + +#define DFLT_RQ_IDX 0 + +const char *usnic_ib_qp_grp_state_to_string(enum ib_qp_state state) +{ + switch (state) { + case IB_QPS_RESET: + return "Rst"; + case IB_QPS_INIT: + return "Init"; + case IB_QPS_RTR: + return "RTR"; + case IB_QPS_RTS: + return "RTS"; + case IB_QPS_SQD: + return "SQD"; + case IB_QPS_SQE: + return "SQE"; + case IB_QPS_ERR: + return "ERR"; + default: + return "UNKOWN STATE"; + + } +} + +int usnic_ib_qp_grp_dump_hdr(char *buf, int buf_sz) +{ + return scnprintf(buf, buf_sz, "|QPN\t|State\t|PID\t|VF Idx\t|Fil ID"); +} + +int usnic_ib_qp_grp_dump_rows(void *obj, char *buf, int buf_sz) +{ + struct usnic_ib_qp_grp *qp_grp = obj; + struct usnic_ib_qp_grp_flow *default_flow; + if (obj) { + default_flow = list_first_entry(&qp_grp->flows_lst, + struct usnic_ib_qp_grp_flow, link); + return scnprintf(buf, buf_sz, "|%d\t|%s\t|%d\t|%hu\t|%d", + qp_grp->ibqp.qp_num, + usnic_ib_qp_grp_state_to_string( + qp_grp->state), + qp_grp->owner_pid, + usnic_vnic_get_index(qp_grp->vf->vnic), + default_flow->flow->flow_id); + } else { + return scnprintf(buf, buf_sz, "|N/A\t|N/A\t|N/A\t|N/A\t|N/A"); + } +} + +static struct usnic_vnic_res_chunk * +get_qp_res_chunk(struct usnic_ib_qp_grp *qp_grp) +{ + lockdep_assert_held(&qp_grp->lock); + /* + * The QP res chunk, used to derive qp indices, + * are just indices of the RQs + */ + return usnic_ib_qp_grp_get_chunk(qp_grp, USNIC_VNIC_RES_TYPE_RQ); +} + +static int enable_qp_grp(struct usnic_ib_qp_grp *qp_grp) +{ + + int status; + int i, vnic_idx; + struct usnic_vnic_res_chunk *res_chunk; + struct usnic_vnic_res *res; + + lockdep_assert_held(&qp_grp->lock); + + vnic_idx = usnic_vnic_get_index(qp_grp->vf->vnic); + + res_chunk = get_qp_res_chunk(qp_grp); + if (IS_ERR_OR_NULL(res_chunk)) { + usnic_err("Unable to get qp res with err %ld\n", + PTR_ERR(res_chunk)); + return res_chunk ? PTR_ERR(res_chunk) : -ENOMEM; + } + + for (i = 0; i < res_chunk->cnt; i++) { + res = res_chunk->res[i]; + status = usnic_fwd_enable_qp(qp_grp->ufdev, vnic_idx, + res->vnic_idx); + if (status) { + usnic_err("Failed to enable qp %d of %s:%d\n with err %d\n", + res->vnic_idx, qp_grp->ufdev->name, + vnic_idx, status); + goto out_err; + } + } + + return 0; + +out_err: + for (i--; i >= 0; i--) { + res = res_chunk->res[i]; + usnic_fwd_disable_qp(qp_grp->ufdev, vnic_idx, + res->vnic_idx); + } + + return status; +} + +static int disable_qp_grp(struct usnic_ib_qp_grp *qp_grp) +{ + int i, vnic_idx; + struct usnic_vnic_res_chunk *res_chunk; + struct usnic_vnic_res *res; + int status = 0; + + lockdep_assert_held(&qp_grp->lock); + vnic_idx = usnic_vnic_get_index(qp_grp->vf->vnic); + + res_chunk = get_qp_res_chunk(qp_grp); + if (IS_ERR_OR_NULL(res_chunk)) { + usnic_err("Unable to get qp res with err %ld\n", + PTR_ERR(res_chunk)); + return res_chunk ? 
PTR_ERR(res_chunk) : -ENOMEM; + } + + for (i = 0; i < res_chunk->cnt; i++) { + res = res_chunk->res[i]; + status = usnic_fwd_disable_qp(qp_grp->ufdev, vnic_idx, + res->vnic_idx); + if (status) { + usnic_err("Failed to disable rq %d of %s:%d\n with err %d\n", + res->vnic_idx, + qp_grp->ufdev->name, + vnic_idx, status); + } + } + + return status; + +} + +static int init_filter_action(struct usnic_ib_qp_grp *qp_grp, + struct usnic_filter_action *uaction) +{ + struct usnic_vnic_res_chunk *res_chunk; + + res_chunk = usnic_ib_qp_grp_get_chunk(qp_grp, USNIC_VNIC_RES_TYPE_RQ); + if (IS_ERR_OR_NULL(res_chunk)) { + usnic_err("Unable to get %s with err %ld\n", + usnic_vnic_res_type_to_str(USNIC_VNIC_RES_TYPE_RQ), + PTR_ERR(res_chunk)); + return res_chunk ? PTR_ERR(res_chunk) : -ENOMEM; + } + + uaction->vnic_idx = usnic_vnic_get_index(qp_grp->vf->vnic); + uaction->action.type = FILTER_ACTION_RQ_STEERING; + uaction->action.u.rq_idx = res_chunk->res[DFLT_RQ_IDX]->vnic_idx; + + return 0; +} + +static struct usnic_ib_qp_grp_flow* +create_roce_custom_flow(struct usnic_ib_qp_grp *qp_grp, + struct usnic_transport_spec *trans_spec) +{ + uint16_t port_num; + int err; + struct filter filter; + struct usnic_filter_action uaction; + struct usnic_ib_qp_grp_flow *qp_flow; + struct usnic_fwd_flow *flow; + enum usnic_transport_type trans_type; + + trans_type = trans_spec->trans_type; + port_num = trans_spec->usnic_roce.port_num; + + /* Reserve Port */ + port_num = usnic_transport_rsrv_port(trans_type, port_num); + if (port_num == 0) + return ERR_PTR(-EINVAL); + + /* Create Flow */ + usnic_fwd_init_usnic_filter(&filter, port_num); + err = init_filter_action(qp_grp, &uaction); + if (err) + goto out_unreserve_port; + + flow = usnic_fwd_alloc_flow(qp_grp->ufdev, &filter, &uaction); + if (IS_ERR_OR_NULL(flow)) { + usnic_err("Unable to alloc flow failed with err %ld\n", + PTR_ERR(flow)); + err = flow ? PTR_ERR(flow) : -EFAULT; + goto out_unreserve_port; + } + + /* Create Flow Handle */ + qp_flow = kzalloc(sizeof(*qp_flow), GFP_ATOMIC); + if (IS_ERR_OR_NULL(qp_flow)) { + err = qp_flow ? 
PTR_ERR(qp_flow) : -ENOMEM; + goto out_dealloc_flow; + } + qp_flow->flow = flow; + qp_flow->trans_type = trans_type; + qp_flow->usnic_roce.port_num = port_num; + qp_flow->qp_grp = qp_grp; + return qp_flow; + +out_dealloc_flow: + usnic_fwd_dealloc_flow(flow); +out_unreserve_port: + usnic_transport_unrsrv_port(trans_type, port_num); + return ERR_PTR(err); +} + +static void release_roce_custom_flow(struct usnic_ib_qp_grp_flow *qp_flow) +{ + usnic_fwd_dealloc_flow(qp_flow->flow); + usnic_transport_unrsrv_port(qp_flow->trans_type, + qp_flow->usnic_roce.port_num); + kfree(qp_flow); +} + +static struct usnic_ib_qp_grp_flow* +create_udp_flow(struct usnic_ib_qp_grp *qp_grp, + struct usnic_transport_spec *trans_spec) +{ + struct socket *sock; + int sock_fd; + int err; + struct filter filter; + struct usnic_filter_action uaction; + struct usnic_ib_qp_grp_flow *qp_flow; + struct usnic_fwd_flow *flow; + enum usnic_transport_type trans_type; + uint32_t addr; + uint16_t port_num; + int proto; + + trans_type = trans_spec->trans_type; + sock_fd = trans_spec->udp.sock_fd; + + /* Get and check socket */ + sock = usnic_transport_get_socket(sock_fd); + if (IS_ERR_OR_NULL(sock)) + return ERR_CAST(sock); + + err = usnic_transport_sock_get_addr(sock, &proto, &addr, &port_num); + if (err) + goto out_put_sock; + + if (proto != IPPROTO_UDP) { + usnic_err("Protocol for fd %d is not UDP", sock_fd); + err = -EPERM; + goto out_put_sock; + } + + /* Create flow */ + usnic_fwd_init_udp_filter(&filter, addr, port_num); + err = init_filter_action(qp_grp, &uaction); + if (err) + goto out_put_sock; + + flow = usnic_fwd_alloc_flow(qp_grp->ufdev, &filter, &uaction); + if (IS_ERR_OR_NULL(flow)) { + usnic_err("Unable to alloc flow failed with err %ld\n", + PTR_ERR(flow)); + err = flow ? PTR_ERR(flow) : -EFAULT; + goto out_put_sock; + } + + /* Create qp_flow */ + qp_flow = kzalloc(sizeof(*qp_flow), GFP_ATOMIC); + if (IS_ERR_OR_NULL(qp_flow)) { + err = qp_flow ? 
PTR_ERR(qp_flow) : -ENOMEM; + goto out_dealloc_flow; + } + qp_flow->flow = flow; + qp_flow->trans_type = trans_type; + qp_flow->udp.sock = sock; + qp_flow->qp_grp = qp_grp; + return qp_flow; + +out_dealloc_flow: + usnic_fwd_dealloc_flow(flow); +out_put_sock: + usnic_transport_put_socket(sock); + return ERR_PTR(err); +} + +static void release_udp_flow(struct usnic_ib_qp_grp_flow *qp_flow) +{ + usnic_fwd_dealloc_flow(qp_flow->flow); + usnic_transport_put_socket(qp_flow->udp.sock); + kfree(qp_flow); +} + +static struct usnic_ib_qp_grp_flow* +create_and_add_flow(struct usnic_ib_qp_grp *qp_grp, + struct usnic_transport_spec *trans_spec) +{ + struct usnic_ib_qp_grp_flow *qp_flow; + enum usnic_transport_type trans_type; + + trans_type = trans_spec->trans_type; + switch (trans_type) { + case USNIC_TRANSPORT_ROCE_CUSTOM: + qp_flow = create_roce_custom_flow(qp_grp, trans_spec); + break; + case USNIC_TRANSPORT_IPV4_UDP: + qp_flow = create_udp_flow(qp_grp, trans_spec); + break; + default: + usnic_err("Unsupported transport %u\n", + trans_spec->trans_type); + return ERR_PTR(-EINVAL); + } + + if (!IS_ERR_OR_NULL(qp_flow)) { + list_add_tail(&qp_flow->link, &qp_grp->flows_lst); + usnic_debugfs_flow_add(qp_flow); + } + + + return qp_flow; +} + +static void release_and_remove_flow(struct usnic_ib_qp_grp_flow *qp_flow) +{ + usnic_debugfs_flow_remove(qp_flow); + list_del(&qp_flow->link); + + switch (qp_flow->trans_type) { + case USNIC_TRANSPORT_ROCE_CUSTOM: + release_roce_custom_flow(qp_flow); + break; + case USNIC_TRANSPORT_IPV4_UDP: + release_udp_flow(qp_flow); + break; + default: + WARN(1, "Unsupported transport %u\n", + qp_flow->trans_type); + break; + } +} + +static void release_and_remove_all_flows(struct usnic_ib_qp_grp *qp_grp) +{ + struct usnic_ib_qp_grp_flow *qp_flow, *tmp; + list_for_each_entry_safe(qp_flow, tmp, &qp_grp->flows_lst, link) + release_and_remove_flow(qp_flow); +} + +int usnic_ib_qp_grp_modify(struct usnic_ib_qp_grp *qp_grp, + enum ib_qp_state new_state, + void *data) +{ + int status = 0; + int vnic_idx; + struct ib_event ib_event; + enum ib_qp_state old_state; + struct usnic_transport_spec *trans_spec; + struct usnic_ib_qp_grp_flow *qp_flow; + + old_state = qp_grp->state; + vnic_idx = usnic_vnic_get_index(qp_grp->vf->vnic); + trans_spec = (struct usnic_transport_spec *) data; + + spin_lock(&qp_grp->lock); + switch (new_state) { + case IB_QPS_RESET: + switch (old_state) { + case IB_QPS_RESET: + /* NO-OP */ + break; + case IB_QPS_INIT: + release_and_remove_all_flows(qp_grp); + status = 0; + break; + case IB_QPS_RTR: + case IB_QPS_RTS: + case IB_QPS_ERR: + status = disable_qp_grp(qp_grp); + release_and_remove_all_flows(qp_grp); + break; + default: + status = -EINVAL; + } + break; + case IB_QPS_INIT: + switch (old_state) { + case IB_QPS_RESET: + if (trans_spec) { + qp_flow = create_and_add_flow(qp_grp, + trans_spec); + if (IS_ERR_OR_NULL(qp_flow)) { + status = qp_flow ? PTR_ERR(qp_flow) : -EFAULT; + break; + } + } else { + /* + * Optional to specify filters. + */ + status = 0; + } + break; + case IB_QPS_INIT: + if (trans_spec) { + qp_flow = create_and_add_flow(qp_grp, + trans_spec); + if (IS_ERR_OR_NULL(qp_flow)) { + status = qp_flow ? PTR_ERR(qp_flow) : -EFAULT; + break; + } + } else { + /* + * Doesn't make sense to go into INIT state + * from INIT state w/o adding filters. 
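+ * An INIT-to-INIT transition is only meaningful when it attaches an additional flow.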
+ */ + status = -EINVAL; + } + break; + case IB_QPS_RTR: + status = disable_qp_grp(qp_grp); + break; + case IB_QPS_RTS: + status = disable_qp_grp(qp_grp); + break; + default: + status = -EINVAL; + } + break; + case IB_QPS_RTR: + switch (old_state) { + case IB_QPS_INIT: + status = enable_qp_grp(qp_grp); + break; + default: + status = -EINVAL; + } + break; + case IB_QPS_RTS: + switch (old_state) { + case IB_QPS_RTR: + /* NO-OP FOR NOW */ + break; + default: + status = -EINVAL; + } + break; + case IB_QPS_ERR: + ib_event.device = &qp_grp->vf->pf->ib_dev; + ib_event.element.qp = &qp_grp->ibqp; + ib_event.event = IB_EVENT_QP_FATAL; + + switch (old_state) { + case IB_QPS_RESET: + qp_grp->ibqp.event_handler(&ib_event, + qp_grp->ibqp.qp_context); + break; + case IB_QPS_INIT: + release_and_remove_all_flows(qp_grp); + qp_grp->ibqp.event_handler(&ib_event, + qp_grp->ibqp.qp_context); + break; + case IB_QPS_RTR: + case IB_QPS_RTS: + status = disable_qp_grp(qp_grp); + release_and_remove_all_flows(qp_grp); + qp_grp->ibqp.event_handler(&ib_event, + qp_grp->ibqp.qp_context); + break; + default: + status = -EINVAL; + } + break; + default: + status = -EINVAL; + } + spin_unlock(&qp_grp->lock); + + if (!status) { + qp_grp->state = new_state; + usnic_info("Transistioned %u from %s to %s", + qp_grp->grp_id, + usnic_ib_qp_grp_state_to_string(old_state), + usnic_ib_qp_grp_state_to_string(new_state)); + } else { + usnic_err("Failed to transition %u from %s to %s", + qp_grp->grp_id, + usnic_ib_qp_grp_state_to_string(old_state), + usnic_ib_qp_grp_state_to_string(new_state)); + } + + return status; +} + +static struct usnic_vnic_res_chunk** +alloc_res_chunk_list(struct usnic_vnic *vnic, + struct usnic_vnic_res_spec *res_spec, void *owner_obj) +{ + enum usnic_vnic_res_type res_type; + struct usnic_vnic_res_chunk **res_chunk_list; + int err, i, res_cnt, res_lst_sz; + + for (res_lst_sz = 0; + res_spec->resources[res_lst_sz].type != USNIC_VNIC_RES_TYPE_EOL; + res_lst_sz++) { + /* Do Nothing */ + } + + res_chunk_list = kzalloc(sizeof(*res_chunk_list)*(res_lst_sz+1), + GFP_ATOMIC); + if (!res_chunk_list) + return ERR_PTR(-ENOMEM); + + for (i = 0; res_spec->resources[i].type != USNIC_VNIC_RES_TYPE_EOL; + i++) { + res_type = res_spec->resources[i].type; + res_cnt = res_spec->resources[i].cnt; + + res_chunk_list[i] = usnic_vnic_get_resources(vnic, res_type, + res_cnt, owner_obj); + if (IS_ERR_OR_NULL(res_chunk_list[i])) { + err = res_chunk_list[i] ? 
+ PTR_ERR(res_chunk_list[i]) : -ENOMEM; + usnic_err("Failed to get %s from %s with err %d\n", + usnic_vnic_res_type_to_str(res_type), + usnic_vnic_pci_name(vnic), + err); + goto out_free_res; + } + } + + return res_chunk_list; + +out_free_res: + for (i--; i > 0; i--) + usnic_vnic_put_resources(res_chunk_list[i]); + kfree(res_chunk_list); + return ERR_PTR(err); +} + +static void free_qp_grp_res(struct usnic_vnic_res_chunk **res_chunk_list) +{ + int i; + for (i = 0; res_chunk_list[i]; i++) + usnic_vnic_put_resources(res_chunk_list[i]); + kfree(res_chunk_list); +} + +static int qp_grp_and_vf_bind(struct usnic_ib_vf *vf, + struct usnic_ib_pd *pd, + struct usnic_ib_qp_grp *qp_grp) +{ + int err; + struct pci_dev *pdev; + + lockdep_assert_held(&vf->lock); + + pdev = usnic_vnic_get_pdev(vf->vnic); + if (vf->qp_grp_ref_cnt == 0) { + err = usnic_uiom_attach_dev_to_pd(pd->umem_pd, &pdev->dev); + if (err) { + usnic_err("Failed to attach %s to domain\n", + pci_name(pdev)); + return err; + } + vf->pd = pd; + } + vf->qp_grp_ref_cnt++; + + WARN_ON(vf->pd != pd); + qp_grp->vf = vf; + + return 0; +} + +static void qp_grp_and_vf_unbind(struct usnic_ib_qp_grp *qp_grp) +{ + struct pci_dev *pdev; + struct usnic_ib_pd *pd; + + lockdep_assert_held(&qp_grp->vf->lock); + + pd = qp_grp->vf->pd; + pdev = usnic_vnic_get_pdev(qp_grp->vf->vnic); + if (--qp_grp->vf->qp_grp_ref_cnt == 0) { + qp_grp->vf->pd = NULL; + usnic_uiom_detach_dev_from_pd(pd->umem_pd, &pdev->dev); + } + qp_grp->vf = NULL; +} + +static void log_spec(struct usnic_vnic_res_spec *res_spec) +{ + char buf[512]; + usnic_vnic_spec_dump(buf, sizeof(buf), res_spec); + usnic_dbg("%s\n", buf); +} + +static int qp_grp_id_from_flow(struct usnic_ib_qp_grp_flow *qp_flow, + uint32_t *id) +{ + enum usnic_transport_type trans_type = qp_flow->trans_type; + int err; + uint16_t port_num = 0; + + switch (trans_type) { + case USNIC_TRANSPORT_ROCE_CUSTOM: + *id = qp_flow->usnic_roce.port_num; + break; + case USNIC_TRANSPORT_IPV4_UDP: + err = usnic_transport_sock_get_addr(qp_flow->udp.sock, + NULL, NULL, + &port_num); + if (err) + return err; + /* + * Copy port_num to stack first and then to *id, + * so that the short to int cast works for little + * and big endian systems. + */ + *id = port_num; + break; + default: + usnic_err("Unsupported transport %u\n", trans_type); + return -EINVAL; + } + + return 0; +} + +struct usnic_ib_qp_grp * +usnic_ib_qp_grp_create(struct usnic_fwd_dev *ufdev, struct usnic_ib_vf *vf, + struct usnic_ib_pd *pd, + struct usnic_vnic_res_spec *res_spec, + struct usnic_transport_spec *transport_spec) +{ + struct usnic_ib_qp_grp *qp_grp; + int err; + enum usnic_transport_type transport = transport_spec->trans_type; + struct usnic_ib_qp_grp_flow *qp_flow; + + lockdep_assert_held(&vf->lock); + + err = usnic_vnic_res_spec_satisfied(&min_transport_spec[transport], + res_spec); + if (err) { + usnic_err("Spec does not meet miniumum req for transport %d\n", + transport); + log_spec(res_spec); + return ERR_PTR(err); + } + + qp_grp = kzalloc(sizeof(*qp_grp), GFP_ATOMIC); + if (!qp_grp) { + usnic_err("Unable to alloc qp_grp - Out of memory\n"); + return NULL; + } + + qp_grp->res_chunk_list = alloc_res_chunk_list(vf->vnic, res_spec, + qp_grp); + if (IS_ERR_OR_NULL(qp_grp->res_chunk_list)) { + err = qp_grp->res_chunk_list ? 
+ PTR_ERR(qp_grp->res_chunk_list) : -ENOMEM; + usnic_err("Unable to alloc res for %d with err %d\n", + qp_grp->grp_id, err); + goto out_free_qp_grp; + } + + err = qp_grp_and_vf_bind(vf, pd, qp_grp); + if (err) + goto out_free_res; + + INIT_LIST_HEAD(&qp_grp->flows_lst); + spin_lock_init(&qp_grp->lock); + qp_grp->ufdev = ufdev; + qp_grp->state = IB_QPS_RESET; + qp_grp->owner_pid = current->pid; + + qp_flow = create_and_add_flow(qp_grp, transport_spec); + if (IS_ERR_OR_NULL(qp_flow)) { + usnic_err("Unable to create and add flow with err %ld\n", + PTR_ERR(qp_flow)); + err = qp_flow ? PTR_ERR(qp_flow) : -EFAULT; + goto out_qp_grp_vf_unbind; + } + + err = qp_grp_id_from_flow(qp_flow, &qp_grp->grp_id); + if (err) + goto out_release_flow; + qp_grp->ibqp.qp_num = qp_grp->grp_id; + + usnic_ib_sysfs_qpn_add(qp_grp); + + return qp_grp; + +out_release_flow: + release_and_remove_flow(qp_flow); +out_qp_grp_vf_unbind: + qp_grp_and_vf_unbind(qp_grp); +out_free_res: + free_qp_grp_res(qp_grp->res_chunk_list); +out_free_qp_grp: + kfree(qp_grp); + + return ERR_PTR(err); +} + +void usnic_ib_qp_grp_destroy(struct usnic_ib_qp_grp *qp_grp) +{ + + WARN_ON(qp_grp->state != IB_QPS_RESET); + lockdep_assert_held(&qp_grp->vf->lock); + + release_and_remove_all_flows(qp_grp); + usnic_ib_sysfs_qpn_remove(qp_grp); + qp_grp_and_vf_unbind(qp_grp); + free_qp_grp_res(qp_grp->res_chunk_list); + kfree(qp_grp); +} + +struct usnic_vnic_res_chunk* +usnic_ib_qp_grp_get_chunk(struct usnic_ib_qp_grp *qp_grp, + enum usnic_vnic_res_type res_type) +{ + int i; + + for (i = 0; qp_grp->res_chunk_list[i]; i++) { + if (qp_grp->res_chunk_list[i]->type == res_type) + return qp_grp->res_chunk_list[i]; + } + + return ERR_PTR(-EINVAL); +} diff --git a/kernel/drivers/infiniband/hw/usnic/usnic_ib_qp_grp.h b/kernel/drivers/infiniband/hw/usnic/usnic_ib_qp_grp.h new file mode 100644 index 000000000..b0aafe8db --- /dev/null +++ b/kernel/drivers/infiniband/hw/usnic/usnic_ib_qp_grp.h @@ -0,0 +1,117 @@ +/* + * Copyright (c) 2013, Cisco Systems, Inc. All rights reserved. + * + * This program is free software; you may redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; version 2 of the License. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + */ + +#ifndef USNIC_IB_QP_GRP_H_ +#define USNIC_IB_QP_GRP_H_ + +#include +#include + +#include "usnic_ib.h" +#include "usnic_abi.h" +#include "usnic_fwd.h" +#include "usnic_vnic.h" + +/* + * The qp group struct represents all the hw resources needed to present a ib_qp + */ +struct usnic_ib_qp_grp { + struct ib_qp ibqp; + enum ib_qp_state state; + int grp_id; + + struct usnic_fwd_dev *ufdev; + struct usnic_ib_ucontext *ctx; + struct list_head flows_lst; + + struct usnic_vnic_res_chunk **res_chunk_list; + + pid_t owner_pid; + struct usnic_ib_vf *vf; + struct list_head link; + + spinlock_t lock; + + struct kobject kobj; +}; + +struct usnic_ib_qp_grp_flow { + struct usnic_fwd_flow *flow; + enum usnic_transport_type trans_type; + union { + struct { + uint16_t port_num; + } usnic_roce; + struct { + struct socket *sock; + } udp; + }; + struct usnic_ib_qp_grp *qp_grp; + struct list_head link; + + /* Debug FS */ + struct dentry *dbgfs_dentry; + char dentry_name[32]; +}; + +static const struct +usnic_vnic_res_spec min_transport_spec[USNIC_TRANSPORT_MAX] = { + { /*USNIC_TRANSPORT_UNKNOWN*/ + .resources = { + {.type = USNIC_VNIC_RES_TYPE_EOL, .cnt = 0,}, + }, + }, + { /*USNIC_TRANSPORT_ROCE_CUSTOM*/ + .resources = { + {.type = USNIC_VNIC_RES_TYPE_WQ, .cnt = 1,}, + {.type = USNIC_VNIC_RES_TYPE_RQ, .cnt = 1,}, + {.type = USNIC_VNIC_RES_TYPE_CQ, .cnt = 1,}, + {.type = USNIC_VNIC_RES_TYPE_EOL, .cnt = 0,}, + }, + }, + { /*USNIC_TRANSPORT_IPV4_UDP*/ + .resources = { + {.type = USNIC_VNIC_RES_TYPE_WQ, .cnt = 1,}, + {.type = USNIC_VNIC_RES_TYPE_RQ, .cnt = 1,}, + {.type = USNIC_VNIC_RES_TYPE_CQ, .cnt = 1,}, + {.type = USNIC_VNIC_RES_TYPE_EOL, .cnt = 0,}, + }, + }, +}; + +const char *usnic_ib_qp_grp_state_to_string(enum ib_qp_state state); +int usnic_ib_qp_grp_dump_hdr(char *buf, int buf_sz); +int usnic_ib_qp_grp_dump_rows(void *obj, char *buf, int buf_sz); +struct usnic_ib_qp_grp * +usnic_ib_qp_grp_create(struct usnic_fwd_dev *ufdev, struct usnic_ib_vf *vf, + struct usnic_ib_pd *pd, + struct usnic_vnic_res_spec *res_spec, + struct usnic_transport_spec *trans_spec); +void usnic_ib_qp_grp_destroy(struct usnic_ib_qp_grp *qp_grp); +int usnic_ib_qp_grp_modify(struct usnic_ib_qp_grp *qp_grp, + enum ib_qp_state new_state, + void *data); +struct usnic_vnic_res_chunk +*usnic_ib_qp_grp_get_chunk(struct usnic_ib_qp_grp *qp_grp, + enum usnic_vnic_res_type type); +static inline +struct usnic_ib_qp_grp *to_uqp_grp(struct ib_qp *ibqp) +{ + return container_of(ibqp, struct usnic_ib_qp_grp, ibqp); +} +#endif /* USNIC_IB_QP_GRP_H_ */ diff --git a/kernel/drivers/infiniband/hw/usnic/usnic_ib_sysfs.c b/kernel/drivers/infiniband/hw/usnic/usnic_ib_sysfs.c new file mode 100644 index 000000000..27dc67c16 --- /dev/null +++ b/kernel/drivers/infiniband/hw/usnic/usnic_ib_sysfs.c @@ -0,0 +1,341 @@ +/* + * Copyright (c) 2013, Cisco Systems, Inc. All rights reserved. + * + * This program is free software; you may redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; version 2 of the License. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ + +#include +#include +#include + +#include +#include + +#include "usnic_common_util.h" +#include "usnic_ib.h" +#include "usnic_ib_qp_grp.h" +#include "usnic_vnic.h" +#include "usnic_ib_verbs.h" +#include "usnic_log.h" + +static ssize_t usnic_ib_show_fw_ver(struct device *device, + struct device_attribute *attr, + char *buf) +{ + struct usnic_ib_dev *us_ibdev = + container_of(device, struct usnic_ib_dev, ib_dev.dev); + struct ethtool_drvinfo info; + + mutex_lock(&us_ibdev->usdev_lock); + us_ibdev->netdev->ethtool_ops->get_drvinfo(us_ibdev->netdev, &info); + mutex_unlock(&us_ibdev->usdev_lock); + + return scnprintf(buf, PAGE_SIZE, "%s\n", info.fw_version); +} + +static ssize_t usnic_ib_show_board(struct device *device, + struct device_attribute *attr, + char *buf) +{ + struct usnic_ib_dev *us_ibdev = + container_of(device, struct usnic_ib_dev, ib_dev.dev); + unsigned short subsystem_device_id; + + mutex_lock(&us_ibdev->usdev_lock); + subsystem_device_id = us_ibdev->pdev->subsystem_device; + mutex_unlock(&us_ibdev->usdev_lock); + + return scnprintf(buf, PAGE_SIZE, "%hu\n", subsystem_device_id); +} + +/* + * Report the configuration for this PF + */ +static ssize_t +usnic_ib_show_config(struct device *device, struct device_attribute *attr, + char *buf) +{ + struct usnic_ib_dev *us_ibdev; + char *ptr; + unsigned left; + unsigned n; + enum usnic_vnic_res_type res_type; + + us_ibdev = container_of(device, struct usnic_ib_dev, ib_dev.dev); + + /* Buffer space limit is 1 page */ + ptr = buf; + left = PAGE_SIZE; + + mutex_lock(&us_ibdev->usdev_lock); + if (atomic_read(&us_ibdev->vf_cnt.refcount) > 0) { + char *busname; + + /* + * bus name seems to come with annoying prefix. + * Remove it if it is predictable + */ + busname = us_ibdev->pdev->bus->name; + if (strncmp(busname, "PCI Bus ", 8) == 0) + busname += 8; + + n = scnprintf(ptr, left, + "%s: %s:%d.%d, %s, %pM, %u VFs\n Per VF:", + us_ibdev->ib_dev.name, + busname, + PCI_SLOT(us_ibdev->pdev->devfn), + PCI_FUNC(us_ibdev->pdev->devfn), + netdev_name(us_ibdev->netdev), + us_ibdev->ufdev->mac, + atomic_read(&us_ibdev->vf_cnt.refcount)); + UPDATE_PTR_LEFT(n, ptr, left); + + for (res_type = USNIC_VNIC_RES_TYPE_EOL; + res_type < USNIC_VNIC_RES_TYPE_MAX; + res_type++) { + if (us_ibdev->vf_res_cnt[res_type] == 0) + continue; + n = scnprintf(ptr, left, " %d %s%s", + us_ibdev->vf_res_cnt[res_type], + usnic_vnic_res_type_to_str(res_type), + (res_type < (USNIC_VNIC_RES_TYPE_MAX - 1)) ? 
+ "," : ""); + UPDATE_PTR_LEFT(n, ptr, left); + } + n = scnprintf(ptr, left, "\n"); + UPDATE_PTR_LEFT(n, ptr, left); + } else { + n = scnprintf(ptr, left, "%s: no VFs\n", + us_ibdev->ib_dev.name); + UPDATE_PTR_LEFT(n, ptr, left); + } + mutex_unlock(&us_ibdev->usdev_lock); + + return ptr - buf; +} + +static ssize_t +usnic_ib_show_iface(struct device *device, struct device_attribute *attr, + char *buf) +{ + struct usnic_ib_dev *us_ibdev; + + us_ibdev = container_of(device, struct usnic_ib_dev, ib_dev.dev); + + return scnprintf(buf, PAGE_SIZE, "%s\n", + netdev_name(us_ibdev->netdev)); +} + +static ssize_t +usnic_ib_show_max_vf(struct device *device, struct device_attribute *attr, + char *buf) +{ + struct usnic_ib_dev *us_ibdev; + + us_ibdev = container_of(device, struct usnic_ib_dev, ib_dev.dev); + + return scnprintf(buf, PAGE_SIZE, "%u\n", + atomic_read(&us_ibdev->vf_cnt.refcount)); +} + +static ssize_t +usnic_ib_show_qp_per_vf(struct device *device, struct device_attribute *attr, + char *buf) +{ + struct usnic_ib_dev *us_ibdev; + int qp_per_vf; + + us_ibdev = container_of(device, struct usnic_ib_dev, ib_dev.dev); + qp_per_vf = max(us_ibdev->vf_res_cnt[USNIC_VNIC_RES_TYPE_WQ], + us_ibdev->vf_res_cnt[USNIC_VNIC_RES_TYPE_RQ]); + + return scnprintf(buf, PAGE_SIZE, + "%d\n", qp_per_vf); +} + +static ssize_t +usnic_ib_show_cq_per_vf(struct device *device, struct device_attribute *attr, + char *buf) +{ + struct usnic_ib_dev *us_ibdev; + + us_ibdev = container_of(device, struct usnic_ib_dev, ib_dev.dev); + + return scnprintf(buf, PAGE_SIZE, "%d\n", + us_ibdev->vf_res_cnt[USNIC_VNIC_RES_TYPE_CQ]); +} + +static DEVICE_ATTR(fw_ver, S_IRUGO, usnic_ib_show_fw_ver, NULL); +static DEVICE_ATTR(board_id, S_IRUGO, usnic_ib_show_board, NULL); +static DEVICE_ATTR(config, S_IRUGO, usnic_ib_show_config, NULL); +static DEVICE_ATTR(iface, S_IRUGO, usnic_ib_show_iface, NULL); +static DEVICE_ATTR(max_vf, S_IRUGO, usnic_ib_show_max_vf, NULL); +static DEVICE_ATTR(qp_per_vf, S_IRUGO, usnic_ib_show_qp_per_vf, NULL); +static DEVICE_ATTR(cq_per_vf, S_IRUGO, usnic_ib_show_cq_per_vf, NULL); + +static struct device_attribute *usnic_class_attributes[] = { + &dev_attr_fw_ver, + &dev_attr_board_id, + &dev_attr_config, + &dev_attr_iface, + &dev_attr_max_vf, + &dev_attr_qp_per_vf, + &dev_attr_cq_per_vf, +}; + +struct qpn_attribute { + struct attribute attr; + ssize_t (*show)(struct usnic_ib_qp_grp *, char *buf); +}; + +/* + * Definitions for supporting QPN entries in sysfs + */ +static ssize_t +usnic_ib_qpn_attr_show(struct kobject *kobj, struct attribute *attr, char *buf) +{ + struct usnic_ib_qp_grp *qp_grp; + struct qpn_attribute *qpn_attr; + + qp_grp = container_of(kobj, struct usnic_ib_qp_grp, kobj); + qpn_attr = container_of(attr, struct qpn_attribute, attr); + + return qpn_attr->show(qp_grp, buf); +} + +static const struct sysfs_ops usnic_ib_qpn_sysfs_ops = { + .show = usnic_ib_qpn_attr_show +}; + +#define QPN_ATTR_RO(NAME) \ +struct qpn_attribute qpn_attr_##NAME = __ATTR_RO(NAME) + +static ssize_t context_show(struct usnic_ib_qp_grp *qp_grp, char *buf) +{ + return scnprintf(buf, PAGE_SIZE, "0x%p\n", qp_grp->ctx); +} + +static ssize_t summary_show(struct usnic_ib_qp_grp *qp_grp, char *buf) +{ + int i, j, n; + int left; + char *ptr; + struct usnic_vnic_res_chunk *res_chunk; + struct usnic_vnic_res *vnic_res; + + left = PAGE_SIZE; + ptr = buf; + + n = scnprintf(ptr, left, + "QPN: %d State: (%s) PID: %u VF Idx: %hu ", + qp_grp->ibqp.qp_num, + usnic_ib_qp_grp_state_to_string(qp_grp->state), + qp_grp->owner_pid, + 
usnic_vnic_get_index(qp_grp->vf->vnic)); + UPDATE_PTR_LEFT(n, ptr, left); + + for (i = 0; qp_grp->res_chunk_list[i]; i++) { + res_chunk = qp_grp->res_chunk_list[i]; + for (j = 0; j < res_chunk->cnt; j++) { + vnic_res = res_chunk->res[j]; + n = scnprintf(ptr, left, "%s[%d] ", + usnic_vnic_res_type_to_str(vnic_res->type), + vnic_res->vnic_idx); + UPDATE_PTR_LEFT(n, ptr, left); + } + } + + n = scnprintf(ptr, left, "\n"); + UPDATE_PTR_LEFT(n, ptr, left); + + return ptr - buf; +} + +static QPN_ATTR_RO(context); +static QPN_ATTR_RO(summary); + +static struct attribute *usnic_ib_qpn_default_attrs[] = { + &qpn_attr_context.attr, + &qpn_attr_summary.attr, + NULL +}; + +static struct kobj_type usnic_ib_qpn_type = { + .sysfs_ops = &usnic_ib_qpn_sysfs_ops, + .default_attrs = usnic_ib_qpn_default_attrs +}; + +int usnic_ib_sysfs_register_usdev(struct usnic_ib_dev *us_ibdev) +{ + int i; + int err; + for (i = 0; i < ARRAY_SIZE(usnic_class_attributes); ++i) { + err = device_create_file(&us_ibdev->ib_dev.dev, + usnic_class_attributes[i]); + if (err) { + usnic_err("Failed to create device file %d for %s eith err %d", + i, us_ibdev->ib_dev.name, err); + return -EINVAL; + } + } + + /* create kernel object for looking at individual QPs */ + kobject_get(&us_ibdev->ib_dev.dev.kobj); + us_ibdev->qpn_kobj = kobject_create_and_add("qpn", + &us_ibdev->ib_dev.dev.kobj); + if (us_ibdev->qpn_kobj == NULL) { + kobject_put(&us_ibdev->ib_dev.dev.kobj); + return -ENOMEM; + } + + return 0; +} + +void usnic_ib_sysfs_unregister_usdev(struct usnic_ib_dev *us_ibdev) +{ + int i; + for (i = 0; i < ARRAY_SIZE(usnic_class_attributes); ++i) { + device_remove_file(&us_ibdev->ib_dev.dev, + usnic_class_attributes[i]); + } + + kobject_put(us_ibdev->qpn_kobj); +} + +void usnic_ib_sysfs_qpn_add(struct usnic_ib_qp_grp *qp_grp) +{ + struct usnic_ib_dev *us_ibdev; + int err; + + us_ibdev = qp_grp->vf->pf; + + err = kobject_init_and_add(&qp_grp->kobj, &usnic_ib_qpn_type, + kobject_get(us_ibdev->qpn_kobj), + "%d", qp_grp->grp_id); + if (err) { + kobject_put(us_ibdev->qpn_kobj); + return; + } +} + +void usnic_ib_sysfs_qpn_remove(struct usnic_ib_qp_grp *qp_grp) +{ + struct usnic_ib_dev *us_ibdev; + + us_ibdev = qp_grp->vf->pf; + + kobject_put(&qp_grp->kobj); + kobject_put(us_ibdev->qpn_kobj); +} diff --git a/kernel/drivers/infiniband/hw/usnic/usnic_ib_sysfs.h b/kernel/drivers/infiniband/hw/usnic/usnic_ib_sysfs.h new file mode 100644 index 000000000..0d09b493c --- /dev/null +++ b/kernel/drivers/infiniband/hw/usnic/usnic_ib_sysfs.h @@ -0,0 +1,29 @@ +/* + * Copyright (c) 2013, Cisco Systems, Inc. All rights reserved. + * + * This program is free software; you may redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; version 2 of the License. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + */ + +#ifndef USNIC_IB_SYSFS_H_ +#define USNIC_IB_SYSFS_H_ + +#include "usnic_ib.h" + +int usnic_ib_sysfs_register_usdev(struct usnic_ib_dev *us_ibdev); +void usnic_ib_sysfs_unregister_usdev(struct usnic_ib_dev *us_ibdev); +void usnic_ib_sysfs_qpn_add(struct usnic_ib_qp_grp *qp_grp); +void usnic_ib_sysfs_qpn_remove(struct usnic_ib_qp_grp *qp_grp); + +#endif /* !USNIC_IB_SYSFS_H_ */ diff --git a/kernel/drivers/infiniband/hw/usnic/usnic_ib_verbs.c b/kernel/drivers/infiniband/hw/usnic/usnic_ib_verbs.c new file mode 100644 index 000000000..53bd6a2d9 --- /dev/null +++ b/kernel/drivers/infiniband/hw/usnic/usnic_ib_verbs.c @@ -0,0 +1,768 @@ +/* + * Copyright (c) 2013, Cisco Systems, Inc. All rights reserved. + * + * This program is free software; you may redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; version 2 of the License. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ +#include +#include +#include +#include + +#include +#include + +#include "usnic_abi.h" +#include "usnic_ib.h" +#include "usnic_common_util.h" +#include "usnic_ib_qp_grp.h" +#include "usnic_fwd.h" +#include "usnic_log.h" +#include "usnic_uiom.h" +#include "usnic_transport.h" + +#define USNIC_DEFAULT_TRANSPORT USNIC_TRANSPORT_ROCE_CUSTOM + +static void usnic_ib_fw_string_to_u64(char *fw_ver_str, u64 *fw_ver) +{ + *fw_ver = (u64) *fw_ver_str; +} + +static int usnic_ib_fill_create_qp_resp(struct usnic_ib_qp_grp *qp_grp, + struct ib_udata *udata) +{ + struct usnic_ib_dev *us_ibdev; + struct usnic_ib_create_qp_resp resp; + struct pci_dev *pdev; + struct vnic_dev_bar *bar; + struct usnic_vnic_res_chunk *chunk; + struct usnic_ib_qp_grp_flow *default_flow; + int i, err; + + memset(&resp, 0, sizeof(resp)); + + us_ibdev = qp_grp->vf->pf; + pdev = usnic_vnic_get_pdev(qp_grp->vf->vnic); + if (!pdev) { + usnic_err("Failed to get pdev of qp_grp %d\n", + qp_grp->grp_id); + return -EFAULT; + } + + bar = usnic_vnic_get_bar(qp_grp->vf->vnic, 0); + if (!bar) { + usnic_err("Failed to get bar0 of qp_grp %d vf %s", + qp_grp->grp_id, pci_name(pdev)); + return -EFAULT; + } + + resp.vfid = usnic_vnic_get_index(qp_grp->vf->vnic); + resp.bar_bus_addr = bar->bus_addr; + resp.bar_len = bar->len; + + chunk = usnic_ib_qp_grp_get_chunk(qp_grp, USNIC_VNIC_RES_TYPE_RQ); + if (IS_ERR_OR_NULL(chunk)) { + usnic_err("Failed to get chunk %s for qp_grp %d with err %ld\n", + usnic_vnic_res_type_to_str(USNIC_VNIC_RES_TYPE_RQ), + qp_grp->grp_id, + PTR_ERR(chunk)); + return chunk ? PTR_ERR(chunk) : -ENOMEM; + } + + WARN_ON(chunk->type != USNIC_VNIC_RES_TYPE_RQ); + resp.rq_cnt = chunk->cnt; + for (i = 0; i < chunk->cnt; i++) + resp.rq_idx[i] = chunk->res[i]->vnic_idx; + + chunk = usnic_ib_qp_grp_get_chunk(qp_grp, USNIC_VNIC_RES_TYPE_WQ); + if (IS_ERR_OR_NULL(chunk)) { + usnic_err("Failed to get chunk %s for qp_grp %d with err %ld\n", + usnic_vnic_res_type_to_str(USNIC_VNIC_RES_TYPE_WQ), + qp_grp->grp_id, + PTR_ERR(chunk)); + return chunk ? 
PTR_ERR(chunk) : -ENOMEM; + } + + WARN_ON(chunk->type != USNIC_VNIC_RES_TYPE_WQ); + resp.wq_cnt = chunk->cnt; + for (i = 0; i < chunk->cnt; i++) + resp.wq_idx[i] = chunk->res[i]->vnic_idx; + + chunk = usnic_ib_qp_grp_get_chunk(qp_grp, USNIC_VNIC_RES_TYPE_CQ); + if (IS_ERR_OR_NULL(chunk)) { + usnic_err("Failed to get chunk %s for qp_grp %d with err %ld\n", + usnic_vnic_res_type_to_str(USNIC_VNIC_RES_TYPE_CQ), + qp_grp->grp_id, + PTR_ERR(chunk)); + return chunk ? PTR_ERR(chunk) : -ENOMEM; + } + + WARN_ON(chunk->type != USNIC_VNIC_RES_TYPE_CQ); + resp.cq_cnt = chunk->cnt; + for (i = 0; i < chunk->cnt; i++) + resp.cq_idx[i] = chunk->res[i]->vnic_idx; + + default_flow = list_first_entry(&qp_grp->flows_lst, + struct usnic_ib_qp_grp_flow, link); + resp.transport = default_flow->trans_type; + + err = ib_copy_to_udata(udata, &resp, sizeof(resp)); + if (err) { + usnic_err("Failed to copy udata for %s", us_ibdev->ib_dev.name); + return err; + } + + return 0; +} + +static struct usnic_ib_qp_grp* +find_free_vf_and_create_qp_grp(struct usnic_ib_dev *us_ibdev, + struct usnic_ib_pd *pd, + struct usnic_transport_spec *trans_spec, + struct usnic_vnic_res_spec *res_spec) +{ + struct usnic_ib_vf *vf; + struct usnic_vnic *vnic; + struct usnic_ib_qp_grp *qp_grp; + struct device *dev, **dev_list; + int i, found = 0; + + BUG_ON(!mutex_is_locked(&us_ibdev->usdev_lock)); + + if (list_empty(&us_ibdev->vf_dev_list)) { + usnic_info("No vfs to allocate\n"); + return NULL; + } + + if (usnic_ib_share_vf) { + /* Try to find resouces on a used vf which is in pd */ + dev_list = usnic_uiom_get_dev_list(pd->umem_pd); + for (i = 0; dev_list[i]; i++) { + dev = dev_list[i]; + vf = pci_get_drvdata(to_pci_dev(dev)); + spin_lock(&vf->lock); + vnic = vf->vnic; + if (!usnic_vnic_check_room(vnic, res_spec)) { + usnic_dbg("Found used vnic %s from %s\n", + us_ibdev->ib_dev.name, + pci_name(usnic_vnic_get_pdev( + vnic))); + found = 1; + break; + } + spin_unlock(&vf->lock); + + } + usnic_uiom_free_dev_list(dev_list); + } + + if (!found) { + /* Try to find resources on an unused vf */ + list_for_each_entry(vf, &us_ibdev->vf_dev_list, link) { + spin_lock(&vf->lock); + vnic = vf->vnic; + if (vf->qp_grp_ref_cnt == 0 && + usnic_vnic_check_room(vnic, res_spec) == 0) { + found = 1; + break; + } + spin_unlock(&vf->lock); + } + } + + if (!found) { + usnic_info("No free qp grp found on %s\n", + us_ibdev->ib_dev.name); + return ERR_PTR(-ENOMEM); + } + + qp_grp = usnic_ib_qp_grp_create(us_ibdev->ufdev, vf, pd, res_spec, + trans_spec); + spin_unlock(&vf->lock); + if (IS_ERR_OR_NULL(qp_grp)) { + usnic_err("Failed to allocate qp_grp\n"); + return ERR_PTR(qp_grp ? 
PTR_ERR(qp_grp) : -ENOMEM); + } + + return qp_grp; +} + +static void qp_grp_destroy(struct usnic_ib_qp_grp *qp_grp) +{ + struct usnic_ib_vf *vf = qp_grp->vf; + + WARN_ON(qp_grp->state != IB_QPS_RESET); + + spin_lock(&vf->lock); + usnic_ib_qp_grp_destroy(qp_grp); + spin_unlock(&vf->lock); +} + +static void eth_speed_to_ib_speed(int speed, u8 *active_speed, + u8 *active_width) +{ + if (speed <= 10000) { + *active_width = IB_WIDTH_1X; + *active_speed = IB_SPEED_FDR10; + } else if (speed <= 20000) { + *active_width = IB_WIDTH_4X; + *active_speed = IB_SPEED_DDR; + } else if (speed <= 30000) { + *active_width = IB_WIDTH_4X; + *active_speed = IB_SPEED_QDR; + } else if (speed <= 40000) { + *active_width = IB_WIDTH_4X; + *active_speed = IB_SPEED_FDR10; + } else { + *active_width = IB_WIDTH_4X; + *active_speed = IB_SPEED_EDR; + } +} + +static int create_qp_validate_user_data(struct usnic_ib_create_qp_cmd cmd) +{ + if (cmd.spec.trans_type <= USNIC_TRANSPORT_UNKNOWN || + cmd.spec.trans_type >= USNIC_TRANSPORT_MAX) + return -EINVAL; + + return 0; +} + +/* Start of ib callback functions */ + +enum rdma_link_layer usnic_ib_port_link_layer(struct ib_device *device, + u8 port_num) +{ + return IB_LINK_LAYER_ETHERNET; +} + +int usnic_ib_query_device(struct ib_device *ibdev, + struct ib_device_attr *props) +{ + struct usnic_ib_dev *us_ibdev = to_usdev(ibdev); + union ib_gid gid; + struct ethtool_drvinfo info; + struct ethtool_cmd cmd; + int qp_per_vf; + + usnic_dbg("\n"); + mutex_lock(&us_ibdev->usdev_lock); + us_ibdev->netdev->ethtool_ops->get_drvinfo(us_ibdev->netdev, &info); + us_ibdev->netdev->ethtool_ops->get_settings(us_ibdev->netdev, &cmd); + memset(props, 0, sizeof(*props)); + usnic_mac_ip_to_gid(us_ibdev->ufdev->mac, us_ibdev->ufdev->inaddr, + &gid.raw[0]); + memcpy(&props->sys_image_guid, &gid.global.interface_id, + sizeof(gid.global.interface_id)); + usnic_ib_fw_string_to_u64(&info.fw_version[0], &props->fw_ver); + props->max_mr_size = USNIC_UIOM_MAX_MR_SIZE; + props->page_size_cap = USNIC_UIOM_PAGE_SIZE; + props->vendor_id = PCI_VENDOR_ID_CISCO; + props->vendor_part_id = PCI_DEVICE_ID_CISCO_VIC_USPACE_NIC; + props->hw_ver = us_ibdev->pdev->subsystem_device; + qp_per_vf = max(us_ibdev->vf_res_cnt[USNIC_VNIC_RES_TYPE_WQ], + us_ibdev->vf_res_cnt[USNIC_VNIC_RES_TYPE_RQ]); + props->max_qp = qp_per_vf * + atomic_read(&us_ibdev->vf_cnt.refcount); + props->device_cap_flags = IB_DEVICE_PORT_ACTIVE_EVENT | + IB_DEVICE_SYS_IMAGE_GUID | IB_DEVICE_BLOCK_MULTICAST_LOOPBACK; + props->max_cq = us_ibdev->vf_res_cnt[USNIC_VNIC_RES_TYPE_CQ] * + atomic_read(&us_ibdev->vf_cnt.refcount); + props->max_pd = USNIC_UIOM_MAX_PD_CNT; + props->max_mr = USNIC_UIOM_MAX_MR_CNT; + props->local_ca_ack_delay = 0; + props->max_pkeys = 0; + props->atomic_cap = IB_ATOMIC_NONE; + props->masked_atomic_cap = props->atomic_cap; + props->max_qp_rd_atom = 0; + props->max_qp_init_rd_atom = 0; + props->max_res_rd_atom = 0; + props->max_srq = 0; + props->max_srq_wr = 0; + props->max_srq_sge = 0; + props->max_fast_reg_page_list_len = 0; + props->max_mcast_grp = 0; + props->max_mcast_qp_attach = 0; + props->max_total_mcast_qp_attach = 0; + props->max_map_per_fmr = 0; + /* Owned by Userspace + * max_qp_wr, max_sge, max_sge_rd, max_cqe */ + mutex_unlock(&us_ibdev->usdev_lock); + + return 0; +} + +int usnic_ib_query_port(struct ib_device *ibdev, u8 port, + struct ib_port_attr *props) +{ + struct usnic_ib_dev *us_ibdev = to_usdev(ibdev); + struct ethtool_cmd cmd; + + usnic_dbg("\n"); + + mutex_lock(&us_ibdev->usdev_lock); + 
us_ibdev->netdev->ethtool_ops->get_settings(us_ibdev->netdev, &cmd); + memset(props, 0, sizeof(*props)); + + props->lid = 0; + props->lmc = 1; + props->sm_lid = 0; + props->sm_sl = 0; + + if (!us_ibdev->ufdev->link_up) { + props->state = IB_PORT_DOWN; + props->phys_state = 3; + } else if (!us_ibdev->ufdev->inaddr) { + props->state = IB_PORT_INIT; + props->phys_state = 4; + } else { + props->state = IB_PORT_ACTIVE; + props->phys_state = 5; + } + + props->port_cap_flags = 0; + props->gid_tbl_len = 1; + props->pkey_tbl_len = 1; + props->bad_pkey_cntr = 0; + props->qkey_viol_cntr = 0; + eth_speed_to_ib_speed(cmd.speed, &props->active_speed, + &props->active_width); + props->max_mtu = IB_MTU_4096; + props->active_mtu = iboe_get_mtu(us_ibdev->ufdev->mtu); + /* Userspace will adjust for hdrs */ + props->max_msg_sz = us_ibdev->ufdev->mtu; + props->max_vl_num = 1; + mutex_unlock(&us_ibdev->usdev_lock); + + return 0; +} + +int usnic_ib_query_qp(struct ib_qp *qp, struct ib_qp_attr *qp_attr, + int qp_attr_mask, + struct ib_qp_init_attr *qp_init_attr) +{ + struct usnic_ib_qp_grp *qp_grp; + struct usnic_ib_vf *vf; + int err; + + usnic_dbg("\n"); + + memset(qp_attr, 0, sizeof(*qp_attr)); + memset(qp_init_attr, 0, sizeof(*qp_init_attr)); + + qp_grp = to_uqp_grp(qp); + vf = qp_grp->vf; + mutex_lock(&vf->pf->usdev_lock); + usnic_dbg("\n"); + qp_attr->qp_state = qp_grp->state; + qp_attr->cur_qp_state = qp_grp->state; + + switch (qp_grp->ibqp.qp_type) { + case IB_QPT_UD: + qp_attr->qkey = 0; + break; + default: + usnic_err("Unexpected qp_type %d\n", qp_grp->ibqp.qp_type); + err = -EINVAL; + goto err_out; + } + + mutex_unlock(&vf->pf->usdev_lock); + return 0; + +err_out: + mutex_unlock(&vf->pf->usdev_lock); + return err; +} + +int usnic_ib_query_gid(struct ib_device *ibdev, u8 port, int index, + union ib_gid *gid) +{ + + struct usnic_ib_dev *us_ibdev = to_usdev(ibdev); + usnic_dbg("\n"); + + if (index > 1) + return -EINVAL; + + mutex_lock(&us_ibdev->usdev_lock); + memset(&(gid->raw[0]), 0, sizeof(gid->raw)); + usnic_mac_ip_to_gid(us_ibdev->ufdev->mac, us_ibdev->ufdev->inaddr, + &gid->raw[0]); + mutex_unlock(&us_ibdev->usdev_lock); + + return 0; +} + +int usnic_ib_query_pkey(struct ib_device *ibdev, u8 port, u16 index, + u16 *pkey) +{ + if (index > 1) + return -EINVAL; + + *pkey = 0xffff; + return 0; +} + +struct ib_pd *usnic_ib_alloc_pd(struct ib_device *ibdev, + struct ib_ucontext *context, + struct ib_udata *udata) +{ + struct usnic_ib_pd *pd; + void *umem_pd; + + usnic_dbg("\n"); + + pd = kzalloc(sizeof(*pd), GFP_KERNEL); + if (!pd) + return ERR_PTR(-ENOMEM); + + umem_pd = pd->umem_pd = usnic_uiom_alloc_pd(); + if (IS_ERR_OR_NULL(umem_pd)) { + kfree(pd); + return ERR_PTR(umem_pd ? 
PTR_ERR(umem_pd) : -ENOMEM); + } + + usnic_info("domain 0x%p allocated for context 0x%p and device %s\n", + pd, context, ibdev->name); + return &pd->ibpd; +} + +int usnic_ib_dealloc_pd(struct ib_pd *pd) +{ + usnic_info("freeing domain 0x%p\n", pd); + + usnic_uiom_dealloc_pd((to_upd(pd))->umem_pd); + kfree(pd); + return 0; +} + +struct ib_qp *usnic_ib_create_qp(struct ib_pd *pd, + struct ib_qp_init_attr *init_attr, + struct ib_udata *udata) +{ + int err; + struct usnic_ib_dev *us_ibdev; + struct usnic_ib_qp_grp *qp_grp; + struct usnic_ib_ucontext *ucontext; + int cq_cnt; + struct usnic_vnic_res_spec res_spec; + struct usnic_ib_create_qp_cmd cmd; + struct usnic_transport_spec trans_spec; + + usnic_dbg("\n"); + + ucontext = to_uucontext(pd->uobject->context); + us_ibdev = to_usdev(pd->device); + + if (init_attr->create_flags) + return ERR_PTR(-EINVAL); + + err = ib_copy_from_udata(&cmd, udata, sizeof(cmd)); + if (err) { + usnic_err("%s: cannot copy udata for create_qp\n", + us_ibdev->ib_dev.name); + return ERR_PTR(-EINVAL); + } + + err = create_qp_validate_user_data(cmd); + if (err) { + usnic_err("%s: Failed to validate user data\n", + us_ibdev->ib_dev.name); + return ERR_PTR(-EINVAL); + } + + if (init_attr->qp_type != IB_QPT_UD) { + usnic_err("%s asked to make a non-UD QP: %d\n", + us_ibdev->ib_dev.name, init_attr->qp_type); + return ERR_PTR(-EINVAL); + } + + trans_spec = cmd.spec; + mutex_lock(&us_ibdev->usdev_lock); + cq_cnt = (init_attr->send_cq == init_attr->recv_cq) ? 1 : 2; + res_spec = min_transport_spec[trans_spec.trans_type]; + usnic_vnic_res_spec_update(&res_spec, USNIC_VNIC_RES_TYPE_CQ, cq_cnt); + qp_grp = find_free_vf_and_create_qp_grp(us_ibdev, to_upd(pd), + &trans_spec, + &res_spec); + if (IS_ERR_OR_NULL(qp_grp)) { + err = qp_grp ? PTR_ERR(qp_grp) : -ENOMEM; + goto out_release_mutex; + } + + err = usnic_ib_fill_create_qp_resp(qp_grp, udata); + if (err) { + err = -EBUSY; + goto out_release_qp_grp; + } + + qp_grp->ctx = ucontext; + list_add_tail(&qp_grp->link, &ucontext->qp_grp_list); + usnic_ib_log_vf(qp_grp->vf); + mutex_unlock(&us_ibdev->usdev_lock); + return &qp_grp->ibqp; + +out_release_qp_grp: + qp_grp_destroy(qp_grp); +out_release_mutex: + mutex_unlock(&us_ibdev->usdev_lock); + return ERR_PTR(err); +} + +int usnic_ib_destroy_qp(struct ib_qp *qp) +{ + struct usnic_ib_qp_grp *qp_grp; + struct usnic_ib_vf *vf; + + usnic_dbg("\n"); + + qp_grp = to_uqp_grp(qp); + vf = qp_grp->vf; + mutex_lock(&vf->pf->usdev_lock); + if (usnic_ib_qp_grp_modify(qp_grp, IB_QPS_RESET, NULL)) { + usnic_err("Failed to move qp grp %u to reset\n", + qp_grp->grp_id); + } + + list_del(&qp_grp->link); + qp_grp_destroy(qp_grp); + mutex_unlock(&vf->pf->usdev_lock); + + return 0; +} + +int usnic_ib_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, + int attr_mask, struct ib_udata *udata) +{ + struct usnic_ib_qp_grp *qp_grp; + int status; + usnic_dbg("\n"); + + qp_grp = to_uqp_grp(ibqp); + + /* TODO: Future Support All States */ + mutex_lock(&qp_grp->vf->pf->usdev_lock); + if ((attr_mask & IB_QP_STATE) && attr->qp_state == IB_QPS_INIT) { + status = usnic_ib_qp_grp_modify(qp_grp, IB_QPS_INIT, NULL); + } else if ((attr_mask & IB_QP_STATE) && attr->qp_state == IB_QPS_RTR) { + status = usnic_ib_qp_grp_modify(qp_grp, IB_QPS_RTR, NULL); + } else if ((attr_mask & IB_QP_STATE) && attr->qp_state == IB_QPS_RTS) { + status = usnic_ib_qp_grp_modify(qp_grp, IB_QPS_RTS, NULL); + } else { + usnic_err("Unexpected combination mask: %u state: %u\n", + attr_mask & IB_QP_STATE, attr->qp_state); + status = -EINVAL; + } + + 
mutex_unlock(&qp_grp->vf->pf->usdev_lock); + return status; +} + +struct ib_cq *usnic_ib_create_cq(struct ib_device *ibdev, int entries, + int vector, struct ib_ucontext *context, + struct ib_udata *udata) +{ + struct ib_cq *cq; + + usnic_dbg("\n"); + cq = kzalloc(sizeof(*cq), GFP_KERNEL); + if (!cq) + return ERR_PTR(-EBUSY); + + return cq; +} + +int usnic_ib_destroy_cq(struct ib_cq *cq) +{ + usnic_dbg("\n"); + kfree(cq); + return 0; +} + +struct ib_mr *usnic_ib_reg_mr(struct ib_pd *pd, u64 start, u64 length, + u64 virt_addr, int access_flags, + struct ib_udata *udata) +{ + struct usnic_ib_mr *mr; + int err; + + usnic_dbg("start 0x%llx va 0x%llx length 0x%llx\n", start, + virt_addr, length); + + mr = kzalloc(sizeof(*mr), GFP_KERNEL); + if (IS_ERR_OR_NULL(mr)) + return ERR_PTR(mr ? PTR_ERR(mr) : -ENOMEM); + + mr->umem = usnic_uiom_reg_get(to_upd(pd)->umem_pd, start, length, + access_flags, 0); + if (IS_ERR_OR_NULL(mr->umem)) { + err = mr->umem ? PTR_ERR(mr->umem) : -EFAULT; + goto err_free; + } + + mr->ibmr.lkey = mr->ibmr.rkey = 0; + return &mr->ibmr; + +err_free: + kfree(mr); + return ERR_PTR(err); +} + +int usnic_ib_dereg_mr(struct ib_mr *ibmr) +{ + struct usnic_ib_mr *mr = to_umr(ibmr); + + usnic_dbg("va 0x%lx length 0x%zx\n", mr->umem->va, mr->umem->length); + + usnic_uiom_reg_release(mr->umem, ibmr->pd->uobject->context->closing); + kfree(mr); + return 0; +} + +struct ib_ucontext *usnic_ib_alloc_ucontext(struct ib_device *ibdev, + struct ib_udata *udata) +{ + struct usnic_ib_ucontext *context; + struct usnic_ib_dev *us_ibdev = to_usdev(ibdev); + usnic_dbg("\n"); + + context = kmalloc(sizeof(*context), GFP_KERNEL); + if (!context) + return ERR_PTR(-ENOMEM); + + INIT_LIST_HEAD(&context->qp_grp_list); + mutex_lock(&us_ibdev->usdev_lock); + list_add_tail(&context->link, &us_ibdev->ctx_list); + mutex_unlock(&us_ibdev->usdev_lock); + + return &context->ibucontext; +} + +int usnic_ib_dealloc_ucontext(struct ib_ucontext *ibcontext) +{ + struct usnic_ib_ucontext *context = to_uucontext(ibcontext); + struct usnic_ib_dev *us_ibdev = to_usdev(ibcontext->device); + usnic_dbg("\n"); + + mutex_lock(&us_ibdev->usdev_lock); + BUG_ON(!list_empty(&context->qp_grp_list)); + list_del(&context->link); + mutex_unlock(&us_ibdev->usdev_lock); + kfree(context); + return 0; +} + +int usnic_ib_mmap(struct ib_ucontext *context, + struct vm_area_struct *vma) +{ + struct usnic_ib_ucontext *uctx = to_ucontext(context); + struct usnic_ib_dev *us_ibdev; + struct usnic_ib_qp_grp *qp_grp; + struct usnic_ib_vf *vf; + struct vnic_dev_bar *bar; + dma_addr_t bus_addr; + unsigned int len; + unsigned int vfid; + + usnic_dbg("\n"); + + us_ibdev = to_usdev(context->device); + vma->vm_flags |= VM_IO; + vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot); + vfid = vma->vm_pgoff; + usnic_dbg("Page Offset %lu PAGE_SHIFT %u VFID %u\n", + vma->vm_pgoff, PAGE_SHIFT, vfid); + + mutex_lock(&us_ibdev->usdev_lock); + list_for_each_entry(qp_grp, &uctx->qp_grp_list, link) { + vf = qp_grp->vf; + if (usnic_vnic_get_index(vf->vnic) == vfid) { + bar = usnic_vnic_get_bar(vf->vnic, 0); + if ((vma->vm_end - vma->vm_start) != bar->len) { + usnic_err("Bar0 Len %lu - Request map %lu\n", + bar->len, + vma->vm_end - vma->vm_start); + mutex_unlock(&us_ibdev->usdev_lock); + return -EINVAL; + } + bus_addr = bar->bus_addr; + len = bar->len; + usnic_dbg("bus: %pa vaddr: %p size: %ld\n", + &bus_addr, bar->vaddr, bar->len); + mutex_unlock(&us_ibdev->usdev_lock); + + return remap_pfn_range(vma, + vma->vm_start, + bus_addr >> PAGE_SHIFT, + len, 
vma->vm_page_prot); + } + } + + mutex_unlock(&us_ibdev->usdev_lock); + usnic_err("No VF %u found\n", vfid); + return -EINVAL; +} + +/* In ib callbacks section - Start of stub funcs */ +struct ib_ah *usnic_ib_create_ah(struct ib_pd *pd, + struct ib_ah_attr *ah_attr) +{ + usnic_dbg("\n"); + return ERR_PTR(-EPERM); +} + +int usnic_ib_destroy_ah(struct ib_ah *ah) +{ + usnic_dbg("\n"); + return -EINVAL; +} + +int usnic_ib_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr, + struct ib_send_wr **bad_wr) +{ + usnic_dbg("\n"); + return -EINVAL; +} + +int usnic_ib_post_recv(struct ib_qp *ibqp, struct ib_recv_wr *wr, + struct ib_recv_wr **bad_wr) +{ + usnic_dbg("\n"); + return -EINVAL; +} + +int usnic_ib_poll_cq(struct ib_cq *ibcq, int num_entries, + struct ib_wc *wc) +{ + usnic_dbg("\n"); + return -EINVAL; +} + +int usnic_ib_req_notify_cq(struct ib_cq *cq, + enum ib_cq_notify_flags flags) +{ + usnic_dbg("\n"); + return -EINVAL; +} + +struct ib_mr *usnic_ib_get_dma_mr(struct ib_pd *pd, int acc) +{ + usnic_dbg("\n"); + return ERR_PTR(-ENOMEM); +} + + +/* In ib callbacks section - End of stub funcs */ +/* End of ib callbacks section */ diff --git a/kernel/drivers/infiniband/hw/usnic/usnic_ib_verbs.h b/kernel/drivers/infiniband/hw/usnic/usnic_ib_verbs.h new file mode 100644 index 000000000..bb864f5ae --- /dev/null +++ b/kernel/drivers/infiniband/hw/usnic/usnic_ib_verbs.h @@ -0,0 +1,72 @@ +/* + * Copyright (c) 2013, Cisco Systems, Inc. All rights reserved. + * + * This program is free software; you may redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; version 2 of the License. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + */ + +#ifndef USNIC_IB_VERBS_H_ +#define USNIC_IB_VERBS_H_ + +#include "usnic_ib.h" + +enum rdma_link_layer usnic_ib_port_link_layer(struct ib_device *device, + u8 port_num); +int usnic_ib_query_device(struct ib_device *ibdev, + struct ib_device_attr *props); +int usnic_ib_query_port(struct ib_device *ibdev, u8 port, + struct ib_port_attr *props); +int usnic_ib_query_qp(struct ib_qp *qp, struct ib_qp_attr *qp_attr, + int qp_attr_mask, + struct ib_qp_init_attr *qp_init_attr); +int usnic_ib_query_gid(struct ib_device *ibdev, u8 port, int index, + union ib_gid *gid); +int usnic_ib_query_pkey(struct ib_device *ibdev, u8 port, u16 index, + u16 *pkey); +struct ib_pd *usnic_ib_alloc_pd(struct ib_device *ibdev, + struct ib_ucontext *context, + struct ib_udata *udata); +int usnic_ib_dealloc_pd(struct ib_pd *pd); +struct ib_qp *usnic_ib_create_qp(struct ib_pd *pd, + struct ib_qp_init_attr *init_attr, + struct ib_udata *udata); +int usnic_ib_destroy_qp(struct ib_qp *qp); +int usnic_ib_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, + int attr_mask, struct ib_udata *udata); +struct ib_cq *usnic_ib_create_cq(struct ib_device *ibdev, int entries, + int vector, struct ib_ucontext *context, + struct ib_udata *udata); +int usnic_ib_destroy_cq(struct ib_cq *cq); +struct ib_mr *usnic_ib_reg_mr(struct ib_pd *pd, u64 start, u64 length, + u64 virt_addr, int access_flags, + struct ib_udata *udata); +int usnic_ib_dereg_mr(struct ib_mr *ibmr); +struct ib_ucontext *usnic_ib_alloc_ucontext(struct ib_device *ibdev, + struct ib_udata *udata); +int usnic_ib_dealloc_ucontext(struct ib_ucontext *ibcontext); +int usnic_ib_mmap(struct ib_ucontext *context, + struct vm_area_struct *vma); +struct ib_ah *usnic_ib_create_ah(struct ib_pd *pd, + struct ib_ah_attr *ah_attr); +int usnic_ib_destroy_ah(struct ib_ah *ah); +int usnic_ib_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr, + struct ib_send_wr **bad_wr); +int usnic_ib_post_recv(struct ib_qp *ibqp, struct ib_recv_wr *wr, + struct ib_recv_wr **bad_wr); +int usnic_ib_poll_cq(struct ib_cq *ibcq, int num_entries, + struct ib_wc *wc); +int usnic_ib_req_notify_cq(struct ib_cq *cq, + enum ib_cq_notify_flags flags); +struct ib_mr *usnic_ib_get_dma_mr(struct ib_pd *pd, int acc); +#endif /* !USNIC_IB_VERBS_H */ diff --git a/kernel/drivers/infiniband/hw/usnic/usnic_log.h b/kernel/drivers/infiniband/hw/usnic/usnic_log.h new file mode 100644 index 000000000..75777a66c --- /dev/null +++ b/kernel/drivers/infiniband/hw/usnic/usnic_log.h @@ -0,0 +1,58 @@ +/* + * Copyright (c) 2013, Cisco Systems, Inc. All rights reserved. + * + * This program is free software; you may redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; version 2 of the License. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + */ + +#ifndef USNIC_LOG_H_ +#define USNIC_LOG_H_ + +#include "usnic.h" + +extern unsigned int usnic_log_lvl; + +#define USNIC_LOG_LVL_NONE (0) +#define USNIC_LOG_LVL_ERR (1) +#define USNIC_LOG_LVL_INFO (2) +#define USNIC_LOG_LVL_DBG (3) + +#define usnic_printk(lvl, args...) \ + do { \ + printk(lvl "%s:%s:%d: ", DRV_NAME, __func__, \ + __LINE__); \ + printk(args); \ + } while (0) + +#define usnic_dbg(args...) \ + do { \ + if (unlikely(usnic_log_lvl >= USNIC_LOG_LVL_DBG)) { \ + usnic_printk(KERN_INFO, args); \ + } \ +} while (0) + +#define usnic_info(args...) \ +do { \ + if (usnic_log_lvl >= USNIC_LOG_LVL_INFO) { \ + usnic_printk(KERN_INFO, args); \ + } \ +} while (0) + +#define usnic_err(args...) \ + do { \ + if (usnic_log_lvl >= USNIC_LOG_LVL_ERR) { \ + usnic_printk(KERN_ERR, args); \ + } \ + } while (0) +#endif /* !USNIC_LOG_H_ */ diff --git a/kernel/drivers/infiniband/hw/usnic/usnic_transport.c b/kernel/drivers/infiniband/hw/usnic/usnic_transport.c new file mode 100644 index 000000000..ddef6f77a --- /dev/null +++ b/kernel/drivers/infiniband/hw/usnic/usnic_transport.c @@ -0,0 +1,202 @@ +/* + * Copyright (c) 2013, Cisco Systems, Inc. All rights reserved. + * + * This program is free software; you may redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; version 2 of the License. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ +#include +#include +#include +#include +#include + +#include "usnic_transport.h" +#include "usnic_log.h" + +/* ROCE */ +static unsigned long *roce_bitmap; +static u16 roce_next_port = 1; +#define ROCE_BITMAP_SZ ((1 << (8 /*CHAR_BIT*/ * sizeof(u16)))/8 /*CHAR BIT*/) +static DEFINE_SPINLOCK(roce_bitmap_lock); + +const char *usnic_transport_to_str(enum usnic_transport_type type) +{ + switch (type) { + case USNIC_TRANSPORT_UNKNOWN: + return "Unknown"; + case USNIC_TRANSPORT_ROCE_CUSTOM: + return "roce custom"; + case USNIC_TRANSPORT_IPV4_UDP: + return "IPv4 UDP"; + case USNIC_TRANSPORT_MAX: + return "Max?"; + default: + return "Not known"; + } +} + +int usnic_transport_sock_to_str(char *buf, int buf_sz, + struct socket *sock) +{ + int err; + uint32_t addr; + uint16_t port; + int proto; + + memset(buf, 0, buf_sz); + err = usnic_transport_sock_get_addr(sock, &proto, &addr, &port); + if (err) + return 0; + + return scnprintf(buf, buf_sz, "Proto:%u Addr:%pI4h Port:%hu", + proto, &addr, port); +} + +/* + * reserve a port number. if "0" specified, we will try to pick one + * starting at roce_next_port. 
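Ports are tracked in a bitmap + * covering the full 16-bit port space (bit 0 is permanently reserved at + * init time), and 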
roce_next_port will take on the values + * 1..4096 + */ +u16 usnic_transport_rsrv_port(enum usnic_transport_type type, u16 port_num) +{ + if (type == USNIC_TRANSPORT_ROCE_CUSTOM) { + spin_lock(&roce_bitmap_lock); + if (!port_num) { + port_num = bitmap_find_next_zero_area(roce_bitmap, + ROCE_BITMAP_SZ, + roce_next_port /* start */, + 1 /* nr */, + 0 /* align */); + roce_next_port = (port_num & 4095) + 1; + } else if (test_bit(port_num, roce_bitmap)) { + usnic_err("Failed to allocate port for %s\n", + usnic_transport_to_str(type)); + spin_unlock(&roce_bitmap_lock); + goto out_fail; + } + bitmap_set(roce_bitmap, port_num, 1); + spin_unlock(&roce_bitmap_lock); + } else { + usnic_err("Failed to allocate port - transport %s unsupported\n", + usnic_transport_to_str(type)); + goto out_fail; + } + + usnic_dbg("Allocating port %hu for %s\n", port_num, + usnic_transport_to_str(type)); + return port_num; + +out_fail: + return 0; +} + +void usnic_transport_unrsrv_port(enum usnic_transport_type type, u16 port_num) +{ + if (type == USNIC_TRANSPORT_ROCE_CUSTOM) { + spin_lock(&roce_bitmap_lock); + if (!port_num) { + usnic_err("Unreserved unvalid port num 0 for %s\n", + usnic_transport_to_str(type)); + goto out_roce_custom; + } + + if (!test_bit(port_num, roce_bitmap)) { + usnic_err("Unreserving invalid %hu for %s\n", + port_num, + usnic_transport_to_str(type)); + goto out_roce_custom; + } + bitmap_clear(roce_bitmap, port_num, 1); + usnic_dbg("Freeing port %hu for %s\n", port_num, + usnic_transport_to_str(type)); +out_roce_custom: + spin_unlock(&roce_bitmap_lock); + } else { + usnic_err("Freeing invalid port %hu for %d\n", port_num, type); + } +} + +struct socket *usnic_transport_get_socket(int sock_fd) +{ + struct socket *sock; + int err; + char buf[25]; + + /* sockfd_lookup will internally do a fget */ + sock = sockfd_lookup(sock_fd, &err); + if (!sock) { + usnic_err("Unable to lookup socket for fd %d with err %d\n", + sock_fd, err); + return ERR_PTR(-ENOENT); + } + + usnic_transport_sock_to_str(buf, sizeof(buf), sock); + usnic_dbg("Get sock %s\n", buf); + + return sock; +} + +void usnic_transport_put_socket(struct socket *sock) +{ + char buf[100]; + + usnic_transport_sock_to_str(buf, sizeof(buf), sock); + usnic_dbg("Put sock %s\n", buf); + sockfd_put(sock); +} + +int usnic_transport_sock_get_addr(struct socket *sock, int *proto, + uint32_t *addr, uint16_t *port) +{ + int len; + int err; + struct sockaddr_in sock_addr; + + err = sock->ops->getname(sock, + (struct sockaddr *)&sock_addr, + &len, 0); + if (err) + return err; + + if (sock_addr.sin_family != AF_INET) + return -EINVAL; + + if (proto) + *proto = sock->sk->sk_protocol; + if (port) + *port = ntohs(((struct sockaddr_in *)&sock_addr)->sin_port); + if (addr) + *addr = ntohl(((struct sockaddr_in *) + &sock_addr)->sin_addr.s_addr); + + return 0; +} + +int usnic_transport_init(void) +{ + roce_bitmap = kzalloc(ROCE_BITMAP_SZ, GFP_KERNEL); + if (!roce_bitmap) { + usnic_err("Failed to allocate bit map"); + return -ENOMEM; + } + + /* Do not ever allocate bit 0, hence set it here */ + bitmap_set(roce_bitmap, 0, 1); + return 0; +} + +void usnic_transport_fini(void) +{ + kfree(roce_bitmap); +} diff --git a/kernel/drivers/infiniband/hw/usnic/usnic_transport.h b/kernel/drivers/infiniband/hw/usnic/usnic_transport.h new file mode 100644 index 000000000..7e5dc6d9f --- /dev/null +++ b/kernel/drivers/infiniband/hw/usnic/usnic_transport.h @@ -0,0 +1,51 @@ +/* + * Copyright (c) 2013, Cisco Systems, Inc. All rights reserved. 
+ * + * This program is free software; you may redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; version 2 of the License. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ + +#ifndef USNIC_TRANSPORT_H_ +#define USNIC_TRANSPORT_H_ + +#include "usnic_abi.h" + +const char *usnic_transport_to_str(enum usnic_transport_type trans_type); +/* + * Returns number of bytes written, excluding null terminator. If + * nothing was written, the function returns 0. + */ +int usnic_transport_sock_to_str(char *buf, int buf_sz, + struct socket *sock); +/* + * Reserve a port. If "port_num" is set, then the function will try + * to reserve that particular port. + */ +u16 usnic_transport_rsrv_port(enum usnic_transport_type type, u16 port_num); +void usnic_transport_unrsrv_port(enum usnic_transport_type type, u16 port_num); +/* + * Do a fget on the socket refered to by sock_fd and returns the socket. + * Socket will not be destroyed before usnic_transport_put_socket has + * been called. + */ +struct socket *usnic_transport_get_socket(int sock_fd); +void usnic_transport_put_socket(struct socket *sock); +/* + * Call usnic_transport_get_socket before calling *_sock_get_addr + */ +int usnic_transport_sock_get_addr(struct socket *sock, int *proto, + uint32_t *addr, uint16_t *port); +int usnic_transport_init(void); +void usnic_transport_fini(void); +#endif /* !USNIC_TRANSPORT_H */ diff --git a/kernel/drivers/infiniband/hw/usnic/usnic_uiom.c b/kernel/drivers/infiniband/hw/usnic/usnic_uiom.c new file mode 100644 index 000000000..417de1f32 --- /dev/null +++ b/kernel/drivers/infiniband/hw/usnic/usnic_uiom.c @@ -0,0 +1,604 @@ +/* + * Copyright (c) 2005 Topspin Communications. All rights reserved. + * Copyright (c) 2005 Mellanox Technologies. All rights reserved. + * Copyright (c) 2013 Cisco Systems. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "usnic_log.h" +#include "usnic_uiom.h" +#include "usnic_uiom_interval_tree.h" + +static struct workqueue_struct *usnic_uiom_wq; + +#define USNIC_UIOM_PAGE_CHUNK \ + ((PAGE_SIZE - offsetof(struct usnic_uiom_chunk, page_list)) /\ + ((void *) &((struct usnic_uiom_chunk *) 0)->page_list[1] - \ + (void *) &((struct usnic_uiom_chunk *) 0)->page_list[0])) + +static void usnic_uiom_reg_account(struct work_struct *work) +{ + struct usnic_uiom_reg *umem = container_of(work, + struct usnic_uiom_reg, work); + + down_write(&umem->mm->mmap_sem); + umem->mm->locked_vm -= umem->diff; + up_write(&umem->mm->mmap_sem); + mmput(umem->mm); + kfree(umem); +} + +static int usnic_uiom_dma_fault(struct iommu_domain *domain, + struct device *dev, + unsigned long iova, int flags, + void *token) +{ + usnic_err("Device %s iommu fault domain 0x%pK va 0x%lx flags 0x%x\n", + dev_name(dev), + domain, iova, flags); + return -ENOSYS; +} + +static void usnic_uiom_put_pages(struct list_head *chunk_list, int dirty) +{ + struct usnic_uiom_chunk *chunk, *tmp; + struct page *page; + struct scatterlist *sg; + int i; + dma_addr_t pa; + + list_for_each_entry_safe(chunk, tmp, chunk_list, list) { + for_each_sg(chunk->page_list, sg, chunk->nents, i) { + page = sg_page(sg); + pa = sg_phys(sg); + if (dirty) + set_page_dirty_lock(page); + put_page(page); + usnic_dbg("pa: %pa\n", &pa); + } + kfree(chunk); + } +} + +static int usnic_uiom_get_pages(unsigned long addr, size_t size, int writable, + int dmasync, struct list_head *chunk_list) +{ + struct page **page_list; + struct scatterlist *sg; + struct usnic_uiom_chunk *chunk; + unsigned long locked; + unsigned long lock_limit; + unsigned long cur_base; + unsigned long npages; + int ret; + int off; + int i; + int flags; + dma_addr_t pa; + DEFINE_DMA_ATTRS(attrs); + + if (dmasync) + dma_set_attr(DMA_ATTR_WRITE_BARRIER, &attrs); + + if (!can_do_mlock()) + return -EPERM; + + INIT_LIST_HEAD(chunk_list); + + page_list = (struct page **) __get_free_page(GFP_KERNEL); + if (!page_list) + return -ENOMEM; + + npages = PAGE_ALIGN(size + (addr & ~PAGE_MASK)) >> PAGE_SHIFT; + + down_write(¤t->mm->mmap_sem); + + locked = npages + current->mm->locked_vm; + lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT; + + if ((locked > lock_limit) && !capable(CAP_IPC_LOCK)) { + ret = -ENOMEM; + goto out; + } + + flags = IOMMU_READ | IOMMU_CACHE; + flags |= (writable) ? 
IOMMU_WRITE : 0; + cur_base = addr & PAGE_MASK; + ret = 0; + + while (npages) { + ret = get_user_pages(current, current->mm, cur_base, + min_t(unsigned long, npages, + PAGE_SIZE / sizeof(struct page *)), + 1, !writable, page_list, NULL); + + if (ret < 0) + goto out; + + npages -= ret; + off = 0; + + while (ret) { + chunk = kmalloc(sizeof(*chunk) + + sizeof(struct scatterlist) * + min_t(int, ret, USNIC_UIOM_PAGE_CHUNK), + GFP_KERNEL); + if (!chunk) { + ret = -ENOMEM; + goto out; + } + + chunk->nents = min_t(int, ret, USNIC_UIOM_PAGE_CHUNK); + sg_init_table(chunk->page_list, chunk->nents); + for_each_sg(chunk->page_list, sg, chunk->nents, i) { + sg_set_page(sg, page_list[i + off], + PAGE_SIZE, 0); + pa = sg_phys(sg); + usnic_dbg("va: 0x%lx pa: %pa\n", + cur_base + i*PAGE_SIZE, &pa); + } + cur_base += chunk->nents * PAGE_SIZE; + ret -= chunk->nents; + off += chunk->nents; + list_add_tail(&chunk->list, chunk_list); + } + + ret = 0; + } + +out: + if (ret < 0) + usnic_uiom_put_pages(chunk_list, 0); + else + current->mm->locked_vm = locked; + + up_write(¤t->mm->mmap_sem); + free_page((unsigned long) page_list); + return ret; +} + +static void usnic_uiom_unmap_sorted_intervals(struct list_head *intervals, + struct usnic_uiom_pd *pd) +{ + struct usnic_uiom_interval_node *interval, *tmp; + long unsigned va, size; + + list_for_each_entry_safe(interval, tmp, intervals, link) { + va = interval->start << PAGE_SHIFT; + size = ((interval->last - interval->start) + 1) << PAGE_SHIFT; + while (size > 0) { + /* Workaround for RH 970401 */ + usnic_dbg("va 0x%lx size 0x%lx", va, PAGE_SIZE); + iommu_unmap(pd->domain, va, PAGE_SIZE); + va += PAGE_SIZE; + size -= PAGE_SIZE; + } + } +} + +static void __usnic_uiom_reg_release(struct usnic_uiom_pd *pd, + struct usnic_uiom_reg *uiomr, + int dirty) +{ + int npages; + unsigned long vpn_start, vpn_last; + struct usnic_uiom_interval_node *interval, *tmp; + int writable = 0; + LIST_HEAD(rm_intervals); + + npages = PAGE_ALIGN(uiomr->length + uiomr->offset) >> PAGE_SHIFT; + vpn_start = (uiomr->va & PAGE_MASK) >> PAGE_SHIFT; + vpn_last = vpn_start + npages - 1; + + spin_lock(&pd->lock); + usnic_uiom_remove_interval(&pd->rb_root, vpn_start, + vpn_last, &rm_intervals); + usnic_uiom_unmap_sorted_intervals(&rm_intervals, pd); + + list_for_each_entry_safe(interval, tmp, &rm_intervals, link) { + if (interval->flags & IOMMU_WRITE) + writable = 1; + list_del(&interval->link); + kfree(interval); + } + + usnic_uiom_put_pages(&uiomr->chunk_list, dirty & writable); + spin_unlock(&pd->lock); +} + +static int usnic_uiom_map_sorted_intervals(struct list_head *intervals, + struct usnic_uiom_reg *uiomr) +{ + int i, err; + size_t size; + struct usnic_uiom_chunk *chunk; + struct usnic_uiom_interval_node *interval_node; + dma_addr_t pa; + dma_addr_t pa_start = 0; + dma_addr_t pa_end = 0; + long int va_start = -EINVAL; + struct usnic_uiom_pd *pd = uiomr->pd; + long int va = uiomr->va & PAGE_MASK; + int flags = IOMMU_READ | IOMMU_CACHE; + + flags |= (uiomr->writable) ? 
IOMMU_WRITE : 0; + chunk = list_first_entry(&uiomr->chunk_list, struct usnic_uiom_chunk, + list); + list_for_each_entry(interval_node, intervals, link) { +iter_chunk: + for (i = 0; i < chunk->nents; i++, va += PAGE_SIZE) { + pa = sg_phys(&chunk->page_list[i]); + if ((va >> PAGE_SHIFT) < interval_node->start) + continue; + + if ((va >> PAGE_SHIFT) == interval_node->start) { + /* First page of the interval */ + va_start = va; + pa_start = pa; + pa_end = pa; + } + + WARN_ON(va_start == -EINVAL); + + if ((pa_end + PAGE_SIZE != pa) && + (pa != pa_start)) { + /* PAs are not contiguous */ + size = pa_end - pa_start + PAGE_SIZE; + usnic_dbg("va 0x%lx pa %pa size 0x%zx flags 0x%x", + va_start, &pa_start, size, flags); + err = iommu_map(pd->domain, va_start, pa_start, + size, flags); + if (err) { + usnic_err("Failed to map va 0x%lx pa %pa size 0x%zx with err %d\n", + va_start, &pa_start, size, err); + goto err_out; + } + va_start = va; + pa_start = pa; + pa_end = pa; + } + + if ((va >> PAGE_SHIFT) == interval_node->last) { + /* Last page of the interval */ + size = pa - pa_start + PAGE_SIZE; + usnic_dbg("va 0x%lx pa %pa size 0x%zx flags 0x%x\n", + va_start, &pa_start, size, flags); + err = iommu_map(pd->domain, va_start, pa_start, + size, flags); + if (err) { + usnic_err("Failed to map va 0x%lx pa %pa size 0x%zx with err %d\n", + va_start, &pa_start, size, err); + goto err_out; + } + break; + } + + if (pa != pa_start) + pa_end += PAGE_SIZE; + } + + if (i == chunk->nents) { + /* + * Hit last entry of the chunk, + * hence advance to next chunk + */ + chunk = list_first_entry(&chunk->list, + struct usnic_uiom_chunk, + list); + goto iter_chunk; + } + } + + return 0; + +err_out: + usnic_uiom_unmap_sorted_intervals(intervals, pd); + return err; +} + +struct usnic_uiom_reg *usnic_uiom_reg_get(struct usnic_uiom_pd *pd, + unsigned long addr, size_t size, + int writable, int dmasync) +{ + struct usnic_uiom_reg *uiomr; + unsigned long va_base, vpn_start, vpn_last; + unsigned long npages; + int offset, err; + LIST_HEAD(sorted_diff_intervals); + + /* + * Intel IOMMU map throws an error if a translation entry is + * changed from read to write. This module may not unmap + * and then remap the entry after fixing the permission + * b/c this open up a small windows where hw DMA may page fault + * Hence, make all entries to be writable. + */ + writable = 1; + + va_base = addr & PAGE_MASK; + offset = addr & ~PAGE_MASK; + npages = PAGE_ALIGN(size + offset) >> PAGE_SHIFT; + vpn_start = (addr & PAGE_MASK) >> PAGE_SHIFT; + vpn_last = vpn_start + npages - 1; + + uiomr = kmalloc(sizeof(*uiomr), GFP_KERNEL); + if (!uiomr) + return ERR_PTR(-ENOMEM); + + uiomr->va = va_base; + uiomr->offset = offset; + uiomr->length = size; + uiomr->writable = writable; + uiomr->pd = pd; + + err = usnic_uiom_get_pages(addr, size, writable, dmasync, + &uiomr->chunk_list); + if (err) { + usnic_err("Failed get_pages vpn [0x%lx,0x%lx] err %d\n", + vpn_start, vpn_last, err); + goto out_free_uiomr; + } + + spin_lock(&pd->lock); + err = usnic_uiom_get_intervals_diff(vpn_start, vpn_last, + (writable) ? 
IOMMU_WRITE : 0, + IOMMU_WRITE, + &pd->rb_root, + &sorted_diff_intervals); + if (err) { + usnic_err("Failed disjoint interval vpn [0x%lx,0x%lx] err %d\n", + vpn_start, vpn_last, err); + goto out_put_pages; + } + + err = usnic_uiom_map_sorted_intervals(&sorted_diff_intervals, uiomr); + if (err) { + usnic_err("Failed map interval vpn [0x%lx,0x%lx] err %d\n", + vpn_start, vpn_last, err); + goto out_put_intervals; + + } + + err = usnic_uiom_insert_interval(&pd->rb_root, vpn_start, vpn_last, + (writable) ? IOMMU_WRITE : 0); + if (err) { + usnic_err("Failed insert interval vpn [0x%lx,0x%lx] err %d\n", + vpn_start, vpn_last, err); + goto out_unmap_intervals; + } + + usnic_uiom_put_interval_set(&sorted_diff_intervals); + spin_unlock(&pd->lock); + + return uiomr; + +out_unmap_intervals: + usnic_uiom_unmap_sorted_intervals(&sorted_diff_intervals, pd); +out_put_intervals: + usnic_uiom_put_interval_set(&sorted_diff_intervals); +out_put_pages: + usnic_uiom_put_pages(&uiomr->chunk_list, 0); + spin_unlock(&pd->lock); +out_free_uiomr: + kfree(uiomr); + return ERR_PTR(err); +} + +void usnic_uiom_reg_release(struct usnic_uiom_reg *uiomr, int closing) +{ + struct mm_struct *mm; + unsigned long diff; + + __usnic_uiom_reg_release(uiomr->pd, uiomr, 1); + + mm = get_task_mm(current); + if (!mm) { + kfree(uiomr); + return; + } + + diff = PAGE_ALIGN(uiomr->length + uiomr->offset) >> PAGE_SHIFT; + + /* + * We may be called with the mm's mmap_sem already held. This + * can happen when a userspace munmap() is the call that drops + * the last reference to our file and calls our release + * method. If there are memory regions to destroy, we'll end + * up here and not be able to take the mmap_sem. In that case + * we defer the vm_locked accounting to the system workqueue. + */ + if (closing) { + if (!down_write_trylock(&mm->mmap_sem)) { + INIT_WORK(&uiomr->work, usnic_uiom_reg_account); + uiomr->mm = mm; + uiomr->diff = diff; + + queue_work(usnic_uiom_wq, &uiomr->work); + return; + } + } else + down_write(&mm->mmap_sem); + + current->mm->locked_vm -= diff; + up_write(&mm->mmap_sem); + mmput(mm); + kfree(uiomr); +} + +struct usnic_uiom_pd *usnic_uiom_alloc_pd(void) +{ + struct usnic_uiom_pd *pd; + void *domain; + + pd = kzalloc(sizeof(*pd), GFP_KERNEL); + if (!pd) + return ERR_PTR(-ENOMEM); + + pd->domain = domain = iommu_domain_alloc(&pci_bus_type); + if (IS_ERR_OR_NULL(domain)) { + usnic_err("Failed to allocate IOMMU domain with err %ld\n", + PTR_ERR(pd->domain)); + kfree(pd); + return ERR_PTR(domain ? 
PTR_ERR(domain) : -ENOMEM); + } + + iommu_set_fault_handler(pd->domain, usnic_uiom_dma_fault, NULL); + + spin_lock_init(&pd->lock); + INIT_LIST_HEAD(&pd->devs); + + return pd; +} + +void usnic_uiom_dealloc_pd(struct usnic_uiom_pd *pd) +{ + iommu_domain_free(pd->domain); + kfree(pd); +} + +int usnic_uiom_attach_dev_to_pd(struct usnic_uiom_pd *pd, struct device *dev) +{ + struct usnic_uiom_dev *uiom_dev; + int err; + + uiom_dev = kzalloc(sizeof(*uiom_dev), GFP_ATOMIC); + if (!uiom_dev) + return -ENOMEM; + uiom_dev->dev = dev; + + err = iommu_attach_device(pd->domain, dev); + if (err) + goto out_free_dev; + + if (!iommu_capable(dev->bus, IOMMU_CAP_CACHE_COHERENCY)) { + usnic_err("IOMMU of %s does not support cache coherency\n", + dev_name(dev)); + err = -EINVAL; + goto out_detach_device; + } + + spin_lock(&pd->lock); + list_add_tail(&uiom_dev->link, &pd->devs); + pd->dev_cnt++; + spin_unlock(&pd->lock); + + return 0; + +out_detach_device: + iommu_detach_device(pd->domain, dev); +out_free_dev: + kfree(uiom_dev); + return err; +} + +void usnic_uiom_detach_dev_from_pd(struct usnic_uiom_pd *pd, struct device *dev) +{ + struct usnic_uiom_dev *uiom_dev; + int found = 0; + + spin_lock(&pd->lock); + list_for_each_entry(uiom_dev, &pd->devs, link) { + if (uiom_dev->dev == dev) { + found = 1; + break; + } + } + + if (!found) { + usnic_err("Unable to free dev %s - not found\n", + dev_name(dev)); + spin_unlock(&pd->lock); + return; + } + + list_del(&uiom_dev->link); + pd->dev_cnt--; + spin_unlock(&pd->lock); + + return iommu_detach_device(pd->domain, dev); +} + +struct device **usnic_uiom_get_dev_list(struct usnic_uiom_pd *pd) +{ + struct usnic_uiom_dev *uiom_dev; + struct device **devs; + int i = 0; + + spin_lock(&pd->lock); + devs = kcalloc(pd->dev_cnt + 1, sizeof(*devs), GFP_ATOMIC); + if (!devs) { + devs = ERR_PTR(-ENOMEM); + goto out; + } + + list_for_each_entry(uiom_dev, &pd->devs, link) { + devs[i++] = uiom_dev->dev; + } +out: + spin_unlock(&pd->lock); + return devs; +} + +void usnic_uiom_free_dev_list(struct device **devs) +{ + kfree(devs); +} + +int usnic_uiom_init(char *drv_name) +{ + if (!iommu_present(&pci_bus_type)) { + usnic_err("IOMMU required but not present or enabled. USNIC QPs will not function w/o enabling IOMMU\n"); + return -EPERM; + } + + usnic_uiom_wq = create_workqueue(drv_name); + if (!usnic_uiom_wq) { + usnic_err("Unable to alloc wq for drv %s\n", drv_name); + return -ENOMEM; + } + + return 0; +} + +void usnic_uiom_fini(void) +{ + flush_workqueue(usnic_uiom_wq); + destroy_workqueue(usnic_uiom_wq); +} diff --git a/kernel/drivers/infiniband/hw/usnic/usnic_uiom.h b/kernel/drivers/infiniband/hw/usnic/usnic_uiom.h new file mode 100644 index 000000000..70440996e --- /dev/null +++ b/kernel/drivers/infiniband/hw/usnic/usnic_uiom.h @@ -0,0 +1,80 @@ +/* + * Copyright (c) 2013, Cisco Systems, Inc. All rights reserved. + * + * This program is free software; you may redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; version 2 of the License. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ + +#ifndef USNIC_UIOM_H_ +#define USNIC_UIOM_H_ + +#include +#include + +#include "usnic_uiom_interval_tree.h" + +#define USNIC_UIOM_READ (1) +#define USNIC_UIOM_WRITE (2) + +#define USNIC_UIOM_MAX_PD_CNT (1000) +#define USNIC_UIOM_MAX_MR_CNT (1000000) +#define USNIC_UIOM_MAX_MR_SIZE (~0UL) +#define USNIC_UIOM_PAGE_SIZE (PAGE_SIZE) + +struct usnic_uiom_dev { + struct device *dev; + struct list_head link; +}; + +struct usnic_uiom_pd { + struct iommu_domain *domain; + spinlock_t lock; + struct rb_root rb_root; + struct list_head devs; + int dev_cnt; +}; + +struct usnic_uiom_reg { + struct usnic_uiom_pd *pd; + unsigned long va; + size_t length; + int offset; + int page_size; + int writable; + struct list_head chunk_list; + struct work_struct work; + struct mm_struct *mm; + unsigned long diff; +}; + +struct usnic_uiom_chunk { + struct list_head list; + int nents; + struct scatterlist page_list[0]; +}; + +struct usnic_uiom_pd *usnic_uiom_alloc_pd(void); +void usnic_uiom_dealloc_pd(struct usnic_uiom_pd *pd); +int usnic_uiom_attach_dev_to_pd(struct usnic_uiom_pd *pd, struct device *dev); +void usnic_uiom_detach_dev_from_pd(struct usnic_uiom_pd *pd, + struct device *dev); +struct device **usnic_uiom_get_dev_list(struct usnic_uiom_pd *pd); +void usnic_uiom_free_dev_list(struct device **devs); +struct usnic_uiom_reg *usnic_uiom_reg_get(struct usnic_uiom_pd *pd, + unsigned long addr, size_t size, + int access, int dmasync); +void usnic_uiom_reg_release(struct usnic_uiom_reg *uiomr, int closing); +int usnic_uiom_init(char *drv_name); +void usnic_uiom_fini(void); +#endif /* USNIC_UIOM_H_ */ diff --git a/kernel/drivers/infiniband/hw/usnic/usnic_uiom_interval_tree.c b/kernel/drivers/infiniband/hw/usnic/usnic_uiom_interval_tree.c new file mode 100644 index 000000000..3a4288e0f --- /dev/null +++ b/kernel/drivers/infiniband/hw/usnic/usnic_uiom_interval_tree.c @@ -0,0 +1,254 @@ +/* + * Copyright (c) 2014, Cisco Systems, Inc. All rights reserved. + * + * This program is free software; you may redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; version 2 of the License. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + */ + +#include +#include +#include +#include + +#include +#include "usnic_uiom_interval_tree.h" + +#define START(node) ((node)->start) +#define LAST(node) ((node)->last) + +#define MAKE_NODE(node, start, end, ref_cnt, flags, err, err_out) \ + do { \ + node = usnic_uiom_interval_node_alloc(start, \ + end, ref_cnt, flags); \ + if (!node) { \ + err = -ENOMEM; \ + goto err_out; \ + } \ + } while (0) + +#define MARK_FOR_ADD(node, list) (list_add_tail(&node->link, list)) + +#define MAKE_NODE_AND_APPEND(node, start, end, ref_cnt, flags, err, \ + err_out, list) \ + do { \ + MAKE_NODE(node, start, end, \ + ref_cnt, flags, err, \ + err_out); \ + MARK_FOR_ADD(node, list); \ + } while (0) + +#define FLAGS_EQUAL(flags1, flags2, mask) \ + (((flags1) & (mask)) == ((flags2) & (mask))) + +static struct usnic_uiom_interval_node* +usnic_uiom_interval_node_alloc(long int start, long int last, int ref_cnt, + int flags) +{ + struct usnic_uiom_interval_node *interval = kzalloc(sizeof(*interval), + GFP_ATOMIC); + if (!interval) + return NULL; + + interval->start = start; + interval->last = last; + interval->flags = flags; + interval->ref_cnt = ref_cnt; + + return interval; +} + +static int interval_cmp(void *priv, struct list_head *a, struct list_head *b) +{ + struct usnic_uiom_interval_node *node_a, *node_b; + + node_a = list_entry(a, struct usnic_uiom_interval_node, link); + node_b = list_entry(b, struct usnic_uiom_interval_node, link); + + /* long to int */ + if (node_a->start < node_b->start) + return -1; + else if (node_a->start > node_b->start) + return 1; + + return 0; +} + +static void +find_intervals_intersection_sorted(struct rb_root *root, unsigned long start, + unsigned long last, + struct list_head *list) +{ + struct usnic_uiom_interval_node *node; + + INIT_LIST_HEAD(list); + + for (node = usnic_uiom_interval_tree_iter_first(root, start, last); + node; + node = usnic_uiom_interval_tree_iter_next(node, start, last)) + list_add_tail(&node->link, list); + + list_sort(NULL, list, interval_cmp); +} + +int usnic_uiom_get_intervals_diff(unsigned long start, unsigned long last, + int flags, int flag_mask, + struct rb_root *root, + struct list_head *diff_set) +{ + struct usnic_uiom_interval_node *interval, *tmp; + int err = 0; + long int pivot = start; + LIST_HEAD(intersection_set); + + INIT_LIST_HEAD(diff_set); + + find_intervals_intersection_sorted(root, start, last, + &intersection_set); + + list_for_each_entry(interval, &intersection_set, link) { + if (pivot < interval->start) { + MAKE_NODE_AND_APPEND(tmp, pivot, interval->start - 1, + 1, flags, err, err_out, + diff_set); + pivot = interval->start; + } + + /* + * Invariant: Set [start, pivot] is either in diff_set or root, + * but not in both. 
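+ * (Any further node appended to diff_set will start at pivot.)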
+ */ + + if (pivot > interval->last) { + continue; + } else if (pivot <= interval->last && + FLAGS_EQUAL(interval->flags, flags, + flag_mask)) { + pivot = interval->last + 1; + } + } + + if (pivot <= last) + MAKE_NODE_AND_APPEND(tmp, pivot, last, 1, flags, err, err_out, + diff_set); + + return 0; + +err_out: + list_for_each_entry_safe(interval, tmp, diff_set, link) { + list_del(&interval->link); + kfree(interval); + } + + return err; +} + +void usnic_uiom_put_interval_set(struct list_head *intervals) +{ + struct usnic_uiom_interval_node *interval, *tmp; + list_for_each_entry_safe(interval, tmp, intervals, link) + kfree(interval); +} + +int usnic_uiom_insert_interval(struct rb_root *root, unsigned long start, + unsigned long last, int flags) +{ + struct usnic_uiom_interval_node *interval, *tmp; + unsigned long istart, ilast; + int iref_cnt, iflags; + unsigned long lpivot = start; + int err = 0; + LIST_HEAD(to_add); + LIST_HEAD(intersection_set); + + find_intervals_intersection_sorted(root, start, last, + &intersection_set); + + list_for_each_entry(interval, &intersection_set, link) { + /* + * Invariant - lpivot is the left edge of next interval to be + * inserted + */ + istart = interval->start; + ilast = interval->last; + iref_cnt = interval->ref_cnt; + iflags = interval->flags; + + if (istart < lpivot) { + MAKE_NODE_AND_APPEND(tmp, istart, lpivot - 1, iref_cnt, + iflags, err, err_out, &to_add); + } else if (istart > lpivot) { + MAKE_NODE_AND_APPEND(tmp, lpivot, istart - 1, 1, flags, + err, err_out, &to_add); + lpivot = istart; + } else { + lpivot = istart; + } + + if (ilast > last) { + MAKE_NODE_AND_APPEND(tmp, lpivot, last, iref_cnt + 1, + iflags | flags, err, err_out, + &to_add); + MAKE_NODE_AND_APPEND(tmp, last + 1, ilast, iref_cnt, + iflags, err, err_out, &to_add); + } else { + MAKE_NODE_AND_APPEND(tmp, lpivot, ilast, iref_cnt + 1, + iflags | flags, err, err_out, + &to_add); + } + + lpivot = ilast + 1; + } + + if (lpivot <= last) + MAKE_NODE_AND_APPEND(tmp, lpivot, last, 1, flags, err, err_out, + &to_add); + + list_for_each_entry_safe(interval, tmp, &intersection_set, link) { + usnic_uiom_interval_tree_remove(interval, root); + kfree(interval); + } + + list_for_each_entry(interval, &to_add, link) + usnic_uiom_interval_tree_insert(interval, root); + + return 0; + +err_out: + list_for_each_entry_safe(interval, tmp, &to_add, link) + kfree(interval); + + return err; +} + +void usnic_uiom_remove_interval(struct rb_root *root, unsigned long start, + unsigned long last, struct list_head *removed) +{ + struct usnic_uiom_interval_node *interval; + + for (interval = usnic_uiom_interval_tree_iter_first(root, start, last); + interval; + interval = usnic_uiom_interval_tree_iter_next(interval, + start, + last)) { + if (--interval->ref_cnt == 0) + list_add_tail(&interval->link, removed); + } + + list_for_each_entry(interval, removed, link) + usnic_uiom_interval_tree_remove(interval, root); +} + +INTERVAL_TREE_DEFINE(struct usnic_uiom_interval_node, rb, + unsigned long, __subtree_last, + START, LAST, , usnic_uiom_interval_tree) diff --git a/kernel/drivers/infiniband/hw/usnic/usnic_uiom_interval_tree.h b/kernel/drivers/infiniband/hw/usnic/usnic_uiom_interval_tree.h new file mode 100644 index 000000000..d4f752e25 --- /dev/null +++ b/kernel/drivers/infiniband/hw/usnic/usnic_uiom_interval_tree.h @@ -0,0 +1,73 @@ +/* + * Copyright (c) 2013, Cisco Systems, Inc. All rights reserved. 
+ * + * This program is free software; you may redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; version 2 of the License. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ + +#ifndef USNIC_UIOM_INTERVAL_TREE_H_ +#define USNIC_UIOM_INTERVAL_TREE_H_ + +#include + +struct usnic_uiom_interval_node { + struct rb_node rb; + struct list_head link; + unsigned long start; + unsigned long last; + unsigned long __subtree_last; + unsigned int ref_cnt; + int flags; +}; + +extern void +usnic_uiom_interval_tree_insert(struct usnic_uiom_interval_node *node, + struct rb_root *root); +extern void +usnic_uiom_interval_tree_remove(struct usnic_uiom_interval_node *node, + struct rb_root *root); +extern struct usnic_uiom_interval_node * +usnic_uiom_interval_tree_iter_first(struct rb_root *root, + unsigned long start, + unsigned long last); +extern struct usnic_uiom_interval_node * +usnic_uiom_interval_tree_iter_next(struct usnic_uiom_interval_node *node, + unsigned long start, unsigned long last); +/* + * Inserts {start...last} into {root}. If there are overlaps, + * nodes will be broken up and merged + */ +int usnic_uiom_insert_interval(struct rb_root *root, + unsigned long start, unsigned long last, + int flags); +/* + * Removed {start...last} from {root}. The nodes removed are returned in + * 'removed.' The caller is responsibile for freeing memory of nodes in + * 'removed.' + */ +void usnic_uiom_remove_interval(struct rb_root *root, + unsigned long start, unsigned long last, + struct list_head *removed); +/* + * Returns {start...last} - {root} (relative complement of {start...last} in + * {root}) in diff_set sorted ascendingly + */ +int usnic_uiom_get_intervals_diff(unsigned long start, + unsigned long last, int flags, + int flag_mask, + struct rb_root *root, + struct list_head *diff_set); +/* Call this to free diff_set returned by usnic_uiom_get_intervals_diff */ +void usnic_uiom_put_interval_set(struct list_head *intervals); +#endif /* USNIC_UIOM_INTERVAL_TREE_H_ */ diff --git a/kernel/drivers/infiniband/hw/usnic/usnic_vnic.c b/kernel/drivers/infiniband/hw/usnic/usnic_vnic.c new file mode 100644 index 000000000..656b88c39 --- /dev/null +++ b/kernel/drivers/infiniband/hw/usnic/usnic_vnic.c @@ -0,0 +1,467 @@ +/* + * Copyright (c) 2013, Cisco Systems, Inc. All rights reserved. + * + * This program is free software; you may redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; version 2 of the License. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ *
+ */
+#include <linux/errno.h>
+#include <linux/module.h>
+#include <linux/pci.h>
+
+#include "usnic_ib.h"
+#include "vnic_resource.h"
+#include "usnic_log.h"
+#include "usnic_vnic.h"
+
+struct usnic_vnic {
+	struct vnic_dev			*vdev;
+	struct vnic_dev_bar		bar[PCI_NUM_RESOURCES];
+	struct usnic_vnic_res_chunk	chunks[USNIC_VNIC_RES_TYPE_MAX];
+	spinlock_t			res_lock;
+};
+
+static enum vnic_res_type _to_vnic_res_type(enum usnic_vnic_res_type res_type)
+{
+#define DEFINE_USNIC_VNIC_RES_AT(usnic_vnic_res_t, vnic_res_type, desc, val) \
+		vnic_res_type,
+#define DEFINE_USNIC_VNIC_RES(usnic_vnic_res_t, vnic_res_type, desc) \
+		vnic_res_type,
+	static enum vnic_res_type usnic_vnic_type_2_vnic_type[] = {
+						USNIC_VNIC_RES_TYPES};
+#undef DEFINE_USNIC_VNIC_RES
+#undef DEFINE_USNIC_VNIC_RES_AT
+
+	if (res_type >= USNIC_VNIC_RES_TYPE_MAX)
+		return RES_TYPE_MAX;
+
+	return usnic_vnic_type_2_vnic_type[res_type];
+}
+
+const char *usnic_vnic_res_type_to_str(enum usnic_vnic_res_type res_type)
+{
+#define DEFINE_USNIC_VNIC_RES_AT(usnic_vnic_res_t, vnic_res_type, desc, val) \
+		desc,
+#define DEFINE_USNIC_VNIC_RES(usnic_vnic_res_t, vnic_res_type, desc) \
+		desc,
+	static const char * const usnic_vnic_res_type_desc[] = {
+						USNIC_VNIC_RES_TYPES};
+#undef DEFINE_USNIC_VNIC_RES
+#undef DEFINE_USNIC_VNIC_RES_AT
+
+	if (res_type >= USNIC_VNIC_RES_TYPE_MAX)
+		return "unknown";
+
+	return usnic_vnic_res_type_desc[res_type];
+
+}
+
+const char *usnic_vnic_pci_name(struct usnic_vnic *vnic)
+{
+	return pci_name(usnic_vnic_get_pdev(vnic));
+}
+
+int usnic_vnic_dump(struct usnic_vnic *vnic, char *buf,
+			int buf_sz,
+			void *hdr_obj,
+			int (*printtitle)(void *, char*, int),
+			int (*printcols)(char *, int),
+			int (*printrow)(void *, char *, int))
+{
+	struct usnic_vnic_res_chunk *chunk;
+	struct usnic_vnic_res *res;
+	struct vnic_dev_bar *bar0;
+	int i, j, offset;
+
+	offset = 0;
+	bar0 = usnic_vnic_get_bar(vnic, 0);
+	offset += scnprintf(buf + offset, buf_sz - offset,
+			"VF:%hu BAR0 bus_addr=%pa vaddr=0x%p size=%ld ",
+			usnic_vnic_get_index(vnic),
+			&bar0->bus_addr,
+			bar0->vaddr, bar0->len);
+	if (printtitle)
+		offset += printtitle(hdr_obj, buf + offset, buf_sz - offset);
+	offset += scnprintf(buf + offset, buf_sz - offset, "\n");
+	offset += scnprintf(buf + offset, buf_sz - offset,
+			"|RES\t|CTRL_PIN\t\t|IN_USE\t");
+	if (printcols)
+		offset += printcols(buf + offset, buf_sz - offset);
+	offset += scnprintf(buf + offset, buf_sz - offset, "\n");
+
+	spin_lock(&vnic->res_lock);
+	for (i = 0; i < ARRAY_SIZE(vnic->chunks); i++) {
+		chunk = &vnic->chunks[i];
+		for (j = 0; j < chunk->cnt; j++) {
+			res = chunk->res[j];
+			offset += scnprintf(buf + offset, buf_sz - offset,
+					"|%s[%u]\t|0x%p\t|%u\t",
+					usnic_vnic_res_type_to_str(res->type),
+					res->vnic_idx, res->ctrl, !!res->owner);
+			if (printrow) {
+				offset += printrow(res->owner, buf + offset,
+							buf_sz - offset);
+			}
+			offset += scnprintf(buf + offset, buf_sz - offset,
+						"\n");
+		}
+	}
+	spin_unlock(&vnic->res_lock);
+	return offset;
+}
+
+void usnic_vnic_res_spec_update(struct usnic_vnic_res_spec *spec,
+				enum usnic_vnic_res_type trgt_type,
+				u16 cnt)
+{
+	int i;
+
+	for (i = 0; i < USNIC_VNIC_RES_TYPE_MAX; i++) {
+		if (spec->resources[i].type == trgt_type) {
+			spec->resources[i].cnt = cnt;
+			return;
+		}
+	}
+
+	WARN_ON(1);
+}
+
+int usnic_vnic_res_spec_satisfied(const struct usnic_vnic_res_spec *min_spec,
+					struct usnic_vnic_res_spec *res_spec)
+{
+	int found, i, j;
+
+	for (i = 0; i < USNIC_VNIC_RES_TYPE_MAX; i++) {
+		found = 0;
+
+		for (j = 0; j < USNIC_VNIC_RES_TYPE_MAX; j++) {
+			if (res_spec->resources[i].type !=
+				min_spec->resources[i].type)
+				continue;
+			found = 1;
+			if (min_spec->resources[i].cnt >
+					res_spec->resources[i].cnt)
+				return -EINVAL;
+			break;
+		}
+
+		if (!found)
+			return -EINVAL;
+	}
+	return 0;
+}
+
+int usnic_vnic_spec_dump(char *buf, int buf_sz,
+				struct usnic_vnic_res_spec *res_spec)
+{
+	enum usnic_vnic_res_type res_type;
+	int res_cnt;
+	int i;
+	int offset = 0;
+
+	for (i = 0; i < USNIC_VNIC_RES_TYPE_MAX; i++) {
+		res_type = res_spec->resources[i].type;
+		res_cnt = res_spec->resources[i].cnt;
+		offset += scnprintf(buf + offset, buf_sz - offset,
+				"Res: %s Cnt: %d ",
+				usnic_vnic_res_type_to_str(res_type),
+				res_cnt);
+	}
+
+	return offset;
+}
+
+int usnic_vnic_check_room(struct usnic_vnic *vnic,
+				struct usnic_vnic_res_spec *res_spec)
+{
+	int i;
+	enum usnic_vnic_res_type res_type;
+	int res_cnt;
+
+	for (i = 0; i < USNIC_VNIC_RES_TYPE_MAX; i++) {
+		res_type = res_spec->resources[i].type;
+		res_cnt = res_spec->resources[i].cnt;
+
+		if (res_type == USNIC_VNIC_RES_TYPE_EOL)
+			break;
+
+		if (res_cnt > usnic_vnic_res_free_cnt(vnic, res_type))
+			return -EBUSY;
+	}
+
+	return 0;
+}
+
+int usnic_vnic_res_cnt(struct usnic_vnic *vnic,
+			enum usnic_vnic_res_type type)
+{
+	return vnic->chunks[type].cnt;
+}
+
+int usnic_vnic_res_free_cnt(struct usnic_vnic *vnic,
+				enum usnic_vnic_res_type type)
+{
+	return vnic->chunks[type].free_cnt;
+}
+
+struct usnic_vnic_res_chunk *
+usnic_vnic_get_resources(struct usnic_vnic *vnic, enum usnic_vnic_res_type type,
+				int cnt, void *owner)
+{
+	struct usnic_vnic_res_chunk *src, *ret;
+	struct usnic_vnic_res *res;
+	int i;
+
+	if (usnic_vnic_res_free_cnt(vnic, type) < cnt || cnt < 1 || !owner)
+		return ERR_PTR(-EINVAL);
+
+	ret = kzalloc(sizeof(*ret), GFP_ATOMIC);
+	if (!ret) {
+		usnic_err("Failed to allocate chunk for %s - Out of memory\n",
+				usnic_vnic_pci_name(vnic));
+		return ERR_PTR(-ENOMEM);
+	}
+
+	ret->res = kzalloc(sizeof(*(ret->res))*cnt, GFP_ATOMIC);
+	if (!ret->res) {
+		usnic_err("Failed to allocate resources for %s. Out of memory\n",
+				usnic_vnic_pci_name(vnic));
+		kfree(ret);
+		return ERR_PTR(-ENOMEM);
+	}
+
+	spin_lock(&vnic->res_lock);
+	src = &vnic->chunks[type];
+	for (i = 0; i < src->cnt && ret->cnt < cnt; i++) {
+		res = src->res[i];
+		if (!res->owner) {
+			src->free_cnt--;
+			res->owner = owner;
+			ret->res[ret->cnt++] = res;
+		}
+	}
+
+	spin_unlock(&vnic->res_lock);
+	ret->type = type;
+	ret->vnic = vnic;
+	WARN_ON(ret->cnt != cnt);
+
+	return ret;
+}
+
+void usnic_vnic_put_resources(struct usnic_vnic_res_chunk *chunk)
+{
+
+	struct usnic_vnic_res *res;
+	int i;
+	struct usnic_vnic *vnic = chunk->vnic;
+
+	spin_lock(&vnic->res_lock);
+	while ((i = --chunk->cnt) >= 0) {
+		res = chunk->res[i];
+		chunk->res[i] = NULL;
+		res->owner = NULL;
+		vnic->chunks[res->type].free_cnt++;
+	}
+	spin_unlock(&vnic->res_lock);
+
+	kfree(chunk->res);
+	kfree(chunk);
+}
+
+u16 usnic_vnic_get_index(struct usnic_vnic *vnic)
+{
+	return usnic_vnic_get_pdev(vnic)->devfn - 1;
+}
+
+static int usnic_vnic_alloc_res_chunk(struct usnic_vnic *vnic,
+					enum usnic_vnic_res_type type,
+					struct usnic_vnic_res_chunk *chunk)
+{
+	int cnt, err, i;
+	struct usnic_vnic_res *res;
+
+	cnt = vnic_dev_get_res_count(vnic->vdev, _to_vnic_res_type(type));
+	if (cnt < 1)
+		return -EINVAL;
+
+	chunk->cnt = chunk->free_cnt = cnt;
+	chunk->res = kzalloc(sizeof(*(chunk->res))*cnt, GFP_KERNEL);
+	if (!chunk->res)
+		return -ENOMEM;
+
+	for (i = 0; i < cnt; i++) {
+		res = kzalloc(sizeof(*res), GFP_KERNEL);
+		if (!res) {
+			err = -ENOMEM;
+			goto fail;
+		}
+		res->type = type;
+		res->vnic_idx = i;
+		res->vnic = vnic;
+		res->ctrl = vnic_dev_get_res(vnic->vdev,
+						_to_vnic_res_type(type), i);
+		chunk->res[i] = res;
+	}
+
+	chunk->vnic = vnic;
+	return 0;
+fail:
+	for (i--; i >= 0; i--)
+		kfree(chunk->res[i]);
+	kfree(chunk->res);
+	return err;
+}
+
+static void usnic_vnic_free_res_chunk(struct usnic_vnic_res_chunk *chunk)
+{
+	int i;
+	for (i = 0; i < chunk->cnt; i++)
+		kfree(chunk->res[i]);
+	kfree(chunk->res);
+}
+
+static int usnic_vnic_discover_resources(struct pci_dev *pdev,
+						struct usnic_vnic *vnic)
+{
+	enum usnic_vnic_res_type res_type;
+	int i;
+	int err = 0;
+
+	for (i = 0; i < ARRAY_SIZE(vnic->bar); i++) {
+		if (!(pci_resource_flags(pdev, i) & IORESOURCE_MEM))
+			continue;
+		vnic->bar[i].len = pci_resource_len(pdev, i);
+		vnic->bar[i].vaddr = pci_iomap(pdev, i, vnic->bar[i].len);
+		if (!vnic->bar[i].vaddr) {
+			usnic_err("Cannot memory-map BAR %d, aborting\n",
+					i);
+			err = -ENODEV;
+			goto out_clean_bar;
+		}
+		vnic->bar[i].bus_addr = pci_resource_start(pdev, i);
+	}
+
+	vnic->vdev = vnic_dev_register(NULL, pdev, pdev, vnic->bar,
+			ARRAY_SIZE(vnic->bar));
+	if (!vnic->vdev) {
+		usnic_err("Failed to register device %s\n",
+				pci_name(pdev));
+		err = -EINVAL;
+		goto out_clean_bar;
+	}
+
+	for (res_type = USNIC_VNIC_RES_TYPE_EOL + 1;
+			res_type < USNIC_VNIC_RES_TYPE_MAX; res_type++) {
+		err = usnic_vnic_alloc_res_chunk(vnic, res_type,
+						&vnic->chunks[res_type]);
+		if (err) {
+			usnic_err("Failed to alloc res %s with err %d\n",
+					usnic_vnic_res_type_to_str(res_type),
+					err);
+			goto out_clean_chunks;
+		}
+	}
+
+	return 0;
+
+out_clean_chunks:
+	for (res_type--; res_type > USNIC_VNIC_RES_TYPE_EOL; res_type--)
+		usnic_vnic_free_res_chunk(&vnic->chunks[res_type]);
+	vnic_dev_unregister(vnic->vdev);
+out_clean_bar:
+	for (i = 0; i < ARRAY_SIZE(vnic->bar); i++) {
+		if (!(pci_resource_flags(pdev, i) & IORESOURCE_MEM))
+			continue;
+		if (!vnic->bar[i].vaddr)
+			break;
+
+		iounmap(vnic->bar[i].vaddr);
+	}
+
+	return err;
+}
+
+struct pci_dev *usnic_vnic_get_pdev(struct usnic_vnic *vnic)
+{
+	return vnic_dev_get_pdev(vnic->vdev);
+}
+
+struct vnic_dev_bar *usnic_vnic_get_bar(struct usnic_vnic *vnic,
+				int bar_num)
+{
+	return (bar_num < ARRAY_SIZE(vnic->bar)) ? &vnic->bar[bar_num] : NULL;
+}
+
+static void usnic_vnic_release_resources(struct usnic_vnic *vnic)
+{
+	int i;
+	struct pci_dev *pdev;
+	enum usnic_vnic_res_type res_type;
+
+	pdev = usnic_vnic_get_pdev(vnic);
+
+	for (res_type = USNIC_VNIC_RES_TYPE_EOL + 1;
+			res_type < USNIC_VNIC_RES_TYPE_MAX; res_type++)
+		usnic_vnic_free_res_chunk(&vnic->chunks[res_type]);
+
+	vnic_dev_unregister(vnic->vdev);
+
+	for (i = 0; i < ARRAY_SIZE(vnic->bar); i++) {
+		if (!(pci_resource_flags(pdev, i) & IORESOURCE_MEM))
+			continue;
+		iounmap(vnic->bar[i].vaddr);
+	}
+}
+
+struct usnic_vnic *usnic_vnic_alloc(struct pci_dev *pdev)
+{
+	struct usnic_vnic *vnic;
+	int err = 0;
+
+	if (!pci_is_enabled(pdev)) {
+		usnic_err("PCI dev %s is disabled\n", pci_name(pdev));
+		return ERR_PTR(-EINVAL);
+	}
+
+	vnic = kzalloc(sizeof(*vnic), GFP_KERNEL);
+	if (!vnic) {
+		usnic_err("Failed to alloc vnic for %s - out of memory\n",
+				pci_name(pdev));
+		return ERR_PTR(-ENOMEM);
+	}
+
+	spin_lock_init(&vnic->res_lock);
+
+	err = usnic_vnic_discover_resources(pdev, vnic);
+	if (err) {
+		usnic_err("Failed to discover %s resources with err %d\n",
+				pci_name(pdev), err);
+		goto out_free_vnic;
+	}
+
+	usnic_dbg("Allocated vnic for %s\n", usnic_vnic_pci_name(vnic));
+
+	return vnic;
+
+out_free_vnic:
+	kfree(vnic);
+
+	return ERR_PTR(err);
+}
+
+void usnic_vnic_free(struct usnic_vnic *vnic)
+{
+	usnic_vnic_release_resources(vnic);
+	kfree(vnic);
+}
diff --git a/kernel/drivers/infiniband/hw/usnic/usnic_vnic.h b/kernel/drivers/infiniband/hw/usnic/usnic_vnic.h
new file mode 100644
index 000000000..14d931a88
--- /dev/null
+++ b/kernel/drivers/infiniband/hw/usnic/usnic_vnic.h
@@ -0,0 +1,103 @@
+/*
+ * Copyright (c) 2013, Cisco Systems, Inc. All rights reserved.
+ *
+ * This program is free software; you may redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; version 2 of the License.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+
+#ifndef USNIC_VNIC_H_
+#define USNIC_VNIC_H_
+
+#include <linux/pci.h>
+
+#include "vnic_dev.h"
+
+/* =USNIC_VNIC_RES_TYPE= =VNIC_RES= =DESC= */
+#define USNIC_VNIC_RES_TYPES \
+	DEFINE_USNIC_VNIC_RES_AT(EOL, RES_TYPE_EOL, "EOL", 0) \
+	DEFINE_USNIC_VNIC_RES(WQ, RES_TYPE_WQ, "WQ") \
+	DEFINE_USNIC_VNIC_RES(RQ, RES_TYPE_RQ, "RQ") \
+	DEFINE_USNIC_VNIC_RES(CQ, RES_TYPE_CQ, "CQ") \
+	DEFINE_USNIC_VNIC_RES(INTR, RES_TYPE_INTR_CTRL, "INT") \
+	DEFINE_USNIC_VNIC_RES(MAX, RES_TYPE_MAX, "MAX")\
+
+#define DEFINE_USNIC_VNIC_RES_AT(usnic_vnic_res_t, vnic_res_type, desc, val) \
+	USNIC_VNIC_RES_TYPE_##usnic_vnic_res_t = val,
+#define DEFINE_USNIC_VNIC_RES(usnic_vnic_res_t, vnic_res_type, desc) \
+	USNIC_VNIC_RES_TYPE_##usnic_vnic_res_t,
+enum usnic_vnic_res_type {
+	USNIC_VNIC_RES_TYPES
+};
+#undef DEFINE_USNIC_VNIC_RES
+#undef DEFINE_USNIC_VNIC_RES_AT
+
+struct usnic_vnic_res {
+	enum usnic_vnic_res_type	type;
+	unsigned int			vnic_idx;
+	struct usnic_vnic		*vnic;
+	void __iomem			*ctrl;
+	void				*owner;
+};
+
+struct usnic_vnic_res_chunk {
+	enum usnic_vnic_res_type	type;
+	int				cnt;
+	int				free_cnt;
+	struct usnic_vnic_res		**res;
+	struct usnic_vnic		*vnic;
+};
+
+struct usnic_vnic_res_desc {
+	enum usnic_vnic_res_type	type;
+	uint16_t			cnt;
+};
+
+struct usnic_vnic_res_spec {
+	struct usnic_vnic_res_desc resources[USNIC_VNIC_RES_TYPE_MAX];
+};
+
+const char *usnic_vnic_res_type_to_str(enum usnic_vnic_res_type res_type);
+const char *usnic_vnic_pci_name(struct usnic_vnic *vnic);
+int usnic_vnic_dump(struct usnic_vnic *vnic, char *buf, int buf_sz,
+			void *hdr_obj,
+			int (*printtitle)(void *, char*, int),
+			int (*printcols)(char *, int),
+			int (*printrow)(void *, char *, int));
+void usnic_vnic_res_spec_update(struct usnic_vnic_res_spec *spec,
+				enum usnic_vnic_res_type trgt_type,
+				u16 cnt);
+int usnic_vnic_res_spec_satisfied(const struct usnic_vnic_res_spec *min_spec,
+					struct usnic_vnic_res_spec *res_spec);
+int usnic_vnic_spec_dump(char *buf, int buf_sz,
+				struct usnic_vnic_res_spec *res_spec);
+int usnic_vnic_check_room(struct usnic_vnic *vnic,
+				struct usnic_vnic_res_spec *res_spec);
+int usnic_vnic_res_cnt(struct usnic_vnic *vnic,
+				enum usnic_vnic_res_type type);
+int usnic_vnic_res_free_cnt(struct usnic_vnic *vnic,
+				enum usnic_vnic_res_type type);
+struct usnic_vnic_res_chunk *
+usnic_vnic_get_resources(struct usnic_vnic *vnic,
+				enum usnic_vnic_res_type type,
+				int cnt,
+				void *owner);
+void usnic_vnic_put_resources(struct usnic_vnic_res_chunk *chunk);
+struct pci_dev *usnic_vnic_get_pdev(struct usnic_vnic *vnic);
+struct vnic_dev_bar *usnic_vnic_get_bar(struct usnic_vnic *vnic,
+				int bar_num);
+struct usnic_vnic *usnic_vnic_alloc(struct pci_dev *pdev);
+void usnic_vnic_free(struct usnic_vnic *vnic);
+u16 usnic_vnic_get_index(struct usnic_vnic *vnic);
+
+#endif /*!USNIC_VNIC_H_*/
--
cgit 1.2.3-korg